功能描述
- 目标:获取上交所和深交所所有股票的名称和交易信息
- 输出:保存到文件中
- 技术路线:requests+bs4+re
数据网站的选择
- 新浪股票:http://finance.sina.com.cn/stock/
- 百度股票:https://gupiao.baidu.com/stock/
- 选取原则:股票信息静态存在于HTML页面中,而非由JS代码动态生成,且没有Robots协议限制。
- 选取方法:浏览器F12,源代码查看等
- 选取心态:不要纠结于某个网站,多找信息源尝试
- 东方财富网股票列表:http://quote.eastmoney.com/stocklist.html
程序的结构设计
- 步骤1:从东方财富网获取股票列表
- 步骤2:根据股票列表逐个到百度股票获取个股信息
- 步骤3:将结果存储到文件
import re
import traceback

import requests
from bs4 import BeautifulSoup


def getHTMLText(url, code="utf-8", timeout=30):
    """Fetch *url* and return its body decoded with the given encoding.

    Args:
        url: Address of the page to download.
        code: Encoding name assigned to the response before decoding.
        timeout: Seconds to wait for the server before giving up, so that a
            single stalled connection cannot hang the whole crawl.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, invalid URL or non-2xx status).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "page unavailable".
        return ""


def getStockList(lst, stockURL):
    """Append every stock code linked from the listing page to *lst*.

    A stock code is the first substring of an anchor's ``href`` that matches
    ``s`` + ``h``/``z`` + six digits. Anchors without an ``href`` or without
    such a substring are ignored. Codes are appended in document order and
    duplicates are kept.

    Args:
        lst: List that receives the stock codes (mutated in place).
        stockURL: URL of the stock listing page.
    """
    # The original author notes that the East Money page is GB2312-encoded.
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        match = re.search(r"[s][hz]\d{6}", href)
        if match is not None:
            lst.append(match.group(0))


def _parseStockInfo(html):
    """Extract one stock's name and key/value details from its page HTML.

    Returns:
        A dict mapping '股票名称' to the stock name plus one entry per
        ``<dt>``/``<dd>`` pair, or ``None`` when the page does not have the
        expected layout.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # All the data of interest lives in the <div class="stock-bets"> element.
    stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
    if stockInfo is None:
        return None

    name = stockInfo.find(attrs={'class': 'bets-name'})
    if name is None:
        return None
    nameParts = name.text.split()
    if not nameParts:
        return None

    keyList = stockInfo.find_all('dt')    # detail labels
    valueList = stockInfo.find_all('dd')  # detail values
    if len(valueList) < len(keyList):
        # A label without a value means the layout is not what we expect;
        # skip the record instead of writing misaligned data.
        return None

    # One dict per stock, acting like a "business card" for that stock.
    infoDict = {'股票名称': nameParts[0]}
    for key, val in zip(keyList, valueList):
        infoDict[key.text] = val.text
    return infoDict


def getStockInfo(lst, stockURL, fpath):
    """Download each stock's page and append its details to *fpath*.

    For every code in *lst* the page ``stockURL + code + ".html"`` is fetched.
    Pages that cannot be downloaded or parsed are skipped. Each successfully
    parsed stock is written as one line containing ``str(dict)``. A progress
    percentage is printed after every stock, whether or not it succeeded.

    Args:
        lst: Stock codes to look up.
        stockURL: Base URL the stock code is appended to.
        fpath: Path of the UTF-8 output file, opened in append mode.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        html = getHTMLText(stockURL + stock + ".html")
        infoDict = _parseStockInfo(html) if html != "" else None
        if infoDict is not None:
            # Re-open per record so finished records survive an interruption.
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        # "\r" rewrites the same console line to act as a progress display.
        print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")


def main():
    """Collect the stock list and dump every stock's details to a file."""
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'http://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":
    main()