python 东方财富网&百度股票数据定向爬虫 实例

来源:互联网 发布:速卖通数据分析软件 编辑:程序博客网 时间:2024/04/29 13:07

功能:

1、获取上交所、深交所的股票信息

2、 输出保存到文件中

技术路线:requests - Beautiful Soup - re

候选:数据网站选择

1、静态网站,信息静态存在HTML页面中,非js 代码生成

2、 F12 , 源代码查看

多找信息源

方法:

1、 从东方财富网获取 股票列表信息

2、 根据股票列表逐个到百度股票 获取个股信息

3、 将结果存储到文件中

import requests
from bs4 import BeautifulSoup
import traceback
import re


def getHTMLText(url):
    """Fetch *url* and return its decoded text, or "" on any request failure."""
    try:
        # Timeout prevents the crawler from hanging forever on a dead host.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # apparent_encoding inspects the body, so non-UTF-8 pages decode correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: only network/HTTP errors are a "best effort" miss;
        # anything else (e.g. KeyboardInterrupt) should propagate.
        return ""


def getStockList(lst, stockURL):
    """Append every stock code (sh/sz + 6 digits) found in the listing page's
    <a href="..."> links to *lst* (mutated in place)."""
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # Compile once outside the loop; "s[hz]" is equivalent to "[s][hz]".
    pattern = re.compile(r"s[hz]\d{6}")
    for anchor in soup.find_all('a'):
        # .get avoids KeyError for <a> tags without an href attribute.
        match = pattern.search(anchor.attrs.get('href', ''))
        if match:
            lst.append(match.group())


def getStockInfo(lst, stockURL, fpath):
    """For each code in *lst*, scrape its per-stock page and append one
    str(dict) line to the file at *fpath*."""
    # Open the output file once, instead of re-opening it per stock.
    with open(fpath, 'a', encoding='utf-8') as f:
        for stock in lst:
            url = stockURL + stock + ".html"
            html = getHTMLText(url)
            if html == "":
                continue
            try:
                soup = BeautifulSoup(html, 'html.parser')
                stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
                infoDict = {}
                name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
                infoDict.update({'股票名称': name.text.split()[0]})
                # <dt> holds field names, <dd> the matching values, in order.
                keyList = stockInfo.find_all('dt')
                valueList = stockInfo.find_all('dd')
                for key_tag, val_tag in zip(keyList, valueList):
                    infoDict[key_tag.text] = val_tag.text
                f.write(str(infoDict) + '\n')
            except (AttributeError, IndexError):
                # Page layout changed or the stock page is missing the
                # expected markup; log it and move on to the next stock.
                traceback.print_exc()
                continue


def main():
    """Crawl the stock list, then write each stock's detail page to a file."""
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


# Guard the entry point so importing this module does not start crawling.
if __name__ == "__main__":
    main()

改进代码——1、添加进度条,增加用户体验;
2、直接指定页面编码(列表页为 GB2312),省去 apparent_encoding 的分析开销,提高速度

import requests
from bs4 import BeautifulSoup
import traceback
import re


def getHTMLText(url, code="utf-8"):
    """Fetch *url* and return its text decoded with *code*, or "" on failure.

    Passing the known page encoding explicitly skips the costly
    apparent_encoding body analysis.
    """
    try:
        # Timeout prevents the crawler from hanging forever on a dead host.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Narrow catch: only network/HTTP errors are a "best effort" miss.
        return ""


def getStockList(lst, stockURL):
    """Append every stock code (sh/sz + 6 digits) found in the listing page's
    <a href="..."> links to *lst* (mutated in place)."""
    # The eastmoney listing page is GB2312-encoded.
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    # Compile once outside the loop; "s[hz]" is equivalent to "[s][hz]".
    pattern = re.compile(r"s[hz]\d{6}")
    for anchor in soup.find_all('a'):
        # .get avoids KeyError for <a> tags without an href attribute.
        match = pattern.search(anchor.attrs.get('href', ''))
        if match:
            lst.append(match.group())


def getStockInfo(lst, stockURL, fpath):
    """For each code in *lst*, scrape its per-stock page, append one
    str(dict) line to *fpath*, and print a progress percentage."""
    total = len(lst)
    if total == 0:
        # Nothing to do; also avoids ZeroDivisionError in the progress math.
        return
    # Open the output file once, instead of re-opening it per stock.
    with open(fpath, 'a', encoding='utf-8') as f:
        # enumerate gives a single progress counter covering every stock,
        # including skipped/failed ones, so the bar always reaches 100%
        # (the original forgot to count stocks whose page fetch failed).
        for count, stock in enumerate(lst, start=1):
            url = stockURL + stock + ".html"
            html = getHTMLText(url)
            if html != "":
                try:
                    soup = BeautifulSoup(html, 'html.parser')
                    stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
                    infoDict = {}
                    name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
                    infoDict.update({'股票名称': name.text.split()[0]})
                    # <dt> holds field names, <dd> the matching values.
                    keyList = stockInfo.find_all('dt')
                    valueList = stockInfo.find_all('dd')
                    for key_tag, val_tag in zip(keyList, valueList):
                        infoDict[key_tag.text] = val_tag.text
                    f.write(str(infoDict) + '\n')
                except (AttributeError, IndexError):
                    # Unexpected page markup; log it and move on.
                    traceback.print_exc()
            # Single progress line, updated on every iteration regardless of
            # success/failure (the original duplicated this in two branches).
            print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")


def main():
    """Crawl the stock list, then write each stock's detail page to a file."""
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


# Guard the entry point so importing this module does not start crawling.
if __name__ == "__main__":
    main()
0 0
原创粉丝点击