银华数据抓取
来源:互联网 发布:上海都绚网络 编辑:程序博客网 时间:2024/04/30 11:13
http://cn.morningstar.com/quicktake/F000000416?place=qq#portfolio
import sys
import os
import pandas as pd
import logging
import logging.config
import time
import urllib
import urllib.request
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import random
import re
import http.client
import http.cookiejar
import datetime
import configparser
from sqlalchemy import create_engine
import psycopg2
import json

# Module-level logger so the crawler also works when this file is imported,
# not only when it is run as a script.
logger = logging.getLogger(__name__)


def Setup_logging(log_path, config_path, default_level):
    """Configure logging from a JSON dictConfig file, with a basicConfig fallback.

    Both file names are resolved relative to the ``Log`` directory under the
    current working directory.

    Args:
        log_path: File name of the log file; it is written into the
            ``error_file_handler`` entry of the loaded configuration.
        config_path: File name of the JSON logging configuration.
        default_level: Level passed to ``logging.basicConfig`` when the
            configuration file does not exist.

    Raises:
        KeyError: If the configuration file exists but has no
            ``handlers.error_file_handler`` entry.
    """
    log_dir = os.path.join(os.getcwd(), 'Log')
    config_path = os.path.join(log_dir, config_path)
    log_path = os.path.join(log_dir, log_path)
    if os.path.exists(config_path):
        with open(config_path, 'rt', encoding='utf-8') as f:
            config = json.load(f)
        config["handlers"]["error_file_handler"]["filename"] = log_path
        logging.config.dictConfig(config)
    else:
        logging.basicConfig(level=default_level)


class YinHuaCrawler():
    """Fetch one Morningstar fund page through an HTTP proxy and print parsed cells."""

    URL = 'http://cn.morningstar.com/quicktake/F000000416?place=qq'

    # Request headers sent with every fetch. The Cookie value is a captured
    # browser session and may have expired.
    HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Cookie': 'ASP.NET_SessionId=hyykexil2mgax045ennw5si0; Hm_lvt_eca85e284f8b74d1200a42c9faa85464=1508480165; Hm_lpvt_eca85e284f8b74d1200a42c9faa85464=1508489139; BIGipServercn=2241287690.20480.0000; __utmt=1; __utma=172984700.1526292971.1508480165.1508482049.1508489139.3; __utmb=172984700.1.10.1508489139; __utmc=172984700; __utmz=172984700.1508480165.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
        'Host': 'cn.morningstar.com',
        'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36',
    }

    # NOTE(review): this proxy URL embeds a username and password in source.
    # Kept unchanged to preserve behaviour; move it to configuration or an
    # environment variable and rotate the credential.
    PROXY_URL = 'http://WANGHONGWEI950:Whw5479043@10.37.84.36:8080'

    # Seconds to wait on the network before giving up, so a dead proxy cannot
    # hang the crawler indefinitely.
    REQUEST_TIMEOUT = 30

    def getContent1(self):
        """Download the fund page and return it as text.

        Returns:
            The response body decoded as UTF-8, or ``None`` when the request
            or the decoding fails (the failure is logged with its traceback).
        """
        proxy = urllib.request.ProxyHandler({'http': self.PROXY_URL})
        opener = urllib.request.build_opener(proxy)
        req = urllib.request.Request(self.URL, headers=self.HEADERS)
        try:
            # The context manager closes the connection even if read() fails.
            with opener.open(req, timeout=self.REQUEST_TIMEOUT) as res:
                page_content = res.read().decode('utf-8')
        except (OSError, http.client.HTTPException, UnicodeDecodeError):
            # URLError, HTTPError and socket timeouts are all OSError subclasses.
            logger.exception('error_url: %s', self.URL)
            return None
        logger.info('visit site succeed')
        return page_content

    def JX(self, page_content):
        """Parse the page and print the first child of each ``col1`` element.

        Looks up the ``<ul id="qt_sector" class="clearfix">`` element and, for
        every descendant with class ``col1``, prints its first child unless
        that child equals the literal '代码' (presumably the column header).

        Args:
            page_content: HTML text as returned by ``getContent1``. ``None``
                or an empty string is logged and skipped.
        """
        if not page_content:
            logger.warning('no page content to parse')
            return
        soup = BeautifulSoup(page_content, 'html.parser')
        sector_list = soup.find('ul', id="qt_sector", class_="clearfix")
        if sector_list is None:
            # The page layout changed or an error page was returned.
            logger.warning('element ul#qt_sector not found in page')
            return
        for cell in sector_list.find_all(class_="col1"):
            if not cell.contents:
                continue
            first_child = cell.contents[0]
            if first_child != '代码':
                print(first_child)


if __name__ == "__main__":
    # Configure logging before any crawling starts.
    Setup_logging(log_path='crawlerLog.log', config_path='LogConfig.json', default_level=logging.DEBUG)
    crawler = YinHuaCrawler()
    page_content = crawler.getContent1()
    if page_content is None:
        # The fetch failure was already logged; there is nothing to parse.
        sys.exit(1)
    crawler.JX(page_content)
阅读全文
0 0
- 银华数据抓取
- 晨星银华富裕主题数据抓取详细版
- 数据抓取
- 数据抓取
- 抓取数据
- 抓取数据
- 数据抓取之数据抓取流程
- 规律抓取游戏数据
- 关于抓取网页数据
- 抓取网页数据
- 抓取网页中的数据
- web抓取数据
- 如何抓取股票数据
- java抓取网站数据
- java抓取网页数据
- 用PHP抓取数据
- 一个数据抓取项目
- 用curl抓取数据
- A Corrupt Mayor's Performance Art HDU
- codeforces 835B Key races
- 利用javaMail发送邮件
- 爱辉辉ERP——javaweb项目实战(一)
- uva 1629
- 银华数据抓取
- HTML入门6
- 网络学习之校园ip的分配
- Unity 自带函数汇总
- 分布式拒绝服务攻击 DDoS攻击
- PullToRefresh ScrollView的Listview和轮播图ViewPagerAdapter的配置
- C#语言 第一章 .NET体系结构
- 分享一些JAVA相关资源
- CodeForces 580D (状压DP) Kefa and Dishes