银华数据抓取

来源:互联网 发布:上海都绚网络 编辑:程序博客网 时间:2024/04/30 11:13

http://cn.morningstar.com/quicktake/F000000416?place=qq#portfolio

"""Scrape the Morningstar China quicktake page for Yinhua fund F000000416.

Fetches the page via an authenticated HTTP proxy and prints the holding
codes found in the sector table (``ul#qt_sector`` first-column cells).
"""

import sys
import os
import logging
import logging.config   # needed: Setup_logging calls logging.config.dictConfig
import time
import json
import random
import re
import datetime
import configparser
import http.cookiejar
import urllib
import urllib.request   # needed: getContent1 uses urllib.request.* directly
from urllib.parse import urlencode

import pandas as pd
import psycopg2
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

# Module-level logger so the class methods work even when this module is
# imported (the original only bound `logger` inside the __main__ guard,
# which raised NameError on import-and-call).
logger = logging.getLogger(__name__)


def Setup_logging(log_path, config_path, default_level):
    """Configure logging from a JSON dictConfig file under <cwd>/Log.

    Args:
        log_path: log file name, resolved relative to <cwd>/Log; injected
            into the config's ``error_file_handler``.
        config_path: JSON logging-config file name, resolved the same way.
        default_level: level passed to ``logging.basicConfig`` when the
            JSON config file does not exist.
    """
    config_path = os.path.join(os.getcwd(), 'Log', config_path)
    log_path = os.path.join(os.getcwd(), 'Log', log_path)
    if os.path.exists(config_path):
        with open(config_path, 'rt') as f:
            config = json.load(f)
        # Redirect the error-file handler to the requested log location.
        config["handlers"]["error_file_handler"]["filename"] = log_path
        logging.config.dictConfig(config)
    else:
        logging.basicConfig(level=default_level)


class YinHuaCrawler():
    """Downloads and parses the Morningstar quicktake page for one fund."""

    def getContent1(self):
        """Fetch the raw HTML of the fund page through the corporate proxy.

        Returns:
            The decoded UTF-8 page content, or None when the request fails
            (the error is logged rather than raised).
        """
        url = 'http://cn.morningstar.com/quicktake/F000000416?place=qq'
        headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                   'Accept-Language': 'zh-CN,zh;q=0.8',
                   'Cache-Control': 'max-age=0',
                   'Cookie': 'ASP.NET_SessionId=hyykexil2mgax045ennw5si0; Hm_lvt_eca85e284f8b74d1200a42c9faa85464=1508480165; Hm_lpvt_eca85e284f8b74d1200a42c9faa85464=1508489139; BIGipServercn=2241287690.20480.0000; __utmt=1; __utma=172984700.1526292971.1508480165.1508482049.1508489139.3; __utmb=172984700.1.10.1508489139; __utmc=172984700; __utmz=172984700.1508480165.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
                   'Host': 'cn.morningstar.com',
                   'Proxy-Connection': 'keep-alive',
                   'Upgrade-Insecure-Requests': '1',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'}
        # NOTE(review): proxy URL embeds hard-coded credentials in source --
        # move them to a config file / environment variable.
        proxy = urllib.request.ProxyHandler({'http': 'http://WANGHONGWEI950:Whw5479043@10.37.84.36:8080'})
        opener = urllib.request.build_opener(proxy)
        try:
            req = urllib.request.Request(url, headers=headers)
            res = opener.open(req)
            page_content = res.read().decode('utf-8')
            logger.info('visit site succeed')
            return page_content
        except Exception:
            # Was a bare `except:` that silently swallowed everything and
            # implicitly returned None; keep the best-effort contract but
            # log the full traceback and return None explicitly.
            logger.exception('error_url: ' + url)
            return None

    def JX(self, page_content):
        """Parse the sector table and print each holding code.

        Prints the first child of every ``col1`` cell inside the
        ``ul#qt_sector`` list, skipping the '代码' (code) header cell.
        """
        soup = BeautifulSoup(page_content, 'html.parser')
        cells = soup.find('ul', id="qt_sector", class_="clearfix").find_all(class_="col1")
        for line in cells:
            if line.contents[0] != '代码':
                print(line.contents[0])


if __name__ == "__main__":
    # Configure logging, then fetch and parse the fund page.
    logger = logging.getLogger(__name__)
    Setup_logging(log_path='crawlerLog.log', config_path='LogConfig.json', default_level=logging.DEBUG)
    crawler = YinHuaCrawler()
    page_content = crawler.getContent1()
    crawler.JX(page_content)
原创粉丝点击