Python pseudo-code: crawling the national university major popularity rankings (runnable code, continuously updated) 【内向即失败--王奕君】

The full crawler is below. It enumerates every (popularity dimension, city, school type) combination on gkcx.eol.cn's ranking pages, extracts each list page's table with a regular expression, follows every school's detail page to collect its majors, and writes the results to SQL Server through the project's own `function.*` helper modules.

```python
# -*- coding: utf-8 -*-
from concurrent.futures import ThreadPoolExecutor
import collections
import re

import threadpool
from lxml import html

from function.data_tool import clean_data, deal_data, extract_data, transform_data
from function.database_tool import auto_sqlsever
from function.file_tool import operate_file
from function.monitor_tool import cuber_monitor
from function.network_tool import download_tool

W = auto_sqlsever.Mssql(database='school', datatable=['schoolrank', 'schoolmajor'])
down = download_tool.Downloader()

# checkpoint files: remaining cities, remaining list URLs, progress marks
City_log = r'D:\大排行\city.json'
Url_log = r'D:\大排行\main_url.json'
Mark_log = r'D:\大排行\mark.txt'

heat_table = ['max', 'monthly', 'week']   # total / monthly / weekly popularity
school_table = ['10052', '10051', '10140', '10142', '10141', '10143', '10050']  # school-type codes
city_map = collections.OrderedDict([
    ('北京', '10003'), ('天津', '10006'), ('河北', '10016'), ('河南', '10017'),
    ('山东', '10009'), ('山西', '10010'), ('内蒙古', '10002'), ('陕西', '10029'),
    ('辽宁', '10027'), ('吉林', '10004'), ('黑龙江', '10031'), ('上海', '10000'),
    ('江苏', '10014'), ('浙江', '10018'), ('安徽', '10008'), ('江西', '10015'),
    ('湖北', '10021'), ('湖南', '10022'), ('重庆', '10028'), ('四川', '10005'),
    ('贵州', '10026'), ('云南', '10001'), ('广东', '10011'), ('广西', '10012'),
    ('福建', '10024'), ('甘肃', '10023'), ('青海', '10030'), ('宁夏', '10007'),
    ('新疆', '10013'), ('西藏', '10025'), ('海南', '10019'),
])


class SelfError(SyntaxWarning):
    pass


class Spider:

    def boy(self, task):
        """Crawl every ranking list for one city (``task`` is the city name)."""
        self.task = task
        reform_table = collections.OrderedDict()
        reform_table['heat_table'] = heat_table
        reform_table['city_table'] = [city_map[task]]   # this city's code
        reform_table['school_table'] = school_table
        url_table = deal_data.treelist_tolist(reform_table)
        # drop list pages that report "没有找到相关数据" (no data found)
        correct_url = deal_data.scan_url(url_table, judge={'1': '没有找到相关数据'})
        url_file = operate_file.setup_file(filename=Url_log, container=['w', 'json', correct_url], exists=1)
        for url in operate_file.read_file(filename=url_file, container=['r2', 'json']):
            cuber_monitor.record_task(bigtask=City_log, smalltask=Url_log, log_name=Mark_log,
                                      Task=['大学热度排名', '31', task, '2017-12-13'])
            self.box = {}
            self.girl(url)
            # remove the finished URL from the checkpoint file
            operate_file.read_file(filename=Url_log, container=['x', 'json'], field=url)
        operate_file.deal_file(file=Url_log, method='remove')

    def girl(self, url):
        """Download one list page through a thread pool, then parse it."""
        self.url = url
        pool = threadpool.ThreadPool(20)
        # makeRequests needs the callable itself (not its return value) plus
        # a list of per-call arguments; one no-argument call here.
        requests = threadpool.makeRequests(self.fetch_list_pagecode, [((), {})])
        for req in requests:
            pool.putRequest(req)
            try:
                pool.wait()
            except SelfError:
                break
        self.parse_list_pagecode()

    def fetch_list_pagecode(self):
        container = down.visit(url=self.url, response_container=['text', 'utf-8'])
        # cut out the ranking <tbody> between the two marker strings
        paragraph = extract_data.extract_define_labels(
            label=["<tbody class='lin-seachtable'>", '<!--表格页面导航-->'],
            mate='one', text=container[-1])
        self.box[self.task] = (paragraph.replace('\n', '').replace('\t', '')
                               .replace('\r\n', '').replace('\r', ''))

    def parse_list_pagecode(self):
        # recover the ranking dimension and school type from the URL itself
        heat = '总人气' if 'max' in self.url else ('月人气' if 'monthly' in self.url else '周人气')
        school_type = ('独立学院' if '10052' in self.url else
                       ('高职高专' if '10051' in self.url else
                        ('中外合作办学' if '10140' in self.url else
                         ('HND项目' if '10142' in self.url else
                          ('远程教育学院' if '10141' in self.url else
                           ('成人教育' if '10143' in self.url else '普通本科'))))))
        list_content = self.box[self.task].replace('&nbsp;开设专业', '')
        # groups: rank, school name, province, detail-page path, popularity value
        field_re = re.compile(r';">(\d+)<.*?&nbsp;(.*?)</a>.*? >(.*?)<.*?(/schoolhtm.*?)".*?FF6600">(.*?)<', re.S)
        for i in field_re.findall(list_content):
            self.heat = deal_data.define_dict(container=['Heat', 'Url', 'Rank', 'Name', 'Province',
                                                         'Type', 'Heat_value', 'Major', 'Main_url'])
            self.heat['Rank'] = i[0]
            self.heat['Name'] = i[1]
            self.heat['Province'] = i[2]
            self.heat['Heat_value'] = i[-1]
            self.heat['Heat'] = heat
            self.heat['Type'] = school_type
            self.fetch_content_pagecode(i[3])

    def fetch_content_pagecode(self, url):
        """Visit one school's detail page and collect all of its majors."""
        content_url = 'http://gkcx.eol.cn' + url
        self.heat['Url'] = content_url
        response = down.visit(url=content_url, response_container=['text', 'utf-8'])
        major_container = []
        for a_tag in html.fromstring(response[-1]).xpath('//ul[@class="li-major grid"]/li/a'):
            major_table = deal_data.define_dict(container=['Major', 'Link', 'Content'])
            link = 'http://gkcx.eol.cn/' + a_tag.xpath('./@href')[0]
            major_table['Link'] = link
            major_table['Major'] = a_tag.xpath('./text()')[0]
            major_table['Content'] = self.fetch_major(link)
            major_container.append(major_table)
        self.heat['Major'] = major_container
        self.heat['Main_url'] = self.url
        W.insert_data(self.heat)   # one list URL yields every major of every school on it

    def fetch_major(self, link):
        """Fetch one major's description page. (This part still needs a major rewrite.)"""
        response = down.visit(url=link, response_container=['text', 'utf-8'])
        text = html.fromstring(response[-1]).xpath('string(//div[@class="content news"])')
        return transform_data.correct_encode(clean_data.clean_html(text))


pool = ThreadPoolExecutor(max_workers=4)   # thread pool with four workers
futures = []
for task in operate_file.read_file(filename=City_log, container=['r2', 'json']):
    # submit the callable and its argument separately; Spider keeps per-task
    # state on self, so each city gets its own instance
    futures.append(pool.submit(Spider().boy, task))
    operate_file.read_file(filename=City_log, container=['x', 'json'], field=task)
pool.shutdown(wait=True)   # wait for every city before flushing to SQL Server
W.operator()
```
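The heart of `parse_list_pagecode` is a single regex with five capture groups. Here is a minimal, self-contained sketch of what those groups pull out; the HTML snippet is fabricated to mirror the shape the regex expects, since the live markup on gkcx.eol.cn may differ:

```python
import re

# Fabricated snippet shaped like one row of the ranking table -- for
# illustration only, not copied from the real site.
sample_row = (
    '<td style=";">1</td>'
    '<a href="#">&nbsp;清华大学</a>'
    '<td >北京</td>'
    '<a href="/schoolhtm/schoolTemple/school101.htm">开设专业</a>'
    '<font color="#FF6600">987654</font>'
)

# Same pattern as the crawler; groups are rank, school name, province,
# detail-page path, and popularity value.
field_re = re.compile(
    r';">(\d+)<.*?&nbsp;(.*?)</a>.*? >(.*?)<.*?(/schoolhtm.*?)".*?FF6600">(.*?)<',
    re.S,
)
for rank, name, province, path, heat in field_re.findall(sample_row):
    print(rank, name, province, path, heat)
# -> 1 清华大学 北京 /schoolhtm/schoolTemple/school101.htm 987654
```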
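One bug worth calling out: the original `girl()` passed `self.fetch_list_pagecode()` -- the call's *return value* -- to `threadpool.makeRequests`, which expects the callable plus a list of per-call arguments. A minimal sketch of the intended idiom, where `fetch` and the URLs are placeholders rather than part of the project:

```python
# Minimal sketch of the third-party `threadpool` idiom; `fetch` and the
# URLs are placeholders, not part of the original project.
import threadpool

def fetch(url):
    print('fetching', url)   # stand-in for the real download step

pool = threadpool.ThreadPool(20)
# Pass the callable itself plus a list of per-call arguments.
for req in threadpool.makeRequests(fetch, ['http://example.com/a',
                                           'http://example.com/b']):
    pool.putRequest(req)
pool.wait()   # block until every queued request has run
```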
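The same mistake appeared at the bottom of the script: `pool.submit(a.boy(task))` runs `boy` immediately on the main thread and submits its return value (`None`) as the task. `concurrent.futures` also wants the callable and its arguments passed separately; a short sketch with a placeholder `crawl` function standing in for `Spider().boy`:

```python
from concurrent.futures import ThreadPoolExecutor

def crawl(city):                      # placeholder for Spider().boy
    return 'done: ' + city

with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(crawl, c) for c in ('北京', '上海', '广东')]
    for f in futures:                 # .result() blocks until that task finishes
        print(f.result())
```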