Some Practice with Python Multithreaded Crawlers

The script below wires a producer/consumer pair around a shared queue. Per the header comments, the producer is meant to extract URLs from fetched HTML, check each one for duplicates and against a URL pattern, and put the survivors on the to-crawl queue; the consumer takes a URL off the queue, downloads it, and writes the page to disk. As it stands, the producer is still a stub and the __main__ block runs a regex experiment instead of main().

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Producer: extract URLs from an HTML document, validate them, and add them
# to the to-crawl queue. Validation means checking:
#       is the URL a duplicate?
#       does the URL match the expected pattern?
# Consumer: get a URL from the to-crawl queue and fetch it.
import threading
import time
import sys
import re
import urllib.request
import urllib.parse
import html.parser
import chardet  # only referenced by the commented-out encoding experiments


class MyHtmlParser(html.parser.HTMLParser):
    # Stub: meant to parse fetched pages and hand links to the producer.
    def __init__(self, data):
        html.parser.HTMLParser.__init__(self)
        self.data = data


class Producer(threading.Thread):
    def __init__(self, name, queue):
        threading.Thread.__init__(self)
        self.name = name
        self.queue = queue

    def run(self):
        # Placeholder: the producer does not actually feed the queue yet.
        for i in range(50):
            pass
            # self.queue.put('a wonton')
            # print('producing wontons' + str(self.queue.qsize()))


class Consumer(threading.Thread):
    def __init__(self, name, queue):
        threading.Thread.__init__(self)
        self.name = name
        self.queue = queue
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        }

    def run(self):
        # Fetch one URL from the queue and save the page to a file named
        # after the site's host. Note: only a single URL is consumed.
        url = self.queue.get()
        self.headers['Host'] = urllib.parse.urlparse(url).netloc
        req = urllib.request.Request(url, headers=self.headers)
        with urllib.request.urlopen(req) as f:
            # lang_info_temp = chardet.detect(f.read())
            # print(lang_info_temp)
            # print(list(f.getheaders()))
            html_str = f.read().decode('utf-8')
        # lang_info = chardet.detect(data)
        # lang_info['encoding']
        # print(data.decode('utf-8'))
        with open(r'./' + self.headers['Host'], 'w', encoding='utf-8') as output:
            output.write(html_str)


def main():
    import queue
    to_crawl_queue = queue.Queue()
    to_crawl_queue.put('http://www.cma.gov.cn/')
    to_crawl_queue.put('http://www.sina.com.cn/')
    to_crawl_queue.put('https://www.baidu.com/')
    producer = Producer('xiaomachaoshou', to_crawl_queue)
    consumer = Consumer('wo', to_crawl_queue)
    producer.start()
    consumer.start()


if __name__ == '__main__':
    # Non-greedy match so each quoted *.html name is captured separately;
    # the greedy '".*html"' would swallow everything from the first quote
    # to the last one as a single match.
    p = r'".*?html"'
    real_p = re.compile(p)
    input_string = 'nishi sm yisi me "hello.html" hello world"cradle.html"'
    match_list = real_p.findall(input_string)
    print(match_list)  # ['"hello.html"', '"cradle.html"']
    # main()
    try:
        sys.exit(0)
    except SystemExit:
        print('The mini_spider has exited gracefully')
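The Producer above never actually produces anything, and MyHtmlParser is an empty shell. A minimal sketch of the producer role the header comments describe (pull hrefs out of the HTML with html.parser, skip duplicates, keep only links that match a pattern, enqueue the rest) might look like the following. LinkExtractor, ValidatingProducer, and the seen set are names invented for this sketch, not part of the original script.

import html.parser
import queue
import re
import threading


class LinkExtractor(html.parser.HTMLParser):
    # Hypothetical helper: collects href attributes from <a> tags.
    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


class ValidatingProducer(threading.Thread):
    # Producer sketch: parse an HTML string, validate each link, enqueue it.
    def __init__(self, html_str, url_pattern, out_queue):
        threading.Thread.__init__(self)
        self.html_str = html_str
        self.url_pattern = re.compile(url_pattern)
        self.queue = out_queue
        self.seen = set()  # answers "is the URL a duplicate?"

    def run(self):
        parser = LinkExtractor()
        parser.feed(self.html_str)
        for link in parser.links:
            # answers "does the URL match the expected pattern?"
            if link in self.seen or not self.url_pattern.match(link):
                continue
            self.seen.add(link)
            self.queue.put(link)


if __name__ == '__main__':
    q = queue.Queue()
    page = ('<a href="http://example.com/a.html">a</a>'
            '<a href="http://example.com/a.html">dup</a>'
            '<a href="ftp://example.com/b">skip</a>')
    producer = ValidatingProducer(page, r'https?://', q)
    producer.start()
    producer.join()
    print([q.get() for _ in range(q.qsize())])  # ['http://example.com/a.html']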
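The commented-out chardet calls show the author probing page encodings; hard-coding decode('utf-8') will raise UnicodeDecodeError on a GBK/GB2312 page. Note also that the commented experiment called f.read() twice, and the second read of an already-consumed HTTP response returns empty bytes. A small detect-then-decode sketch (fetch_text is a hypothetical helper, not from the original):

import urllib.request

import chardet


def fetch_text(url, headers=None):
    # Hypothetical helper: read the body once, then decode with whatever
    # charset chardet guesses, falling back to utf-8.
    req = urllib.request.Request(url, headers=headers or {})
    with urllib.request.urlopen(req) as f:
        raw = f.read()  # read exactly once; the response body is not seekable
    guess = chardet.detect(raw)  # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
    return raw.decode(guess['encoding'] or 'utf-8', errors='replace')

Using errors='replace' keeps a mis-guessed charset from crashing the whole crawl, at the cost of a few replacement characters in the saved page.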
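One limitation worth calling out: Consumer.run performs a single queue.get(), so two of the three seeded URLs are never fetched, and a naive "while True: queue.get()" loop would block forever once the queue drains. A common way to get the graceful exit the final print promises is a sentinel value plus task_done()/join(); the sketch below reuses the same seed URLs but is otherwise illustrative.

import queue
import threading

STOP = object()  # sentinel; any unique object works


def consume(q):
    while True:
        url = q.get()
        if url is STOP:
            q.task_done()
            break  # producer signalled that no more work is coming
        print('fetching', url)  # stand-in for the real download logic
        q.task_done()


if __name__ == '__main__':
    q = queue.Queue()
    for url in ('http://www.cma.gov.cn/',
                'http://www.sina.com.cn/',
                'https://www.baidu.com/'):
        q.put(url)
    q.put(STOP)  # one sentinel per consumer thread
    t = threading.Thread(target=consume, args=(q,))
    t.start()
    q.join()  # returns once every item has been marked task_done
    t.join()
    print('The mini_spider has exited gracefully')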