A Multithreaded Crawler for Qiushibaike

The spider below wires four stages into a producer-consumer pipeline over three queue.Queue instances: one thread builds the URL list, three threads download and parse pages, one thread extracts the fields, and one thread writes the results to disk. Each stage get()s from its input queue, put()s to the next, and calls task_done() so that run() can wait for the whole pipeline to drain.

# coding=utf-8
import json
import threading
from queue import Queue

import requests
from lxml import etree
from retrying import retry


class QiushiSpider(object):
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/60.0.3112.113 Safari/537.36"}
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.html_queue = Queue()     # parsed HTML trees waiting for extraction
        self.content_queue = Queue()  # extracted records waiting to be saved

    def get_url_list(self):
        # 1. Enqueue the URLs for pages 1 through 13.
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    @retry(stop_max_attempt_number=3)
    def _parse_url(self, url):
        response = requests.get(url, headers=self.headers, timeout=5)
        assert response.status_code == 200  # any other status triggers a retry
        return etree.HTML(response.content)

    def parse_url(self):
        # 2. Send requests and collect the responses.
        while 1:
            url = self.url_queue.get()
            print("now parsing", url)
            try:
                html = self._parse_url(url)
            except Exception as e:
                print(e)
                html = None
            if html is not None:  # don't hand a failed fetch downstream
                self.html_queue.put(html)
            self.url_queue.task_done()

    def get_content_list(self):
        # 3. Extract the data from each page.
        while 1:
            html = self.html_queue.get()
            div_list = html.xpath("//div[contains(@id,'qiushi_tag_')]")
            content_list = []
            for div in div_list:
                item = {}
                src = div.xpath("./div[@class='author clearfix']/a[1]/img/@src")
                item["src"] = "https:" + src[0] if len(src) > 0 else None
                author_name = div.xpath("./div[@class='author clearfix']/a[1]/img/@alt")
                item["author_name"] = author_name[0] if len(author_name) > 0 else None
                author_gender = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = (author_gender[0].split(" ")[-1].replace("Icon", "")
                                         if len(author_gender) > 0 else None)
                author_age = div.xpath(".//div[contains(@class,'articleGender')]/text()")
                item["author_age"] = author_age[0] if len(author_age) > 0 else None
                duanzi_content = div.xpath(".//div[@class='content']/span/text()")
                item["duanzi_content"] = [i.replace("\n", "") for i in duanzi_content]
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        # 4. Append each batch of records to the output file as JSON.
        while 1:
            content_list = self.content_queue.get()
            file_path = './qiushi.txt'
            with open(file_path, "a", encoding='utf-8') as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False, indent=4))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        thread_list = []
        # 1. Build the URL list.
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        # 2. Walk the URL list, sending requests and getting responses
        #    (three downloader threads).
        for i in range(3):
            t_parse_url = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse_url)
        # 3. Extract the data.
        t_get_content = threading.Thread(target=self.get_content_list)
        thread_list.append(t_get_content)
        # 4. Save the results.
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.daemon = True  # t.setDaemon(True) is deprecated since Python 3.10
            t.start()
        # Wait until every queue has been fully processed, then exit;
        # the daemon workers die with the main thread.
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()


if __name__ == '__main__':
    qiushi = QiushiSpider()
    qiushi.run()
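A note on the @retry decorator: with the retrying library, stop_max_attempt_number=3 means the failed assert (or any other exception) inside _parse_url triggers up to three attempts in total, after which the last exception propagates to the caller, where parse_url catches it. Here is a minimal, self-contained sketch of that behavior; the always_fails function is made up for illustration:

from retrying import retry

calls = {"n": 0}

@retry(stop_max_attempt_number=3)
def always_fails():  # hypothetical stand-in for a flaky request
    calls["n"] += 1
    raise ValueError("simulated network error")

try:
    always_fails()
except Exception:
    # by default retrying re-raises the last exception once attempts run out
    print("gave up after", calls["n"], "attempts")  # -> 3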
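The clean shutdown at the end of run() is also worth spelling out: every worker loops forever, so the threads are marked as daemons and the main thread instead waits on Queue.join(), which returns once task_done() has been called for every item put on the queue. When the last queue drains, the main thread exits and the daemon workers are killed with it. A minimal sketch of the same pattern in isolation (the worker and its items are illustrative):

import threading
from queue import Queue

q = Queue()

def worker():
    while True:
        item = q.get()
        print("handled", item)  # stand-in for real work
        q.task_done()           # exactly one task_done() per get()

t = threading.Thread(target=worker)
t.daemon = True  # dies automatically once the main thread exits
t.start()

for i in range(5):
    q.put(i)

q.join()  # blocks until task_done() has been called five times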