A Multithreaded Qiushibaike Crawler


Knowledge points:

  1. Setting the daemon flag on a Thread (see the first sketch below)
  2. Writing rows to csv with csv.DictWriter (see the second sketch below)
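Before the full crawler, here is a minimal sketch of the daemon-thread pattern it relies on (the names `jobs` and `worker` are illustrative, not from the crawler): a daemon thread dies as soon as the main thread exits, so workers that block forever in `queue.get()` cannot hang the process, while `Queue.join()` gives the main thread a way to wait until all enqueued work is finished.

```python
from queue import Queue
from threading import Thread

jobs = Queue()

def worker():
    while True:               # loops forever; never returns on its own
        item = jobs.get()     # blocks once the queue is empty
        print('processing', item)
        jobs.task_done()      # each get() must be matched for jobs.join() to pass

t = Thread(target=worker)
t.daemon = True               # modern spelling of t.setDaemon(True)
t.start()

for i in range(3):
    jobs.put(i)

jobs.join()                   # returns when every put() has been marked done
# the main thread exits here, and the daemon worker is torn down with it
```

And a minimal `csv.DictWriter` sketch, assuming the same field names the crawler uses; each dict passed to `writerow()` is mapped onto the column order fixed by `fieldnames`:

```python
import csv

fieldnames = ['username', 'laugh', 'comment', 'url', 'img_url']

# newline='' stops the csv module from emitting blank rows on Windows
with open('demo.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({'username': 'alice', 'laugh': '12', 'comment': '3',
                     'url': 'https://example.com/a/1', 'img_url': 'NA'})
```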
The full crawler:

```python
# -*- coding:utf-8 -*-
# Created on 2017/11/12
import csv
import re
from queue import Queue
from threading import Thread

import requests
from lxml import etree


class Qiubai(object):
    def __init__(self):
        self.queue_url = Queue()
        self.queue_response = Queue()
        self.queue_data = Queue()
        self.main_url = 'https://www.qiushibaike.com'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                                      ' AppleWebKit/537.36 (KHTML, like Gecko)'
                                      ' Chrome/60.0.3112.101 Safari/537.36'}
        self.url_pattern = 'https://www.qiushibaike.com/8hr/page/{}/'
        # newline='' avoids blank rows on Windows
        self.csvfile = open('qiubai_thread.csv', 'w', encoding='gbk', newline='')
        fieldnames = ['username', 'laugh', 'comment', 'url', 'img_url']
        self.writer = csv.DictWriter(self.csvfile, fieldnames=fieldnames)
        self.writer.writeheader()

    # Generate the page urls and put them on the url queue
    def generate_url(self):
        for i in range(1, 14):
            self.queue_url.put(self.url_pattern.format(i))

    # Take a url, fetch it, and put the response on the response queue
    def generate_response(self):
        while True:
            url = self.queue_url.get()
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                print('{} fetched'.format(url))
                self.queue_response.put(response)
            else:
                print('{} failed, re-queued'.format(url))
                self.queue_url.put(url)  # retry failed pages
            self.queue_url.task_done()

    # Take an item off the data queue and write it to csv
    def save_to_csv(self):
        while True:
            data = self.queue_data.get()
            self.writer.writerow(data)
            self.queue_data.task_done()

    # Take a response, parse it, and put the items on the data queue
    def parse(self):
        while True:
            response = self.queue_response.get()
            html = etree.HTML(response.content)
            divs = html.xpath('//div[@id="content-left"]/div')
            for div in divs:
                item = {}
                # anonymous posts have no username node
                username = div.xpath('./div/a/h2/text()')
                item['username'] = re.sub(r'["\s]', '', username[0]) if username else '匿名用户'
                print(item['username'])
                item['laugh'] = div.xpath('.//span[@class="stats-vote"]/i/text()')[0]
                item['comment'] = div.xpath('.//span[@class="stats-comments"]/a/i/text()')[0]
                item['url'] = self.main_url + div.xpath('./a/@href')[0]
                # image links may be protocol-relative; complete them
                img_url = div.xpath('./div[@class="thumb"]/a/img/@src')
                if img_url:
                    img = img_url[0]
                    item['img_url'] = img if img.startswith('http:') else 'http:' + img
                else:
                    item['img_url'] = 'NA'
                self.queue_data.put(item)
            self.queue_response.task_done()

    def run(self):
        thread_list = []
        # url-producing thread
        t_generate_url = Thread(target=self.generate_url)
        thread_list.append(t_generate_url)
        # fetching threads
        for i in range(3):
            t = Thread(target=self.generate_response)
            thread_list.append(t)
        # parsing threads
        for i in range(3):
            t = Thread(target=self.parse)
            thread_list.append(t)
        # csv-writing thread
        t_save = Thread(target=self.save_to_csv)
        thread_list.append(t_save)
        # daemon workers die when the main thread exits
        for t in thread_list:
            t.daemon = True
            t.start()
        # make sure all urls are enqueued before joining the queues,
        # otherwise queue_url.join() could pass while it is still empty
        t_generate_url.join()
        # each join() returns once every item put on that queue has been
        # marked done; after all three pass, the pipeline is drained, the
        # main thread ends, and the daemon workers are shut down with it
        for q in [self.queue_url, self.queue_response, self.queue_data]:
            q.join()
        self.csvfile.close()  # flush buffered rows


if __name__ == '__main__':
    qiubai = Qiubai()
    qiubai.run()
```
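Note the shutdown sequence at the end of `run()`: none of the worker threads ever returns, so they cannot be joined directly. Instead `run()` joins the three queues, which block until every `put()` has been matched by a `task_done()`; once the pipeline is drained the main thread exits, and because every worker was marked as a daemon, the threads left blocking in `get()` are discarded instead of hanging the process.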