多线程爬虫之糗事百科
来源:互联网 发布:如何给淘宝号升级 编辑:程序博客网 时间:2024/06/05 20:56
# coding=utf-8
import json
import threading
from queue import Queue

import requests
from lxml import etree
from retrying import retry


class QiushiSpider(object):
    """Multithreaded scraper for the qiushibaike.com hot-post pages.

    Pipeline (each stage is a daemon thread, stages talk via queues):
        get_url_list  -> url_queue
        parse_url     -> html_queue     (3 fetcher threads)
        get_content_list -> content_queue
        save_content_list -> ./qiushi.txt
    """

    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"}
        # Hand-off queues between pipeline stages.
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        """Seed url_queue with hot-section pages 1..13."""
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    @retry(stop_max_attempt_number=3)
    def _parse_url(self, url):
        """Fetch one page and return its parsed lxml tree.

        Retries up to 3 times (via @retry); a non-200 status raises
        AssertionError, which counts as a failed attempt.
        """
        response = requests.get(url, headers=self.headers, timeout=5)
        assert response.status_code == 200
        return etree.HTML(response.content)

    def parse_url(self):
        """Fetcher loop: take URLs off url_queue, push parsed trees to html_queue."""
        while 1:
            url = self.url_queue.get()
            print("now parsing", url)
            try:
                html = self._parse_url(url)
            except Exception as e:
                print(e)
                html = None
            # BUG FIX: the original unconditionally enqueued `html`, so a
            # failed fetch put None into html_queue and get_content_list
            # crashed on None.xpath(...). Only forward successful pages.
            if html is not None:
                self.html_queue.put(html)
            self.url_queue.task_done()

    def get_content_list(self):
        """Extractor loop: build one dict per post from each parsed page."""
        while 1:
            html = self.html_queue.get()
            div_list = html.xpath("//div[contains(@id,'qiushi_tag_')]")
            content_list = []
            for div in div_list:
                item = {}
                # Avatar URL is protocol-relative on the page; prefix scheme.
                src = div.xpath("./div[@class='author clearfix']/a[1]/img/@src")
                item["src"] = "https:" + src[0] if len(src) > 0 else None
                author_name = div.xpath("./div[@class='author clearfix']/a[1]/img/@alt")
                item["author_name"] = author_name[0] if len(author_name) > 0 else None
                # Gender is encoded in a CSS class like "... womenIcon" / "... manIcon".
                author_gender = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = author_gender[0].split(" ")[-1].replace("Icon", "") if len(
                    author_gender) > 0 else None
                author_age = div.xpath(".//div[contains(@class,'articleGender')]/text()")
                item["author_age"] = author_age[0] if len(author_age) > 0 else None
                duanzi_content = div.xpath(".//div[@class='content']/span/text()")
                item["duanzi_content"] = [i.replace("\n", "") for i in duanzi_content]
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        """Sink loop: append each extracted item to ./qiushi.txt as JSON."""
        while 1:
            content_list = self.content_queue.get()
            file_path = './qiushi.txt'
            with open(file_path, "a", encoding='utf-8') as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False, indent=4))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        """Start all pipeline threads and block until every queue drains."""
        thread_list = []
        # 1. produce the URL list
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        # 2. fetch pages concurrently (3 workers)
        for i in range(3):
            t_parse_url = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse_url)
        # 3. extract data
        t_get_content = threading.Thread(target=self.get_content_list)
        thread_list.append(t_get_content)
        # 4. persist results
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            # Daemon threads let the process exit once the queues are drained;
            # t.setDaemon(True) is deprecated since Python 3.10.
            t.daemon = True
            t.start()
        # join() returns when every put() has been matched by a task_done().
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()


if __name__ == '__main__':
    qiushi = QiushiSpider()
    qiushi.run()
阅读全文
0 0
- 多线程爬虫之糗事百科
- 爬虫——多线程糗事百科案例
- 爬虫——多线程糗事百科案例
- Python爬虫实战之爬糗事百科
- Python爬虫实例2-多线程爬虫抓取糗事百科数据
- 糗事百科爬虫
- 糗事百科爬虫
- python爬虫糗事百科
- 糗事百科爬虫改进
- 糗事百科交互式爬虫
- 爬虫实战--糗事百科
- 糗事百科简单爬虫
- [python3]糗事百科爬虫
- Python糗事百科爬虫
- python爬虫“糗事百科”
- 爬虫-糗事百科段子
- python爬虫糗事百科
- python糗事百科爬虫
- 春思
- 基于python3的socket文件传输和校验
- 并行信号处理技术-序
- Graph Valid Tree
- HDU
- 多线程爬虫之糗事百科
- Git的简单使用
- spring学习笔记十五 切面的优先级
- Java并发编程:volatile关键字解析
- 9.11总结
- MySQL日期函数总结
- codility Max-Nonoverlapping-Segments
- HGDB(pg)行锁现象
- 102. Binary Tree Level Order Traversal