【Python】抓取人人都是产品经理的文章

来源：互联网 | 发布：淘宝招牌图片素材尺寸 | 编辑：程序博客网 | 时间：2024/06/04 00:53

简介

- 使用 Python 3.5
- 支持自动切换 User-Agent（基于 fake_useragent）
- 支持增量爬取（基于 pybloom）
- 支持中断续爬

代码

# -*- coding: utf-8 -*-
# -------------------------------------
# author: maqingxiong
# date:   2017-10-09
# desc:   Crawl the latest articles from woshipm.com
# -------------------------------------
import io
import sys

import requests
from fake_useragent import UserAgent
from pybloom import ScalableBloomFilter

# Re-wrap stdout so that everything printed by the spider is encoded as
# gb18030. This runs at import time, exactly as in the original script.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


class PmSpider(object):
    """Crawler for the woshipm.com article stream API.

    Listing-page URLs that were fetched successfully are appended to
    ``CRAWLED_URL_FILE`` and loaded into a bloom filter on start-up, so an
    interrupted run resumes where it stopped and finished pages are skipped.

    NOTE(review): de-duplication works on listing-page URLs, not on
    individual articles; delete ``CRAWLED_URL_FILE`` to force a full
    re-crawl.
    """

    # File holding one already-crawled URL per line.
    CRAWLED_URL_FILE = 'crawled_url.txt'
    # Listing endpoint; ``{}`` is replaced by the page number. The spelling
    # of the ``action`` value is kept exactly as the original request used it.
    BASE_URL = 'http://www.woshipm.com/__api/v1/stream-list?paged={}&action=laodpost'

    def __init__(self):
        """Create the HTTP session, the User-Agent source and the URL filter."""
        self.session = requests.session()
        self.ua = UserAgent()
        self.sbf = ScalableBloomFilter()
        self.load_url_in_bloomfilter()

    def load_url_in_bloomfilter(self):
        """Load previously crawled URLs from ``CRAWLED_URL_FILE`` into the filter.

        A missing file is treated as an empty history (first run) instead of
        raising ``FileNotFoundError``.
        """
        try:
            with open(self.CRAWLED_URL_FILE, 'r', encoding='utf-8') as f:
                for line in f:
                    url = line.strip()
                    if url:
                        self.sbf.add(url)
        except FileNotFoundError:
            # Nothing has been crawled yet.
            pass

    def construct_all_url(self, page_count=10):
        """Return the listing URLs for pages ``1..page_count`` (default 10)."""
        return [self.BASE_URL.format(page) for page in range(1, page_count + 1)]

    def get_req_headers(self):
        """Build request headers with a freshly randomised User-Agent."""
        headers = {
            'User-Agent': self.ua.random,
            'Host': 'www.woshipm.com',
            'Referer': 'http://www.woshipm.com/'
        }
        return headers

    def get_html(self, url, max_retries=3, timeout=10):
        """Fetch *url* and return its decoded JSON body.

        Args:
            url: Address to request.
            max_retries: Extra attempts made after the first one when the
                request or the JSON decoding fails.
            timeout: Per-request timeout in seconds.

        Returns:
            The decoded JSON object, or ``None`` when the server answers with
            a non-200 status or every attempt failed.
        """
        for _ in range(max_retries + 1):
            # New headers per attempt so each retry uses another User-Agent.
            headers = self.get_req_headers()
            try:
                response = self.session.get(url=url, headers=headers,
                                            timeout=timeout)
                if response.status_code != 200:
                    return None
                return response.json()
            except (requests.RequestException, ValueError):
                # Network error or invalid JSON: try again, but only a
                # bounded number of times (the original recursed forever).
                continue
        print('giving up on {}'.format(url), file=sys.stderr)
        return None

    def parse_html(self, html):
        """Print id, title, permalink, date and image of every payload item.

        Args:
            html: Decoded JSON response; must provide a ``'payload'`` entry
                whose items contain the five printed keys.
        """
        for item in html['payload']:
            for field in ('id', 'title', 'permalink', 'date', 'image'):
                print(item[field])

    def run(self):
        """Crawl every listing page that is not yet recorded as crawled."""
        # Append mode keeps the existing history; 'w' would wipe the URLs
        # that were just loaded into the bloom filter.
        with open(self.CRAWLED_URL_FILE, 'a', encoding='utf-8') as f:
            for url in self.construct_all_url():
                if url in self.sbf:
                    continue
                html = self.get_html(url=url)
                if not html:
                    # Leave failed pages unrecorded so a later run retries them.
                    continue
                self.parse_html(html)
                # Record only after success and flush immediately so an
                # interruption does not lose the progress made so far.
                f.write(url + '\n')
                f.flush()
                self.sbf.add(url)


if __name__ == "__main__":
    pm_spider = PmSpider()
    pm_spider.run()

阅读全文

0 0