Building a Zhihu Back Garden with Scrapy, Part 3: Crawling Followee Data (Asynchronously Loaded JSON)


I. Crawling Approach

1. Log in using the method from the previous post, then start crawling data.

2. Pick any user as an entry point, then use Chrome DevTools to find the API endpoint for the users they follow and the parameters it takes.

3. Fetch the JSON the API returns and process it.

4. From the followees with more than 10,000 followers, pick one at random and crawl the people that user follows; repeat the cycle (see the sketch after this list).
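In outline, the loop looks roughly like this. This is a minimal sketch using requests rather than Scrapy, and it assumes you already have a valid authorization header from a logged-in session (the header name and API shape are the ones captured in section II below):

import requests
from random import choice

FOLLOWEES = ('https://www.zhihu.com/api/v4/members/{user}/followees'
             '?include=data[*].follower_count&offset=0&limit=20')
# Placeholder: copy the real value from your own logged-in session.
HEADERS = {'authorization': 'Bearer <your token>'}

def crawl_followees(user, candidates):
    """Walk every followee page of `user`, collect big accounts, pick the next hop."""
    url = FOLLOWEES.format(user=user)
    while True:
        page = requests.get(url, headers=HEADERS).json()
        for person in page.get('data', []):
            if person['follower_count'] > 10000:
                candidates.append(person['url_token'])  # future entry point
        if page['paging']['is_end']:
            break
        url = page['paging']['next']  # the API hands back the next page's full URL
    return choice(candidates)  # random user to continue the walk from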

II. Step by Step

1. Pick any user, e.g. Crossin, and click "Following" to see whom he follows:

https://www.zhihu.com/people/crossin/following

 


2. Press F12 to open DevTools and switch to the Network tab. Find the request URL of the API and the parameters it is called with:

https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}

include: data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics

offset: 20

limit: 20
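Filled in for the first page of Crossin's followees (a sketch; the include string is the one captured above):

follows_url = ('https://www.zhihu.com/api/v4/members/{user}/followees'
               '?include={include}&offset={offset}&limit={limit}')
include = ('data[*].answer_count,articles_count,gender,follower_count,'
           'is_followed,is_following,badge[?(type=best_answerer)].topics')
# offset=0, limit=20 requests records 1-20; offset=20 would be the next page.
first_page = follows_url.format(user='crossin', include=include, offset=0, limit=20)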

Key point: save the authorization value into your request headers; without it the API rejects the request with a 401 error and no data comes back.
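A quick way to see this for yourself (a sketch with requests; the token value is a placeholder you must copy from your own session's request headers in DevTools):

import requests

url = ('https://www.zhihu.com/api/v4/members/crossin/followees'
       '?include=data[*].follower_count&offset=0&limit=20')
print(requests.get(url).status_code)                   # 401: unauthorized
headers = {'authorization': 'Bearer <your token>'}     # placeholder value
print(requests.get(url, headers=headers).status_code)  # 200 once the header is set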


Click Preview to see the structure of the returned JSON, then page through the list to capture the next-page requests.
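Abridged, a response looks like this (the values are illustrative; the fields shown are exactly the ones the spider below reads):

{
  "data": [
    {
      "name": "...",
      "url_token": "...",
      "answer_count": 123,
      "articles_count": 4,
      "follower_count": 56789,
      "gender": 1
    }
  ],
  "paging": {
    "is_end": false,
    "next": "https://www.zhihu.com/api/v4/members/crossin/followees?include=...&offset=40&limit=20"
  }
}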



myspider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest
from zhihu.items import ZhihuItem
import time
from PIL import Image
import json
from random import choice


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    headers_zhihu = {
        'Host': 'www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Referer': 'https://www.zhihu.com',
        'If-None-Match': "FpeHbcRb4rpt_GuDL6-34nrLgGKd.gz",
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'authorization': 'Bearer Mi4wQUdCQzBlWkZEUXdBTU1MYmVyUVBEQmNBQUFCaEFsVk4tWXVlV1FCR3pSbE1WbVpQeU5OODdrTUNlM21jZ2ZTUnBB|1500970746|f8c1997fd0e539beec76bcbb15cffd29971d7c05'
    }

    # API endpoint for a user's followee list
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    start_user = 'crossin'
    user_token = []  # collects users with more than 10,000 followers

    def start_requests(self):
        return [Request('https://www.zhihu.com/',
                        meta={'cookiejar': 1},
                        headers=self.headers_zhihu,
                        callback=self.captcha)]

    def captcha(self, response):
        xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract()[0]
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login&lang=en'
        return [Request(captcha_url,
                        callback=self.parser_captcha,
                        meta={'cookiejar': response.meta['cookiejar'], 'xsrf': xsrf})]

    def parser_captcha(self, response):
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
        captcha = raw_input('Enter the captcha: ')
        xsrf = response.meta['xsrf']
        return FormRequest('https://www.zhihu.com/login/phone_num',
                           method='POST',
                           meta={'cookiejar': response.meta['cookiejar']},
                           callback=self.after_login,
                           dont_filter=True,
                           headers=self.headers_zhihu,
                           formdata={
                               'phone_num': '138*********',
                               'password': '*******',
                               '_xsrf': xsrf,
                               'captcha_type': 'en',
                               'captcha': captcha,
                           })

    def after_login(self, response):
        json_file = json.loads(response.text)  # the login endpoint answers with JSON
        if json_file['r'] == 0:
            print('Login succeeded... starting to crawl...')
            yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, offset=0, limit=20),
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
        else:
            print(json_file['msg'].encode('utf-8'))

    def parse_follows(self, response):
        results = json.loads(response.text)
        if 'data' in results:
            for result in results.get('data'):  # one record per followee
                item = ZhihuItem()
                item['name'] = result.get('name').encode('utf-8')
                url_token = result.get('url_token')
                item['url_token'] = 'https://www.zhihu.com/people/' + url_token
                item['answer_count'] = result.get('answer_count')
                item['articles_count'] = result.get('articles_count')
                follower_count = result.get('follower_count')
                item['follower_count'] = follower_count
                item['gender'] = 'female' if result.get('gender') == 0 else 'male'
                if follower_count > 10000:
                    self.user_token.append(url_token)  # remember big accounts as future entry points
                yield item
        if 'paging' in results and results.get('paging').get('is_end') == False:
            # Not the last page yet: keep paging through this user's followees.
            next_page = results.get('paging').get('next')
            yield Request(next_page,
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
        else:
            # Last page reached: pick a saved user at random and continue from them.
            start_user = choice(self.user_token)
            yield Request(self.follows_url.format(user=start_user, include=self.follows_query, offset=0, limit=20),
                          callback=self.parse_follows,
                          meta={'cookiejar': response.meta['cookiejar']},
                          headers=self.headers_zhihu)
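Two behaviors are worth noting. The follow-up Requests are created without dont_filter=True, so Scrapy's built-in duplicate filter drops any followee page that has already been queued, which keeps the random walk from revisiting users. And choice(self.user_token) assumes at least one account with more than 10,000 followers was collected before the first followee list runs out, which holds for almost any realistic entry user.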

items.py

import scrapy


class ZhihuItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    follower_count = scrapy.Field()
    gender = scrapy.Field()
    url_token = scrapy.Field()

pipelines.py saves the items to an Excel file.

# -*- coding: utf-8 -*-
import time

from openpyxl import Workbook


class ZhihuPipeline(object):

    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['Name', 'Gender', 'Answers', 'Articles', 'Followers', 'URL'])

    def process_item(self, item, spider):
        line = [item['name'], item['gender'], item['answer_count'],
                item['articles_count'], item['follower_count'], item['url_token']]
        self.ws.append(line)
        now = time.strftime('%Y-%m-%d', time.localtime())
        filename = '/home/soft/zhihu/' + now + '.xlsx'
        self.wb.save(filename)  # save on every item so a crash loses nothing
        return item
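One step the post does not show: the pipeline only runs once it is registered in the project's settings.py (the zhihu package name matches the imports above):

ITEM_PIPELINES = {
    'zhihu.pipelines.ZhihuPipeline': 300,
}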



III. Running the Code

[root@master zhihu]# scrapy crawl myspider



(Screenshot: the crawl in progress.)



(Screenshot: the contents of the saved Excel file.)



IV. Going Further

The same pattern extends to other lists: the API endpoint URLs and parameters for a user's followers and for their Q&A are found the same way.
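For example, by analogy with the followees endpoint above, the followers and answers lists live at URLs of the same shape (these are inferred, not captured here; verify the exact paths and include fields yourself in the Network tab):

https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}
https://www.zhihu.com/api/v4/members/{user}/answers?include={include}&offset={offset}&limit={limit}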



