[置顶]scrapy 知乎关键字爬虫spider代码

来源:互联网 发布:亚瑟士t4c1n4993 知乎 编辑:程序博客网 时间:2024/06/05 14:10
以下是spider部分的代码。爬知乎是需要登录的,建议使用cookie就可以了,如果需要爬的数量预计不多,请不要使用过大的线程数量,否则会过快的被封杀,需要等十几个小时账号才能重新使用,比起损失的这十几个小时的时间,即使是单线程也能够爬取很多页面了,得不偿失。

知乎是基于账号策略反爬的,换ua和ip并没用,如果需要高并发,需要采用几十个账号的方式来爬取。
  1 # -*- coding: utf-8 -*-  2 import scrapy  3 from scrapy import Request  4 from scrapy import log  5 import logging  6 #from zhihu.items import ZhihuItem  7 from zhihu.items import ZhihuItem  8 from scrapy_redis.spiders import RedisSpider  9 import re 10 import json 11 import time 12  13 class BaoxianSpider(RedisSpider):       ##使用redis分布式 14  15     name = "baoxian" 16     allowed_domains = ["zhihu.com"] 17     #redis_key='baoxian:start_urls' 18     keywords='软件测试'                                        ###要爬的关键词 19     from urllib import quote 20     urlencode_keywords=quote(keywords) 21  22     start_urls = ['https://www.zhihu.com/r/search?q='+urlencode_keywords+'&type=content&offset=0'] #'https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=0' 23     def start_requests(self): 24         for url in self.start_urls: 25             yield Request(url=url, callback=self.parse,dont_filter=True) 26  27     def parse(self, response): 28         body=response.body  #{"paging":{"next":"\/r\/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=50"},"htmls" 29         #print body 30  31         #获取问题链接 32         question_href_reg=r'<div class=\\"title\\"><a target=\\"_blank\\" href=\\"\\/question\\/(.*?)\\"' 33         all_question_href=re.findall(question_href_reg,body) 34         print 'all_question_href:',all_question_href 35         for aqh in all_question_href: 36             question_href='https://www.zhihu.com/question/'+str(aqh) 37             yield Request(url=question_href, callback=self.parse_question,dont_filter=True) 38             print question_href 39  40             log.msg("question_href:%s \n list_question_page:%s"%(question_href,response.url), level=log.INFO) 41             #self.log 42         #获取下一页的链接 43  44         reg=r'{"paging":{"next":"(\\/r\\/search\?q=.*?&type=content&offset=.*?)"},"htmls"' 45         next_page=re.findall(reg,body) 46         print '下一页问题:',next_page 47         if len(next_page): 48             #print 
next_page[0]   #https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=10 49             next_page_url='https://www.zhihu.com'+ next_page[0].replace('\\','') 50             print 'next_page_url:',next_page_url 51             yield Request(url=next_page_url, callback=self.parse,dont_filter=True) 52             log.msg("next_page_url:%s"%next_page_url, level=log.INFO) 53  54                                            #data-type=\"Answer\"><div class=\"title\"><a target=\"_blank\" href=\"\/question\/22316395\" 55  56  57     def parse_question(self,response):                             ####问题详情页面 58         #print response.body 59  60         print 'response.url:',response.url 61         title=response.xpath('//h1[@class="QuestionHeader-title"]/text()').extract_first() 62         print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) 63         print 'title:',title 64         #editableDetail&quot;:&quot;,国内的保险员说风险太大,不受法律保护什么的。大神推荐我赴港买保险吗?&quot;,&quot;visitCount&quot 65         reg='editableDetail&quot;:&quot;([\s\S]*?)&quot;,&quot;visitCount&quot' 66         content_match=re.findall(reg,response.body) 67         if  content_match: 68             content=content_match[0] 69         else: 70             content=''               #有可能问题无具体描述 71         print 'content:',content 72         question={} 73         question['url']=response.url 74         question['title']=title 75  76         question['content']=content 77         #https://www.zhihu.com/question/19904068 78         question['comment']=[] 79         
#https://www.zhihu.com/api/v4/questions/20214716/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=3&offset=3 80         answer_json='https://www.zhihu.com/api/v4/questions/'+re.findall('(\d+)',response.url)[0]+'/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0' 81         print 'answer_json:',answer_json 82         yield Request(url=answer_json, callback=self.parse_json,meta=question,dont_filter=False) 83         """ 84         item=ZhihuItem() 85         item['title']=question['title'] 86         item['url']=question['url'] 87         item['content']=question['content'] 88         yield item 89         print item 90         """ 91  92     def parse_json(self,response):                           ####答案列表 93         meta=response.meta 94         dict=json.loads(response.body) 95  96         #print 'dict:',dict 97         print 'dcit to json:',json.dumps(dict,ensure_ascii=False) 98         comment_list=meta['comment'] 99         for data  in  dict['data']:      
              # dict['data']是列表,每个元素是字典100             try:101                 comment_dict={}102                 comment_dict['comment_content']=data['content']103                 if data['author']['name']:104                     comment_dict['author']=data['author']['name']105                 else:106                     comment_dict['author']=''107                 comment_dict['voteup_count']=data['voteup_count']108                 comment_dict['comment_count']=data['comment_count']109                 comment_dict['comment_time']=time.strftime('%Y-%m-%d',time.localtime(data['created_time']))110                 comment_list.append(comment_dict)111             except Exception,e:112                 print e113         meta['comment']=comment_list114         meta['answer_num']=dict['paging']['totals']115 116 117 118         if dict['paging']['is_end']==False:             ###自动翻页 119             yield Request(url=dict['paging']['next'], callback=self.parse_json,meta=meta,dont_filter=False)120         else:121             #log.msg("last:%s"%next_page_url, level=log.INFO)122             print 'last:',meta['title'],meta['url'] ,meta['content'],meta['answer_num'],len(meta['comment'])#,meta['comment']123             item=ZhihuItem()124             item['title']=meta['title']125             item['url']=meta['url']126             item['content']=meta['content']127             item['answer_num']=meta['answer_num']128             item['comment']=meta['comment']129             yield item

 

以下是运行结果,存储用的是 MongoDB

 

 

 

comment的内容

 

原创粉丝点击