[置顶]scrapy 知乎关键字爬虫spider代码
来源:互联网 发布:亚瑟士t4c1n4993 知乎 编辑:程序博客网 时间:2024/06/05 14:10
以下是spider部分的代码。爬知乎是需要登录的,建议使用cookie就可以了,如果需要爬的数量预计不多,请不要使用过大的线程数量,否则会过快的被封杀,需要等十几个小时账号才能重新使用,比起损失的这十几个小时的时间,即使是单线程也能够爬取很多页面了,得不偿失。
知乎是基于账号策略反爬的,换ua和ip并没用,如果需要高并发,需要采用几十个账号的方式来爬取。
1 # -*- coding: utf-8 -*- 2 import scrapy 3 from scrapy import Request 4 from scrapy import log 5 import logging 6 #from zhihu.items import ZhihuItem 7 from zhihu.items import ZhihuItem 8 from scrapy_redis.spiders import RedisSpider 9 import re 10 import json 11 import time 12 13 class BaoxianSpider(RedisSpider): ##使用redis分布式 14 15 name = "baoxian" 16 allowed_domains = ["zhihu.com"] 17 #redis_key='baoxian:start_urls' 18 keywords='软件测试' ###要爬的关键词 19 from urllib import quote 20 urlencode_keywords=quote(keywords) 21 22 start_urls = ['https://www.zhihu.com/r/search?q='+urlencode_keywords+'&type=content&offset=0'] #'https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=0' 23 def start_requests(self): 24 for url in self.start_urls: 25 yield Request(url=url, callback=self.parse,dont_filter=True) 26 27 def parse(self, response): 28 body=response.body #{"paging":{"next":"\/r\/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=50"},"htmls" 29 #print body 30 31 #获取问题链接 32 question_href_reg=r'<div class=\\"title\\"><a target=\\"_blank\\" href=\\"\\/question\\/(.*?)\\"' 33 all_question_href=re.findall(question_href_reg,body) 34 print 'all_question_href:',all_question_href 35 for aqh in all_question_href: 36 question_href='https://www.zhihu.com/question/'+str(aqh) 37 yield Request(url=question_href, callback=self.parse_question,dont_filter=True) 38 print question_href 39 40 log.msg("question_href:%s \n list_question_page:%s"%(question_href,response.url), level=log.INFO) 41 #self.log 42 #获取下一页的链接 43 44 reg=r'{"paging":{"next":"(\\/r\\/search\?q=.*?&type=content&offset=.*?)"},"htmls"' 45 next_page=re.findall(reg,body) 46 print '下一页问题:',next_page 47 if len(next_page): 48 #print next_page[0] #https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=10 49 next_page_url='https://www.zhihu.com'+ next_page[0].replace('\\','') 50 print 'next_page_url:',next_page_url 51 yield Request(url=next_page_url, callback=self.parse,dont_filter=True) 52 
log.msg("next_page_url:%s"%next_page_url, level=log.INFO) 53 54 #data-type=\"Answer\"><div class=\"title\"><a target=\"_blank\" href=\"\/question\/22316395\" 55 56 57 def parse_question(self,response): ####问题详情页面 58 #print response.body 59 60 print 'response.url:',response.url 61 title=response.xpath('//h1[@class="QuestionHeader-title"]/text()').extract_first() 62 print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) 63 print 'title:',title 64 #editableDetail":",国内的保险员说风险太大,不受法律保护什么的。大神推荐我赴港买保险吗?","visitCount" 65 reg='editableDetail":"([\s\S]*?)","visitCount"' 66 content_match=re.findall(reg,response.body) 67 if content_match: 68 content=content_match[0] 69 else: 70 content='' #有可能问题无具体描述 71 print 'content:',content 72 question={} 73 question['url']=response.url 74 question['title']=title 75 76 question['content']=content 77 #https://www.zhihu.com/question/19904068 78 question['comment']=[] 79 #https://www.zhihu.com/api/v4/questions/20214716/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=3&offset=3 80 
answer_json='https://www.zhihu.com/api/v4/questions/'+re.findall('(\d+)',response.url)[0]+'/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0' 81 print 'answer_json:',answer_json 82 yield Request(url=answer_json, callback=self.parse_json,meta=question,dont_filter=False) 83 """ 84 item=ZhihuItem() 85 item['title']=question['title'] 86 item['url']=question['url'] 87 item['content']=question['content'] 88 yield item 89 print item 90 """ 91 92 def parse_json(self,response): ####答案列表 93 meta=response.meta 94 dict=json.loads(response.body) 95 96 #print 'dict:',dict 97 print 'dcit to json:',json.dumps(dict,ensure_ascii=False) 98 comment_list=meta['comment'] 99 for data in dict['data']: # dict['data']是列表,每个元素是字典100 try:101 comment_dict={}102 comment_dict['comment_content']=data['content']103 if data['author']['name']:104 comment_dict['author']=data['author']['name']105 else:106 comment_dict['author']=''107 comment_dict['voteup_count']=data['voteup_count']108 comment_dict['comment_count']=data['comment_count']109 comment_dict['comment_time']=time.strftime('%Y-%m-%d',time.localtime(data['created_time']))110 comment_list.append(comment_dict)111 except Exception,e:112 print e113 meta['comment']=comment_list114 meta['answer_num']=dict['paging']['totals']115 116 117 118 if dict['paging']['is_end']==False: ###自动翻页 119 yield Request(url=dict['paging']['next'], callback=self.parse_json,meta=meta,dont_filter=False)120 else:121 #log.msg("last:%s"%next_page_url, 
level=log.INFO)122 print 'last:',meta['title'],meta['url'] ,meta['content'],meta['answer_num'],len(meta['comment'])#,meta['comment']123 item=ZhihuItem()124 item['title']=meta['title']125 item['url']=meta['url']126 item['content']=meta['content']127 item['answer_num']=meta['answer_num']128 item['comment']=meta['comment']129 yield item
附上运行结果,存储用的是 MongoDB。
comment的内容
阅读全文
0 0
- [置顶]scrapy 知乎关键字爬虫spider代码
- scrapy 入门教程 爬虫 Spider
- scrapy爬虫之Spider
- 爬虫Scrapy-05Spider
- Scrapy爬虫 - 获取知乎用户数据
- Scrapy spider代码片段
- Scrapy爬虫入门教程四 Spider(爬虫)
- python爬虫 scrapy框架 知乎zhihu 模拟登陆
- 【python爬虫03】使用Scrapy框架模拟登录知乎
- 【Scrapy】学习记录2_爬虫Spider
- 搜索引擎–Python下开源爬虫(spider)框架scrapy的使用
- 知乎爬虫
- 知乎爬虫
- python 爬虫 知乎
- 知乎爬虫
- python爬虫知乎
- 知乎图片爬虫
- JAVA知乎爬虫
- fiddler抓包,搞定接口
- 将博客搬至CSDN
- Apache用户目录枚举工具apache-users
- [置顶]百度贴吧自动回帖的两种方式,使用requests(urllib2)和selenium两种方式回帖
- FPGA静态时序分析——IO口时序(Input Delay /output Delay)
- [置顶]scrapy 知乎关键字爬虫spider代码
- [置顶]使用scrapy_redis,自动实时增量更新东方头条网全站新闻
- swift3.0 类字符串转类(字符串转ViewController)
- H264码流的两种形式:Annex B和AVCC——非常详细的翻译
- wordpress点击文章标题调到站外URL
- python3中字符串、列表、字典的排序
- C++ 整型所能表示的数据范围
- DSP 内联函数(inline)在x86上实现的一些要点之后的验证方法(一)感悟
- 整车厂的四大工艺