Scrapy: crawling Baidu Zhidao with multiple spiders in one project sharing one pipelines.py


While crawling, we run into Baidu's anti-crawler robots.txt, which blocks the spider. We can tell Scrapy to ignore it in the project's settings.py:

ROBOTSTXT_OBEY = False



Final code:

# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request, HtmlResponse

from items import BDzdItem

qa_number = 0  # running count of scraped Q&A pairs


class BDzdSpider(CrawlSpider):
    """Crawl bank-related Q&A from Baidu Zhidao."""

    name = "bankSpider"
    download_delay = 1
    allowed_domains = ["zhidao.baidu.com"]
    start_urls = [
        "https://zhidao.baidu.com/question/1796062605517856547.html?fr=iks&word=%D2%F8%D0%D0&ie=gbk"
    ]
    rules = [
        Rule(LinkExtractor(allow=('/question/.*',),
                           restrict_xpaths=('//a[@class="related-link"]',)),
             callback='parse_item',
             follow=True)
    ]

    def _requests_to_follow(self, response):
        # Override CrawlSpider's link following so that only links whose
        # anchor text contains "银行" (bank) are followed.
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                if link.text.find("银行") == -1:
                    continue
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def parse_item(self, response):
        sel = Selector(response)
        url = response.url
        question = sel.xpath('//span[@class="ask-title "]/text()').extract()
        answer = sel.xpath('//pre[@class="best-text mb-10"]/text()').extract()
        otherAnswer = sel.xpath('//div[@class="answer-text line"]/span/text()').extract()

        item = BDzdItem()
        # xpath .extract() returns a list of strings, so join it into one string
        item["question"] = ''.join(question)
        if len(answer) > 0:
            item["answer"] = ''.join(answer)
        elif len(otherAnswer) > 0:
            item["answer"] = ''.join(otherAnswer[0])
        else:
            return

        global qa_number
        qa_number = qa_number + 1
        item["number"] = qa_number
        print "@@@@@@@@@@@@@@@@@@@@ item No. " + str(qa_number)
        print "@@@@@@@@@@@@@@@@@@@@ " + url
        print "#################### " + item["question"]
        print "******************** " + item["answer"]
        yield item
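
The spider imports BDzdItem from the project's items.py, which the original post does not show. A minimal sketch, assuming only the three fields the spider actually sets (question, answer, number):

# items.py -- minimal sketch; only the fields used by the spider
# above are assumed, the original file may define more
import scrapy

class BDzdItem(scrapy.Item):
    question = scrapy.Field()  # question title text
    answer = scrapy.Field()    # best answer, or first other answer
    number = scrapy.Field()    # running index of the scraped item

With the item defined, the spider can be started with scrapy crawl bankSpider.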

If there are multiple spiders in one project, you can write pipelines.py like this, routing each spider's items to its own output file by spider.name:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs


class TutorialPipeline(object):
    def process_item(self, item, spider):
        print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%0"
        return item


class BDzdPipeline(object):
    def __init__(self):
        # one output file per spider
        self.bankFile = codecs.open('data_bank.json', 'wb', encoding='utf-8')        # bank
        self.mobileFile = codecs.open('data_mobile.json', 'wb', encoding='utf-8')    # mobile
        self.baoxianFile = codecs.open('data_baoxian.json', 'wb', encoding='utf-8')  # insurance
        self.jinrongFile = codecs.open('data_jinrong.json', 'wb', encoding='utf-8')  # finance

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + '\n'
        # route the item to the right file based on which spider produced it
        if spider.name == 'bankSpider':
            self.bankFile.write(line.decode("unicode_escape"))
        elif spider.name == 'mobileSpider':
            self.mobileFile.write(line.decode("unicode_escape"))
        elif spider.name == 'baoxianSpider':
            self.baoxianFile.write(line.decode("unicode_escape"))
        elif spider.name == 'jinrongSpider':
            self.jinrongFile.write(line.decode("unicode_escape"))
        return item
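
As the header comment notes, a pipeline only runs once it is registered under ITEM_PIPELINES in settings.py. A minimal registration sketch; the project module name tutorial is an assumption based on the TutorialPipeline class name above:

# settings.py -- registration sketch; "tutorial" is an assumed
# project module name, replace it with your own project's name
ITEM_PIPELINES = {
    'tutorial.pipelines.BDzdPipeline': 300,
}

Because the same BDzdPipeline instance receives items from every spider in the project, the spider.name checks above are what keep the four output files separate.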


