Scrapy Crawler Demo

A minimal Scrapy spider: starting from the Qzone homepage, it yields an item with each page's URL, crawl timestamp, and title, then follows every href that matches qzone.qq so the crawl keeps feeding itself new pages.

# coding=utf-8
import re
import time

import scrapy

from qqcrawler.items import QqcrawlerItem


class QzoneSpider(scrapy.Spider):
    name = "qzone"
    # allowed_domains = ["qzone.qq.com/"]
    start_urls = [
        # "http://www.ncst.edu.cn/"
        "http://qzone.qq.com/"
        # , "http://www.qq.com/"
    ]

    def parse(self, response):
        try:
            qq_item = QqcrawlerItem()  # container for the scraped data
            qq_item['c_time'] = time.time()
            qq_item['url'] = response.url
            if response.xpath('/html/head/title'):
                qq_item['title'] = response.xpath('/html/head/title').extract()
            else:
                qq_item['title'] = None
            yield qq_item
            # follow every link on the page that points back into Qzone
            if response.xpath('//@href'):
                for i in response.xpath('//@href').extract():
                    if re.match(r'^http.*qzone\.qq.*', i):
                        print(i, '================')
                        yield scrapy.Request(i, callback=self.parse)  # queue the URL so the crawl continues
        except Exception:
            pass  # swallow parse errors and move on to the next page
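The post never shows the item class the spider imports. A minimal sketch of qqcrawler/items.py, assuming only the three fields the spider actually fills in, could look like this:

import scrapy


class QqcrawlerItem(scrapy.Item):
    # assumed field set; the original qqcrawler project is not shown in the post
    c_time = scrapy.Field()  # crawl timestamp from time.time()
    url = scrapy.Field()     # URL of the fetched page
    title = scrapy.Field()   # extracted <title> text, or None

With that in place, the spider runs from the project root with scrapy crawl qzone, and each printed '================' line marks a Qzone link being queued for another round of parsing.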