Scrapy Framework Study Notes (Part 1)



Usage steps:

1. First, create your Item file, which defines the fields (data types) of the content to be scraped.

2. Next, create your spider file, which does the actual crawling and extracts the data into Items.

3. Finally, create your pipeline file, which receives the Items handed over by the spider and processes them there, e.g. exporting them to a file or storing them in a database. (The project skeleton that holds these files can be generated as sketched right after this list.)
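As a quick aside, the skeleton holding these three files comes from Scrapy's standard project command; the project name mypro here simply mirrors the example at the end of these notes:

scrapy startproject mypro
# creates mypro/mypro/items.py, pipelines.py, settings.py and a spiders/ directory
# spider files such as hjspider.py then go into mypro/mypro/spiders/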

Problems encountered:

1. The scraped Chinese text was written to the file entirely as Unicode escape sequences. At first I thought it was a character-encoding problem, but after trying various encodings it turned out not to be. The real cause was writing the whole list returned by extract(); writing list[0] (the first element) solves it.

2. Don't forget to register the pipeline in settings.py. (Both points are sketched right below.)
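A minimal sketch of both points, reusing the mypro project and MyproPipeline class from the example at the end of these notes:

# Problem 1: extract() returns a list of unicode strings; writing the list
# itself produces output like [u'\u4e2d\u6587']. Write the first element
# (and encode it) instead:
raw_title = hxs.select('//title/text()').extract()
self.file.write(raw_title[0].encode('utf-8') + '\n')

# Problem 2: register the pipeline in settings.py. Depending on the Scrapy
# version this is either a plain list or a dict with an order value:
ITEM_PIPELINES = ['mypro.pipelines.MyproPipeline']
# ITEM_PIPELINES = {'mypro.pipelines.MyproPipeline': 300}   # newer versions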

Below are a few examples of recursive crawling:

BaseSpider

Method 1: put both the Item and the Request objects into the items list and return it; the framework works out by itself whether each element is an Item or a Request.

class SlyySpider(BaseSpider):
    name = "a"
    allowed_domains = [".com"]
    start_urls = ["****"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        h3 = hxs.select('''*****''').extract()
        h3_unicode = "".join(h3)
        t1 = hxs.select('''****''').extract()
        items.append(SlyyItem(head=h3_unicode, url=response.url))
        for url in hxs.select('''***''').extract():
            items.append(Request(url, callback=self.parse))
        return items

Method 2: use yield, so Items and Requests are handled separately as they are produced.

class SlyySpider(BaseSpider):
    name = "slyy2"
    allowed_domains = ["***"]
    start_urls = ["***"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        h3 = hxs.select('''***''').extract()
        h3_unicode = "".join(h3)
        yield SlyyItem(head=h3_unicode, url=response.url)
        for url in hxs.select('''***''').extract():
            yield Request(url, callback=self.parse)

Method 3:

Example 1:

class SlyySpider(BaseSpider):
    name = "slyy3"
    allowed_domains = ["***"]
    start_urls = ["***"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        firspost = hxs.select('''***''').extract()[0]
        items.extend([self.make_requests_from_url(firspost).replace(callback=self.parse_post)])
        url2 = hxs.select('''***''').extract()[0]
        items.append(self.make_requests_from_url(url2))
        return items

    def parse_post(self, response):
        hxs = HtmlXPathSelector(response)
        h3 = hxs.select('''***''').extract()[0]
        print h3
        item = SlyyItem()
        item['url'] = response.url
        item['head'] = h3
        return item

Example 2:

from scrapy.selector import HtmlXPathSelector

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    items = []

    newurls = hxs.select('//a/@href').extract()
    validurls = []
    for url in newurls:
        # check whether the URL is valid (placeholder condition; replace with a real check)
        if True:
            validurls.append(url)
    items.extend([self.make_requests_from_url(url).replace(callback=self.parse) for url in validurls])

    sites = hxs.select('//ul/li')
    for site in sites:
        item = DmozItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        items.append(item)

    return items

CrawlSpider

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from sitemap.items import SitemapItem

import urllib
import simplejson
import exceptions
import pickle

class SitemapSpider(CrawlSpider):
    name = 'sitemap_spider'
    allowed_domains = ['qunar.com']
    start_urls = ['http://www.qunar.com/routes/']

    rules = (
        #Rule(SgmlLinkExtractor(allow=(r'http://www.qunar.com/routes/.*')), callback='parse'),
        #Rule(SgmlLinkExtractor(allow=('http:.*/routes/.*')), callback='parse'),
    )

    def parse(self, response):
        item = SitemapItem()
        x = HtmlXPathSelector(response)
        raw_urls = x.select("//a/@href").extract()
        urls = []
        for url in raw_urls:
            if 'routes' in url:
                if 'http' not in url:
                    url = 'http://www.qunar.com' + url
                urls.append(url)

        for url in urls:
            yield Request(url)

        item['url'] = response.url.encode('UTF-8')
        arr_keywords = x.select("//meta[@name='keywords']/@content").extract()
        item['keywords'] = arr_keywords[0].encode('UTF-8')
        arr_description = x.select("//meta[@name='description']/@content").extract()
        item['description'] = arr_description[0].encode('UTF-8')
        yield item

About rules: they define a set of link-extraction rules for the spider. A complete spider built around such rules is sketched after the snippet below.

       The allow attribute lists the link patterns that may be followed.

       The deny attribute lists the link patterns that are excluded.

       The callback attribute names the callback function used for matched pages.

    rules = (
        # URLs matching this rule are not scraped for content; only the links on the
        # matched pages are extracted (the URL is fictional; replace it in real use)
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?page_index=\d+'))),
        # URLs matching this rule are scraped for content via parse_item
        # (the URL is fictional; replace it in real use)
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?product_id=\d+')), callback="parse_item"),
    )
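Putting rules and callback together, a complete CrawlSpider could look roughly like the sketch below. It is only an illustration using the same old Scrapy API as the rest of these notes; the fictional URLs are kept, and DemoItem with its single title field is a made-up item class.

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

class DemoItem(Item):
    # made-up item with a single field, just for this sketch
    title = Field()

class DemoSpider(CrawlSpider):
    name = "demo"
    allowed_domains = ["test_url"]
    start_urls = ["http://test_url/test"]

    rules = (
        # listing pages: follow their links but do not scrape them
        # (a Rule without a callback follows links by default)
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?page_index=\d+',))),
        # product pages: scrape them with parse_item
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?product_id=\d+',)), callback="parse_item"),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = DemoItem()
        item['title'] = hxs.select('//title/text()').extract()[0]
        return item

Note that CrawlSpider reserves parse() for its own rule processing, which is why the scraping callback here is named parse_item rather than parse.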

Below is a simple crawler I wrote for scraping the Hujiang site:

items.py:

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class MyproItem(Item):
    # define the fields for your item here like:
    # name = Field()
    id = Field()
    th = Field()
    zh = Field()
    url = Field()
    title = Field()

pipelines.py:

class MyproPipeline(object):
    def __init__(self):
        self.file = open('th.txt', 'w')
        self.file2 = open('zh.txt', 'w')

    def process_item(self, item, spider):
        # only write out items where the two language lists are non-empty and of equal length
        if len(item['th']) > 0 and len(item['zh']) > 0:
            if len(item['th']) == len(item['zh']):
                self.file.write(str(item['title'][0].encode("utf-8")) + '\n')
                for i in range(len(item['th'])):
                    self.file.write(str(item['th'][i].encode("utf-8")) + '\n')
                self.file2.write(str(item['title'][0].encode("utf-8")) + '\n')
                for i in range(len(item['zh'])):
                    self.file2.write(str(item['zh'][i].encode("utf-8")) + '\n')
        return item
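One thing this pipeline never does is close th.txt and zh.txt. A small sketch of how that could be handled, assuming the Scrapy version in use calls the optional close_spider hook on item pipelines:

    def close_spider(self, spider):
        # called once when the spider finishes; close the two output files
        self.file.close()
        self.file2.close()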

hjspider.py:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from mypro.items import MyproItem
from scrapy.http import Request

class HjSpider(BaseSpider):
    name = "hujiang"
    allowed_domains = ["hujiang.com"]
    start_urls = [
        "http://th.hujiang.com/new/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        urls = []
        raw_title = hxs.select('//title').extract()
        raw_th = hxs.select("//div[@class='langs_en']/text()").extract()
        raw_zh = hxs.select("//div[@class='langs_cn']/text()").extract()
        items.append(MyproItem(title=raw_title, zh=raw_zh, th=raw_th))

        raw_urls = hxs.select('//a/@href').extract()
        for url in raw_urls:
            if 'http' not in url:
                if 'new' in url:
                    if url not in urls:
                        url = "http://th.hujiang.com" + url
                        #item = MyproItem()
                        #item['url'] = url
                        urls.append(url)
                        items.append(Request(url, callback=self.parse))
        return items
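Assuming the project is named mypro (as the from mypro.items import suggests), the spider is run from the project directory with the standard command, after which the pipeline writes th.txt and zh.txt into the working directory:

scrapy crawl hujiang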