An upgraded Scrapy crawler for the xiaohuar site

**The spider file (under the spiders directory): defining the DemoSpider class**

```python
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from img.items import ImgItem


class DemoSpider(CrawlSpider):
    """The first Rule walks the pagination links to reach every listing
    page; the second extracts each girl's detail-page URL from the
    current page.

    allow           -- a regular expression; only matching URLs are extracted
    restrict_xpaths -- limits the region of the page searched for links
    callback        -- the method that parses each matched response
    process_links   -- post-processes extracted links; the named method
                       receives the list of links as its argument
    follow          -- whether to keep following links from matched pages
    """
    name = 'demo'
    start_urls = ['http://www.xiaohuar.com/list-1-2.html']

    rules = (
        Rule(LinkExtractor(allow='http://www.xiaohuar.com/list',
                           restrict_xpaths="//div[@class='page_num']"),
             follow=True),
        Rule(LinkExtractor(allow='/p',
                           restrict_xpaths="//div[@class='title']"),
             callback='paser_item',
             follow=False),
    )

    def paser_item(self, response):
        item = ImgItem()
        url = response.url
        print("url=%s" % url)
        try:  # guard against detail pages whose layout differs
            img_url = response.xpath(
                "//div[@class='infoleft_imgdiv']/a/img/@src").extract()[0]
            name = response.xpath(
                "//div[@class='infodiv']/table/tbody/tr[1]/td[2]/text()").extract()
            school = response.xpath(
                "//div[@class='infodiv']/table/tbody/tr[5]/td[2]/text()").extract()
            # prepend the domain when the image URL is relative
            if 'http://www.xiaohuar.com' not in img_url:
                item['url'] = 'http://www.xiaohuar.com' + img_url
            else:
                item['url'] = img_url
            item['name'] = name
            item['school'] = school
            yield item
        except Exception:
            print('error')
```

(The `process_links` hook described in the docstring goes unused here; a sketch of wiring one in appears at the end of the post.)

**Defining the items file**

```python
import scrapy


class ImgItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    name = scrapy.Field()
    school = scrapy.Field()
```

**Defining the pipelines file**

```python
import codecs
import json
# import os
# import urllib.request


class ImgPipeline(object):
    def __init__(self):
        # open the output file as utf-8 so non-ASCII text is not garbled
        self.file = codecs.open('items.json', 'w', encoding='utf-8')
        # self.file_path = os.path.normpath("h:\\scrapy\\img\\img_picture")
        # self.count = 1

    def process_item(self, item, spider):
        # required hook: serialize each item to JSON, one record per line
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # if not os.path.exists(self.file_path):
        #     os.mkdir(self.file_path)
        # img_name = os.path.normpath(
        #     "h:\\scrapy\\img\\img_picture\\%s.jpg" % self.count)
        # urllib.request.urlretrieve(item['url'], img_name)
        # self.count += 1
        self.file.write(line)
        # return the item so any later pipeline can still process it
        return item

    def close_spider(self, spider):
        # called by Scrapy when the spider finishes; close the output file
        self.file.close()
```

(A sketch at the end of the post shows how Scrapy's built-in `ImagesPipeline` could replace the commented-out manual download.)

**Settings**

```python
# add this to settings.py so Scrapy routes items through the pipeline;
# 300 is the pipeline's order (lower numbers run first)
ITEM_PIPELINES = {
    "img.pipelines.ImgPipeline": 300,
}
```
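**A sketch: wiring in process_links**

Neither Rule above actually uses `process_links`. Scrapy passes the list of extracted `Link` objects to the method named there before scheduling them, so the spider can rewrite or drop links. A minimal sketch, assuming we only want pagination links that stay on the main domain (the method name `fix_links` is mine, not from the original project):

```python
# Hypothetical spider method; Scrapy resolves the process_links string
# to this method and hands it the extracted Link objects.
def fix_links(self, links):
    # assumption: discard any extracted link that leads off www.xiaohuar.com
    return [link for link in links
            if link.url.startswith('http://www.xiaohuar.com')]
```

It would be attached as `Rule(LinkExtractor(...), process_links='fix_links', follow=True)`.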
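**A sketch: how ImgItem feeds json.dumps**

`ImgItem` behaves like a dict restricted to its declared fields, which is exactly what the pipeline relies on when it calls `json.dumps(dict(item), ...)`. A standalone check, run where the `img` package is importable (the field values are made-up examples):

```python
from img.items import ImgItem

item = ImgItem()
item['url'] = 'http://www.xiaohuar.com/d/file/example.jpg'  # made-up URL
item['name'] = ['Example Name']      # the spider stores xpath results as lists
item['school'] = ['Example School']

print(dict(item))  # a plain dict, ready for json.dumps in the pipeline
# note: item['age'] = 20 would raise KeyError, since age is not a declared field
```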
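**A sketch: letting ImagesPipeline do the downloading**

The commented-out lines in `ImgPipeline` fetch each photo by hand with `urllib`. Recent Scrapy releases ship a built-in `ImagesPipeline` (it requires Pillow) that handles downloading, deduplication, and file naming. A minimal sketch of the settings side, with an assumed storage path:

```python
# settings.py (sketch): enable Scrapy's built-in image pipeline
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 200,  # run before ImgPipeline
    "img.pipelines.ImgPipeline": 300,
}
IMAGES_STORE = "h:/scrapy/img/img_picture"  # assumed download directory
```

The item would also need `image_urls = scrapy.Field()` and `images = scrapy.Field()` (the pipeline's default input and output fields), and the spider would yield `item['image_urls'] = [img_url]`, a list even for a single image.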
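**Running it**

With the three files in place, run `scrapy crawl demo` from the project root; the output lands in `items.json`. As an aside, `scrapy crawl demo -o items.jl` would produce a similar one-record-per-line file through Scrapy's feed exports, without any custom pipeline code.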