scrapy爬取电影网站

来源:互联网 发布:php location延时跳转 编辑:程序博客网 时间:2024/04/30 02:16

new_movie.py 定义爬虫

import scrapy
import re
import urllib

from ..items import NewMovieItem


class new_movie(scrapy.Spider):
    """Crawl the comedy tag listing on www.87movie.com and emit one item per film.

    Crawl order: ``parse`` (discover how many listing pages exist) ->
    ``parse_page`` (collect detail-page links from one listing page) ->
    ``parse_info`` (fill a ``NewMovieItem`` from one detail page).
    """

    name = 'new_movie'
    start_urls = ['http://www.87movie.com/tag/喜剧']
    allowed_domains = ['www.87movie.com']

    def parse_info(self, response):
        """Fill the item carried in ``response.meta['movie_info']`` and return it.

        Every field is stored as the raw list returned by ``extract()``;
        no post-processing of the lists happens in this method.
        """
        movie_info = response.meta['movie_info']
        movie_info['name'] = response.xpath(
            '//div[@class="white-div"]//div[@class="col-md-8"]/h3/text()'
        ).extract()
        movie_info['pic'] = response.xpath(
            '//div[@class="white-div"]//img/@src'
        ).extract()
        movie_info['content'] = response.xpath(
            '//div[@class="white-div"]//div[@class="col-md-8"]/text()'
        ).extract()
        movie_info['download'] = response.xpath(
            '//div[@class="white-div"]//ul[@class="list-unstyled"]/li/a/@href'
        ).extract()
        # A plain return is fine here: this callback produces exactly one item.
        return movie_info

    def parse_page(self, response):
        """Yield one detail-page request for every film linked on a listing page."""
        movie_list = response.xpath(
            '//ul[@class="list-unstyled mlist"]/li/div/div[@class="col-md-10"]/h4/a/@href'
        ).extract()
        for href in movie_list:
            # A fresh item per film (this is why NewMovieItem is imported);
            # it travels to parse_info through the request meta.
            movie_info = NewMovieItem()
            # urljoin resolves both relative and absolute hrefs against the
            # page URL instead of blindly concatenating strings.
            yield scrapy.Request(
                response.urljoin(href),
                meta={'movie_info': movie_info},
                callback=self.parse_info,
            )

    def parse(self, response):
        """Read the last page number from the pagination bar and request every page.

        ``parse`` is the callback Scrapy uses for the ``start_urls`` responses.
        When the pagination link is missing or does not end in a number, the
        listing is treated as a single page instead of raising.
        """
        tmp_str = response.xpath(
            '//ul[@class="pagination"]/li[last()]/a/@href'
        ).extract()
        try:
            # The last path segment of the final pagination link (before any
            # query string) is taken as the highest page number.
            num_max = int(tmp_str[0].split('/')[-1].split('?')[0])
        except (IndexError, ValueError):
            self.logger.warning(
                'No usable pagination link on %s; crawling a single page',
                response.url,
            )
            num_max = 1
        for i in range(1, num_max + 1):
            basic_url = "http://www.87movie.com/tag/喜剧/{}?o=date".format(str(i))
            self.logger.info(basic_url)
            # yield rather than return: the loop must keep producing requests.
            yield scrapy.Request(basic_url, callback=self.parse_page)


items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
# item.py
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NewMovieItem(scrapy.Item):
    """Container for the data scraped from one film detail page."""

    # define the fields for your item here like:
    # name = scrapy.Field()

    # Text of the <h3> heading on the detail page.
    name = scrapy.Field()
    # ``src`` attribute(s) of the images found on the detail page.
    pic = scrapy.Field()
    # Text nodes of the detail column.
    content = scrapy.Field()
    # ``href`` attribute(s) of the download links.
    download = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json  # each item is stored as one JSON document per line
import codecs


class NewMoviePipeline(object):
    """Normalise scraped film items and append them to ``movie_info.json``.

    The output file is opened when the pipeline is constructed and closed in
    ``close_spider``, the hook Scrapy calls on pipelines when a spider ends.
    """

    # Site root used to turn a relative poster path into an absolute URL.
    base_url = 'http://www.87movie.com'

    def __init__(self):
        self.file = codecs.open('movie_info.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Clean ``content`` and ``pic``, write the item as a JSON line, return it.

        ``content`` becomes the second-to-last extracted text node and ``pic``
        becomes the absolute URL of the first extracted image. Either field
        falls back to an empty string when the page did not yield enough data,
        so a sparse page no longer raises ``IndexError``/``KeyError``.
        """
        texts = item.get('content') or []
        item['content'] = texts[-2] if len(texts) >= 2 else ''

        sources = item.get('pic') or []
        item['pic'] = self._absolute_url(sources[0]) if sources else ''

        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def _absolute_url(self, src):
        """Return ``src`` as an absolute URL, prefixing the site root if needed."""
        if src.startswith(('http://', 'https://')):
            return src
        if src.startswith('//'):
            # Protocol-relative URL: only the scheme is missing.
            return 'http:' + src
        return self.base_url + '/' + src.lstrip('/')

    def close_spider(self, spider):
        """Close the output file; Scrapy calls this hook when the spider closes."""
        if not self.file.closed:
            self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for ``close_spider``."""
        self.close_spider(spider)



settings.py

# Scrapy project settings for the new_movie crawler.

# Name of the bot implemented by this project.
BOT_NAME = 'new_movie'

# Module where new spiders are created; also the only module searched for spiders.
NEWSPIDER_MODULE = 'new_movie.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Value of Scrapy's DOWNLOAD_DELAY setting (delay between requests, in seconds).
DOWNLOAD_DELAY = 1

# Enabled item pipelines, keyed by import path, with their order value.
ITEM_PIPELINES = {
    'new_movie.pipelines.NewMoviePipeline': 300,
}