scrapy 爬取当当网连衣裙分类

来源:互联网 发布:php curl获取不到数据 编辑:程序博客网 时间:2024/04/25 18:41
scrapy startproject dangdang
scrapy genspider -t basic cao dangdang.com
# -*- coding: utf-8 -*-
# Item models for the dangdang project.
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DangdangItem(scrapy.Item):
    """Holds the scraped product data for one listing page."""
    # product name
    name = scrapy.Field()
    # product price
    price = scrapy.Field()
    # product detail-page URL
    link = scrapy.Field()
# -*- coding: utf-8 -*-
"""Spider that walks the dangdang.com dress-category listing pages."""
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request
import json


class CaoSpider(scrapy.Spider):
    name = "cao"
    allowed_domains = ["dangdang.com"]
    start_urls = (
        'http://category.dangdang.com/pg1-cid4008149.html',
    )

    def parse(self, response):
        """Extract the name/price/link lists from one listing page,
        yield them as a single DangdangItem, then schedule the
        remaining listing pages.
        """
        item = DangdangItem()
        item['name'] = response.xpath(
            "//p[@class='name']/a[@name='itemlist-title']/text()").extract()
        item['price'] = response.xpath(
            "//p[@class='price']/span[@class='price_n']/text()").extract()
        item['link'] = response.xpath('//a[@class="pic"]/@href').extract()
        # hand the extracted lists to the pipeline
        yield item
        # Pages 2-9; page 1 is already covered by start_urls (the original
        # re-requested page 1 and relied on scrapy's duplicate filter).
        for page in range(2, 10):
            url = "http://category.dangdang.com/pg" + str(page) + "-cid4008149.html"
            yield Request(url, callback=self.parse)
        # Debug output. The original hard-coded range(100) and raised
        # IndexError whenever a page held fewer than 100 products;
        # zip() stops at the shortest of the three lists.
        for name, price, link in zip(item['name'], item['price'], item['link']):
            print(name + '----' + price + '---' + link)

pipelines.py 存储本地json文件

# Add to settings.py: register the local pipeline so scraped items
# are handed to DangdangPipeline (priority 300).
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}
# -*- coding: utf-8 -*-
# Item pipeline: append every scraped item to ./data.json,
# one JSON object per line.
#
# Don't forget to add the pipeline to the ITEM_PIPELINES setting.
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json


class DangdangPipeline(object):
    """Serialize each item as one JSON line in ./data.json."""

    def __init__(self):
        # Open the output file once for the whole crawl.
        # The original passed the binary mode 'wb' together with an
        # encoding= argument; codecs.open expects a text mode ('w')
        # when an encoding is given.
        self.file = codecs.open("./data.json", 'w', encoding="utf-8")

    def process_item(self, item, spider):
        """Write *item* as a JSON line and pass it through unchanged."""
        # ensure_ascii=False keeps Chinese product names readable
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # release the file handle when the crawl ends
        self.file.close()