Scrapy ImagesPipeline下载图片

来源:互联网 发布:自动谱曲软件下载 编辑:程序博客网 时间:2024/06/02 00:09

项目源码下载:http://download.csdn.net/download/adam_zs/10166641

1.项目结构,下载图片截图



2.项目简介

settings.py

# Item pipelines enabled for this project, mapped to their order value.
ITEM_PIPELINES = {
    # Stock Scrapy pipeline, kept for reference; the project subclass below
    # is used instead.
    # 'scrapy.pipelines.images.ImagesPipeline': 1
    "ImagesPipelineTest.pipelines.MyImagesPipeline": 1,
}

# Directory where the images pipeline stores downloaded files.
IMAGES_STORE = 'E:\\shetuwang2017'

items.py

import scrapy


class ImageItem(scrapy.Item):
    """Item carrying the image URLs to download and the download results."""

    # `image_urls` and `images` are the fixed field names that
    # ImagesPipeline expects by default.
    image_urls = scrapy.Field()
    images = scrapy.Field()
    # Assigned by MyImagesPipeline.item_completed. A scrapy.Item raises
    # KeyError on assignment to an undeclared field, so it must be declared.
    image_paths = scrapy.Field()

she_tu_wang.py

# -*- coding: utf-8 -*-
import scrapy

from ImagesPipelineTest.items import ImageItem


class XiaohuaSpider(scrapy.Spider):
    """Collect the image URLs listed on the 699pic.com "people" page."""

    name = "shetuwang"
    allowed_domains = ["699pic.com"]
    start_urls = ['http://699pic.com/people.html']
    # Scrapy's per-spider download delay, in seconds.
    download_delay = 2

    def parse(self, response):
        """Yield a single ImageItem holding every image URL on the page.

        The URLs are read from the ``data-original`` attribute of the
        ``<img>`` elements matched by the XPath below.
        """
        srcs = response.xpath(
            '//div[@class="swipeboxEx"]/div[@class="list"]/a/img/@data-original'
        ).extract()

        item = ImageItem()
        # Resolve against the page URL so relative or protocol-relative
        # values become absolute; Request rejects URLs without a scheme.
        # Already-absolute URLs pass through unchanged.
        item['image_urls'] = [response.urljoin(src) for src in srcs]
        yield item

pipelines.py

from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    """ImagesPipeline subclass that records saved paths and drops empty items."""

    def get_media_requests(self, item, info):
        """Yield one download Request per URL in ``item['image_urls']``."""
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        ``results`` is iterated as ``(ok, value)`` pairs; for every pair
        whose ``ok`` is truthy, ``value['path']`` is collected.

        The item class must declare an ``image_paths`` field, otherwise the
        assignment below raises ``KeyError``.

        Raises:
            DropItem: if no download succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_paths
        return item

3.运行项目

在 PyCharm 中运行 begin.py

from scrapy import cmdline

# Equivalent to running `scrapy crawl shetuwang` from the project directory.
cmdline.execute("scrapy crawl shetuwang".split())


原创粉丝点击