MongoPipeline,ImagePipeline,CsvPipeline,JsonPipeline,XmlWritePipeline
来源:互联网 发布:ovid数据库检索方式 编辑:程序博客网 时间:2024/06/05 14:12
- 改进版的MongoPipeline
- MongoPipeline
- ImagePipeline
- CsvPipeline
- JsonPipeline
- XmlWritePipeline
改进版的MongoPipeline
2017/11/5
import pymongo
from scrapy.utils.project import get_project_settings

# Configure MONGO_URI, MONGO_DATABASE and MONGO_COLLECTION in settings.py.
# Fallbacks: localhost, the project name (BOT_NAME), and the spider's name.


class MongoPipeline(object):
    """Store scraped items in MongoDB, reading configuration from project settings."""

    def __init__(self):
        # The original used `from scrapy.conf import settings`, which was
        # deprecated and later removed from Scrapy; get_project_settings()
        # is the supported way to read settings outside a crawler.
        settings = get_project_settings()
        self.mongo_uri = settings.get('MONGO_URI', 'localhost')
        self.mongo_db = settings.get('MONGO_DATABASE', settings['BOT_NAME'])
        self.mongo_collection = settings.get('MONGO_COLLECTION')
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # When MONGO_COLLECTION is not configured, fall back to the current
        # spider's name per item instead of permanently overwriting
        # self.mongo_collection with the first spider's name (original bug:
        # in a multi-spider process every spider wrote to the first one's
        # collection).
        collection = self.mongo_collection or spider.name
        self.db[collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
MongoPipeline
来自官网,需要在settings中定义MONGO_URI和MONGO_DATABASE
import pymongo


class MongoPipeline(object):
    """Persist every scraped item into one MongoDB collection.

    MONGO_URI and MONGO_DATABASE are read from the project settings
    (the database name defaults to 'items' when not configured).
    """

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy invokes this factory, letting the pipeline pull its
        # configuration out of the crawler's settings object.
        settings = crawler.settings
        uri = settings.get('MONGO_URI')
        database = settings.get('MONGO_DATABASE', 'items')
        return cls(mongo_uri=uri, mongo_db=database)

    def open_spider(self, spider):
        # One client per spider run; released again in close_spider().
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
ImagePipeline
自定义的图片下载器:图片 URL 以列表形式保存在 item['imgs'] 中,item['name'] 为文件夹的名字,图片文件名为 name + (index+1)
在settings.py中需要指定保存路径
IMAGES_STORE = 'F:/images'
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
import os


class ImagePipeline(ImagesPipeline):
    """Custom image downloader.

    Expects each item to carry a list of image URLs in ``item['imgs']`` and
    a folder name in ``item['name']``.  Images are saved under
    ``full/<name>/<name><index>.<ext>`` with a 1-based index.
    """

    def get_media_requests(self, item, info):
        # item['imgs'] is a list of URLs (original docstring wrongly said
        # item['front_image_url']); request each one and remember the owning
        # item plus the 1-based position so file_path() can name the file.
        for i, image_url in enumerate(item['imgs']):
            yield Request(image_url, meta={'item': item, 'index': i + 1})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        name = item['name']
        index = request.meta['index']
        # Derive the extension from the URL path only, so a query string
        # like '?size=big' does not leak into the file name (original took
        # everything after the last '.' of the full URL).
        url_path = request.url.split('?')[0]
        ext = url_path.split('.')[-1]
        image_guid = '{}{}.{}'.format(name, index, ext)
        return 'full/{}/{}'.format(name, image_guid)
CsvPipeline
import csv


class CsvPipeline(object):
    """Write items to pipeline.csv with a fixed header row."""

    def __init__(self):
        # newline='' is required by the csv module: without it the writer's
        # '\r\n' row terminator gets translated again by the text layer,
        # producing blank lines between rows on Windows (bug in original).
        self.csvfp = open('pipeline.csv', 'w', encoding='utf8', newline='')
        fieldnames = ['tea_hd', 'name', 'title', 'img_url', 'content']
        self.writer = csv.DictWriter(self.csvfp, fieldnames=fieldnames)
        self.writer.writeheader()

    def process_item(self, item, spider):
        # DictWriter raises ValueError if the item has keys outside
        # fieldnames, so items must match the configured columns.
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.csvfp.close()
JsonPipeline
import json


class JsonPipeline(object):
    """Write items to itcast.json as one JSON array."""

    def open_spider(self, spider):
        self.fp = open('itcast.json', 'w', encoding='utf8')
        self.fp.write('[')
        # Tracks whether the next item is the first, so the separator is
        # written BEFORE each subsequent item and no trailing comma is ever
        # produced.  The original instead wrote ',\n' after every item and
        # tried to erase it with seek(tell() - 3): text-mode seek to a
        # computed offset is unsupported, and the -3 assumed a 3-byte
        # ',\r\n' trailer — on POSIX the trailer is 2 bytes, so a real data
        # character was destroyed, and with zero items it seeked before '['.
        self._first = True

    def process_item(self, item, spider):
        if self._first:
            self._first = False
        else:
            self.fp.write(',\n')
        self.fp.write(json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        self.fp.write(']')
        self.fp.close()
XmlWritePipeline
from scrapy import signals
# `scrapy.contrib.exporter` (used by the original) was removed in Scrapy 1.x;
# exporters now live in `scrapy.exporters`.  The original's unused imports
# (`scrapy.log`, `TestSpider.items.TestspiderItem`, `twisted...adbapi`) and
# its commented-out TestspiderPipeline have been dropped: `scrapy.log` no
# longer exists and none of them were referenced.
from scrapy.exporters import XmlItemExporter


class XmlWritePipeline(object):
    """Export every scraped item to bbsData.xml via Scrapy's XmlItemExporter."""

    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        # Hook spider_opened/spider_closed so the output file's lifetime
        # matches the spider run exactly.
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Exporters require a binary file object.
        self.file = open('bbsData.xml', 'wb')
        # typo fixed: original attribute was `self.expoter`
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
阅读全文
0 0
- MongoPipeline,ImagePipeline,CsvPipeline,JsonPipeline,XmlWritePipeline
- Fresco中的ImagePipeline
- Gradle sync failed: Could not download imagepipeline.aar (com.facebook.fresco:imagepipeline:1.3.0):
- Fresco源码解析 - 创建一个ImagePipeline(一)
- Fresco源码解析 - 创建一个ImagePipeline(一)
- Python:Scrapy中重写ImagePipeline组件的file_path函数,自定义图片的路径和名称
- Fresco 源码分析(三) Fresco服务端处理(1) ImagePipeline为何物
- "com.facebook.imagepipeline.bitmaps.TooManyBitmapsException" Fresco使用过程中遇到的坑
- Android Studio导入Fresco项目编译报错unable to expand TAR 'imagepipeline\build\downloads\libjpeg-turbo-1.3.1.tar.gz'解决
- git文件过大无法上传解决
- 第三天:浪迹天涯网上商城(1.0版本)--后台管理系统--商品列表的查询
- Java中代码的执行顺序
- VMware Workstation的使用过程中出现的错误及解决方案
- [RK3288][Android6.0] PWM backlight 驱动流程小结
- MongoPipeline,ImagePipeline,CsvPipeline,JsonPipeline,XmlWritePipeline
- 君生我未生!Stata
- 堆(heap)
- synchronized和lock的区别和使用
- 编程语言之存储结构篇
- 【JavaScript】原生JS实现多条件筛选
- 强化学习基本概念
- 人工智能 无人驾驶的第一本书推荐
- android 中的多线程使用