scrapy中文存储

来源：互联网　发布：阿里云青云 ucloud　编辑：程序博客网　时间：2024/06/16 05:53

修改settings文件

# Enable the project's item pipeline. The integer sets the order in which
# pipelines run (Scrapy convention: 0-1000, lower values run first).
ITEM_PIPELINES = {
    'mypjt.pipelines.MypjtPipeline': 300,
}

编写pipelines文件

# -*- coding: utf-8 -*-
import codecs

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class MypjtPipeline(object):
    """Write every scraped item to ``mydata1.txt`` as one UTF-8 text line."""

    def __init__(self):
        # codecs.open encodes on write, so non-ASCII (e.g. Chinese) text is
        # stored as UTF-8. The file is created/truncated when the pipeline
        # object is constructed.
        self.file = codecs.open("mydata1.txt", "wb", encoding="utf-8")

    def process_item(self, item, spider):
        """Print the item, write its ``str()`` form plus a newline, and return it.

        ``spider`` is accepted because the pipeline interface passes it, but
        it is not used here.
        """
        # To store JSON instead, serialize with:
        #     json.dumps(dict(item), ensure_ascii=False)
        line = str(item) + '\n'
        print(line)
        self.file.write(line)
        # Return the item so any later pipeline stage still receives it.
        return item

    def close_spider(self, spider):
        """Close the output file; ``spider`` is unused."""
        self.file.close()

spider代码

# -*- coding: utf-8 -*-
import scrapy

from mypjt.items import MypjtItem


class MyspdSpider(scrapy.Spider):
    """Spider ``myspd``: fetches one sina.com.cn article and emits its title."""

    name = 'myspd'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://tech.sina.com.cn/d/s/2016-09-17/doc-ifxvyqwa3324638.shtml']

    def parse(self, response):
        """Build a ``MypjtItem`` holding the page-title XPath selection and yield it."""
        item = MypjtItem()
        # This stores the raw XPath selection; when storing as JSON, append
        # .extract() to get plain strings instead.
        item["title"] = response.xpath("/html/head/title/text()")
        print(item["title"])
        # Yield the item so it is handed on for storage; otherwise the
        # output file ends up empty.
        yield item

阅读全文

0 0