scrapy处理各种文本格式:HTML、XML、CSV

来源:互联网 发布:2016黑马java百度网盘 编辑:程序博客网 时间:2024/06/07 06:10

网页

# 创建项目
$ scrapy startproject mypjt
# 基于basic模板创建名为xxx的爬虫文件
$ scrapy genspider -t basic xxx sina.com.cn

html格式

class CaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    urlname = scrapy.Field()   # page <title> text
    urlkey = scrapy.Field()
    urlcr = scrapy.Field()
    urladd = scrapy.Field()


# The start URL can be overridden from the command line, e.g.:
#   $ scrapy crawl abc --nolog -a myurl="http://mp3.baidu.com"
# which prints:
#   要爬取的网址为: http://mp3.baidu.com
#   百度音乐-听到极致
class AbcSpider(scrapy.Spider):
    name = 'abc'
    start_urls = [
        'http://python.jobbole.com/',
        'http://blog.csdn.net/',  # BUG FIX: original was missing the closing quote
    ]

    def __init__(self, myurl=None, *args, **kwargs):
        """Accept an optional ``-a myurl=...`` argument and crawl that URL instead."""
        super(AbcSpider, self).__init__(*args, **kwargs)
        print("要爬取的网址为: %s" % myurl)
        # Replace the hard-coded start URLs with the one passed in.
        self.start_urls = ["%s" % myurl]

    def parse(self, response):
        """Extract the page title into a CaoItem."""
        item = CaoItem()
        item['urlname'] = response.xpath('/html/head/title/text()').extract()
        # BUG FIX: the original built the item but never returned/yielded it,
        # so the pipeline would receive nothing.
        yield item

XMLFeedSpider

class MyxmlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    author = scrapy.Field()


# BUG FIX: the original text pasted the CSVFeedSpider example under this
# heading, instantiating an undefined MycsvItem and never using MyxmlItem.
# Reconstructed here as the XMLFeedSpider example the heading describes.
class MyxmlspiderSpider(XMLFeedSpider):
    name = 'myxmlspider'
    allowed_domains = ['iqianyue.com']
    start_urls = ['这里地址自行定义,找一个xml文档,有上述字段']
    # iterator: how the feed is parsed; 'iternodes' is the fast default.
    iterator = 'iternodes'
    # itertag: node name to iterate over — adjust to the feed's structure.
    itertag = 'rss'

    def parse_node(self, response, selector):
        """Called once per <itertag> node; extract title/link/author."""
        i = MyxmlItem()
        i['title'] = selector.xpath('/rss/channel/item/title/text()').extract()
        i['link'] = selector.xpath('/rss/channel/item/link/text()').extract()
        i['author'] = selector.xpath('/rss/channel/item/author/text()').extract()
        return i

CSVFeedSpider

class MycsvItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    sex = scrapy.Field()


# Run with (BUG FIX: the original transcript had the typo "craw1"):
#   $ scrapy crawl mycsvspider --nolog
class MycsvspiderSpider(CSVFeedSpider):
    name = 'mycsvspider'
    allowed_domains = ['iqianyue.com']
    start_urls = ['自定义一个CSV文档用逗号分割的']
    # Column names of the CSV feed, in order.
    headers = ['name', 'sex', 'add', 'email']
    # 定义间隔符 — field delimiter used by the CSV feed.
    delimiter = ','

    def parse_row(self, response, row):
        """Called once per CSV row; *row* maps header name -> cell value."""
        i = MycsvItem()
        # NOTE(review): .encode() yields bytes under Python 3, so the prints
        # below show b'...'; drop .encode() if plain text output is wanted.
        i['name'] = row['name'].encode()
        i['sex'] = row['sex'].encode()
        print("名字是:")
        print(i['name'])
        print("性别是:")
        print(i['sex'])
        print('------------')
        return i