A First Look at Scrapy


(1) Create the Item

import scrapy

class zhaopinItem(scrapy.Item):
    company = scrapy.Field()  # company
    content = scrapy.Field()  # job requirements
    url = scrapy.Field()      # link
    pay = scrapy.Field()      # salary
    zhiwei = scrapy.Field()   # position
    didian = scrapy.Field()   # location
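
Scrapy items behave like dicts. A quick sketch of how the fields get filled and read back (the values here are made up for illustration):

item = zhaopinItem()
item['company'] = u'Example Co.'  # hypothetical value
item['pay'] = u'10000-15000'      # hypothetical value
print(dict(item))                 # {'company': u'Example Co.', 'pay': u'10000-15000'}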

(2) Add JSON support in pipelines.py

import json
import codecs

class JsonWithEncodingCnblogsPipeline(object):
    def __init__(self):
        FILE_SOURCE = '/Users/DDD/PycharmProjects/truespider/truespider/Json/'
        filename = '招聘.json'
        true_path = FILE_SOURCE + filename
        self.file = codecs.open(true_path, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line; ensure_ascii=False keeps the Chinese text readable
        line = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):  # was spider_close; Scrapy only calls close_spider
        self.file.close()
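
Note that because every line ends with ",\n", the file as a whole is not valid JSON; the easiest way to read it back is line by line. A minimal sketch, assuming the hard-coded path above:

# -*- coding:utf-8 -*-
import json
import codecs

items = []
with codecs.open('/Users/DDD/PycharmProjects/truespider/truespider/Json/招聘.json',
                 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip().rstrip(',')  # drop the trailing comma the pipeline wrote
        if line:
            items.append(json.loads(line))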

Register the pipeline in settings.py (the value is the pipeline's run order; by convention 0-1000, lower runs first):

ITEM_PIPELINES = {
    'truespider.pipelines.JsonWithEncodingCnblogsPipeline': 300,
}
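
Incidentally, if all you need is a plain JSON dump, Scrapy's built-in feed export can do it without any custom pipeline:

scrapy crawl zhaopin -o zhaopin.json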

(3) Write the spider

# -*- coding:utf-8 -*-
import scrapy
from truespider.Item.zhaopin_Item import zhaopinItem
from scrapy.http import Request

class zhaopinSpider(scrapy.spiders.Spider):
    name = 'zhaopin'
    allowed_domains = ['sou.zhaopin.com']
    start_urls = [
        'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python%E5%B7%A5%E7%A8%8B%E5%B8%88&sm=0&p=1'
    ]

    # The detail pages have a very messy structure, so this is left as a stub for now
    def parse2(self, response):
        for info in response.xpath('//div[@class="tab-cont-box"]/div[@class="tab-inner-cont"]'):
            print(info.extract())

    def parse(self, response):
        items = []
        for info in response.xpath('//div[@class="newlist_list_content"]/table')[1:]:
            item = zhaopinItem()
            item['company'] = info.xpath('tr/td[@class="gsmc"]/a/text()').extract()
            item['content'] = info.xpath('tr[@class="newlist_tr_detail"]/td/div[@class="newlist_detail"]/div[@class="clearfix"]/ul/li[@class="newlist_deatil_last"]/text()').extract()
            item['url'] = info.xpath('tr/td[@class="zwmc"]/div/a/@href').extract()
            item['pay'] = info.xpath('tr/td[@class="zwyx"]/text()').extract()
            item['zhiwei'] = info.xpath('tr/td[@class="zwmc"]/div/a/text()').extract()
            items.append(item)
            yield item  # hand the item to the pipelines (JSON / MySQL)
        # Follow the extracted detail-page URLs
        for item in items:
            # The detail URLs live outside allowed_domains, so dont_filter=True
            # keeps the offsite middleware from dropping the requests
            yield Request(item['url'][0], callback=self.parse2, dont_filter=True)
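
When the XPaths break (job-board markup changes often), scrapy shell is the quickest way to test them interactively. For example, against the first start URL:

scrapy shell 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python%E5%B7%A5%E7%A8%8B%E5%B8%88&sm=0&p=1'
>>> response.xpath('//div[@class="newlist_list_content"]/table')[1:]
>>> response.xpath('//td[@class="gsmc"]/a/text()').extract()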

(4) Add a main file for convenient running

# -*- coding:utf-8 -*-
from scrapy import cmdline

cmdline.execute('scrapy crawl zhaopin'.split())
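
Run it with python main.py from the project root (cmdline.execute needs to find scrapy.cfg). An equivalent sketch using Scrapy's CrawlerProcess API, if you would rather not go through the command-line wrapper:

# -*- coding:utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('zhaopin')  # look the spider up by name
process.start()           # blocks until the crawl finishes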

(5) Save to the database

First create a table with a database tool (I used phpMyAdmin), setting up the column names and types.
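
If you prefer to create the table in code, here is a sketch of an equivalent schema; the column types are my assumption, so adjust them to your data:

# -*- coding:utf-8 -*-
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='pass',
                       db='dbname', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS zhaopin (
        id      INT AUTO_INCREMENT PRIMARY KEY,
        company VARCHAR(255),
        pay     VARCHAR(64),
        url     VARCHAR(512),
        zhiwei  VARCHAR(255),
        content TEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()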

Then add to pipelines.py:

from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors

class MySQLStorePipeline(object):
    def __init__(self):
        dbargs = dict(
            host='127.0.0.1',
            db='dbname',
            user='root',
            passwd='pass',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True,
        )
        # adbapi runs the blocking MySQLdb calls in a thread pool,
        # so inserts never block the crawl itself
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    def process_item(self, item, spider):
        self.dbpool.runInteraction(self.insert_into_table, item)
        return item

    def insert_into_table(self, conn, item):
        # extract() returns lists, so join each field into a single string
        def joined(field):
            return u''.join(item[field])
        conn.execute(
            'insert into zhaopin(company,pay,url,zhiwei,content) VALUES(%s,%s,%s,%s,%s)',
            (joined('company'), joined('pay'), joined('url'),
             joined('zhiwei'), joined('content'))
        )
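
One caveat: runInteraction returns a Deferred, and as written any insert error is silently dropped. A hedged addition using twisted's addErrback so failures at least get logged (_handle_error is a helper name I am introducing here):

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self.insert_into_table, item)
        d.addErrback(self._handle_error, item, spider)  # log instead of swallowing errors
        return item

    def _handle_error(self, failure, item, spider):
        spider.logger.error('MySQL insert failed: %s' % failure)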
Finally, register it in settings.py:
ITEM_PIPELINES = {
    'truespider.pipelines.JsonWithEncodingCnblogsPipeline': 300,
    'truespider.pipelines.MySQLStorePipeline': 400,
}

(The two pipelines should get distinct order values; here the JSON pipeline runs first.)
Result: each scraped posting ends up both as a line in 招聘.json and as a row in the zhaopin table.

