scrapy 爬取腾讯招聘网

来源:互联网 编辑:程序博客网 时间:2024/05/16 18:59
# ==================== main crawler class (Tencent/spiders/tenxun.py) ====================
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from Tencent.items import TencentItem
from scrapy.linkextractors import LinkExtractor


class TenxunSpider(CrawlSpider):
    """Crawl Tencent HR (hr.tencent.com) job listings.

    Follows pagination links and extracts one TencentItem per
    job-detail page via ``paser_item`` (original author's spelling,
    kept because the Rule's ``callback`` string references it).
    """
    name = "tenxun"
    # allowed_domains = ["Tencent.com"]
    start_urls = ['http://hr.tencent.com/position.php']
    # FIX: the original declared ``rules`` as a set literal ``{...}``.
    # CrawlSpider expects an ordered iterable; set iteration order is
    # nondeterministic, so rule priority was unstable. Use a tuple.
    # FIX: regex patterns are now raw strings.
    rules = (
        # Pagination links: keep following.
        Rule(LinkExtractor(allow=r'position\.php',
                           restrict_xpaths="//div[@class='pagenav']"),
             follow=True),
        # Job-detail links: parse, do not follow further.
        Rule(LinkExtractor(allow=r'position_detail\.php',
                           restrict_xpaths="//td[@class='l square']"),
             follow=False, callback="paser_item"),
    )

    def paser_item(self, response):
        """Parse one job-detail page into a TencentItem."""
        item = TencentItem()
        # FIX: Python 2 ``print response.url`` -> print() call
        # (valid on both Python 2.6+ and Python 3).
        print(response.url)
        item['title'] = response.xpath("//tr[@class='h']/td/text()").extract()
        item['workLoction'] = response.xpath("//tr[@class='c bottomline']/td[1]/text()")[0].extract()
        item['person_number'] = response.xpath("//tr[@class='c bottomline']/td[3]/text()").re(r'(\d+)')[0]
        item["duty"] = response.xpath("//tr[@class='c bottomline']/td[2]/text()")[0].extract()
        item['url'] = response.url
        # FIX: the original chained ``.xpath('//li/text()')`` on a row
        # selector. An XPath starting with ``//`` searches the WHOLE
        # document, so requirement and duty both received every <li> on
        # the page. ``'.//li/text()'`` restricts the search to the
        # selected <tr>.
        item["Job_requirement"] = response.xpath("//tr[@class='c']")[1].xpath('.//li/text()').extract()
        item["Job_duty"] = response.xpath("//tr[@class='c']")[0].xpath('.//li/text()').extract()
        yield item


# ==================== items.py ====================
from scrapy import Item, Field


class TencentItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    workLoction = Field()      # work location (original field name kept, incl. typo)
    person_number = Field()    # number of openings
    duty = Field()             # job category
    title = Field()            # job title
    Job_requirement = Field()  # job requirements
    Job_duty = Field()         # job responsibilities
    url = Field()              # page URL


# ==================== pipelines.py ====================
import json
import codecs


class TencentPipeline(object):
    """Write each scraped item as one JSON object per line (UTF-8)."""

    def __init__(self):
        self.file = codecs.open('duty_file.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese text readable in the output file.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        # FIX: ``return item`` (the original article's extraction split
        # this statement across lines). Pipelines must return the item
        # so later pipelines receive it.
        return item

    # FIX: the original named this hook ``close_file``, which Scrapy
    # never calls, so the output file was never explicitly closed or
    # flushed. The Scrapy pipeline hook is ``close_spider``.
    def close_spider(self, spider):
        self.file.close()


# ==================== settings.py additions ====================
ITEM_PIPELINES = {
    "Tencent.pipelines.TencentPipeline": 300,
}
# After that, just run the spider. For an explanation of the code, see my
# previous article (the upgraded version of the Scrapy "xiaohua" crawler).
0 0
原创粉丝点击