Scrapy框架学习
来源:互联网 发布:棉柔巾 知乎 编辑:程序博客网 时间:2024/05/17 06:19
项目步骤
1.分析网页,确定数据爬取规则
2.创建项目
3.创建数据模型Item
4.创建爬虫Spider,进行数据爬取
5.创建Item Pipeline,进行数据处理
6.按需求设置配置文件
源码
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TencentPositionItem(scrapy.Item):
    """Item for one job posting scraped from the Tencent careers site."""

    # Job title
    title = scrapy.Field()
    # Job category
    type = scrapy.Field()
    # Number of openings
    count = scrapy.Field()
    # Work location
    location = scrapy.Field()
    # Publication date
    pubtime = scrapy.Field()
spiders/tencent_position_spider.py
# !/usr/bin/env python
# -*- coding:utf-8 -*-
import scrapy

from myscrapy.items import TencentPositionItem


class PositionSpider(scrapy.Spider):
    """Crawl job postings from hr.tencent.com, 10 positions per page."""

    name = 'tencent_position'
    allowed_domains = ['hr.tencent.com']
    # Paging offset appended to base_url; the site lists 10 rows per page.
    offset = 0
    base_url = 'http://hr.tencent.com/position.php?&start='
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        """Extract every position row on the page, then schedule the next page.

        Yields one TencentPositionItem per row (handed to the Item Pipeline),
        followed by a Request for the next page until offset passes 2680.
        """
        # Position rows alternate between the "even" and "odd" CSS classes.
        positions = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
        for position in positions:
            title = position.xpath('./td[position()=1]/a/text()').extract()[0]
            # NOTE: on page 6 one posting has no category cell, so guard
            # against an empty result instead of indexing blindly.
            # (local renamed from `type` to avoid shadowing the builtin;
            # the item key 'type' is unchanged)
            position_type = position.xpath('./td[position()=2]/text()').extract()
            if len(position_type):
                position_type = position_type[0]
            else:
                position_type = '马化腾二大爷'
            count = position.xpath('./td[position()=3]/text()').extract()[0]
            location = position.xpath('./td[position()=4]/text()').extract()[0]
            pubtime = position.xpath('./td[position()=5]/text()').extract()[0]

            item = TencentPositionItem()
            item['title'] = title
            item['type'] = position_type
            item['count'] = count
            item['location'] = location
            item['pubtime'] = pubtime
            # Hand the item to the configured Item Pipeline.
            yield item

        # If this is not the last page yet, crawl the next one.
        if self.offset <= 2680:
            self.offset += 10
            yield scrapy.Request(url=self.base_url + str(self.offset), callback=self.parse)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TencentPositionPipeline(object):
    """Write every scraped Tencent position to tencentposition.json as one JSON array."""

    def __init__(self):
        # Open in text mode with an explicit encoding. json.dumps returns
        # str, so the previous `data.encode('utf-8') + ',\n'` raised
        # TypeError (bytes + str) on Python 3.
        self.f = open('tencentposition.json', mode='w', encoding='utf-8')
        self.f.write('[')
        # Tracks whether an item has been written yet so commas only appear
        # *between* items — a trailing comma before ']' is invalid JSON.
        self._first = True

    def process_item(self, item, spider):
        """Serialize one item and append it to the JSON array file.

        Returns the item unchanged so later pipelines can process it.
        """
        data = json.dumps(dict(item), ensure_ascii=False, indent=4)
        if self._first:
            self._first = False
        else:
            self.f.write(',\n')
        self.f.write(data)
        return item

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.f.write(']')
        self.f.close()
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the myscrapy project.
#
# For simplicity only the commonly used settings appear here; the full
# reference lives at:
#   http://doc.scrapy.org/en/latest/topics/settings.html
#   http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#   http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myscrapy'

SPIDER_MODULES = ['myscrapy.spiders']
NEWSPIDER_MODULE = 'myscrapy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myscrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules (left disabled for this project)
# ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Delay (seconds) between requests to the same website; see
# http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# and the autothrottle settings below.
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers with a desktop-browser user agent:
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    # 'User-Agent':'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
}

# Spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myscrapy.middlewares.MyscrapySpiderMiddleware': 543,
#}

# Downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myscrapy.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines: route every item through TencentPositionPipeline.
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'myscrapy.pipelines.MyPipeline': 300,
    'myscrapy.pipelines.TencentPositionPipeline': 300,
}

# AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
运行结果展示
阅读全文