Scrapy框架学习
来源:互联网 发布:棉柔巾 知乎 编辑:程序博客网 时间:2024/05/17 06:19
项目步骤
1.分析网页,确定数据爬取规则
2.创建项目
3.创建数据模型Item
4.创建爬虫Spider,进行数据爬取
5.创建Item Pipeline,进行数据处理
6.按需求设置配置文件
源码
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TencentPositionItem(scrapy.Item):
    """Item for one job posting scraped from the Tencent careers site."""

    # Job title
    title = scrapy.Field()
    # Job category
    type = scrapy.Field()
    # Number of openings
    count = scrapy.Field()
    # Work location
    location = scrapy.Field()
    # Publication date
    pubtime = scrapy.Field()
spiders/tencent_position_spider.py
# !/usr/bin/env python
# -*- coding:utf-8 -*-
import scrapy

from myscrapy.items import TencentPositionItem


class PositionSpider(scrapy.Spider):
    """Crawl job postings from hr.tencent.com, 10 positions per page."""

    name = 'tencent_position'
    allowed_domains = ['hr.tencent.com']
    # Paging offset appended to base_url; the site lists 10 rows per page.
    offset = 0
    base_url = 'http://hr.tencent.com/position.php?&start='
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        """Extract every position row on the page, then schedule the next page.

        Yields one TencentPositionItem per row (handed to the Item Pipeline),
        followed by a Request for the next page until offset passes 2680.
        """
        # Position rows alternate between the "even" and "odd" CSS classes.
        positions = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
        for position in positions:
            title = position.xpath('./td[position()=1]/a/text()').extract()[0]
            # NOTE: on page 6 one posting has no category cell, so guard
            # against an empty result instead of indexing blindly.
            # (local renamed from `type` to avoid shadowing the builtin;
            # the item key 'type' is unchanged)
            position_type = position.xpath('./td[position()=2]/text()').extract()
            if len(position_type):
                position_type = position_type[0]
            else:
                position_type = '马化腾二大爷'
            count = position.xpath('./td[position()=3]/text()').extract()[0]
            location = position.xpath('./td[position()=4]/text()').extract()[0]
            pubtime = position.xpath('./td[position()=5]/text()').extract()[0]

            item = TencentPositionItem()
            item['title'] = title
            item['type'] = position_type
            item['count'] = count
            item['location'] = location
            item['pubtime'] = pubtime
            # Hand the item to the configured Item Pipeline.
            yield item

        # If this is not the last page yet, crawl the next one.
        if self.offset <= 2680:
            self.offset += 10
            yield scrapy.Request(url=self.base_url + str(self.offset), callback=self.parse)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TencentPositionPipeline(object):
    """Write every scraped Tencent position to tencentposition.json as one JSON array."""

    def __init__(self):
        # Open in text mode with an explicit encoding. json.dumps returns
        # str, so the previous `data.encode('utf-8') + ',\n'` raised
        # TypeError (bytes + str) on Python 3.
        self.f = open('tencentposition.json', mode='w', encoding='utf-8')
        self.f.write('[')
        # Tracks whether an item has been written yet so commas only appear
        # *between* items — a trailing comma before ']' is invalid JSON.
        self._first = True

    def process_item(self, item, spider):
        """Serialize one item and append it to the JSON array file.

        Returns the item unchanged so later pipelines can process it.
        """
        data = json.dumps(dict(item), ensure_ascii=False, indent=4)
        if self._first:
            self._first = False
        else:
            self.f.write(',\n')
        self.f.write(data)
        return item

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.f.write(']')
        self.f.close()
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the myscrapy project.
#
# For simplicity only the commonly used settings appear here; the full
# reference lives at:
#   http://doc.scrapy.org/en/latest/topics/settings.html
#   http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#   http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myscrapy'

SPIDER_MODULES = ['myscrapy.spiders']
NEWSPIDER_MODULE = 'myscrapy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myscrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules (left disabled for this project)
# ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Delay (seconds) between requests to the same website; see
# http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# and the autothrottle settings below.
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers with a desktop-browser user agent:
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    # 'User-Agent':'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
}

# Spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myscrapy.middlewares.MyscrapySpiderMiddleware': 543,
#}

# Downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myscrapy.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines: route every item through TencentPositionPipeline.
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'myscrapy.pipelines.MyPipeline': 300,
    'myscrapy.pipelines.TencentPositionPipeline': 300,
}

# AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
运行结果展示
阅读全文