Learning the Scrapy Framework


Project Steps

1. Analyze the target pages and determine the scraping rules

2. Create the project (see the command sketch after this list)

3. Create the data model (Item)

4. Create the spider (Spider) to crawl the data

5. Create the Item Pipeline to process the scraped data

6. Configure the settings file as needed
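
Steps 2 and 4 map onto Scrapy's command-line tool. A typical sequence would look like the sketch below; the project name myscrapy matches the settings file further down, and scrapy genspider only generates a stub, so the spider file can just as well be written by hand. The last command runs the finished spider once all the files below are in place.

scrapy startproject myscrapy
cd myscrapy
scrapy genspider tencent_position hr.tencent.com
scrapy crawl tencent_position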


Source Code

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentPositionItem(scrapy.Item):
    """Item for the Tencent recruitment spider"""
    # Position title
    title = scrapy.Field()
    # Position category
    type = scrapy.Field()
    # Number of openings
    count = scrapy.Field()
    # Work location
    location = scrapy.Field()
    # Publication date
    pubtime = scrapy.Field()
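
The pipeline below calls dict(item), which works because scrapy.Item instances support dict-style access and conversion for their declared fields. A minimal sketch, with made-up field values purely for illustration:

from myscrapy.items import TencentPositionItem

item = TencentPositionItem()
item['title'] = 'Backend Engineer'  # hypothetical value
item['count'] = '2'                 # hypothetical value
print(item['title'])                # -> Backend Engineer
print(dict(item))                   # -> {'title': 'Backend Engineer', 'count': '2'}

Assigning to a field that was not declared on the Item (say, item['salary']) raises a KeyError, which is the main advantage of an Item over a plain dict.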

spiders/tencent_position_spider.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import scrapy

from myscrapy.items import TencentPositionItem


class PositionSpider(scrapy.Spider):
    name = 'tencent_position'
    allowed_domains = ['hr.tencent.com']
    offset = 0
    base_url = 'http://hr.tencent.com/position.php?&start='
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        # All position rows on the current page
        positions = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
        for position in positions:
            title = position.xpath('./td[position()=1]/a/text()').extract()[0]
            type = position.xpath('./td[position()=2]/text()').extract()
            # Annoyingly, one listing (around page 6) has no category,
            # so fall back to a placeholder instead of raising IndexError
            if len(type):
                type = type[0]
            else:
                type = 'unknown'
            count = position.xpath('./td[position()=3]/text()').extract()[0]
            location = position.xpath('./td[position()=4]/text()').extract()[0]
            pubtime = position.xpath('./td[position()=5]/text()').extract()[0]

            item = TencentPositionItem()
            item['title'] = title
            item['type'] = type
            item['count'] = count
            item['location'] = location
            item['pubtime'] = pubtime
            # Hand the item to the pipeline for processing
            yield item

        # If this is not the last page, crawl the next one
        if self.offset <= 2680:
            self.offset += 10
            yield scrapy.Request(url=self.base_url + str(self.offset), callback=self.parse)
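
The if/else workaround for the missing category can be generalized: Scrapy's SelectorList.extract_first() accepts a default, so every field tolerates a missing cell. A sketch of the extraction loop rewritten that way, using the same XPaths as above (the pagination logic is unchanged and omitted here); this is an alternative, not how the original post does it:

    def parse(self, response):
        for position in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
            item = TencentPositionItem()
            # extract_first() returns the default instead of raising IndexError
            item['title'] = position.xpath('./td[position()=1]/a/text()').extract_first(default='')
            item['type'] = position.xpath('./td[position()=2]/text()').extract_first(default='unknown')
            item['count'] = position.xpath('./td[position()=3]/text()').extract_first(default='')
            item['location'] = position.xpath('./td[position()=4]/text()').extract_first(default='')
            item['pubtime'] = position.xpath('./td[position()=5]/text()').extract_first(default='')
            yield item
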
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class TencentPositionPipeline(object):
    """Item Pipeline for the Tencent recruitment spider"""

    def __init__(self):
        self.f = open('tencentposition.json', mode='w', encoding='utf-8')
        self.f.write('[')
        self.first = True

    def process_item(self, item, spider):
        data = json.dumps(dict(item), ensure_ascii=False, indent=4)
        # Write a separating comma before every entry except the first,
        # so the finished file is valid JSON
        if self.first:
            self.first = False
        else:
            self.f.write(',\n')
        self.f.write(data)
        return item

    def close_spider(self, spider):
        self.f.write(']')
        self.f.close()
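
Scrapy also ships a JsonItemExporter that manages the opening bracket, the separating commas, and the closing bracket itself. An equivalent pipeline could be sketched as follows (same output filename assumed; the exporter writes bytes, and the extra keyword arguments are forwarded to the JSON encoder, so details may vary slightly across Scrapy versions):

from scrapy.exporters import JsonItemExporter


class TencentPositionExportPipeline(object):
    """Variant of the pipeline above, built on Scrapy's JsonItemExporter."""

    def open_spider(self, spider):
        # The exporter writes bytes, so open the file in binary mode
        self.f = open('tencentposition.json', 'wb')
        self.exporter = JsonItemExporter(self.f, ensure_ascii=False, indent=4)
        self.exporter.start_exporting()   # writes the opening '['

    def process_item(self, item, spider):
        self.exporter.export_item(item)   # handles the separating commas
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()  # writes the closing ']'
        self.f.close()

For a one-off export there is an even shorter route with no pipeline at all: scrapy crawl tencent_position -o output.json writes the items through Scrapy's feed exports.
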

settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for myscrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myscrapy'

SPIDER_MODULES = ['myscrapy.spiders']
NEWSPIDER_MODULE = 'myscrapy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myscrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    # 'User-Agent': 'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myscrapy.middlewares.MyscrapySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myscrapy.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'myscrapy.pipelines.MyPipeline': 300,
    'myscrapy.pipelines.TencentPositionPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
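
Project-wide settings such as DOWNLOAD_DELAY can also be scoped to a single spider through the custom_settings class attribute, which overrides settings.py while that spider runs. A minimal sketch:

import scrapy


class PositionSpider(scrapy.Spider):
    name = 'tencent_position'
    # Applies only while this spider runs, overriding settings.py
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
    }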

Run Results

[Screenshot of the crawl output from the original post]
