A Scrapy Framework Usage Example


First, run the following command to create a Scrapy project:

scrapy startproject projectName
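
For reference, assuming the project name used in the rest of this post (crawlHexunRenwu), the generated skeleton looks roughly like this; the spider file crawl.py is added by hand later:

crawlHexunRenwu/
    scrapy.cfg
    crawlHexunRenwu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            crawl.py        (the spider, created by hand in step 2)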

The project has the following core files:

items.py: in the project package directory (crawlHexunRenwu/crawlHexunRenwu/)

middlewares.py: in the project package directory

pipelines.py: in the project package directory

projectName.py: the spider itself, in the spiders directory

settings.py: in the project package directory

My example crawls people profiles from Hexun (和讯人物); a sample page: http://renwu.hexun.com/figure_2789.shtml

The goal of the project: starting from a few seed URLs, fetch the HTML source, extract more URLs of the same kind from each page, and keep crawling iteratively.

My project is named crawlHexunRenwu.

First, the contents of items.py:

import scrapy


class CrawlhexunrenwuItem(scrapy.Item):
    # define the fields for your item here like:
    filename = scrapy.Field()
    html_content = scrapy.Field()

This file defines the names of the fields that will be extracted from the HTML.
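
As a quick illustration (not part of the project files), a declared Item behaves much like a dict, except that only the declared fields can be assigned:

from crawlHexunRenwu.items import CrawlhexunrenwuItem

item = CrawlhexunrenwuItem()
item['filename'] = 'figure_2789.shtml'
item['html_content'] = b'<html>...</html>'
print(dict(item))
# assigning an undeclared key such as item['title'] raises KeyError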


Second, the spider file (projectName.py) under the spiders directory; I named mine crawl.py:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from crawlHexunRenwu.items import CrawlhexunrenwuItem


class CrawlHexun(CrawlSpider):
    name = 'crawl_hexun'
    allowed_domains = ['renwu.hexun.com']
    start_urls = [  # seed URLs to start crawling from
        "http://renwu.hexun.com/figure_2606.shtml",
        "http://renwu.hexun.com/figure_6782.shtml",
        "http://renwu.hexun.com/figure_4679.shtml",
        "http://renwu.hexun.com/figure_1001.shtml"
    ]
    rules = [  # rules for extracting further URLs from each page, used for iterative crawling
        Rule(LinkExtractor(allow=(),
                           restrict_xpaths=('//a[contains(@href,"figure_")]')),
             callback='parse_item',
             follow=True)
    ]

    def parse_start_url(self, response):
        # handles the URLs listed in start_urls above; essentially the same as parse_item below
        item = CrawlhexunrenwuItem()
        item['html_content'] = response.body
        item['filename'] = response.url.split("/")[-1]
        return item  # write the extracted data into the item and return it

    def parse_item(self, response):
        # handles every URL discovered through the rules, i.e. everything except the seeds
        item = CrawlhexunrenwuItem()
        item['html_content'] = response.body
        item['filename'] = response.url.split("/")[-1]
        return item  # write the extracted data into the item and return it

This file defines the seed URLs, extracts the fields from each response, extracts the follow-up URLs, and returns the extracted data.
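
Incidentally, the same follow-up rule can also be expressed with an allow pattern instead of restrict_xpaths; a minimal sketch, assuming all profile pages follow the figure_<id>.shtml scheme seen above:

rules = [
    Rule(LinkExtractor(allow=(r'figure_\d+\.shtml',)),
         callback='parse_item',
         follow=True)
]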


Third, pipelines.py:

class CrawlhexunrenwuPipeline(object):
    # write the raw HTML of every crawled page to disk, one file per page
    def process_item(self, item, spider):
        content = item['html_content']
        filename = item['filename']
        with open("/search/hexunrenwu/" + filename, 'wb') as f:
            f.write(content)
        return item

The items returned by parse_start_url and parse_item in step 2 end up here, so this is the place to write the crawled content to files.
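
If you want something a bit more defensive than the version above, a sketch like the following creates the output directory when the spider starts and reads the path from a hypothetical OUTPUT_DIR setting (both the setting name and the fallback path are illustrative):

import os

class CrawlhexunrenwuPipeline(object):
    def open_spider(self, spider):
        # OUTPUT_DIR is a made-up setting name; falls back to the hard-coded path used above
        self.out_dir = spider.settings.get('OUTPUT_DIR', '/search/hexunrenwu')
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

    def process_item(self, item, spider):
        path = os.path.join(self.out_dir, item['filename'])
        with open(path, 'wb') as f:
            f.write(item['html_content'])
        return item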


Fourth, the contents of middlewares.py:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random

from scrapy import signals


class CrawlhexunrenwuSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ProxyMiddleWare(object):
    lst_https_proxy = [
        'https://xxxxx1:9090',
        'https://xxxxx2:9090',
        'https://xxxxx3:9090',
        'https://xxxxx4:9090',
        'https://xxxxx5:9090',
        'https://xxxxx6:9090'
    ]
    lst_http_proxy = [
        'http://ttttt1:8080',
        'http://ttttt2:8080',
        'http://ttttt3:8080',
        'http://ttttt4:8080',
        'http://ttttt5:8080',
    ]

    def random_select_proxy(self):
        # pick a random proxy from the combined HTTP + HTTPS pool
        len_all = len(ProxyMiddleWare.lst_https_proxy) + len(ProxyMiddleWare.lst_http_proxy)
        idx = int(random.random() * len_all)
        if idx < len(ProxyMiddleWare.lst_https_proxy):
            return ProxyMiddleWare.lst_https_proxy[idx]
        else:
            return ProxyMiddleWare.lst_http_proxy[idx - len(ProxyMiddleWare.lst_https_proxy)]

    def random_select_https_proxy(self):
        # pick a random proxy from the HTTPS pool only
        idx = int(random.random() * len(ProxyMiddleWare.lst_https_proxy))
        return ProxyMiddleWare.lst_https_proxy[idx]

    # Override this method to modify outgoing requests; here it assigns a proxy,
    # because crawling everything from a single machine makes it easy to get banned.
    # If you do not need proxies, you can simply leave this class out.
    def process_request(self, request, spider):
        if request.url.find('https') == 0:
            request.meta['proxy'] = self.random_select_https_proxy()
        else:
            request.meta['proxy'] = self.random_select_proxy()


class RotateUserAgentMiddleware(object):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    # Override this method to set a random User-Agent on each request
    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
The heart of this file is two classes, ProxyMiddleWare and RotateUserAgentMiddleware:

ProxyMiddleWare: assigns a proxy to every outgoing request.
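
As a side note, the two hand-rolled selection helpers in ProxyMiddleWare can be written more compactly with random.choice; a sketch of equivalent methods:

def random_select_proxy(self):
    # choose uniformly from the combined HTTP + HTTPS pool
    return random.choice(self.lst_https_proxy + self.lst_http_proxy)

def random_select_https_proxy(self):
    # choose uniformly from the HTTPS pool only
    return random.choice(self.lst_https_proxy)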

RotateUserAgentMiddleware: sets the User-Agent header on every outgoing request.

In other words, the point of this file is to preprocess requests before they are sent.

To actually use these two classes, though, they have to be enabled in the settings, which is what the next step does.


Fifth, settings.py:


# -*- coding: utf-8 -*-

# Scrapy settings for crawlHexunRenwu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'crawlHexunRenwu'

SPIDER_MODULES = ['crawlHexunRenwu.spiders']
NEWSPIDER_MODULE = 'crawlHexunRenwu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'crawlHexunRenwu (+http://www.yourdomain.com)'

# Whether to obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default); keeps sites from using cookies to spot the crawler
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'crawlHexunRenwu.middlewares.CrawlhexunrenwuSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# Enable the two custom downloader middlewares defined in middlewares.py
DOWNLOADER_MIDDLEWARES = {
    'crawlHexunRenwu.middlewares.ProxyMiddleWare': 100,
    'crawlHexunRenwu.middlewares.RotateUserAgentMiddleware': 101
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Activate the custom pipeline component
ITEM_PIPELINES = {
    'crawlHexunRenwu.pipelines.CrawlhexunrenwuPipeline': 300,
}

# Download timeout in seconds
DOWNLOAD_TIMEOUT = 15

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Default duplicate-request filter; you can subclass RFPDupeFilter and
# override its request_fingerprint method to customize it
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'

# Scrapy stores requests in LIFO queues by default, i.e. it crawls depth-first.
# The settings below switch the scheduler to FIFO queues for breadth-first crawling.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
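
The settings above mention that the duplicate filter can be customized by subclassing RFPDupeFilter and overriding request_fingerprint. For the Scrapy versions this tutorial targets, a minimal sketch could look like this (NoQueryDupeFilter is a made-up name; it simply ignores query strings when deciding whether two URLs are the same page):

from scrapy.dupefilters import RFPDupeFilter
from scrapy.utils.request import request_fingerprint

class NoQueryDupeFilter(RFPDupeFilter):
    def request_fingerprint(self, request):
        # strip the query string so figure_2789.shtml?from=a and
        # figure_2789.shtml?from=b are treated as duplicates
        stripped = request.replace(url=request.url.split('?')[0])
        return request_fingerprint(stripped)

To use it, point DUPEFILTER_CLASS at wherever you put the class, for example 'crawlHexunRenwu.dupefilters.NoQueryDupeFilter' (an illustrative module path).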

And that's it.

Now start the crawler (the name passed to scrapy crawl is the spider's name attribute, crawl_hexun, not the file name):

# scrapy crawl crawl_hexun
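
If you would rather launch it from a Python script than from the shell, something along these lines should also work (run it from the project root so the project settings can be found):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load settings.py and run the spider registered under the name 'crawl_hexun'
process = CrawlerProcess(get_project_settings())
process.crawl('crawl_hexun')
process.start()  # blocks until the crawl finishes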
