七月算法课程《python爬虫》第六课: scrapy爬虫整体示例

来源：互联网　发布：单片机 usb 虚拟串口　编辑：程序博客网　时间：2024/06/04 19:39

两个例子，爬豆瓣文本例程 douban 和图片例程 douban_imgs 。

例程1： douban

目录树

douban
--douban
  --spiders
    --__init__.py
    --bookspider.py
    --douban_comment_spider.py
    --doumailspider.py
  --__init__.py
  --items.py
  --pipelines.py
  --settings.py
--scrapy.cfg

spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

bookspider.py

# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from douban.items import DoubanBookItem


class BookSpider(scrapy.Spider):
    """Crawl the Douban book "top250" list and yield one item per book row."""

    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250'
    ]

    def parse(self, response):
        """Handle the start page: emit its books, then follow the paginator.

        The start page is parsed directly from the response already in hand
        instead of being requested a second time.
        """
        # First page: reuse the response we already downloaded.
        for book in self.parse_next(response):
            yield book

        # Remaining pages: follow every link found in the paginator.
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = page.xpath('@href').extract_first()
            if link:
                # urljoin is a no-op for absolute links and fixes relative ones.
                yield scrapy.Request(response.urljoin(link),
                                     callback=self.parse_next)

    def parse_next(self, response):
        """Yield a ``DoubanBookItem`` for every ``<tr class="item">`` row.

        A missing node yields an empty string for that field rather than
        raising ``IndexError`` and losing the rest of the page.
        """
        for row in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = row.xpath(
                'td[2]/div[1]/a/@title').extract_first(default='')
            book['content'] = row.xpath(
                'td[2]/p/text()').extract_first(default='')
            book['ratings'] = row.xpath(
                'td[2]/div[2]/span[2]/text()').extract_first(default='')
            yield book

douban_comment_spider.py

# -*- coding:utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem

try:  # Python 3
    from urllib import parse as urlparse
except ImportError:  # Python 2
    import urlparse

try:  # Python 2
    read_input = raw_input
except NameError:  # Python 3
    read_input = input

f = Factory.create()


class MailSpider(scrapy.Spider):
    """Log in to Douban, then crawl the reviews of one movie subject.

    Flow: login page -> form submit -> review listing pages -> one request
    per review -> one ``DoubanMovieCommentItem`` per review page.
    """

    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        """Start from the login page, tagging the session with cookiejar 1."""
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        """Submit the login form, asking the operator to solve any captcha."""
        # Work on a copy so the class-level template is never mutated.
        formdata = dict(self.formdata)

        # response.body is bytes, so the marker must be bytes as well.
        if b'captcha_image' in response.body:
            # A captcha has to be solved by hand.
            print('Copy the link:')
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print(link)
            captcha_solution = read_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            formdata['captcha-solution'] = captcha_solution
            formdata['captcha-id'] = captcha_id

        return [scrapy.FormRequest.from_response(response,
                                                 formdata=formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        """Request the first review listing page once logged in."""
        print(response.status)
        # NOTE(review): the requests below go to movie.douban.com while the
        # Host header says www.douban.com -- confirm this is intended.
        self.headers['Host'] = "www.douban.com"
        # A single request is enough: parse_next_page both extracts the review
        # links of the page and follows the "next" link.
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # bypass the duplicate filter

    def parse_next_page(self, response):
        """Process one listing page and schedule the following one, if any."""
        # Extract the review links from this page (also prints the status).
        for request in self.parse_comment_url(response):
            yield request

        next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
        if next_href is None:
            # No "next" link: last page reached (or unexpected markup).
            print("Next page Error")
            return

        next_url = response.urljoin(next_href)
        print("下一页")
        print(next_url)
        yield scrapy.Request(url=next_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)

    def parse_comment_url(self, response):
        """Yield one request per review listed on the page."""
        print(response.status)
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract_first()
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract_first(default='')
            if not comment_url:
                # Entry without a link: nothing to follow.
                continue
            print(comment_title)
            print(comment_url)
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        """Build a ``DoubanMovieCommentItem`` from a single review page."""
        print(response.status)
        for item in response.xpath('//div[@id="content"]'):

            def first(query):
                # XPaths are relative to ``item`` (note the leading dot); a
                # missing node gives '' instead of raising IndexError.
                return item.xpath(query).extract_first(default='')

            def joined(query):
                # Join all stripped text fragments with the original marker.
                return "\t#####\t".join(
                    text.strip() for text in item.xpath(query).extract())

            comment = DoubanMovieCommentItem()
            comment['useful_num'] = first('.//div[@class="main-panel-useful"]/button[1]/text()').strip()
            comment['no_help_num'] = first('.//div[@class="main-panel-useful"]/button[2]/text()').strip()
            comment['people'] = first('.//span[@property="v:reviewer"]/text()')
            comment['people_url'] = first('.//header[@class="main-hd"]/a[1]/@href')
            comment['star'] = first('.//header[@class="main-hd"]/span[1]/@title')

            data_type = first('.//div[@id="link-report"]/div/@data-original')
            print("data_type: " + data_type)
            if data_type == '0':
                comment['comment'] = joined('.//div[@id="link-report"]/div/p/text()')
            elif data_type == '1':
                comment['comment'] = joined('.//div[@id="link-report"]/div[1]/text()')
            else:
                # Unknown layout: keep the field present but empty.
                comment['comment'] = ''

            comment['title'] = first('.//span[@property="v:summary"]/text()')
            comment['comment_page_url'] = response.url
            yield comment

doumailspider.py

# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from faker import Factory
from douban.items import DoubanMailItem

try:  # Python 3
    from urllib import parse as urlparse
except ImportError:  # Python 2
    import urlparse

try:  # Python 2
    read_input = raw_input
except NameError:  # Python 3
    read_input = input

f = Factory.create()


class MailSpider(scrapy.Spider):
    """Log in to Douban and list the messages of the doumail inbox."""

    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        """Start from the login page, tagging the session with cookiejar 1."""
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        """Submit the login form, asking the operator to solve any captcha."""
        # Work on a copy so the class-level template is never mutated.
        formdata = dict(self.formdata)

        # response.body is bytes, so the marker must be bytes as well.
        if b'captcha_image' in response.body:
            # A captcha has to be solved by hand.
            print('Copy the link:')
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print(link)
            captcha_solution = read_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            formdata['captcha-solution'] = captcha_solution
            formdata['captcha-id'] = captcha_id

        return [scrapy.FormRequest.from_response(response,
                                                 formdata=formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        """Request the doumail inbox once logged in."""
        print(response.status)
        self.headers['Host'] = "www.douban.com"
        return scrapy.Request(url='https://www.douban.com/doumail/',
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_mail)

    def parse_mail(self, response):
        """Yield a ``DoubanMailItem`` for every entry of the doumail list.

        A missing node yields an empty string for that field rather than
        raising ``IndexError`` and losing the remaining entries.
        """
        print(response.status)
        for entry in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = entry.xpath('div[2]/div/span[1]/text()').extract_first(default='')
            mail['sender_from'] = entry.xpath('div[2]/div/span[2]/text()').extract_first(default='')
            mail['url'] = entry.xpath('div[2]/p/a/@href').extract_first(default='')
            mail['title'] = entry.xpath('div[2]/p/a/text()').extract_first(default='')
            print(mail)
            yield mail

__init__.py

(此文件内无代码)

items.py

# -*- coding: utf-8 -*-
import scrapy


class DoubanBookItem(scrapy.Item):
    """One book scraped by the ``douban-book`` spider."""

    name = scrapy.Field()            # book title
    price = scrapy.Field()           # price
    edition_year = scrapy.Field()    # publication year
    publisher = scrapy.Field()       # publisher
    ratings = scrapy.Field()         # rating
    author = scrapy.Field()          # author
    content = scrapy.Field()         # raw " / "-separated info line


class DoubanMailItem(scrapy.Item):
    """One doumail entry scraped by the ``douban-mail`` spider."""

    sender_time = scrapy.Field()     # time the message was sent
    sender_from = scrapy.Field()     # sender
    url = scrapy.Field()             # URL of the doumail detail page
    title = scrapy.Field()           # doumail title


class DoubanMovieCommentItem(scrapy.Item):
    """One movie review scraped by the ``douban-comment`` spider."""

    useful_num = scrapy.Field()      # how many people found the review useful
    no_help_num = scrapy.Field()     # how many people found the review useless
    people = scrapy.Field()          # reviewer
    people_url = scrapy.Field()      # reviewer's page
    star = scrapy.Field()            # star rating
    comment = scrapy.Field()         # review text
    title = scrapy.Field()           # review title
    comment_page_url = scrapy.Field()  # URL of the review page

pipelines.py

# -*- coding: utf-8 -*-


class DoubanBookPipeline(object):
    """Split a book's raw ``content`` line into price, year and publisher."""

    def process_item(self, item, spider):
        """Fill ``price``, ``edition_year`` and ``publisher`` from ``content``.

        ``content`` is expected to be " / "-separated with the last three
        parts being publisher, year and price, e.g.
        "[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元".
        Items whose ``content`` is missing or has fewer than three parts are
        returned unchanged instead of raising ``IndexError``.
        """
        raw = item.get('content') or ''
        info = [part.strip() for part in raw.split(' / ')]
        if len(info) >= 3:
            item['price'] = info[-1]
            item['edition_year'] = info[-2]
            item['publisher'] = info[-3]
        return item


class DoubanMailPipeline(object):
    """Normalise the title of a doumail item."""

    def process_item(self, item, spider):
        """Remove spaces and line breaks from ``title`` when it is present."""
        title = item.get('title')
        if title is not None:
            # '\\n' (a literal backslash followed by "n") is what the original
            # code removed; real CR/LF characters are stripped as well.
            for junk in (' ', '\n', '\r', '\\n'):
                title = title.replace(junk, '')
            item['title'] = title
        return item


class DoubanMovieCommentPipeline(object):
    """Pass movie review items through untouched."""

    def process_item(self, item, spider):
        """Return ``item`` as is."""
        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'book.douban.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'douban.pipelines.DoubanBookPipeline': 300,
    #'douban.pipelines.DoubanMailPipeline': 600,
    'douban.pipelines.DoubanMovieCommentPipeline': 900,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban.settings

[deploy]
#url = http://localhost:6800/
project = douban

例程2： douban_imgs

目录树

douban_imgs
--douban_imgs
  --spiders
    --__init__.py
    --download_douban.py
  --__init__.py
  --items.py
  --pipelines.py
  --run_spider.py
  --settings.py
--scrapy.cfg

spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

download_douban.py

# coding=utf-8
from scrapy.spiders import Spider
import re
from scrapy import Request
from douban_imgs.items import DoubanImgsItem


class download_douban(Spider):
    """Collect the image URLs shown on one Douban photo album page."""

    name = 'download_douban'

    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        """Build the start URL from an album id.

        Args:
            url: album id inserted into the album URL; despite its name it
                is not a full URL.
            *args, **kwargs: forwarded to ``Spider.__init__``.
        """
        # Run the base initialiser so extra spider arguments are honoured.
        super(download_douban, self).__init__(*args, **kwargs)
        self.allowed_domains = ['douban.com']
        self.start_urls = [
            'http://www.douban.com/photos/album/%s/' % (url)]
        self.url = url

    def start_requests(self):
        """Request every start URL with the browser-like default headers."""
        for start_url in self.start_urls:
            yield Request(url=start_url,
                          headers=self.default_headers,
                          callback=self.parse)

    def parse(self, response):
        """Yield one item carrying all image URLs found in the photo list."""
        list_imgs = response.xpath(
            '//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item

__init__.py

(此文件内无代码)

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Item, Field


class DoubanImgsItem(scrapy.Item):
    """Item carrying the image URLs of an album and the download results."""

    image_urls = Field()   # image URLs collected by the spider
    images = Field()
    image_paths = Field()  # paths of downloaded images, set by the pipeline

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
# NOTE(review): ``log`` is not used in this module -- confirm that
# ``scrapy.log`` still exists in the installed Scrapy version.
from scrapy import log


class DoubanImgsPipeline(object):
    """Pass items through untouched."""

    def process_item(self, item, spider):
        """Return ``item`` as is."""
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    """Download every URL in ``image_urls`` and record the stored paths."""

    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        """Yield one download request per image URL of ``item``.

        Each request gets its own header dict with ``referer`` set to the
        image URL, so the shared class-level template is never mutated.
        """
        for image_url in item['image_urls']:
            headers = dict(self.default_headers, referer=image_url)
            yield Request(image_url, headers=headers)

    def item_completed(self, results, item, info):
        """Store the paths of successful downloads in ``image_paths``.

        Raises:
            DropItem: if no image of the item was downloaded successfully.
        """
        image_paths = [result['path'] for ok, result in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

run_spider.py

"""Convenience launcher: run the ``download_douban`` spider from Python."""
from scrapy import cmdline

cmd_str = 'scrapy crawl download_douban'

# Only start the crawl when executed as a script, not when imported.
if __name__ == '__main__':
    cmdline.execute(cmd_str.split(' '))

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for douban_imgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban_imgs'

SPIDER_MODULES = ['douban_imgs.spiders']
NEWSPIDER_MODULE = 'douban_imgs.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
# COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
}

IMAGES_STORE = 'D:\\doubanimgs'
#IMAGES_STORE = '/tmp'

IMAGES_EXPIRES = 90

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'

scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban_imgs.settings

[deploy]
#url = http://localhost:6800/
project = douban_imgs

0 0