使用scrapy抓取糗事百科

来源:互联网 发布:厦门软件行业发展前景 编辑:程序博客网 时间:2024/05/29 05:02

先 scrapy startproject tut01(Scrapy 的命令是 startproject,不是 startapp)

scrapy genspider qsbk www.qiushibaike.com

qsbk.py

# -*- coding: utf-8 -*-
"""Qiushibaike (qiushibaike.com) spider.

Extracts author, content and click count from every story block on a
page, then follows the "next page" link until no such link remains.
"""
import logging

import scrapy

# Site root, used to absolutize relative "next page" hrefs.
QSBK_HOST = u"http://www.qiushibaike.com"
# Number of pages crawled so far (module-level counter shared by the spider).
COUNT = 0

_logger = logging.getLogger(__name__)


def debug(msg):
    """Log *msg* at DEBUG level.

    The original round-tripped the message through a UTF-8 decode /
    GB2312 encode via the removed ``scrapy.log`` API; standard ``logging``
    handles Unicode text directly and works on current Scrapy versions.
    """
    _logger.debug(msg)


class QBItem(scrapy.Item):
    """One Qiushibaike story: author, text content and click counter."""
    author = scrapy.Field()
    content = scrapy.Field()
    ctr = scrapy.Field()

    def __str__(self):
        # No GB2312 re-encoding here: interpolating encoded bytes into a
        # text template breaks on Python 3 and raises for characters
        # outside the GB2312 repertoire.
        return "%s:%s %s:%s %s:%s" % (
            'author', self['author'],
            'content', self['content'],
            'clickcount', self['ctr'],
        )


class QsbkSpider(scrapy.Spider):
    """Qiushibaike crawler.

    NOTE: the site rejects Scrapy's default user agent, so USER_AGENT
    (and optionally DEFAULT_REQUEST_HEADERS) must be customized in the
    project settings.
    """
    name = "qsbk"
    allowed_domains = ["www.qiushibaike.com"]
    start_urls = ('http://www.qiushibaike.com/',)

    def read_QBItems(self, response):
        """Yield a QBItem per story block, then a Request for the next page."""
        global COUNT
        COUNT += 1
        debug(u"正在爬取第{}页".format(COUNT))
        css_block = "div.article.block"
        css_author = "div.author h2::text"
        css_content = "div.content::text"
        css_ctr = "div > span > i::text"
        for b in response.selector.css(css_block):
            try:
                qb = QBItem()
                qb['author'] = b.css(css_author).extract()[0]
                qb['content'] = b.css(css_content).extract()[0]
                qb['ctr'] = b.css(css_ctr).extract()[0]
                # Hand the item to the pipeline / feed exporter.
                yield qb
            except IndexError:
                # A block missing any of the three fields is skipped; it is
                # not fatal for the rest of the page. (Was a Python-2-only
                # ``except Exception, e`` that swallowed everything.)
                debug(u"抓取失败{}".format(response.url))
        # Follow the "next page" link if present. The original indexed
        # extract()[0] unconditionally, which raises IndexError on the
        # last page — guard against an empty match instead.
        css_next = "div.pageto a.next::attr(href)"
        matches = response.selector.css(css_next).extract()
        npage = matches[0] if matches else None
        if npage and npage.startswith(u'/'):
            yield scrapy.Request(QSBK_HOST + npage, self.read_QBItems)

    def parse(self, response):
        """Entry point: delegate to read_QBItems, shared with follow-up pages."""
        global COUNT
        if not COUNT:
            debug(u"准备开始解析数据...")
        return self.read_QBItems(response)


tut01/settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for the tut01 project.
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'tut01'

SPIDER_MODULES = ['tut01.spiders']
NEWSPIDER_MODULE = 'tut01.spiders'

# Crawl responsibly by identifying yourself (and your website) on the
# user-agent. QSBK refuses requests carrying Scrapy's default user agent,
# so a real browser UA string is sent instead.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'

# Kept for reference: extra request headers that can be re-enabled if the
# site starts rejecting requests again.
'''
DEFAULT_REQUEST_HEADERS = {
    'Proxy-Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
}
'''


1 0
原创粉丝点击