A Python Scrapy crawler: scraping news content from a website


【Full source】https://github.com/beng0305/ThirtySixSpider
【Environment】Python 2.7, Scrapy 1.4, PhantomJS, pyodbc, SQL Server 2008

The database layer is pyodbc + SQL Server; pyodbc's encoding problems took a long time to sort out.
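The core of that problem, as a minimal illustration (the actual handling lives in the pipeline code below): the scraped text contains characters such as u'\xa0' that GBK cannot represent, so they have to be replaced or dropped before the statement reaches the driver.

# Minimal illustration of the GBK issue; the string value is made up.
title = u'36\xa0Kr news'                   # scraped text with a non-breaking space
title = title.replace(u'\xa0', u' ')       # \xa0 has no GBK mapping; swap for a space
sql_bytes = title.encode('GBK', 'ignore')  # drop anything else GBK cannot encode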

PhantomJS is used to render JavaScript-generated content. It is quite slow, but on Windows it was the only workable choice.
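In Scrapy, PhantomJS is typically wired in through a downloader middleware. The repository's middleware is not reproduced in this post, but a minimal sketch of the pattern (assuming Selenium's PhantomJS driver is installed and phantomjs is on PATH) looks like this:

# Sketch of a PhantomJS downloader middleware; class and wiring are illustrative.
from selenium import webdriver
from scrapy.http import HtmlResponse

class PhantomJSMiddleware(object):
    def __init__(self):
        self.driver = webdriver.PhantomJS()  # slow, but runs fine on Windows

    def process_request(self, request, spider):
        # Render the page in PhantomJS, then hand Scrapy the final HTML
        self.driver.get(request.url)
        body = self.driver.page_source.encode('utf-8')
        # Returning a response here short-circuits Scrapy's own download
        return HtmlResponse(request.url, body=body, encoding='utf-8',
                            request=request)

To take effect, a middleware like this must be registered under DOWNLOADER_MIDDLEWARES in settings.py.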

Approaches discussed online are all over the map. I tried scrapy-splash, which is reportedly fast, but Splash runs in a Docker container, and installing Docker on Windows raised problem after problem, so I eventually gave up on it.

The crawler is on the slow side: an initial crawl of about 300 news articles takes a bit over 20 minutes, but it runs stably.
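Throughput is bounded by PhantomJS rendering one page at a time, so the knobs that trade speed for stability live in settings.py. Illustrative values (not necessarily the repository's; the pipeline path is an assumption based on the project name):

# Illustrative Scrapy settings; tune to taste.
CONCURRENT_REQUESTS = 4    # PhantomJS rendering is the bottleneck, keep this low
DOWNLOAD_DELAY = 0.5       # be polite to the target site
RETRY_TIMES = 2            # retry flaky renders instead of dropping the page
ITEM_PIPELINES = {
    'ThirtySixSpider.pipelines.ThirtySixPipeline': 300,  # enable the DB pipeline
}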

【Result】(screenshot of the crawl results omitted)

【Source】

Main spider class: ThirtySixSpider.py

# -*- coding: utf-8 -*-
# Author:   BinBin
# Email:    289594665@qq.com
# Time :    2017/07/27
import sys

import scrapy
from scrapy import Request

from ..items import ArticleItem

reload(sys)
sys.setdefaultencoding("utf-8")


class ThirtySixSpider(scrapy.Spider):
    name = "ThirtySix"
    allowed_domains = ["36kr.com"]
    start_urls = ['http://36kr.com']

    def parse(self, response):
        print "url:" + response.url

        # 1. Absolute article links such as http://36kr.com/p/5055572.html
        newsFullLinks = response.xpath(
            '//a[re:test(@href, ".+(/p/\d+\.html)$")]/@href').extract()
        for link in newsFullLinks:
            yield Request(link, callback=self.parse_item)

        # 2. Relative article links such as /p/5084179.html
        newsIncompleteLinks = response.xpath(
            '//a[re:test(@href, "^(/p/\d+\.html)$")]/@href').extract()
        for link in newsIncompleteLinks:
            yield Request(response.urljoin(link), callback=self.parse_item)

        # 3. Relative listing links such as /tags/..., /user/..., /topics/...
        otherIncompleteLinks = response.xpath(
            '//a[re:test(@href, "(^/tags/|^/user/|^/topics/).*")]/@href').extract()
        for link in otherIncompleteLinks:
            yield Request(response.urljoin(link), callback=self.parse_next)

        # 4. Absolute listing links such as http://36kr.com/tags/...
        otherFullLinks = response.xpath(
            '//a[re:test(@href, "(^.+/tags/|^.+/user/|^.+/topics/).*")]/@href').extract()
        for link in otherFullLinks:
            yield Request(link, callback=self.parse_next)

    # Listing pages carry the same mix of links, so delegate back to parse()
    def parse_next(self, response):
        for request in self.parse(response):
            yield request

    # Extract the article fields into an ArticleItem
    def parse_item(self, response):
        print "parse_item url:" + response.url
        item = ArticleItem()
        wrapper = '//div[re:test(@id, "J_post_wrapper_.*")]'

        article_titles = response.xpath(wrapper + '/div[1]/h1/text()').extract()
        if article_titles:
            item["article_title"] = article_titles[0]

        article_authors = response.xpath(
            wrapper + '/div[1]/div[1]/div[contains(@class, "author-panel")]'
                      '/div[contains(@class, "author")]/a/span/text()').extract()
        if article_authors:
            item["article_author"] = article_authors[0]

        article_summarys = response.xpath(
            wrapper + '/div[1]/div[1]/section[@class="summary"]/text()').extract()
        if article_summarys:
            item["article_summary"] = article_summarys[0]

        article_icons = response.xpath(
            wrapper + '/div[1]/div[1]/section[@class="headimg"]/img/@src').extract()
        if article_icons:
            item["article_icon"] = article_icons[0]

        article_contents = response.xpath(
            wrapper + '/div[1]/div[1]/div[2]/section').extract()
        if article_contents:
            item["article_content"] = article_contents[0]

        item["article_url"] = response.url
        # Pages without a title are not articles; skip them
        if "article_title" in item:
            yield item

Database wrapper: DBHelper.py

# -*- coding: utf-8 -*-
# Author:   BinBin
# Email:    289594665@qq.com
# Time :    2017/07/27
import pyodbc


class DBHelper(object):
    def __init__(self, serverIp, port, dbName, uid, pwd):
        conn_info = 'DRIVER={SQL Server};DATABASE=%s;SERVER=%s,%s;UID=%s;PWD=%s' \
            % (dbName, serverIp, port, uid, pwd)
        self.connection = pyodbc.connect(conn_info, unicode_results=True)
        self.cursor = self.connection.cursor()

    def __del__(self):
        self.destroy()

    # Release the cursor and connection explicitly
    def destroy(self):
        if self.cursor:
            self.cursor.close()
            self.cursor = None
        if self.connection:
            self.connection.close()
            self.connection = None

    # Fetch every row of a query
    def queryAll(self, qryStr):
        self.cursor.execute(qryStr)
        return self.cursor.fetchall()

    # Fetch at most maxCount rows of a query
    def querySome(self, qryStr, maxCount):
        self.cursor.execute(qryStr)
        return self.cursor.fetchmany(maxCount)

    # Fetch one page of results: skip skipCnt rows, then return pageSize rows
    def queryPage(self, qryStr, skipCnt, pageSize):
        self.cursor.execute(qryStr)
        self.cursor.skip(skipCnt)
        return self.cursor.fetchmany(pageSize)

    # Return the scalar result of a counting query
    def count(self, sql):
        self.cursor.execute(sql)
        return self.cursor.fetchone()[0]

    # Execute an insert/update/delete and return the number of affected rows
    def execute(self, sql):
        count = self.cursor.execute(sql).rowcount
        self.connection.commit()
        return count
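For reference, a hypothetical usage of the wrapper (server address, credentials, and the ordering column are placeholders):

# Hypothetical usage of DBHelper; connection details are placeholders.
helper = DBHelper('127.0.0.1', '1433', 'TestForBinBin', 'sa', 'secret')
total = helper.count('select count(*) from T_Article')
page = helper.queryPage('select article_title from T_Article', 0, 10)
for row in page:
    print row.article_title  # pyodbc rows allow access by column name
helper.destroy()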

News item definition: items.py

# -*- coding: utf-8 -*-
# Author:   BinBin
# Email:    289594665@qq.com
# Time :    2017/07/27
import scrapy
from scrapy import Field


class ArticleItem(scrapy.Item):
    article_title = Field()
    article_author = Field()
    article_src = Field()
    article_url = Field()
    article_type = Field()
    article_content = Field()
    article_summary = Field()
    article_icon = Field()
    article_time = Field()
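ArticleItem behaves like a dict with a fixed key set, which is why the spider guards with `"article_title" in item` before yielding. A quick illustration (values are made up):

# Quick illustration of scrapy.Item semantics; values are illustrative.
item = ArticleItem()
item['article_title'] = u'Example title'
print 'article_title' in item   # True: the field has been set
print 'article_author' in item  # False: declared Fields start unset
try:
    item['undeclared'] = 1      # only declared Fields may be assigned
except KeyError:
    print 'undeclared field rejected'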

Data pipeline: pipelines.py

# -*- coding: utf-8 -*-
# Author:   BinBin
# Email:    289594665@qq.com
# Time :    2017/07/27
from DBHelper import DBHelper


class ThirtySixPipeline(object):
    def __init__(self):
        self.helper = DBHelper('120.*.215.*', '1433', 'TestForBinBin', 'sa', '******')

    def process_item(self, item, spider):
        print "process_item title:" + item.get("article_title", u'')
        # Build the INSERT statement for this article
        sql = (u'insert into T_Article(article_title, article_author, article_url, '
               u'article_content, article_summary, article_icon) '
               u"values ('{t}','{a}','{u}','{c}','{s}','{i}')").format(
            t=item.get("article_title", u''),
            a=item.get("article_author", u''),
            u=item.get("article_url", u''),
            c=item.get("article_content", u''),
            s=item.get("article_summary", u''),
            i=item.get("article_icon", u''))
        # \xa0 (non-breaking space) has no GBK mapping, so map it to a plain space
        sql = sql.replace(u'\xa0', u' ')
        self.helper.execute(sql.encode('GBK', 'ignore'))
        return item
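String-formatting values into the SQL is fragile: a single quote in an article body breaks the statement, and everything must survive the GBK round-trip. A sketch of a parameterized alternative for the body of process_item (not the repository's code) pushes encoding down to the driver:

# Sketch: parameter binding via pyodbc placeholders, inside process_item.
# Not the repository's code; it reuses the cursor/connection on self.helper.
sql = (u'insert into T_Article(article_title, article_author, article_url, '
       u'article_content, article_summary, article_icon) values (?,?,?,?,?,?)')
self.helper.cursor.execute(
    sql,
    item.get('article_title', u''),
    item.get('article_author', u''),
    item.get('article_url', u''),
    item.get('article_content', u''),
    item.get('article_summary', u''),
    item.get('article_icon', u''))
self.helper.connection.commit()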


