A Python Scrapy crawler: scraping news content from a website
[Full source code] https://github.com/beng0305/ThirtySixSpider
[Environment] Python 2.7, Scrapy 1.4, PhantomJS, pyodbc, SQL Server 2008
The storage layer is pyodbc + SQL Server; getting pyodbc's encoding issues sorted out took quite a while.
PhantomJS is used to fetch the JS-rendered content. It is decidedly slow, but on Windows it was the only workable choice (a minimal middleware sketch follows this introduction).
The approaches discussed online are all over the place. I tried scrapy-splash, which is said to be fast, but Splash runs in a Docker container, and installing Docker on Windows produced one problem after another, so I eventually gave up on it.
So the crawler is on the slow side: an initial crawl of roughly 300 articles takes a bit over 20 minutes, but it is reasonably stable.
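The PhantomJS wiring itself is not reproduced in the source below, so here is a minimal downloader-middleware sketch of how JS rendering can be slotted into Scrapy. It assumes the Selenium bindings drive a PhantomJS binary on the PATH; the class name, and the idea of going through Selenium at all, are my assumptions rather than the repo's actual middlewares.py.

# Sketch only: NOT the repo's middlewares.py. Assumes selenium + a PhantomJS binary on PATH.
# -*- coding: utf-8 -*-
from selenium import webdriver
from scrapy.http import HtmlResponse


class PhantomJSMiddleware(object):
    """Render each request with PhantomJS so JS-generated article markup is visible to the spider."""

    def __init__(self):
        # Hypothetical setup; the real project may configure executable path and arguments differently.
        self.driver = webdriver.PhantomJS()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        body = self.driver.page_source.encode('utf-8')
        # Returning a response here makes Scrapy skip its normal download for this request.
        return HtmlResponse(url=request.url, body=body, encoding='utf-8', request=request)

To take effect, a middleware like this would be registered under DOWNLOADER_MIDDLEWARES in settings.py; the exact module path depends on the project layout.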
[Source code]
Main spider class: ThirtySixSpider.py
# -*- coding: utf-8 -*-
# Author: BinBin
# Email: 289594665@qq.com
# Time : 2017/07/27
import sys

import scrapy
from scrapy import Request

from ..items import ArticleItem

reload(sys)
sys.setdefaultencoding("utf-8")


class ThirtySixSpider(scrapy.Spider):
    name = "ThirtySix"
    allowed_domains = ["36kr.com"]
    start_urls = ['http://36kr.com']

    def parse(self, response):
        print "url:" + response.url

        # 1. Absolute article links such as http://36kr.com/p/5055572.html
        newsFullLinks = response.xpath('//a[re:test(@href, ".+(/p/\d+\.html)$")]/@href').extract()
        for link in newsFullLinks:
            yield Request(link, callback=self.parse_item)

        # 2. Relative article links such as /p/5084179.html
        newsIncompleteLinks = response.xpath('//a[re:test(@href, "^(/p/\d+\.html)$")]/@href').extract()
        for link in newsIncompleteLinks:
            yield Request(response.urljoin(link), callback=self.parse_item)

        # 3. Relative listing links such as /tags/..., /user/... and /topics/...
        otherIncompleteLinks = response.xpath('//a[re:test(@href, "(^/tags/|^/user/|^/topics/).*")]/@href').extract()
        for link in otherIncompleteLinks:
            yield Request(response.urljoin(link), callback=self.parse_next)

        # 4. Absolute listing links such as http://36kr.com/tags/..., /user/... and /topics/...
        otherFullLinks = response.xpath('//a[re:test(@href, "(^.+/tags/|^.+/user/|^.+/topics/).*")]/@href').extract()
        for link in otherFullLinks:
            yield Request(link, callback=self.parse_next)

    # Crawl a listing page (tags/user/topics); the link extraction is the same as in parse()
    def parse_next(self, response):
        for request in self.parse(response):
            yield request

    # Parse a single news page into an ArticleItem
    def parse_item(self, response):
        print "parse_item url:" + response.url
        item = ArticleItem()

        article_titles = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/h1/text()').extract()
        if len(article_titles) > 0:
            item["article_title"] = article_titles[0]

        article_authors = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/div[contains(@class, "author-panel")]/div[contains(@class, "author")]/a/span/text()').extract()
        if len(article_authors) > 0:
            item["article_author"] = article_authors[0]

        article_summarys = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/section[@class="summary"]/text()').extract()
        if len(article_summarys) > 0:
            item["article_summary"] = article_summarys[0]

        article_icons = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/section[@class="headimg"]/img/@src').extract()
        if len(article_icons) > 0:
            item["article_icon"] = article_icons[0]

        article_contents = response.xpath('//div[re:test(@id, "J_post_wrapper_.*")]/div[1]/div[1]/div[2]/section').extract()
        if len(article_contents) > 0:
            item["article_content"] = article_contents[0]

        item["article_url"] = response.url
        if item.get("article_title") is not None:
            yield item
Database wrapper: DBHelper.py
# -*- coding: utf-8 -*-
# Author: BinBin
# Email: 289594665@qq.com
# Time : 2017/07/27
import pyodbc


class DBHelper(object):
    def __init__(self, serverIp, port, dbName, uid, pwd):
        conn_info = 'DRIVER={SQL Server};DATABASE=%s;SERVER=%s,%s;UID=%s;PWD=%s' % (dbName, serverIp, port, uid, pwd)
        self.connection = pyodbc.connect(conn_info, unicode_results=True)
        self.cursor = self.connection.cursor()

    def __del__(self):
        self.destroy()

    def destroy(self):
        if self.cursor:
            self.cursor.close()
            self.cursor = None
        if self.connection:
            self.connection.close()
            self.connection = None

    # Fetch all rows of a query
    def queryAll(self, qryStr):
        self.cursor.execute(qryStr)
        return self.cursor.fetchall()

    # Fetch at most maxCount rows of a query
    def querySome(self, qryStr, maxCount):
        self.cursor.execute(qryStr)
        return self.cursor.fetchmany(maxCount)

    # Fetch one page of a query result
    def queryPage(self, qryStr, skipCnt, pageSize):
        self.cursor.execute(qryStr)
        self.cursor.skip(skipCnt)
        return self.cursor.fetchmany(pageSize)

    # Return the single value of a counting query
    def count(self, sql):
        self.cursor.execute(sql)
        return self.cursor.fetchone()[0]

    # Execute an INSERT/UPDATE/DELETE statement and return the number of affected rows
    def execute(self, sql):
        count = self.cursor.execute(sql).rowcount
        self.connection.commit()
        return count
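A quick usage sketch of DBHelper against the T_Article table used later in the pipeline; the server address and credentials here are placeholders, not the real values.

# Usage sketch for DBHelper; connection values and column choice are placeholders/assumptions.
helper = DBHelper('127.0.0.1', '1433', 'TestForBinBin', 'sa', '******')
total = helper.count('select count(*) from T_Article')
print 'articles stored:', total
rows = helper.querySome('select article_title, article_url from T_Article', 10)
for row in rows:
    # pyodbc rows expose columns as attributes
    print row.article_title, row.article_url
helper.destroy()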
News item class: items.py
# -*- coding: utf-8 -*-
# Author: BinBin
# Email: 289594665@qq.com
# Time : 2017/07/27
import scrapy
from scrapy import Field


class ArticleItem(scrapy.Item):
    article_title = Field()
    article_author = Field()
    article_src = Field()
    article_url = Field()
    article_type = Field()
    article_content = Field()
    article_summary = Field()
    article_icon = Field()
    article_time = Field()
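Before the database pipeline is wired up, the extracted items can be checked quickly with Scrapy's built-in feed export, e.g. scrapy crawl ThirtySix -o articles.json, which dumps every yielded ArticleItem to a JSON file.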
Item pipeline: pipelines.py
# -*- coding: utf-8 -*-
# Author: BinBin
# Email: 289594665@qq.com
# Time : 2017/07/27
from DBHelper import DBHelper


class ThirtySixPipeline(object):
    def __init__(self):
        self.helper = DBHelper('120.*.215.*', '1433', 'TestForBinBin', 'sa', '******')

    def process_item(self, item, spider):
        print "process_item title:" + item["article_title"]

        # Build the INSERT statement by string formatting
        sql = u'insert into T_Article(article_title, article_author, article_url, article_content, article_summary, article_icon) ' \
              u'values (\'{t}\',\'{a}\',\'{u}\',\'{c}\',\'{s}\',\'{i}\')'.format(
                  t=item["article_title"],
                  a=item["article_author"],
                  u=item["article_url"],
                  c=item["article_content"],
                  s=item["article_summary"],
                  i=item["article_icon"])

        # \xa0 (non-breaking space) cannot be encoded to GBK, so replace it with a plain space first.
        # Note that str.replace returns a new string, so the result must be reassigned.
        sql = sql.replace(u'\xa0', u' ')
        self.helper.execute(sql.encode('GBK', 'ignore'))
        return item
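The string-built INSERT above is what forces the GBK re-encode and the \xa0 replacement, and it will still break on titles containing a single quote. A hedged alternative (my suggestion, not what the repo does) is to let pyodbc bind the values as parameters; ParamDBHelper, ThirtySixParamPipeline and execute_params below are hypothetical names.

# -*- coding: utf-8 -*-
# Sketch only: parameterized inserts are a suggested alternative, not the repo's approach.
from DBHelper import DBHelper


class ParamDBHelper(DBHelper):
    # Execute a statement with pyodbc "?" parameter markers; values are passed separately,
    # so quotes and non-GBK characters in the article text need no special handling.
    def execute_params(self, sql, params):
        count = self.cursor.execute(sql, params).rowcount
        self.connection.commit()
        return count


class ThirtySixParamPipeline(object):
    def __init__(self):
        # Placeholder connection values
        self.helper = ParamDBHelper('127.0.0.1', '1433', 'TestForBinBin', 'sa', '******')

    def process_item(self, item, spider):
        sql = ('insert into T_Article(article_title, article_author, article_url, '
               'article_content, article_summary, article_icon) values (?, ?, ?, ?, ?, ?)')
        self.helper.execute_params(sql, (
            item.get("article_title"), item.get("article_author"), item.get("article_url"),
            item.get("article_content"), item.get("article_summary"), item.get("article_icon")))
        return item

Because the values never pass through the SQL text, the unicode strings reach SQL Server unchanged and quoting is handled by the driver.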