Python 抓取【参考消息网站】的新闻

来源:互联网 发布:看电视的软件大全 编辑:程序博客网 时间:2024/06/05 02:55

这是我在学习 Python 时写的一个简单示例,用于爬取参考消息网站的新闻。
根据参考消息网站的js可以爬取下一页。

# -*- coding:utf-8 -*-
'''
Created on 2015-12-8

@author: AndyCoder

Small crawler example: fetch a listing endpoint, pull the article URLs out of
it with a regular expression, then fetch each article page and extract its
title, time and content with further regular expressions.
'''
import contextlib
import json
import re

try:
    import urllib2
except ImportError:
    # Python 3 moved urllib2's Request/urlopen into urllib.request.
    import urllib.request as urllib2


class spider(object):
    '''
    Regex-based scraper for a listing endpoint and the pages it links to.
    '''

    def __init__(self, url="", header=""):
        '''
        Constructor

        :param url: address of the listing endpoint fetched by parseUrl().
        :param header: value sent as the User-Agent header by parseUrl().
        '''
        self.url = url
        self.header = header

    def parseUrl(self, urlPatter='"url":"(.*?)",'):
        '''
        Fetch ``self.url`` and collect every URL matched by ``urlPatter``.

        :param urlPatter: regular expression with one capture group that
            yields a (possibly backslash-escaped) URL.
        :return: ``(urlList, contentHtml)`` - the extracted URLs with all
            backslashes removed, and the response body decoded with the
            ``raw_unicode_escape`` codec.
        '''
        pattern = re.compile(urlPatter, re.DOTALL)
        request = urllib2.Request(self.url)
        request.add_header('User-Agent', self.header)
        # closing() releases the connection even if read() raises; the
        # Python 2 response object is not a context manager on its own.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            html = response.read()
        contentHtml = html.decode('raw_unicode_escape')
        # Match on the decoded text so the pattern (a text string) and the
        # subject have the same type on both Python 2 and Python 3.
        # Slashes arrive escaped as "\/", so strip every backslash.
        urlList = [item.replace('\\', '')
                   for item in pattern.findall(contentHtml)]
        return urlList, contentHtml

    def parseContent(self, url, contentPattern='<div class="content">(.*?)</a>(.*?)</strong>(.*?)</div>'):
        '''
        Fetch one article page and extract its news fragments.

        :param url: address of the article page.
        :param contentPattern: regular expression with three capture groups;
            group 2 is stored as ``time`` and group 3 as ``content``.
        :return: ``(newsList, content)`` - a list with one dict per match
            (keys ``title``, ``url``, ``time``, ``content``) and the raw,
            undecoded response body.
        :raises UnicodeDecodeError: if the page is not valid UTF-8.
        '''
        contentPattern = re.compile(contentPattern, re.DOTALL)
        titlePattern = re.compile('<title>(.*?)-(.*?)</title>', re.DOTALL)
        req = urllib2.Request(url)
        with contextlib.closing(urllib2.urlopen(req)) as resp:
            content = resp.read()
        utfContent = content.decode('utf8')

        # The part of <title> before the first "-" is the headline; when
        # several <title> elements match, the last one wins.
        titles = titlePattern.findall(utfContent)
        title = titles[-1][0] if titles else ''

        # Build a fresh dict per match (a single shared dict would make every
        # list entry alias the last match) and match against the decoded
        # text so title, time and content are all the same string type.
        newsList = [
            {'title': title, 'url': url, 'time': item[1], 'content': item[2]}
            for item in contentPattern.findall(utfContent)
        ]
        return newsList, content


def main():
    '''
    Crawl the listing endpoint and print the first news item of each article.
    '''
    # NOTE(review): "num=2weight=60" looks like it is missing an "&" between
    # the two parameters; kept exactly as published - confirm against the site.
    listUrl = ('http://app.cankaoxiaoxi.com/?app=system&controller=channel'
               '&action=wap_index&catid=1&order=publish&num=2weight=60&jsoncallback=?')
    userAgent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                 '(KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36')
    s = spider(listUrl, userAgent)
    urls, _ = s.parseUrl()
    for url in urls:
        newsList, _ = s.parseContent(url)
        # Serialise and parse again, as the original example did, to show the
        # result survives a JSON round trip.
        news_string = json.dumps(newsList)
        decoded = json.loads(news_string)
        if decoded:
            print(decoded[0])


if __name__ == '__main__':
    main()
0 0