Python实现抓取CSDN热门文章列表

来源:互联网 发布:怎样在淘宝上卖东西 编辑:程序博客网 时间:2024/06/16 21:41

1、使用工具:
Python3.5
BeautifulSoup
2、抓取网站:
CSDN热门文章列表 http://blog.csdn.net/hot.html
3、分析网站代码:
这里写图片描述
4、实现代码:

__author__ = 'Administrator'

import urllib.request
import re
from bs4 import BeautifulSoup


#########################################################
# Scrape the CSDN article list, e.g. http://blog.csdn.net/?&page=1
#########################################################
class CsdnUtils(object):
    """Fetch a CSDN article-list page and print one line per listed article."""

    def __init__(self):
        """Prepare the browser-like HTTP headers sent with every request."""
        user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
        self.headers = {
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': user_agent,
        }

    def getPage(self, url=None):
        """Download ``url`` and return it parsed as a ``BeautifulSoup`` tree.

        The request carries ``self.headers``. Errors raised by
        ``urllib.request.urlopen`` are not caught here and propagate to
        the caller.
        """
        request = urllib.request.Request(url, headers=self.headers)
        # Close the HTTP response deterministically once the body is read.
        with urllib.request.urlopen(request) as response:
            html = response.read()
        return BeautifulSoup(html, "html.parser")

    @staticmethod
    def _text(tag):
        """Return ``tag.string``, or ``None`` when the tag was not found."""
        return tag.string if tag is not None else None

    def _parseItem(self, item):
        """Extract the printed fields from one ``div.blog_list`` element.

        Returns a tuple ``(author, postTime, articleView, category, title,
        url)``. A field whose element is missing from the markup is
        ``None`` instead of raising ``AttributeError``.
        """
        author = self._text(item.find('a', 'user_name'))
        post_time = self._text(item.find('span', 'time'))
        view_count = self._text(item.find('a', 'view'))

        heading = item.find('h1')
        first_link = heading.find('a') if heading is not None else None
        # The category link is only looked up when the first anchor in the
        # heading carries a class attribute; otherwise the literal "None"
        # is reported.
        if first_link is not None and first_link.has_attr('class'):
            category = self._text(heading.find('a', 'category'))
        else:
            category = "None"

        # The title anchor is the one that has a ``name`` attribute.
        title_link = None
        if heading is not None:
            title_link = heading.find('a', attrs={'name': True})
        title = self._text(title_link)
        link = title_link.get("href") if title_link is not None else None

        return author, post_time, view_count, category, title, link

    def parsePage(self, url=None, page=None):
        """Fetch ``url`` and print a tab-separated line for each article.

        ``page`` is only used as a label in the printed banner; it does
        not influence which URL is requested.
        """
        soup = self.getPage(url)
        print("========================第", page, "页======================================")
        # Per-article values are kept in locals rather than stored on the
        # class, so nothing leaks between articles or instances.
        for index, item in enumerate(soup.find_all('div', 'blog_list')):
            author, post_time, view_count, category, title, link = self._parseItem(item)
            print("数据:", index + 1, '\t', author, '\t', post_time, '\t',
                  view_count, '\t', category, '\t', title, '\t', link)


#######     Entry point    ########
if __name__ == "__main__":
    # URL of the page to scrape. A paged alternative would be
    # 'http://blog.csdn.net/?&page={}'.format(i + 1).
    # NOTE(review): the same URL is requested for every page number below,
    # so each iteration prints the same listing - confirm whether paging
    # was intended.
    url = "http://blog.csdn.net/hot.html"
    cnblog = CsdnUtils()
    for i in range(0, 5):
        cnblog.parsePage(url, i + 1)

5、执行结果:
这里写图片描述

0 0
原创粉丝点击