[Downloading CSDN Blogs with Python] 2. Analyzing HTML with BeautifulSoup (Part 1)

BeautifulSoup is somewhat simpler to work with than HTMLParser.

(HTMLParser fires callbacks while it parses; BeautifulSoup parses the whole document first and then lets you query the resulting tree.)
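A minimal sketch to make the difference concrete: the same href values pulled out of a small HTML snippet both ways, once with an HTMLParser subclass that collects data inside callbacks, and once with BeautifulSoup queried after the whole snippet has been parsed. The snippet and the LinkCollector class here are illustrative only, not part of the downloader below.

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
from HTMLParser import HTMLParser
from bs4 import BeautifulSoup

html = '<ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>'

# HTMLParser style: react to events as the parser encounters them.
class LinkCollector(HTMLParser):        # illustrative class, not used elsewhere
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.links.append(dict(attrs).get("href"))

collector = LinkCollector()
collector.feed(html)
print(collector.links)                                  # ['/a', '/b']

# BeautifulSoup style: parse everything first, then query the tree.
soup = BeautifulSoup(html)
print([a["href"] for a in soup.find_all("a")])          # ['/a', '/b']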


#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# Fetch the blog's category list and monthly archive list
# File: GetCategoryAndMonth.py

import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup

class CHYGetCategoryAndMonth:
    def Parser(self, htmlStr, type, list):
        soup2 = BeautifulSoup(htmlStr)
        if 0 == type:
            # Category list
            listDiv = soup2.find_all("div", id = "panel_Category")
            for divItem in listDiv:
                ul = divItem.find("ul", class_ = "panel_head")
                if u"文章分类" != ul.span.text:
                    continue
                ul = divItem.find("ul", class_ = "panel_body")
                listLi = ul.find_all("li")
                listItem = ["", ""]
                for li in listLi:
                    listItem[0] = li.a["href"]
                    listItem[1] = li.a.text
                    item = listItem[:]
                    list.append(item)
                break
        elif 1 == type:
            # Monthly archive list
            div = soup2.find("div", id = "panel_Archive")
            # Take the first div that follows this one
            listDiv = div.find_next("div")
            listLi = listDiv.find_all("li")
            listItem = ["", ""]
            for li in listLi:
                listItem[0] = li.a["href"]
                listItem[1] = li.a.text
                item = listItem[:]
                list.append(item)

'''
# Test code
if __name__ == '__main__':
    conn = httplib.HTTPConnection("blog.csdn.net")
    # Pretend to be IE; otherwise CSDN rejects requests coming from Python
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headersP = { 'User-Agent' : user_agent }
    conn.request(method = "GET", url = "/bagboy_taobao_com", headers = headersP)
    r1 = conn.getresponse()              # response
    htmlByte = r1.read()                 # raw HTML
    htmlStr = htmlByte.decode("utf8")    # decode to utf8, otherwise parsing fails
    my = CHYGetCategoryAndMonth()
    list1 = []
    my.Parser(htmlStr, 1, list1)
    list0 = []
    my.Parser(htmlStr, 0, list0)
    print(list1)
    print(list0)
'''

#!/usr/bin/env python
# Python 2.7.3
# Fetch the list of article links on an archive page
# File: GetArticleList.py

import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup

class CHYGetArticleList:
    def Parser(self, htmlStr, list):
        soup2 = BeautifulSoup(htmlStr)
        div = soup2.find("div", id = "article_list")
        listDiv = div.find_all("div", class_ = "list_item article_item")
        for divItem in listDiv:
            divA = divItem.find("div", class_ = "article_title")
            a = divA.h3.span.a
            list.append(a["href"])

'''
# http://blog.csdn.net/bagboy_taobao_com/article/month/2013/10
# Test code
if __name__ == '__main__':
    conn = httplib.HTTPConnection("blog.csdn.net")
    # Pretend to be IE; otherwise CSDN rejects requests coming from Python
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headersP = { 'User-Agent' : user_agent }
    conn.request(method = "GET", url = "/bagboy_taobao_com/article/month/2013/10", headers = headersP)
    r1 = conn.getresponse()              # response
    htmlByte = r1.read()                 # raw HTML
    htmlStr = htmlByte.decode("utf8")    # decode to utf8, otherwise parsing fails
    list = []
    my = CHYGetArticleList()
    my.Parser(htmlStr, list)
    print(list)
'''

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# Fetch a single blog article
# File: GetArticle.py

import urllib2
import HTMLParser
import httplib
from bs4 import BeautifulSoup

class CHYGetArticle:
    def Parser(self, htmlStr, article):
        soup2 = BeautifulSoup(htmlStr)
        divTitle = soup2.find("div", class_ = "article_title")
        article[0] = divTitle.h3.span.text
        article[0] = article[0].replace("\n\r", "")   # strings are immutable, so the result must be reassigned
        article[0] = article[0].strip()               # same here
        divComment = soup2.find("div", class_ = "article_content")
        article[1] = divComment.text

'''
# http://blog.csdn.net/bagboy_taobao_com/article/details/5582868
# Test code
if __name__ == '__main__':
    conn = httplib.HTTPConnection("blog.csdn.net")
    # Pretend to be IE; otherwise CSDN rejects requests coming from Python
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headersP = { 'User-Agent' : user_agent }
    conn.request(method = "GET", url = "/bagboy_taobao_com/article/details/5582868", headers = headersP)
    r1 = conn.getresponse()              # response
    htmlByte = r1.read()                 # raw HTML
    htmlStr = htmlByte.decode("utf8")    # decode to utf8, otherwise parsing fails
    my = CHYGetArticle()
    article = ["", ""]
    my.Parser(htmlStr, article)
    f = open("data.txt", "w")
    print >> f, article[0].encode("utf8"),   # trailing comma keeps print from adding a newline
    print >> f, article[1].encode("utf8")
'''
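Putting the three classes together gives a simple download pipeline: parse the blog home page for the monthly archive list, parse each archive page for article links, then fetch and parse every article. What follows is only a minimal sketch of that glue code; it assumes the blog.csdn.net URL layout shown in the commented-out test code (site-relative hrefs), and the fetch() helper and the output file naming are illustrative, not part of the downloader itself.

#!/usr/bin/env python
# coding=utf-8
# Python 2.7.3
# Illustrative glue code: chain the three parsers together.
import httplib

from GetCategoryAndMonth import CHYGetCategoryAndMonth
from GetArticleList import CHYGetArticleList
from GetArticle import CHYGetArticle

def fetch(path):
    # Illustrative helper: GET a site-relative path from blog.csdn.net, pretending to be IE.
    conn = httplib.HTTPConnection("blog.csdn.net")
    headersP = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
    conn.request(method = "GET", url = path, headers = headersP)
    html = conn.getresponse().read().decode("utf8")
    conn.close()
    return html

if __name__ == '__main__':
    # 1. Monthly archive links from the blog home page.
    months = []
    CHYGetCategoryAndMonth().Parser(fetch("/bagboy_taobao_com"), 1, months)

    # 2. Article links from every archive page (assumes hrefs are site-relative, as in the test code).
    articleLinks = []
    for href, text in months:
        CHYGetArticleList().Parser(fetch(href), articleLinks)

    # 3. Title and body of every article, saved to numbered text files (naming is arbitrary).
    for i, href in enumerate(articleLinks):
        article = ["", ""]
        CHYGetArticle().Parser(fetch(href), article)
        f = open("article_%d.txt" % i, "w")
        f.write(article[0].encode("utf8") + "\n")
        f.write(article[1].encode("utf8"))
        f.close()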