Scraper——BeautifulSoup and LXML

来源:互联网 发布:python 字节流 编辑:程序博客网 时间:2024/06/10 06:49

# Data scraping with the BeautifulSoup package
"""Scrape a country's area from a web page with BeautifulSoup.

Official documentation:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
from contextlib import closing

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 splits urllib2 into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

# BeautifulSoup copes with malformed HTML: the <li> tags below are never
# closed and the class attribute value is not quoted.
broken_html = '<ul class=country><li>Area<li>Population</ul>'
soup = BeautifulSoup(broken_html, "html.parser")
fixed_html = soup.prettify()  # normalised, consistently indented markup
# print(fixed_html)

# Look elements up in the parsed tree.
ul = soup.find('ul', attrs={'class': 'country'})
# print(ul.find('li'))      # first match only
# print(ul.find_all('li'))  # every match


# Now use the same approach to extract a country's area.
def download(url, user_agent="wswp", num_retries=2):
    """Download a page and return its raw content.

    Args:
        url: Address of the page to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: Number of further attempts allowed after an HTTP 5xx
            response.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        download fails.
    """
    print("Download : %s" % url)
    # The header name needs a hyphen. With "User_agent" urllib still adds its
    # own default "User-agent" header, so the custom agent was never applied.
    headers = {"User-agent": user_agent}
    request = Request(url, headers=headers)
    try:
        # closing() releases the connection on both Python 2 and Python 3.
        with closing(urlopen(request)) as response:
            html = response.read()
    except URLError as e:
        print("Download Error : %s" % (e.reason,))
        html = None
        # Only server-side (5xx) errors are retried; plain URLError instances
        # carry no ``code`` attribute and fall through to ``return None``.
        if num_retries > 0 and 500 <= getattr(e, "code", 0) < 600:
            return download(url, user_agent, num_retries - 1)
    return html


if __name__ == "__main__":
    # NOTE(review): ``url`` is empty in this copy of the script; set it to the
    # country page to scrape before running.
    url = ""
    if not url:
        raise SystemExit("No URL configured: set `url` to the page to scrape.")
    html = download(url)
    if html is None:
        raise SystemExit("Download failed: %s" % url)
    soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
    # First locate the parent row ...
    tr = soup.find(attrs={'id': 'places_area__row'})
    # ... then the child cell that holds the area ...
    td = tr.find(attrs={'class': 'w2p_fw'}) if tr is not None else None
    if td is None:
        raise SystemExit("Area cell not found in the downloaded page.")
    # ... and finally print the text of that cell.
    area = td.text
    print(area)

# Summary: BeautifulSoup is more verbose than regular expressions, but it is
# not hard to understand and is easier to construct and read. Minor layout
# changes such as extra whitespace or added tag attributes are also handled
# more conveniently with BeautifulSoup.



# Data scraping with the lxml module
"""Scrape a country's area from a web page with lxml.

lxml is a Python wrapper around the libxml2 XML parsing library. It parses
faster than BeautifulSoup because it is written in C.
"""
from contextlib import closing

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 splits urllib2 into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

import lxml.html

# First step: parse possibly malformed HTML into a consistent format.
# Example (kept as a comment, not executed):
#
#     broken_html = '<ul class=country><li>Area<li>Population</ul>'
#     tree = lxml.html.fromstring(broken_html)  # parse the HTML
#     fixed_html = lxml.html.tostring(tree, pretty_print=True)
#     print(fixed_html)


def download(url, user_agent="wswp", num_retries=2):
    """Download a page and return its raw content.

    Args:
        url: Address of the page to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: Number of further attempts allowed after an HTTP 5xx
            response.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        download fails.
    """
    print("Download : %s" % url)
    # The header name needs a hyphen. With "User_agent" urllib still adds its
    # own default "User-agent" header, so the custom agent was never applied.
    headers = {"User-agent": user_agent}
    request = Request(url, headers=headers)
    try:
        # closing() releases the connection on both Python 2 and Python 3.
        with closing(urlopen(request)) as response:
            html = response.read()
    except URLError as e:
        print("Download Error : %s" % (e.reason,))
        html = None
        # Only server-side (5xx) errors are retried; plain URLError instances
        # carry no ``code`` attribute and fall through to ``return None``.
        if num_retries > 0 and 500 <= getattr(e, "code", 0) < 600:
            return download(url, user_agent, num_retries - 1)
    return html


if __name__ == "__main__":
    # NOTE(review): ``url`` is empty in this copy of the script; set it to the
    # country page to scrape before running.
    url = ""
    if not url:
        raise SystemExit("No URL configured: set `url` to the page to scrape.")
    html = download(url)
    if html is None:
        raise SystemExit("Download failed: %s" % url)
    tree = lxml.html.fromstring(html)
    # NOTE: recent lxml releases no longer bundle cssselect; install it
    # separately with `pip install cssselect`.
    cells = tree.cssselect('tr#places_area__row > td.w2p_fw')
    if not cells:
        raise SystemExit("Area cell not found in the downloaded page.")
    td = cells[0]
    area = td.text_content()
    print(area)

