Three Web Scraping Methods

# -*- coding: utf-8 -*-

# 1. Regular expressions
import re
import urllib2

url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
html = urllib2.urlopen(url).read()
# grab the contents of every table cell of class w2p_fw
print re.findall('<td class="w2p_fw">(.*?)</td>', html)
# Regular expressions give us a quick way to grab the data, but the approach
# is fragile and tends to break whenever the page is updated.

# 2. BeautifulSoup
from bs4 import BeautifulSoup
import urllib2

url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')  # name the parser explicitly
print soup.prettify()
# navigate to the "area" row, then to its value cell
tr = soup.find(attrs={'id': 'places_area_row'})
td = tr.find(attrs={'class': 'w2p_fw'})
area = td.text
print area

# 3. lxml CSS selectors
import lxml.html
import urllib2

url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
html = urllib2.urlopen(url).read()
tree = lxml.html.fromstring(html)
td = tree.cssselect('tr#places_area_row > td.w2p_fw')[0]
area = td.text_content()
print area

# Scraping every field with each of the three methods
FIELDS = ('area', 'population', 'iso', 'country', 'capital',
          'continent', 'tld', 'currency_code', 'currency_name',
          'phone', 'postal_code_format', 'postal_code_regex', 'languages')

import re

def re_scraper(html):
    results = {}
    for field in FIELDS:
        # close the pattern with </td>, not </tr>, so the capture group
        # holds only the cell contents
        results[field] = re.search(
            '<tr id="places_%s_row">.*?<td class="w2p_fw">(.*?)</td>' % field,
            html).groups()[0]
    return results

from bs4 import BeautifulSoup

def bs_scraper(html):
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        results[field] = soup.find('table').find(
            'tr', id='places_%s_row' % field).find(
            'td', class_='w2p_fw').text
    return results

import lxml.html

def lxml_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect(
            'table > tr#places_%s_row > td.w2p_fw' % field)[0].text_content()
    return results
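Since all three scrapers fill the same FIELDS dictionary, they can be compared on a single download of the page. The benchmark below is a minimal sketch, not part of the original post: it assumes re_scraper, bs_scraper, and lxml_scraper from above are in scope, and NUM_ITERATIONS is an arbitrary choice.

import time
import urllib2

NUM_ITERATIONS = 1000  # arbitrary repeat count, not from the original post
url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
html = urllib2.urlopen(url).read()  # download once, parse many times

for name, scraper in [('regex', re_scraper),
                      ('BeautifulSoup', bs_scraper),
                      ('lxml', lxml_scraper)]:
    start = time.time()
    for _ in range(NUM_ITERATIONS):
        scraper(html)
    print '%s: %.2f seconds' % (name, time.time() - start)

Typically the regex and lxml scrapers finish fastest, while BeautifulSoup on the pure-Python html.parser backend is noticeably slower; lxml is usually the best trade-off between speed and robustness to layout changes.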