Web Scraping with Python reading notes, Chapter 2: Data Scraping


1 Three Ways to Scrape a Web Page

1.1 Regular Expressions

Inspecting the web page shows that every country attribute is placed in a <td class="w2p_fw"> tag; for example, this is where the country's area attribute is located:


The data can therefore be scraped with a regular expression:

# -*- coding: utf-8 -*-
import urllib2
import re

def scrape(html):
    area = re.findall('<tr id="places_area__row"><td class="w2p_fl"><label class="readonly" for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">(.*?)</td><td class="w2p_fc"></td></tr>', html)[0]
    return area

if __name__ == '__main__':
    html = urllib2.urlopen('http://example.webscraping.com/places/default/view/Afghanistan-1').read()
    print scrape(html)
However, a regular expression this tightly coupled to the markup breaks easily as soon as the web page changes.
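One way to make the pattern less fragile (a sketch against the same page, not the book's exact code) is to anchor only on the stable row id and the w2p_fw class of the value cell, and to tolerate differences in attribute quoting. This assumes the row markup stays on one line; add re.DOTALL otherwise.

import re

def scrape_area(html):
    # match on the row id and the value cell's class only, so changes to the
    # label cell or extra attributes do not break the expression; the quotes
    # around w2p_fw may be double, single, or absent
    pattern = '<tr id="places_area__row">.*?<td\s+class=["\']?w2p_fw["\']?>(.*?)</td>'
    return re.findall(pattern, html)[0]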

1.2 Beautiful Soup

First install beautifulsoup4:

pip install beautifulsoup4

The first step when using Beautiful Soup is to parse the downloaded HTML into a soup document. Beautiful Soup can also repair broken markup, such as missing attribute quotes and unclosed tags.

from bs4 import BeautifulSoup

broken_html = '<ul class=country><li>Area<li>Population</ul>'
soup = BeautifulSoup(broken_html, 'html.parser')
fixed_html = soup.prettify()
print fixed_html

ul = soup.find('ul', attrs={'class': 'country'})
li = ul.find('li')
print li.text
print ul.find_all('li')
print soup.li.li.string

The output is:

<ul class="country"> <li>  Area  <li>   Population  </li> </li></ul>AreaPopulation[<li>Area<li>Population</li></li>, <li>Population</li>]Population

1.3 Lxml

lxml can likewise repair broken markup:

import lxml.html

broken_html = '<ul class=country><li>Area<li>Population</ul>'
tree = lxml.html.fromstring(broken_html)
fixed_html = lxml.html.tostring(tree, pretty_print=True)
print fixed_html
The output is:

<ul class="country"><li>Area</li><li>Population</li></ul>

About CSS Selectors

A CSS selector is a pattern used to select elements, such as a tag name, a class, or an id; a few common patterns are illustrated in the sketch below.
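As a quick, self-contained illustration (not the book's example table; note that newer versions of lxml require the separate cssselect package):

import lxml.html

snippet = '<div id="places"><ul class="country"><li class="name">Afghanistan</li><li class="capital">Kabul</li></ul></div>'
tree = lxml.html.fromstring(snippet)

print [li.text_content() for li in tree.cssselect('li')]  # tag selector: every <li>
print tree.cssselect('li.capital')[0].text_content()      # class selector: class="capital"
print tree.cssselect('div#places')[0].tag                 # id selector: id="places"
print len(tree.cssselect('ul.country > li'))              # child selector: <li> directly inside the <ul>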


Example code that uses lxml's CSS selectors to extract the area data:

# -*- coding: utf-8 -*-
import urllib2
import lxml.html

def scrape(html):
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    area = td.text_content()
    return area

if __name__ == '__main__':
    html = urllib2.urlopen('http://example.webscraping.com/places/default/view/Afghanistan-1').read()
    print scrape(html)

1.4 Performance Comparison

The code below uses each of the three approaches to scrape every attribute from the country page, not just the area, and then benchmarks the three methods by running each scraper 1000 times:

import re
import time
import urllib2
from bs4 import BeautifulSoup
import lxml.html

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')

def re_scraper(html):
    results = {}
    for field in FIELDS:
        results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).groups()[0]
    return results

def bs_scraper(html):
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        results[field] = soup.find('table').find('tr', id='places_%s__row' % field).find('td', class_='w2p_fw').text
    return results

def lxml_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect('table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
    return results

def main():
    NUM_ITERATIONS = 1000
    for name, scraper in [('Regular expressions', re_scraper),
                          ('BeautifulSoup', bs_scraper),
                          ('Lxml', lxml_scraper)]:
        start = time.time()
        for i in range(NUM_ITERATIONS):
            if scraper == re_scraper:
                # the re module caches compiled expressions by default;
                # clear the cache so every iteration does the same work
                re.purge()
            result = scraper(html)
            assert(result['area'] == '647,500 square kilometres')
        end = time.time()
        print '%s: %.2f seconds' % (name, end - start)

if __name__ == '__main__':
    html = urllib2.urlopen('http://example.webscraping.com/places/default/view/Afghanistan-1').read()
    print re_scraper(html)
    print bs_scraper(html)
    print lxml_scraper(html)
    main()
The output is:

{'languages': 'fa-AF,ps,uz-AF,tk', 'area': '647,500 square kilometres', 'country': 'Afghanistan', 'postal_code_regex': '', 'tld': '.af', 'currency_name': 'Afghani', 'phone': '93', 'neighbours': '<div><a href="/places/default/iso/TM">TM </a><a href="/places/default/iso/CN">CN </a><a href="/places/default/iso/IR">IR </a><a href="/places/default/iso/TJ">TJ </a><a href="/places/default/iso/PK">PK </a><a href="/places/default/iso/UZ">UZ </a></div>', 'iso': 'AF', 'postal_code_format': '', 'capital': 'Kabul', 'continent': '<a href="/places/default/continent/AS">AS</a>', 'currency_code': 'AFN', 'population': '29,121,286'}
{'languages': u'fa-AF,ps,uz-AF,tk', 'area': u'647,500 square kilometres', 'country': u'Afghanistan', 'postal_code_regex': u'', 'tld': u'.af', 'currency_name': u'Afghani', 'phone': u'93', 'neighbours': u'TM CN IR TJ PK UZ ', 'iso': u'AF', 'postal_code_format': u'', 'capital': u'Kabul', 'continent': u'AS', 'currency_code': u'AFN', 'population': u'29,121,286'}
{'languages': 'fa-AF,ps,uz-AF,tk', 'area': '647,500 square kilometres', 'country': 'Afghanistan', 'postal_code_regex': '', 'tld': '.af', 'currency_name': 'Afghani', 'phone': '93', 'neighbours': 'TM CN IR TJ PK UZ ', 'iso': 'AF', 'postal_code_format': '', 'capital': 'Kabul', 'continent': 'AS', 'currency_code': 'AFN', 'population': '29,121,286'}
Regular expressions: 3.19 seconds
BeautifulSoup: 25.63 seconds
Lxml: 4.19 seconds
The gap comes from the implementations: lxml and the regular expression module are both written in C, whereas Beautiful Soup (with the html.parser backend used here) is pure Python.
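If Beautiful Soup's interface is preferred, a common speed-up (not benchmarked here, and assuming lxml is installed) is to let it use lxml as its underlying parser instead of html.parser:

from bs4 import BeautifulSoup

broken_html = '<ul class=country><li>Area<li>Population</ul>'
# parse with lxml's C-based parser rather than the pure-Python html.parser
soup = BeautifulSoup(broken_html, 'lxml')
print soup.find_all('li')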


1.5 Adding a Scrape Callback to the Link Crawler

A callback is a function that is invoked after a particular event occurs (in this case, after a web page has finished downloading). The callback here is implemented as a ScrapeCallback class that writes the scraped fields to a CSV file:

# -*- coding: utf-8 -*-
import csv
import re
import urlparse
import lxml.html
from link_crawler import link_crawler

class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'wb'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent',
                       'tld', 'currency_code', 'currency_name', 'phone',
                       'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '/places/default/view/.*?-\d', scrape_callback=ScrapeCallback())
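The callback is written as a class with a __call__ method rather than a plain function so that state, namely the csv writer and the field list, persists across calls, while an instance can still be passed anywhere a callback function is expected.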
The code for the link_crawler function is as follows:

# -*- coding: utf-8 -*-
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue

def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None,
                 user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                # invoke the callback after the page has been downloaded
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)
            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url


class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a href="(.*?)">', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/places/default/view/.*?-\d|/places/default/index', delay=0, num_retries=1, user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com', '/places/default/view/.*?-\d|/places/default/index', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
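The two calls in the __main__ block exercise the robots.txt check: the first uses the 'BadCrawler' user agent, which the example site's robots.txt disallows, so its URLs should be reported as 'Blocked by robots.txt', while the second 'GoodCrawler' run is allowed and crawls the matched links down to max_depth=1.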

