Learning to Write a Web Crawler in Python (Part 3)


This post builds a link crawler: it follows the links inside each downloaded page, uses a regular expression to decide which URLs are worth downloading, and uses a set to avoid crawling the same link twice.
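Before the full script, a tiny illustration (my own check, not part of the crawler itself) of the two pieces that sentence describes: re.match decides whether a link is worth downloading, and urlparse.urljoin turns a relative link into an absolute URL. The link value here is just an example.

# Quick check (Python 2, same idioms as the script below); '/index/1' is only an example link.
import re
import urlparse

link = '/index/1'
print re.match('/(index|view)', link) is not None
# True -> this link matches the pattern and should be crawled
print urlparse.urljoin('http://example.webscraping.com/', link)
# prints: http://example.webscraping.com/index/1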

# coding=utf-8
import urllib2
import re
import urlparse


def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    # use a set to avoid crawling duplicate URLs
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
                    # for testing: print each URL as it is queued
                    print link


def get_links(html):
    """Return a list of links from html"""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


# Don't forget the trailing slash after .com -- I left it out earlier and kept getting errors
link_crawler('http://example.webscraping.com/', '/(index|view)')
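The script above is written for Python 2 (urllib2, urlparse, and print statements). As a reference only, here is a minimal sketch of how the download function might look on Python 3, assuming the standard-library replacements urllib.request and urllib.error; the rest of the crawler stays the same apart from urllib.parse.urljoin and print() calls.

# A minimal Python 3 sketch of the same download function (not from the original script)
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', getattr(e, 'reason', e))
        html = None
        # retry only on 5xx server errors (HTTPError carries a .code attribute)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html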