Learning to Write a Web Crawler in Python (Part 3)


This post builds a link crawler: it follows the links inside each downloaded page, uses a regular expression to decide which URLs are worth downloading, and uses a set to avoid crawling the same link twice.
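Before the full script, a tiny illustration (my own check, not part of the crawler itself) of the two pieces that sentence describes: re.match decides whether a link is worth downloading, and urlparse.urljoin turns a relative link into an absolute URL. The link value here is just an example.

# Quick check (Python 2, same idioms as the script below); '/index/1' is only an example link.
import re
import urlparse

link = '/index/1'
print re.match('/(index|view)', link) is not None
# True -> this link matches the pattern and should be crawled
print urlparse.urljoin('http://example.webscraping.com/', link)
# prints: http://example.webscraping.com/index/1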

# coding=utf-8
import urllib2
import re
import urlparse


def download(url, user_agent='wswp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    # use a set to avoid crawling duplicate URLs
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
                    # for testing: print each URL as it is queued
                    print link


def get_links(html):
    """Return a list of links from html"""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


# Don't forget the trailing slash after .com -- I left it out earlier and kept getting errors
link_crawler('http://example.webscraping.com/', '/(index|view)')
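The script above is written for Python 2 (urllib2, urlparse, and print statements). As a reference only, here is a minimal sketch of how the download function might look on Python 3, assuming the standard-library replacements urllib.request and urllib.error; the rest of the crawler stays the same apart from urllib.parse.urljoin and print() calls.

# A minimal Python 3 sketch of the same download function (not from the original script)
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', getattr(e, 'reason', e))
        html = None
        # retry only on 5xx server errors (HTTPError carries a .code attribute)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html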