Learning to Write Web Crawlers in Python (Part 4)


Python ships with a robotparser module that can parse robots.txt files, and urllib2 can be used to support proxies, but the third-party requests HTTP library offers a friendlier way to implement the same functionality.
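For example, the download function could be rewritten with requests roughly as follows. This is a minimal sketch rather than part of the original code: it assumes the requests library is installed, and the function name, timeout value, and error handling are illustrative choices.

# coding=utf-8
import requests

def download_with_requests(url, user_agent='wswp', proxy=None):
    """Fetch a page with requests, optionally through a proxy."""
    headers = {'User-Agent': user_agent}
    # requests takes a mapping of scheme -> proxy URL
    proxies = {'http': proxy, 'https': proxy} if proxy else None
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        response.raise_for_status()  # treat HTTP error codes as failures
        return response.text
    except requests.RequestException as e:
        print 'Download error:', e
        return None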

By using the datetime and time modules, a delay can be added between two downloads, which helps keep the crawler from being banned.
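The idea is simply to remember when the last download happened and sleep for whatever part of the delay has not yet elapsed. A minimal single-domain sketch is shown below (the 5-second delay is an arbitrary example); the Throttle class in the full code further down applies the same logic per domain.

# coding=utf-8
import datetime
import time

DELAY = 5  # example delay in seconds between two downloads
last_accessed = None  # timestamp of the previous download

def wait():
    """Sleep just long enough so that at least DELAY seconds separate downloads."""
    global last_accessed
    if last_accessed is not None:
        sleep_secs = DELAY - (datetime.datetime.now() - last_accessed).seconds
        if sleep_secs > 0:
            time.sleep(sleep_secs)
    last_accessed = datetime.datetime.now()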

When crawling dynamic sites, some pages are generated on the fly, which can produce an effectively unlimited number of URLs, so the crawler may keep following links forever. This situation is called a crawler trap, and it can be avoided by giving the crawler a maximum crawl depth (see the sketch after the full code below).

# coding=utf-8
import urllib2
import re
import urlparse
import robotparser
import datetime
import time


# Record the last access time for each domain, so the crawler can be rate limited
class Throttle:
    """Add a delay between downloads to the same domain"""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    # Set up the proxy, if one was given
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # Retry on 5xx server errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]
    # Avoid crawling the same URL twice
    seen = set(crawl_queue)
    # Parse robots.txt with robotparser to avoid downloading disallowed pages
    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.webscraping.com/robots.txt')
    rp.read()
    throttle = Throttle(5)
    while crawl_queue:
        url = crawl_queue.pop()
        # Rate limiting happens here
        throttle.wait(url)
        if rp.can_fetch('wswp', url):
            html = download(url)
            for link in get_links(html or ''):  # download() returns None on failure
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)
                        # For testing, print each URL as it is queued
                        print link
        else:
            print 'blocked by robots.txt:', url


def get_links(html):
    """Return a list of links from html"""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


link_crawler('http://example.webscraping.com/', '/(index|view)')
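To guard against crawler traps, link_crawler can be given a depth limit as described earlier. The sketch below is an assumed extension rather than part of the original code: it reuses the download() and get_links() functions defined above, replaces the seen set with a dict that records each URL's depth, and omits the throttling and robots.txt checks for brevity; the function name and the default max_depth of 2 are illustrative.

def link_crawler_with_depth(seed_url, link_regex, max_depth=2):
    """Crawl from seed_url, but stop following links beyond max_depth."""
    crawl_queue = [seed_url]
    # Map each discovered URL to the depth at which it was found
    seen = {seed_url: 0}
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        depth = seen[url]
        if depth != max_depth:
            # Only queue further links if the depth limit has not been reached
            for link in get_links(html or ''):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)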