爬虫系列6下载一个网页(异常处理,用户代理,重试次数)

来源:互联网 发布:培训4个月ui还是java好 编辑:程序博客网 时间:2024/05/22 01:42
# -*- coding: utf-8 -*-
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes urlopen, Request and URLError from urllib.request;
    # alias it so the examples below run unchanged on either interpreter.
    import urllib.request as urllib2


# NOTE: the four definitions below are successive refinements of the same
# function. Each one rebinds the name `download`, so only the last one is in
# effect once this code has been executed.


# Version 1: download a web page.
def download(url):
    """Return the body of ``url``; any download error propagates."""
    return urllib2.urlopen(url).read()


# Downloading can hit errors we cannot foresee, which surface as exceptions.
# Version 2: catch the exception.
def download(url):
    """Return the body of ``url``, or ``None`` if the download fails.

    A ``urllib2.URLError`` is reported on stdout instead of being raised.
    """
    print('Downloading: %s' % (url,))
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print('Download Error: %s' % (e.reason,))
        html = None
    return html


# Version 3: retry the download. Some errors are temporary, so it is worth
# trying again when the problem is on the server side (HTTP 5xx).
def download(url, num_retries=2):
    """Return the body of ``url``, retrying on HTTP 5xx responses.

    Args:
        url: Address to fetch.
        num_retries: How many more attempts may be made after a 5xx error.

    Returns:
        The response body, or ``None`` if the download failed.
    """
    print('Downloading: %s' % (url,))
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print('Download Error: %s' % (e.reason,))
        html = None
        if num_retries > 0:
            # Only errors that carry an HTTP status have `code`; anything
            # outside 500-599 is not retried.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html


# Version 4: set the user agent as well as the number of retries.
def download(url, user_aget='wswp', num_retries=2):
    """Return the body of ``url``, sending a custom User-agent header.

    Args:
        url: Address to fetch.
        user_aget: Value sent as the ``User-agent`` request header. The
            spelling is kept as-is so existing keyword callers keep working.
        num_retries: How many more attempts may be made after a 5xx error.

    Returns:
        The response body, or ``None`` if the download failed.
    """
    print('Downloading: %s' % (url,))
    headers = {'User-agent': user_aget}
    request = urllib2.Request(url, headers=headers)
    try:
        # The request must be opened with urlopen(); a Request object only
        # describes the request and has no read() of its own.
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print('Download Error: %s' % (e.reason,))
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # Pass the user agent through explicitly; otherwise the
                # retry counter would land in `user_aget` and `num_retries`
                # would reset to its default on every attempt.
                return download(url, user_aget, num_retries - 1)
    return html
阅读全文
0 0