First Steps with Web Crawlers, Part 1: Implementation with urllib2 and urllib

#coding:utf8

# 1. Request and response

# GET request
# (1) Call urlopen directly: send a request to a URL and fetch the data
import urllib2
response = urllib2.urlopen('http://www.zhihu.com')
html = response.read()
print html

# (2) In two steps: build the request first, then get the response
import urllib2
# request
request = urllib2.Request('http://www.zhihu.com')
# response
response = urllib2.urlopen(request)
html = response.read()
print html

# POST request: a GET request plus request data
import urllib
import urllib2
url = 'https://www.baidu.com/'  # login URL
postdata = {'username' : '黑暗骑仕win',
            'password' : ''}  # request data: username and password
# the form data must be encoded into a format urllib2 understands;
# urllib does that job
data = urllib.urlencode(postdata)  # encode
req = urllib2.Request(url, data)  # request carrying data
response = urllib2.urlopen(req)  # response
html = response.read()
print html

# 2. Handling request headers: impersonating a browser
# Add header information by setting the User-Agent and Referer fields
import urllib
import urllib2
url = 'https://www.zhihu.com/#signin'
# set the User-Agent field
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
# set the Referer field
referer = 'https://www.zhihu.com/'
postdata = {'username' : '1518789956@qq.com',
            'password' : ''}
# write user_agent and referer into the headers
headers = {'User-Agent': user_agent, 'Referer': referer}
data = urllib.urlencode(postdata)
req = urllib2.Request(url, data, headers)  # request with headers attached
response = urllib2.urlopen(req)
html = response.read()
print html

# Alternatively, add header information with add_header
import urllib
import urllib2
url = 'https://www.zhihu.com/#signin'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
referer = 'https://www.zhihu.com/'
postdata = {'username' : '1518789956@qq.com',
            'password' : ''}
data = urllib.urlencode(postdata)
req = urllib2.Request(url)
# write user_agent and referer into the headers via add_header
req.add_header('User-Agent', user_agent)
req.add_header('Referer', referer)
# attach the data via add_data
req.add_data(data)
response = urllib2.urlopen(req)
html = response.read()
print html

# 3. Cookie handling
# urllib2 handles cookies automatically, using a CookieJar object to manage them.
# To read the value of an individual cookie:
import urllib2
import cookielib
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
for item in cookie:
    print item.name + ':' + item.value

# If you do not want urllib2 to handle cookies automatically,
# set the Cookie field of the request headers yourself
import urllib2
opener = urllib2.build_opener()
opener.addheaders.append(('Cookie', 'email=' + '1518789956@qq.com'))
req = urllib2.Request('https://www.baidu.com/')
response = opener.open(req)
print response.headers
retdata = response.read()
print retdata

# 4. Setting a timeout
# Before Python 2.6, the urllib2 API could only change the global socket timeout
import urllib2
import socket
socket.setdefaulttimeout(10)  # time out after 10 seconds
# urllib2.socket.setdefaulttimeout(10)  # another way to do the same

# From Python 2.6 on, urlopen accepts a timeout parameter directly, as follows
import urllib2
request = urllib2.Request('https://www.baidu.com/')
response = urllib2.urlopen(request, timeout=2)
html = response.read()
print html

# 5. Getting the HTTP status code
# For 200 OK, call getcode() on the returned response object to get the status code.
# For other status codes urlopen raises an exception, so check the exception
# object's code attribute instead
import urllib2
try:
    response = urllib2.urlopen('https://www.baidu.com/')
    print response.getcode()
except urllib2.HTTPError as e:
    if hasattr(e, 'code'):
        print 'Error code:', e.code

# 6. Redirects
# By default urllib2 follows redirects for HTTP 3XX status codes.
# To check whether a redirect occurred, just compare the Response URL with the
# Request URL: they differ only if a redirect happened
import urllib2
response = urllib2.urlopen('https://www.baidu.com/')
isRedirected = response.geturl() != 'https://www.baidu.com/'

# To control redirect behaviour yourself, subclass HTTPRedirectHandler
import urllib2
class RedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        pass  # returning None means the 301 is not followed
    def http_error_302(self, req, fp, code, msg, headers):
        # follow the 302, but record the original status and the target URL
        result = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code,
                                                            msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result
opener = urllib2.build_opener(RedirectHandler)
opener.open('https://www.baidu.com/')
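A minimal sketch of what blocking a redirect looks like in practice (not from the original post): because http_error_301 above returns None, urllib2 falls through to its default error handling and raises HTTPError instead of silently following the redirect. The URL below is only an illustration; whether a given site actually answers with a 301 depends on the site.

import urllib2

class RedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        pass  # do not follow 301 redirects

opener = urllib2.build_opener(RedirectHandler)
try:
    # assumption: this plain-HTTP URL answers with a 301 to an HTTPS page;
    # any permanently-redirecting URL would do for the demonstration
    opener.open('http://baidu.com/')
except urllib2.HTTPError as e:
    print 'redirect blocked, got status:', e.code  # expected: 301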
# 7. Proxy settings
# By default urllib2 reads the HTTP proxy from the http_proxy environment
# variable. Usually a ProxyHandler is used instead, to set the proxy
# dynamically in code
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
response = urllib2.urlopen('https://www.baidu.com/')
print response.read()
# urllib2.install_opener() sets the global opener, so every subsequent HTTP
# request goes through this proxy. That rules out fine-grained control, e.g.
# using two different proxy settings within one program.
# The better approach is to leave the global settings alone (no install_opener)
# and call the opener's open method in place of the global urlopen
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
response = opener.open('https://www.baidu.com/')
print response.read()
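A minimal sketch of the two-proxy case mentioned above: each request explicitly picks its own opener, and the global urlopen stays untouched. The addresses 127.0.0.1:8087 and 127.0.0.1:8118 are placeholders, not real proxies.

import urllib2

# two ProxyHandlers, one per proxy (placeholder addresses)
proxy_a = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
proxy_b = urllib2.ProxyHandler({'http': '127.0.0.1:8118'})
opener_a = urllib2.build_opener(proxy_a)
opener_b = urllib2.build_opener(proxy_b)

# each request chooses its proxy via its opener; urllib2.urlopen
# elsewhere in the program is unaffected
response_a = opener_a.open('http://www.baidu.com/')
response_b = opener_b.open('http://www.baidu.com/')
print response_a.getcode(), response_b.getcode()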