urllib和urllib2区别和常见用法

来源:互联网 发布:卡司数据 编辑:程序博客网 时间:2024/05/22 13:37

python2.x urllib和urllib2区别

  urllib 与 urllib2 的区别:
  • urlopen 第一个参数:urllib 只接受 url 字符串;urllib2 接受 url 字符串或 Request 对象(可以设置 header,包含 Referer、User-Agent 等)
  • urlencode 函数:urllib 有;urllib2 无

urllib和urllib2常常结合在一起使用。

urllib获取HTTP信息

# -*- coding:utf-8 -*-import urllibres = urllib.urlopen('http://www.iqilu.com')print 'http header:/n', res.info()  # header信息print 'http status:', res.getcode()  # 状态码print 'url:', res.geturl()  # 获取完整URLprint 'Body:/n', res.read()  # 读取HTMLfor line in res:  # 像文件一样逐行读取    print line,res.close()

urllib常用函数

data = 'id=1&title = test'  data1 = urllib.quote(data)  print data1  # id%3D1%26title%20%3D%20testprint urllib.unquote(data1)  # id=1&title = test# 空格替换成加号data2 = urllib.quote_plus(data)  print data2  # id%3D1%26title+%3D+testprint urllib.unquote_plus(data2)# id=1&title = testdata3 = urllib.urlencode({ 'name': 'dark-bull', 'age': 200 })  print data3  # age=200&name=dark-bull# 将本地路径转换成url路径data4 = urllib.pathname2url(r'd:/a/b/c/23.php')print data4 # ///D|/a/b/c/23.php# 将url路径转换成本地路径print urllib.url2pathname(data4)# D:/a/b/c/23.php 

urllib发送GET请求

import urllib# 编码URL参数params = urllib.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})>>> f = urllib.urlopen("http://domain.com/query?%s" % params)>>> print f.read()

urllib发送POST请求

import urllib# URL编码参数params = urllib.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})# urlopen第二个参数不为None,即为POST请求f = urllib.urlopen("http://www.musi-cal.com/cgi-bin/query", params)print f.read()

urllib使用HTTP代理,自动跟踪重定向

# Use an HTTP proxy; FancyURLopener also follows redirects automatically.
import urllib

proxies = {'http': 'http://proxy.example.com:8080/'}
opener = urllib.FancyURLopener(proxies)
f = opener.open("http://www.python.org")
f.read()

urllib2请求获取URL信息

# urllib2.urlopen works just like urllib.urlopen for a plain URL string.
import urllib2

res = urllib2.urlopen('http://www.iqilu.com')
# Fixed: original read from `response`, but the variable is named `res` (NameError).
html = res.read()

urllib2使用Request发送请求

import urllib2# 构造请求req = urllib2.Request("http://www.douban.com")# 使用默认opener发送请求req,返回Response类型resres= urllib2.urlopen(req)# 读取HTMLhtml = res.read()print html

urllib2伪装浏览器

修改user-agent

# Masquerade as a browser by supplying a custom User-Agent header.
import urllib
import urllib2

url = 'http://www.domain.com/login.php'
# The user_agent string goes into the request headers
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'who', 'password': '123456'}
headers = {'User-Agent': user_agent}
# urllib.urlencode handles flat dicts only (no nesting support)
data = urllib.urlencode(values)
# A non-None data argument makes this a POST request
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
the_page = response.read()

urllib2发送GET请求

模拟百度查询

import urllib import urllib2  url = 'http://www.baidu.com/s' post_data = {'wd':'iqilu.com'}# URL编码data = urllib.urlencode(values)url2 = url+'?'+data# urlopen第二个参数为None,就是GET请求res= urllib2.urlopen(url2)  # 打印响应的HTMLprint res.read() 

urllib2发送POST请求

# POST form data with urllib2: a non-None data argument to Request means POST.
import urllib
import urllib2

url = 'http://www.domain.com/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'who', 'password': '123456'}
# Override the default User-Agent
headers = {'User-Agent': user_agent}
data = urllib.urlencode(values)
# Request's second argument is the POST form body
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
the_page = response.read()

urllib2带cookie

import urllib2,urllibimport cookieliburl = r'http://www.renren.com/SysHome.do'# 创建一个cj的cookie的容器cj = cookielib.CookieJar()# 使用cookie处理器的openeropener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))# 将要POST出去的数据进行编码data = urllib.urlencode({"email":email,"password":pass})r = opener.open(url,data)print cj

urllib2使用基本的HTTP认证

也就是登录认证

# HTTP basic authentication (credentials sent with the request).
import urllib2

auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='PDQ Application',
                          uri='https://mahler:8092/site-updates.py',
                          user='klem',
                          passwd='kadidd!ehopper')
opener = urllib2.build_opener(auth_handler)
# Install as the process-wide default opener used by urllib2.urlopen
urllib2.install_opener(opener)
urllib2.urlopen('http://www.example.com/login.html')

build_opener默认提供很多处理程序, 包括代理处理程序, 代理默认会被设置为环境变量所提供的。

urllib2使用代理

# Route requests through an authenticated HTTP proxy.
import urllib2

proxy_handler = urllib2.ProxyHandler({'http': 'http://www.example.com:3128/'})
proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
proxy_auth_handler.add_password('realm', 'host', 'username', 'password')
opener = urllib2.build_opener(proxy_handler, proxy_auth_handler)
opener.open('http://www.example.com/login.html')

httplib用法

httplib 和 httplib2 httplib 是http客户端协议的实现,通常不直接使用, urllib是以httplib为基础 httplib2 是第三方库, 比httplib有更多特性。

httplib比较底层,一般使用的话用urllib和urllib2即可

#!/usr/bin/env python    # -*- coding: utf-8 -*-    import httplib  import urllib  def sendhttp():      data = urllib.urlencode({'@number': 12524, '@type': 'issue', '@action': 'show'})         headers = {"Content-type": "application/x-www-form-urlencoded",                 "Accept": "text/plain"}      conn = httplib.HTTPConnection('bugs.python.org')      conn.request('POST', '/', data, headers)      httpres = conn.getresponse()      print httpres.status      print httpres.reason      print httpres.read()             if __name__ == '__main__':        sendhttp()

python3.x urllib

这里urllib成了一个包, 此包分成了几个模块,

  • urllib.request 用于打开和读取URL,
  • urllib.error 用于处理前面request引起的异常,
  • urllib.parse 用于解析URL,
  • urllib.robotparser用于解析robots.txt文件

python2.X 中的 urllib.urlopen()被废弃, urllib2.urlopen()相当于python3.X中的urllib.request.urlopen()

# GET a URL
import urllib.request
import urllib.parse

with urllib.request.urlopen('http://www.python.org/') as f:
    print(f.read(300))

# PUT a request
DATA = b'some data'
req = urllib.request.Request(url='http://localhost:8080', data=DATA, method='PUT')
with urllib.request.urlopen(req) as f:
    # Fixed: original had `pass` in the with-body and printed f.status/f.reason
    # after the block; inspect the response while it is open.
    print(f.status)
    print(f.reason)

# Basic HTTP authentication
auth_handler = urllib.request.HTTPBasicAuthHandler()
auth_handler.add_password(realm='PDQ Application',
                          uri='https://mahler:8092/site-updates.py',
                          user='klem',
                          passwd='kadidd!ehopper')
opener = urllib.request.build_opener(auth_handler)
urllib.request.install_opener(opener)
urllib.request.urlopen('http://www.example.com/login.html')

# Use a proxy with authentication
proxy_handler = urllib.request.ProxyHandler({'http': 'http://www.example.com:3128/'})
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler()
proxy_auth_handler.add_password('realm', 'host', 'username', 'password')
opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
opener.open('http://www.example.com/login.html')

# Add a header to a single request
req = urllib.request.Request('http://www.example.com/')
req.add_header('Referer', 'http://www.python.org/')
r = urllib.request.urlopen(req)

# Change the User-agent for every request made through an opener
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
opener.open('http://www.example.com/')

# Set URL parameters for a GET request
params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
url = "http://www.musi-cal.com/cgi-bin/query?%s" % params
with urllib.request.urlopen(url) as f:
    print(f.read().decode('utf-8'))

# Set parameters for a POST request (data must be bytes)
data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
data = data.encode('ascii')
with urllib.request.urlopen("http://requestb.in/xrbl82xr", data) as f:
    print(f.read().decode('utf-8'))

# Specify a proxy explicitly
proxies = {'http': 'http://proxy.example.com:8080/'}
opener = urllib.request.FancyURLopener(proxies)
with opener.open("http://www.python.org") as f:
    f.read().decode('utf-8')

# Use no proxy at all, overriding any proxy environment variables
opener = urllib.request.FancyURLopener({})
with opener.open("http://www.python.org/") as f:
    f.read().decode('utf-8')

python2.X中的httplib被重命名为 http.client
使用 2to3 工具转换源码时, 会自动处理这几个库的导入。

参考

  • http://www.cnblogs.com/wly923/archive/2013/05/07/3057122.html
  • http://blog.csdn.net/permike/article/details/52437492
  • http://blog.csdn.net/dolphin_h/article/details/45296353