Python3 urllib库爬虫 基础

来源:互联网 发布:康乾盛世知乎 编辑:程序博客网 时间:2024/05/22 08:17

基础

add_header()添加报头

# Fetch a page while sending a custom User-Agent header.
import urllib.request

url = "http://blog.csdn.net/yudiyanwang/article/details/78322039"
req = urllib.request.Request(url)
# Replace urllib's default User-Agent with a browser-style one.
req.add_header(
    "User-Agent",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0",
)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req) as resp:
    data = resp.read()
print(data)

GET请求

# GET request: put the search keyword into the URL query string and save
# the raw response body to ./result.txt.
import urllib.parse
import urllib.request

keyword = "hello"  # keyword to search for
url = "http://www.baidu.com/s?wd=" + keyword
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as resp:
    data = resp.read()
with open("./result.txt", "wb") as fd:
    fd.write(data)

# The approach above fails with an encoding error when the keyword is
# Chinese, so the keyword must be percent-encoded before it goes into the URL.
keyword = "你好"
key_code = urllib.parse.quote(keyword)  # percent-encode as UTF-8
url = "http://www.baidu.com/s?wd=" + key_code
print(url)  # http://www.baidu.com/s?wd=%E4%BD%A0%E5%A5%BD
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as resp:
    data = resp.read()
with open("./result.txt", "wb") as fd:
    fd.write(data)

POST请求

# POST request: submit form fields to a login page and save the response
# body to ./post.txt.
#
# Form on the target (PHP) page:
#
#   <form action="" method="post">
#       <input name="name" type="text" /><br>
#       <input name="pass" type="text" /><br>
#       <input name="" type="submit" value="submit"/>
#   </form>
import urllib.parse
import urllib.request

# Address the form is posted to.
url = "http://192.168.1.108/login.html"

# Build the form data; urlencode() returns str, and Request needs bytes,
# hence the explicit encode().
postdata = urllib.parse.urlencode({
    "name": "abcdef",
    "pass": "123456",
}).encode("utf-8")

# Passing a data argument makes this request a POST.
req = urllib.request.Request(url, postdata)
# Add the header information.
req.add_header(
    "User-Agent",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0",
)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req) as resp:
    data = resp.read()
with open("./post.txt", "wb") as fd:
    fd.write(data)

开启 DebugLog:一边运行,一边打印日志

# Enable the HTTP debug log so request/response traffic is printed while
# the program runs.
import urllib.request

# debuglevel=1 turns on debug output for HTTP and HTTPS connections.
httpd = urllib.request.HTTPHandler(debuglevel=1)
httpsd = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(httpd, httpsd)
# Install the opener globally so that plain urlopen() calls use it.
urllib.request.install_opener(opener)
data = urllib.request.urlopen("http://edu.jd.com")
# The body is not needed here; close the response to release the connection.
data.close()

异常

# URLError is raised when: 1) the remote server cannot be reached,
# 2) the remote URL does not exist, 3) there is no network connection,
# 4) an HTTPError was triggered.
import urllib.error
import urllib.request

try:
    with urllib.request.urlopen("http://blog.csdn1.net") as resp:
        data = resp.read()
except urllib.error.URLError as e:
    # Only `reason` is printed: `code` is provided by HTTPError, not by a
    # plain URLError.
    print(e.reason)
else:
    print(data)

# When the URL points at a host that does not exist, the resulting exception
# cannot be handled as HTTPError; it has to be handled as URLError, which is
# the parent class of HTTPError.
原创粉丝点击