A Learning Path for Python Web Crawlers


Install builtwith, a tool that identifies the technologies a website is built with:

pip install builtwith 

Type python to enter the Python interpreter, then:

import builtwith

builtwith.parse('http://www.baidu.com')
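
builtwith.parse returns a dict mapping technology categories to the names it detected (for example something like {'javascript-frameworks': ['jQuery']}), so the result can be iterated over. A minimal sketch; the exact categories depend on the target site:

import builtwith

# parse() fetches the page and fingerprints it; the result is a
# dict of category -> list of detected technology names
tech = builtwith.parse('http://www.baidu.com')
for category, names in tech.items():
    print category, '->', ', '.join(names)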

Install the library for looking up a website's owner (WHOIS):

pip install python-whois

import whois

print whois.whois('www.baidu.com')
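
The object returned by whois.whois exposes the parsed record fields as attributes; which fields are populated varies from registrar to registrar. A minimal sketch:

import whois

record = whois.whois('www.baidu.com')
# Individual fields are available as attributes
# (availability depends on the registrar's WHOIS output)
print record.registrar
print record.name_servers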

Writing a first simple crawler

# coding: utf-8
import urllib2

def download(url):
    # Fetch the page and return its HTML source
    print 'downloading:', url
    return urllib2.urlopen(url).read()

html = download('http://www.kuaizhaohuodi.com/')
print html

A more robust version that catches download errors instead of crashing:

# coding: utf-8
import urllib2

def download(url):
    print 'downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        # Return None instead of raising when the download fails
        print 'download error:', e.reason
        html = None
    return html

html = download('http://www.kuaizhaohuodi.com/')
print html

A version that retries failed downloads:

# coding: utf-8
import urllib2

def download(url, num_retries=2):
    print 'downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'download error:', e.reason
        html = None
        if num_retries > 0:
            # Retry only on 5xx server errors; a 4xx client error
            # would fail the same way on every attempt
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html

html = download('http://httpstat.us/500')
print html
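
With num_retries=2, the 500 test URL above is requested three times in all (the original attempt plus two retries) before download gives up and returns None.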

Setting a user agent

Some servers block the default Python user agent, so this version sends its own ('wswp') by building a urllib2.Request with custom headers:

# coding: utf-8
import urllib2


def download(url, user_agent='wswp', num_retries=2):
    print 'downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html

html = download('http://www.imooc.com/')
print html
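
Everything above is Python 2 code. On Python 3, urllib2 was split into urllib.request and urllib.error; a minimal sketch of the same downloader ported to Python 3:

# coding: utf-8
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('download error:', e.reason)
        html = None
        if num_retries > 0:
            # HTTPError (a URLError subclass) carries the status code
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html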


