A Learning Path for Python Web Crawlers


Install builtwith, a tool that identifies the technologies a website is built with:

pip install builtwith 

Type python to enter the Python interpreter, then:

import builtwith

builtwith.parse('http://www.baidu.com')
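
builtwith.parse returns a dict mapping technology categories to the names it detected (for example something like {'javascript-frameworks': ['jQuery']}), so the result can be iterated over. A minimal sketch; the exact categories depend on the target site:

import builtwith

# parse() fetches the page and fingerprints it; the result is a
# dict of category -> list of detected technology names
tech = builtwith.parse('http://www.baidu.com')
for category, names in tech.items():
    print category, '->', ', '.join(names)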

Install the library for looking up a website's owner (WHOIS):

pip install python-whois

import whois

print whois.whois('www.baidu.com')
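
The object returned by whois.whois exposes the parsed record fields as attributes; which fields are populated varies from registrar to registrar. A minimal sketch:

import whois

record = whois.whois('www.baidu.com')
# Individual fields are available as attributes
# (availability depends on the registrar's WHOIS output)
print record.registrar
print record.name_servers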

Writing a first simple crawler

# coding: utf-8
import urllib2

def download(url):
    # Fetch the page and return its HTML source
    print 'downloading:', url
    return urllib2.urlopen(url).read()

html = download('http://www.kuaizhaohuodi.com/')
print html

A more robust version that catches download errors instead of crashing:

# coding: utf-8
import urllib2

def download(url):
    print 'downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        # Return None instead of raising when the download fails
        print 'download error:', e.reason
        html = None
    return html

html = download('http://www.kuaizhaohuodi.com/')
print html

A version that retries failed downloads:

# coding: utf-8
import urllib2

def download(url, num_retries=2):
    print 'downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'download error:', e.reason
        html = None
        if num_retries > 0:
            # Retry only on 5xx server errors; a 4xx client error
            # would fail the same way on every attempt
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html

html = download('http://httpstat.us/500')
print html
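
With num_retries=2, the 500 test URL above is requested three times in all (the original attempt plus two retries) before download gives up and returns None.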

Setting a user agent

Some servers block the default Python user agent, so this version sends its own ('wswp') by building a urllib2.Request with custom headers:

# coding: utf-8
import urllib2


def download(url, user_agent='wswp', num_retries=2):
    print 'downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html

html = download('http://www.imooc.com/')
print html
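
Everything above is Python 2 code. On Python 3, urllib2 was split into urllib.request and urllib.error; a minimal sketch of the same downloader ported to Python 3:

# coding: utf-8
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('download error:', e.reason)
        html = None
        if num_retries > 0:
            # HTTPError (a URLError subclass) carries the status code
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html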


