Huang Cong: Python Website Scraping (Multi-threaded Scraping, the WDPYSPIDER Class, pycurl)
Fetching a page with the standard library's urllib (Python 2):

Python
import urllib

urlItem = urllib.urlopen("http://www.baidu.com")
htmSource = urlItem.read()
urlItem.close()
print htmSource
pycurl
Download: http://pycurl.sourceforge.net/download/
Docs: http://pycurl.sourceforge.net/doc/curlobject.html
Python
import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.HTTPHEADER, ["Accept:"])
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)   # collect the response body in memory
c.setopt(pycurl.FOLLOWLOCATION, 1)        # follow redirects
c.setopt(pycurl.MAXREDIRS, 5)             # but no more than 5 of them
c.perform()
print b.getvalue()
print c.getinfo(pycurl.INFO_FILETIME)     # stays -1 unless the OPT_FILETIME option was enabled before perform()
curl_easy_setopt
These options tell libcurl how to do its job.

Callbacks:
- CURLOPT_WRITEFUNCTION: write (download) callback; libcurl hands each chunk of body data to it. A single callback never receives more than CURL_MAX_WRITE_SIZE bytes (set in the curl.h header).
- CURLOPT_WRITEDATA: write straight to a file, e.g. c.setopt(pycurl.WRITEDATA, 'E:\WebSite\py\1.txt'). Note: this does not work on Windows.
- CURLOPT_READFUNCTION: read (upload) callback.
- CURLOPT_SEEKFUNCTION: moves the data pointer; int function(void *instream, curl_off_t offset, int origin), where origin is SEEK_SET, SEEK_CUR or SEEK_END; returns CURL_SEEKFUNC_OK, CURL_SEEKFUNC_FAIL or CURL_SEEKFUNC_CANTSEEK (0, 1, 2).
- CURLOPT_OPENSOCKETFUNCTION
- CURLOPT_HEADERFUNCTION: receives header data only; size_t function(void *ptr, size_t size, size_t nmemb, void *userdata).
- CURLOPT_DEBUGFUNCTION: int curl_debug_callback(CURL *, curl_infotype, char *, size_t, void *).

HTTP behaviour:
- CURLOPT_VERBOSE: set to 1 for verbose diagnostic output.
- CURLOPT_HEADER: set to 1 to include the response headers in the returned body.
- CURLOPT_NOSIGNAL: set to 1 to keep libcurl from using signals; needed for timeouts in multi-threaded programs.
- CURLOPT_FOLLOWLOCATION: set to 1 to tell libcurl to follow any Location: redirect.
- CURLOPT_MAXREDIRS: cap on the number of redirects; -1 means unlimited (the default).
- CURLOPT_PUT, CURLOPT_POST, CURLOPT_POSTREDIR, CURLOPT_POSTFIELDS, CURLOPT_POSTFIELDSIZE, CURLOPT_POSTFIELDSIZE_LARGE, CURLOPT_COPYPOSTFIELDS, CURLOPT_HTTPPOST, CURLOPT_UPLOAD: upload/POST related.
- CURLOPT_AUTOREFERER: let libcurl set the Referer automatically on redirects.
- CURLOPT_REFERER: forge the Referer.
- CURLOPT_USERAGENT: custom User-Agent.
- CURLOPT_HTTPHEADER: custom request headers.
- CURLOPT_COOKIE: "name1=content1; name2=content2;".
- CURLOPT_COOKIEFILE, CURLOPT_COOKIEJAR, CURLOPT_COOKIESESSION: by default libcurl always loads and stores all cookies.
- CURLOPT_COOKIELIST
- CURLOPT_HTTPGET
- CURLOPT_HTTP_VERSION: CURL_HTTP_VERSION_NONE, CURL_HTTP_VERSION_1_0, CURL_HTTP_VERSION_1_1.
- CURLOPT_IGNORE_CONTENT_LENGTH: ignore the Content-Length header; for servers like Apache 1.x.
- CURLOPT_HTTP_TRANSFER_DECODING: tell libcurl whether to decode the transfer encoding (0 or 1).
- CURLOPT_HTTP200ALIASES: custom aliases for HTTP 200 responses; some servers return a non-standard 200 status line.
- CURLOPT_ENCODING: accepted content encodings, like Accept-Encoding ('', 'gzip', ...).
- CURLOPT_UNRESTRICTED_AUTH: set to 1 to keep sending credentials (user + password) after redirects.

Network options:
- CURLOPT_URL: http://xxxx, ftp://xxxx.
- CURLOPT_PROXY: HTTP proxy, as a hostname or IP address.
- CURLOPT_PROXYPORT: proxy port; can also be appended to the proxy address as ":port", e.g. :8080.
- CURLOPT_PROXYTYPE: proxy type: CURLPROXY_HTTP (default), CURLPROXY_HTTP_1_0, CURLPROXY_SOCKS4, CURLPROXY_SOCKS5, CURLPROXY_SOCKS4A, CURLPROXY_SOCKS5_HOSTNAME.
- CURLOPT_NOPROXY: domains that bypass the proxy.
- CURLOPT_HTTPPROXYTUNNEL
- CURLOPT_BUFFERSIZE: libcurl's buffer size in bytes.

Authentication:
- CURLOPT_NETRC: controls password lookup: CURL_NETRC_OPTIONAL uses the ~/.netrc file; CURL_NETRC_IGNORED (default) ignores it; CURL_NETRC_REQUIRED requires the file and ignores credentials in the URL.
- CURLOPT_NETRC_FILE: path of the ~/.netrc file to use.
- CURLOPT_USERNAME, CURLOPT_USERPWD, CURLOPT_PASSWORD, CURLOPT_PROXYUSERNAME, CURLOPT_PROXYUSERPWD
- CURLOPT_HTTPAUTH, CURLOPT_PROXYAUTH: take a bitmask of the CURLAUTH_* flags below (a combined example follows the list):
- CURLAUTH_BASIC: HTTP Basic authentication
- CURLAUTH_DIGEST: HTTP Digest authentication
- CURLAUTH_DIGEST_IE
- CURLAUTH_GSSNEGOTIATE: Kerberos 5 authentication; requires a GSS-API build
- CURLAUTH_NTLM: NTLM authentication
- CURLAUTH_ANY: enables every method and lets libcurl pick the one it considers suitable and secure
- CURLAUTH_ANYSAFE: like CURLAUTH_ANY, but excludes plain-text Basic
- CURLAUTH_ONLY: force all requests to use the selected authentication
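As a quick illustration of how these map onto pycurl, here is a minimal sketch, not from the original post, that combines several of the options above on one handle; the URL, credentials, cookie file and field values are placeholders:

Python
import pycurl
import StringIO

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://example.com/login")            # placeholder URL
c.setopt(pycurl.WRITEFUNCTION, buf.write)                   # CURLOPT_WRITEFUNCTION: body goes to memory
c.setopt(pycurl.USERAGENT, "Mozilla/5.0 (test)")            # CURLOPT_USERAGENT
c.setopt(pycurl.REFERER, "http://example.com/")             # CURLOPT_REFERER
c.setopt(pycurl.COOKIEFILE, "cookies.txt")                  # read cookies from this file
c.setopt(pycurl.COOKIEJAR, "cookies.txt")                   # write cookies back on close
c.setopt(pycurl.POSTFIELDS, "username=hzq&password=blog")   # CURLOPT_POSTFIELDS switches to POST
c.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)              # CURLAUTH_ANY: let libcurl pick a method
c.setopt(pycurl.USERPWD, "user:password")                   # CURLOPT_USERPWD
c.setopt(pycurl.VERBOSE, 1)                                 # CURLOPT_VERBOSE: debug output on stderr
c.perform()
c.close()
print buf.getvalue()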
getinfo
Call getinfo() after perform() to read transfer details:

- CURLINFO_RESPONSE_CODE: the last received HTTP or FTP code, e.g. 200, 404, 403, 505; for a proxy's CONNECT response see CURLINFO_HTTP_CONNECTCODE.
- CURLINFO_EFFECTIVE_URL: the last URL actually used.
- CURLINFO_HTTP_CONNECTCODE: the last received proxy CONNECT response code.
- CURLINFO_FILETIME
- CURLINFO_TOTAL_TIME
- CURLINFO_CONNECT_TIME
- CURLINFO_NUM_CONNECTS: how many connections were made.
- CURLINFO_CONTENT_TYPE: e.g. text/html.
- CURLINFO_REQUEST_SIZE
- CURLINFO_HEADER_SIZE
- CURLINFO_SIZE_DOWNLOAD: total bytes downloaded.
- CURLINFO_SIZE_UPLOAD
- CURLINFO_HTTPAUTH_AVAIL: bitmask of the authentication methods the server offers.
- CURLINFO_PROXYAUTH_AVAIL: bitmask of the authentication methods the proxy offers.
- CURLINFO_COOKIELIST

In pycurl, some of these drop the CURLINFO_ prefix and use INFO_ instead, e.g. INFO_COOKIELIST.
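A short sketch of reading a few of these after perform(); note that INFO_FILETIME only yields a real timestamp if the corresponding option (exposed as OPT_FILETIME in pycurl) was enabled first. The URL is the one used earlier in this post:

Python
import pycurl
import StringIO

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.WRITEFUNCTION, buf.write)
c.setopt(pycurl.OPT_FILETIME, 1)          # ask for the file time so INFO_FILETIME is filled in
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)     # e.g. 200
print c.getinfo(pycurl.EFFECTIVE_URL)     # final URL after redirects
print c.getinfo(pycurl.CONTENT_TYPE)      # e.g. 'text/html'
print c.getinfo(pycurl.TOTAL_TIME)        # seconds for the whole transfer
print c.getinfo(pycurl.SIZE_DOWNLOAD)     # total bytes downloaded
print c.getinfo(pycurl.INFO_FILETIME)     # -1 if the server did not report a time
c.close()                                 # getinfo() must be called before close()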
A rough scraping example with a shared handle
Python
import pycurl
import StringIO
import string
import random

class spider:
    def __init__(self, addHeader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            #,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + addHeader
        # one Curl handle, reused for every request this object makes
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        # forge a Google search as the Referer
        self.curl.setopt(pycurl.REFERER, 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + self.rand_str())
        #self.curl.setopt(pycurl.AUTOREFERER, 1)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)

    def __del__(self):
        pass

    def rand_str(self):
        # six random letters for the fake search query
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, fp.write)
        self.curl.perform()
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, sio.write)
        self.curl.perform()
        reval = sio.getvalue()
        sio.close()
        return reval

if __name__ == "__main__":
    get = spider(['USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'])
    print get.html("http://localhost/spider_for_test.php")
    print get.tofile("http://localhost/spider_for_test.php", r'E:\WebSite\wwwroot\test.txt')
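Because every request goes through the single pycurl.Curl handle created in __init__, settings stick between calls, but that same shared handle means one instance must not be driven from several threads at once; the multi-threaded version below therefore builds a fresh handle per fetch. A small sequential usage sketch, assuming the class above (the URL is the same test placeholder):

Python
# assumes the spider class defined above
get = spider()
for n in range(3):
    page = get.html("http://localhost/spider_for_test.php?n=%d" % n)
    print len(page)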
A multi-threaded scraping example
Python
import pycurl
import threading
import StringIO
import string
import random

class spider:
    def __init__(self, referer='', httpheader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + httpheader
        self.referer = referer

    def __del__(self):
        pass

    def fetch(self, url, stream):
        # a fresh Curl handle per request, so one spider can serve many threads
        curl = pycurl.Curl()
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.perform()
        curl.close()

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.fetch(url, fp)
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.fetch(url, sio)
        reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    import time, datetime
    dstart = datetime.datetime.now()
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    thread_pool = []
    for i in range(10):
        url = "http://localhost/test.php?n=" + str(i)
        th = threading.Thread(target=gethtml, args=(url, get))
        thread_pool.append(th)
    for i in range(10):
        thread_pool[i].start()
    for i in range(10):
        thread_pool[i].join()
    dend = datetime.datetime.now()
    print "Time span:", dend - dstart
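The example above starts one thread per URL, which does not scale to long URL lists. Here is a minimal sketch of a bounded worker pool using the standard Queue module, assuming the spider class above; the worker count and URLs are placeholders. Sharing one spider is safe here precisely because fetch() builds a fresh handle per call:

Python
import threading
import Queue

def worker(q, get):
    # pull URLs until the queue runs dry
    while True:
        try:
            url = q.get_nowait()
        except Queue.Empty:
            return
        print get.html(url)

if __name__ == "__main__":
    q = Queue.Queue()
    for i in range(100):
        q.put("http://localhost/test.php?n=" + str(i))
    get = spider()  # the multi-threaded spider class defined above
    workers = [threading.Thread(target=worker, args=(q, get)) for _ in range(10)]
    for th in workers:
        th.start()
    for th in workers:
        th.join()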
The WDPYSPIDER class (supports multi-threading, proxies, login authentication, POST)
Python
#coding:utf-8
import pycurl
import urllib
import threading
import StringIO
import string
import random

class spider:
    '''WDPYSPIDER (Whiledo Python Spider Class) scraper

    @author HzqGhost admin@whiledo.com QQ:313143468
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)'''
    def __init__(self):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] # HTTP request headers
        self.referer = '' # forged Referer
        self.connnecttimeout = 60 # connect timeout (seconds)
        self.timeout = 300 # read timeout (seconds)
        self.backheader = 0 # also return the server's HTTP headers (mostly for testing)
        self.cookesfile = "./cookes.dat" # cookie file, read and written automatically
        self.proxyuse = False # whether to go through a proxy server
        self.proxyip = [] # proxy [IP:PORT] list; one entry is picked at random
        self.proxynodomain = ['localhost', '127.0.0.1'] # domains that bypass the proxy
        self.http200alias = [] # aliases for non-standard 200 response lines
        self.error = 'WDPYERROR' # marker returned on a non-200 status

    def __del__(self):
        pass

    def fetch(self, url, stream, post={}):
        '''
        --url
        --stream [stream] StringIO or fp
        --post [dict] {'username':'hzq','password':'blog'}'''
        curl = pycurl.Curl()
        curl.setopt(pycurl.CONNECTTIMEOUT, self.connnecttimeout)
        curl.setopt(pycurl.TIMEOUT, self.timeout)
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        curl.setopt(pycurl.HTTP200ALIASES, self.http200alias)
        curl.setopt(pycurl.HEADER, self.backheader)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.COOKIEJAR, self.cookesfile)
        curl.setopt(pycurl.COOKIEFILE, self.cookesfile)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.setopt(pycurl.URL, url)
        if self.proxyuse:
            proxyip = self.proxyip[random.randint(0, len(self.proxyip) - 1)]
            curl.setopt(pycurl.PROXY, proxyip)
            #curl.setopt(pycurl.PROXYNO, self.proxynodomain) # needs pycurl >= 7.19.4
        if len(post) > 0:
            # POSTFIELDS expects a url-encoded string, not a dict
            curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post))
        status = ''
        try:
            curl.perform()
            status = curl.getinfo(pycurl.RESPONSE_CODE)
        except:
            status = curl.errstr()
        finally:
            curl.close()
        status = str(status)
        if status != '200':
            status = self.error
        return status

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename, post={}):
        fp = open(filename, 'wb')
        self.fetch(url, fp, post)
        fp.close()
        return True

    def html(self, url, post={}):
        sio = StringIO.StringIO()
        reval = self.fetch(url, sio, post)
        if reval == '200':
            reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)
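Since fetch() url-encodes the POST fields and persists cookies through cookesfile, a hypothetical login-then-fetch flow looks like this sketch; the URLs and field names are placeholders, not from the original post:

Python
# assumes the WDPYSPIDER spider class defined above
get = spider()
# log in; the session cookie is saved to ./cookes.dat via COOKIEJAR
result = get.html("http://localhost/login.php",
                  post={'username': 'hzq', 'password': 'blog'})
if result != get.error:
    # the same cookie file is replayed via COOKIEFILE on the next request
    print get.html("http://localhost/member.php")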
Author: Huang Cong (黄聪)
Source: http://www.cnblogs.com/huangcong/
The copyright of this article is shared by the author and cnblogs.com. Reposting is welcome, but unless the author agrees otherwise this notice must be retained and a clearly visible link to the original must be given on the article page; otherwise the author reserves the right to pursue legal liability.