Huang Cong: Python Website Scraping (Multi-threaded Scraping, the WDPYSPIDER Class, pycurl)
Fetching a page with the standard library's urllib (Python 2):

Python
import urllib

urlItem = urllib.urlopen("http://www.baidu.com")
htmSource = urlItem.read()
urlItem.close()
print htmSource
pycurl
Download: http://pycurl.sourceforge.net/download/
Docs: http://pycurl.sourceforge.net/doc/curlobject.html
Python
import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.HTTPHEADER, ["Accept:"])
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)   # collect the response body in memory
c.setopt(pycurl.FOLLOWLOCATION, 1)        # follow redirects
c.setopt(pycurl.MAXREDIRS, 5)             # but no more than 5 of them
c.perform()
print b.getvalue()
print c.getinfo(pycurl.INFO_FILETIME)     # stays -1 unless the OPT_FILETIME option was enabled before perform()
curl_easy_setopt
These options tell libcurl how to do its job.

Callbacks:
- CURLOPT_WRITEFUNCTION: write (download) callback; libcurl hands each chunk of body data to it. A single callback never receives more than CURL_MAX_WRITE_SIZE bytes (set in the curl.h header).
- CURLOPT_WRITEDATA: write straight to a file, e.g. c.setopt(pycurl.WRITEDATA, 'E:\WebSite\py\1.txt'). Note: this does not work on Windows.
- CURLOPT_READFUNCTION: read (upload) callback.
- CURLOPT_SEEKFUNCTION: moves the data pointer; int function(void *instream, curl_off_t offset, int origin), where origin is SEEK_SET, SEEK_CUR or SEEK_END; returns CURL_SEEKFUNC_OK, CURL_SEEKFUNC_FAIL or CURL_SEEKFUNC_CANTSEEK (0, 1, 2).
- CURLOPT_OPENSOCKETFUNCTION
- CURLOPT_HEADERFUNCTION: receives header data only; size_t function(void *ptr, size_t size, size_t nmemb, void *userdata).
- CURLOPT_DEBUGFUNCTION: int curl_debug_callback(CURL *, curl_infotype, char *, size_t, void *).

HTTP behaviour:
- CURLOPT_VERBOSE: set to 1 for verbose diagnostic output.
- CURLOPT_HEADER: set to 1 to include the response headers in the returned body.
- CURLOPT_NOSIGNAL: set to 1 to keep libcurl from using signals; needed for timeouts in multi-threaded programs.
- CURLOPT_FOLLOWLOCATION: set to 1 to tell libcurl to follow any Location: redirect.
- CURLOPT_MAXREDIRS: cap on the number of redirects; -1 means unlimited (the default).
- CURLOPT_PUT, CURLOPT_POST, CURLOPT_POSTREDIR, CURLOPT_POSTFIELDS, CURLOPT_POSTFIELDSIZE, CURLOPT_POSTFIELDSIZE_LARGE, CURLOPT_COPYPOSTFIELDS, CURLOPT_HTTPPOST, CURLOPT_UPLOAD: upload/POST related.
- CURLOPT_AUTOREFERER: let libcurl set the Referer automatically on redirects.
- CURLOPT_REFERER: forge the Referer.
- CURLOPT_USERAGENT: custom User-Agent.
- CURLOPT_HTTPHEADER: custom request headers.
- CURLOPT_COOKIE: "name1=content1; name2=content2;".
- CURLOPT_COOKIEFILE, CURLOPT_COOKIEJAR, CURLOPT_COOKIESESSION: by default libcurl always loads and stores all cookies.
- CURLOPT_COOKIELIST
- CURLOPT_HTTPGET
- CURLOPT_HTTP_VERSION: CURL_HTTP_VERSION_NONE, CURL_HTTP_VERSION_1_0, CURL_HTTP_VERSION_1_1.
- CURLOPT_IGNORE_CONTENT_LENGTH: ignore the Content-Length header; for servers like Apache 1.x.
- CURLOPT_HTTP_TRANSFER_DECODING: tell libcurl whether to decode the transfer encoding (0 or 1).
- CURLOPT_HTTP200ALIASES: custom aliases for HTTP 200 responses; some servers return a non-standard 200 status line.
- CURLOPT_ENCODING: accepted content encodings, like Accept-Encoding ('', 'gzip', ...).
- CURLOPT_UNRESTRICTED_AUTH: set to 1 to keep sending credentials (user + password) after redirects.

Network options:
- CURLOPT_URL: http://xxxx, ftp://xxxx.
- CURLOPT_PROXY: HTTP proxy, as a hostname or IP address.
- CURLOPT_PROXYPORT: proxy port; can also be appended to the proxy address as ":port", e.g. :8080.
- CURLOPT_PROXYTYPE: proxy type: CURLPROXY_HTTP (default), CURLPROXY_HTTP_1_0, CURLPROXY_SOCKS4, CURLPROXY_SOCKS5, CURLPROXY_SOCKS4A, CURLPROXY_SOCKS5_HOSTNAME.
- CURLOPT_NOPROXY: domains that bypass the proxy.
- CURLOPT_HTTPPROXYTUNNEL
- CURLOPT_BUFFERSIZE: libcurl's buffer size in bytes.

Authentication:
- CURLOPT_NETRC: controls password lookup: CURL_NETRC_OPTIONAL uses the ~/.netrc file; CURL_NETRC_IGNORED (default) ignores it; CURL_NETRC_REQUIRED requires the file and ignores credentials in the URL.
- CURLOPT_NETRC_FILE: path of the ~/.netrc file to use.
- CURLOPT_USERNAME, CURLOPT_USERPWD, CURLOPT_PASSWORD, CURLOPT_PROXYUSERNAME, CURLOPT_PROXYUSERPWD
- CURLOPT_HTTPAUTH, CURLOPT_PROXYAUTH: take a bitmask of the CURLAUTH_* flags below (a combined example follows the list):
- CURLAUTH_BASIC: HTTP Basic authentication
- CURLAUTH_DIGEST: HTTP Digest authentication
- CURLAUTH_DIGEST_IE
- CURLAUTH_GSSNEGOTIATE: Kerberos 5 authentication; requires a GSS-API build
- CURLAUTH_NTLM: NTLM authentication
- CURLAUTH_ANY: enables every method and lets libcurl pick the one it considers suitable and secure
- CURLAUTH_ANYSAFE: like CURLAUTH_ANY, but excludes plain-text Basic
- CURLAUTH_ONLY: force all requests to use the selected authentication
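As a quick illustration of how these map onto pycurl, here is a minimal sketch, not from the original post, that combines several of the options above on one handle; the URL, credentials, cookie file and field values are placeholders:

Python
import pycurl
import StringIO

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://example.com/login")            # placeholder URL
c.setopt(pycurl.WRITEFUNCTION, buf.write)                   # CURLOPT_WRITEFUNCTION: body goes to memory
c.setopt(pycurl.USERAGENT, "Mozilla/5.0 (test)")            # CURLOPT_USERAGENT
c.setopt(pycurl.REFERER, "http://example.com/")             # CURLOPT_REFERER
c.setopt(pycurl.COOKIEFILE, "cookies.txt")                  # read cookies from this file
c.setopt(pycurl.COOKIEJAR, "cookies.txt")                   # write cookies back on close
c.setopt(pycurl.POSTFIELDS, "username=hzq&password=blog")   # CURLOPT_POSTFIELDS switches to POST
c.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)              # CURLAUTH_ANY: let libcurl pick a method
c.setopt(pycurl.USERPWD, "user:password")                   # CURLOPT_USERPWD
c.setopt(pycurl.VERBOSE, 1)                                 # CURLOPT_VERBOSE: debug output on stderr
c.perform()
c.close()
print buf.getvalue()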
getinfo
Call getinfo() after perform() to read transfer details:

- CURLINFO_RESPONSE_CODE: the last received HTTP or FTP code, e.g. 200, 404, 403, 505; for a proxy's CONNECT response see CURLINFO_HTTP_CONNECTCODE.
- CURLINFO_EFFECTIVE_URL: the last URL actually used.
- CURLINFO_HTTP_CONNECTCODE: the last received proxy CONNECT response code.
- CURLINFO_FILETIME
- CURLINFO_TOTAL_TIME
- CURLINFO_CONNECT_TIME
- CURLINFO_NUM_CONNECTS: how many connections were made.
- CURLINFO_CONTENT_TYPE: e.g. text/html.
- CURLINFO_REQUEST_SIZE
- CURLINFO_HEADER_SIZE
- CURLINFO_SIZE_DOWNLOAD: total bytes downloaded.
- CURLINFO_SIZE_UPLOAD
- CURLINFO_HTTPAUTH_AVAIL: bitmask of the authentication methods the server offers.
- CURLINFO_PROXYAUTH_AVAIL: bitmask of the authentication methods the proxy offers.
- CURLINFO_COOKIELIST

In pycurl, some of these drop the CURLINFO_ prefix and use INFO_ instead, e.g. INFO_COOKIELIST.
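A short sketch of reading a few of these after perform(); note that INFO_FILETIME only yields a real timestamp if the corresponding option (exposed as OPT_FILETIME in pycurl) was enabled first. The URL is the one used earlier in this post:

Python
import pycurl
import StringIO

buf = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.WRITEFUNCTION, buf.write)
c.setopt(pycurl.OPT_FILETIME, 1)          # ask for the file time so INFO_FILETIME is filled in
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)     # e.g. 200
print c.getinfo(pycurl.EFFECTIVE_URL)     # final URL after redirects
print c.getinfo(pycurl.CONTENT_TYPE)      # e.g. 'text/html'
print c.getinfo(pycurl.TOTAL_TIME)        # seconds for the whole transfer
print c.getinfo(pycurl.SIZE_DOWNLOAD)     # total bytes downloaded
print c.getinfo(pycurl.INFO_FILETIME)     # -1 if the server did not report a time
c.close()                                 # getinfo() must be called before close()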
A rough scraping example with a shared handle
Python
import pycurl
import StringIO
import string
import random

class spider:
    def __init__(self, addHeader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            #,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + addHeader
        # one Curl handle, reused for every request this object makes
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        # forge a Google search as the Referer
        self.curl.setopt(pycurl.REFERER, 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + self.rand_str())
        #self.curl.setopt(pycurl.AUTOREFERER, 1)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)

    def __del__(self):
        pass

    def rand_str(self):
        # six random letters for the fake search query
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, fp.write)
        self.curl.perform()
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, sio.write)
        self.curl.perform()
        reval = sio.getvalue()
        sio.close()
        return reval

if __name__ == "__main__":
    get = spider(['USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'])
    print get.html("http://localhost/spider_for_test.php")
    print get.tofile("http://localhost/spider_for_test.php", r'E:\WebSite\wwwroot\test.txt')
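Because every request goes through the single pycurl.Curl handle created in __init__, settings stick between calls, but that same shared handle means one instance must not be driven from several threads at once; the multi-threaded version below therefore builds a fresh handle per fetch. A small sequential usage sketch, assuming the class above (the URL is the same test placeholder):

Python
# assumes the spider class defined above
get = spider()
for n in range(3):
    page = get.html("http://localhost/spider_for_test.php?n=%d" % n)
    print len(page)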
A multi-threaded scraping example
Python
import pycurl
import threading
import StringIO
import string
import random

class spider:
    def __init__(self, referer='', httpheader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + httpheader
        self.referer = referer

    def __del__(self):
        pass

    def fetch(self, url, stream):
        # a fresh Curl handle per request, so one spider can serve many threads
        curl = pycurl.Curl()
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.perform()
        curl.close()

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.fetch(url, fp)
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.fetch(url, sio)
        reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    import time, datetime
    dstart = datetime.datetime.now()
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    thread_pool = []
    for i in range(10):
        url = "http://localhost/test.php?n=" + str(i)
        th = threading.Thread(target=gethtml, args=(url, get))
        thread_pool.append(th)
    for i in range(10):
        thread_pool[i].start()
    for i in range(10):
        thread_pool[i].join()
    dend = datetime.datetime.now()
    print "Time span:", dend - dstart
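The example above starts one thread per URL, which does not scale to long URL lists. Here is a minimal sketch of a bounded worker pool using the standard Queue module, assuming the spider class above; the worker count and URLs are placeholders. Sharing one spider is safe here precisely because fetch() builds a fresh handle per call:

Python
import threading
import Queue

def worker(q, get):
    # pull URLs until the queue runs dry
    while True:
        try:
            url = q.get_nowait()
        except Queue.Empty:
            return
        print get.html(url)

if __name__ == "__main__":
    q = Queue.Queue()
    for i in range(100):
        q.put("http://localhost/test.php?n=" + str(i))
    get = spider()  # the multi-threaded spider class defined above
    workers = [threading.Thread(target=worker, args=(q, get)) for _ in range(10)]
    for th in workers:
        th.start()
    for th in workers:
        th.join()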
The WDPYSPIDER class (supports multi-threading, proxies, login authentication, POST)
Python
#coding:utf-8
import pycurl
import urllib
import threading
import StringIO
import string
import random

class spider:
    '''WDPYSPIDER (Whiledo Python Spider Class) scraper

    @author HzqGhost admin@whiledo.com QQ:313143468
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)'''
    def __init__(self):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] # HTTP request headers
        self.referer = '' # forged Referer
        self.connnecttimeout = 60 # connect timeout (seconds)
        self.timeout = 300 # read timeout (seconds)
        self.backheader = 0 # also return the server's HTTP headers (mostly for testing)
        self.cookesfile = "./cookes.dat" # cookie file, read and written automatically
        self.proxyuse = False # whether to go through a proxy server
        self.proxyip = [] # proxy [IP:PORT] list; one entry is picked at random
        self.proxynodomain = ['localhost', '127.0.0.1'] # domains that bypass the proxy
        self.http200alias = [] # aliases for non-standard 200 response lines
        self.error = 'WDPYERROR' # marker returned on a non-200 status

    def __del__(self):
        pass

    def fetch(self, url, stream, post={}):
        '''
        --url
        --stream [stream] StringIO or fp
        --post [dict] {'username':'hzq','password':'blog'}'''
        curl = pycurl.Curl()
        curl.setopt(pycurl.CONNECTTIMEOUT, self.connnecttimeout)
        curl.setopt(pycurl.TIMEOUT, self.timeout)
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        curl.setopt(pycurl.HTTP200ALIASES, self.http200alias)
        curl.setopt(pycurl.HEADER, self.backheader)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.COOKIEJAR, self.cookesfile)
        curl.setopt(pycurl.COOKIEFILE, self.cookesfile)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.setopt(pycurl.URL, url)
        if self.proxyuse:
            proxyip = self.proxyip[random.randint(0, len(self.proxyip) - 1)]
            curl.setopt(pycurl.PROXY, proxyip)
            #curl.setopt(pycurl.PROXYNO, self.proxynodomain) # needs pycurl >= 7.19.4
        if len(post) > 0:
            # POSTFIELDS expects a url-encoded string, not a dict
            curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post))
        status = ''
        try:
            curl.perform()
            status = curl.getinfo(pycurl.RESPONSE_CODE)
        except:
            status = curl.errstr()
        finally:
            curl.close()
        status = str(status)
        if status != '200':
            status = self.error
        return status

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename, post={}):
        fp = open(filename, 'wb')
        self.fetch(url, fp, post)
        fp.close()
        return True

    def html(self, url, post={}):
        sio = StringIO.StringIO()
        reval = self.fetch(url, sio, post)
        if reval == '200':
            reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)
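Since fetch() url-encodes the POST fields and persists cookies through cookesfile, a hypothetical login-then-fetch flow looks like this sketch; the URLs and field names are placeholders, not from the original post:

Python
# assumes the WDPYSPIDER spider class defined above
get = spider()
# log in; the session cookie is saved to ./cookes.dat via COOKIEJAR
result = get.html("http://localhost/login.php",
                  post={'username': 'hzq', 'password': 'blog'})
if result != get.error:
    # the same cookie file is replayed via COOKIEFILE on the next request
    print get.html("http://localhost/member.php")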
Author: Huang Cong (黄聪)
Source: http://www.cnblogs.com/huangcong/
The copyright of this article is shared by the author and cnblogs.com. Reposting is welcome, but unless the author agrees otherwise this notice must be retained and a clearly visible link to the original must be given on the article page; otherwise the author reserves the right to pursue legal liability.