python 多线程+gzip压缩爬虫

来源：互联网　发布：求购信息搜索软件　编辑：程序博客网　时间：2024/06/13 02:31

#coding=utf-8
import urllib
import urllib2
import threading
import HTMLParser
import Queue
import os
import StringIO
import gzip
import re
import time
class GetUrllist(HTMLParser.HTMLParser):
    """HTML parser that collects the targets of ``<a href=...>`` links.

    Every non-empty ``href`` value found on an anchor start tag while HTML is
    being fed is printed and appended, in document order, to ``Urlqueue``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # A maxsize <= 0 makes the queue unbounded.
        self.Urlqueue = Queue.Queue(-1)

    def handle_starttag(self, tag, attrs):
        """Queue the ``href`` value of an ``<a>`` start tag.

        Args:
            tag: Lower-cased tag name supplied by HTMLParser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != 'a':
            return
        for key, value in attrs:
            # HTMLParser reports a bare ``<a href>`` as ('href', None) and
            # ``href=""`` as ('href', ''); neither is a fetchable URL, and a
            # queued None would make urllib2.Request() raise in the worker.
            if key == 'href' and value:
                print(value)
                self.Urlqueue.put(value)

class spider(threading.Thread):
    """Worker thread that downloads queued URLs, harvests links and saves pages.

    Each worker repeatedly takes a URL from ``parser.Urlqueue``, downloads it
    (asking the server for gzip), feeds the body back into ``parser`` so that
    newly discovered links are queued, and writes the body to
    ``<path>/<n>.html``.

    Args:
        myname: Name assigned to the thread.
        parser: Object exposing ``Urlqueue`` (a ``Queue.Queue`` of URLs) and
            ``feed(data)``; it may be shared between several spiders.
        path: Directory the downloaded pages are written to. It is created
            on first use if it does not exist.

    Attributes:
        count: Number of pages this worker has tried to save.
        timeout: Seconds the worker waits on an empty queue before exiting.
    """

    # File numbering is shared by all spiders: with a per-thread counter every
    # worker would write 0.html, 1.html, ... and overwrite the others' pages.
    _index_lock = threading.Lock()
    _next_index = 0
    # The parser keeps an internal input buffer, so concurrent feed() calls on
    # a shared parser must not interleave.
    _feed_lock = threading.Lock()

    def __init__(self, myname, parser, path='F:\\uuuuu'):
        threading.Thread.__init__(self)
        self.count = 0
        self.name = myname
        self.parser = parser
        self.timeout = 5
        self.dicpath = path

    def run(self):
        """Process URLs until the queue has stayed empty for ``timeout`` seconds."""
        opener = urllib2.build_opener()
        while True:
            # A blocking get() with a timeout replaces the empty()/get() pair:
            # another worker could take the last item between those two calls
            # and leave this thread blocked forever. It also avoids spinning
            # the CPU while the queue is empty.
            try:
                url = self.parser.Urlqueue.get(timeout=self.timeout)
            except Queue.Empty:
                break
            try:
                data = self._fetch(opener, url)
            except Exception as e:
                # Best effort: one bad URL must not kill the worker.
                print("Request Error" + str(e))
                continue
            if data is None:
                continue
            self._harvest_links(data)
            self._save(data)

    def _fetch(self, opener, url):
        """Return the (decompressed) body of ``url``, or None if not HTTP 200."""
        # The gzip preference has to travel as a request *header*. Passing it
        # as the ``data`` argument would turn the request into a POST whose
        # body is "Accept-encoding=gzip".
        request = urllib2.Request(url, headers={'Accept-encoding': 'gzip'})
        response = opener.open(request)
        try:
            if response.code != 200:
                return None
            raw = response.read()
        finally:
            response.close()
        try:
            return gzip.GzipFile(fileobj=StringIO.StringIO(raw)).read()
        except Exception:
            # The server ignored the gzip request; the body is already plain.
            return raw

    def _harvest_links(self, data):
        """Feed ``data`` to the shared parser so its links get queued."""
        try:
            with spider._feed_lock:
                self.parser.feed(data)
        except Exception:
            print("Open Error")

    def _save(self, data):
        """Write ``data`` to the next free ``<n>.html`` file in ``dicpath``."""
        try:
            try:
                os.makedirs(self.dicpath)
            except OSError:
                # Already present (possibly created by another worker a
                # moment ago); anything else is a real failure.
                if not os.path.isdir(self.dicpath):
                    raise
            filePath = os.path.join(self.dicpath,
                                    str(spider._claim_index()) + ".html")
            print(filePath)
            self.count += 1
            # Binary mode: the body is raw bytes and must not undergo
            # newline translation.
            with open(filePath, 'wb') as out:
                out.write(data)
        except (IOError, OSError):
            print(" FileWriting Error")

    @staticmethod
    def _claim_index():
        """Return a file index that no other spider has used or will use."""
        with spider._index_lock:
            index = spider._next_index
            spider._next_index = index + 1
        return index

# Seed the shared queue with every link found on the start page.
starturl = "http://music.so.com/?src=tab_web"
parser = GetUrllist()
urldata = urllib.urlopen(starturl)
try:
    parser.feed(urldata.read())
finally:
    # Release the connection even if reading or parsing the page fails.
    urldata.close()

# Three workers share the parser, and therefore the same URL queue.
spiderlist = [spider("the" + str(i) + "spider", parser) for i in range(3)]

for t in spiderlist:
    print(t)
    t.start()

# Block until every worker has exited.
for t in spiderlist:
    t.join()

这个爬虫只是使用 HTMLParser 进行文本的解析，然后开辟了 3 个线程进行抓取。它应用起来还有许多弊端，例如无法处理动态网页、网页编码和网页加密等问题，等以后学了爬虫框架和 BeautifulSoup 之后会有新的更新！加油！！

PS：这个爬虫我没有控制 BFS 的深度，所以你懂的，自己加个变量控制一下吧。

0 0

python 多线程+gzip压缩 爬虫

python 多线程+gzip压缩爬虫