python 多线程抓取代理网站(线程池)
来源:互联网 发布:2016淘宝直通车卡首屏 编辑:程序博客网 时间:2024/05/21 09:14
import Queue, threading, sys
from threading import Thread
import time,urllib
# working thread
class Worker(Thread):
    """Daemon thread that executes jobs taken from a shared work queue.

    Each queue item is a ``(function, args, kwds)`` triple.  The function's
    return value is printed and put on ``resultQueue``.  The thread starts
    itself at the end of ``__init__`` and stops once ``workQueue.get`` raises
    ``Queue.Empty`` for the configured ``timeout``.
    """

    # Number of Worker objects constructed so far; the next worker's id.
    worker_count = 0

    def __init__(self, workQueue, resultQueue, timeout=0, **kwds):
        """Register the worker and start it immediately.

        workQueue   -- queue of (function, args, kwds) triples to execute
        resultQueue -- queue that receives each job's return value
        timeout     -- value passed as ``timeout`` to ``workQueue.get``
        kwds        -- forwarded unchanged to ``Thread.__init__``
        """
        Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        # Attribute form of setDaemon(True); must be set before start().
        self.daemon = True
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.timeout = timeout
        self.start()

    def run(self):
        ''' the get-some-work, do-some-work main loop of worker threads '''
        while True:
            # Only the fetch may end the loop: a job that happens to raise
            # Queue.Empty itself must be reported, not treated as "no work".
            try:
                item = self.workQueue.get(timeout=self.timeout)
            except Queue.Empty:
                break
            try:
                # Unpacking stays inside this guard so a malformed queue item
                # is reported like any other job failure.
                job, args, kwds = item
                res = job(*args, **kwds)
                # Single-argument print(...) parses as a print statement on
                # Python 2 and as a function call on Python 3.
                print("worker[%2d]: %s" % (self.id, str(res)))
                self.resultQueue.put(res)
            except Exception:
                # A failed job is logged and skipped; nothing is put on
                # resultQueue for it.
                print('%s %s' % ('worker[%2d]' % self.id, sys.exc_info()[:2]))
class WorkerManager:
    """Small thread pool: owns a work queue, a result queue and the workers.

    Workers are created (and, per ``Worker.__init__``, started) as soon as
    the manager is constructed; jobs are added afterwards with ``add_job``.
    """

    def __init__(self, num_of_workers=10, timeout=1):
        """Create the queues and spawn ``num_of_workers`` workers.

        num_of_workers -- how many Worker threads to create
        timeout        -- passed to every Worker as its queue-get timeout
        """
        self.workQueue = Queue.Queue()
        self.resultQueue = Queue.Queue()
        self.workers = []
        self.timeout = timeout
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        """Create ``num_of_workers`` workers bound to this manager's queues."""
        for _ in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue, self.timeout)
            self.workers.append(worker)

    def wait_for_complete(self):
        """Join every worker (last created first) and empty ``self.workers``."""
        # join() without a timeout returns only after the thread has ended,
        # so there is never a still-alive worker to put back on the list.
        while self.workers:
            self.workers.pop().join()
        print("All jobs are completed.")

    def add_job(self, callable, *args, **kwds):
        """Queue ``callable(*args, **kwds)`` for execution by a worker."""
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        """Return the next result; arguments go to ``resultQueue.get``."""
        return self.resultQueue.get(*args, **kwds)
'''
Created on 2013-7-21
@author: Administrator
'''
import os
class FileHandler(object):
    """Thin wrapper around a file opened with the built-in ``open``.

    The file is opened in ``__init__`` and stays open until ``close`` is
    called.  The object can also be used in a ``with`` statement, which
    closes the file on exit even when the body raises.
    """

    def __init__(self, filename, mode="r"):
        """Open ``filename`` with ``mode`` and keep the handle on ``self``."""
        self.handle = open(filename, mode)

    def __enter__(self):
        """Return this handler so ``with FileHandler(...) as fh`` works."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file; returns False so exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying file."""
        self.handle.close()

    def read_one_line(self):
        """Return the result of ``readline()`` on the underlying file."""
        return self.handle.readline()

    def read_lines(self):
        """Return the result of ``readlines()`` on the underlying file."""
        return self.handle.readlines()

    def seek(self, pos):
        """Move the file position to ``pos``."""
        self.handle.seek(pos)

    def tell(self):
        """Return the current file position."""
        return self.handle.tell()

    def makedir(self, dirname):
        """Create directory ``dirname`` via ``os.mkdir``."""
        os.mkdir(dirname)

    def remove_file(self, filename):
        """Delete ``filename`` via ``os.remove``."""
        os.remove(filename)

    @staticmethod
    def scan_dir(dirname):
        """List the entries of ``dirname`` as ``dirname + "/" + entry``.

        Returns False (not an empty list) when ``dirname`` is empty or None.
        """
        if not dirname:
            return False
        return [dirname + "/" + item for item in os.listdir(dirname)]

    def file_put_contents(self, content):
        """Write ``content`` to the underlying file."""
        self.handle.write(content)
'''
Created on 2013-7-27
@author: Administrator
'''
import urllib,urllib2
from thread_pool import Worker
from thread_pool import WorkerManager
import sys
import framework.tool.FileHandler as FileHandler
def test_job(id, sleep = 0.001 ):
    """Download proxy listing page number ``id`` and save it to a local file.

    id    -- page number; inserted into both the URL and the output filename
    sleep -- accepted for call compatibility; not used by this function

    Returns ``id`` whether or not the download succeeded.  Failures are
    printed together with the exception type and value, not raised.
    """
    try:
        response = urllib.urlopen('http://www.cnproxy.com/proxy%d.html' % (id))
        try:
            html = response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()
        # NOTE(review): the braces are literal, so page 1 is written to
        # "proxy_{1}.html" -- confirm that filename is intended.
        fh = FileHandler.FileHandler("proxy_{%d}.html" % id , "w")
        try:
            fh.file_put_contents(html)
        finally:
            # Close the output file even if the write fails.
            fh.close()
    except Exception:
        # Single-argument print(...) parses as a print statement on Python 2
        # and as a function call on Python 3, with the same output.
        print('%s %s' % ('[%4d]' % id, sys.exc_info()[:2]))
    return id
def test():
    """Run test_job for pages 1 through 10 on a pool of three workers."""
    import socket
    # Apply a 10 second default timeout to sockets created from here on.
    socket.setdefaulttimeout(10)
    print('start testing')
    pool = WorkerManager(3)
    page = 1
    while page <= 10:
        # Second argument is handed to test_job as its ``sleep`` parameter.
        pool.add_job(test_job, page, page * 0.001)
        page += 1
    pool.wait_for_complete()
    print('end testing')


if __name__ == '__main__':
    test()
- python 多线程抓取代理网站(线程池)
- 使用Python多线程抓取并验证代理
- [Python]代理抓取并验证-多线程
- python抓取某代理网站代理IP及端口
- 抓取 网站 代理 ip
- 多线程内容匹配抓取(线程池)
- 用python多线程抓取网站图片,速度极快
- Python多线程抓取图片
- Python 多线程抓取网页
- Python多线程抓取代理服务器
- python多线程抓取类
- 8.抓取西刺网站(代理ip网站)
- python 自动抓取代理ip
- python动态抓取代理IP
- python 通过代理抓取数据
- python 多线程与线程池
- Python多线程之线程池
- python多线程之线程池
- JNDI 学习
- 扫雷游戏
- 地形处理技巧
- 做过的题目
- 各种语音编码总结
- python 多线程抓取代理网站(线程池)
- 杭电2110-Crisis of HDU
- android应用程序如何退出?探究程序退出的方法
- JAVA培训 第一次课后笔记
- 为sourceinsight添加makefile、kconfig、*.S文件支持
- Webservice_01_快速实例
- 详解 Qt 调用 DLL功能函数
- poj2769-同余问题
- MCI详解