python线程池实现网络爬虫
来源:互联网 发布:詹姆斯09年对魔术数据 编辑:程序博客网 时间:2024/06/06 04:50
参考:http://blog.csdn.net/sding/article/details/5538089
http://blog.daviesliu.net/2006/10/09/234822/
首先是创建线程池:
线程池主要由三部分组成:工作线程列表、任务队列和结果队列。工作线程列表保存已启动的线程;任务队列存放用户通过 add_job 添加的任务;已启动的线程不断从任务队列中取出任务执行,并把返回值放入结果队列。如果任务队列在超时时间(默认 1 秒)内一直为空,线程就会退出。
try:
    import Queue  # Python 2 module name
except ImportError:  # Python 3 renamed the module to ``queue``
    import queue as Queue
import threading
import sys
from threading import Thread
import time
import urllib


# working thread
class Worker(Thread):
    """Daemon thread that repeatedly takes a job from a queue and runs it.

    A job is a ``(callable, args, kwds)`` tuple. The return value of each
    successful job is printed and put on ``resultQueue``. The thread starts
    itself at the end of ``__init__`` and exits once ``workQueue`` has stayed
    empty for ``Worker.timeout`` seconds.
    """

    # Number of Worker instances created so far; used to hand out ``id``.
    worker_count = 0
    # Seconds a worker blocks on an empty work queue before it gives up.
    timeout = 1

    def __init__(self, workQueue, resultQueue, **kwds):
        """Create and immediately start the worker.

        Args:
            workQueue: queue of ``(callable, args, kwds)`` job tuples.
            resultQueue: queue that receives each job's return value.
            **kwds: forwarded unchanged to ``threading.Thread.__init__``.
        """
        Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        # Daemon threads do not keep the interpreter alive at exit.
        self.daemon = True
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.start()

    def run(self):
        """Get-some-work, do-some-work main loop of the worker thread."""
        while True:
            # Only the queue read may end the loop; keeping it in its own
            # ``try`` means a job that happens to raise Queue.Empty itself
            # is not mistaken for "no more work".
            try:
                job, args, kwds = self.workQueue.get(timeout=Worker.timeout)
            except Queue.Empty:
                break

            try:
                res = job(*args, **kwds)
            except Exception:
                # Report the failure but keep the worker alive: re-raising
                # here would kill the thread and could strand the jobs that
                # are still waiting in the queue.
                print("%s %s" % ("worker[%2d]" % self.id, sys.exc_info()[:2]))
                continue

            print("worker[%2d]: %s" % (self.id, str(res)))
            self.resultQueue.put(res)


class WorkerManager:
    """Owns the work/result queues and a fixed set of ``Worker`` threads.

    Workers are started as soon as the manager is constructed, so jobs
    should be added promptly: a worker that sees an empty queue for
    ``Worker.timeout`` seconds exits and is not replaced.
    """

    def __init__(self, num_of_workers=10, timeout=2):
        """Create the queues and start ``num_of_workers`` workers.

        Args:
            num_of_workers: number of worker threads to start.
            timeout: stored on the instance as ``self.timeout``; nothing in
                this module reads it (workers use ``Worker.timeout``).
        """
        self.workQueue = Queue.Queue()
        self.resultQueue = Queue.Queue()
        self.workers = []
        self.timeout = timeout
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        """Start ``num_of_workers`` workers and remember them."""
        for _ in range(num_of_workers):
            self.workers.append(Worker(self.workQueue, self.resultQueue))

    def wait_for_complete(self):
        """Block until every worker thread has terminated.

        ``self.workers`` is empty afterwards.
        """
        # join() without a timeout returns only once the thread is dead, so
        # there is nothing left to re-check after it.
        while self.workers:
            self.workers.pop().join()
        print("All jobs are are completed.")

    def add_job(self, callable, *args, **kwds):
        """Queue ``callable(*args, **kwds)`` for execution by a worker."""
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        """Return the next job result.

        Arguments are forwarded to ``Queue.get`` (for example ``timeout=``),
        so ``Queue.Empty`` propagates when a timed or non-blocking get finds
        no result.
        """
        return self.resultQueue.get(*args, **kwds)
下面是线程池的使用:
try:
    import urllib2  # Python 2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2
import os
import time
import socket
from datetime import datetime
from thread_pool import *

# Directory the fetched pages are written into, one file per site name.
OUTPUT_DIR = "/tmp/ttt"


def main():
    """Fetch a fixed set of pages concurrently through a WorkerManager pool."""
    # NOTE(review): the "google_1"/"google_2" entries point at baidu.com
    # search URLs -- confirm whether that is intentional.
    url_list = {"sina": "http://www.sina.com.cn",
                "sohu": "http://www.sohu.com",
                "yahoo": "http://www.yahoo.com",
                "xiaonei": "http://www.xiaonei.com",
                "qihoo": "http://www.qihoo.com",
                "laohan": "http://www.laohan.org",
                "eyou": "http://www.eyou.com",
                "chinaren": "http://www.chinaren.com",
                "douban": "http://www.douban.com",
                "163": "http://www.163.com",
                "daqi": "http://www.daqi.com",
                "qq": "http://www.qq.com",
                "baidu_1": "http://www.baidu.com/s?wd=asdfasdf",
                "baidu_2": "http://www.baidu.com/s?wd=dddddddf",
                "google_1": "http://www.baidu.com/s?wd=sadfas",
                "google_2": "http://www.baidu.com/s?wd=sadflasd",
                "hainei": "http://www.hainei.com",
                "microsoft": "http://www.microsoft.com",
                "wlzuojia": "http://www.wlzuojia.com"}

    # Global socket timeout so one unresponsive site cannot block a worker
    # indefinitely.
    socket.setdefaulttimeout(10)

    # Without the directory every write in do_get_con would fail.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    print('start testing')
    wm = WorkerManager(50)
    for url_name, url_link in url_list.items():
        wm.add_job(do_get_con, url_name, url_link)
    wm.wait_for_complete()
    print('end testing')


def do_get_con(url_name, url_link):
    """Download ``url_link`` and save the body to ``OUTPUT_DIR/url_name``.

    Best-effort: any failure (network or file system) is reported on stdout
    and otherwise ignored, so one bad site does not affect the others.

    Args:
        url_name: short site label, used as the output file name.
        url_link: URL to fetch.
    """
    try:
        fd = urllib2.urlopen(url_link)
        try:
            data = fd.read()
        finally:
            # Release the connection even if the read fails.
            fd.close()
        # Binary mode: the response body is raw bytes.
        with open(os.path.join(OUTPUT_DIR, url_name), "wb") as f_hand:
            f_hand.write(data)
    except Exception as e:
        print("failed to fetch %s (%s): %s" % (url_name, url_link, e))


if __name__ == "__main__":
    main()
0 0
- python线程池实现网络爬虫
- Python实现网络爬虫
- Python实现网络爬虫
- Python实现网络爬虫
- python实现网络爬虫
- python实现网络爬虫
- [Python] 实现网络爬虫
- python实现网络爬虫
- Python实现网络爬虫
- [Python] 实现网络爬虫
- Python实现网络爬虫
- Python 实现网络爬虫
- Python实现网络爬虫
- python实现网络爬虫
- python单线程网络爬虫
- python爬虫实现(使用线程池)
- 简易“线程池”在Python网络爬虫中的应用
- 用python实现网络爬虫
- myBatis 使用Select top 动态参数出现“´@P0´ 附近有语法错误”的解决
- Memcached Client简要介绍
- ufldl学习笔记与编程作业:Multi
- hd 2123 An easy problem
- jQuery.Autocomplete实现自动完成功能(详解)
- python线程池实现网络爬虫
- NSRange的使用方法
- Reorder List 重排字符串
- 开源框架android-async-http使用
- 为什么Java永远比C++慢?
- os基础--多线程进程面试题02
- Converting Character Sets
- php中内置的mysql数据库连接驱动mysqlnd简介及mysqlnd的配置安装方式
- Node.js系列--模块