# Python: multithreaded scraping of a proxy-list website (thread pool)

# Source: Internet | Posted under: "2016 Taobao Zhitongche first-screen placement" | Editor: Programming Blog Network | Date: 2024/05/21 09:14

import Queue, threading, sys   
from threading import Thread   
import time,urllib   
# working thread   
class Worker(Thread):
    """Daemon thread that pulls jobs from a work queue and runs them.

    A job is a ``(function, args, kwds)`` tuple.  The return value of each
    job is printed and then pushed onto ``resultQueue``.  The thread starts
    itself as soon as it is constructed and stops once no job could be
    fetched from ``workQueue`` within ``timeout`` seconds.
    """

    # Number of workers created so far; also the source of each worker's id.
    worker_count = 0

    def __init__(self, workQueue, resultQueue, timeout=0, **kwds):
        """Create the worker and start it immediately.

        workQueue   -- queue of ``(function, args, kwds)`` jobs to execute
        resultQueue -- queue that receives the return value of every job
        timeout     -- seconds to wait for a job before the thread exits
        kwds        -- extra keyword arguments forwarded to ``Thread.__init__``
        """
        Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        # Daemon threads do not keep the interpreter alive at shutdown.
        self.daemon = True
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.timeout = timeout
        self.start()

    def run(self):
        """Fetch and execute jobs until the work queue stays empty."""
        while True:
            # Only the fetch is guarded by Queue.Empty, so an error raised
            # by the queue itself (e.g. an invalid timeout) is not mistaken
            # for a failing job and retried forever.
            try:
                job = self.workQueue.get(timeout=self.timeout)
            except Queue.Empty:
                break

            # A failing job is reported and skipped; the worker keeps going.
            # ``Exception`` (not a bare except) lets SystemExit and
            # KeyboardInterrupt propagate.
            try:
                func, args, kwds = job
                res = func(*args, **kwds)
                print("worker[%2d]: %s" % (self.id, str(res)))
                self.resultQueue.put(res)
            except Exception:
                print('%s %s' % ('worker[%2d]' % self.id, sys.exc_info()[:2]))
                  
class WorkerManager:
    """Owns a job queue, a result queue and a pool of ``Worker`` threads.

    Workers are started in the constructor and exit on their own once the
    job queue has been empty for ``timeout`` seconds, so jobs should be
    added promptly; ``wait_for_complete`` recruits replacement workers for
    any jobs that were queued after the pool had already given up.
    """

    def __init__(self, num_of_workers=10, timeout=1):
        """Create the queues and start ``num_of_workers`` worker threads.

        num_of_workers -- size of the thread pool
        timeout        -- seconds an idle worker waits for a job before exiting
        """
        self.workQueue = Queue.Queue()
        self.resultQueue = Queue.Queue()
        self.workers = []
        self.timeout = timeout
        # Remembered so that replacement workers never exceed the pool size.
        self._pool_size = num_of_workers
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        """Start ``num_of_workers`` new workers and track them."""
        for _ in range(num_of_workers):
            self.workers.append(
                Worker(self.workQueue, self.resultQueue, self.timeout))

    def wait_for_complete(self):
        """Block until every worker has exited and the job queue is drained."""
        while self.workers:
            # join() without a timeout returns only after the thread ended,
            # so there is no point in re-checking whether it is still alive.
            self.workers.pop().join()
            # All workers are gone but jobs remain: they were added after
            # the pool timed out.  Recruit just enough workers to drain them.
            if not self.workers and not self.workQueue.empty():
                self._recruitThreads(
                    min(self._pool_size, self.workQueue.qsize()))
        print("All jobs are completed.")

    def add_job(self, callable, *args, **kwds):
        """Queue ``callable(*args, **kwds)`` for execution by a worker."""
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        """Return the next job result; arguments are passed to ``Queue.get``."""
        return self.resultQueue.get(*args, **kwds)


'''
Created on 2013-7-21


@author: Administrator
'''


import os


class FileHandler(object):
    """Small convenience wrapper around a file object returned by ``open``.

    The file is opened in the constructor and must be released with
    ``close()``, or by using the instance as a context manager::

        with FileHandler("out.txt", "w") as fh:
            fh.file_put_contents("data")
    """

    def __init__(self, filename, mode="r"):
        """Open ``filename`` with the given ``mode`` (default: read)."""
        self.handle = open(filename, mode)

    def __enter__(self):
        """Return the handler itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving the ``with`` block; never hide errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying file."""
        self.handle.close()

    def read_one_line(self):
        """Return the next line of the file (empty string at end of file)."""
        return self.handle.readline()

    def read_lines(self):
        """Return all remaining lines of the file as a list."""
        return self.handle.readlines()

    def seek(self, pos):
        """Move the file position to ``pos``."""
        self.handle.seek(pos)

    def tell(self):
        """Return the current file position."""
        return self.handle.tell()

    def makedir(self, dirname):
        """Create the directory ``dirname`` (via ``os.mkdir``)."""
        os.mkdir(dirname)

    def remove_file(self, filename):
        """Delete the file ``filename`` (via ``os.remove``)."""
        os.remove(filename)

    @staticmethod
    def scan_dir(dirname):
        """List the entries of ``dirname`` as ``dirname + "/" + entry``.

        Returns ``False`` when ``dirname`` is empty or otherwise falsy.
        """
        if not dirname:
            return False
        return [dirname + "/" + item for item in os.listdir(dirname)]

    def file_put_contents(self, content):
        """Write ``content`` to the file at the current position."""
        self.handle.write(content)
    
    
        
        
    



'''

Created on 2013-7-27


@author: Administrator
'''
import urllib,urllib2
from thread_pool import Worker
from thread_pool import WorkerManager
import sys
import framework.tool.FileHandler as FileHandler


def test_job(id, sleep=0.001):
    """Download proxy-list page number ``id`` and save it to a local file.

    The page is written to ``proxy_{<id>}.html`` (the braces are part of
    the file name) in the current directory.  Any failure is printed rather
    than raised, so one bad page does not stop the other jobs.

    id    -- page number inserted into the URL and the output file name
    sleep -- accepted for compatibility with callers; currently unused
    Returns ``id`` in all cases.
    """
    try:
        response = urllib.urlopen('http://www.cnproxy.com/proxy%d.html' % (id))
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        fh = FileHandler.FileHandler("proxy_{%d}.html" % id, "w")
        try:
            fh.file_put_contents(html)
        finally:
            # Do not leak the file handle when the write fails.
            fh.close()
    except Exception:
        print('%s %s' % ('[%4d]' % id, sys.exc_info()[:2]))
    return id
 
def test():
    """Demo run: fetch proxy pages 1 through 10 with a pool of 3 workers."""
    import socket

    # Global socket timeout so that a stalled download cannot hang a worker.
    socket.setdefaulttimeout(10)

    print('start testing')
    manager = WorkerManager(3)
    for page in range(1, 11):
        manager.add_job(test_job, page, page * 0.001)
    manager.wait_for_complete()
    print('end testing')
   
# Run the demo only when this file is executed as a script, not on import.
if  __name__ == '__main__' :
    test()
# (Web page footer: "original content / followers / clicks")