Python 多队列数据挖掘网站

来源:互联网 发布:ae cs6 mac破解版下载 编辑:程序博客网 时间:2024/06/06 05:09
#!/usr/bin/evn python#coding=utf-8import Queueimport threadingimport urllib2import timeimport HTMLParserfrom BeautifulSoup import BeautifulSouphosts = ['http://yahoo.com','http://google.com','http://amazon.com','http://ibm.com','http://apple.com']queue = Queue.Queue()out_queue = Queue.Queue()class ThreadUrl(threading.Thread):def __init__(self,queue,out_queue):threading.Thread.__init__(self)self.queue = queueself.out_queue = out_queuedef run(self):while True:host = self.queue.get()url = urllib2.urlopen(host)self.out_queue.put(url.read())self.queue.task_done()class DatamineThread(threading.Thread):def __init__(self,out_queue):threading.Thread.__init__(self)self.out_queue = out_queuedef run(self):while True:chunk = self.out_queue.get()soup = BeautifulSoup(chunk)print soup.findAll(['title'])self.out_queue.task_done()def main():for i in range(5):t = ThreadUrl(queue,out_queue)t.setDaemon(True)t.start()for host in hosts:queue.put(host)for i in range(5):dt = DatamineThread(out_queue)dt.setDaemon(True)dt.start()queue.join()out_queue.join()if __name__ == '__main__':start = time.time()main()print 'Elapsed Time:%s' % (time.time()-start)

原创粉丝点击