python --- multithreaded crawler example

The script below fetches ten list pages from a Baidu Tieba forum with three crawl threads, hands the raw HTML to three parse threads through a shared queue, and appends the extracted post titles as JSON lines to duanzi.json.

# coding:utf-8
import time
import json
import threading
from Queue import Queue

import requests
from lxml import etree

# Module-level flags the worker threads poll to know when to exit.
CRAWL_EXIT = False
PARSE_EXIT = False


class ThreadCrawl(threading.Thread):
    """Fetches one Tieba list page at a time and puts the raw HTML on dataQueue."""

    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue    # page numbers waiting to be fetched
        self.dataQueue = dataQueue    # raw HTML handed off to the parse threads
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    def run(self):
        print "Starting " + self.threadName
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: raises Queue.Empty when no pages are left,
                # which the bare except below swallows so the loop keeps polling.
                page = self.pageQueue.get(False)
                url = "https://tieba.baidu.com/f?kw=%E5%A6%B9%E5%AD%90&ie=utf-8&pn=" + str(page * 50)
                content = requests.get(url, headers=self.headers).text
                time.sleep(1)    # throttle the requests
                self.dataQueue.put(content)
            except:
                pass
        print "Exiting " + self.threadName


class ThreadParse(threading.Thread):
    """Takes raw HTML off dataQueue, extracts post titles, and writes them to the output file."""

    def __init__(self, threadName, dataQueue, filename, lock):
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.fileName = filename
        self.lock = lock    # serializes writes to the shared file

    def run(self):
        print "Starting " + self.threadName
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except:
                pass
        print "Exiting " + self.threadName

    def parse(self, html):
        html = etree.HTML(html)
        nodeList = html.xpath('//*[@id="thread_list"]//li/div/div[2]/div[1]/div[1]/a')
        for title in nodeList:
            items = {
                "title": title.text
            }
            with self.lock:
                self.fileName.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")


def main():
    # Queue of page numbers 1-10 for the crawl threads to consume.
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    # Unbounded queue carrying raw HTML from the crawl threads to the parse threads.
    dataQueue = Queue()

    filename = open("duanzi.json", "a")
    lock = threading.Lock()

    # Start three crawl threads.
    crawlList = ["Crawl thread 1", "Crawl thread 2", "Crawl thread 3"]
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Start three parse threads.
    parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename, lock)
        thread.start()
        threadparse.append(thread)

    # Busy-wait until every page number has been taken, then signal the crawl threads to exit.
    while not pageQueue.empty():
        pass
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print "pageQueue is empty"
    for thread in threadcrawl:
        thread.join()

    # Busy-wait until all fetched HTML has been parsed, then signal the parse threads to exit.
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in threadparse:
        thread.join()

    with lock:
        filename.close()
    print "Done. Thanks for using!"


if __name__ == "__main__":
    main()
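The busy-wait loops (while not pageQueue.empty(): pass) plus the CRAWL_EXIT/PARSE_EXIT globals do work, but they spin the CPU and leave a small race: a queue can report empty while a worker is still processing its last item. A common alternative is to put one sentinel value per worker onto the queue and let each thread exit when it receives one. Below is a minimal sketch of that pattern; the SENTINEL marker and the worker function are illustrative names, not part of the original script.

# coding:utf-8
from Queue import Queue
import threading

SENTINEL = None    # "poison pill": tells a worker to shut down

def worker(name, taskQueue):
    while True:
        item = taskQueue.get()    # blocking get, no busy-wait
        if item is SENTINEL:
            taskQueue.task_done()
            break                 # clean exit, no global flag needed
        print "%s handling %s" % (name, item)
        taskQueue.task_done()

def main():
    taskQueue = Queue()
    for i in range(1, 11):
        taskQueue.put(i)

    threads = []
    for n in range(3):
        t = threading.Thread(target=worker, args=("worker-%d" % n, taskQueue))
        t.start()
        threads.append(t)

    # One sentinel per worker so every thread sees exactly one.
    for _ in threads:
        taskQueue.put(SENTINEL)

    taskQueue.join()    # blocks until every put() has been matched by task_done()
    for t in threads:
        t.join()

if __name__ == "__main__":
    main()

With blocking get() plus sentinels the shutdown is deterministic and the polling loops and exit globals disappear; the same idea would extend to the two-stage pipeline above by having main() feed sentinels into dataQueue once the crawl threads have joined.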