多线程实际应用

来源:互联网 发布:access数据库下载 绿色 编辑:程序博客网 时间:2024/06/14 19:25
# -*- coding:utf-8 -*-
"""Multi-threaded crawler: download listing pages, then extract image URLs.

The script runs in two phases:

1. ``DownloadThread`` workers take page numbers from a queue, fetch each
   page over HTTP and put the raw response body on ``parse_content_queue``.
2. ``ParseThread`` workers take those bodies, parse them with lxml and write
   every ``<img src>`` value to ``result.txt``.

Progress messages from both phases are written to the same ``result.txt``.
"""
import codecs
import sys
import threading
from time import sleep  # unused here; kept because the original script imports it

try:
    from queue import Empty, Queue  # Python 3
except ImportError:
    from Queue import Empty, Queue  # Python 2

import requests
from lxml import etree

if sys.version_info[0] == 2:
    # Python 2 only: the log messages are UTF-8 byte strings that the codecs
    # writer decodes implicitly, which needs a UTF-8 default encoding.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

# Number of listing pages to fetch (pages 1..PAGE_COUNT).
PAGE_COUNT = 30
# The page number is appended directly; there must be no whitespace here.
PAGE_URL_PREFIX = "http://blog.jobbole.com/all-posts/page/"
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0",
}
# Seconds to wait on a single HTTP request before treating it as failed.
REQUEST_TIMEOUT = 10
# Seconds a parser waits for work before re-checking ``parse_switch``.
QUEUE_POLL_INTERVAL = 1

# Shared output file, written by every worker thread through write_log().
file_handle = codecs.open("result.txt", "w", encoding="utf-8")
_file_lock = threading.Lock()

# Downloaded page bodies waiting to be parsed.
parse_content_queue = Queue()
# Parsers keep running while this is true; main() clears it to stop them.
parse_switch = True


def write_log(*lines):
    """Write each of ``lines`` to the result file, followed by a newline.

    The lock is held for the whole call, so the lines of one call stay
    together even when several threads are logging at the same time.
    """
    with _file_lock:
        for line in lines:
            file_handle.write(line + "\n")


class DownloadThread(threading.Thread):
    """Worker that downloads the pages whose numbers it takes from ``queue``."""

    def __init__(self, thread_name, queue):
        """Store the display name used in log lines and the page-number queue."""
        super(DownloadThread, self).__init__()
        self.thread_name = thread_name
        self.queue = queue

    def run(self):
        """Download pages until the page-number queue is exhausted.

        Each successfully downloaded body (HTTP 200) is put on
        ``parse_content_queue``; any other outcome is logged as a failure.
        """
        while True:
            # get_nowait() instead of empty()+get(): another worker may take
            # the last item between the two calls, and a blocking get() would
            # then wait forever.
            try:
                page = self.queue.get_nowait()
            except Empty:
                write_log(self.thread_name + "已经下班")
                break

            write_log(self.thread_name + "准备下载" + str(page))
            url = PAGE_URL_PREFIX + str(page)
            try:
                response = requests.get(
                    url=url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
                )
            except requests.RequestException:
                # A network error or timeout counts as a failed download
                # rather than killing the worker thread.
                response = None

            if response is not None and response.status_code == 200:
                parse_content_queue.put(response.content)
            else:
                write_log(self.thread_name + "网页下载失败" + url)
            write_log(self.thread_name + "下载完成" + str(page))


class ParseThread(threading.Thread):
    """Worker that extracts image URLs from downloaded page bodies."""

    def __init__(self, thread_name):
        """Store the display name used in log lines."""
        super(ParseThread, self).__init__()
        self.thread_name = thread_name

    def run(self):
        """Parse queued page bodies until ``parse_switch`` is cleared."""
        while parse_switch:
            # A timed get lets the loop re-check parse_switch; an untimed
            # get() would block forever once the queue is drained.
            try:
                content = parse_content_queue.get(timeout=QUEUE_POLL_INTERVAL)
            except Empty:
                continue
            try:
                self._parse(content)
            finally:
                # Always acknowledge the item so parse_content_queue.join()
                # in main() cannot hang on a page that failed to parse.
                parse_content_queue.task_done()

    def _parse(self, content):
        """Write the ``<img src>`` values found in one page body to the log."""
        lines = [
            "--------------------------",
            self.thread_name + "开始解析",
        ]
        try:
            doc = etree.HTML(content)
        except etree.LxmlError:
            doc = None
        # NOTE(review): the None check is defensive, in case etree.HTML
        # returns None instead of raising for an empty document -- confirm
        # against the installed lxml version.
        if doc is None:
            lines.append(self.thread_name + "网页解析失败")
        else:
            lines.extend(doc.xpath("//img/@src"))
        lines.append(self.thread_name + "结束解析")
        # One call, so the output for a page is not interleaved with the
        # output of the other parser threads.
        write_log(*lines)


def main():
    """Run the download phase, then the parse phase, then close the log."""
    global parse_switch

    download_queue = Queue(maxsize=PAGE_COUNT)
    for page in range(1, PAGE_COUNT + 1):
        download_queue.put(page)

    download_thread_names = [
        "下载器1",
        "下载器2",
        "下载器3",
        "下载器4",
    ]
    parse_thread_names = [
        "解析器1",
        "解析器2",
        "解析器3",
    ]

    try:
        download_thread_list = [
            DownloadThread(thread_name, download_queue)
            for thread_name in download_thread_names
        ]
        for thread in download_thread_list:
            thread.start()
        # join() already waits for the workers to finish, so no busy-wait
        # on the queue is needed.
        for thread in download_thread_list:
            thread.join()

        parse_switch = True
        parse_thread_list = [
            ParseThread(thread_name) for thread_name in parse_thread_names
        ]
        for thread in parse_thread_list:
            thread.start()
        try:
            # Returns once every queued page has been acknowledged with
            # task_done(), i.e. fully parsed, not merely dequeued.
            parse_content_queue.join()
        finally:
            # Stop the parsers even if the wait above is interrupted.
            parse_switch = False
            for thread in parse_thread_list:
                thread.join()
    finally:
        file_handle.close()


if __name__ == '__main__':
    main()