Multithreaded download tool for large numbers of tiny files.

来源:互联网 发布:插画师卤猫 知乎 编辑:程序博客网 时间:2024/05/16 10:10

HTTP File Downloader

Built with the Python 2.7 standard library

# -*- coding: utf-8 -*-
"""
    Multithread download tool for massive tiny files.

    A list of ``(file_name, file_url)`` tuples is pushed onto a shared queue
    and drained by ``DownLoad`` worker threads, each of which fetches the URL
    with browser-like headers and stores the payload as ``<name>.torrent``.
"""
#-------------------------
# Author: Kun Liu
# Start date: 2017-03-06
# Latest edit: 2017-03-16
# email = lancelotdev@163.com
# python_version = Python 2.7.11
#===================================
#-----Python 3 Compatible
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
#---------------------------------
import re
import os
import sys
import threading
import urllib
import json
import logging
from time import ctime, sleep

# The queue and URL modules were renamed in Python 3; fall back to the new
# names under the old aliases so the rest of the file is version independent.
try:
    import Queue
except ImportError:
    import queue as Queue

try:
    import urllib2
except ImportError:
    import urllib.request as urllib2

# Parameters that make the request look like it comes from a browser.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

# Log settings: failed downloads are recorded in pythonDownload.log.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename='pythonDownload.log',
                    filemode='w')

# Guards success_case_num, which is incremented from several worker threads.
mutex = threading.Lock()
success_case_num = 0

# Characters stripped from file names before they are used on disk.
_ILLEGAL_NAME_CHARS = re.compile(r"""[|\\?*<":>+\[\]/']""")


def agent_request(url, timeout=40):
    """Fetch *url* with the module-level browser headers.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The raw response body.

    Raises:
        Whatever ``urlopen`` raises for an unreachable or malformed URL.
    """
    req = urllib2.Request(url, None, headers)
    resp = urllib2.urlopen(req, None, timeout)
    try:
        return resp.read()
    finally:
        # Python 2 responses are not context managers, so close explicitly.
        resp.close()


def _legalize_file_name(file_name):
    """Return *file_name* with the characters in _ILLEGAL_NAME_CHARS removed."""
    return _ILLEGAL_NAME_CHARS.sub("", file_name)


class DownLoad(threading.Thread):
    """Worker thread that drains a queue of ``(file_name, file_url)`` tuples.

    Attributes:
        que: Queue the worker pulls its jobs from.
        folder_name: Directory the downloaded files are written to.
        fail_file_list: URLs whose download or write raised an exception.
    """

    def __init__(self, file_que, folder_name="PyDownload"):
        """Prepare the worker.

        Args:
            file_que: Either a ``Queue`` shared with other workers or a plain
                list of ``(file_name, file_url)`` tuples, which is copied
                into a private queue.
            folder_name: Output directory; created when it does not exist.
        """
        threading.Thread.__init__(self)
        # Create the target directory whatever its name is, not only for the
        # default one, so that writes into a custom folder do not all fail.
        if folder_name and not os.path.isdir(folder_name):
            os.makedirs(folder_name)
        if isinstance(file_que, list):
            self.que = Queue.Queue()
            for item in file_que:
                self.que.put(item)
        else:
            self.que = file_que
        self.folder_name = folder_name
        self.fail_file_list = []

    def run(self):
        """Download queued files until the queue is empty.

        Failures are logged and their URL appended to ``fail_file_list``;
        every success increments the module-level ``success_case_num``.
        """
        global success_case_num
        print("%d thread is working!" % threading.active_count())
        while True:
            # get_nowait() avoids the empty()/get() race in which another
            # worker takes the last item and this one blocks forever.
            try:
                file_tuple = self.que.get_nowait()
            except Queue.Empty:
                return
            file_name, file_url = file_tuple[0], file_tuple[1]
            try:
                file_name = _legalize_file_name(file_name)
                if not file_name.endswith(".torrent"):
                    file_name += ".torrent"
                # Download before opening the target so that a failed
                # request does not leave an empty file behind.
                file_data = agent_request(file_url)
                target = os.path.join(self.folder_name, file_name)
                with open(target, 'wb') as out_file:
                    out_file.write(file_data)
            except Exception as e:
                self.fail_file_list.append(file_url)
                logging.warning("DownLoad error:%sFail file: %s", e, file_url)
            else:
                with mutex:
                    success_case_num += 1


class DownLoadDispatcher:
    """Feeds a list of ``(file_name, file_url)`` tuples to DownLoad workers."""

    def __init__(self, name_url_tuple_list):
        """Remember the jobs to hand out.

        Args:
            name_url_tuple_list: Iterable of ``(file_name, file_url)`` tuples.
        """
        self.file_list = name_url_tuple_list

    def start_download(self, thread_num=1):
        """Queue every job and start the worker threads.

        Args:
            thread_num: Number of worker threads to start.
        """
        try:
            file_que = Queue.Queue()
            for f_tuple in self.file_list:
                file_que.put(f_tuple)
            for _ in range(thread_num):
                d = DownLoad(file_que)
                d.start()
                # Download frequency control
                sleep(1)
        except Exception as e:
            print("pic_downloader exception:" + str(e))
0 0