多线程批量转换文件编码, 从GBK, GB2312编码转换到UTF-8编码(Python)

来源:互联网 发布:max 无法录入数据 编辑:程序博客网 时间:2024/04/26 01:59

# coding=utf-8# author:Jeffrey Ma# version:0.1# build 2# created on:2015年3月31日# description:  1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码#               2. 支持指定目录下所有的文件的转换,包括子目录中的文件#               3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换#               4. 支持只转换指定扩展名的编码#               5. 支持多线程转换和控制台输出#               6. 支持控制台显示线程池的状态#               7. 支持日志记录# usage: python gbk2utf8.py  -s [文件路径]# args : 文件的绝对路径# notes : 转换前请备份原始文件,转换后的文件会覆盖原文件。from __future__ import divisionimport sysimport osimport getoptimport loggingimport logging.configimport Queueimport threadpoolimport threadingfrom threading import Threadfrom multiprocessing.dummy import Pool as ThreadPoolimport chardetimport cursesimport timeimport localelocale.setlocale(locale.LC_ALL, "")global loggerglobal stdscrglobal poolstdscr = curses.initscr()def GBK2UTF8(filename):    threadName = threading.currentThread().getName()    f = open(filename, 'rb')    s = f.read()    f.close()    encodingName = chardet.detect(s)['encoding']    str = "";    if (encodingName.startswith('GB')):        # GBK码,需要转换        try:            gbkContent = s.decode(encodingName)            utf8Content = gbkContent.encode('utf-8')            f = open(filename, 'w')            f.write(utf8Content)            f.close()        except UnicodeDecodeError:            str = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)            # logger.error("%s: %s, %s 转换出错" % (threadName, filename, encodingName))            # logger.error('%s: decoe error %s' % (threadName, UnicodeDecodeError.reason))            pass        str = "%s: %s, %s 转换done" % (threadName, filename, encodingName)    else:        # 已经是UTF-8不需要转换        str = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)    return {"tName": threadName, "encodingName": encodingName,"filename":filename, "result":str}def initLogger():    global logger    # 日志初始化    LOG_FILENAME = 'logging.conf'    logging.config.fileConfig(LOG_FILENAME)    logger = logging.getLogger("GBK2UTF8")    # 测试代码    # logger.debug("debug message")    # logger.info("info message")    # logger.warn("warn message")    # logger.error("error message")    # logger.critical("critical message")def main():    initLogger()    shortargs = 's:d'    longargs = ['src=', 'dest']    try:        opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs)    except getopt.GetoptError, err:        # print help information and exit:        print str(err) # will print something like "option -a not recognized"        # usage()        print "Usage: python gbk2utf8.py -s [file full path]"        return        # sys.exit(2)    srcPath = None    destPath = None    for o, a in opts:        if o in ("-s", "--src"):            srcPath = a        elif o in ("-d", "--dest"):            destPath = a        else:            assert False, "unhandled option"    if (srcPath != None and os.path.exists(srcPath) and os.path.isdir(srcPath)):        doWork(srcPath)def doWork(sPath):    # Make the Pool of workers    global pool    pool = threadpool.ThreadPool(10)    extFilters = ['xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl']    i = 0    arrFiles = []    for root, dirs, files in os.walk(sPath):        for file in files:            # print root            # print file            i = i+1            sFilePath = root + os.sep + file            extension = os.path.splitext(sFilePath)[1][1:]            if (extension in extFilters):                arrFiles.append(sFilePath)            else:                logger.info('Skipping %s' % sFilePath)    print 'waiting...job'    curses.noecho()    curses.cbreak()    requests = threadpool.makeRequests(GBK2UTF8, arrFiles, print_result)    [pool.putRequest(req) for req in requests]    #close the pool and wait for the work to finish    pool.wait()    curses.nocbreak()    curses.echo()    curses.endwin()    print 'end job'def print_result(request, result):    try:        idx = 0        for t in pool.workers:            idx = idx+1            if(t.getName() == result["tName"]):                break        if idx > 0:            y, x = stdscr.getmaxyx()            # stdscr.deleteln()            text = result["result"]            textLen = len(text)            text = text.ljust(x)            stdscr.addstr(idx, 0, text)            stdscr.refresh()            logger.info(text)    except curses.error:        passif __name__ == '__main__':    main()




# coding=utf-8# author:Jeffrey Ma# version:0.1# build 2# created on:2015年3月31日# description:  1. 批量转换文件编码,从GBK GB2312编码转换到UTF-8编码#               2. 支持指定目录下所有的文件的转换,包括子目录中的文件#               3. 支持检测原始编码,对已经是UTF-8编码的文件,不做转换#               4. 支持只转换指定扩展名的编码#               5. 支持多线程转换和控制台输出#               6. 支持控制台显示线程池的状态#               7. 支持日志记录# usage: python gbk2utf8.py  -s [文件路径]# args : 文件的绝对路径# notes : 转换前请备份原始文件,转换后的文件会覆盖原文件。from __future__ import divisionimport sysimport osimport getoptimport loggingimport logging.configimport Queueimport threadpoolimport threadingfrom threading import Threadfrom multiprocessing.dummy import Pool as ThreadPoolimport chardetimport cursesimport timeimport localelocale.setlocale(locale.LC_ALL, "")global loggerglobal stdscrglobal poolstdscr = curses.initscr()def GBK2UTF8(filename):    threadName = threading.currentThread().getName()    f = open(filename, 'rb')    s = f.read()    f.close()    encodingName = chardet.detect(s)['encoding']    str = "";    if (encodingName.startswith('GB')):        # GBK码,需要转换        try:            gbkContent = s.decode(encodingName)            utf8Content = gbkContent.encode('utf-8')            f = open(filename, 'w')            f.write(utf8Content)            f.close()        except UnicodeDecodeError:            str = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)            # logger.error("%s: %s, %s 转换出错" % (threadName, filename, encodingName))            # logger.error('%s: decoe error %s' % (threadName, UnicodeDecodeError.reason))            pass        str = "%s: %s, %s 转换done" % (threadName, filename, encodingName)    else:        # 已经是UTF-8不需要转换        str = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)    return {"tName": threadName, "encodingName": encodingName,"filename":filename, "result":str}def initLogger():    global logger    # 日志初始化    LOG_FILENAME = 'logging.conf'    logging.config.fileConfig(LOG_FILENAME)    logger = logging.getLogger("GBK2UTF8")    # 测试代码    # logger.debug("debug message")    # logger.info("info message")    # logger.warn("warn message")    # logger.error("error message")    # logger.critical("critical message")def main():    initLogger()    shortargs = 's:d'    longargs = ['src=', 'dest']    try:        opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs)    except getopt.GetoptError, err:        # print help information and exit:        print str(err) # will print something like "option -a not recognized"        # usage()        print "Usage: python gbk2utf8.py -s [file full path]"        return        # sys.exit(2)    srcPath = None    destPath = None    for o, a in opts:        if o in ("-s", "--src"):            srcPath = a        elif o in ("-d", "--dest"):            destPath = a        else:            assert False, "unhandled option"    if (srcPath != None and os.path.exists(srcPath) and os.path.isdir(srcPath)):        doWork(srcPath)def doWork(sPath):    # Make the Pool of workers    global pool    pool = threadpool.ThreadPool(10)    extFilters = ['xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl']    i = 0    arrFiles = []    for root, dirs, files in os.walk(sPath):        for file in files:            # print root            # print file            i = i+1            sFilePath = root + os.sep + file            extension = os.path.splitext(sFilePath)[1][1:]            if (extension in extFilters):                arrFiles.append(sFilePath)            else:                logger.info('Skipping %s' % sFilePath)    print 'waiting...job'    curses.noecho()    curses.cbreak()    requests = threadpool.makeRequests(GBK2UTF8, arrFiles, print_result)    [pool.putRequest(req) for req in requests]    #close the pool and wait for the work to finish    pool.wait()    curses.nocbreak()    curses.echo()    curses.endwin()    print 'end job'def print_result(request, result):    try:        idx = 0        for t in pool.workers:            idx = idx+1            if(t.getName() == result["tName"]):                break        if idx > 0:            y, x = stdscr.getmaxyx()            # stdscr.deleteln()            text = result["result"]            textLen = len(text)            text = text.ljust(x)            stdscr.addstr(idx, 0, text)            stdscr.refresh()            logger.info(text)    except curses.error:        passif __name__ == '__main__':    main()


1 0