多线程批量转换文件编码, 从GBK, GB2312编码转换到UTF-8编码(Python)
来源:互联网 发布:max 无法录入数据 编辑:程序博客网 时间:2024/04/26 01:59
# coding=utf-8
# author: Jeffrey Ma
# version: 0.1
# build 2
# created on: 2015-03-31
# description:
#   1. Batch-convert file encodings from GBK / GB2312 to UTF-8.
#   2. Converts every file under the given directory, including sub-directories.
#   3. Detects the original encoding; files that are not GB* encoded are left alone.
#   4. Only files with one of the configured extensions are converted.
#   5. Converts with a thread pool and reports progress on the console.
#   6. Shows the state of each pool worker on the console (curses).
#   7. Writes a log (configured through logging.conf).
# usage: python gbk2utf8.py -s [directory path]
# args : absolute path of the directory to convert
# notes: BACK UP THE ORIGINAL FILES FIRST - converted files overwrite the originals.
from __future__ import division
from __future__ import print_function

import sys
import os
import getopt
import logging
import logging.config
try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # Python 3 module name
import threadpool
import threading
from threading import Thread
from multiprocessing.dummy import Pool as ThreadPool
import chardet
import curses
import time
import locale

# Use the user's locale so curses can render non-ASCII text.
locale.setlocale(locale.LC_ALL, "")

# Module-level state shared between doWork() and the print_result() callback.
logger = None   # set by initLogger()
stdscr = None   # curses screen; created in doWork() only while jobs are running
pool = None     # threadpool.ThreadPool; created in doWork()


def GBK2UTF8(filename):
    """Convert one file from a GB* encoding to UTF-8, in place.

    The file is read as bytes and its encoding is guessed with chardet.
    Only encodings whose name starts with 'GB' are converted; the file is
    decoded and re-encoded completely *before* it is reopened for writing,
    so a decode failure never truncates the original.

    Args:
        filename: path of the file to convert.

    Returns:
        dict with keys "tName" (name of the executing thread),
        "encodingName" (chardet's guess, may be None), "filename" and
        "result" (human-readable status message).
    """
    threadName = threading.current_thread().name
    with open(filename, 'rb') as src:
        raw = src.read()
    encodingName = chardet.detect(raw)['encoding']

    if encodingName is None:
        # chardet could not guess an encoding (e.g. empty file); skip instead
        # of crashing on None.startswith.
        message = "%s: %s, 无法检测编码, 跳过" % (threadName, filename)
    elif encodingName.startswith('GB'):
        try:
            utf8Content = raw.decode(encodingName).encode('utf-8')
        except UnicodeDecodeError:
            message = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)
        else:
            # Binary mode: we are writing encoded bytes, and text mode would
            # rewrite line endings on Windows.
            with open(filename, 'wb') as dst:
                dst.write(utf8Content)
            message = "%s: %s, %s 转换done" % (threadName, filename, encodingName)
    else:
        # NOTE(review): this branch is taken for every non-GB* encoding, not
        # only UTF-8; the message text is kept as in the original script.
        message = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)

    return {"tName": threadName, "encodingName": encodingName,
            "filename": filename, "result": message}


def initLogger():
    """Configure logging from 'logging.conf' and bind the module logger."""
    global logger
    LOG_FILENAME = 'logging.conf'
    logging.config.fileConfig(LOG_FILENAME)
    logger = logging.getLogger("GBK2UTF8")


def main():
    """Parse the command line and convert the directory given with -s/--src."""
    initLogger()
    usage = "Usage: python gbk2utf8.py -s [file full path]"
    shortargs = 's:d'
    longargs = ['src=', 'dest']
    try:
        opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs)
    except getopt.GetoptError as err:
        # e.g. "option -a not recognized"
        print(str(err))
        print(usage)
        return

    srcPath = None
    destPath = None  # accepted for compatibility; not used by the conversion
    for o, a in opts:
        if o in ("-s", "--src"):
            srcPath = a
        elif o in ("-d", "--dest"):
            destPath = a
        else:
            raise AssertionError("unhandled option")

    if srcPath is not None and os.path.isdir(srcPath):
        doWork(srcPath)
    else:
        # Previously a missing or invalid source directory exited silently.
        print(usage)


def doWork(sPath):
    """Walk sPath and convert every file with a whitelisted extension.

    Files are queued on a 10-worker thread pool; print_result() is the
    per-job callback. curses is initialised only for the duration of the
    jobs and is always shut down again, even if a job raises.

    Args:
        sPath: directory to walk recursively.
    """
    global pool, stdscr
    pool = threadpool.ThreadPool(10)
    extFilters = ['xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl']

    arrFiles = []
    for root, dirs, files in os.walk(sPath):
        for name in files:
            sFilePath = os.path.join(root, name)
            extension = os.path.splitext(sFilePath)[1][1:]
            if extension in extFilters:
                arrFiles.append(sFilePath)
            else:
                logger.info('Skipping %s' % sFilePath)

    # Plain prints happen outside curses mode so they do not garble the screen.
    print('waiting...job')
    stdscr = curses.initscr()
    try:
        curses.noecho()
        curses.cbreak()
        for req in threadpool.makeRequests(GBK2UTF8, arrFiles, print_result):
            pool.putRequest(req)
        # Block until every queued request has been processed.
        pool.wait()
    finally:
        # Always restore the terminal, whatever happened above.
        curses.nocbreak()
        curses.echo()
        curses.endwin()
        stdscr = None
    print('end job')


def print_result(request, result):
    """threadpool callback: log a job result and show it on its worker's row.

    Row N (1-based) of the screen belongs to the N-th worker in pool.workers.

    Args:
        request: the threadpool work request that finished (unused).
        result: the dict returned by GBK2UTF8().
    """
    text = result["result"]
    # Log first so a drawing failure can never lose the log entry.
    logger.info(text)

    row = 0
    for position, worker in enumerate(pool.workers, 1):
        if worker.name == result["tName"]:
            row = position
            break
    if row == 0 or stdscr is None:
        return

    try:
        height, width = stdscr.getmaxyx()
        # Pad to the full width so a shorter message overwrites the old one.
        stdscr.addstr(row, 0, text.ljust(width))
        stdscr.refresh()
    except curses.error:
        # Best effort only: the row may be off-screen or the text too long.
        pass


if __name__ == '__main__':
    main()
# coding=utf-8
# author: Jeffrey Ma
# version: 0.1
# build 2
# created on: 2015-03-31
# description:
#   1. Batch-convert file encodings from GBK / GB2312 to UTF-8.
#   2. Converts every file under the given directory, including sub-directories.
#   3. Detects the original encoding; files that are not GB* encoded are left alone.
#   4. Only files with one of the configured extensions are converted.
#   5. Converts with a thread pool and reports progress on the console.
#   6. Shows the state of each pool worker on the console (curses).
#   7. Writes a log (configured through logging.conf).
# usage: python gbk2utf8.py -s [directory path]
# args : absolute path of the directory to convert
# notes: BACK UP THE ORIGINAL FILES FIRST - converted files overwrite the originals.
from __future__ import division
from __future__ import print_function

import sys
import os
import getopt
import logging
import logging.config
try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # Python 3 module name
import threadpool
import threading
from threading import Thread
from multiprocessing.dummy import Pool as ThreadPool
import chardet
import curses
import time
import locale

# Use the user's locale so curses can render non-ASCII text.
locale.setlocale(locale.LC_ALL, "")

# Module-level state shared between doWork() and the print_result() callback.
logger = None   # set by initLogger()
stdscr = None   # curses screen; created in doWork() only while jobs are running
pool = None     # threadpool.ThreadPool; created in doWork()


def GBK2UTF8(filename):
    """Convert one file from a GB* encoding to UTF-8, in place.

    The file is read as bytes and its encoding is guessed with chardet.
    Only encodings whose name starts with 'GB' are converted; the file is
    decoded and re-encoded completely *before* it is reopened for writing,
    so a decode failure never truncates the original.

    Args:
        filename: path of the file to convert.

    Returns:
        dict with keys "tName" (name of the executing thread),
        "encodingName" (chardet's guess, may be None), "filename" and
        "result" (human-readable status message).
    """
    threadName = threading.current_thread().name
    with open(filename, 'rb') as src:
        raw = src.read()
    encodingName = chardet.detect(raw)['encoding']

    if encodingName is None:
        # chardet could not guess an encoding (e.g. empty file); skip instead
        # of crashing on None.startswith.
        message = "%s: %s, 无法检测编码, 跳过" % (threadName, filename)
    elif encodingName.startswith('GB'):
        try:
            utf8Content = raw.decode(encodingName).encode('utf-8')
        except UnicodeDecodeError:
            message = "%s: %s, %s 转换出错" % (threadName, filename, encodingName)
        else:
            # Binary mode: we are writing encoded bytes, and text mode would
            # rewrite line endings on Windows.
            with open(filename, 'wb') as dst:
                dst.write(utf8Content)
            message = "%s: %s, %s 转换done" % (threadName, filename, encodingName)
    else:
        # NOTE(review): this branch is taken for every non-GB* encoding, not
        # only UTF-8; the message text is kept as in the original script.
        message = "%s: %s, %s 已经是UTF-8不需要转换" % (threadName, filename, encodingName)

    return {"tName": threadName, "encodingName": encodingName,
            "filename": filename, "result": message}


def initLogger():
    """Configure logging from 'logging.conf' and bind the module logger."""
    global logger
    LOG_FILENAME = 'logging.conf'
    logging.config.fileConfig(LOG_FILENAME)
    logger = logging.getLogger("GBK2UTF8")


def main():
    """Parse the command line and convert the directory given with -s/--src."""
    initLogger()
    usage = "Usage: python gbk2utf8.py -s [file full path]"
    shortargs = 's:d'
    longargs = ['src=', 'dest']
    try:
        opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs)
    except getopt.GetoptError as err:
        # e.g. "option -a not recognized"
        print(str(err))
        print(usage)
        return

    srcPath = None
    destPath = None  # accepted for compatibility; not used by the conversion
    for o, a in opts:
        if o in ("-s", "--src"):
            srcPath = a
        elif o in ("-d", "--dest"):
            destPath = a
        else:
            raise AssertionError("unhandled option")

    if srcPath is not None and os.path.isdir(srcPath):
        doWork(srcPath)
    else:
        # Previously a missing or invalid source directory exited silently.
        print(usage)


def doWork(sPath):
    """Walk sPath and convert every file with a whitelisted extension.

    Files are queued on a 10-worker thread pool; print_result() is the
    per-job callback. curses is initialised only for the duration of the
    jobs and is always shut down again, even if a job raises.

    Args:
        sPath: directory to walk recursively.
    """
    global pool, stdscr
    pool = threadpool.ThreadPool(10)
    extFilters = ['xml', 'java', 'js', 'txt', 'css', 'php', 'html', 'htm', 'tpl']

    arrFiles = []
    for root, dirs, files in os.walk(sPath):
        for name in files:
            sFilePath = os.path.join(root, name)
            extension = os.path.splitext(sFilePath)[1][1:]
            if extension in extFilters:
                arrFiles.append(sFilePath)
            else:
                logger.info('Skipping %s' % sFilePath)

    # Plain prints happen outside curses mode so they do not garble the screen.
    print('waiting...job')
    stdscr = curses.initscr()
    try:
        curses.noecho()
        curses.cbreak()
        for req in threadpool.makeRequests(GBK2UTF8, arrFiles, print_result):
            pool.putRequest(req)
        # Block until every queued request has been processed.
        pool.wait()
    finally:
        # Always restore the terminal, whatever happened above.
        curses.nocbreak()
        curses.echo()
        curses.endwin()
        stdscr = None
    print('end job')


def print_result(request, result):
    """threadpool callback: log a job result and show it on its worker's row.

    Row N (1-based) of the screen belongs to the N-th worker in pool.workers.

    Args:
        request: the threadpool work request that finished (unused).
        result: the dict returned by GBK2UTF8().
    """
    text = result["result"]
    # Log first so a drawing failure can never lose the log entry.
    logger.info(text)

    row = 0
    for position, worker in enumerate(pool.workers, 1):
        if worker.name == result["tName"]:
            row = position
            break
    if row == 0 or stdscr is None:
        return

    try:
        height, width = stdscr.getmaxyx()
        # Pad to the full width so a shorter message overwrites the old one.
        stdscr.addstr(row, 0, text.ljust(width))
        stdscr.refresh()
    except curses.error:
        # Best effort only: the row may be off-screen or the text too long.
        pass


if __name__ == '__main__':
    main()
1 0
- 多线程批量转换文件编码, 从GBK, GB2312编码转换到UTF-8编码(Python)
- linux 批量转换GBK到UTF-8编码的方法
- linux下GBK->UTF-8文件编码批量转换脚本
- linux下GBK->UTF-8文件编码批量转换脚本
- Linux下GBK->UTF-8文件编码批量转换命令
- linux 下 GBK-UTF-8文件编码批量转换脚本
- 批量文件编码转换(GBK/UTF-8/UNICODE etc)
- Linux下GBK文件编码批量转换UTF-8命令
- Linux下批量转换GBK文件到UTF-8编码方法
- linux-利用iconv批量转换GBK文件到UTF-8编码方法
- 编码转换utf-8/gb2312
- Linux批量转换gbk编码文件到utf8编码
- GBK(GB2312)向UTF-8的编码转换
- GBK(GB2312)向UTF-8的编码转换
- GBK(GB2312)向UTF-8的编码转换
- GBK(GB2312)向UTF-8的编码转换 -- C++
- Python的编码机制,unicode, utf-8, utf-16, GBK, GB2312,ISO-8859-1 等编码之间的转换。
- Python的编码机制,unicode, utf-8, utf-16, GBK, GB2312,ISO-8859-1 等编码之间的转换
- Tomcat服务器不能打开http://localhost:8080安装测试页面
- onkeyup事件、onkeydown事件、onblur事件、onchange事件
- Ubuntu下NFS服务器配置
- Sobel算子学习与理解
- Protocol Buffers Developer Guide-API Reference --C++ Generated Code
- 多线程批量转换文件编码, 从GBK, GB2312编码转换到UTF-8编码(Python)
- 这年头什么最赚钱
- Windows环境下批量修改文件名
- libsvm实例子
- JNI/NDK开发指南(一)—— JNI开发流程及HelloWorld
- ubuntu下chatofpomelo项目调试记录
- Activity的启动模式详细介绍
- Hive自定义UDAF详解
- 如何配置SSH(Mac)