python实现指定目录下批量文件的单词计数:串行版本

来源:互联网 发布:剪力弯矩图软件 编辑:程序博客网 时间:2024/05/22 07:06


       直接上代码。

 

       练习目标:

             1.  使用 Python 面向对象的方法封装逻辑和表达 ;

             2.  使用异常处理和日志API ;

             3.  使用文件目录读写API ; 

             4.  使用 list, dict(即其他语言中的 map), tuple 三种数据结构 ;

             5.  lambda 、正则使用及其它。


       下一篇将实现并发版本。  

       

#-------------------------------------------------------------------------------
# Name:        wordstat_serial.py
# Purpose:     count word occurrences in the files of a given directory,
#              serially (one file after another)
#
# Author:      qin.shuq
#
# Created:     08/10/2014
# Copyright:   (c) qin.shuq 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import collections
import logging
import os
import re
import sys
import time

LOG_LEVELS = {
    'DEBUG': logging.DEBUG, 'INFO': logging.INFO,
    'WARN': logging.WARNING, 'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL
}

# Directory scanned when no directory is given on the command line.
DEFAULT_DIRPATH = "c:\\Users\\qin.shuq\\Desktop\\region_master\\src"


def initlog(filename):
    """Return a logger that writes INFO-and-above records to *filename*.

    Each file name gets its own named logger. Using the root logger for
    every call would attach all file handlers to one shared logger, so
    every record would be duplicated into every log file.

    Args:
        filename: path of the log file to append to.

    Returns:
        The ``logging.Logger`` dedicated to *filename*.
    """
    logger = logging.getLogger('wordstat.' + filename)
    # Logger names containing dots form a hierarchy; disabling propagation
    # guarantees records reach only this logger's own file.
    logger.propagate = False
    # Calling initlog() twice for the same file must not add a second
    # handler, otherwise each record would be written twice.
    if not logger.handlers:
        hdlr = logging.FileHandler(filename)
        hdlr.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(hdlr)
    logger.setLevel(LOG_LEVELS['INFO'])
    return logger


errlog = initlog("error.log")
infolog = initlog("info.log")


class WordReading(object):
    """Read the lines of every file in a list of file paths."""

    def __init__(self, fileList):
        self.fileList = fileList

    def readFileInternal(self, filename):
        """Return the lines of *filename*, or ``[]`` if it cannot be read.

        Read failures are logged to ``errlog`` and do not raise, so one bad
        file does not abort the whole run.
        """
        try:
            # 'with' closes the handle even when readlines() fails.
            with open(filename, 'r') as f:
                lines = f.readlines()
        except (IOError, UnicodeDecodeError) as err:
            # Report the real cause: the failure is not always "not found"
            # (it may be a permission or decoding problem).
            errlog.error('failed to read file %s: %s \n', filename, err)
            return []
        infolog.info('[successful read file %s]\n', filename)
        return lines

    def readFile(self):
        """Return the lines of all files in ``fileList``, in list order."""
        allLines = []
        for filename in self.fileList:
            allLines.extend(self.readFileInternal(filename))
        return allLines


class WordAnalyzing(object):
    '''
     return Map<Word, count>  the occurrence times of each word
    '''
    wordRegex = re.compile(r"[\w]+")

    def __init__(self, allLines):
        self.allLines = allLines

    def analyze(self):
        """Return a dict mapping each word to its number of occurrences.

        Lines are scanned one at a time rather than concatenated first:
        concatenation would glue the last word of a file lacking a trailing
        newline onto the first word of the following file.
        """
        counts = collections.Counter()
        findWords = WordAnalyzing.wordRegex.findall
        for line in self.allLines:
            counts.update(findWords(line))
        # Callers receive a plain dict, as before.
        return dict(counts)


class FileObtainer(object):
    """Collect the paths of the files under a directory tree."""

    def __init__(self, dirpath, fileFilterFunc=None):
        self.dirpath = dirpath
        self.fileFilterFunc = fileFilterFunc

    def findAllFilesInDir(self):
        """Return a list of file paths found by walking ``dirpath``.

        When ``fileFilterFunc`` is set, only the paths for which it returns
        a true value are kept.
        """
        files = []
        for path, _dirs, filenames in os.walk(self.dirpath):
            for filename in filenames:
                files.append(os.path.join(path, filename))
        if self.fileFilterFunc is None:
            return files
        # Build a real list: filter() is lazy on Python 3.
        return [f for f in files if self.fileFilterFunc(f)]


class PostProcessing(object):
    """Sort and print a word -> count mapping."""

    def __init__(self, resultMap):
        self.resultMap = resultMap

    def sortByValue(self):
        """Return ``(word, count)`` pairs sorted by count, highest first."""
        return sorted(self.resultMap.items(), key=lambda e: e[1], reverse=True)

    def obtainTopN(self, topN):
        """Print the *topN* most frequent words, one per line.

        Fewer lines are printed when there are fewer than *topN* words;
        nothing is printed for a non-positive *topN*.
        """
        # max() keeps a negative topN from turning into a "drop from the
        # end" slice.
        for word, count in self.sortByValue()[:max(topN, 0)]:
            print('%s  counts:  %s' % (word, count))


def _reportCost(label, starttime):
    """Print the milliseconds elapsed since *starttime* for stage *label*."""
    print('%s cost:  %s ms' % (label, (time.time() - starttime) * 1000))


def main(argv=None):
    """Run the four stages (find, read, analyze, report) and time each one.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, is the
            directory to scan; otherwise ``DEFAULT_DIRPATH`` is used.
    """
    argv = sys.argv[1:] if argv is None else argv
    dirpath = argv[0] if argv else DEFAULT_DIRPATH

    starttime = time.time()
    fileObtainer = FileObtainer(dirpath, lambda f: f.endswith('.java'))
    fileList = fileObtainer.findAllFilesInDir()
    _reportCost('ObtainFile', starttime)

    starttime = time.time()
    wr = WordReading(fileList)
    allLines = wr.readFile()
    _reportCost('WordReading', starttime)

    starttime = time.time()
    wa = WordAnalyzing(allLines)
    resultMap = wa.analyze()
    _reportCost('WordAnalyzing', starttime)

    starttime = time.time()
    postproc = PostProcessing(resultMap)
    postproc.obtainTopN(30)
    _reportCost('PostProcessing', starttime)


if __name__ == "__main__":
    main()


0 0