文件统计

来源:互联网 发布:通达信指标公式源码 编辑:程序博客网 时间:2024/04/30 16:46

统计文件的字符数、行数、单词数,并列出出现频率最高的 10 个单词。


'''Report simple statistics about a text file.

Counts the characters, lines and words of a file and lists its ten most
frequent words.

Created on 2014/09/02
@author: wangz
'''
from collections import Counter

# Characters that survive normalization: lowercase ASCII letters, the space,
# the hyphen and the apostrophe. Everything else is discarded.
keep = list("abcdefghijklmnopqrstuvwxyz -'")

# Set copy of ``keep`` so the per-character membership test is O(1).
_KEEP_SET = frozenset(keep)


def normalize(s):
    '''Return s lowercased, with every character not in ``keep`` removed.

    Note that newlines and tabs are not in ``keep``, so they are removed
    rather than turned into spaces.
    '''
    return ''.join(c for c in s.lower() if c in _KEEP_SET)


def normalize2(s):
    '''Return the same result as normalize(s); retained as an alias.'''
    return normalize(s)


def make_freq_dict(s):
    '''Return a dictionary mapping each normalized word of s to its count.

    Words are the whitespace-separated tokens of s, each passed through
    normalize(). Tokens that normalize to the empty string (for example
    pure numbers or punctuation) are not counted.
    '''
    # Split on whitespace BEFORE normalizing: normalize() drops newlines and
    # tabs, so normalizing the whole text first would glue the last word of
    # one line onto the first word of the next.
    normalized = (normalize(token) for token in s.split())
    return dict(Counter(word for word in normalized if word))


def print_file_stats(fname):
    '''Print character, line and word counts and the top 10 words of a file.

    fname is the path of the text file to read. The line count is the
    number of newline characters, so a final line without a trailing
    newline is not counted. Errors from opening or reading the file
    propagate to the caller.
    '''
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()

    num_chars = len(s)
    num_lines = s.count('\n')

    d = make_freq_dict(s)
    num_words = sum(d.values())

    # Sort (count, word) pairs in descending order: highest count first,
    # ties broken by reverse alphabetical order of the word.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)

    print("The file '%s' has: " % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print("\nThe top 10 most frequent word are:")
    for i, (count, word) in enumerate(lst[:10], 1):
        print('%2s. %4s %s' % (i, count, word))


def main():
    '''Prompt for a file name on stdin and print that file's statistics.'''
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3 renamed raw_input to input
    inputfile = read_line('input a file:')
    print_file_stats(inputfile)


# Only prompt when run as a script, so importing this module has no side
# effects.
if __name__ == '__main__':
    main()


0 0