文本挖掘--python

来源:互联网 发布:推广数据留言 编辑:程序博客网 时间:2024/06/04 18:11
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 03 11:07:58 2016

@author: liqi

Print basic statistics for a plain-text file: character, line and word
counts plus a ranking of the most frequent words.
"""
from collections import Counter

# Characters that survive normalization: lowercase ASCII letters, the space
# used as the word separator, and the hyphen/apostrophe found inside words.
keep = set("abcdefghijklmnopqrstuvwxyz -'")


def normalize(s):
    """Return ``s`` lowercased and reduced to the characters in ``keep``.

    Whitespace other than a plain space (newline, tab, ...) is turned into a
    space instead of being dropped; otherwise the last word of one line
    would be glued to the first word of the next. All remaining characters
    that are not in ``keep`` (digits, punctuation, non-ASCII) are removed.
    """
    cleaned = []
    for c in s.lower():
        if c in keep:
            cleaned.append(c)
        elif c.isspace():
            # Keep the word boundary that the whitespace represented.
            cleaned.append(' ')
    return ''.join(cleaned)


def make_freq_dict(s):
    """Return a plain dict mapping each normalized word of ``s`` to its count.

    Words are the whitespace-separated tokens of ``normalize(s)``.
    """
    # Convert back to dict so a missing word still raises KeyError for callers.
    return dict(Counter(normalize(s).split()))


def print_file_stats(fname, top_n=20):
    """Print character, line and word counts and the most frequent words.

    Args:
        fname: Path of the text file to analyse.
        top_n: How many of the most frequent words to list (default 20,
            the number of rows this function has always printed).

    Raises:
        IOError/OSError: If ``fname`` cannot be opened or read.
    """
    # The context manager closes the file even if read() fails.
    with open(fname, 'r') as f:
        s = f.read()

    num_chars = len(s)
    num_lines = s.count('\n')
    if s and not s.endswith('\n'):
        # A final line without a trailing newline is still a line.
        num_lines += 1

    d = make_freq_dict(s)
    num_words = sum(d.values())

    # Sort (count, word) pairs in descending order: highest count first,
    # ties broken by reverse alphabetical order of the word.
    ranked = sorted(((count, word) for word, count in d.items()),
                    reverse=True)

    print("The file '%s' has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print("\nThe top %s most frequent words are:" % top_n)
    for rank, (count, word) in enumerate(ranked[:top_n], 1):
        print('%2s. %4s %s' % (rank, count, word))


def main():
    """Print the statistics for ``bill.txt`` in the current directory."""
    print_file_stats('bill.txt')


if __name__ == '__main__':
    main()

输出结果

The file 'bill.txt' has:
 34426 characters
 94 lines
 6215 words

The top 10 most frequant words are:
 1.  320 the
 2.  260 i
 3.  202 and
 4.  183 to
 5.  148 of
 6.  147 a
 7.  131 was
 8.  124 in
 9.   81 my
10.   64 he
11.   61 for
12.   57 had
13.   56 that
14.   51 it
15.   50 with
16.   50 me
17.   48 his
18.   47 on
19.   35 when
20.   35 but
0 0
原创粉丝点击