python统计汉字词频

来源:互联网 发布:阿里云个人邮箱登录 编辑:程序博客网 时间:2024/05/22 13:11

# -*- coding: utf-8 -*-
"""Count how often each Chinese (Han) character occurs in a UTF-8 text file.

Created on Tue Apr 22 17:25:47 2014
@author: lifeix

The script reads SOURCE_PATH, writes one ``character,count`` line per distinct
Han character to RESULT_PATH (most frequent first), and prints the same
entries to stdout in rows of ENTRIES_PER_ROW.

The text is decoded once and matched per code point.  An earlier version cut
runs of non-ASCII *bytes* into 3-byte chunks, which counted full-width
punctuation as characters and produced garbage whenever a 2- or 4-byte UTF-8
sequence shifted the chunk boundaries.

Runs unchanged on Python 2.7 and Python 3.
"""
import io
import re
import sys
from collections import Counter

SOURCE_PATH = '/home/lifeix/xiaoshuo1.txt'
RESULT_PATH = '/home/lifeix/result.txt'
ENTRIES_PER_ROW = 10

# CJK Unified Ideographs Extension A (U+3400-U+4DBF) and the basic block
# (U+4E00-U+9FFF).  Supplementary-plane extensions are intentionally left out
# so the pattern also works on narrow Python 2 builds.
_HAN_CHAR = re.compile(u'[\u3400-\u4dbf\u4e00-\u9fff]')


def count_han_chars(text):
    """Return a Counter mapping each Han character in *text* to its count.

    *text* must be a unicode string; everything that is not a Han ideograph
    (ASCII, punctuation, other scripts) is ignored.
    """
    return Counter(_HAN_CHAR.findall(text))


def rank_by_frequency(counts):
    """Return ``(character, count)`` pairs ordered from most to least frequent."""
    return counts.most_common()


def format_report(entries, per_row=ENTRIES_PER_ROW):
    """Lay out *entries* as text: *per_row* ``char,count`` cells per row.

    Cells are separated by a space and rows by a blank line.  Returns an
    empty string when there are no entries.
    """
    cells = [u'%s,%d' % (char, count) for char, count in entries]
    rows = [u' '.join(cells[start:start + per_row])
            for start in range(0, len(cells), per_row)]
    if not rows:
        return u''
    return u'\n\n'.join(rows) + u'\n'


def _write_stdout(text):
    """Write unicode *text* to stdout on both Python 2 and Python 3."""
    if sys.version_info[0] < 3:
        # Python 2's stdout takes bytes; encode explicitly instead of relying
        # on the sys.setdefaultencoding() hack.
        text = text.encode('utf-8')
    sys.stdout.write(text)


def main(source_path=SOURCE_PATH, result_path=RESULT_PATH):
    """Count the characters in *source_path*, save and print the ranking."""
    # errors='replace' keeps the script tolerant of stray invalid bytes: they
    # decode to U+FFFD, which is outside the Han ranges and so never counted.
    with io.open(source_path, 'r', encoding='utf-8', errors='replace') as src:
        counts = count_han_chars(src.read())

    entries = rank_by_frequency(counts)

    with io.open(result_path, 'w', encoding='utf-8') as out:
        out.writelines(u'%s,%d\n' % (char, count) for char, count in entries)

    _write_stdout(format_report(entries))


if __name__ == '__main__':
    main()


0 0