python统计中文单词

来源:互联网 发布:bilibili mac 编辑:程序博客网 时间:2024/06/09 21:58
#coding:UTF-8
import sys   
# NOTE(review): raises the interpreter's recursion limit to 100,000,000.
# This only lifts Python's own depth check; it does not enlarge the C stack,
# so sufficiently deep recursion can still crash the process. Confirm that
# something in this file really needs a limit this high before keeping it.
sys.setrecursionlimit(100000000)
def wordHan(inIo, outIo='wordcountHAN.txt', writing='w'):
    """Write the relative frequency of every Han character in the input files.

    The files are read as UTF-8 and concatenated. Every character whose code
    point lies in 0x4E00..0x9FA5 is counted; its frequency is that count
    divided by the total number of characters read (Han or not). One line per
    distinct character, formatted ``<char>--><frequency>``, is written to
    ``outIo`` as UTF-8, most frequent first. If no Han character is found the
    report is written empty.

    Args:
        inIo: iterable of paths to UTF-8 encoded text files.
        outIo: path of the report file to write.
        writing: text mode used to open the report ('w' or 'a').

    Returns:
        None. The total character count is also printed to stdout.

    Raises:
        IOError/OSError: if a file cannot be opened.
        UnicodeDecodeError: if an input file is not valid UTF-8.
    """
    import io
    from collections import Counter

    parts = []
    for path in inIo:
        # Read bytes and decode explicitly: this behaves the same on Python 2
        # and 3, and the ``with`` block closes every input file promptly.
        with open(path, 'rb') as src:
            parts.append(src.read().decode('utf-8'))
    s = u''.join(parts)
    total = len(s)
    print("一共 %d 单词" % total)

    # One pass over the text instead of one full str.count() scan per
    # distinct character.
    counts = Counter(ch for ch in s if 0x4E00 <= ord(ch) <= 0x9FA5)
    # Most frequent first; ties are broken by code point so the report is
    # deterministic from run to run.
    ranked = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))

    # io.open encodes the unicode lines as UTF-8 on both Python 2 and 3; a
    # plain Python 2 file object would try ASCII and fail on Han characters.
    with io.open(outIo, writing, encoding='utf-8') as out:
        for ch, count in ranked:
            out.write(ch + u"-->" + str(count * 1.0 / total) + u"\n")
    
def wordsort(x, m, i):
    """Sort ``m[i:]`` in place, largest numeric second field first.

    Each element of ``m`` is a sequence whose item at index 1 can be passed
    to ``float``. Elements before index ``i`` are left where they are.
    Elements that compare equal keep their relative input order.

    Args:
        x: unused; kept so existing call sites keep working.
        m: list of ``[label, number_as_text]`` style records; modified in
            place.
        i: index at which sorting starts.

    Returns:
        The same list object ``m``. An empty list, or an ``i`` past the end,
        is returned unchanged.

    Raises:
        ValueError: if a record's second field cannot be converted to float.
    """
    # Slice assignment keeps the caller's list object while the built-in sort
    # does the work iteratively, so the list length is not bounded by the
    # interpreter's recursion limit.
    m[i:] = sorted(m[i:], key=lambda record: float(record[1]), reverse=True)
    return m
 
if __name__ == '__main__':
    # Sample run: the Han-character report over two input files, followed by
    # a wordEn call on the first file.
    # NOTE(review): wordEn is not defined in the visible part of this file;
    # presumably the English counterpart of wordHan -- confirm it exists.
    han_sources = ['test1.txt', 'test2.txt']
    wordHan(han_sources, writing='w')
    wordEn('test1.txt', writing='w')
原创粉丝点击