Python统计中文单词
来源:互联网 发布:bilibili mac 编辑:程序博客网 时间:2024/06/09 21:58
#coding:UTF-8
import sys
# Raise the interpreter's recursion limit far above its default.
# NOTE(review): a limit this high effectively removes the interpreter's
# recursion guard -- runaway recursion can overflow the C stack and crash the
# process instead of raising an error. Confirm it is still needed.
sys.setrecursionlimit(100000000)
def wordHan(inIo, outIo='wordcountHAN.txt', writing='w'):
    """Write the relative frequency of every Chinese character in the inputs.

    The files named in ``inIo`` are read as UTF-8 and concatenated.  Each
    distinct character in the range U+4E00..U+9FA5 gets one output line of
    the form ``<char>--><frequency>``, where the frequency is the character's
    occurrence count divided by the total number of characters read (Chinese
    or not).  Lines are ordered by ``wordsort`` (highest frequency first).

    Args:
        inIo: iterable of paths to UTF-8 encoded text files.
        outIo: path of the report file to write.
        writing: mode used to open ``outIo`` (e.g. ``'w'`` or ``'a'``).

    Returns:
        None.  The total character count is printed to stdout and the report
        is written to ``outIo`` as UTF-8; when the inputs contain no Chinese
        characters the report is empty.
    """
    import io
    from collections import Counter

    parts = []
    for fg in inIo:
        # The context manager closes each input promptly; io.open decodes
        # UTF-8 the same way on Python 2 and Python 3.
        with io.open(fg, 'r', encoding='utf-8') as src:
            parts.append(src.read())
    s = u''.join(parts)
    total = len(s)
    # Same text as before ("一共 <n> 单词"), spelled so it parses on 2 and 3.
    print("一共 %d 单词" % total)

    # One pass over the text instead of a full s.count() scan per character.
    # 19968..40869 is U+4E00..U+9FA5.
    counts = Counter(ch for ch in s if 19968 <= ord(ch) <= 40869)
    m = [[ch + u"-->", str(n * 1.0 / total)] for ch, n in counts.items()]
    if m:
        # Guarded: with no Chinese characters there is no m[0] to pass.
        m = wordsort(m[0], m, 0)

    # Text mode with an explicit encoding so the characters can actually be
    # written; a 'b' flag would conflict with the encoding argument.
    mode = writing.replace('b', '')
    with io.open(outIo, mode, encoding='utf-8') as w:
        w.writelines(u''.join(entry) + u'\n' for entry in m)
def wordsort(x, m, i):
    """Sort ``m[i:]`` in place by descending frequency and return ``m``.

    Args:
        x: unused; kept so existing ``wordsort(m[i], m, i)`` calls still work.
        m: list of ``[label, frequency]`` entries, where ``frequency`` is
            anything ``float()`` accepts (e.g. a numeric string).
        i: index of the first entry to sort; ``m[:i]`` is left untouched.

    Returns:
        The same list object ``m``, with ``m[i:]`` ordered from highest to
        lowest frequency.  Entries with equal frequency keep their relative
        input order.  An empty list, or ``i`` past the end, is returned
        unchanged.
    """
    # Slice assignment keeps the caller's list object (callers may rely on
    # the in-place update) while the built-in sort replaces the
    # one-stack-frame-per-element recursion.
    m[i:] = sorted(m[i:], key=lambda entry: float(entry[1]), reverse=True)
    return m
if __name__ == '__main__':
    # Script entry point: run the Chinese-character frequency report over the
    # two sample files, writing to wordHan's default output path.
    wordHan(['test1.txt', 'test2.txt'], writing='w')
    # NOTE(review): wordEn is not defined anywhere in the visible source, so
    # this call raises NameError unless it is provided elsewhere -- confirm.
    wordEn('test1.txt', writing='w')
import sys
# Raise the interpreter's recursion limit far above its default.
# NOTE(review): a limit this high effectively removes the interpreter's
# recursion guard -- runaway recursion can overflow the C stack and crash the
# process instead of raising an error. Confirm it is still needed.
sys.setrecursionlimit(100000000)
def wordHan(inIo, outIo='wordcountHAN.txt', writing='w'):
    """Write the relative frequency of every Chinese character in the inputs.

    The files named in ``inIo`` are read as UTF-8 and concatenated.  Each
    distinct character in the range U+4E00..U+9FA5 gets one output line of
    the form ``<char>--><frequency>``, where the frequency is the character's
    occurrence count divided by the total number of characters read (Chinese
    or not).  Lines are ordered by ``wordsort`` (highest frequency first).

    Args:
        inIo: iterable of paths to UTF-8 encoded text files.
        outIo: path of the report file to write.
        writing: mode used to open ``outIo`` (e.g. ``'w'`` or ``'a'``).

    Returns:
        None.  The total character count is printed to stdout and the report
        is written to ``outIo`` as UTF-8; when the inputs contain no Chinese
        characters the report is empty.
    """
    import io
    from collections import Counter

    parts = []
    for fg in inIo:
        # The context manager closes each input promptly; io.open decodes
        # UTF-8 the same way on Python 2 and Python 3.
        with io.open(fg, 'r', encoding='utf-8') as src:
            parts.append(src.read())
    s = u''.join(parts)
    total = len(s)
    # Same text as before ("一共 <n> 单词"), spelled so it parses on 2 and 3.
    print("一共 %d 单词" % total)

    # One pass over the text instead of a full s.count() scan per character.
    # 19968..40869 is U+4E00..U+9FA5.
    counts = Counter(ch for ch in s if 19968 <= ord(ch) <= 40869)
    m = [[ch + u"-->", str(n * 1.0 / total)] for ch, n in counts.items()]
    if m:
        # Guarded: with no Chinese characters there is no m[0] to pass.
        m = wordsort(m[0], m, 0)

    # Text mode with an explicit encoding so the characters can actually be
    # written; a 'b' flag would conflict with the encoding argument.
    mode = writing.replace('b', '')
    with io.open(outIo, mode, encoding='utf-8') as w:
        w.writelines(u''.join(entry) + u'\n' for entry in m)
def wordsort(x, m, i):
    """Sort ``m[i:]`` in place by descending frequency and return ``m``.

    Args:
        x: unused; kept so existing ``wordsort(m[i], m, i)`` calls still work.
        m: list of ``[label, frequency]`` entries, where ``frequency`` is
            anything ``float()`` accepts (e.g. a numeric string).
        i: index of the first entry to sort; ``m[:i]`` is left untouched.

    Returns:
        The same list object ``m``, with ``m[i:]`` ordered from highest to
        lowest frequency.  Entries with equal frequency keep their relative
        input order.  An empty list, or ``i`` past the end, is returned
        unchanged.
    """
    # Slice assignment keeps the caller's list object (callers may rely on
    # the in-place update) while the built-in sort replaces the
    # one-stack-frame-per-element recursion.
    m[i:] = sorted(m[i:], key=lambda entry: float(entry[1]), reverse=True)
    return m
if __name__ == '__main__':
    # Script entry point: run the Chinese-character frequency report over the
    # two sample files, writing to wordHan's default output path.
    wordHan(['test1.txt', 'test2.txt'], writing='w')
    # NOTE(review): wordEn is not defined anywhere in the visible source, so
    # this call raises NameError unless it is provided elsewhere -- confirm.
    wordEn('test1.txt', writing='w')
阅读全文
0 0
- python统计中文单词
- python 统计 英文 单词
- python MapReduce单词统计
- python统计文章单词次数
- python 统计文章单词个数
- python之单词统计(words count)
- python统计文本中单词个数
- Python 统计文章单词出现频率
- python 统计文件中单词数目
- Python--处理文献中单词,统计个数
- python 统计文本单词数-字典排序
- python 文本单词提取和词频统计
- [python每日一练]--0006:单词统计
- python统计文本中单词出现次数
- python统计文件中单词数
- 使用Python统计字符串中单词数量
- python字典用法-统计统计一句单词
- 统计单词
- Centos7 内核从3.10升级到4.12过程
- GWT编译中出现Invalid Character问题的一种解决方式
- 写博客的三个阶段
- EL
- 并查集理解 【这个比较形象】 附:SDUTOJ 数据结构实验:连通分量个数
- python统计中文单词
- Linux基础命令
- Ubuntu 16.04 安装搜狗输入法教程
- 平衡树(splay treap)(普通平衡树,郁闷的出纳员)
- 8皇后问题
- C Primer Plus学习笔记----第四章字符串和格式化输入/输出
- 交叉编译工具链
- lib和dll的区别,生成
- for循环实用实例