python统计文档词频

来源:互联网 发布:淘宝的千人千面 编辑:程序博客网 时间:2024/05/10 02:20

python统计文档中词频的小程序

python版本2.7

程序如下,测试文件和完整程序都在我的 GitHub 中。


def count_space(path):
    """Count the space characters in the text file at *path*.

    Leading and trailing whitespace of every line (including the newline)
    is stripped first, so only spaces between the first and last visible
    character of a line are counted. Consecutive spaces are each counted.

    Only the space count is returned; extend the return value yourself if
    you need further statistics.

    Raises IOError (OSError on Python 3) if the file cannot be opened.
    """
    with open(path, 'r') as f:
        # str.count(' ') equals len(line.split(' ')) - 1 for every string,
        # including the empty one, so no intermediate list is needed.
        return sum(line.strip().count(' ') for line in f)
def count_word(path):
    """Count how often each word occurs in the text file at *path*.

    The text is lower-cased, and double quotes, commas and full stops are
    removed before it is split on whitespace.

    Returns a plain dict mapping each word to its number of occurrences.
    Raises IOError (OSError on Python 3) if the file cannot be opened.
    """
    # Imported locally because the file has no top-level import block.
    import collections
    import re

    with open(path) as fileread:
        alltext = fileread.read().lower()
    # Raw-string character class: removes the same three characters as the
    # original alternation without relying on a "\." escape in a plain string.
    alltext = re.sub(r'[",.]', "", alltext)
    return dict(collections.Counter(alltext.split()))


def sort_by_count(d):
    """Return an OrderedDict of *d*'s items ordered by descending count.

    The sort is stable, so items with equal counts keep the relative order
    in which ``d.items()`` yields them.
    """
    import collections
    import operator

    ranked = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
    return collections.OrderedDict(ranked)


if __name__ == '__main__':
    filename = 'read.txt'
    try:
        dword = sort_by_count(count_word(filename))
        countspace = count_space(filename)
    except IOError:
        print('cannot open file %s for read' % filename)
    else:
        # Printing happens outside the try so an output error is not
        # misreported as a failure to open the input file.
        # Single-argument print(...) emits the same text on Python 2.7 and 3.
        print("space_counts %d" % countspace)
        for key, value in dword.items():
            print(key + ":%d" % value)



1 0
原创粉丝点击