python wordcount

来源:互联网 发布:java标签库 编辑:程序博客网 时间:2024/06/16 16:30
  1. 统计给出文件的单词数量,简单版,测试下网页编辑功能
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Count the words in a text file and report them from most to least frequent.

Steps:
1. Read the text file.
2. Count how many times each word occurs.
3. Sort by frequency and print the result.

Usage: run the script with an optional file path as the first argument;
without one, ``DEFAULT_PATH`` is scanned.
"""

import operator
import re
import sys
import time

# Input file used when no path is given on the command line.
DEFAULT_PATH = "/root/code/python/temTestDir/pythonlib.txt"


class WordCount(object):
    """Case-insensitive word-frequency counter for a single text file.

    A word is a maximal run of ASCII letters and hyphens that contains at
    least one letter; every other character acts as a separator.
    """

    def __init__(self, filename):
        """Remember the file to scan and start with an empty tally.

        Args:
            filename: Path of the text file that ``readfile`` will read.
        """
        self.filename = filename
        self.word_dict = {}
        self.split_pattern = re.compile(r"[^A-Za-z-]")

    def readfile(self):
        """Print the file name, then tally the words of every line in it.

        Raises:
            OSError: If the file cannot be opened.
        """
        print(u"文件名:", self.filename)
        # Only ASCII letters and '-' are word characters, so bytes that do
        # not decode can safely be replaced: the replacement character is
        # just one more separator.
        with open(self.filename, 'r', encoding='utf-8',
                  errors='replace') as fd:
            # Iterate lazily instead of loading every line into memory.
            for line in fd:
                self.split_word(line)

    def split_word(self, line):
        """Split one line into words and add each of them to the tally.

        Args:
            line: A line of text; a trailing newline is ignored.
        """
        for token in self.split_pattern.split(line.strip('\n')):
            # Skip the empty strings produced by adjacent separators and
            # tokens made only of hyphens ("-", "--"), which are not words.
            if token.strip('-'):
                self.dict_count(token.lower())

    def dict_count(self, word):
        """Increment the occurrence count of ``word`` by one.

        Args:
            word: The (already lower-cased) word to record.
        """
        self.word_dict[word] = self.word_dict.get(word, 0) + 1

    def get_result_dict(self):
        """Return the dict mapping each word to its occurrence count."""
        return self.word_dict

    def sort_count(self, threshold=1000):
        """Print the tally ordered by descending frequency.

        Prints the number of distinct words, the full list of
        ``(word, count)`` pairs, each word whose count is strictly greater
        than ``threshold``, and finally how many such words there are.

        Args:
            threshold: Only words occurring more often than this are listed
                individually. Defaults to 1000.
        """
        print("单词类型个数", len(self.word_dict))

        # Order the words by how often they occur, most frequent first.
        ranked = sorted(self.word_dict.items(),
                        key=operator.itemgetter(1), reverse=True)
        print(ranked)

        count = 0
        for word, freq in ranked:
            if freq > threshold:
                print(word, freq)
                count += 1
        print("单词频率大于", threshold, "词数为", count)

    def format_print(self):
        """Print the sorted report using the default threshold."""
        self.sort_count()


def main(argv=None):
    """Count the words of one file and print the report and elapsed time.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first entry, if present, is the file to
            scan; otherwise ``DEFAULT_PATH`` is used.
    """
    if argv is None:
        argv = sys.argv[1:]
    path = argv[0] if argv else DEFAULT_PATH

    # Time only the actual work so the printed duration is meaningful.
    start_time = time.time()
    wordcount = WordCount(path)
    wordcount.readfile()
    wordcount.format_print()
    print(time.time() - start_time)


if __name__ == '__main__':
    main()
  1. 测试结果,截取一小部分
    world wordcount 部分结果未截取。好吧,我只是想看看 CSDN 的代码编辑功能。
0 0
原创粉丝点击