语料中筛选出英文单词并统计词频，正则切割匹配

来源：互联网　发布：java错误无法加载主类　编辑：程序博客网　时间：2024/06/05 14:18

1.正则表达式的使用：匹配与切割

2.dic.setdefault()的使用

3、内建函数enumerate(sequence,start=0)的使用

4、内建函数sorted()的key、reverse参数设置

5、str.lower()将字符串转换为小写

# coding:utf-8
# Filter the English words out of a mixed Chinese/English corpus and count
# how often each one occurs.  The script doubles as a demonstration of
# regex splitting, dict.setdefault(), enumerate(..., start=1), sorted() with
# key/reverse, and str.lower().
import re
import os
import time
import codecs

PATH = os.path.dirname(__file__)

s = u'what a Beautiful woRld'.lower()
# Split on every run of characters that is not a lowercase ASCII letter.
pattern = re.compile(u'[^a-z]+', re.U)
for con in pattern.split(s.lower()):  # lower-case everything before splitting
    if len(con) <= 1:
        # Skip empty fragments and single letters.
        continue
    else:
        print(con)


def get_english_words(cut_filename=r'E:\SVN\linguistic_model\data\combine_msg_comment.txt',
                      eng_filename=None):
    """Count the English words embedded in a corpus and save them by frequency.

    Every line is lower-cased and split on runs of non ``a-z`` characters;
    fragments of length 0 or 1 are ignored.  The result is written as
    ``word<TAB>count`` lines, most frequent word first.

    Args:
        cut_filename: UTF-8 corpus to scan.  Defaults to the original
            hard-coded corpus location.
        eng_filename: Output path.  Defaults to
            ``english_words_original.txt`` next to this script.
    """
    eng_freq_dic = {}
    pattern = re.compile(u'[^a-z]+', re.U)
    with codecs.open(cut_filename, encoding='utf-8') as f:
        # Iterate the handle directly so the corpus is never fully in memory.
        for line in f:
            for con in pattern.split(line.lower()):
                if len(con) <= 1:  # filter out single letters / empty pieces
                    continue
                # setdefault() stores 0 for an unseen word and returns the
                # current count otherwise; add one and store it back.
                eng_freq_dic[con] = eng_freq_dic.setdefault(con, 0) + 1

    if eng_filename is None:
        eng_filename = os.path.join(PATH, 'english_words_original.txt')
    # Highest frequency first.
    eng_to_write_list = sorted(eng_freq_dic.items(), key=lambda x: x[1], reverse=True)
    # The count is an int, so convert it to text before writing.  The
    # ``with`` block guarantees the file is flushed and closed.
    with codecs.open(eng_filename, mode='wb', encoding='utf-8') as out:
        out.writelines([word + '\t' + str(count) + '\n'
                        for word, count in eng_to_write_list])


def chose_top_n(top_n=2000, delay=1, filename=None, top_filename=None):
    """Copy the first ``top_n`` lines of the frequency file to a new file.

    Each selected line is echoed with its 1-based rank.  The output file is
    always written, even when the input has fewer than ``top_n`` lines.

    Args:
        top_n: Number of leading lines to keep (default 2000).
        delay: Seconds to pause after echoing each line (default 1, the
            original pacing).  Pass 0 to disable the pause.
        filename: Input path.  Defaults to ``english_words_original.txt``
            next to this script.
        top_filename: Output path.  Defaults to
            ``top_<top_n>_english_words.txt`` next to this script.
    """
    if filename is None:
        filename = os.path.join(PATH, 'english_words_original.txt')
    if top_filename is None:
        top_filename = os.path.join(PATH, 'top_%d_english_words.txt' % top_n)

    line_list = []
    with codecs.open(filename, encoding='utf-8') as f:
        # enumerate(sequence, start=1) yields (position, element) pairs with
        # positions counted from 1 instead of the default 0.
        for index, line in enumerate(f, start=1):
            if index > top_n:
                break
            print('%d %s' % (index, line.strip()))
            if delay:
                time.sleep(delay)
            line_list.append(line)

    # Written after the loop so a short input still produces a result.
    with codecs.open(top_filename, mode='wb', encoding='utf-8') as out:
        out.writelines(line_list)

0 0