1对n的条件筛选

来源:互联网 发布:19岁网络女主播萱萱 编辑:程序博客网 时间:2024/05/22 10:28

文件的每一行包含 word、pinyin、freq 三个字段,字段之间用制表符分隔;同一个 word 可能出现在多行中,对应多个 pinyin,每个 pinyin 有各自的 freq,例如:

的 de 123

的 di  100

读取该文件,对每个有多个读音(即出现在多行中)的 word,只保留 freq 最高的那一条记录:

import codecs
import os


def gen_single_word_dic(path=None):
    """Group the rows of a tab-separated word/pinyin/freq file by word.

    The file is decoded as UTF-16.  Every non-blank line is split on tab
    characters and the resulting field list is appended to the list stored
    under its first field (the word).

    Args:
        path: File to read.  Defaults to ``data/HZout_NoTone.txt`` located
            next to this script.

    Returns:
        A dict mapping each word to the list of field lists found for it,
        in file order.  Fields are kept exactly as read, so the last field
        of each row still carries the line terminator.
    """
    if path is None:
        this_path = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(this_path, "data", "HZout_NoTone.txt")

    word_pinyin_freq_dic = {}
    # The context manager closes the file even if decoding fails midway.
    with codecs.open(path, mode="rb", encoding="utf-16") as file_obj:
        for line in file_obj:
            # A blank line would otherwise be grouped under the key "\n"
            # and later break the frequency lookup (no third field).
            if not line.strip():
                continue
            fields = line.split("\t")
            word_pinyin_freq_dic.setdefault(fields[0], []).append(fields)
    return word_pinyin_freq_dic


def _freq_value(entry):
    """Return the frequency (third field) of a row as a number.

    Rows with no third field or a non-numeric one rank below every valid
    row instead of aborting the whole run.
    """
    if len(entry) < 3:
        return float("-inf")
    # The field still carries the line terminator; strip before parsing.
    text = entry[2].strip()
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return float("-inf")


def get_high_freq(src_path=None,
                  out_path="data/high_frequence_single_word.txt"):
    """Write the highest-frequency row of every multi-reading word.

    Words that occur on a single line of the input are not written.  For
    each word that occurs on several lines, the row with the numerically
    largest frequency is written to the output file (UTF-16, fields joined
    with tabs, one row per line).  The candidate rows and the chosen row
    are also printed to stdout.

    Args:
        src_path: Input file, passed to ``gen_single_word_dic``; ``None``
            selects that function's default.
        out_path: Output file.  The default is relative to the current
            working directory.
    """
    word_wpf_dic = gen_single_word_dic(src_path)
    with codecs.open(out_path, mode="wb", encoding="utf-16") as file_obj:
        for entries in word_wpf_dic.values():
            if len(entries) <= 1:
                continue
            print(entries)
            # Compare frequencies as numbers: as strings "99" > "123".
            high_freq_word = max(entries, key=_freq_value)
            print(high_freq_word)
            com_str = "\t".join(high_freq_word)
            # The last line of the input may lack a terminator; without
            # this the next record would be glued onto the same line.
            if not com_str.endswith(("\r", "\n")):
                com_str += "\n"
            file_obj.write(com_str)


if __name__ == "__main__":
    get_high_freq()
筛选出的记录会以 UTF-16 编码写入新文件 data/high_frequence_single_word.txt 中。


0 0
原创粉丝点击