Python Natural Language Processing, Part 2: Language Identification with n-grams



There are a great many languages in the world, and many of them resemble one another closely. Telling them apart automatically is a long-standing topic in natural language processing, and building language models from n-grams is a simple, fundamental way to do it. (These days, of course, there are plenty of APIs you can just call directly.) Consider the following example:

malaysian (Malay): Semua manusia dilahirkan bebas
indonesian (Indonesian): Semua orang dilahirkan merdeka
tamil (Tamil): Maitap piiviyiar cakalarum cutantiramkav piakkiaar

Malay, Indonesian, and Tamil (in romanized form) look remarkably alike, so how can we tell the three apart automatically? The approach here is to build a 4-gram language model for each language from an existing labeled corpus.
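To make the 4-gram idea concrete: nltk.util.ngrams slides a window of length n over a sequence, so applying it to a string yields tuples of four characters. A minimal sketch (the sample string and its three-space padding are my own example, matching the padding the script below adds):

from nltk.util import ngrams

text = '   Semua'  # padded with three leading spaces, as in build_LM() below
for gram in ngrams(text, 4):
    print(gram)
# (' ', ' ', ' ', 'S')
# (' ', ' ', 'S', 'e')
# (' ', 'S', 'e', 'm')
# ('S', 'e', 'm', 'u')
# ('e', 'm', 'u', 'a')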

This was a homework assignment, and since it was my first time using Python and NLTK, the code is repetitive and verbose (experts, bear with me). I hope it offers some reference value to friends who are learning NLP and AI.

Without further ado, here is the code.

#!/usr/bin/python
# true division, so that integer counts divide into probabilities
from __future__ import division
import re
import nltk
import sys
import getopt
# import ngrams from nltk to help build ngrams
from nltk.util import ngrams

# the LMs
# LM for indonesian
dict_indonesian = {}
# LM for malaysian
dict_malaysian = {}
# LM for tamil
dict_tamil = {}
# LM to store all the entries, to help smoothing
dict_all = {}

def build_LM(in_file):
    """
    build language models for each label
    each line in in_file contains a label and a sentence separated by a space
    """
    print 'building language models...'
    global dict_indonesian
    global dict_malaysian
    global dict_tamil
    global dict_all
    # used to count the total occurrence of entries
    sum_malay = 0
    sum_indo = 0
    sum_tamil = 0
    # open the input file
    f = file(in_file)
    for line in f:
        # check if the label is indonesian
        if line.startswith('indonesian'):
            # split the label from the line
            dictsrc = line.split(' ', 1)[1]
            # append the beginning and ending padding shown in the lecture notes
            dictsrc = '   ' + dictsrc + '   '
            # build the 4-grams and add them to the dictionary
            for word in ngrams(dictsrc, 4):
                if word in dict_indonesian:
                    dict_indonesian[word] += 1
                else:
                    # first occurrence starts at 2: one observed count
                    # plus one for add-one smoothing
                    dict_indonesian[word] = 2
                # add the new entry into dict_all
                if word not in dict_all:
                    dict_all[word] = 1
        # check if the label is malaysian
        elif line.startswith('malaysian'):
            dictsrc = line.split(' ', 1)[1]
            dictsrc = '   ' + dictsrc + '   '
            for word in ngrams(dictsrc, 4):
                if word in dict_malaysian:
                    dict_malaysian[word] += 1
                else:
                    dict_malaysian[word] = 2
                if word not in dict_all:
                    dict_all[word] = 1
        # check if the label is tamil
        elif line.startswith('tamil'):
            dictsrc = line.split(' ', 1)[1]
            dictsrc = '   ' + dictsrc + '   '
            for word in ngrams(dictsrc, 4):
                if word in dict_tamil:
                    dict_tamil[word] += 1
                else:
                    dict_tamil[word] = 2
                if word not in dict_all:
                    dict_all[word] = 1
    f.close()

    # add-one smoothing: entries seen only in dict_all keep count 1,
    # entries seen for this language keep their own counts
    dict_indonesian = dict(dict_all.items() + dict_indonesian.items())
    dict_malaysian = dict(dict_all.items() + dict_malaysian.items())
    dict_tamil = dict(dict_all.items() + dict_tamil.items())

    # calculate the corresponding probability for each entry in the indonesian LM
    for (k, v) in dict_indonesian.items():
        sum_indo += v
    for (k, v) in dict_indonesian.items():
        dict_indonesian[k] = v / sum_indo
    # calculate the corresponding probability for each entry in the malaysian LM
    for (k, v) in dict_malaysian.items():
        sum_malay += v
    for (k, v) in dict_malaysian.items():
        dict_malaysian[k] = v / sum_malay
    # calculate the corresponding probability for each entry in the tamil LM
    for (k, v) in dict_tamil.items():
        sum_tamil += v
    for (k, v) in dict_tamil.items():
        dict_tamil[k] = v / sum_tamil
    # the models live in the module-level dicts, so nothing needs to be returned

def test_LM(in_file, out_file, LM):
    """
    test the language models on new sentences
    each line of in_file contains a sentence;
    print the most probable label for each sentence into out_file
    """
    print "testing language models..."
    # open the input file and the output file
    infile = file(in_file)
    ofile = file(out_file, 'w')
    for line in infile:
        # set the initial probabilities
        Pindo = 1
        Pmalay = 1
        Ptamil = 1
        # split the string into non-overlapping chunks of length 4
        n = 4
        for i in range(0, len(line), n):
            word = line[i:i+n]
            # at the end of a sentence the chunk may be shorter than 4;
            # pad it with spaces
            if len(word) < 4:
                word = word + ' ' * (4 - len(word))
            # convert the chunk to a tuple, to match the keys built by ngrams()
            word = tuple(word)
            # if the chunk is a known entry, multiply it into each probability
            if word in dict_all:
                Pindo = Pindo * dict_indonesian[word]
                Pmalay = Pmalay * dict_malaysian[word]
                Ptamil = Ptamil * dict_tamil[word]
        if Pindo > Pmalay and Pindo > Ptamil:
            ofile.write('indonesian ' + line)
        elif Pmalay > Pindo and Pmalay > Ptamil:
            ofile.write('malaysian ' + line)
        elif Ptamil > Pindo and Ptamil > Pmalay:
            ofile.write('tamil ' + line)
        else:
            # the sentence belongs to some other language
            ofile.write('other ' + line)
    # close the files
    infile.close()
    ofile.close()

def usage():
    print "usage: " + sys.argv[0] + " -b input-file-for-building-LM -t input-file-for-testing-LM -o output-file"

input_file_b = input_file_t = output_file = None
try:
    opts, args = getopt.getopt(sys.argv[1:], 'b:t:o:')
except getopt.GetoptError:
    usage()
    sys.exit(2)
for o, a in opts:
    if o == '-b':
        input_file_b = a
    elif o == '-t':
        input_file_t = a
    elif o == '-o':
        output_file = a
    else:
        assert False, "unhandled option"
if input_file_b is None or input_file_t is None or output_file is None:
    usage()
    sys.exit(2)
LM = build_LM(input_file_b)
test_LM(input_file_t, output_file, LM)
The comments are in English; I was too lazy to change them back to Chinese, but they are fairly self-explanatory.

The code was written on a Unix system; I won't be sharing the test files here. It consists of two main parts: build_LM() and test_LM().
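Judging from the usage() string at the bottom of the script, it would be run like this (the script name and file names below are placeholders, not part of the original post):

python build_test_LM.py -b input.train.txt -t input.test.txt -o input.predict.txt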

build_LM() builds a language model for each language from the corpus.
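Its smoothing step leans on a Python 2 idiom: items() returns plain lists, so dict(a.items() + b.items()) merges two dicts, with b's values winning on duplicate keys. A toy illustration (string keys used here only for readability; the real models key on character tuples):

dict_all = {'abcd': 1, 'bcde': 1, 'cdef': 1}   # every 4-gram seen anywhere, count 1
dict_indonesian = {'abcd': 3}                  # counts observed for one language

# Python 2 only: right-hand values override left-hand ones
smoothed = dict(dict_all.items() + dict_indonesian.items())
# smoothed == {'abcd': 3, 'bcde': 1, 'cdef': 1}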

test_LM() is the actual application: given arbitrary sentences or a file, it decides which language each line is in.
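One caveat worth noting: multiplying many probabilities smaller than 1 can underflow to 0.0 on long inputs, at which point all three scores tie and the comparison breaks down. A common remedy, not used in the code above, is to sum log probabilities instead. A minimal sketch with a hypothetical helper (log_score is my own name), mirroring the same non-overlapping chunking as test_LM():

import math

def log_score(line, model):
    # sum the log-probabilities of the 4-character chunks found in the model
    score = 0.0
    n = 4
    for i in range(0, len(line), n):
        word = tuple(line[i:i+n].ljust(n))  # pad the last chunk with spaces
        if word in model:
            score += math.log(model[word])
    return score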


