Python 正向最大匹配分词和逆向最大匹配分词

来源：互联网发布：av淘宝avtaobao.me在线编辑：程序博客网时间：2024/05/22 14:04

正向最大匹配

# -*- coding:utf-8 -*-
"""Forward maximum-matching (FMM) word segmentation demo.

The module runs unchanged on Python 2.7 and Python 3.
"""
import io
import logging

CODEC = 'utf-8'

logger = logging.getLogger(__name__)

try:
    _text_type = unicode  # noqa: F821 - Python 2 text type
except NameError:
    _text_type = str  # Python 3: every ``str`` is already unicode text


def u(s, encoding):
    """Return *s* as unicode text.

    Args:
        s: A text string, or a byte string to be decoded.
        encoding: Codec name used to decode *s* when it is not already text.

    Returns:
        *s* itself if it is already text, otherwise *s* decoded with *encoding*.
    """
    if isinstance(s, _text_type):
        return s
    return _text_type(s, encoding)


def fwd_mm_seg(wordDict, maxLen, str):  # noqa: A002 - name kept for callers
    """Segment *str* with the forward maximum-matching algorithm.

    Scanning from the left, the longest prefix of at most *maxLen* characters
    that is contained in *wordDict* is emitted as a word. When no prefix of two
    or more characters matches, the single leading character is emitted.

    Args:
        wordDict: Container of known words; only ``in`` membership is used.
        maxLen: Maximum candidate word length, in characters (must be >= 1).
        str: The text to segment.

    Returns:
        The list of segments in left-to-right order; joining them reproduces
        the input. An empty input yields an empty list.

    Raises:
        ValueError: If *maxLen* is smaller than 1 (the scan could never
            advance and would loop forever).
    """
    if maxLen < 1:
        raise ValueError('maxLen must be >= 1, got %r' % (maxLen,))

    wordList = []
    total = len(str)
    pos = 0
    # Walk an index over the input instead of re-slicing the remainder on
    # every step, which would copy the tail of the string each time.
    while pos < total:
        wordLen = min(maxLen, total - pos)
        # Shrink the candidate from the right until it is a known word or
        # only one character is left (single characters are always accepted).
        while wordLen > 1 and str[pos:pos + wordLen] not in wordDict:
            wordLen -= 1
        word = str[pos:pos + wordLen]
        logger.debug('segment: %r', word)
        wordList.append(word)
        pos += wordLen
    return wordList


def main():
    """Load ``words.dic`` (one word per line) and segment a sample sentence."""
    # io.open decodes while reading on both Python 2 and 3, and the ``with``
    # block guarantees the file is closed.
    with io.open('words.dic', encoding=CODEC) as fp_dict:
        stripped = (eachWord.strip() for eachWord in fp_dict)
        wordDict = {word for word in stripped if word}  # skip blank lines
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print(u"==".join(wordList))


if __name__ == '__main__':
    main()

逆向最大匹配

# -*- coding:utf-8 -*-
"""Backward maximum-matching (BMM) word segmentation demo.

The module runs unchanged on Python 2.7 and Python 3.
"""
import io
import logging

CODEC = 'utf-8'

logger = logging.getLogger(__name__)

try:
    _text_type = unicode  # noqa: F821 - Python 2 text type
except NameError:
    _text_type = str  # Python 3: every ``str`` is already unicode text


def u(s, encoding):
    """Return *s* as unicode text.

    Args:
        s: A text string, or a byte string to be decoded.
        encoding: Codec name used to decode *s* when it is not already text.

    Returns:
        *s* itself if it is already text, otherwise *s* decoded with *encoding*.
    """
    if isinstance(s, _text_type):
        return s
    return _text_type(s, encoding)


def bwd_mm_seg(wordDict, maxLen, str):  # noqa: A002 - name kept for callers
    """Segment *str* with the backward maximum-matching algorithm.

    Scanning from the right, the longest suffix of at most *maxLen* characters
    that is contained in *wordDict* is emitted as a word. When no suffix of two
    or more characters matches, the single trailing character is emitted.

    Args:
        wordDict: Container of known words; only ``in`` membership is used.
        maxLen: Maximum candidate word length, in characters (must be >= 1).
        str: The text to segment.

    Returns:
        The list of segments in left-to-right reading order; joining them
        reproduces the input. An empty input yields an empty list.

    Raises:
        ValueError: If *maxLen* is smaller than 1 (the scan could never
            advance and would loop forever).
    """
    if maxLen < 1:
        raise ValueError('maxLen must be >= 1, got %r' % (maxLen,))

    wordList = []
    end = len(str)
    # ``end`` is the exclusive right edge of the text that is still
    # unsegmented; moving an index avoids re-slicing the string every step.
    while end > 0:
        wordLen = min(maxLen, end)
        # Shrink the candidate from the left until it is a known word or
        # only one character is left (single characters are always accepted).
        while wordLen > 1 and str[end - wordLen:end] not in wordDict:
            wordLen -= 1
        word = str[end - wordLen:end]
        logger.debug('segment: %r', word)
        wordList.append(word)
        end -= wordLen
    # Segments were collected right-to-left; restore reading order.
    wordList.reverse()
    return wordList


def main():
    """Load ``words.dic`` (one word per line) and segment a sample sentence."""
    # io.open decodes while reading on both Python 2 and 3, and the ``with``
    # block guarantees the file is closed.
    with io.open('words.dic', encoding=CODEC) as fp_dict:
        stripped = (eachWord.strip() for eachWord in fp_dict)
        wordDict = {word for word in stripped if word}  # skip blank lines
    # Plain u'' literal: the ur'' prefix is a syntax error on Python 3 and the
    # text contains no backslashes, so the value is identical.
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print(u"==".join(wordList))


if __name__ == '__main__':
    main()

阅读全文

0 0