拼音切分

来源:互联网 发布:域名注册后 编辑:程序博客网 时间:2024/04/25 16:54

我之前贴了一个拼音列表,然后给出了一个Trie树的实现。也许能猜出来,对了,我要做一个拼音切词。通常url中的拼音是没有字符分割的比如说guojibaodao(国际报道)。如果我们想用一用url中的这部分信息,我们可以选择做一下切分,然后把它映射成汉字。

和上次一样,我贴出来代码。实现还是很一目了然的,就别废话了。这个实现只给出了一种可能的切分结果。如果需要返回所有的情况,需要自己修改一下。运行时需要一个pinyin_trie的文件。这个就自己生成吧。


#!/usr/bin/env python
"""Pinyin tokenizer.

Segments an unseparated pinyin string (e.g. the path component of a URL,
such as 'guojibaodao') into individual pinyin syllables using a prefix
trie loaded from a pickled 'pinyin_trie' file. Only one possible
segmentation is returned.
"""

import pickle
import sys


class TrieNode(object):
    """One trie node; `value` holds the complete key if a key ends here."""

    def __init__(self):
        self.value = None     # full key terminating at this node, else None
        self.children = {}    # maps a single character -> child TrieNode


class Trie(object):
    """A prefix tree over pinyin syllables."""

    def __init__(self):
        self.root = TrieNode()

    def add(self, key):
        """Insert `key` into the trie, creating nodes as needed."""
        node = self.root
        for char in key:
            if char not in node.children:
                child = TrieNode()
                node.children[char] = child
                node = child
            else:
                node = node.children[char]
        node.value = key

    def search(self, key):
        """Return every stored key that is a prefix of `key`, shortest first.

        Walks down from the root one character at a time, collecting each
        node that terminates a stored key, and stops at the first mismatch.
        """
        node = self.root
        matches = []
        for char in key:
            if char not in node.children:
                break
            node = node.children[char]
            if node.value:
                matches.append(node.value)
        return matches


class ScanPos(object):
    """A position reached during the scan; `parent` links record the path."""

    def __init__(self, pos, token=None, parent=None):
        self.pos = pos        # index into the input string
        self.token = token    # syllable consumed to arrive at `pos`
        self.parent = parent  # previous ScanPos in the segmentation chain


class PinyinTokenizer(object):
    """Segments pinyin strings using a trie unpickled from 'pinyin_trie'."""

    def __init__(self):
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; only load a 'pinyin_trie' file you generated yourself.
        # Binary mode ('rb') is required for pickles on Python 3 and is
        # safe on Python 2 as well.
        with open('pinyin_trie', 'rb') as f:
            self.trie = pickle.load(f)

    def tokenize(self, content):
        """Return one segmentation of `content` as a list of syllables.

        Performs a depth-first search over candidate split points; returns
        [] when `content` is empty or no complete segmentation exists.
        """
        total_length = len(content)
        tokens = []
        candidate_pos = [ScanPos(0)]
        last_pos = None
        while candidate_pos:
            p = candidate_pos.pop()
            if p.pos == total_length:
                # Reached the end of the input: a full segmentation exists.
                last_pos = p
                break
            for m in self.trie.search(content[p.pos:]):
                candidate_pos.append(ScanPos(len(m) + p.pos, m, p))
        # Walk the parent chain backwards to recover the token sequence.
        pos = last_pos
        while pos:
            if pos.parent:
                tokens.insert(0, pos.token)
            pos = pos.parent
        return tokens


if __name__ == '__main__':
    tokenizer = PinyinTokenizer()
    # print() with a single argument behaves identically on Python 2 and 3.
    print(tokenizer.tokenize('woaibeijingtiananmentiananmenshangtaiyangsheng'))