Python 中 langid 源码的阅读。

来源：互联网发布：华客数据恢复中心编辑：程序博客网时间：2024/06/05 01:17

langid的源码结构

在这里看到功能跟方法

首先我直接进入 class 中的 from_modelstring、from_modelpath 方法

@classmethod
def from_modelstring(cls, string, *args, **kwargs):
    """
    Build an identifier from a serialized model string.

    The string is base64-decoded, bz2-decompressed and then deserialized into
    the five model components, which are reshaped and passed to the
    constructor.

    :param string: base64 encoding of the bz2-compressed, serialized model
    :param args: extra positional arguments forwarded to the constructor
    :param kwargs: extra keyword arguments forwarded to the constructor
    :returns: a new instance of ``cls``
    """
    # NOTE(security): the payload is deserialized with ``loads`` (pickle,
    # according to the article's annotation). Only load model strings that
    # come from a trusted source.
    b = base64.b64decode(string)  # base64 text -> compressed bytes
    z = bz2.decompress(b)  # one-shot bz2 decompression
    model = loads(z)
    nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model

    # nb_ptc arrives as a flat sequence whose length is
    # (number of features) * (number of classes); nb_pc has one entry per
    # class. Floor division keeps the result an exact integer.
    nb_numfeats = len(nb_ptc) // len(nb_pc)

    # reconstruct pc and ptc
    nb_pc = np.array(nb_pc)
    # Resulting shape is (nb_numfeats, len(nb_pc)): one ROW per feature and
    # one COLUMN per class, e.g. 2 features x 3 classes ->
    #   array([[0, 4, 3],
    #          [2, 1, 5]])
    nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc))

    return cls(nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output,
               *args, **kwargs)


@classmethod
def from_modelpath(cls, path, *args, **kwargs):
    """
    Build an identifier from a model file.

    :param path: path of a file holding a model string as accepted by
                 ``from_modelstring``
    :param args: extra positional arguments forwarded to the constructor
    :param kwargs: extra keyword arguments forwarded to the constructor
    :returns: a new instance of ``cls``
    """
    # Read raw bytes: the content goes straight into base64 decoding, so a
    # locale-dependent text decode followed by a re-encode is unnecessary.
    with open(path, 'rb') as f:
        return cls.from_modelstring(f.read(), *args, **kwargs)


def __init__(self, nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output,
             norm_probs=NORM_PROBS):
    """
    Store the model components and select the probability post-processing.

    :param nb_ptc: 2-D array with one row per feature and one column per class
    :param nb_pc: 1-D array with one entry per class
    :param nb_numfeats: number of features (rows of ``nb_ptc``)
    :param nb_classes: sequence of class (language) labels
    :param tk_nextmove: state-transition table indexed by ``(state << 8) + byte``
    :param tk_output: mapping from a state to the feature indices it emits
    :param norm_probs: when truthy, ``self.norm_probs`` renormalizes log-probs
                       into a distribution; otherwise it returns its input
                       unchanged
    """
    self.nb_ptc = nb_ptc
    self.nb_pc = nb_pc
    self.nb_numfeats = nb_numfeats
    self.nb_classes = nb_classes
    self.tk_nextmove = tk_nextmove
    self.tk_output = tk_output

    if norm_probs:
        def norm_probs(pd):
            """
            Renormalize log-probs into a proper distribution (sum 1)

            The technique for dealing with underflow is described in
            http://jblevins.org/log/log-sum-exp
            """
            # Ignore overflow when computing the exponential. Large values
            # in the exp produce a result of inf, which does not affect
            # the correctness of the calculation (as 1/x->0 as x->inf).
            # On Linux this does not actually trigger a warning, but on
            # Windows this causes a RuntimeWarning, so we explicitly
            # suppress it.
            with np.errstate(over='ignore'):
                # Entry [i, j] of the difference matrix is pd[j] - pd[i], so
                # the reciprocal of each row sum is exp(pd[i]) / sum_j exp(pd[j]).
                pd = (1 / np.exp(pd[None, :] - pd[:, None]).sum(1))
            return pd
    else:
        def norm_probs(pd):
            return pd

    self.norm_probs = norm_probs

    # Maintain a reference to the full model, in case we change our language set
    # multiple times.
    self.__full_model = nb_ptc, nb_pc, nb_classes

从作者训练的模型中导入数据，然后看set_languages

    def set_languages(self, langs=None):
        """
        Restrict identification to a subset of languages, or restore all of them.

        :param langs: iterable of language codes to keep; ``None`` restores
                      every language of the originally loaded model
        :raises ValueError: if a requested code is not part of the original model
        """
        logger.debug("restricting languages to: %s", langs)

        # Unpack the full original model. This is needed in case the language set
        # has been previously trimmed, and the new set is not a subset of the
        # current set.
        nb_ptc, nb_pc, nb_classes = self.__full_model

        if langs is None:
            self.nb_classes = nb_classes
            self.nb_ptc = nb_ptc
            self.nb_pc = nb_pc
            return

        # Materialise once so that one-shot iterables (generators, iterators)
        # survive both the validation pass and the mask construction.
        requested = list(langs)
        for lang in requested:
            if lang not in nb_classes:
                raise ValueError("Unknown language code %s" % lang)

        # We were passed a restricted set of languages. Trim the arrays
        # accordingly to speed up processing. A set gives constant-time
        # membership tests while walking the model's classes.
        wanted = set(requested)
        subset_mask = np.fromiter((c in wanted for c in nb_classes), dtype=bool)
        self.nb_classes = [c for c in nb_classes if c in wanted]
        self.nb_ptc = nb_ptc[:, subset_mask]  # keep only the selected class columns
        self.nb_pc = nb_pc[subset_mask]

设置可选的检测语言。
例如：

# set_languages takes ONE collection of language codes, not separate arguments.
langid.set_languages(['zh', 'en'])

再进入instance2fv

    def instance2fv(self, text):
        """
        Map an instance into the feature space of the trained model.

        The text is consumed byte by byte through the transition table
        ``self.tk_nextmove``; every state that is entered is counted, and each
        count is added to the features that ``self.tk_output`` lists for that
        state.

        :param text: text to featurize; a unicode string is encoded as UTF-8
                     first, anything else is iterated as-is
        :returns: ``uint32`` array of length ``self.nb_numfeats``
        """
        if sys.version_info > (3, 0):
            # Python3: iterating over bytes already yields integers
            if isinstance(text, str):
                text = text.encode('utf8')
        else:
            # Python2
            if isinstance(text, unicode):  # noqa: F821 (Python 2 only)
                text = text.encode('utf8')
            # Convert the text to a sequence of byte values, e.g. [125, 54, 74, 65]
            text = map(ord, text)

        arr = np.zeros((self.nb_numfeats,), dtype='uint32')

        # Count the number of times we enter each state. The keys are state
        # ids of the transition table, not language codes.
        state = 0
        statecount = defaultdict(int)  # missing states start at 0
        for letter in text:
            # Each state owns a block of 256 entries, one per possible byte value.
            state = self.tk_nextmove[(state << 8) + letter]
            statecount[state] += 1

        # Update all the productions corresponding to the state
        for visited, count in statecount.items():
            for index in self.tk_output.get(visited, []):
                arr[index] += count

        return arr

阅读全文

0 0