python 中langid的阅读。
来源:互联网 发布:华客数据恢复中心 编辑:程序博客网 时间:2024/06/05 01:17
langid的源码结构
首先直接看类中的 from_modelstring、from_modelpath 以及 __init__ 方法:
@classmethod
def from_modelstring(cls, string, *args, **kwargs):
    """
    Build an identifier from a packed model string.

    The string is base64 text wrapping a bz2-compressed pickle of the
    5-tuple (nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output).
    Extra positional and keyword arguments are forwarded unchanged to
    the constructor. Returns a new instance of cls.
    """
    # Undo the packing in reverse order: base64 -> bz2 -> pickle.
    b = base64.b64decode(string)
    z = bz2.decompress(b)
    # NOTE(security): unpickling can execute arbitrary code. Only load
    # model strings that come from a trusted source.
    model = loads(z)
    nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model

    # nb_ptc is stored flat; dividing its length by the number of entries
    # in nb_pc recovers the number of features.
    nb_numfeats = len(nb_ptc) // len(nb_pc)

    # Reconstruct pc and ptc as arrays. After the reshape, nb_ptc has
    # nb_numfeats rows and len(nb_pc) columns.
    nb_pc = np.array(nb_pc)
    nb_ptc = np.array(nb_ptc).reshape(nb_numfeats, len(nb_pc))

    return cls(nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output,
               *args, **kwargs)


@classmethod
def from_modelpath(cls, path, *args, **kwargs):
    """
    Build an identifier from a model file at the given path.

    The file content is handed to from_modelstring; extra arguments are
    forwarded unchanged.
    """
    # Read raw bytes: b64decode accepts them directly, so there is no need
    # for a locale-dependent text decode followed by a re-encode.
    with open(path, 'rb') as f:
        return cls.from_modelstring(f.read(), *args, **kwargs)


def __init__(self, nb_ptc, nb_pc, nb_numfeats, nb_classes, tk_nextmove, tk_output,
             norm_probs=NORM_PROBS):
    """
    Store the model components and select the probability normalizer.

    When norm_probs is truthy, self.norm_probs renormalizes log-probs into
    a distribution; otherwise it returns its input unchanged.
    """
    self.nb_ptc = nb_ptc
    self.nb_pc = nb_pc
    self.nb_numfeats = nb_numfeats
    self.nb_classes = nb_classes
    self.tk_nextmove = tk_nextmove
    self.tk_output = tk_output

    if norm_probs:
        def normalize(pd):
            """
            Renormalize log-probs into a proper distribution (sum 1)
            The technique for dealing with underflow is described in
            http://jblevins.org/log/log-sum-exp
            """
            # Ignore overflow when computing the exponential. Large values
            # in the exp produce a result of inf, which does not affect
            # the correctness of the calculation (as 1/x->0 as x->inf).
            # On Linux this does not actually trigger a warning, but on
            # Windows this causes a RuntimeWarning, so we explicitly
            # suppress it.
            with np.errstate(over='ignore'):
                pd = (1 / np.exp(pd[None, :] - pd[:, None]).sum(1))
            return pd
    else:
        def normalize(pd):
            return pd

    self.norm_probs = normalize

    # Maintain a reference to the full model, in case we change our language set
    # multiple times.
    self.__full_model = nb_ptc, nb_pc, nb_classes
从作者训练的模型中导入数据,然后看set_languages
def set_languages(self, langs=None):
    """
    Restrict identification to a subset of languages, or restore all.

    langs is an iterable of language codes to keep. Passing None restores
    every class of the full model. Raises ValueError if any requested code
    is not one of the full model's classes.
    """
    logger.debug("restricting languages to: %s", langs)

    # Unpack the full original model. This is needed in case the language set
    # has been previously trimmed, and the new set is not a subset of the current
    # set.
    nb_ptc, nb_pc, nb_classes = self.__full_model

    if langs is None:
        self.nb_classes = nb_classes
        self.nb_ptc = nb_ptc
        self.nb_pc = nb_pc
        return

    # Materialize once: langs is scanned several times below, and a one-shot
    # iterable (e.g. a generator) would be exhausted by the validation loop,
    # silently yielding an empty language set.
    langs = list(langs)
    for lang in langs:
        if lang not in nb_classes:
            raise ValueError("Unknown language code %s" % lang)

    # We were passed a restricted set of languages. Trim the arrays accordingly
    # to speed up processing. A set gives O(1) membership for the mask.
    wanted = frozenset(langs)
    subset_mask = np.fromiter((c in wanted for c in nb_classes), dtype=bool)
    self.nb_classes = [c for c in nb_classes if c in wanted]
    self.nb_ptc = nb_ptc[:, subset_mask]
    self.nb_pc = nb_pc[subset_mask]
该方法用于限定语言检测时可选的语言集合。
示例:
# set_languages takes one collection of language codes, not separate arguments.
langid.set_languages(['zh', 'en'])
再进入instance2fv
def instance2fv(self, text):
    """
    Map an instance into the feature space of the trained model.

    text may be a text string (encoded to UTF-8 here) or a byte string.
    Returns a uint32 array of length self.nb_numfeats holding, for each
    feature index, the summed visit counts of the states that emit it.
    """
    if sys.version_info > (3, 0):
        # Python 3: iterating bytes already yields integer byte values.
        if isinstance(text, str):
            text = text.encode('utf8')
    else:
        # Python 2
        if isinstance(text, unicode):
            text = text.encode('utf8')
        # Python 2 byte strings iterate as 1-char strings, so convert them
        # to integer byte values. This must stay in the Python 2 branch:
        # on Python 3, ord() of an int raises TypeError.
        text = map(ord, text)

    arr = np.zeros((self.nb_numfeats,), dtype='uint32')

    # Count the number of times we enter each state. Keys are state ids
    # of the transition table, values are visit counts.
    state = 0
    statecount = defaultdict(int)
    for letter in text:
        # The table is indexed by (current state, byte value) packed as
        # state * 256 + byte.
        state = self.tk_nextmove[(state << 8) + letter]
        statecount[state] += 1

    # Update all the productions corresponding to the state
    for visited, count in statecount.items():
        for index in self.tk_output.get(visited, []):
            arr[index] += count

    return arr
阅读全文
0 0
- python 中langid的阅读。
- LangID
- 如何利用python中的langid,对文本语种进行分类
- 理解CCSID,LANGID对系统内码的影响
- 【Python】Python文本处理中用langid工具包来对文本进行语言检测与判别
- 看<Headfirst Python>,学习过程中阅读的博客文
- Python--阅读优秀的代码
- 语言识别开源项目langid.py的原理简简简述
- python中BaseHTTPServer.py代码阅读分析
- source insight中阅读python源代码
- source insight中阅读python源代码
- source insight中阅读python源代码
- python中BaseHTTPServer.py代码阅读分析
- 在Source Insight 中阅读python
- 阅读中涉及到的1
- 提高PYTHON编程的能力---阅读源代码
- Python tutorial 3.4 documentation的阅读笔记
- 阅读学习的第一个python程序
- 缓冲流进行复制文件
- Codeforces Round #426 (Div. 2) D. The Bakery(DP+线段树)
- LOJ 「网络流 24 题」太空飞行计划(最大权闭合子图)
- 群体智能-粒子群算法--使用java对于粒子群系统的显示(2)
- Nexted-loop join中小表驱动大表的原因分析
- python 中langid的阅读。
- qt_2_QT5.6+VS2015配置
- MySQL_第三方数据库引擎_tokudb
- 改善java程序之注解和枚举
- 【python】*函数:全局局部变量、内部函数、闭包
- ajax总结
- Tensorflow构建一个简单的神经网络
- DOM 基础三
- OpenCV图像处理——人脸表情识别