分词器之NLPIR加密文件在哪
来源:互联网 发布:mysql 5.7 json 函数 编辑:程序博客网 时间:2024/05/16 16:19
官方网站 http://ictclas.nlpir.org/newsdownloads?DocId=389
既然官方承诺对个人用户永久免费,那么拿来做科研是没有问题的。只不过授权每隔一段时间就会过期失效,失效之后需要重新下载最新版本,找到其中的 Data/NLPIR.user 文件。这是一个加密文件,相当于软件的使用许可证书。
用新下载的 Data/NLPIR.user 文件替换旧版本中的同名文件,其他文件保持不变,即可继续使用很长一段时间。
python包装之后的代码:
# -*- coding: UTF-8 -*-
__author__ = 'Peter_Howe<haobibo@gmail.com>'
'''
Python Wrapper for ICTCLAS2014
Loading functions from Dynamic Link Library directly.
'''
from ctypes import *

# NLPIR2014 library file (NLPIR64, NLPIR32, libNLPIR64.so, libNLPIR32.so).
# Change this path to match your platform and architecture.
libFile = './nlpir/NLPIR32.dll'
dll = CDLL(libFile)


def loadFun(exportName, restype, argtypes):
    """Look up an exported function in the loaded library and declare its signature.

    exportName -- name of the symbol exported by the shared library
    restype    -- ctypes return type assigned to the function
    argtypes   -- list of ctypes argument types, or None

    Returns the ctypes function object. ``getattr`` raises AttributeError
    when the library does not export ``exportName``.
    """
    f = getattr(dll, exportName)
    f.restype = restype
    f.argtypes = argtypes
    return f


class ENCODING:
    """Integer encoding codes passed to ``Init``."""
    GBK_CODE = 0                   # GBK encoding (supported by default)
    UTF8_CODE = GBK_CODE + 1       # UTF-8 encoding
    BIG5_CODE = GBK_CODE + 2       # BIG5 encoding
    GBK_FANTI_CODE = GBK_CODE + 3  # GBK encoding that includes traditional characters


class POSMap:
    """Integer identifiers of the part-of-speech tag sets accepted by ``SetPOSmap``."""
    ICT_POS_MAP_SECOND = 0  # ICT (Institute of Computing Technology) second-level tag set
    ICT_POS_MAP_FIRST = 1   # ICT first-level tag set
    PKU_POS_MAP_SECOND = 2  # PKU (Peking University) second-level tag set
    PKU_POS_MAP_FIRST = 3   # PKU first-level tag set


# Part-of-speech table: first letter of the tag -> {tag: Chinese description}.
# The descriptions are runtime data printed by the demo; they are kept verbatim.
POS = {
    "n": {  # 1. Nouns (1 first-level, 7 second-level, 5 third-level tags)
        "n": "名词",
        "nr": "人名",
        "nr1": "汉语姓氏",
        "nr2": "汉语名字",
        "nrj": "日语人名",
        "nrf": "音译人名",
        "ns": "地名",
        "nsf": "音译地名",
        "nt": "机构团体名",
        "nz": "其它专名",
        "nl": "名词性惯用语",
        "ng": "名词性语素"
    },
    "t": {  # 2. Time words (1 first-level, 1 second-level)
        "t": "时间词",
        "tg": "时间词性语素"
    },
    "s": {  # 3. Place words (1 first-level)
        "s": "处所词"
    },
    "f": {  # 4. Locative words (1 first-level)
        "f": "方位词"
    },
    "v": {  # 5. Verbs (1 first-level, 9 second-level)
        "v": "动词",
        "vd": "副动词",
        "vn": "名动词",
        "vshi": "动词“是”",
        "vyou": "动词“有”",
        "vf": "趋向动词",
        "vx": "形式动词",
        "vi": "不及物动词(内动词)",
        "vl": "动词性惯用语",
        "vg": "动词性语素"
    },
    "a": {  # 6. Adjectives (1 first-level, 4 second-level)
        "a": "形容词",
        "ad": "副形词",
        "an": "名形词",
        "ag": "形容词性语素",
        "al": "形容词性惯用语"
    },
    "b": {  # 7. Distinguishing words (1 first-level, 2 second-level)
        "b": "区别词",
        "bl": "区别词性惯用语"
    },
    "z": {  # 8. Status words (1 first-level)
        "z": "状态词"
    },
    "r": {  # 9. Pronouns (1 first-level, 4 second-level, 6 third-level)
        "r": "代词",
        "rr": "人称代词",
        "rz": "指示代词",
        "rzt": "时间指示代词",
        "rzs": "处所指示代词",
        "rzv": "谓词性指示代词",
        "ry": "疑问代词",
        "ryt": "时间疑问代词",
        "rys": "处所疑问代词",
        "ryv": "谓词性疑问代词",
        "rg": "代词性语素"
    },
    "m": {  # 10. Numerals (1 first-level, 1 second-level)
        "m": "数词",
        "mq": "数量词"
    },
    "q": {  # 11. Measure words (1 first-level, 2 second-level)
        "q": "量词",
        "qv": "动量词",
        "qt": "时量词"
    },
    "d": {  # 12. Adverbs (1 first-level)
        "d": "副词"
    },
    "p": {  # 13. Prepositions (1 first-level, 2 second-level)
        "p": "介词",
        "pba": "介词“把”",
        "pbei": "介词“被”"
    },
    "c": {  # 14. Conjunctions (1 first-level, 1 second-level)
        "c": "连词",
        "cc": "并列连词"
    },
    "u": {  # 15. Particles (1 first-level, 15 second-level)
        "u": "助词",
        "uzhe": "着",
        "ule": "了 喽",
        "uguo": "过",
        "ude1": "的 底",
        "ude2": "地",
        "ude3": "得",
        "usuo": "所",
        "udeng": "等 等等 云云",
        "uyy": "一样 一般 似的 般",
        "udh": "的话",
        "uls": "来讲 来说 而言 说来",
        "uzhi": "之",
        "ulian": "连 "  # as in the example phrase “连小学生都会”
    },
    "e": {  # 16. Interjections (1 first-level)
        "e": "叹词"
    },
    "y": {  # 17. Modal particles (1 first-level)
        "y": "语气词(delete yg)"
    },
    "o": {  # 18. Onomatopoeia (1 first-level)
        "o": "拟声词"
    },
    "h": {  # 19. Prefixes (1 first-level)
        "h": "前缀"
    },
    "k": {  # 20. Suffixes (1 first-level)
        "k": "后缀"
    },
    "x": {  # 21. Strings (1 first-level, 2 second-level)
        "x": "字符串",
        "xx": "非语素字",
        "xu": "网址URL"
    },
    "w": {  # 22. Punctuation (1 first-level, 16 second-level)
        "w": "标点符号",
        "wkz": "左括号",        # ( 〔 [ { 《 【 〖 〈   half-width: ( [ { <
        "wky": "右括号",        # ) 〕 ] } 》 】 〗 〉   half-width: ) ] } >
        "wyz": "全角左引号",    # “ ‘ 『
        "wyy": "全角右引号",    # ” ’ 』
        "wj": "全角句号",       # 。
        "ww": "问号",           # full-width: ?   half-width: ?
        "wt": "叹号",           # full-width: !   half-width: !
        "wd": "逗号",           # full-width: ,   half-width: ,
        "wf": "分号",           # full-width: ;   half-width: ;
        "wn": "顿号",           # full-width: 、
        "wm": "冒号",           # full-width: :   half-width: :
        "ws": "省略号",         # full-width: …… …
        "wp": "破折号",         # full-width: —— -- ——-   half-width: --- ----
        "wb": "百分号千分号",   # full-width: % ‰   half-width: %
        "wh": "单位符号"        # full-width: ¥ $ £ ° ℃   half-width: $
    }
}


class SegAtom(Structure):
    """ctypes layout of one segmentation result record returned by ``ParagraphProcessA``."""
    _fields_ = [
        ("start", c_int32),      # offset of the token, used by Seg() to slice the input
        ("length", c_int32),     # length of the token, used by Seg() to slice the input
        ("sPOS", c_char * 40),   # part-of-speech tag as a fixed 40-char buffer
        ("iPOS", c_int32),
        ("word_ID", c_int32),
        ("word_type", c_int32),
        ("weight", c_int32)
    ]


def translatePOS(sPOS):
    """Return the description stored in ``POS`` for the tag ``sPOS``.

    The tag 'url' is treated as 'xu'. The first character of the tag selects
    the category sub-table. Raises KeyError when the tag is not in the table
    and IndexError when ``sPOS`` is empty.
    """
    if sPOS == 'url':
        sPOS = 'xu'
    c = sPOS[0]
    return POS[c][sPOS]


# Python bindings for the functions exported by the library. After binding,
# they are called like ordinary functions, e.g. ImportUserDict("userdic.txt").
# NOTE(review): parameters declared c_char_p take byte strings; under Python 3
# ctypes rejects str for c_char_p, so callers would need to pass bytes - confirm
# which interpreter this script targets.
Init = loadFun('NLPIR_Init', c_int, [c_char_p, c_int, c_char_p])
Exit = loadFun('NLPIR_Exit', c_bool, None)
ParagraphProcess = loadFun('NLPIR_ParagraphProcess', c_char_p, [c_char_p, c_int])
ParagraphProcessA = loadFun('NLPIR_ParagraphProcessA', POINTER(SegAtom), [c_char_p, c_void_p, c_bool])
#ParagraphProcessAW = loadFun('NLPIR_ParagraphProcessAW',None, [c_int, POINTER(SegAtom)])
FileProcess = loadFun('NLPIR_FileProcess', c_double, [c_char_p, c_char_p, c_int])
ImportUserDict = loadFun('NLPIR_ImportUserDict', c_uint, [c_char_p])
AddUserWord = loadFun('NLPIR_AddUserWord', c_int, [c_char_p])
SaveTheUsrDic = loadFun('NLPIR_SaveTheUsrDic', c_int, None)
DelUsrWord = loadFun('NLPIR_DelUsrWord', c_int, [c_char_p])
GetUniProb = loadFun('NLPIR_GetUniProb', c_double, [c_char_p])
IsWord = loadFun('NLPIR_IsWord', c_bool, [c_char_p])
GetKeyWords = loadFun('NLPIR_GetKeyWords', c_char_p, [c_char_p, c_int, c_bool])
# Fixed: this name was previously bound to 'NLPIR_GetNewWords' (copy-paste
# error), which made it an alias of GetNewWords instead of the file-keyword API.
GetFileKeyWords = loadFun('NLPIR_GetFileKeyWords', c_char_p, [c_char_p, c_int, c_bool])
GetNewWords = loadFun('NLPIR_GetNewWords', c_char_p, [c_char_p, c_int, c_bool])
GetFileNewWords = loadFun('NLPIR_GetFileNewWords', c_char_p, [c_char_p, c_int, c_bool])
FingerPrint = loadFun('NLPIR_FingerPrint', c_ulong, [c_char_p])
SetPOSmap = loadFun('NLPIR_SetPOSmap', c_int, [c_int])

# New Word Identification
NWI_Start = loadFun('NLPIR_NWI_Start', c_bool, None)
NWI_AddFile = loadFun('NLPIR_NWI_AddFile', c_bool, [c_char_p])
NWI_AddMem = loadFun('NLPIR_NWI_AddMem', c_bool, [c_char_p])
NWI_Complete = loadFun('NLPIR_NWI_Complete', c_bool, None)
NWI_GetResult = loadFun('NLPIR_NWI_GetResult', c_char_p, [c_int])
NWI_Result2UserDict = loadFun('NLPIR_NWI_Result2UserDict', c_uint, None)

# Initialise the segmenter at import time; a falsy return value aborts the process.
if not Init('', ENCODING.UTF8_CODE, ''):
    print("Initialization failed!")
    exit(-111111)

# Disabled: optional selection of the POS tag set (kept as an inert string).
'''
if not SetPOSmap(3): #POSMap.ICT_POS_MAP_SECOND
    print("Setting POS Map failed!")
    exit(-22222)
'''


def seg(paragraph):
    """Segment ``paragraph`` via ``ParagraphProcess`` and parse its text output.

    The library result is split on single spaces and each non-empty piece is
    split on '/'. Returns a list of (first, second) tuples taken from those
    pieces, dropping pieces whose first part is empty. A piece without a '/'
    raises IndexError.
    """
    # Second argument is passed as 1; presumably it enables POS tagging - TODO confirm.
    result = ParagraphProcess(paragraph, c_int(1))
    atoms = [i.strip().split('/') for i in result.split(' ') if len(i) >= 1 and i[0] != ' ']
    atoms = [(a[0], a[1]) for a in atoms if len(a[0]) > 0]
    return atoms


def segment(paragraph):
    """Segment ``paragraph`` via ``ParagraphProcessA`` and return a list of SegAtom.

    The library writes the number of records into ``count``; that many records
    are read from the returned pointer.
    """
    count = c_int32()
    result = ParagraphProcessA(paragraph, byref(count), c_bool(True))
    count = count.value
    atoms = cast(result, POINTER(SegAtom))
    return [atoms[i] for i in range(0, count)]


def Seg(paragraph):
    """Yield (token, sPOS) pairs for ``paragraph``, skipping atoms with an empty tag.

    Each token is the slice ``paragraph[start:start + length]`` taken from the
    atom's fields.
    NOTE(review): start/length are presumably byte offsets, so ``paragraph``
    should be a byte string for the slices to line up - confirm.
    """
    atoms = segment(paragraph)
    for a in atoms:
        if len(a.sPOS) < 1:
            continue
        i = paragraph[a.start: a.start + a.length]#.decode('utf-8')#.encode('ascii')
        yield (i, a.sPOS)


if __name__ == "__main__":
    # Segmentation smoke test.
    p = '央视啊 希望再也不要重蹈春晚广告门覆辙 加油@![微笑]~'
    for t in Seg(p):
        s = '%s\t%s\t%s' % (t[0], t[1], translatePOS(t[1]))
        print(s)
0 0
- 分词器之NLPIR加密文件在哪
- NLPIR/ICTCLAS2016分词系统的文件结构
- NLPIR分词之N-最短路径
- NLPIR汉语分词系统在VS中使用
- 在eclipse中使用NLPIR(ICTCLAS)2013进行分词
- NLPIR中文分词系统之Java接口的使用
- NLPIR分词乱码问题
- NLPIR中文分词 java
- NLPIR分词使用说明
- 中科院分词(NLPIR) JAVA
- nlpir分词系统
- nlpir分词工具使用记录
- 使用NLPIR-ICTCLAS2014分词系统
- NLPIR分词教程 Scala版
- 使用NLPIR汉语分词系统进行分词
- NLPIR(ICTCLAS2013)中文分词系统应用在Web项目中
- NLPIR分词系统(ICTCLAS 2013)在VS 2013中使用
- NLPIR(ICTCLAS2015)分词工具Java开发简介
- 强连通分量 模板
- 优化 Go 中的 map 并发存取
- 2.9 Fibonacci数列
- iOS视图控制器的跳转方法
- 线段树的两种查询方式
- 分词器之NLPIR加密文件在哪
- IK分词加入标点符号
- 斯特林数 组合数
- hdu5399(2015多校9)--Too Simple
- 总线设备驱动模型——设备篇
- HDU 5399
- C语言之文件操作06——写数据到文本文件遇0停止
- dinic 模板
- WPF中StaticResource 标记扩展和DynamicResource 标记扩展