IK动态词库及禁用内置主词库
来源:互联网 发布:如何开淘宝童装店 编辑:程序博客网 时间:2024/06/10 00:36
ik-analyzer新增词库后,需要重启solr,而线上环境肯定是需要支持热更新的,需要修改词库后“实时”更新词库。个人将先前修改后的IK(支持solr6.6+版本),再做修改,使之能实现以下功能:
- 支持IK词库热更新,服务定期扫描词库,发现词库变化则重新导入相应词库;
- 记录详细更新日志,新增时间+新增词语,方便定位问题;
- 支持禁用内置主词典main2012.dic。
1、DefaultConfig主要修改代码:
/**
 * Reads the dictionary hot-reload schedule from the configuration properties.
 *
 * The property value is expected in the form {@code "initialDelay,interval"}, both integers
 * expressed in minutes. Any value that is missing, blank, not made of exactly two
 * comma-separated integers, or whose interval is not strictly positive yields {@code null}.
 *
 * @return a two-element array {@code [initialDelay, interval]} in minutes, or {@code null}
 *         when the property is absent or invalid
 */
public Integer[] getDicUpdateMin() {
    String extUpdateMin = props.getProperty(DIC_UPDATEMIN);
    Integer[] timeInterval = null;
    if (null != extUpdateMin && !extUpdateMin.trim().isEmpty()) {
        String[] split = extUpdateMin.split(",");
        if (split.length == 2) {
            try {
                Integer initialDelay = Integer.valueOf(split[0].trim());
                Integer interval = Integer.valueOf(split[1].trim());
                // A non-positive interval cannot be scheduled, so treat it as "not configured".
                if (interval > 0) {
                    timeInterval = new Integer[] { initialDelay, interval };
                }
            } catch (NumberFormatException e) {
                // A malformed number is handled like every other malformed value (null result)
                // instead of aborting dictionary initialization with an exception.
                Dictionary.print("dic_updateMin_Param_invalid", extUpdateMin);
            }
        }
    }
    Dictionary.print("dic_updateMin_Param", extUpdateMin);
    return timeInterval;
}

/**
 * Tells whether the built-in main dictionary (main2012.dic) is disabled by configuration.
 *
 * The property is compared with {@code "true"} ignoring case and surrounding whitespace.
 *
 * @return {@code true} when the property is set to "true"; {@code false} otherwise,
 *         including when the property is absent
 */
public boolean isDicDisable() {
    String disableInner = props.getProperty(DICINNER_DISABLE);
    Dictionary.print("isDicDisable", disableInner);
    return disableInner != null && "true".equalsIgnoreCase(disableInner.trim());
}
2、Dictionary词典管理类
重构部分代码,主要修改代码如下:
/** * 词典管理类,单子模式 */public class Dictionary { /* * 词典单子实例 */ private static Dictionary singleton; /* * 主词典对象 */ private static DictSegment _MainDict = null; /* * 停止词词典 */ private static DictSegment _StopWordDict = null; /* * 量词词典 */ private DictSegment _QuantifierDict; /** * 词典上传修改时间. */ private static Map<String, Long> dicLastModified = new HashMap<String, Long>(); /** * 扩展词. */ private static Set<String> dicExtSet = new HashSet<String>(10000); /** * 停用词. */ private static Set<String> dicStopSet = new HashSet<String>(2000); /** * 配置对象 */ private static Configuration cfg; /** * 线程池定时加载词典. */ private static ScheduledExecutorService scheduledThreadPool = Executors.newScheduledThreadPool(1); /** * 是否已加载过词典. */ private static boolean hasAdd = false; /** * SimpleDateFormat(程序逻辑不存在并发,不考虑线程不安全情况). */ private final static java.text.SimpleDateFormat DATE_FORMAT = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); /** * 词典初始化 * * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 * * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间, * * 该方法提供了一个在应用加载阶段就初始化字典的手段 * * @return Dictionary */ public static Dictionary initial(Configuration cfg) { if (singleton == null) { synchronized (Dictionary.class) { if (singleton == null) { singleton = new Dictionary(cfg); Integer[] dicUpdateMin = cfg.getDicUpdateMin(); if (null != dicUpdateMin) { print("loadDicFixedTime", "start"); loadDicFixedTime(dicUpdateMin); } return singleton; } } } return singleton; } /** * 定期加载配置文件. 
* * @param dicUpdateMin * 加载间隔 */ private static void loadDicFixedTime(Integer[] dicUpdateMin) { scheduledThreadPool.scheduleWithFixedDelay(new Runnable() { public void run() { try { loadMainDict(); loadStopWordDict(); } catch (Exception e) { print(e); } } }, dicUpdateMin[0], dicUpdateMin[1], TimeUnit.MINUTES); } private Dictionary(Configuration cfg) { this.cfg = cfg; this.loadMainDict(); this.loadStopWordDict(); this.loadQuantifierDict(); hasAdd = true; } /** * 获取词典单子实例 * * @return Dictionary 单例对象 */ public static Dictionary getSingleton() { if (singleton == null) { throw new IllegalStateException("词典尚未初始化,请先调用initial方法"); } return singleton; } /** * 批量加载新词条 * * @param words * Collection<String>词条列表 */ public void addWords(Collection<String> words) { if (words != null) { for (String word : words) { if (word != null) { // 批量加载词条到主内存词典中 singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); } } } } /** * 批量移除(屏蔽)词条 * * @param words */ public void disableWords(Collection<String> words) { if (words != null) { for (String word : words) { if (word != null) { // 批量屏蔽词条 singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); } } } } /** * 检索匹配主词典 * * @param charArray * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray) { return singleton._MainDict.match(charArray); } /** * 检索匹配主词典 * * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public Hit matchInMainDict(char[] charArray, int begin, int length) { return singleton._MainDict.match(charArray, begin, length); } /** * 检索匹配量词词典 * * @param charArray * @param begin * @param length * @return Hit 匹配结果描述 */ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) { return singleton._QuantifierDict.match(charArray, begin, length); } /** * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 * * @param charArray * @param currentIndex * @param matchedHit * @return Hit */ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { 
DictSegment ds = matchedHit.getMatchedDictSegment(); return ds.match(charArray, currentIndex, 1, matchedHit); } /** * 判断是否是停止词 * * @param charArray * @param begin * @param length * @return boolean */ public boolean isStopWord(char[] charArray, int begin, int length) { return singleton._StopWordDict.match(charArray, begin, length).isMatch(); } /** * 加载主词典及扩展词典 */ private static void loadMainDict() { // 建立一个主词典实例 if (_MainDict == null) { // 首次加载 _MainDict = new DictSegment((char) 0); String mainDictionary = cfg.getMainDictionary(); // 读取主词典文件 if (!cfg.isDicDisable()) { loadToMain(mainDictionary, 1); } } // 加载扩展词典 List<String> extDictFiles = cfg.getExtDictionarys(); if (null != extDictFiles && !extDictFiles.isEmpty()) { for (String extFile : extDictFiles) { loadToMain(extFile, null); } } } /** * 将文件加载到主库. * * @param mainDictionary * mainDictionary * @param innerDic * 是否是内置词典(1是) */ private static void loadToMain(String mainDictionary, Integer innerDic) { String path = null; InputStream is = null; File file = new File(""); if (Objects.equals(1, innerDic)) { is = Dictionary.class.getClassLoader().getResourceAsStream(mainDictionary); } else { path = getFilePath(mainDictionary); file = new File(path); try { is = new FileInputStream(file); } catch (FileNotFoundException e) { print(e); } } if (is == null) { print("loadToMain:FileNotFoundException", path); // throw new RuntimeException("Main Dictionary not found!!!"); return; } if (hasAdd && dicLastModified.containsKey(path) && file.lastModified() <= dicLastModified.get(path)) { return; // 非首次加载或词典未修改 } print("loadToMain_START", mainDictionary); BufferedReader br = null; InputStreamReader inputStreamReader = null; StringBuilder updateDic = new StringBuilder(); try { inputStreamReader = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(inputStreamReader, 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { if (!dicExtSet.contains(theWord)) { 
dicExtSet.add(theWord); if (hasAdd) { updateDic.append(theWord).append(";"); } } _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { print("loadToMain exception."); print(ioe); } finally { dicLastModified.put(path, file.lastModified()); if (updateDic.length() != 0) { print("loadToMain_END", "FileLastModified:" + DATE_FORMAT.format(new Date(file.lastModified())), updateDic.toString()); } close(is, inputStreamReader, br); } } /** * 获取字典文件实际路径. * * @param dictionary * 字典名 * @return 字典路径 */ private static String getFilePath(String dictionary) { URL resource = Dictionary.class.getClassLoader().getResource(dictionary); if (null == resource) { print("NullPointerException", "getFilePath", dictionary); // 提示用户配置词库有误,方便用户定位异常 } return resource.getPath(); // 抛出异常,终止IK } /** * 加载用户扩展的停止词词典 */ private static void loadStopWordDict() { // 建立一个主词典实例 if (_StopWordDict == null) { _StopWordDict = new DictSegment((char) 0); } // 加载扩展停止词典 List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys(); if (extStopWordDictFiles != null) { InputStream is = null; for (String extStopWordDictName : extStopWordDictFiles) { // 读取扩展词典文件 // is = Dictionary.class.getClassLoader().getResourceAsStream(extStopWordDictName); String path = getFilePath(extStopWordDictName); File file = new File(path); try { is = new FileInputStream(file); } catch (FileNotFoundException e) { print("loadStopWordDict:FileNotFoundException", path); print(e); } finally { close(is); } // 如果找不到扩展的字典,则忽略 if (is == null) { continue; } if (hasAdd && dicLastModified.containsKey(path) && file.lastModified() <= dicLastModified.get(path)) { continue; // 非首次加载或词典未修改 } print("loadStopWordDict_START", extStopWordDictName); BufferedReader br = null; InputStreamReader inputStreamReader = null; StringBuilder updateDic = new StringBuilder(); try { inputStreamReader = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(inputStreamReader, 512); String 
theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { // System.out.println(theWord); // 加载扩展停止词典数据到内存中 _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); if (!dicStopSet.contains(theWord)) { dicStopSet.add(theWord); if (hasAdd) { updateDic.append(theWord).append(";"); } } } } while (theWord != null); } catch (IOException ioe) { print("loadStopWordDict exception."); print(ioe); } finally { dicLastModified.put(path, file.lastModified()); if (updateDic.length() != 0) { print("loadStopWordDict_END", "FileLastModified:" + DATE_FORMAT.format(new Date(file.lastModified())), updateDic.toString()); } close(is, inputStreamReader, br); } } } } /** * 加载量词词典 */ private void loadQuantifierDict() { // 建立一个量词典实例 _QuantifierDict = new DictSegment((char) 0); // 读取量词词典文件 InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary()); if (is == null) { throw new RuntimeException("Quantifier Dictionary not found!!!"); } BufferedReader br = null; InputStreamReader inputStreamReader = null; try { inputStreamReader = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(inputStreamReader, 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { print("Quantifier Dictionary loading exception."); print(ioe); } finally { close(is, inputStreamReader, br); } } /** * 批量关闭文件流. * * @param closeables * 文件流集合 */ private static void close(AutoCloseable... closeables) { if (null != closeables && closeables.length > 0) { for (AutoCloseable autoCloseable : closeables) { if (null != autoCloseable) { try { autoCloseable.close(); } catch (Exception e) { print(e); } } } } } /** * 控制台打印. * * @param param * 参数 */ public static void print(String... 
param) { StringBuilder builder = new StringBuilder(); builder.append("[").append(DATE_FORMAT.format(new Date())).append("]"); for (String str : param) { builder.append("[").append(str).append("]"); } System.out.println(builder.toString()); } /** * 控制台打印. * * @param e * 异常信息 */ public static void print(Exception e) { StringBuilder builder = new StringBuilder(); builder.append("[").append(DATE_FORMAT.format(new Date())).append("]").append(e.getMessage()); System.out.println(builder.toString()); e.printStackTrace(); }}
项目完整源码:https://github.com/zxiaofan/ik-analyzer-solr6
可直接从https://github.com/zxiaofan/ik-analyzer-solr6/releases 下载 solr6.6.1版本的jar。
欢迎个人转载,但须在文章页面明显位置给出原文链接;未经作者同意必须保留此段声明、不得随意修改原文、不得用于商业用途,否则保留追究法律责任的权利。【CSDN】:csdn.zxiaofan.com 【GitHub】:github.zxiaofan.com。如有任何问题,欢迎留言。祝君好运!Life is all about choices! 将来的你一定会感激现在拼命的自己!
阅读全文
0 0
- IK动态词库及禁用内置主词库
- 配置Solr中文分词器IK以及ansj,支持动态修改用户词库。
- IK-analyzer添加搜狗词库
- solr配置IK分词,使用sogou词库
- Solr配置IK分词器自定义词库
- ElasticSearch5.0——IK词库加载
- paip.禁用IKAnalyzer 的默认词库.仅仅使用自定义词库.
- solr中ik分词自定义词库和停止词
- es ik 分词插件 词库热加载源码分析
- solr中ik分词配置同义词、停止词、自定义词库
- 禁用mac 内置键盘
- paip..禁用mmseg 的默认词库. . 仅仅使用自定义词库from数据库.
- 笔记本禁用内置键盘方法
- 吴超-----solr添加中文IK分词器,以及配置自定义词库
- solr配置同义词,停止词,和扩展词库(IK分词器为例)
- ubuntu下安装goldendict及离线词库
- Solr配置扩展词/自定义词库(IK版)、如何做逻辑与逻辑或即AND OR查询
- Solr动态加载分词器的自定义词库扩展词库解决方案
- 使用impdp实现数据在不同用户、不同实例之间快速复制
- Sukudo问题
- new与malloc的10点区别
- 欧拉函数
- SpringMvc ,如何将页面的List传入后台Controller
- IK动态词库及禁用内置主词库
- matlab函数meshgrid命令讲解
- 邮箱判断+名字判断+密码判断
- HDOJ2057_A + B Again
- wxPython学习(一)
- 非周期连续傅立叶变换的推导
- Vi/Vim查找替换使用方法(可以用来批量改txt的前缀)
- 百度地图开发,计算两个点间直线距离js方法
- Python进阶(二):String类