IK动态词库及禁用内置主词库

来源:互联网 发布:如何开淘宝童装店 编辑:程序博客网 时间:2024/06/10 00:36

        ik-analyzer新增词库后,需要重启solr才能生效,而线上环境必须支持热更新,即修改词库后能“实时”生效。为此,我在先前修改过的IK(支持solr6.6+版本)基础上再做修改,使之实现以下功能:

  1. 支持IK词库热更新,服务定期扫描词库,发现词库变化则重新导入相应词库;
  2. 记录详细更新日志,新增时间+新增词语,方便定位问题;
  3. 支持禁用内置主词典main2012.dic。
1、DefaultConfig主要修改代码:
/**
 * Reads the dictionary hot-reload schedule from the {@code DIC_UPDATEMIN} property.
 *
 * The property is expected in the form {@code "initialDelay,interval"}, both integers
 * expressed in minutes. The raw property value is always logged through
 * {@code Dictionary.print} so a misconfiguration can be spotted in the console output.
 *
 * @return a two-element array {@code [initialDelay, interval]}, or {@code null} when the
 *         property is missing, blank, does not contain exactly two comma-separated
 *         parts, is not numeric, or has a non-positive interval (callers treat
 *         {@code null} as "hot reload disabled")
 */
public Integer[] getDicUpdateMin() {
    String extUpdateMin = props.getProperty(DIC_UPDATEMIN);
    Integer[] timeInterval = null;
    if (null != extUpdateMin && !Objects.equals("", extUpdateMin.trim())) {
        String[] parts = extUpdateMin.split(",");
        if (parts.length == 2) {
            try {
                Integer initialDelay = Integer.valueOf(parts[0].trim());
                Integer interval = Integer.valueOf(parts[1].trim());
                // A zero or negative interval cannot be scheduled; treat it as "disabled".
                if (interval > 0) {
                    timeInterval = new Integer[] { initialDelay, interval };
                }
            } catch (NumberFormatException e) {
                // A malformed value must not abort dictionary initialisation:
                // report it and fall back to "hot reload disabled".
                Dictionary.print("dic_updateMin_Param_invalid", extUpdateMin);
            }
        }
    }
    Dictionary.print("dic_updateMin_Param", extUpdateMin);
    return timeInterval;
}

/**
 * Tells whether the built-in main dictionary (main2012.dic) is disabled.
 *
 * The {@code DICINNER_DISABLE} property is compared against {@code "true"} ignoring
 * case and surrounding whitespace. The raw property value is logged through
 * {@code Dictionary.print}.
 *
 * @return {@code true} only when the property is set to "true"; {@code false}
 *         (built-in dictionary enabled) otherwise, including when it is absent
 */
public boolean isDicDisable() {
    String disableFlag = props.getProperty(DICINNER_DISABLE);
    Dictionary.print("isDicDisable", disableFlag);
    return null != disableFlag && "true".equalsIgnoreCase(disableFlag.trim());
}
2、Dictionary词典管理类
重构部分代码,主要修改代码如下:
/** * 词典管理类,单子模式 */public class Dictionary {    /*     * 词典单子实例     */    private static Dictionary singleton;    /*     * 主词典对象     */    private static DictSegment _MainDict = null;    /*     * 停止词词典     */    private static DictSegment _StopWordDict = null;    /*     * 量词词典     */    private DictSegment _QuantifierDict;    /**     * 词典上传修改时间.     */    private static Map<String, Long> dicLastModified = new HashMap<String, Long>();    /**     * 扩展词.     */    private static Set<String> dicExtSet = new HashSet<String>(10000);    /**     * 停用词.     */    private static Set<String> dicStopSet = new HashSet<String>(2000);    /**     * 配置对象     */    private static Configuration cfg;    /**     * 线程池定时加载词典.     */    private static ScheduledExecutorService scheduledThreadPool = Executors.newScheduledThreadPool(1);    /**     * 是否已加载过词典.     */    private static boolean hasAdd = false;    /**     * SimpleDateFormat(程序逻辑不存在并发,不考虑线程不安全情况).     */    private final static java.text.SimpleDateFormat DATE_FORMAT = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");    /**     * 词典初始化     *      * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化     *      * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间,     *      * 该方法提供了一个在应用加载阶段就初始化字典的手段     *      * @return Dictionary     */    public static Dictionary initial(Configuration cfg) {        if (singleton == null) {            synchronized (Dictionary.class) {                if (singleton == null) {                    singleton = new Dictionary(cfg);                    Integer[] dicUpdateMin = cfg.getDicUpdateMin();                    if (null != dicUpdateMin) {                        print("loadDicFixedTime", "start");                        loadDicFixedTime(dicUpdateMin);                    }                    return singleton;                }            }        }        return singleton;    }    /**     * 定期加载配置文件.     
*      * @param dicUpdateMin     *            加载间隔     */    private static void loadDicFixedTime(Integer[] dicUpdateMin) {        scheduledThreadPool.scheduleWithFixedDelay(new Runnable() {            public void run() {                try {                    loadMainDict();                    loadStopWordDict();                } catch (Exception e) {                    print(e);                }            }        }, dicUpdateMin[0], dicUpdateMin[1], TimeUnit.MINUTES);    }    private Dictionary(Configuration cfg) {        this.cfg = cfg;        this.loadMainDict();        this.loadStopWordDict();        this.loadQuantifierDict();        hasAdd = true;    }    /**     * 获取词典单子实例     *      * @return Dictionary 单例对象     */    public static Dictionary getSingleton() {        if (singleton == null) {            throw new IllegalStateException("词典尚未初始化,请先调用initial方法");        }        return singleton;    }    /**     * 批量加载新词条     *      * @param words     *            Collection<String>词条列表     */    public void addWords(Collection<String> words) {        if (words != null) {            for (String word : words) {                if (word != null) {                    // 批量加载词条到主内存词典中                    singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());                }            }        }    }    /**     * 批量移除(屏蔽)词条     *      * @param words     */    public void disableWords(Collection<String> words) {        if (words != null) {            for (String word : words) {                if (word != null) {                    // 批量屏蔽词条                    singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());                }            }        }    }    /**     * 检索匹配主词典     *      * @param charArray     * @return Hit 匹配结果描述     */    public Hit matchInMainDict(char[] charArray) {        return singleton._MainDict.match(charArray);    }    /**     * 检索匹配主词典     *      * @param charArray     * @param begin     * @param 
length     * @return Hit 匹配结果描述     */    public Hit matchInMainDict(char[] charArray, int begin, int length) {        return singleton._MainDict.match(charArray, begin, length);    }    /**     * 检索匹配量词词典     *      * @param charArray     * @param begin     * @param length     * @return Hit 匹配结果描述     */    public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {        return singleton._QuantifierDict.match(charArray, begin, length);    }    /**     * 从已匹配的Hit中直接取出DictSegment,继续向下匹配     *      * @param charArray     * @param currentIndex     * @param matchedHit     * @return Hit     */    public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {        DictSegment ds = matchedHit.getMatchedDictSegment();        return ds.match(charArray, currentIndex, 1, matchedHit);    }    /**     * 判断是否是停止词     *      * @param charArray     * @param begin     * @param length     * @return boolean     */    public boolean isStopWord(char[] charArray, int begin, int length) {        return singleton._StopWordDict.match(charArray, begin, length).isMatch();    }    /**     * 加载主词典及扩展词典     */    private static void loadMainDict() {        // 建立一个主词典实例        if (_MainDict == null) { // 首次加载            _MainDict = new DictSegment((char) 0);            String mainDictionary = cfg.getMainDictionary();            // 读取主词典文件            if (!cfg.isDicDisable()) {                loadToMain(mainDictionary, 1);            }        }        // 加载扩展词典        List<String> extDictFiles = cfg.getExtDictionarys();        if (null != extDictFiles && !extDictFiles.isEmpty()) {            for (String extFile : extDictFiles) {                loadToMain(extFile, null);            }        }    }    /**     * 将文件加载到主库.     
*      * @param mainDictionary     *            mainDictionary     * @param innerDic     *            是否是内置词典(1是)     */    private static void loadToMain(String mainDictionary, Integer innerDic) {        String path = null;        InputStream is = null;        File file = new File("");        if (Objects.equals(1, innerDic)) {            is = Dictionary.class.getClassLoader().getResourceAsStream(mainDictionary);        } else {            path = getFilePath(mainDictionary);            file = new File(path);            try {                is = new FileInputStream(file);            } catch (FileNotFoundException e) {                print(e);            }        }        if (is == null) {            print("loadToMain:FileNotFoundException", path);            // throw new RuntimeException("Main Dictionary not found!!!");            return;        }        if (hasAdd && dicLastModified.containsKey(path) && file.lastModified() <= dicLastModified.get(path)) {            return; // 非首次加载或词典未修改        }        print("loadToMain_START", mainDictionary);        BufferedReader br = null;        InputStreamReader inputStreamReader = null;        StringBuilder updateDic = new StringBuilder();        try {            inputStreamReader = new InputStreamReader(is, "UTF-8");            br = new BufferedReader(inputStreamReader, 512);            String theWord = null;            do {                theWord = br.readLine();                if (theWord != null && !"".equals(theWord.trim())) {                    if (!dicExtSet.contains(theWord)) {                        dicExtSet.add(theWord);                        if (hasAdd) {                            updateDic.append(theWord).append(";");                        }                    }                    _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());                }            } while (theWord != null);        } catch (IOException ioe) {            print("loadToMain exception.");            print(ioe);        } 
finally {            dicLastModified.put(path, file.lastModified());            if (updateDic.length() != 0) {                print("loadToMain_END", "FileLastModified:" + DATE_FORMAT.format(new Date(file.lastModified())), updateDic.toString());            }            close(is, inputStreamReader, br);        }    }    /**     * 获取字典文件实际路径.     *      * @param dictionary     *            字典名     * @return 字典路径     */    private static String getFilePath(String dictionary) {        URL resource = Dictionary.class.getClassLoader().getResource(dictionary);        if (null == resource) {            print("NullPointerException", "getFilePath", dictionary); // 提示用户配置词库有误,方便用户定位异常        }        return resource.getPath(); // 抛出异常,终止IK    }    /**     * 加载用户扩展的停止词词典     */    private static void loadStopWordDict() {        // 建立一个主词典实例        if (_StopWordDict == null) {            _StopWordDict = new DictSegment((char) 0);        }        // 加载扩展停止词典        List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys();        if (extStopWordDictFiles != null) {            InputStream is = null;            for (String extStopWordDictName : extStopWordDictFiles) {                // 读取扩展词典文件                // is = Dictionary.class.getClassLoader().getResourceAsStream(extStopWordDictName);                String path = getFilePath(extStopWordDictName);                File file = new File(path);                try {                    is = new FileInputStream(file);                } catch (FileNotFoundException e) {                    print("loadStopWordDict:FileNotFoundException", path);                    print(e);                } finally {                    close(is);                }                // 如果找不到扩展的字典,则忽略                if (is == null) {                    continue;                }                if (hasAdd && dicLastModified.containsKey(path) && file.lastModified() <= dicLastModified.get(path)) {                    continue; // 非首次加载或词典未修改              
  }                print("loadStopWordDict_START", extStopWordDictName);                BufferedReader br = null;                InputStreamReader inputStreamReader = null;                StringBuilder updateDic = new StringBuilder();                try {                    inputStreamReader = new InputStreamReader(is, "UTF-8");                    br = new BufferedReader(inputStreamReader, 512);                    String theWord = null;                    do {                        theWord = br.readLine();                        if (theWord != null && !"".equals(theWord.trim())) {                            // System.out.println(theWord);                            // 加载扩展停止词典数据到内存中                            _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());                            if (!dicStopSet.contains(theWord)) {                                dicStopSet.add(theWord);                                if (hasAdd) {                                    updateDic.append(theWord).append(";");                                }                            }                        }                    } while (theWord != null);                } catch (IOException ioe) {                    print("loadStopWordDict exception.");                    print(ioe);                } finally {                    dicLastModified.put(path, file.lastModified());                    if (updateDic.length() != 0) {                        print("loadStopWordDict_END", "FileLastModified:" + DATE_FORMAT.format(new Date(file.lastModified())), updateDic.toString());                    }                    close(is, inputStreamReader, br);                }            }        }    }    /**     * 加载量词词典     */    private void loadQuantifierDict() {        // 建立一个量词典实例        _QuantifierDict = new DictSegment((char) 0);        // 读取量词词典文件        InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());        if (is == null) {      
      throw new RuntimeException("Quantifier Dictionary not found!!!");        }        BufferedReader br = null;        InputStreamReader inputStreamReader = null;        try {            inputStreamReader = new InputStreamReader(is, "UTF-8");            br = new BufferedReader(inputStreamReader, 512);            String theWord = null;            do {                theWord = br.readLine();                if (theWord != null && !"".equals(theWord.trim())) {                    _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());                }            } while (theWord != null);        } catch (IOException ioe) {            print("Quantifier Dictionary loading exception.");            print(ioe);        } finally {            close(is, inputStreamReader, br);        }    }    /**     * 批量关闭文件流.     *      * @param closeables     *            文件流集合     */    private static void close(AutoCloseable... closeables) {        if (null != closeables && closeables.length > 0) {            for (AutoCloseable autoCloseable : closeables) {                if (null != autoCloseable) {                    try {                        autoCloseable.close();                    } catch (Exception e) {                        print(e);                    }                }            }        }    }    /**     * 控制台打印.     *      * @param param     *            参数     */    public static void print(String... param) {        StringBuilder builder = new StringBuilder();        builder.append("[").append(DATE_FORMAT.format(new Date())).append("]");        for (String str : param) {            builder.append("[").append(str).append("]");        }        System.out.println(builder.toString());    }    /**     * 控制台打印.     
*      * @param e     *            异常信息     */    public static void print(Exception e) {        StringBuilder builder = new StringBuilder();        builder.append("[").append(DATE_FORMAT.format(new Date())).append("]").append(e.getMessage());        System.out.println(builder.toString());        e.printStackTrace();    }}

项目完整源码:https://github.com/zxiaofan/ik-analyzer-solr6
可直接从https://github.com/zxiaofan/ik-analyzer-solr6/releases 下载 solr6.6.1版本的jar。

欢迎个人转载,但须在文章页面明显位置给出原文链接;未经作者同意必须保留此段声明、不得随意修改原文、不得用于商业用途,否则保留追究法律责任的权利。【CSDN】:csdn.zxiaofan.com 【GitHub】:github.zxiaofan.com 如有任何问题,欢迎留言。祝君好运!Life is all about choices! 将来的你一定会感激现在拼命的自己!

阅读全文
0 0
原创粉丝点击