solr5.5.4扩展ansj_lucene5

来源:互联网 发布:用python编写数据库 编辑:程序博客网 时间:2024/06/05 02:14
  1. solr5.5.4
    http://mirror.bit.edu.cn/apache/lucene/solr/
  2. ansj
    https://github.com/NLPchina/ansj_seg
    下载ansj源码,在ansj_lucene5_plug中添加org.ansj.solr.AnsjTokenizerFactory

    这里写图片描述

package org.ansj.solr;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.List;import java.util.Map;import org.ansj.lucene.util.AnsjTokenizer;import org.ansj.recognition.impl.StopRecognition;import org.ansj.splitWord.analysis.IndexAnalysis;import org.ansj.splitWord.analysis.ToAnalysis;import org.apache.lucene.analysis.Tokenizer;import org.apache.lucene.analysis.util.TokenizerFactory;import org.apache.lucene.util.AttributeFactory;import org.slf4j.Logger;import org.slf4j.LoggerFactory;public class AnsjTokenizerFactory extends TokenizerFactory {    public final Logger logger = LoggerFactory.getLogger(getClass());    boolean pstemming;    boolean isQuery;    private String stopwordsDir;    public List<StopRecognition> filters;    public AnsjTokenizerFactory(Map<String, String> args) {        super(args);        filters = new ArrayList<StopRecognition>();        getLuceneMatchVersion();        isQuery = getBoolean(args, "isQuery", true);        pstemming = getBoolean(args, "pstemming", false);        stopwordsDir = get(args, "stopwords");        addStopwords(stopwordsDir);    }    // add stopwords list to filter    private void addStopwords(String dir) {        if (dir == null) {            logger.info("no stopwords dir");            return;        }        // read stoplist        logger.info("stopwords: " + dir);        File file = new File(dir);        InputStreamReader reader;        try {            reader = new InputStreamReader(new FileInputStream(file), "UTF-8");            BufferedReader br = new BufferedReader(reader);            StopRecognition testFilter = new StopRecognition();            String word = br.readLine();            while (word != null) {                testFilter.insertStopWords(word);                word = br.readLine();            }            filters.add(testFilter);          
  br.close();        } catch (FileNotFoundException e) {            logger.info("No stopword file found");        } catch (IOException e) {            logger.info("stopword file io exception");        }    }    @Override    public Tokenizer create(AttributeFactory factory) {        if (isQuery == true) {            // query            return new AnsjTokenizer(new ToAnalysis(), filters, null);        } else {            // index            return new AnsjTokenizer(new IndexAnalysis(), filters, null);        }    }}

编译打包,得到 ansj_lucene5_plug-5.1.2.0.jar。
将下面的软件包(下载链接见下一行)移动到 solr-5.5.4\server\solr-webapp\webapp\WEB-INF\lib 目录下:
http://pan.baidu.com/s/1qY8Ycn6 (密码:xj74)
这里写图片描述
分词配置文件(library.properties)放到/solr/server/resources目录下。
修改 schema,添加如下 fieldType:

<!-- Text field type tokenized with the ansj tokenizer factory. -->
<fieldType name="text_ansj" class="solr.TextField" positionIncrementGap="100">
  <!-- Index time: isQuery is switched off and a stop-word file is supplied. -->
  <analyzer type="index">
    <tokenizer class="org.ansj.solr.AnsjTokenizerFactory"
               isQuery="false"
               stopwords="D:/solr-5.5.4/server/library/stopwords.txt"/>
  </analyzer>
  <!-- Query time: no attributes are set, so the factory's defaults apply. -->
  <analyzer type="query">
    <tokenizer class="org.ansj.solr.AnsjTokenizerFactory"/>
  </analyzer>
</fieldType>