Lucene 5.3 自定义同义词分词器

来源:互联网 发布:狼雨seo官网 编辑:程序博客网 时间:2024/05/29 03:54

利用 MMseg4j 作为中文分词器，实现自定义的同义词分词器。

MMseg4j-2.3 jar 包: http://download.csdn.net/detail/u010167215/9178257

MMseg4j-2.3 源码: http://download.csdn.net/detail/u010167215/9178265 

Code 1:MySameWordTokenFilter

public class MySameWordTokenFilter extends TokenFilter{        private CharTermAttribute cta = null;//自定义流的属性        private PositionIncrementAttribute  pia = null;        private SamewordEngine samewordEngine;  //同义词引擎                //input  输入流        protected MySameWordTokenFilter(TokenStream input,SamewordEngine engine) {                super(input);                this.samewordEngine = engine;                  cta = this.addAttribute(CharTermAttribute.class);//对流进行属性添加                pia = this.addAttribute(PositionIncrementAttribute.class);                }                        private Stack<String> sameWordStack = new Stack<String>();   //用于存储每个词的同义词 用List也行                private AttributeSource.State  currentState;  //记录当前状态                /**         * 该方法用于添加同义词         * 将同义词压入到栈中         * @param key  原词         * @return  有同义词返回true 没有同义词返回false         */        private boolean addSameWords(String key){                String[] sws = samewordEngine.getSameword(key);                if(sws!=null){                        for(String str:sws){                                //遍历放到栈中                                sameWordStack.push(str);                        }                        return true;                }else{                        return false;                }                        }                @Override        public boolean incrementToken() throws IOException {                                //如果栈非空  证明还有元素                if(sameWordStack.size()>0){                        //将元素出栈 并且获取这个同义词                        String str  = sameWordStack.pop();                        //还原状态                        restoreState(currentState);                        //System.out.println("-------"+cta);                        cta.setEmpty();                        cta.append(str);                                                //设置位置0                        pia.setPositionIncrement(0);                       // this.reset();                         
return true;                }                               // input.reset();                //取完了再进行下一步                if(! input.incrementToken()) return false;  //已经没有数据了 空 输出false                                if(addSameWords(cta.toString())){                        currentState = captureState();                }                return true;        }}

Code 2:MySameWordAnalyzer


public class MySameWordAnalyzer extends Analyzer {        private SamewordEngine samewordEngine;        public MySameWordAnalyzer(SamewordEngine samewordEngine){                this.samewordEngine = samewordEngine;        }                @Override        protected TokenStreamComponents createComponents(String arg0) {                //MMSegTokenizer                Dictionary dic = Dictionary.getInstance("/xxxpath");  //词库路径              //Dictionary dic = Dictionary.getInstance();  //利用默认路径                MMSegTokenizer mmSegTokenizer = new MMSegTokenizer(new MaxWordSeg(dic));                MySameWordTokenFilter mySameWordTokenFilter = new MySameWordTokenFilter(mmSegTokenizer , samewordEngine);                 return new TokenStreamComponents(mmSegTokenizer, mySameWordTokenFilter);        }}

Code 3:SamewordEngine

/**
 * Source of synonyms. Implementations decide where the synonyms come from, for example
 * by loading them from a file.
 */
public interface SamewordEngine {

    /**
     * Returns the synonyms registered for a word.
     *
     * @param key the word to look up
     * @return the synonyms of {@code key}; callers in this file treat {@code null} as
     *         "no synonyms"
     */
    String[] getSameword(String key);
}


Code 4:SimpleSamewordEngine

public class SimpleSamewordEngine implements SamewordEngine {        private Map<String,String[]> samewordsMap = new HashMap<String,String[]>();  //每个词的同义词                         public SimpleSamewordEngine() {                super();                //添加同义词                 samewordsMap.put("我", new String[]{"俺","咋"});        }        @Override        public String[] getSameword(String key) {                return samewordsMap.get(key);        }}


Code 5:测试代码

       /**         * 测试同义词查询         */        @Test        public void testSameWorldAnalyzer02(){                //一、Index                //1.创建Directory                //2.创建IndexWriter                //3.创建Document对象                //4.为Document添加Field //遍历文件               //5.通过IndexWriter 添加文档到索引中               //6.关闭IndexWriter                                //二、Search                //1.创建Directory(去哪个地方找)                //2.创建IndexReader(通过这个IndexReader来读取所有的索引)                //3.根据IndexReader 创建 IndexSearcher                 //4.创建搜索的Query(就像mysql语句一样) 参数:1.版本,2.要搜索的域(之前创建了content、filename、path),3.分词器                         //(1)创建QueryParser (2) 通过QueryParser创建Query                //如果QueryParser 的包域内核包不同  会报Lucene java.lang.NoSuchFieldError                //5.根据searcher搜索并返回TopDocs文档                //6.根据TopDocs获取ScoreDoc对象(评分对象)               //循环获取文档                //7.根据seacher和ScoreDoc对象获取具体的Document对象                //8.根据Document 对象获取需要的值                 //9.关闭search                                                        Analyzer mySameWordAnalyzer = new MySameWordAnalyzer(new SimpleSamewordEngine());  //抽象出接口                        String text  = "我来自中国华南农业大学网路工程专业";                                                  try {                                Directory directory = new RAMDirectory();                                IndexWriter iw = new IndexWriter(directory, new IndexWriterConfig(mySameWordAnalyzer));                                Document doc = new Document();                                doc.add(new TextField("content", text, Store.YES));                                iw.addDocument(doc);                                 iw.close();                                                                                     IndexReader indexReader = DirectoryReader.open(directory);                                IndexSearcher indexSearcher = new IndexSearcher(indexReader);                                TermQuery query = new TermQuery(new 
Term("content","俺"));   //在content中搜索俺                                                       TopDocs tds = indexSearcher.search(query, 10);                                ScoreDoc[] sds = tds.scoreDocs;
                                Document document = indexSearcher.doc(sds[0].doc);                                System.out.println("----->"+document.get("content"));                                indexReader.close();                        } catch (IOException e) {                                e.printStackTrace();                        }                                                        }



0 0
原创粉丝点击