III. Lucene 3.5 analysis syntax [stop-word extension, synonym search, etc.]



2. The structure of a token
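A token (语汇单元) produced by an analyzer carries more than the term text. As the toString() output shown in section 8 illustrates, each token exposes the term itself (CharTermAttribute), its start and end offsets in the source text (OffsetAttribute), the position increment relative to the previous token (PositionIncrementAttribute), and a token type such as <ALPHANUM> or <IDEOGRAPHIC> (TypeAttribute). A single token prints as, for example, (来,startOffset=1,endOffset=2,positionIncrement=1,type=<IDEOGRAPHIC>).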


3. The design approach for synonyms
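The design, implemented in sections 6 and 7 below, works like this: when the filter meets a word that has synonyms, it captures the current token state and pushes the synonyms onto a stack. On each following call it restores that state, swaps in a synonym as the term text, and sets the position increment to 0, so the synonym occupies the same position as the original word. A query for either the original word or any of its synonyms then matches the document.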


4. Comparing and testing the analyzers

package org.lucene.test;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.lucene.util.AnalyzerUtils;
import org.lucene.util.MySameAnalyzer;
import org.lucene.util.MyStopAnalyzer;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

public class TestAnalyzer {

    /**
     * How several analyzers compare on English text.
     */
    @Test
    public void test01() {
        // standard analyzer
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        // stop-word analyzer
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        // simple analyzer
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        // whitespace analyzer
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String txt = "this is my house,I am come from yunnang zhaotong," +
                "My email is ynkonghao@gmail.com,My QQ is 707807876";
        AnalyzerUtils.displayToken(txt, a1);
        //[my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail.com][my][qq][707807876]
        AnalyzerUtils.displayToken(txt, a2);
        //[my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail][com][my][qq]
        AnalyzerUtils.displayToken(txt, a3);
        //[this][is][my][house][i][am][come][from][yunnang][zhaotong][my][email][is][ynkonghao][gmail][com][my][qq][is]
        AnalyzerUtils.displayToken(txt, a4);
        //[this][is][my][house,I][am][come][from][yunnang][zhaotong,My][email][is][ynkonghao@gmail.com,My][QQ][is][707807876]
    }

    /**
     * How the same analyzers behave on Chinese text.
     */
    @Test
    public void test02() {
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String txt = "我来自云南昭通昭阳区师专";
        AnalyzerUtils.displayToken(txt, a1);
        //[我][来][自][云][南][昭][通][昭][阳][区][师][专]
        AnalyzerUtils.displayToken(txt, a2);
        //[我来自云南昭通昭阳区师专]
        AnalyzerUtils.displayToken(txt, a3);
        //[我来自云南昭通昭阳区师专]
        AnalyzerUtils.displayToken(txt, a4);
        //[我来自云南昭通昭阳区师专]
    }

    /**
     * Print the full details of each token.
     */
    @Test
    public void test03() {
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String txt = "how are you thank you";
        AnalyzerUtils.displayAllToken(txt, a1);
        AnalyzerUtils.displayAllToken(txt, a2);
        AnalyzerUtils.displayAllToken(txt, a3);
        AnalyzerUtils.displayAllToken(txt, a4);
    }

    /**
     * Stop-word test.
     */
    @Test
    public void test04() {
        Analyzer a1 = new MyStopAnalyzer(new String[]{"I", "you", "hate"});
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
        String txt = "how are You thAnk's you I hate you";
        AnalyzerUtils.displayToken(txt, a1);
        AnalyzerUtils.displayToken(txt, a2);
    }

    /**
     * Chinese segmentation test: dictionary-based segmentation
     * with a user-extensible dictionary.
     */
    @Test
    public void test05() {
        //Analyzer a1 = new MMSegAnalyzer();
        // Without loading the analyzer's dictionary the result degrades to
        // single characters: [我][来][自][云][南][昭][通][昭][阳][区][师][专]
        // Pointing it at the dictionary directory enables real segmentation:
        Analyzer a1 = new MMSegAnalyzer(new File("D:\\Workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data"));
        //[我][来自][云南][昭][通][昭][阳][区][师专]
        // You can extend the dictionary in words-my.dic under the data
        // directory; after adding 昭通, for example, the result becomes:
        //[我][来自][云南][昭通][昭][阳][区][师专]
        String txt = "我来自云南昭通昭阳区师专";
        AnalyzerUtils.displayToken(txt, a1);
    }

    /**
     * Synonym test.
     */
    @Test
    public void test06() throws CorruptIndexException, IOException {
        Analyzer a1 = new MySameAnalyzer();
        String txt = "我来自中国云南昭通昭阳区师专";
        AnalyzerUtils.displayAllToken(txt, a1);

        String keyword = "俺";
        Directory dire = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(dire,
                new IndexWriterConfig(Version.LUCENE_35, a1));
        Document doc = new Document();
        doc.add(new Field("content", txt, Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(doc);
        indexWriter.close();

        IndexSearcher search = new IndexSearcher(IndexReader.open(dire));
        TopDocs topDoc = search.search(new TermQuery(new Term("content", keyword)), 10);
        ScoreDoc[] scoreDoc = topDoc.scoreDocs;
        for (ScoreDoc score : scoreDoc) {
            Document doc1 = search.doc(score.doc);
            System.out.println(doc1.get("content"));
        }
    }
}
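The point of test06: the text is indexed with MySameAnalyzer, so 我 is indexed together with its synonyms 咱 and 俺 at the same position. The search for 俺 therefore finds the document even though 俺 never appears in the original text, and the loop prints the stored content.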

5. Extending your own stop-word analyzer

package org.lucene.util;

import java.io.Reader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * A custom analyzer with an extensible stop-word list.
 */
public class MyStopAnalyzer extends Analyzer {

    private Set stops;

    public MyStopAnalyzer(String[] sws) {
        // makeStopSet turns the string array into a Set automatically
        stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
        // merge in the built-in English stop words
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    public MyStopAnalyzer() {
        // just use the built-in stop words; the set must be created first,
        // since calling addAll() on an uninitialized field would throw a
        // NullPointerException
        stops = new HashSet(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Build the filter chain for this analyzer: split on letters
        // (a StandardTokenizer would also work here), lowercase, then
        // remove the stop words. Note that the Reader can only be consumed
        // once, so no debug tokenizer should read from it before this chain.
        return new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)),
                stops);
    }
}
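As a sanity check (my expectation, not output quoted from the original post): with the custom stop set {I, you, hate} merged with the built-in English stop words, and since LetterTokenizer splits on non-letter characters and the chain lowercases before filtering, the test04 input "how are You thAnk's you I hate you" should come out as [how][thank][s]; are falls to the built-in list, while you, i, and hate fall to the custom one.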
6. Extending the analyzer: a synonym analyzer
package org.lucene.util;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * An analyzer extension: a synonym-aware analyzer.
 */
public class MySameAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // load the mmseg4j dictionary and wrap its tokenizer
        // in the synonym filter defined in section 7
        Dictionary dic = Dictionary.getInstance("D:\\Workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data");
        return new MySameTokenFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
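MySameAnalyzer simply stacks the custom filter on top of mmseg4j: the MMSegTokenizer (driven by MaxWordSeg and the dictionary under the data directory) does the Chinese segmentation, and MySameTokenFilter, defined next, injects the synonyms.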

7. Extending the synonym token filter

package org.lucene.util;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * A TokenFilter extension that injects synonyms.
 */
public class MySameTokenFilter extends TokenFilter {

    private CharTermAttribute cta = null;
    private PositionIncrementAttribute pia = null;
    private AttributeSource.State current = null;
    private Stack<String> sames = null;

    protected MySameTokenFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
    }

    /**
     * The idea:
     * Each synonym has to end up in the CharTermAttribute, but simply calling
     * cta.append("大陆") would concatenate the original word and the synonym
     * into one token, [中国大陆]. What we want is two tokens, [中国][大陆].
     * So when a word with synonyms is encountered, capture the current state
     * and push the synonyms onto a stack. On the next call, if the stack is
     * not empty, restore the saved state, clear the term with cta.setEmpty(),
     * append the synonym with cta.append("大陆"), and set the position
     * increment to 0 via pia.setPositionIncrement(0). An increment of 0 marks
     * the token as a synonym sitting at the same position as the original.
     * Then return that synonym token.
     */
    @Override
    public boolean incrementToken() throws IOException {
        while (sames.size() > 0) {
            // pop a synonym off the stack
            String str = sames.pop();
            // restore the saved state of the original token
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // position increment 0: same position as the original word
            pia.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) return false;
        if (getSameWords(cta.toString())) {
            // the word has synonyms, so save the current state first
            current = captureState();
        }
        return true;
    }

    /*
     * This approach does NOT work: it turns [中国] into [大陆], replacing
     * the token instead of producing [中国][大陆].
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        if (cta.toString().equals("中国")) {
            cta.setEmpty();
            cta.append("大陆");
        }
        return true;
    }
    */

    private boolean getSameWords(String name) {
        Map<String, String[]> maps = new HashMap<String, String[]>();
        maps.put("中国", new String[]{"大陆", "天朝"});
        maps.put("我", new String[]{"咱", "俺"});
        String[] sws = maps.get(name);
        if (sws != null) {
            for (String s : sws) {
                sames.push(s);
            }
            return true;
        }
        return false;
    }
}
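One detail worth tightening: getSameWords() rebuilds its HashMap on every token, which is wasted work. A minimal sketch of an alternative (the class SameWordContext and its names are hypothetical, not part of the original project) builds the map once:

package org.lucene.util;

import java.util.HashMap;
import java.util.Map;

// Hypothetical helper: holds the synonym map in a static field so it is
// built once instead of once per token.
public class SameWordContext {

    private static final Map<String, String[]> SAMES = new HashMap<String, String[]>();

    static {
        SAMES.put("中国", new String[]{"大陆", "天朝"});
        SAMES.put("我", new String[]{"咱", "俺"});
    }

    // returns the synonyms for a term, or null if there are none
    public static String[] lookup(String name) {
        return SAMES.get(name);
    }
}

getSameWords() could then call SameWordContext.lookup(name) and push the result onto the stack, leaving incrementToken() unchanged.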

8. Printing token information

package org.lucene.util;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * Prints token information.
 */
public class AnalyzerUtils {

    public static void displayToken(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        /*
         * Think of the TokenStream as a stream of water and the
         * CharTermAttribute as a bowl dropped into it: once the bowl has
         * picked up one element, it automatically flows on to the next.
         * This is a design pattern: you create an attribute, the attribute
         * is added to the stream, and it advances along with the TokenStream.
         */
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        try {
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
                //System.out.println(stream);
                // Printing the stream itself calls toString(), which looks like:
                //(来,startOffset=1,endOffset=2,positionIncrement=1,type=<IDEOGRAPHIC>)
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints each token with its full details.
     */
    public static void displayAllToken(String str, Analyzer a) {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        // position increment
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
        // offsets
        OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
        // term text
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        // token type
        TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
        try {
            while (stream.incrementToken()) {
                System.out.print(pia.getPositionIncrement() + ":");
                System.out.print(cta + "[" + oa.startOffset() + "-" + oa.endOffset() + "-" + ta.type() + "]");
                System.out.println();
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
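A minimal usage sketch (the class name AnalyzerUtilsDemo is made up for illustration; it assumes the classes above plus Lucene 3.5 on the classpath):

package org.lucene.util;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class AnalyzerUtilsDemo {

    public static void main(String[] args) {
        Analyzer a = new StandardAnalyzer(Version.LUCENE_35);
        // prints each token in [term] form
        AnalyzerUtils.displayToken("how are you thank you", a);
        // prints positionIncrement:term[start-end-type] for each token
        AnalyzerUtils.displayAllToken("how are you thank you", a);
    }
}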

Project download: http://download.csdn.net/detail/wxwzy738/5284705