Chinese Word Segmentation with Lucene 2.4.0

JARs used:
lucene-core-2.4.0.jar // the Lucene core.
lucene-analyzers-2.4.0.jar // the analyzers.
paoding-analysis-2.0.4-alpha2 // the Paoding analyzer.
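
The code below indexes simple Bean objects. The original post never shows this class; here is a minimal sketch, assuming only the two getters the code actually calls (getTitle() and getText()):

public class Bean {
    private String title; // article title, indexed into the "title" field
    private String text;  // article body, indexed into the "text" field

    public Bean(String title, String text) {
        this.title = title;
        this.text = text;
    }

    public String getTitle() { return title; }
    public String getText() { return text; }
}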


// First, to search with Chinese word segmentation we have to build an index.
// Each file below needs the relevant subset of these imports (Lucene 2.4 API):

import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

// --- LuceneIndexService.java (the class name is not in the original post; it
// wraps the create/insert/delete methods so they compile) ---
public class LuceneIndexService {

    // Shared analyzer; the query side should use the same analyzer as the index side.
    Analyzer analyzer = new PaodingAnalyzer();

    public void createLuceneIndex(List<Bean> beanList) throws Exception {
        System.out.println("--------- start building the index ------------");
        Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
        // IndexWriter parameters: directory, analyzer, whether to re-create the index, max field length.
        IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, true, MaxFieldLength.LIMITED);
        // Create a RAM directory so documents are first buffered in memory.
        Directory ramDir = new RAMDirectory(fsDir);
        IndexWriter ramIndexWriter = new IndexWriter(ramDir, analyzer, MaxFieldLength.LIMITED);
        for (Bean bean : beanList) {
            // Convert the bean into a Document.
            Document doc = DocumentUtil.getDocuement(bean);
            System.out.println("doc number ------> : " + ramIndexWriter.numRamDocs());
            System.out.println("article title ----> : " + bean.getText());
            ramIndexWriter.addDocument(doc);
        }
        // Optimize and close the RAM index writer.
        ramIndexWriter.optimize();
        ramIndexWriter.close();
        // Merge the in-memory index into the on-disk index.
        fsindexWriter.addIndexesNoOptimize(new Directory[]{ ramDir });
        System.out.println("--------- index built successfully ------------");
        fsindexWriter.optimize();
        fsindexWriter.close();
    }

    // Insert a single document into an existing index.
    public synchronized boolean insertIndex(Bean bean) throws Exception {
        System.out.println("start inserting data........");
        // Open a reader just to print the document count before the insert; this check is optional.
        IndexReader indexReader = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
        System.out.println("count before insert: --> " + indexReader.numDocs());
        indexReader.close();
        // Open the index directory.
        Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
        // Create the writer; false means append to the existing index rather than re-create it.
        IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, false, MaxFieldLength.LIMITED);
        Document doc = DocumentUtil.getDocuement(bean); // convert the bean into a Document.
        fsindexWriter.addDocument(doc); // add the document.
        fsindexWriter.optimize();
        fsindexWriter.close();
        IndexReader indexReaderLast = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
        System.out.println("count after insert: --> " + indexReaderLast.numDocs());
        indexReaderLast.close();
        return true;
    }

    // Delete the document with the given gid.
    public synchronized boolean deleteIndex(int index) throws Exception {
        Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
        IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, false, MaxFieldLength.LIMITED);
        // Delete by the gid we assigned ourselves; looking it up finds exactly
        // the document we want to remove.
        fsindexWriter.deleteDocuments(new Term("gid", index + ""));
        fsindexWriter.commit();
        fsindexWriter.close();
        IndexReader indexReader2 = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
        System.out.println("count after delete ----------> " + indexReader2.numDocs());
        indexReader2.close();
        return true;
    }
}

// What we add to the index are Document objects, so DocumentUtil provides a
// method that converts a Bean into a Document.

// --- DocumentUtil.java ---
class DocumentUtil {

    // A global counter used as the gid of each document, which makes it easy
    // to operate on a single document later (e.g. for deletion).
    private static int index = 0;

    // Not shown in the original post; a minimal version implied by the
    // getNextIndex() call below.
    private synchronized static int getNextIndex() {
        return index++;
    }

    public synchronized static Document getDocuement(Bean bean) {
        Document doc = new Document();
        // Field parameters: field name, field value, whether to store the value
        // (Store.COMPRESS = compress, then store), and how to index it
        // (Index.NOT_ANALYZED = index without segmentation; Index.ANALYZED =
        // segment, then index; Index.NO = do not index).
        // gid is an ID field, so it is indexed NOT_ANALYZED: that way
        // deleteDocuments(new Term("gid", ...)) matches the exact term.
        doc.add(new Field("gid", getNextIndex() + "", Store.COMPRESS, Index.NOT_ANALYZED));
        doc.add(new Field("title", bean.getTitle(), Store.COMPRESS, Index.ANALYZED));
        doc.add(new Field("text", bean.getText(), Store.COMPRESS, Index.ANALYZED));
        return doc;
    }
}

// --- LuceneTest.java: searching the index ---
public class LuceneTest {
    public static void main(String[] args) throws Exception {
        String queryStr = "中国";
        // The query should use the same analyzer that was used to build the index.
        Analyzer analyzer = new PaodingAnalyzer();
        // Parse the query text into a Query object over the "title" field.
        String[] fields = {"title"};
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer);
        Query query = queryParser.parse(queryStr);
        // Filter; null means no filtering.
        Filter filter = null;
        IndexSearcher indexSearcher = new IndexSearcher("E:\\Workspaces\\Lucene\\FileIndex");
        // search() parameters: query object, filter, maximum number of documents to fetch at once.
        TopDocs topDocs = indexSearcher.search(query, filter, 10000);
        System.out.println("number of hits: " + topDocs.totalHits);
        for (ScoreDoc topDoc : topDocs.scoreDocs) {
            int docid = topDoc.doc; // Lucene's internal document number.
            System.out.println(docid);
            Document doc = indexSearcher.doc(docid);
            System.out.println(doc.get("text"));
        }
    }
}

// --- AnalyzerTest.java: a tokenization example for the analyzer ---
package com.testLucene.analyzer;

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class AnalyzerTest {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new PaodingAnalyzer();
        String str = "中华人民共和国";
        new AnalyzerTest().analyzer(analyzer, str);
    }

    public void analyzer(Analyzer analyzer, String str) throws Exception {
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(str));
        // Iterate over the tokens produced by the analyzer and print each one.
        for (Token token = new Token(); (token = tokenStream.next(token)) != null; ) {
            System.out.println(token);
        }
    }
}
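
Putting the pieces together, a usage sketch. The IndexDemo class name is hypothetical, and LuceneIndexService is the wrapper class introduced above (neither name appears in the original post); the paths and analyzer are the ones used throughout the article:

import java.util.ArrayList;
import java.util.List;

public class IndexDemo {
    public static void main(String[] args) throws Exception {
        // Build a couple of sample beans and index them.
        List<Bean> beans = new ArrayList<Bean>();
        beans.add(new Bean("中国新闻", "一篇关于中国的文章."));
        beans.add(new Bean("世界新闻", "一篇关于世界的文章."));

        LuceneIndexService service = new LuceneIndexService();
        service.createLuceneIndex(beans);                  // build the index from scratch
        service.insertIndex(new Bean("新标题", "新内容")); // append one more document
        service.deleteIndex(0);                            // delete the document whose gid is 0
    }
}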


How to configure the Paoding analyzer.
Version: paoding-analysis-2.0.4-alpha2
Using Paoding requires setting an environment variable (I have not tested whether it works without one); a quick smoke test follows the steps below.
1. Set the environment variable PAODING_DIC_HOME = E:\MyDocument\paoding-analysis-2.0.4-alpha2\dic
2. Copy E:\MyDocument\paoding-analysis-2.0.4-alpha2\src\paoding-dic-home.properties into the project's src directory.
3. Edit paoding-dic-home.properties so that it contains:
paoding.dic.home=/MyDocument/paoding-analysis-2.0.4-alpha2/dic
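
Once the dictionary path is set up, a quick smoke test (the class name PaodingSmokeTest is my own; the API calls are the same ones used in AnalyzerTest above). Paoding typically fails as soon as the analyzer is constructed if it cannot locate the dictionary, so this catches configuration mistakes early:

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class PaodingSmokeTest {
    public static void main(String[] args) throws Exception {
        // Constructing the analyzer already resolves the dictionary path.
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("content", new StringReader("中文分词"));
        for (Token t = new Token(); (t = ts.next(t)) != null; ) {
            System.out.println(t); // prints each segmented term
        }
    }
}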

If you get the error "Caused by: java.lang.ClassNotFoundException: org.apache.commons.logging.LogFactory",
add one more JAR to the classpath: commons-logging-1.0.4.jar.
With that, the Paoding analyzer is ready to use.