Lucene Tutorial 7 -- Lucene Example Code


Without further ado, here is the example code. If you have read the previous articles in this series, this code will be easy for you; understanding it is what matters most.

The following two files belong to the same project:

IndexDocument.java

package baseSample;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;

public class IndexDocument {

    public static Directory getIndexDirectory(Directory directory, Analyzer analyzer)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter iwriter = new IndexWriter(directory, analyzer, true,
                new IndexWriter.MaxFieldLength(25000));

        // Tuning knobs for the indexing process
        // iwriter.setMergeFactor(10);     // merge factor
        // iwriter.setMaxMergeDocs(2000);  // maximum number of documents per segment
        // iwriter.setMaxBufferedDocs(1);  // number of documents buffered in memory

        // news fields
        Field newsId = null;
        Field newsName = null;
        Field publishDate = null;
        Field newsSource = null;
        Field newssummay = null;

        // News article 1
        Document doc1 = new Document();
        newsId = new Field("newsId", "aaaa", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsName = new Field("newsName", "江苏常州曝疫苗造假大案7人被捕超百万人受害",
                Field.Store.YES, Field.Index.ANALYZED);
        publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES,
                Field.Index.NOT_ANALYZED);
        newsSource = new Field("newsSource", "网易新闻频道", Field.Store.YES,
                Field.Index.ANALYZED);
        newssummay = new Field("newssummay",
                "据香港明报报道,江苏常州爆出疫苗造假大案。当地著名疫苗生产商江苏延申生物科技股份有限公司(简称“江苏延申”)被国家药监局查实在疫苗生产过程中长期故意造假,导致大量问题疫苗流向市场,受害者最少超过100万人。",
                Field.Store.YES, Field.Index.ANALYZED);
        doc1.add(newsId);
        doc1.add(newsName);
        doc1.add(publishDate);
        doc1.add(newsSource);
        doc1.add(newssummay);
        iwriter.addDocument(doc1);

        // News article 2
        Document doc2 = new Document();
        newsId = new Field("newsId", "bbbb", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsName = new Field("newsName", "富士康一月内发生三起坠楼案", Field.Store.YES,
                Field.Index.ANALYZED);
        publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES,
                Field.Index.NOT_ANALYZED);
        newsSource = new Field("newsSource", "广州日报", Field.Store.YES, Field.Index.ANALYZED);
        newssummay = new Field("newssummay",
                "昨日凌晨3时左右,富士康科技集团龙华厂区的一名23岁湖南籍男性员工从宿舍楼上坠下,当场死亡",
                Field.Store.YES, Field.Index.ANALYZED);
        doc2.add(newsId);
        doc2.add(newsName);
        doc2.add(publishDate);
        doc2.add(newsSource);
        doc2.add(newssummay);
        iwriter.addDocument(doc2);

        // News article 3
        Document doc3 = new Document();
        newsId = new Field("newsId", "cccc", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsName = new Field("newsName", "普京称要消灭掉制造地铁爆炸案恐怖分子", Field.Store.YES,
                Field.Index.ANALYZED);
        publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES,
                Field.Index.NOT_ANALYZED);
        newsSource = new Field("newsSource", "网易新闻频道", Field.Store.YES,
                Field.Index.ANALYZED);
        newssummay = new Field("newssummay",
                "据外电报道,俄罗斯总理普京29日表示,当天制造莫斯科地铁连环爆炸案的恐怖分子一定会被抓到,并被消灭掉。",
                Field.Store.YES, Field.Index.ANALYZED);
        doc3.add(newsId);
        doc3.add(newsName);
        doc3.add(publishDate);
        doc3.add(newsSource);
        doc3.add(newssummay);
        // doc3.setBoost(2);
        iwriter.addDocument(doc3);

        // News article 4 (given id "dddd" so that each document's key stays unique)
        Document doc4 = new Document();
        newsId = new Field("newsId", "dddd", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsName = new Field("newsName", "最天使", Field.Store.YES, Field.Index.ANALYZED);
        publishDate = new Field("publishDate", "2009/3/30", Field.Store.YES,
                Field.Index.NOT_ANALYZED);
        newsSource = new Field("newsSource", "易", Field.Store.YES, Field.Index.ANALYZED);
        newssummay = new Field("newssummay", "长肥了", Field.Store.YES, Field.Index.ANALYZED);
        doc4.add(newsId);
        doc4.add(newsName);
        doc4.add(publishDate);
        doc4.add(newsSource);
        doc4.add(newssummay);
        iwriter.addDocument(doc4);

        iwriter.close();
        return directory;
    }
}
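
The IndexWriter used above can also delete and replace documents. Because newsId is indexed NOT_ANALYZED, it works as a primary key. Below is a minimal sketch of index maintenance against that key; the class name MaintainIndex and the replacement content are my own illustration, not part of the original project, and it assumes the IndexDocument class above is on the classpath.

MaintainIndex.java

package baseSample;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class MaintainIndex {

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
        // Build the sample index from IndexDocument above, here in memory
        Directory directory = IndexDocument.getIndexDirectory(new RAMDirectory(), analyzer);

        // Reopen the index for appending (create = false keeps the existing documents)
        IndexWriter writer = new IndexWriter(directory, analyzer, false,
                new IndexWriter.MaxFieldLength(25000));

        // Delete by primary key: removes the news item indexed as "aaaa"
        writer.deleteDocuments(new Term("newsId", "aaaa"));

        // updateDocument = delete-by-term plus add, applied together
        Document replacement = new Document();
        replacement.add(new Field("newsId", "bbbb", Field.Store.YES, Field.Index.NOT_ANALYZED));
        replacement.add(new Field("newsName", "富士康新闻(已更新)", Field.Store.YES,
                Field.Index.ANALYZED));
        writer.updateDocument(new Term("newsId", "bbbb"), replacement);

        writer.close();
        System.out.println("index maintenance done");
    }
}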

SampleSearch.java

package baseSample;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class SampleSearch {

    public static void main(String[] args) throws CorruptIndexException,
            LockObtainFailedException, IOException, ParseException {
        // Store the index in memory:
        // Directory directory = new RAMDirectory();
        // To store an index on disk, use this instead:
        File file = new File("D:/mapreduce-out/lucenetmp/cache.txt");
        if (file.exists()) {
            System.out.println("File already exists; deleting it");
            file.delete();
        }
        Directory directory = FSDirectory.open(file);

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        analyzer = new CJKAnalyzer(Version.LUCENE_30); // replace it with CJK bigram analysis for Chinese text

        // Now search the index; this step also writes Lucene's index files
        IndexSearcher isearcher = new IndexSearcher(
                IndexDocument.getIndexDirectory(directory, analyzer), true);

        /**
         * The main search methods of IndexSearcher:
         * isearcher.search(Query query, Collector results);
         * isearcher.search(Query query, int n);
         * isearcher.search(Query query, Filter filter, Collector results);
         */

        // A Term is the basic unit of a query.
        // 1. TermQuery
        Query termQuery = new TermQuery(new Term("newsSource", "网易"));
        System.out.println("--- termQuery : " + termQuery.toString());

        // 2. BooleanQuery; Lucene similarly provides RangeQuery (TermRangeQuery in 3.x)
        //    for range search, PrefixQuery for prefix search, FuzzyQuery for fuzzy search, etc.
        Query a = new TermQuery(new Term("newsSource", "网"));
        Query b = new TermQuery(new Term("newsSource", "易"));
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(a, BooleanClause.Occur.MUST);
        booleanQuery.add(b, BooleanClause.Occur.MUST);
        System.out.println("--- booleanQuery :" + booleanQuery.toString());

        // 3. Use QueryParser to tokenize a string into a Query
        System.out.println("Current Lucene version : " + Version.LUCENE_CURRENT);
        QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "newsSource", analyzer);
        parser.setDefaultOperator(QueryParser.AND_OPERATOR); // the default operator between terms is OR; switch it to AND
        Query parserQuery = parser.parse("java lucene");
        System.out.println("--- parserQuery : " + parserQuery.toString());

        // 4. Use MultiFieldQueryParser to query several fields at once
        String[] fields = { "newsName", "newsSource" };
        MultiFieldQueryParser mparser = new MultiFieldQueryParser(
                Version.LUCENE_CURRENT, fields, analyzer);
        Query mQuery = mparser.parse("江苏");
        System.out.println("---- mQuery :" + mQuery);

        ScoreDoc[] docs = isearcher.search(termQuery, 10).scoreDocs;
        for (int i = 0; i < docs.length; i++) {
            System.out.println(docs[i].doc);
            System.out.println("searcher score :" + docs[i].score);
            Document hitDoc = isearcher.doc(docs[i].doc);
            System.out.println("--- explain : " + isearcher.explain(termQuery, docs[i].doc));
            System.out.println("boost:" + hitDoc.getBoost());
            System.out.println("newsId:" + hitDoc.get("newsId"));
            System.out.println("newsName:" + hitDoc.get("newsName"));
            System.out.println("publishDate:" + hitDoc.get("publishDate"));
            System.out.println("newsSource:" + hitDoc.get("newsSource"));
            System.out.println("newssummay:" + hitDoc.get("newssummay"));
            System.out.println("------------------------------------------");
        }
    }
}

 

The next two files also go together:

TextFileIndexer.java

package lighter.javaeye.com;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TextFileIndexer {

    public static void main(String[] args) throws IOException {
        // Location of the folder whose files will be indexed
        File fileDir = new File("D:/mapreduce-out/lucenetmp/demo1");
        // Location where the generated index files will be stored
        File indexDir = new File("D:/mapreduce-out/lucenetmp/demo2");

        Directory docx = FSDirectory.open(indexDir);
        Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        // Index at most the first 100 terms of each field
        IndexWriter.MaxFieldLength mf = new MaxFieldLength(100);
        IndexWriter indexWriter = new IndexWriter(docx, luceneAnalyzer, mf);

        File[] textFiles = fileDir.listFiles();
        long startTime = new Date().getTime();
        for (int i = 0; i < textFiles.length; i++) {
            if (textFiles[i].isFile() && textFiles[i].getName().endsWith(".txt")) {
                System.out.println("File " + textFiles[i].getCanonicalPath()
                        + " is being indexed");
                String temp = fileReaderAll(textFiles[i].getCanonicalPath(), "GBK");
                System.out.println("temp = " + temp);
                Document document = new Document();
                Field fieldPath = new Field("path", textFiles[i].getPath(),
                        Field.Store.YES, Field.Index.NO);
                Field fieldBody = new Field("body", temp, Field.Store.YES,
                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
                document.add(fieldPath);
                document.add(fieldBody);
                indexWriter.addDocument(document);
            }
        }
        // optimize() merges the index segments for faster searching
        indexWriter.optimize();
        indexWriter.close();

        long endTime = new Date().getTime();
        System.out.println("It took " + (endTime - startTime)
                + " milliseconds to add the documents under " + fileDir.getPath()
                + " to the index!");
    }

    public static String fileReaderAll(String fileName, String charset)
            throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(fileName), charset));
        String line;
        StringBuilder temp = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            temp.append(line);
        }
        reader.close();
        return temp.toString();
    }
}
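
Before querying, it can help to confirm what TextFileIndexer actually wrote. Here is a small sketch of my own (not from the original article) that opens the demo2 index read-only with IndexReader and lists the stored path field of every live document:

InspectIndex.java

package lighter.javaeye.com;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class InspectIndex {

    public static void main(String[] args) throws IOException {
        Directory directory = FSDirectory.open(new File("D:/mapreduce-out/lucenetmp/demo2"));
        IndexReader reader = IndexReader.open(directory, true); // true = read-only
        System.out.println("documents in index: " + reader.numDocs());
        // maxDoc() counts deleted slots too, so skip any deleted documents
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (!reader.isDeleted(i)) {
                Document doc = reader.document(i);
                System.out.println("path = " + doc.get("path"));
            }
        }
        reader.close();
    }
}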

TestQuery.java

package lighter.javaeye.com;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TestQuery {

    public static void main(String[] args) throws IOException {
        TopDocs topDoc = null;
        String queryString = "中华";
        Query query = null;

        Directory directory = FSDirectory.open(new File("D:/mapreduce-out/lucenetmp/demo2"));
        IndexSearcher search = new IndexSearcher(directory);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        try {
            QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "body", analyzer);
            query = qp.parse(queryString);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        if (search != null) {
            topDoc = search.search(query, 100);
            if (topDoc.getMaxScore() > 0) {
                System.out.println("topDoc.totalHits: " + topDoc.totalHits);
                System.out.println("topDoc.getMaxScore(): " + topDoc.getMaxScore());
                System.out.println("topDoc.toString(): " + topDoc.toString());
            } else {
                System.out.println("No results found");
            }
        }
    }
}
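
TestQuery only prints aggregate TopDocs statistics. To read the hits themselves, iterate topDoc.scoreDocs and fetch each stored document through the searcher, just as SampleSearch does. A minimal sketch along those lines (my addition; it assumes the same demo2 index and body field):

PrintHits.java

package lighter.javaeye.com;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class PrintHits {

    public static void main(String[] args) throws IOException, ParseException {
        IndexSearcher searcher = new IndexSearcher(
                FSDirectory.open(new File("D:/mapreduce-out/lucenetmp/demo2")));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        Query query = new QueryParser(Version.LUCENE_CURRENT, "body", analyzer).parse("中华");

        // scoreDocs holds one (internal docId, score) pair per hit
        ScoreDoc[] hits = searcher.search(query, 100).scoreDocs;
        for (ScoreDoc hit : hits) {
            Document doc = searcher.doc(hit.doc); // fetch the stored fields
            System.out.println(hit.score + "\t" + doc.get("path"));
            System.out.println(doc.get("body"));
        }
        searcher.close();
    }
}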



