使用lucene进行信息检索

来源:互联网 发布:阿里云的域名如何解析 编辑:程序博客网 时间:2024/06/05 14:50
Apache Lucene是一个开放源代码的全文检索引擎工具包,利用它可以轻松地为Java软件加入全文搜索功能。Lucene最主要的工作是为文件中的每一个词建立索引,索引使搜索的效率比传统的逐字比较大大提高。Lucene提供了一组用于解析、过滤、分析文件以及建立和使用索引的API。它的强大之处除了高效和简单之外,最重要的是使用者可以随时根据自己的需要定制其功能。

首先在自己的电脑上建立几个文本文件供建立索引使用,放在下面代码所用的数据目录 d:/lucene/data 中(索引程序按GB2312编码读取文件,因此文本需以该编码保存)。我一共建了三个文本。

然后编写一个索引类:

package com.lucene.test;import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.IndexWriterConfig.OpenMode;import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version;public class TestFileIndex {   public static void main(String[] args) throws Exception {     String dataDir="d:/lucene/data";     String indexDir="d:/lucene/index";          File[] files=new File(dataDir).listFiles();          Analyzer analyzer=new SmartChineseAnalyzer(Version.LUCENE_36, true);    Directory dir=FSDirectory.open(new File(indexDir));         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);IndexWriter writer = new IndexWriter(dir, iwc);          for(int i=0;i<files.length;i++){      System.out.println("文件: "+files[i].getName()+"  索引建立中....");      Long startTime = System.currentTimeMillis();      StringBuffer strBuffer=new StringBuffer();       String line="";       FileInputStream is=new FileInputStream(files[i].getCanonicalPath());       BufferedReader reader=new BufferedReader(new InputStreamReader(is,"gb2312"));       line=reader.readLine();       while(line != null){         strBuffer.append(line);         strBuffer.append("\n");         line=reader.readLine();       }               Document doc=new Document();       doc.add(new Field("fileName", files[i].getName(), Field.Store.YES, Field.Index.ANALYZED));       doc.add(new Field("contents", strBuffer.toString(), Field.Store.YES, Field.Index.ANALYZED));       writer.addDocument(doc);       reader.close();       is.close();       
Long endTime = System.currentTimeMillis();      System.out.println("文件: "+files[i].getName()+"索引建立结束。      所用时间为:"+(endTime - startTime)+"毫秒"  );    }          writer.commit();    writer.close();     dir.close();     System.out.println("索引结束");   } } 
运行完,在index文件夹下会产生索引文件:



再编写一个searcher类,检索index文件夹,作为测试:

package com.lucene.test;import java.io.File; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class TestFileSearcher {   public static void main(String[] args) throws Exception {     String indexDir = "d:/lucene/index";     Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_36, true);    Directory dir = FSDirectory.open(new File(indexDir));         IndexReader reader;reader = IndexReader.open(FSDirectory.open(new File(indexDir)));IndexSearcher searcher = new IndexSearcher(reader);        QueryParser parser = new QueryParser(Version.LUCENE_36, "contents",analyzer);     Query query = parser.parse("青元");       // Term term=new Term("fileName", "test");    // TermQuery query=new TermQuery(term);          TopDocs docs=searcher.search(query, 1000);     ScoreDoc[] hits=docs.scoreDocs;     System.out.println(hits.length);     for(int i=0;i<hits.length;i++){       Document doc=searcher.doc(hits[i].doc);       System.out.print(doc.get("fileName")+"\n");      // System.out.println(doc.get("contents")+"\n");     }          searcher.close();     dir.close();   } }