Lucene 简单例子

来源：互联网 | 发布：安卓仿淘宝订单实现 | 编辑：程序博客网 | 时间：2024/04/28 02:59

建立索引

import java.io.File;
import java.io.FileReader;
import java.io.Reader;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Demonstrates how to create a Lucene index from plain-text files.
 *
 * <p>Every regular file whose name ends with {@code .txt} and that sits directly inside
 * the data directory becomes one document with two fields: {@code path} (the file's
 * canonical path) and {@code contents} (the text read from the file).
 */
public class TxtFileIndexer {
    /**
     * Rebuilds the index under {@code f://luceneIndex} from the text files found in
     * {@code f://luceneData} and prints how long the run took.
     *
     * <p>If the data directory cannot be listed, a message is printed and the method
     * returns without touching the index.
     *
     * @param args ignored
     * @throws Exception if a file cannot be read or the index cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Directory that hosts Lucene's index files.
        File indexDir = new File("f://luceneIndex");

        // Directory that hosts the text files to be indexed.
        File dataDir = new File("f://luceneData");

        // Enumerate the direct children of the data directory.
        File[] dataFiles = dataDir.listFiles();
        if (dataFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            // Stop here, before the IndexWriter below is opened with create=true, so that
            // an existing index is not overwritten when there is nothing to index.
            System.out.println("The data directory does not exist or cannot be read: "
                    + dataDir.getPath());
            return;
        }

        // Build the analyzer.
        // Original author's notes: the standard analyzer handles Asian languages in
        // addition to Latin ones, whereas SimpleAnalyzer only covers the simplest Latin
        // text. StandardAnalyzer(String[] stopWords) accepts a custom stop-word list,
        // which both drops uninformative terms and can be used to block disallowed
        // search keywords.
        Analyzer luceneAnalyzer = new StandardAnalyzer();

        // IndexWriter is the core class that adds Document objects to the index.
        // IndexWriter(path, analyzer, create): create=true builds a new index or
        // overwrites an existing one; create=false extends an existing index.
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);

        // Indexing start time.
        long startTime = System.currentTimeMillis();

        try {
            for (int i = 0; i < dataFiles.length; i++) {
                File dataFile = dataFiles[i];
                if (dataFile.isFile() && dataFile.getName().endsWith(".txt")) {
                    String canonicalPath = dataFile.getCanonicalPath();
                    System.out.println("Indexing file " + canonicalPath);

                    Reader txtReader = new FileReader(dataFile);
                    try {
                        Document document = new Document();
                        document.add(Field.Text("path", canonicalPath));
                        document.add(Field.Text("contents", txtReader));
                        // Add the document to the index.
                        indexWriter.addDocument(document);
                    } finally {
                        // Release the file handle even when indexing this file fails.
                        // Closing a Reader that is already closed has no effect.
                        txtReader.close();
                    }
                }
            }

            // Merge and optimize the index.
            // Original author's notes: to limit I/O, IndexWriter writes a new small
            // index segment after each batch of documents (the author observed a
            // minimum batch size of 10) and merges segments periodically, so
            // optimize() is called at the end to merge everything.
            indexWriter.optimize();
        } finally {
            // Always close the writer, including when a file fails to index.
            indexWriter.close();
        }

        // Indexing end time.
        long endTime = System.currentTimeMillis();

        System.out.println("It takes " + (endTime - startTime)
                + " milliseconds to create index for the files in directory "
                + dataDir.getPath());
    }
}

------------------------------------------------------------------------------------------------------------------------------------------------

搜索文档

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
/**
 * Demonstrates how to search an existing Lucene index.
 *
 * <p>Runs a single-term query against the {@code contents} field and prints the
 * {@code path} field of every matching document.
 */
public class TxtFileSearcher {
    /**
     * Searches the index under {@code f://luceneIndex} for the hard-coded term and
     * prints one line per hit.
     *
     * <p>If the index directory does not exist, a message is printed and the method
     * returns without opening the index.
     *
     * @param args ignored
     * @throws Exception if the index cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        // The search term.
        String queryStr = "ok";

        // Directory that hosts the Lucene index.
        File indexDir = new File("f://luceneIndex");

        // Check for the index before trying to open it; opening first would fail
        // before this message could ever be printed.
        if (!indexDir.exists()) {
            System.out.println("The Lucene index is not exist");
            return;
        }

        // Represents the location of an index stored in the file system.
        FSDirectory directory = FSDirectory.getDirectory(indexDir, false);
        try {
            // Searches an already-built index. Original author's note: it opens the
            // index read-only, so several IndexSearcher instances can work on the
            // same index.
            IndexSearcher searcher = new IndexSearcher(directory);
            try {
                // A Term is the basic unit of search and consists of two strings:
                // the name of the field to look in and the keyword to look for.
                Term term = new Term("contents", queryStr.toLowerCase());

                // TermQuery is a subclass of the abstract class Query and, per the
                // original author, the most basic query type Lucene supports. Its
                // constructor takes a single Term.
                TermQuery luceneQuery = new TermQuery(term);

                // Hits holds the search results.
                Hits hits = searcher.search(luceneQuery);
                for (int i = 0; i < hits.length(); i++) {
                    Document document = hits.doc(i);
                    System.out.println("File: " + document.get("path"));
                }
            } finally {
                // Release the searcher once the results have been printed.
                searcher.close();
            }
        } finally {
            // Release the directory handle obtained from FSDirectory.getDirectory.
            directory.close();
        }
    }
}