lucene使用

来源：互联网发布：全球网络电视直播apk 编辑：程序博客网时间：2024/05/16 07:51

名词解释：

IndexWriter:lucene中最重要的类之一，它主要用来将文档加入索引，同时控制索引过程中一些参数的使用。
Analyzer:分析器,主要用于分析搜索引擎遇到的各种文本。常用的有StandardAnalyzer分析器,StopAnalyzer分析器,WhitespaceAnalyzer分析器等。
Directory:索引存放的位置;lucene提供了两种索引存放的位置，一种是磁盘，一种是内存。一般情况将索引放在磁盘上；相应地lucene提供了FSDirectory和 RAMDirectory两个类。
Document:文档;Document相当于一个要进行索引的单元，任何想要被索引的文件都必须转化为Document对象才能进行索引。
Field：字段。
IndexSearcher:是lucene中最基本的检索工具，所有的检索都会用到IndexSearcher工具;
Query:查询，lucene中支持模糊查询，语义查询，短语查询，组合查询等等,如有TermQuery,BooleanQuery,RangeQuery,WildcardQuery等一些类。
QueryParser: 是一个解析用户输入的工具，可以通过扫描用户输入的字符串，生成Query对象。
Hits:在搜索完成之后，需要把搜索结果返回并显示给用户，只有这样才算是完成搜索的目的。在lucene中，搜索的结果的集合是用Hits类的实例来表示的。
示例如下:
package cn.com.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

/**
 * Static helpers for building a Lucene index from the files of a directory
 * and for searching the "content" field of that index.
 *
 * @author op_xiaoyang@yeah.net
 */
public class LuceneUtil {

    /**
     * Converts a file into a Lucene document with the stored, tokenized
     * fields "name", "path", "size" and "content".
     *
     * @param file    the file to convert
     * @param charset name of the charset used to decode the file content
     * @return the document describing the file
     * @throws IOException if the file cannot be read
     */
    public static Document fileToDoc(File file, String charset) throws IOException {
        Document document = new Document();
        document.add(new Field("name", file.getName(), Store.YES, Index.TOKENIZED));
        document.add(new Field("path", file.getAbsolutePath(), Store.YES, Index.TOKENIZED));
        document.add(new Field("size", String.valueOf(file.length()), Store.YES, Index.TOKENIZED));
        document.add(new Field("content", ReaderFileAll(file.getCanonicalPath(), charset),
                Store.YES, Index.TOKENIZED));
        return document;
    }

    /**
     * Reads a whole text file into a string.
     *
     * <p>Lines are joined with a single '\n' so that the last word of one
     * line and the first word of the next stay separate tokens when the text
     * is analyzed.
     *
     * @param FileName path of the file to read
     * @param charset  name of the charset used to decode the file
     * @return the file content; an empty string for an empty file
     * @throws IOException if the file cannot be opened or read
     */
    public static String ReaderFileAll(String FileName, String charset) throws IOException {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(FileName), charset));
        try {
            // A buffer avoids the quadratic cost of repeated String concatenation.
            StringBuffer content = new StringBuffer();
            String line;
            boolean first = true;
            while ((line = reader.readLine()) != null) {
                if (!first) {
                    content.append('\n');
                }
                content.append(line);
                first = false;
            }
            return content.toString();
        } finally {
            // Release the file handle even when reading fails.
            reader.close();
        }
    }

    /**
     * Builds a fresh index directly on disk from the files of a directory.
     * Any index already present at {@code indexUrlWillIn} is overwritten.
     *
     * @param FileUrl        directory whose files are indexed
     * @param indexUrlWillIn directory the index is written to
     * @param agreeType      accepted file extensions (see
     *                       {@link #fileArrCreateindexTool})
     * @param charSet        name of the charset used to decode the files
     * @throws Exception if reading a file or writing the index fails
     */
    public static void standardCreateIndex(String FileUrl, String indexUrlWillIn,
            StringBuffer agreeType, String charSet) throws Exception {
        long startTime = new Date().getTime();
        File files = new File(FileUrl);
        File indexsUrl = new File(indexUrlWillIn);
        Analyzer luceneAnalyzer = new StandardAnalyzer();
        IndexWriter indexWriter = new IndexWriter(indexsUrl, luceneAnalyzer, true);
        try {
            if (files.canRead()) {
                File[] textFiles = files.listFiles();
                fileArrCreateindexTool(textFiles, agreeType, indexWriter, charSet);
            }
            indexWriter.optimize();
        } finally {
            // Always close the writer so the index write lock is released on failure.
            indexWriter.close();
        }
        long endTime = new Date().getTime();
        System.out.println("创建" + files.getPath() + "其中文件索引完毕,花费时间"
                + (endTime - startTime) + "ms");
    }

    /**
     * Builds a fresh index in memory first and then writes it to disk in one
     * step. Any index already present at {@code indexUrlWillIn} is overwritten.
     *
     * @param FileUrl        directory whose files are indexed
     * @param indexUrlWillIn directory the index is written to
     * @param agreeType      accepted file extensions (see
     *                       {@link #fileArrCreateindexTool})
     * @param charSet        name of the charset used to decode the files
     * @throws Exception if reading a file or writing the index fails
     */
    public static void cacheCreateIndex(String FileUrl, String indexUrlWillIn,
            StringBuffer agreeType, String charSet) throws Exception {
        long startTime = new Date().getTime();
        Directory fsDir = FSDirectory.getDirectory(indexUrlWillIn, false);
        // Start from an empty in-memory directory: the writer below is opened
        // with create=true, so copying the existing on-disk index into memory
        // first would only be discarded again.
        Directory raDir = new RAMDirectory();
        Analyzer luceneAnalyzer = new StandardAnalyzer();
        File files = new File(FileUrl);

        IndexWriter indexWriter = new IndexWriter(raDir, luceneAnalyzer, true);
        try {
            if (files.canRead()) {
                File[] textFiles = files.listFiles();
                fileArrCreateindexTool(textFiles, agreeType, indexWriter, charSet);
            }
            indexWriter.optimize();
        } finally {
            indexWriter.close();
        }

        // Persist the in-memory index to disk.
        IndexWriter fsIndexWriter = new IndexWriter(fsDir, luceneAnalyzer, true);
        try {
            fsIndexWriter.addIndexes(new Directory[] { raDir });
        } finally {
            // Always close the writer so the index write lock is released on failure.
            fsIndexWriter.close();
        }

        long endTime = new Date().getTime();
        System.out.println("创建" + files.getPath() + "其中文件索引完毕,花费时间"
                + (endTime - startTime) + "ms");
    }

    /**
     * Searches the "content" field of an index.
     *
     * <p>The searcher is intentionally left open: {@code Hits} loads documents
     * lazily through it, so closing it here would break {@code hits.doc(i)}
     * for the caller.
     *
     * @param queryString query in QueryParser syntax
     * @param FileUrl     path of the index directory
     * @return the matching hits
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public static Hits searchField(String queryString, String FileUrl) throws Exception {
        Analyzer analyMachine = new StandardAnalyzer();
        QueryParser parser = new QueryParser("content", analyMachine);
        Query query = parser.parse(queryString);
        IndexSearcher searcher = new IndexSearcher(FileUrl);
        return searcher.search(query);
    }

    /**
     * Adds every regular file with an accepted extension to the index.
     *
     * <p>A file is accepted when the text after its last '.' (or the whole
     * name when there is no '.') occurs as a substring of {@code agreeType}.
     *
     * @param textFiles   candidate files; {@code null} (as returned by
     *                    {@code File.listFiles()} for a non-directory) is
     *                    treated as "no files"
     * @param agreeType   accepted file extensions
     * @param indexWriter writer the documents are added to
     * @param charSet     name of the charset used to decode the files
     * @throws Exception if reading a file or adding it to the index fails
     */
    public static void fileArrCreateindexTool(File[] textFiles, StringBuffer agreeType,
            IndexWriter indexWriter, String charSet) throws Exception {
        if (textFiles == null) {
            return;
        }
        for (int i = 0; i < textFiles.length; i++) {
            // '.' is a regex metacharacter and must be escaped for split().
            String[] tempString = (textFiles[i].getName()).split("\\.");
            if (textFiles[i].isFile()
                    && agreeType.indexOf(tempString[tempString.length - 1]) != -1) {
                System.out.println(textFiles[i].getCanonicalPath() + "开始创建索引");
                Document document = LuceneUtil.fileToDoc(textFiles[i], charSet);
                indexWriter.addDocument(document);
                System.out.println(textFiles[i].getCanonicalPath() + "索引完毕");
            }
        }
    }
}
-----------------------------------------------------------------------------main-------------------------------------------------------------------------------
package cn.com;import org.apache.lucene.document.Document;import org.apache.lucene.search.Hits;import cn.com.util.LuceneUtil;/* * @author op_xiaoyang@yeah.net */public class LuceneUniversalMethod {//文件路径static String fileUrl = "F:\\文字类\\建站";//字符集设置static String setString = "gbk";//查询语句static String queryString = "定律";//允许格式、static String allowType="txt";/* * main */public static void main(String[] args) {try {//LuceneUtil.standardCreateIndex(fileUrl, fileUrl + "\\index", new StringBuffer(allowType), setString);LuceneUtil.cacheCreateIndex(fileUrl, fileUrl + "\\index", new StringBuffer(allowType), setString);Hits content = LuceneUtil.searchField(queryString, fileUrl+ "\\index");System.out.println("搜索->" + queryString + "<-共记录"+ content.length() + "条");for(int i=0;i<content.length();i++){Document condoc = content.doc(i);System.out.println(condoc.get("content"));}} catch (Exception e) {e.printStackTrace();}}}
那么lucene如何查询数据库里的东西呢？

先表述一下思想：参考（http://topic.csdn.net/t/20061211/20/5223023.html）
1.写一段传统的JDBC程序，将每条用户信息从数据库读取出来
2.针对每条用户记录，建立一个lucene document
Document doc = new Document();
    并根据你的需要，将用户信息的各个字段对应lucene document中的field 进行添加，如：
doc.add(new Field( "NAME ", "USERNAME ", Field.Store.YES,Field.Index.UN_TOKENIZED));
    然后将该条doc加入到索引中，如： luceneWriter.addDocument(doc);
    这样就建立了lucene的索引库
3.编写对索引库的搜索程序（看lucene文档），通过对lucene的索引库的查找，你可以快速找到对应记录的ID
4.通过ID到数据库中查找相关记录

以下为程序：
//下星期弄全文精确高级检索，写完再贴代码
关于搜索searchIndex
1、首先，要知道Weight(接口)存在的目的：
使得检索不改变一个Query，使得Query可以重用。所以就出现了Weight，一个Weight可以保存与某次检索相关的IndexSearcher检索器的独立状态值。其实Weight间接保存了IndexSearcher检索器的独立状态信息。
每次检索，即初始化一个IndexSearcher检索器，都需要一个Query，例如
   Query query = new TermQuery(term);
    Hits hits = searcher.search(query);
而Query抽象了用户的检索意向信息，可以使用Query的public Query rewrite(IndexReader reader)方法来实现对先前的检索意向信息的修改(重写)。
用户的一次检索，是与一个Weight对应的，当然可以不保存本次检索相关的IndexSearcher检索器的状态信息到一个Weight中，这样的坏处就是Query不能重用，每次都要重新实例化一个。
Weight接口定义了如下的内容：
// Per-search companion of a Query, as quoted in the surrounding notes.
public interface Weight extends java.io.Serializable {
Query getQuery();    // the Query instance this Weight is associated with
float getValue();    // the weight value of the Query associated with this Weight
float sumOfSquaredWeights() throws IOException;    // a Query may have many clauses (e.g. a BooleanQuery holding several TermQuery clauses); returns the sum of the squared weights of all clauses
void normalize(float norm);    // assigns the normalization factor for the query
Scorer scorer(IndexReader reader) throws IOException;   // obtains a Scorer for this Weight against the given IndexReader
Explanation explain(IndexReader reader, int doc) throws IOException;    // describes how the score is computed for the Document numbered doc
}
2、其次，知道Sort类是为一次检索设定排序方式的。
这些排序的方式是在SortField类中定义的，一共定义了7种，当然包括客户化定制排序方式。
3、再次，知道Explanation类是关于某次检索中，封装了对某个Document的得分计算的描述。
4、接着，知道TopDocs类是关于某次实际的检索出来结果集的信息，包括Hits数量，及其最大得分的信息。TopDocs的子类TopFieldDocs类指定了排序方式(Sort)，为Fields进行排序。
5、然后，知道FieldSelector是一个筛选器接口，将某个Document中的满足接受条件的Field返回。在FieldSelector中定义了FieldSelectorResult accept(String fieldName);方法。
6、最后，理解TopDocCollector类用于IndexSearcher的目的。其实TopDocCollector内部定义了一个collect()方法，该方法可以实现根据Document的得分来排序。TopDocCollector类继承自HitCollector，而HitCollector抽象类定义了实现查询(queries)、排序(sorting)、过滤(filtering)的功能。