Lucene 基础

来源:互联网 发布:c语言全套视频教程 编辑:程序博客网 时间:2024/04/29 21:22

刚刚开始学习Lucene,先从一个简单的实现来学习一下吧!


首先介绍一下几个比较重要的对象:

Document:一个要进行索引的单元,相当于数据库的一行记录,任何想要被索引的数据,都必须转化为Document对象存放。

Field:Document中的一个字段,相当于数据库中的Column。Field.Store.YES:域的原文内容会被存储到索引中;Field.Store.NO:原文不存储在索引文件中。

Field.Index常用的属性有三个:Field.Index.ANALYZED:分词后进行索引;Field.Index.NOT_ANALYZED:不分词直接进行索引,如作者名、日期等;Field.Index.NO:不进行索引,用于存放不需要被搜索的内容,如文档类型、URL等文档附加属性。
IndexWriter:负责将Document写入索引文件,索引器IndexWriter的功能主要就是创建索引,是建立索引工作中最核心的。当IndexWriter执行完addDocument方法后,一定要记得调用自身的close方法来关闭它。

Analyzer:分析器,主要用于文本分词。常用的有StandardAnalyzer分析器,StopAnalyzer分析器,WhitespaceAnalyzer分析器等。

Directory:索引存放的位置。lucene提供了两种索引存放的位置,一种是磁盘,一种是内存。一般情况将索引放在磁盘上;相应地lucene提供了FSDirectory和RAMDirectory两个类。

Segment,是Lucene索引文件的最基本的一个单位。Lucene说到底就是不断加入新的Segment,然后按一定的规则算法合并不同的Segment以合成新的Segment。


下面是我写的一个小例子:

package moheng;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Command-line tool that builds a Lucene index from every {@code .txt} file
 * found (recursively) under a data directory.
 *
 * @author Airpaul
 * @since 2011/08/21
 */
public class Indexer {

    /**
     * Entry point: indexes the data directory and prints how many documents
     * were indexed and how long it took.
     *
     * @param args {@code args[0]} is the directory the index is written to,
     *             {@code args[1]} is the directory whose files are indexed
     * @throws Exception if the argument count is wrong or indexing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            throw new Exception("Usage: java " + Indexer.class.getName()
                    + " <index dir> <data dir>");
        }
        File indexDir = new File(args[0]);
        File dataDir = new File(args[1]);
        long start = System.currentTimeMillis();
        int numIndexed = index(indexDir, dataDir);
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    /**
     * Opens an index in {@code indexDir} and adds every eligible file under
     * {@code dataDir} to it.
     *
     * @param indexDir directory in which the index is stored
     * @param dataDir  root directory of the files to index
     * @return the number of documents reported by the writer after indexing
     * @throws IOException if {@code dataDir} is missing or not a directory,
     *                     or if reading files / writing the index fails
     */
    public static int index(File indexDir, File dataDir) throws IOException {
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir
                    + " does not exist or is not a directory");
        }

        SimpleFSDirectory directory = new SimpleFSDirectory(indexDir);
        try {
            IndexWriter writer = new IndexWriter(directory,
                    new IndexWriterConfig(Version.LUCENE_33,
                            new StandardAnalyzer(Version.LUCENE_33)));
            try {
                indexDirectory(writer, dataDir);
                int numIndexed = writer.numDocs();
                writer.optimize();
                return numIndexed;
            } finally {
                // Always close the writer, even when indexing fails, so the
                // writer's resources are not left open.
                writer.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Recursively walks {@code dir}, indexing each file whose name ends with
     * {@code .txt}.
     *
     * @param writer the open index writer documents are added to
     * @param dir    the directory to traverse
     * @throws IOException if indexing one of the files fails
     */
    private static void indexDirectory(IndexWriter writer, File dir)
            throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be listed
            // (I/O error, no permission); skip it, as unreadable files are.
            return;
        }
        for (File f : files) {
            if (f.isDirectory()) {
                indexDirectory(writer, f);
            } else if (f.getName().endsWith(".txt")) {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Adds a single file to the index as one document with two fields:
     * {@code content} (read from the file) and {@code filename} (the file's
     * canonical path, stored so it can be retrieved from search hits).
     * Hidden, missing or unreadable files are skipped silently.
     *
     * @param writer the open index writer the document is added to
     * @param f      the file to index
     * @throws IOException if the file cannot be read or the document cannot
     *                     be added
     */
    private static void indexFile(IndexWriter writer, File f)
            throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        System.out.println("Indexing " + f.getCanonicalPath());
        FileReader reader = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(new Field("content", reader));
            doc.add(new Field("filename", true, f.getCanonicalPath(),
                    Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            writer.addDocument(doc);
        } finally {
            // Make sure the file handle is released even if addDocument
            // throws; closing an already-closed Reader has no effect.
            reader.close();
        }
    }
}


package moheng;
import java.io.File;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Command-line tool that runs a query against the {@code content} field of
 * an existing Lucene index and prints the {@code filename} of each hit.
 */
public class Searcher {

    /** Maximum number of hits requested from the index per search. */
    private static final int MAX_HITS = 10;

    /**
     * Entry point: validates the arguments and runs the search.
     *
     * @param args {@code args[0]} is the index directory,
     *             {@code args[1]} is the query string
     * @throws Exception if the arguments are invalid, the index directory
     *                   does not exist, or the search fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            throw new Exception("Usage: java " + Searcher.class.getName()
                    + " <index dir> <query>");
        }
        File indexDir = new File(args[0]);
        String q = args[1];
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir
                    + " does not exist or is not a directory.");
        }
        search(indexDir, q);
    }

    /**
     * Parses {@code q} against the {@code content} field, searches the index
     * in {@code indexDir}, and prints a summary line to standard error
     * followed by the stored {@code filename} of each hit on standard output.
     *
     * @param indexDir directory containing the index to search
     * @param q        query string in QueryParser syntax
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void search(File indexDir, String q) throws Exception {
        FSDirectory fsDir = new SimpleFSDirectory(indexDir);
        try {
            // Second argument opens the underlying reader read-only.
            IndexSearcher is = new IndexSearcher(fsDir, true);
            try {
                QueryParser qp = new QueryParser(Version.LUCENE_33, "content",
                        new StandardAnalyzer(Version.LUCENE_33));
                Query query = qp.parse(q);
                long start = System.currentTimeMillis();
                ScoreDoc[] hits = is.search(query, MAX_HITS).scoreDocs;
                long end = System.currentTimeMillis();
                System.err.println("Found " + hits.length + " document(s) (in "
                        + (end - start)
                        + " milliseconds) that matched query '" + q + "':");
                for (ScoreDoc hit : hits) {
                    Document doc = is.doc(hit.doc);
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Release the searcher when done, even if the search fails.
                is.close();
            }
        } finally {
            fsDir.close();
        }
    }
}





原创粉丝点击