Lucene 学习笔记

来源：互联网发布：杜兰特vs詹姆斯数据编辑：程序博客网时间：2024/06/03 16:00
package com.test.lucene;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStreamReader;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.IndexWriterConfig.OpenMode;import org.apache.lucene.queryParser.MultiFieldQueryParser;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.store.LockObtainFailedException;import org.apache.lucene.store.RAMDirectory;import org.apache.lucene.util.Version;import org.junit.Test;public class Main {private static final Version version = Version.LUCENE_35;private static final Analyzer analyzer = new StandardAnalyzer(version);private static final File indexDir = new File("E:/lucene/index");private static final File dataDir = new File("E:/lucene/data");/** * 建立索引 *  * @throws Exception */public void index() throws Exception {IndexWriter writer = getIndexWriter();File[] files = dataDir.listFiles();for (File file : files) {if (file.isDirectory()) {// 略过文件夹continue;}Document doc = getDoc(file);writer.addDocument(doc);}writer.close();}private IndexWriter getIndexWriter() {IndexWriterConfig iwc = null;IndexWriter writer = null;try {Directory dir = null;dir = FSDirectory.open(indexDir);// 索引文件保存在文件系统上， 存在io操作， 速度较慢dir = new RAMDirectory();// 放在内存里， 速度快， 没有io操作， 但是程序一退出， 就没有了// 可以结合以上两个优点：索引存放在文件系统上， 程序启动时， 把索引库读到内存，// 
程序退出时， 把经过增删改的索引库保存会硬盘上// 1.启动时读入writer = new IndexWriter(dir, iwc);} catch (Exception e) {e.printStackTrace();}return writer;}/** * 测试两种存放方式： *  * <pre> * 1.启动时从filesystem加载索引到ram  * 2.对ram中的索引进行增删改查 * 3.退出时保存：从ram到filesystem * </pre> */@Testpublic void test() {IndexWriterConfig iwc1 = null;IndexWriterConfig iwc2 = null;IndexWriter fsWriter = null;IndexWriter ramWriter = null;// indexWriterConfig 不能用两次： the object cannot be set twice!iwc1 = new IndexWriterConfig(version, analyzer);iwc1.setOpenMode(OpenMode.CREATE_OR_APPEND);// ram中，添加文档，使用创建或追加iwc2 = new IndexWriterConfig(version, analyzer);iwc2.setOpenMode(OpenMode.CREATE);// 因为是从ram中写入， ram中保存的是最新的， 所以直接创建try {Directory fsDir = FSDirectory.open(indexDir);Directory ramDir = new RAMDirectory(fsDir);// 从systemfile加载ramWriter = new IndexWriter(ramDir, iwc2);// 操作内存索引的writerramWriter.addDocument(getDoc(new File("E:/lucene/data/test")));// 直接添加，方便点ramWriter.commit();ramWriter.close();// 关闭后才能把ram中最新的索引写回systemfilefsWriter = new IndexWriter(fsDir, iwc1);// 操作硬盘索引的writerfsWriter.addIndexes(ramDir);fsWriter.close();} catch (Exception e) {e.printStackTrace();}}private Document getDoc(File file) {Document doc = new Document();/** * 网页搜索时有：url地址， 标题， 内容等， 而通常不需要通过url进行搜索， 但是url还是得存起来， * 这时需要用Field.Store.YES， Field.Index.NOT_ANALYZED <br/> * 索引 *  * <pre> * +--+不索引  * +--+索引 * +---+分词  * +---+不分词 * </pre> */Field name = new Field("name", file.getName(), Field.Store.YES,Field.Index.ANALYZED);// 索引Field size = new Field("size", String.valueOf(file.length()),Field.Store.YES, Field.Index.NOT_ANALYZED);Field content = new Field("content", readFile(file), Field.Store.YES,Field.Index.ANALYZED);doc.add(name);doc.add(size);doc.add(content);return doc;}private String readFile(File file) {StringBuffer content = new StringBuffer();String line = "";BufferedReader reader = null;try {reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));while ((line = reader.readLine()) != null) {content.append(line).append("\n");}} 
catch (FileNotFoundException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return content.toString();}private Query getQuery(String fieldName, String key) throws Exception {// 单个field中索引QueryParser parser = new QueryParser(version, fieldName, analyzer);return parser.parse(key);}private Query getQuery(String[] fields, String key) throws Exception {// 多个field中检索QueryParser parser = new MultiFieldQueryParser(version, fields,analyzer);return parser.parse(key);}@Testpublic void search() throws Exception {IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));// 存放在文件系统// IndexReader reader = IndexReader.open(new RAMDirectory());// 存放在内存IndexSearcher searcher = new IndexSearcher(reader);Query query = getQuery(new String[] { "name", "content" }, "test");TopDocs hits = searcher.search(query, 100);// 100 是搜索最大记录数， 不是分页用的， 搞错了int total = hits.totalHits;if (total > 0) {System.out.println("共找到" + total + "条记录");} else {System.out.println("没有找到记录");}ScoreDoc[] scoreDocs = hits.scoreDocs;int start = 0;int end = hits.totalHits;// for (ScoreDoc doc : hits.scoreDocs) {//这样不便分页for (int i = start; i < end; i++) {// 可以分页int sn = scoreDocs[i].doc;// 相当于获取主键，Document document = searcher.doc(sn);// 根据主键获取文档print(document);}searcher.close();}private void print(Document doc) {System.out.println("--------------------------------------------------");System.out.println("name   :" + doc.get("name"));System.out.println("size   :" + doc.get("size"));System.out.println("content:\n" + doc.get("content"));}}