Lucene search engine: indexing text files

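This post collects three small examples built on the Lucene 2.x API: indexing a directory of plain-text files into an on-disk index, searching that index with a QueryParser, and building a throwaway in-memory index with RAMDirectory. The first class, TXTFileIndex, walks a data directory, turns every .txt file into a Document with url, path, and content fields, and writes it through an IndexWriter backed by a StandardAnalyzer.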
package cn.yws;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class TXTFileIndex {

    public static void main(String[] args) throws Exception {
        // Directory where the index is stored
        File indexDir = new File("d:\\luceneIndex");
        // Text files to be indexed
        File dataDir = new File("D:\\Workspaces\\MyEclipse 10\\lucene\\data");
        // Analyzer (tokenizer)
        Analyzer luceneAnalyzer = new StandardAnalyzer();
        // Index writer; "true" creates a new index, overwriting any existing one
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);

        File[] dataFiles = dataDir.listFiles();
        if (dataFiles == null) {
            System.err.println("dataFiles==null");
            return;
        }

        long startTime = new Date().getTime();
        for (int i = 0; i < dataFiles.length; i++) {
            if (dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")) {
                System.out.println("indexing file:" + dataFiles[i].getCanonicalPath());
                Document document = new Document();
                FileReader txtReader = new FileReader(dataFiles[i]);
                /*
                 * Index.NO:           the field is not indexed.
                 * Index.TOKENIZED:    the field is analyzed (tokenized) first, then indexed.
                 * Index.UN_TOKENIZED: the field is indexed as a single term, without tokenization.
                 * Index.NO_NORMS:     the field is indexed without an Analyzer and is excluded
                 *                     from scoring, mainly to reduce memory consumption.
                 */
                document.add(new Field("url", "http://www.lietu.com/segtest/",
                        Field.Store.YES, Field.Index.UN_TOKENIZED,
                        Field.TermVector.NO));
                document.add(new Field("path", dataFiles[i].getCanonicalPath(),
                        Field.Store.YES, Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));

                BufferedReader br = new BufferedReader(txtReader);
                String s;
                StringBuffer body = new StringBuffer();
                if ((s = br.readLine()) != null) {
                    // The first line could serve as a title; here it is only printed
                    // news.title = s;
                    System.out.println(s);
                    while ((s = br.readLine()) != null) {
                        body.append(s);
                        body.append('\n');
                    }
                }
                br.close();

                document.add(new Field("content", body.toString(),
                        Field.Store.YES, Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));
                indexWriter.addDocument(document);
            }
        }
        indexWriter.optimize();
        indexWriter.close();

        long endTime = new Date().getTime();
        System.out.println("It takes " + (endTime - startTime)
                + " milliseconds to create index for the files in directory " + dataDir.getPath());
    }
}
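The second class, TXTFileSearcher, opens the index created above, prints how many documents it contains, then parses a query against the content field and prints each hit together with its internal document id and score.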
package cn.yws;

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TXTFileSearcher {

    public static void main(String[] args) throws Exception {
        String indexDir = "d:\\luceneIndex";
        // Open the index stored in the given directory
        Directory dir = FSDirectory.getDirectory(new File(indexDir));
        IndexReader reader = IndexReader.open(dir);

        // Show how many documents the index contains
        System.out.println("Documents in the index: " + reader.numDocs());
        for (int i = 0; i < reader.numDocs(); i++) {
            // Uncomment to dump each stored document:
            // System.out.println(reader.document(i));
        }
        System.out.println("Number of documents in the index: " + reader.numDocs());
        reader.close();

        // Build the IndexSearcher
        IndexSearcher searcher = new IndexSearcher(dir);
        /*
         * An exact lookup on the untokenized "url" field could be done with a TermQuery instead:
         * Term t = new Term("url", "http://www.lietu.com/segtest/");
         * Query query = new TermQuery(t);
         */
        // Create a query parser for the "content" field, using the same StandardAnalyzer
        QueryParser queryparser = new QueryParser("content", new StandardAnalyzer());
        // Parse the user-supplied query string
        Query query = queryparser.parse("data 农民");
        // Run the query and collect up to the 100 highest-scoring documents
        TopDocs hits = searcher.search(query, null, 100);
        System.out.println("Number of matching documents: " + hits.totalHits);

        Document document;
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            document = searcher.doc(hits.scoreDocs[i].doc);
            System.out.println("Matched document: " + document);
            System.out.println("Doc id and score: " + hits.scoreDocs[i].doc + "," + hits.scoreDocs[i].score);
        }
        System.out.println("Highest score: " + hits.getMaxScore());
    }
}
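The last listing, adapted from http://my.oschina.net/lsw90/blog/186732, shows the same index-then-search cycle against a RAMDirectory, so everything stays in memory: a single document with two tokenized fields is indexed, and a TopDocCollector gathers the hits for the query "text".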
package cn.yws;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

// http://my.oschina.net/lsw90/blog/186732
public class Example {

    public static void main(String[] args) throws Exception {
        testIndexAndSearchold();
    }

    public static void testIndexAndSearchold() throws Exception {
        Analyzer analyzer = new StandardAnalyzer();

        // Store the index in memory:
        Directory directory = new RAMDirectory();
        // To store an index on disk, use this instead:
        // Directory directory = FSDirectory.getDirectory("/tmp/testindex");
        IndexWriter iwriter = new IndexWriter(directory, analyzer, true);

        Document doc = new Document();
        String text = "This is the text to be indexed. 你好啊 呵呵 内存索引";
        doc.add(new Field("fieldname", text, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("field", "text 麻痹的", Field.Store.YES, Field.Index.TOKENIZED));
        iwriter.addDocument(doc);
        iwriter.optimize();
        iwriter.close();

        // Now search the index:
        IndexSearcher isearcher = new IndexSearcher(directory);
        // Parse a simple query that searches for "text":
        QueryParser parser = new QueryParser("fieldname", analyzer);
        Query query = parser.parse("text");
        // Collect up to 10 top-scoring documents
        TopDocCollector collector = new TopDocCollector(10);
        isearcher.search(query, collector);
        System.out.println("TotalHits:" + collector.getTotalHits());
        // Iterate through the results:
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document hitDoc = isearcher.doc(hits[i].doc);
            System.out.println("This is the text to be indexed:" + hitDoc.get("fieldname"));
        }
        isearcher.close();
        directory.close();
    }
}
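RAMDirectory is convenient for unit tests and quick experiments because nothing touches the file system; for an index that must outlive the JVM, switch to FSDirectory as the commented-out line suggests. Note that all three listings target the pre-3.0 API (a no-argument StandardAnalyzer, Field.Index.TOKENIZED, IndexWriter.optimize()); newer Lucene releases renamed or removed these calls, so the code needs adjusting there.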


