搜索引擎lucene之文本文件索引
来源:互联网 发布:内江市淘宝贝幼儿园 编辑:程序博客网 时间:2024/04/27 10:22
package cn.yws;import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.util.Date;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexWriter;public class TXTFileIndex {public static void main(String[] args) throws Exception {//索引保存的目录File indexDir=new File("d:\\luceneIndex");//要索引的文本数据File dataDir=new File("D:\\Workspaces\\MyEclipse 10\\lucene\\data");//分词器Analyzer luceneAnalyzer=new StandardAnalyzer();//写索引器IndexWriter indexWriter=new IndexWriter(indexDir, luceneAnalyzer,true);File[] dataFiles=dataDir.listFiles();if(dataFiles==null){System.err.println("dataFiles==null");return ;}long startTime=new Date().getTime();for(int i=0;i<dataFiles.length;i++){if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")){System.out.println("indexing file:"+dataFiles[i].getCanonicalPath());Document document=new Document();FileReader txtReader=new FileReader(dataFiles[i]);/** * Index.NO:不需要索引,Index.TOKENIZED:先被分词再被索引 Index.UN_TOKENIZED:不对该Field进行分词,但会对它进行索引 Index.NO_NORMS:对该Field进行索引,但是不使用Analyzer,同时禁止它参加评分,主要是为了减少内存的消耗。 */document.add(new Field("url", "http://www.lietu.com/segtest/", Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.NO));document.add(new Field("path", dataFiles[i].getCanonicalPath(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));BufferedReader br = new BufferedReader(txtReader);String s;StringBuffer body=new StringBuffer(); if ( (s = br.readLine()) != null ) { //news.title = s; System.out.println(s);while( (s = br.readLine()) != null ) {body.append(s);body.append('\n');} } br.close(); document.add(new Field("content", body.toString(), Field.Store.YES,Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));indexWriter.addDocument(document);}}indexWriter.optimize();indexWriter.close();long endTime=new Date().getTime(); System.out.println("It takes " + (endTime - startTime) + " milliseconds to create index for the files in directory " + dataDir.getPath()); } }
package cn.yws;import java.io.File;import java.io.IOException;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;public class TXTFileSearcher {public static void main(String[] args) throws Exception {String indexDir = "d:\\luceneIndex" ;//打开指定目录下的索引Directory dir = FSDirectory.getDirectory(new File(indexDir));IndexReader reader = IndexReader.open(dir);//显示索引中的所有文档System.out.println("索引里面document列表:"+reader.numDocs());for(int i=0;i<reader.numDocs();i++){//System.out.println(reader.document(i));}System.out.println("索引里面document数量是:"+reader.numDocs());reader.close();IndexSearcher searcher = new IndexSearcher(dir);//构建IndexSearcher对象/**Term t = new Term("url", "http://www.lietu.com/segtest/");//Term t = new Term("content", "农民");Query query = new TermQuery(t);*/// 创建查询分析器,对File属性title进行查询,采用的分析器是StandardAnalyzerQueryParser queryparser = new QueryParser("content",new StandardAnalyzer());// 分析用户输入的字符串Query query = queryparser.parse("data 农民");//按照指定的query查询,获得评分最高的100个文档TopDocs hits = searcher.search(query,null, 100);System.out.println("查询到的文档数是:"+hits.totalHits);Document document;for(int i = 0;i<hits.totalHits;i++){document=searcher.doc(hits.scoreDocs[i].doc);System.out.println("查询到的文档是:"+document);System.out.println("查询到文档编号和评价分数是:"+hits.scoreDocs[i].doc+","+hits.scoreDocs[i].score);}System.out.println("文档最高评价分数是:"+hits.getMaxScore());}}
package cn.yws;import java.io.IOException;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.TopDocCollector;import org.apache.lucene.store.Directory;import org.apache.lucene.store.RAMDirectory;//http://my.oschina.net/lsw90/blog/186732public class Example {public static void main(String[] args) throws Exception {testIndexAndSearchold();}public static void testIndexAndSearchold() throws Exception {Analyzer analyzer = new StandardAnalyzer();// Store the index in memory:Directory directory = new RAMDirectory();// To store an index on disk, use this instead:// Directory directory = FSDirectory.getDirectory("/tmp/testindex");IndexWriter iwriter = new IndexWriter(directory, analyzer,true);Document doc = new Document();String text = "This is the text to be indexed. 你好啊 呵呵 内存索引";doc.add(new Field("fieldname", text, Field.Store.YES,Field.Index.TOKENIZED));doc.add(new Field("field", "text 麻痹的", Field.Store.YES,Field.Index.TOKENIZED));iwriter.addDocument(doc);iwriter.optimize();iwriter.close();// Now search the index:IndexSearcher isearcher = new IndexSearcher(directory);// Parse a simple query that searches for "text":QueryParser parser = new QueryParser("fieldname", analyzer);Query query = parser.parse("text");TopDocCollector hits = new TopDocCollector(0);isearcher.search(query, hits);System.out.println("TotalHits:"+hits.getTotalHits());// Iterate through the results:for (int i = 0; i < hits.getTotalHits(); i++) {Document hitDoc = isearcher.doc(i);System.out.println(("This is the text to be indexed:"+hitDoc.get("fieldname")));}isearcher.close();directory.close();}}
0 0
- 搜索引擎lucene之文本文件索引
- lucene 搜索引擎 创建索引过程
- lucene搜索引擎(文件索引、数据库索引)
- lucene搜索引擎(文件索引、数据库索引)
- WebGIS搜索引擎之Lucene
- java之lucene索引
- Lucene之删除索引
- Lucene之创建索引
- lucene 之创建索引
- 搜索引擎索引之索引基础
- 搜索引擎索引之索引基础
- Lucene 需要索引的文本文件太大,怎么解决?
- Lucene入门之创建索引
- lucene学习之创建索引
- lucene 4.6 之索引文件格式
- Lucene初探之索引文件格式
- Lucene之倒排索引
- lucene索引word/pdf/html/txt文件及检索(搜索引擎)
- JS实现拖动div改变大小
- Linux管理:将Redhat安装ISO设置为yum源
- 腾讯旗下支付公司财付通更名
- Myeclipse从SVN上加载项目时出现jar包混乱和项目里面的文件没错项目名上面有红叉解决方法
- declare-styleable中format详解
- 搜索引擎lucene之文本文件索引
- 用js判断页面刷新或关闭的方法(onbeforeunload与onunload事件) 详细出处参考:http://www.jb51.net/article/30640.htm
- 豆瓣客户端的实现02
- 【综述】(中科院)樊彬老师-“局部图像特征描述概述”
- VC++ 2010下的MFC应用程序与Windows窗体应用程序的区别
- 再谈ITFriend网站的定位
- IOS应用程序崩溃日志分析
- C++设计模式实现--模板(Template)模式
- inflate()参数的问题