Lucene全文检索样例（解决大文本建索引）

来源：互联网 | 发布：加密狗软件图片编辑：程序博客网 | 时间：2024/06/07 02:59

建索引:

Java代码

package com.pccw;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Builds a Lucene full-text index from the plain-text files of one directory.
 *
 * <p>Every regular file whose name ends in ".txt" directly under the source
 * directory is read as GBK text and added to the index as one document with
 * two fields: " path " (stored, not indexed) and " body " (stored, tokenized,
 * with term vectors that include positions and offsets).
 *
 * @author Shane in PCCW
 */
public class TextFileIndexer {

    /**
     * Indexes all ".txt" files under {@code c://s} into {@code c://index},
     * replacing any index that already exists there, and prints the elapsed
     * time in milliseconds.
     *
     * @param args ignored
     * @throws Exception if the source directory cannot be listed, a file
     *                   cannot be read, or the index cannot be written
     */
    public static void main(String[] args) throws Exception {
        /* Directory whose files are indexed: the "s" folder on drive C. */
        File fileDir = new File("c://s");
        /* Directory that receives the index files. */
        File indexDir = new File("c://index");

        // listFiles() returns null when the path is not a readable directory.
        // Check this BEFORE opening the writer: the writer is opened with
        // create=true, which would wipe an existing index for nothing.
        File[] textFiles = fileDir.listFiles();
        if (textFiles == null) {
            throw new IOException("Cannot list directory: " + fileDir.getPath());
        }

        Analyzer luceneAnalyzer = new StandardAnalyzer();
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
        long startTime = System.currentTimeMillis();
        try {
            // Raise the maximum field length so that the body of a large file
            // is indexed in full instead of being cut off (very important).
            indexWriter.setMaxFieldLength(99999999);

            // Add one document per text file.
            for (File textFile : textFiles) {
                if (!textFile.isFile() || !textFile.getName().endsWith(".txt")) {
                    continue;
                }
                String canonicalPath = textFile.getCanonicalPath();
                System.out.println(" File " + canonicalPath + " 正在被索引. ");

                // The charset name must not carry surrounding blanks: " GBK "
                // is rejected as an unsupported encoding.
                String temp = FileReaderAll(canonicalPath, "GBK");
                // NOTE(review): echoes the whole file to the console; this looks
                // like debug output and is expensive for large files - confirm
                // whether it is still wanted.
                System.out.println(temp);

                // The field names keep their surrounding blanks on purpose:
                // searches must use exactly the same field name.
                Document document = new Document();
                Field pathField = new Field(" path ", textFile.getPath(),
                        Field.Store.YES, Field.Index.NO);
                Field bodyField = new Field(" body ", temp, Field.Store.YES,
                        Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS);
                document.add(pathField);
                document.add(bodyField);
                indexWriter.addDocument(document);
            }

            // optimize() optimizes the index once all documents are added.
            indexWriter.optimize();
        } finally {
            // Always close the writer, even after a failure, so the index
            // directory is not left locked.
            indexWriter.close();
        }

        // Report how long indexing took.
        long endTime = System.currentTimeMillis();
        System.out.println(" 这花费了 "
                + (endTime - startTime)
                + " 毫秒来把文档增加到索引里面去! "
                + fileDir.getPath());
    }

    /**
     * Reads a whole text file into a single string.
     *
     * <p>Lines are read one by one and each is terminated with {@code "\n"} in
     * the result, whatever line separator the file itself uses.
     *
     * @param FileName path of the file to read
     * @param charset  name of the charset used to decode the file, e.g. "GBK"
     * @return the decoded content; an empty string for an empty file
     * @throws IOException if the file cannot be opened or read, or if the
     *                     charset name is not supported
     */
    public static String FileReaderAll(String FileName, String charset)
            throws IOException {
        FileInputStream in = new FileInputStream(FileName);
        try {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(in, charset));
            // A builder keeps reading linear in the file size; repeated String
            // concatenation would copy the whole text once per line.
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
            return content.toString();
        } finally {
            // Closing the underlying stream also covers the case where the
            // reader could not be created (unsupported charset).
            in.close();
        }
    }
}

package com.pccw;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

/**
 * Builds a Lucene full-text index from the plain-text files of one directory.
 *
 * <p>Every regular file whose name ends in ".txt" directly under the source
 * directory is read as GBK text and added to the index as one document with
 * two fields: " path " (stored, not indexed) and " body " (stored, tokenized,
 * with term vectors that include positions and offsets).
 *
 * @author Shane in PCCW
 */
public class TextFileIndexer {

    /**
     * Indexes all ".txt" files under {@code c://s} into {@code c://index},
     * replacing any index that already exists there, and prints the elapsed
     * time in milliseconds.
     *
     * @param args ignored
     * @throws Exception if the source directory cannot be listed, a file
     *                   cannot be read, or the index cannot be written
     */
    public static void main(String[] args) throws Exception {
        /* Directory whose files are indexed: the "s" folder on drive C. */
        File fileDir = new File("c://s");
        /* Directory that receives the index files. */
        File indexDir = new File("c://index");

        // listFiles() returns null when the path is not a readable directory.
        // Check this BEFORE opening the writer: the writer is opened with
        // create=true, which would wipe an existing index for nothing.
        File[] textFiles = fileDir.listFiles();
        if (textFiles == null) {
            throw new IOException("Cannot list directory: " + fileDir.getPath());
        }

        Analyzer luceneAnalyzer = new StandardAnalyzer();
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
        long startTime = System.currentTimeMillis();
        try {
            // Raise the maximum field length so that the body of a large file
            // is indexed in full instead of being cut off (very important).
            indexWriter.setMaxFieldLength(99999999);

            // Add one document per text file.
            for (File textFile : textFiles) {
                if (!textFile.isFile() || !textFile.getName().endsWith(".txt")) {
                    continue;
                }
                String canonicalPath = textFile.getCanonicalPath();
                System.out.println(" File " + canonicalPath + " 正在被索引. ");

                // The charset name must not carry surrounding blanks: " GBK "
                // is rejected as an unsupported encoding.
                String temp = FileReaderAll(canonicalPath, "GBK");
                // NOTE(review): echoes the whole file to the console; this looks
                // like debug output and is expensive for large files - confirm
                // whether it is still wanted.
                System.out.println(temp);

                // The field names keep their surrounding blanks on purpose:
                // searches must use exactly the same field name.
                Document document = new Document();
                Field pathField = new Field(" path ", textFile.getPath(),
                        Field.Store.YES, Field.Index.NO);
                Field bodyField = new Field(" body ", temp, Field.Store.YES,
                        Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS);
                document.add(pathField);
                document.add(bodyField);
                indexWriter.addDocument(document);
            }

            // optimize() optimizes the index once all documents are added.
            indexWriter.optimize();
        } finally {
            // Always close the writer, even after a failure, so the index
            // directory is not left locked.
            indexWriter.close();
        }

        // Report how long indexing took.
        long endTime = System.currentTimeMillis();
        System.out.println(" 这花费了 "
                + (endTime - startTime)
                + "  毫秒来把文档增加到索引里面去! "
                + fileDir.getPath());
    }

    /**
     * Reads a whole text file into a single string.
     *
     * <p>Lines are read one by one and each is terminated with {@code "\n"} in
     * the result, whatever line separator the file itself uses.
     *
     * @param FileName path of the file to read
     * @param charset  name of the charset used to decode the file, e.g. "GBK"
     * @return the decoded content; an empty string for an empty file
     * @throws IOException if the file cannot be opened or read, or if the
     *                     charset name is not supported
     */
    public static String FileReaderAll(String FileName, String charset)
            throws IOException {
        FileInputStream in = new FileInputStream(FileName);
        try {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(in, charset));
            // A builder keeps reading linear in the file size; repeated String
            // concatenation would copy the whole text once per line.
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
            return content.toString();
        } finally {
            // Closing the underlying stream also covers the case where the
            // reader could not be created (unsupported charset).
            in.close();
        }
    }
}

查询:

Java代码

package com.pccw;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
/**
 * Runs one sample query against the Lucene index stored in {@code c://index}
 * and prints how many documents matched.
 */
public class TestQuery {

    /**
     * Searches the " body " field for the fixed term "中华" and prints the
     * number of hits when there is at least one.
     *
     * @param args ignored
     * @throws IOException    if the index cannot be opened, searched or closed
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        String queryString = "中华";
        Analyzer analyzer = new StandardAnalyzer();

        // Parse first and let a ParseException propagate: swallowing it would
        // leave the query null and fail later with a misleading error.
        // The field name keeps its surrounding blanks because it must match
        // the field name the documents were indexed under.
        QueryParser qp = new QueryParser(" body ", analyzer);
        Query query = qp.parse(queryString);

        // The path must be the directory the index was written to; a path
        // padded with blanks points to a different location.
        IndexSearcher searcher = new IndexSearcher("c://index");
        try {
            Hits hits = searcher.search(query);
            if (hits.length() > 0) {
                System.out.println(" 找到: " + hits.length() + " 个结果! ");
            }
        } finally {
            // Release the index files even when the search fails.
            searcher.close();
        }
    }
}