Lucene加中文分词paoding调研结果

来源：互联网　发布：软件过程可视性　编辑：程序博客网　时间：2024/05/01 11:23

因为项目原因，调研了下全文检索。网上最流行的开源检索库好像是lucene；nutch据说稳定性还有待测试，所以没试。需要说明的是，我要做的这个全文检索是搜索本项目的文档、网页和数据库内容，不涉及web上网页的搜索。

介绍Lucene的文章很多，重点的我都看了；原理不明白的可以看下面这篇：

http://www.ibm.com/developerworks/cn/java/wa-lucene/

自己参照别人的写了个例子。我用的是lucene2.3.1版本，这里需要注意版本不一样的话写的代码就有所区别，网上很多例子都不是参照新版的写的，不能直接用。

import org.apache.lucene.index.IndexWriter;

import java.io.File;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.Searcher;

import java.io.FileReader;

import org.apache.lucene.document.*;

import org.apache.lucene.store.*;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import java.io.*;

import org.apache.poi.hwpf.extractor.WordExtractor;

import org.apache.lucene.search.BooleanQuery;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.highlight.*;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class Mp3Searcher {

private String DATA_DIR ; //文件的目录（如果是对文件进行索引的话）

private String index_DIR ;//存放索引的目录

private RAMDirectory directory ;

private PaodingAnalyzer analyzer = null;

//private Analyzer analyzer = null;

public Mp3Searcher(){

//Analyzer analyzer = new PaodingAnalyzer();

}

public String getindedxdir()

{

return this.index_DIR;

}

public void buildIndex() throws IOException{

String DATA_DIR="C://lucenetest//index"; //存放文件目录

String index_DIR="E://test"; //存放索引文件目录

File data_Dir = new File(DATA_DIR);

File index_Dir = new File(index_DIR);

this.index_DIR=index_DIR;

Analyzer analyzer = new PaodingAnalyzer();

//Analyzer analyzer = new StandardAnalyzer();

File[] dataFiles = data_Dir.listFiles();

boolean fileIsExist = false;

if (index_Dir.listFiles().length == 0)

fileIsExist = true;

IndexWriter writer = new IndexWriter(index_Dir, analyzer, fileIsExist);

try{

this.doIndex(dataFiles,writer);

}catch(Exception e)

{

e.printStackTrace();

}

writer.optimize();

writer.close();

}

private void doIndex(File[] dataFiles, IndexWriter indexWriter) throws Exception {

for (int i = 0; i < dataFiles.length; i++) {

if (dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".htm"))

{//索引所有htm格式文件

System.out.println("Indexing file " + dataFiles[i].getCanonicalPath());

Reader txtReader = new FileReader(dataFiles[i]);

Document document = new Document();

document.add(new Field("path", dataFiles[i].getCanonicalPath(), Field.Store.YES,Field.Index.UN_TOKENIZED));

document.add(new Field("filename", dataFiles[i].getName(), Field.Store.YES, Field.Index.TOKENIZED));

document.add(new Field("contents", txtReader));

indexWriter.addDocument(document);

}

else if (dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".doc"))

{

FileInputStream in = new FileInputStream(dataFiles[i]);//获得文件流

WordExtractor extractor = new WordExtractor(in);//使用POI对word文件进行解析

String str = extractor.getText();//返回String

Document document = new Document();//生成Document对象,其中有3个Field,分别是path,filename,contents

document.add(new Field("path", dataFiles[i].getCanonicalPath(), Field.Store.YES,Field.Index.UN_TOKENIZED));

document.add(new Field("filename", dataFiles[i].getName(), Field.Store.YES, Field.Index.TOKENIZED));

document.add(new Field("contents", str, Field.Store.YES,Field.Index.TOKENIZED,Field.TermVector.WITH_POSITIONS_OFFSETS));

System.out.print(document.getField("path").toString()+document.getField("filename").toString()+document.getField("contents").toString());

indexWriter.addDocument(document);

}

else if (dataFiles[i].isDirectory())

doIndex(dataFiles[i].listFiles(), indexWriter);//使用递归,继续索引文件夹

}

// public void searchIndex(String curcontents,String curfilename) throws Exception {

public void searchIndex(String curcontents) throws Exception {

String contents = curcontents;//内容的关键字

// String filename = curfilename;//文件名的关键字

File indexDir = new File(index_DIR);//存放索引的文件夹

FSDirectory directory = FSDirectory.getDirectory(indexDir);

Searcher searcher = new IndexSearcher(directory);

Analyzer analyzer = new PaodingAnalyzer();

//Analyzer analyzer = new StandardAnalyzer();

QueryParser parserContents = new QueryParser("contents", analyzer);

Query query1 = parserContents.parse(contents);

// QueryParser parserFilename = new QueryParser("filename", analyzer); //使用同一个分析器luceneAnalyzer分别生成两个QueryParser对象

// Query query2 = parserFilename.parse(filename);

BooleanQuery query = new BooleanQuery();

query.add(query1, BooleanClause.Occur.MUST);

// query.add(query2, BooleanClause.Occur.MUST);

//SimpleHTMLFormatter formatter =new SimpleHTMLFormatter("<span class=/"highlight/">","</span>");

// SimpleHTMLFormatter formatter =new SimpleHTMLFormatter();

// Highlighter highlighter = new Highlighter(formatter,new QueryScorer(query));

Highlighter highlighter = new Highlighter(new QueryScorer(query));

highlighter.setTextFragmenter(new SimpleFragmenter(60)); //Lucene自带的高亮功能

Hits hits = searcher.search(query);

for(int i=0;i<hits.length();i++){

Document doc=hits.doc(i);

System.out.println("检索文件"+doc.get("path"));

String contents1=doc.get("contents");

if (contents1!=null)

{

TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(contents1));

String str = highlighter.getBestFragment(tokenStream,hits.doc(i).get("contents"));

System.out.println(str);

}

public static void main(String[] args) throws Exception

{

Mp3Searcher searcher=new Mp3Searcher();

String DATA_DIR="C://lucenetest//index";

String index_DIR="E://test";

searcher.buildIndex();

//searcher.searchIndex("lucene","绝缘材料");

searcher.searchIndex("索引");

}

以上代码已经在eclipse3.2上测试通过，建的是java项目。代码中有些注释的部分也是能用的，稍微修改下就行。

下面重点说下中文分词。因为lucene是老外写的，所以它对英文、德文等支持比较好，对中文的支持比较弱。lucene自带的standardanalyzer对中文只是按单字切分（contrib里的CJKAnalyzer才是二元分词法），分词效果都不理想，所以国人也开发了很多中文分词的软件。网上很有名的就是中科院的，可是中科院的这个是c写的，也有人把它改为java，但是据说bug很多，所以我也没下。我稍微看了下，好像是说中科院的这个分词的dll要用jni调用，用在lucene.net这个版本上，但是这个版本已经旧了，所以我没调研。其他的中文分词软件我主要看了下庖丁解牛分词软件。

首先说明这个软件在java项目中是可以使用的，问题主要在web项目中。

http://paoding.googlecode.com/svn/trunk/paoding-analysis/

最好是下svn上的，因为这个是更新过的，更改了2.0.4版的一些bug。

论坛：http://groups.google.com/group/paoding/topics?start=20&sa=N

网上也有人反映各种错误，主要是下面这些，我都碰到过：

首先是需要设置Paoding词典到我们的系统环境变量,该词典就在Paoding的dic 文件夹里,把它设置到环境变量中,变量名是PAODING_DIC_HOME,这里要注意:DIC的路径不能包含中文,标点,空格...

然后,在使用的时候,你需要将Paoding的5个jar包拷贝到你的工程中去,但是要注意你工程的路径,如果你发布的是WEB工程,记得你的服务器路径也不能包含中文、空格,之前我的路径是

g:/Tomcat 6.0就不行,被迫改成了g:/Tomcat

如果上面设置没问题,就可以使用Paoding进行搜索开发了,但是在建立索引的时候你可能还会遇到 java.io.File.setWritable(Z)Z异常问题,这个一般是你JDK的问题,升级下你JDK的版本.

我就遇到过很奇怪的问题,我使用JDK1.5一直没问题,但是有一天突然爆了这个错误,让我郁闷,最后升级了个JDK,问题就解决了....

以上是网上别人反映的问题，我自己碰到的问题主要是这几个：

1. Classpath的路径paoding总是不识别，导致无法正确加载词库。无论在properties文件中如何设置都搞不好，也许是jdk版本问题，我用的是1.5的。这样导致在部署到tomcat5.0时总是报路径不对。

2. 我建了个jsp和servlet，将结果显示在servlet中。结果发现在jsp页面上输入中文关键字时页面无任何结果，但是搜英文的就有结果。

这个问题很多人看了都认为是个乱码处理问题，但是我调了很久，各种方式都试了，仍然不行。网上也有人反映和我一样的问题，但是没有解决办法。

这两个重大问题导致我放弃了paoding，还有一点是分词软件是需要维护的，网上的成功的基于lucene的网站如jlive等，我想它们可能是购买的某公司的中文分词的解析器来做搜索的。