Research on a Lucene-Based Chinese Word Segmentation Algorithm (Fixed-Length Recursion)


I. Scenario to be handled:
    Arbitrary content (no more than 128 Chinese characters) is indexed with Lucene; any short phrase taken from that content can then be used as a search query, and the results must be sorted by match quality (the best match first);


II. Algorithm description:
    The algorithm fixes a maximum word length (hereafter simply "word length"), slides a window of that length over the text one character at a time to form words, then splits each of those words into sub-words of length 1 up to the word length, and finally removes duplicates to obtain the segmentation result;
    Example:
1. Suppose the content to be segmented is "本算法是通过指定最大词的长度" and the maximum word length is 3;


2. Sliding the maximum-length window over the content yields the following words:


  本算法 算法是 法是通 是通过 通过指 过指定 指定最 定最大 最大词 大词的 词的长 的长度


  As the result shows, the first character of each word, read in order, essentially reproduces the original content; in theory, the longer the word length, the more accurate the query results (although practice shows that a word length of 3 or 4 works best);


3. Each word is then split into sub-words of every length from 1 up to the maximum word length:


   本算法 ==> 本 本算 本算法


4. All the words are collected, and duplicates and stop words (if any need to be filtered) are removed to obtain the final segmentation; a minimal code sketch of the whole procedure follows;
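
The sketch below illustrates the segmentation described above in isolation (it is not the tokenizer shown later); the class and method names are illustrative only, and stop-word filtering is omitted for brevity. Note that the procedure is equivalent to collecting every substring of length 1 up to the word length:

import java.util.LinkedHashSet;
import java.util.Set;

public class FixedLengthSegmenter {

    // Returns every substring that starts at each position of the text and has
    // length 1 to maxWordLength; duplicates are removed, insertion order is kept.
    public static Set<String> segment(String text, int maxWordLength) {
        Set<String> words = new LinkedHashSet<String>();
        for (int i = 0; i < text.length(); i++) {
            for (int len = 1; len <= maxWordLength && i + len <= text.length(); len++) {
                words.add(text.substring(i, i + len));
            }
        }
        return words;
    }

    public static void main(String[] args) {
        // Prints 本, 本算, 本算法, 算, 算法, ... for the article's example.
        System.out.println(segment("本算法是通过指定最大词的长度", 3));
    }
}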


III. Implementation approach
    1. The index-time segmentation follows the algorithm described above almost exactly;
    2. At search time the behaviour is adjusted to users' habits and expectations: if the query contains spaces (other than leading or trailing ones), the query is simply split on those spaces; if it contains no spaces, the algorithm above is applied and single-character words are dropped, unless the segmentation consists of nothing but single characters, in which case they are kept (a sketch of this rule follows the list);
    3. Tagging the segments in the search results (for highlighting) uses the same segmentation algorithm as indexing;
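
A minimal sketch of the search-time rule in point 2, reusing the FixedLengthSegmenter sketch above; the class and method names are again only illustrative, and the actual logic lives in MyTokenizer below:

import java.util.ArrayList;
import java.util.List;

public class SearchTermSelector {

    // Returns the terms a query string should be searched with, following the rule
    // above: trust explicit spaces, otherwise segment and drop single characters.
    public static List<String> selectTerms(String query, int maxWordLength) {
        String trimmed = query.trim();
        if (trimmed.contains(" ")) {
            List<String> parts = new ArrayList<String>();
            for (String p : trimmed.split("\\s+")) {
                if (p.length() > 0) {
                    parts.add(p);
                }
            }
            return parts;
        }
        List<String> all = new ArrayList<String>(FixedLengthSegmenter.segment(trimmed, maxWordLength));
        List<String> multi = new ArrayList<String>();
        for (String w : all) {
            if (w.length() > 1) {
                multi.add(w);
            }
        }
        // Keep single characters only when nothing longer was produced.
        return multi.isEmpty() ? all : multi;
    }
}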


IV. Pros and cons
    1. The advantage is that no dictionary is required, so it does not matter whether a word appears in any dictionary; the search results are quite precise and generally match what one would expect;
    2. The disadvantage is that the index files tend to be fairly large.


V. Java implementation
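
The implementation below consists of four classes: MyTokenizer, which performs the segmentation described above; MyBuilderIndexAnalyzer, the analyzer used when building the index; MySearchAnalyzer, the analyzer used for search queries (it passes isSearch = true to the tokenizer); and MyQueryParser, which turns each produced word into a SHOULD clause of a BooleanQuery.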




import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;


import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource;


public class MyTokenizer extends Tokenizer {
public final static String PUNCTION = "。,!?;,!?;|-+()*&^%$#@!~/\\";  
public static final String SPACES = "  \t\r\n";
public static Set<String> stopWordSet = new HashSet<String>();

//maximum word length used when building words (3 or 4 works well in practice)
private int wordLength = 4;
private static final int DEFAULT_BUFFER_SIZE = 256;
//set to true once the whole input has been read and segmented
private boolean done = false;
//true when the tokenizer is used at search time (single characters are then dropped)
private boolean isSearch = false;
private char[] sourceBuffer;
private int finalOffset;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private int upto = 0;
private int curOffset = 0;
//start/end offsets of the words produced by the segmentation, emitted one per incrementToken() call
private List<Integer> startList = null; 
private List<Integer> endList = null; 

static{
stopWordSet.add("。");
stopWordSet.add(",");
stopWordSet.add("!");
stopWordSet.add("?");
stopWordSet.add(";");
stopWordSet.add(",");
stopWordSet.add("!");
stopWordSet.add("?");
stopWordSet.add(";");
stopWordSet.add("|");
stopWordSet.add("-");
stopWordSet.add("+");
stopWordSet.add("*");
stopWordSet.add("&");
stopWordSet.add("^");
stopWordSet.add("%");
stopWordSet.add("$");
stopWordSet.add("#");
stopWordSet.add("@");
stopWordSet.add("~");
stopWordSet.add("/");
stopWordSet.add("\\");
stopWordSet.add(" ");
stopWordSet.add("\t");
stopWordSet.add("\n");
stopWordSet.add("\r");
stopWordSet.add(" ");
}


public MyTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
}

public MyTokenizer(boolean isSearch,Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
this.isSearch = isSearch;
}


public MyTokenizer(Reader input, int bufferSize) {
super(input);
termAtt.resizeBuffer(bufferSize);
}


public MyTokenizer(AttributeSource source, Reader input,
int bufferSize) {
super(source, input);
termAtt.resizeBuffer(bufferSize);
}


public MyTokenizer(AttributeFactory factory, Reader input,
int bufferSize) {
super(factory, input);
termAtt.resizeBuffer(bufferSize);
}


@Override
public final boolean incrementToken() throws IOException {
if(!done){
//System.out.println("===================================");
clearAttributes();
startList = new ArrayList<Integer>(); 
endList = new ArrayList<Integer>(); 
done = true;
upto = 0;
curOffset = 0;
char[] buffer = termAtt.buffer();
Set<String> wordSet = new HashSet<String>();
while (true) {
final int length = input.read(buffer, upto, buffer.length - upto);
if (length == -1)
break;
upto += length;
if (upto == buffer.length)
buffer = termAtt.resizeBuffer(1 + buffer.length);
}
//remember where the input ends so that end() can report the final offset
finalOffset = correctOffset(upto);
if(upto>0){
String firstStr = "";
String secondStr = "";
String word = "";
//starting at every character, build words of length 1 up to wordLength
for(int i = 0; i<upto; i++){
firstStr = String.valueOf(buffer[i]);
word = firstStr;
if(!stopWordSet.contains(firstStr)){
//System.out.println("OUT:"+word);
addNewOffset(i,i+1,word,wordSet);
for(int j = i+1; j < upto; j++){
secondStr = String.valueOf(buffer[j]);
word += secondStr;
if(!stopWordSet.contains(secondStr) && (j-i+1) <= wordLength){
//System.out.println("OUT:"+word);
addNewOffset(i,j+1,word,wordSet);
}else{
break;
}
}
}
}
if(isSearch){
Set<String> singleWordSet = new HashSet<String>();
word = "";
//when searching, single characters that the user separated with delimiters are kept as well
if(upto > 1){
for(int i = 0; i<upto; i++){
firstStr = String.valueOf(buffer[i]);
if(!stopWordSet.contains(firstStr)){
word += firstStr;
}else if(word.length() == 1 && !singleWordSet.contains(word)){
startList.add(i-1);
endList.add(i);
singleWordSet.add(word);
word = "";
}else{
word = "";
}
}
}else{
startList.add(0);
endList.add(1);
singleWordSet.add(word);
}
}
}
sourceBuffer = new String(buffer,0,upto).toCharArray();


}
if(startList != null && startList.size()>0 && curOffset < startList.size()){
int startOffset = startList.get(curOffset);
int endOffset = endList.get(curOffset);
curOffset++;
//System.out.println(startOffset + ":" + endOffset);
//termAtt.setLength(endOffset - startOffset);
//finalOffset = correctOffset(endOffset - startOffset);
//offsetAtt.setOffset(correctOffset(startOffset), finalOffset);
return flush(startOffset,endOffset);

}
return false;
}

private void addNewOffset(int startOffset,int endOffset,String word,Set<String> wordSet){
if(word != null && !wordSet.contains(word)){
//System.out.println("OUT:"+word+"  "+startOffset+":"+endOffset);
int wordLength = endOffset - startOffset;
if(isSearch){
//drop single-character words at search time
if(wordLength > 1){
startList.add(startOffset);
endList.add(endOffset);
wordSet.add(word);
}
}else{
startList.add(startOffset);
endList.add(endOffset);
wordSet.add(word);
}

}
}
private final boolean flush(int start,int end){
int length = end - start;
if(end - start > 0){
//System.out.println("OUT:"+(new String(sourceBuffer,start,end)) +":"+String.valueOf(sourceBuffer));
termAtt.copyBuffer(sourceBuffer, start, length);
termAtt.setLength(length);
offsetAtt.setOffset(start, end);
return true;
}else{
return false;
}
}
@Override
public final void end() {
// set final offset
offsetAtt.setOffset(finalOffset, finalOffset);
}


@Override
public void reset(Reader input) throws IOException {
super.reset(input);
this.done = false;
}
}








import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;


public class MyBuilderIndexAnalyzer extends ReusableAnalyzerBase{


@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new MyTokenizer(reader));
}

public static void displayTokenStream(TokenStream ts) throws IOException {
//TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
CharTermAttribute termAtt = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
     
        String token = new String(termAtt.buffer(),0,termAtt.length());
        System.out.println(token);
   }
}

public static void testAnalyzer(){
String words = "我的测试内容";
StringReader reader = new StringReader(words);
Analyzer analyzer = new MyBuilderIndexAnalyzer();
TokenStream ts = analyzer.tokenStream("fullcode", reader);
try {
displayTokenStream(ts);
} catch (IOException e) {
e.printStackTrace();
}
}

public static void testIndex(){
try {
Directory d = FSDirectory.open(new File("c:\\123"));
// Analyzer analyzer = new IKAnalyzer();
Analyzer analyzer = new MyBuilderIndexAnalyzer();
IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35,analyzer);
IndexWriter writer = new IndexWriter(d, conf);
//documents would be added here before closing the writer (see the sketch after this class)
writer.close();

} catch (Exception e) {
e.printStackTrace();
}
}


public static void main(String[] args) {
testAnalyzer();
}


}
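
testIndex() above only opens an IndexWriter; the sketch below shows how content could actually be indexed with this analyzer. The field name "fullcode" and the sample content are taken from testAnalyzer(); how real documents are structured is an assumption:

import java.io.File;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class IndexContentExample {
    public static void main(String[] args) throws Exception {
        Directory d = FSDirectory.open(new File("c:\\123"));
        Analyzer analyzer = new MyBuilderIndexAnalyzer();
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer);
        IndexWriter writer = new IndexWriter(d, conf);
        // The "fullcode" field is analyzed with MyTokenizer, so every word
        // produced by the segmentation becomes an indexed term.
        Document doc = new Document();
        doc.add(new Field("fullcode", "我的测试内容", Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();
    }
}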






import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;


public class MySearchAnalyzer extends ReusableAnalyzerBase{


@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new MyTokenizer(true,reader));
}
public static void displayTokenStream(TokenStream ts) throws IOException {
//TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
CharTermAttribute termAtt = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
     
        String token = new String(termAtt.buffer(),0,termAtt.length());
        System.out.println(token);
   }
}

public static void testAnalyzer(){
String words = "测试";
StringReader reader = new StringReader(words);
Analyzer analyzer = new MySearchAnalyzer();
TokenStream ts = analyzer.tokenStream("fullcode", reader);
try {
displayTokenStream(ts);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
testAnalyzer();
}
}




import java.io.IOException;
import java.io.StringReader;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.Version;


public class MyQueryParser extends QueryParser{


public MyQueryParser(Version matchVersion, String field, Analyzer analyzer) {
super(matchVersion, field, analyzer);
}


@Override
public org.apache.lucene.search.Query parse(String queryStr)
throws ParseException {
BooleanQuery rtQuery = new BooleanQuery();
if(queryStr != null && queryStr.length()>0){
TokenStream ts = this.getAnalyzer().tokenStream(this.getField(), new StringReader(queryStr));
CharTermAttribute termAtt = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
try {
while (ts.incrementToken()) {
//the term text always starts at position 0 of the term attribute's buffer
String token = new String(termAtt.buffer(),0,termAtt.length());
Term term = new Term(this.getField(), token);
TermQuery query = new TermQuery(term);
rtQuery.add(query, Occur.SHOULD);
}
} catch (IOException e) {
e.printStackTrace();
}
}



return rtQuery;
}


}
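
Finally, a minimal sketch of how the pieces could fit together at search time; the index location and field name are carried over from the examples above, and the query string is an assumption:

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class SearchExample {
    public static void main(String[] args) throws Exception {
        Directory d = FSDirectory.open(new File("c:\\123"));
        IndexSearcher searcher = new IndexSearcher(IndexReader.open(d));
        // MySearchAnalyzer drops single characters, and MyQueryParser turns every
        // remaining word into a SHOULD TermQuery, so documents matching more of
        // the query's words score higher and are returned first.
        MyQueryParser parser = new MyQueryParser(Version.LUCENE_35, "fullcode", new MySearchAnalyzer());
        Query query = parser.parse("测试内容");
        TopDocs topDocs = searcher.search(query, 10);
        for (ScoreDoc hit : topDocs.scoreDocs) {
            System.out.println(searcher.doc(hit.doc).get("fullcode"));
        }
        searcher.close();
    }
}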