C#关于在lucene下的中文切词
来源:互联网 发布:linux还是unix 编辑:程序博客网 时间:2024/05/22 01:33
在实现了中文切词的基础方法上,我将其封装在继承lucene的Analyzer类下
ChineseAnalyzer的方法就不用多说了。
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
namespace Lucene.Fanswo
{
/// <summary>
///
/// </summary>
/// <summary>
/// Analyzer for mixed Chinese/English text: segments the input with
/// <see cref="ChineseTokenizer"/>, then applies the standard, lower-case
/// and stop-word filters in that order.
/// </summary>
public class ChineseAnalyzer:Analyzer
{
    //private System.Collections.Hashtable stopSet;

    /// <summary>English and Chinese stop words removed by the StopFilter.</summary>
    public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new System.String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "我", "我们" };

    /// <summary>
    /// Builds the token stream for a field:
    /// ChineseTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
    /// <param name="reader">Reader supplying the raw text.</param>
    /// <returns>The fully filtered token stream.</returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream filtered = new StopFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new ChineseTokenizer(reader))),
            CHINESE_ENGLISH_STOP_WORDS);
        return filtered;
    }
}
}
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
namespace Lucene.Fanswo
{
/// <summary>
///
/// </summary>
/// <summary>
/// Analyzer for mixed Chinese/English text: segments the input with
/// ChineseTokenizer, then applies StandardFilter, LowerCaseFilter and
/// StopFilter. (Duplicate listing of the class above.)
/// </summary>
public class ChineseAnalyzer:Analyzer
{
//private System.Collections.Hashtable stopSet;
/// <summary>English and Chinese stop words removed by the StopFilter.</summary>
public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new System.String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "我", "我们" };
/// <summary>Constructs a {@link ChineseTokenizer} filtered by a {@link
/// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed (not used here).</param>
/// <param name="reader">Reader supplying the raw text.</param>
/// <returns>The fully filtered token stream.</returns>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
// Dictionary-based segmentation first, then the standard filter chain.
TokenStream result = new ChineseTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
return result;
}
}
}
ChineseTokenizer类的实现:
这里通过词典来正向匹配字符,返回lucene下定义的token流
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;
namespace Lucene.Fanswo
{
/// <summary>
/// Tokenizer that segments Chinese text by forward maximum matching against a
/// dictionary trie (WordTree), and emits runs of Latin letters or digits as
/// single tokens. Reads the whole input eagerly in the constructor.
/// </summary>
class ChineseTokenizer : Tokenizer
{
    private int offset = 0, bufferIndex = 0, dataLen = 0; // next read position, current token start, total text length
    private int start; // current scan position within Next()

    /// <summary>Full text read from the input reader.</summary>
    private string text;

    /// <summary>Segmentation time; kept for interface compatibility (not updated here).</summary>
    public double TextSeg_Span = 0;

    // BUGFIX(perf): the dictionary was previously created and LoadDict() called
    // inside every Next() call, reloading the whole lexicon once per token.
    // Load it once per tokenizer instead.
    private readonly WordTree tree;

    /// <summary>Constructs a tokenizer for this Reader.</summary>
    public ChineseTokenizer(System.IO.TextReader reader)
    {
        this.input = reader;
        text = input.ReadToEnd();
        dataLen = text.Length;
        tree = new WordTree();
        tree.LoadDict(); // populates the static WordTree.chartable trie
    }

    /// <summary>
    /// Returns the next token, or null when the input is exhausted.
    /// Dictionary words are matched greedily against the trie; when a collected
    /// prefix cannot be extended it is emitted as-is (no backtracking).
    /// </summary>
    public override Token Next()
    {
        Hashtable t_chartable = WordTree.chartable; // root of the dictionary trie
        string ReWord = "";
        string char_s;
        start = offset;
        bufferIndex = start;
        while (true)
        {
            // Past the end of the text: no more tokens.
            if (start >= dataLen)
            {
                break;
            }
            char_s = text.Substring(start, 1);
            // Skip whitespace between characters.
            if (string.IsNullOrEmpty(char_s.Trim()))
            {
                start++;
                continue;
            }
            if (ReWord == "")
            {
                // BUGFIX: token offsets previously pointed at skipped leading
                // whitespace; anchor the token at its first real character.
                bufferIndex = start;
            }
            // Current character does not continue a dictionary word.
            if (!t_chartable.Contains(char_s))
            {
                if (ReWord == "")
                {
                    int j = start + 1;
                    switch (tree.GetCharType(char_s))
                    {
                        case 0: // single Chinese character not starting any dictionary word
                            ReWord += char_s;
                            break;
                        case 1: // run of Latin letters -> one token
                            while (j < dataLen && tree.GetCharType(text.Substring(j, 1)) == 1)
                            {
                                j++;
                            }
                            // BUGFIX: length must be j - start, not j - offset;
                            // 'offset' lags behind 'start' after whitespace was
                            // skipped, which over-read past the word (and could
                            // throw ArgumentOutOfRangeException near the end).
                            ReWord += text.Substring(start, j - start);
                            break;
                        case 2: // run of digits -> one token
                            while (j < dataLen && tree.GetCharType(text.Substring(j, 1)) == 2)
                            {
                                j++;
                            }
                            ReWord += text.Substring(start, j - start);
                            break;
                        default: // any other character becomes its own token
                            ReWord += char_s;
                            break;
                    }
                    offset = j; // resume scanning after the emitted token
                }
                else
                {
                    // A dictionary prefix was collected but cannot be extended:
                    // emit the prefix and rescan from the current character.
                    offset = start;
                }
                return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
            }
            // Character continues a dictionary word: descend into the sub-trie.
            ReWord += char_s;
            t_chartable = (Hashtable)t_chartable[char_s];
            start++;
            // Text ends inside a dictionary match: emit what was collected.
            if (start == dataLen)
            {
                offset = dataLen;
                return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
            }
        }
        return null; // stream exhausted
    }
}
}
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;
namespace Lucene.Fanswo
{
/// <summary>
/// Tokenizer that segments Chinese text by forward maximum matching against a
/// dictionary trie (WordTree), and emits runs of Latin letters or digits as
/// single tokens. Reads the whole input eagerly in the constructor.
/// </summary>
class ChineseTokenizer : Tokenizer
{
    private int offset = 0, bufferIndex = 0, dataLen = 0; // next read position, current token start, total text length
    private int start; // current scan position within Next()

    /// <summary>Full text read from the input reader.</summary>
    private string text;

    /// <summary>Segmentation time; kept for interface compatibility (not updated here).</summary>
    public double TextSeg_Span = 0;

    // BUGFIX(perf): the dictionary was previously created and LoadDict() called
    // inside every Next() call, reloading the whole lexicon once per token.
    // Load it once per tokenizer instead.
    private readonly WordTree tree;

    /// <summary>Constructs a tokenizer for this Reader.</summary>
    public ChineseTokenizer(System.IO.TextReader reader)
    {
        this.input = reader;
        text = input.ReadToEnd();
        dataLen = text.Length;
        tree = new WordTree();
        tree.LoadDict(); // populates the static WordTree.chartable trie
    }

    /// <summary>
    /// Returns the next token, or null when the input is exhausted.
    /// Dictionary words are matched greedily against the trie; when a collected
    /// prefix cannot be extended it is emitted as-is (no backtracking).
    /// </summary>
    public override Token Next()
    {
        Hashtable t_chartable = WordTree.chartable; // root of the dictionary trie
        string ReWord = "";
        string char_s;
        start = offset;
        bufferIndex = start;
        while (true)
        {
            // Past the end of the text: no more tokens.
            if (start >= dataLen)
            {
                break;
            }
            char_s = text.Substring(start, 1);
            // Skip whitespace between characters.
            if (string.IsNullOrEmpty(char_s.Trim()))
            {
                start++;
                continue;
            }
            if (ReWord == "")
            {
                // BUGFIX: token offsets previously pointed at skipped leading
                // whitespace; anchor the token at its first real character.
                bufferIndex = start;
            }
            // Current character does not continue a dictionary word.
            if (!t_chartable.Contains(char_s))
            {
                if (ReWord == "")
                {
                    int j = start + 1;
                    switch (tree.GetCharType(char_s))
                    {
                        case 0: // single Chinese character not starting any dictionary word
                            ReWord += char_s;
                            break;
                        case 1: // run of Latin letters -> one token
                            while (j < dataLen && tree.GetCharType(text.Substring(j, 1)) == 1)
                            {
                                j++;
                            }
                            // BUGFIX: length must be j - start, not j - offset;
                            // 'offset' lags behind 'start' after whitespace was
                            // skipped, which over-read past the word (and could
                            // throw ArgumentOutOfRangeException near the end).
                            ReWord += text.Substring(start, j - start);
                            break;
                        case 2: // run of digits -> one token
                            while (j < dataLen && tree.GetCharType(text.Substring(j, 1)) == 2)
                            {
                                j++;
                            }
                            ReWord += text.Substring(start, j - start);
                            break;
                        default: // any other character becomes its own token
                            ReWord += char_s;
                            break;
                    }
                    offset = j; // resume scanning after the emitted token
                }
                else
                {
                    // A dictionary prefix was collected but cannot be extended:
                    // emit the prefix and rescan from the current character.
                    offset = start;
                }
                return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
            }
            // Character continues a dictionary word: descend into the sub-trie.
            ReWord += char_s;
            t_chartable = (Hashtable)t_chartable[char_s];
            start++;
            // Text ends inside a dictionary match: emit what was collected.
            if (start == dataLen)
            {
                offset = dataLen;
                return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
            }
        }
        return null; // stream exhausted
    }
}
}
测试的代码:
using System;
using System.Collections.Generic;
using System.Text;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using SimpleAnalyzer = Lucene.Net.Analysis.SimpleAnalyzer;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
namespace MyLuceneTest
{
/// <summary>
/// Console demo: runs ChineseAnalyzer over a sample sentence, prints every
/// token and simple throughput statistics.
/// </summary>
class Program
{
    /// <summary>Entry point; exceptions are caught and reported to stdout.</summary>
    [STAThread]
    public static void Main(System.String[] args)
    {
        try
        {
            Test("中华人民共和国在1949年建立,从此开始了新中国的伟大篇章。长春市长春节致词", true);
        }
        catch (System.Exception e)
        {
            System.Console.Out.WriteLine(" caught a " + e.GetType() + " with message: " + e.Message + e.ToString());
        }
    }

    /// <summary>Tokenizes <paramref name="text"/>; prints tokens when <paramref name="verbose"/>.</summary>
    internal static void Test(System.String text, bool verbose)
    {
        System.Console.Out.WriteLine(" Tokenizing string: " + text);
        Test(new System.IO.StringReader(text), verbose, text.Length);
    }

    /// <summary>
    /// Consumes the analyzer's token stream and reports token count and timing.
    /// </summary>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <param name="verbose">When true, each token is printed.</param>
    /// <param name="bytes">Input length, used for the throughput figure.</param>
    internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
    {
        //Analyzer analyzer = new StandardAnalyzer();
        Analyzer analyzer = new Lucene.Fanswo.ChineseAnalyzer();
        TokenStream stream = analyzer.TokenStream(null, reader);
        System.DateTime start = System.DateTime.Now;
        int count = 0;
        for (Token t = stream.Next(); t != null; t = stream.Next())
        {
            if (verbose)
            {
                System.Console.Out.WriteLine("Token=" + t.ToString());
            }
            count++;
        }
        System.DateTime end = System.DateTime.Now;
        // BUGFIX: DateTime.Ticks are 100-nanosecond units, not milliseconds.
        // The original printed raw ticks as "milliseconds", inflating every
        // reported figure by 10000x; convert to milliseconds first.
        long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;
        System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
        // Guard the per-token/throughput lines against division by zero.
        if (count > 0 && time > 0)
        {
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
    }
}
}
using System.Collections.Generic;
using System.Text;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using SimpleAnalyzer = Lucene.Net.Analysis.SimpleAnalyzer;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
namespace MyLuceneTest
{
/// <summary>
/// Console demo: runs ChineseAnalyzer over a sample sentence, prints every
/// token and simple throughput statistics.
/// </summary>
class Program
{
    /// <summary>Entry point; exceptions are caught and reported to stdout.</summary>
    [STAThread]
    public static void Main(System.String[] args)
    {
        try
        {
            Test("中华人民共和国在1949年建立,从此开始了新中国的伟大篇章。长春市长春节致词", true);
        }
        catch (System.Exception e)
        {
            System.Console.Out.WriteLine(" caught a " + e.GetType() + " with message: " + e.Message + e.ToString());
        }
    }

    /// <summary>Tokenizes <paramref name="text"/>; prints tokens when <paramref name="verbose"/>.</summary>
    internal static void Test(System.String text, bool verbose)
    {
        System.Console.Out.WriteLine(" Tokenizing string: " + text);
        Test(new System.IO.StringReader(text), verbose, text.Length);
    }

    /// <summary>
    /// Consumes the analyzer's token stream and reports token count and timing.
    /// </summary>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <param name="verbose">When true, each token is printed.</param>
    /// <param name="bytes">Input length, used for the throughput figure.</param>
    internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
    {
        //Analyzer analyzer = new StandardAnalyzer();
        Analyzer analyzer = new Lucene.Fanswo.ChineseAnalyzer();
        TokenStream stream = analyzer.TokenStream(null, reader);
        System.DateTime start = System.DateTime.Now;
        int count = 0;
        for (Token t = stream.Next(); t != null; t = stream.Next())
        {
            if (verbose)
            {
                System.Console.Out.WriteLine("Token=" + t.ToString());
            }
            count++;
        }
        System.DateTime end = System.DateTime.Now;
        // BUGFIX: DateTime.Ticks are 100-nanosecond units, not milliseconds.
        // The original printed raw ticks as "milliseconds", inflating every
        // reported figure by 10000x; convert to milliseconds first.
        long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;
        System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
        // Guard the per-token/throughput lines against division by zero.
        if (count > 0 && time > 0)
        {
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
    }
}
}
测试结果:
完毕!
- C#关于在lucene下的中文切词
- 关于Lucene中文分词的highlight显示
- Lucene在Eclipse下的部署
- C# Lucene的使用详解及中文分词算法
- 关于在不同编译器下中文的乱码问题!
- 关于几个基于Lucene的中文分词库的比较
- 关于Lucene.net中文分词后的结果着色问题
- Lucene关于几种中文分词的总结
- .Net下的中文分词IKAnalyzerNet(基于Lucene.Net)
- Lucene下引入ICTCLAS进行中文分词的实现方法
- 控制台程序在C#下调用;关于MFC的初始化
- 关于Lucene的资料
- lucene在eclipse下运行
- lucene在eclipse下运行
- lucene在eclipse下运行
- Lucene在Linux下环境的搭建和运行
- Lucene 在Ubuntu+Python2的环境下进行搜索
- lucene的中文分词器
- spoj 694 Distinct Substrings 705 New Distinct Substrings
- [Baidu面试题]题目1:输入n个整数,输出其中最小的k个。
- HDU 1258 sum it up
- 终端中执行命令时提示权限不够的解决方法
- 在VS2008里开发C项目的步骤
- C#关于在lucene下的中文切词
- MySQL数据库管理常用命令
- ubuntu阻止软件包升级(指定的软件包不升级)
- 软件开发非著名高手修炼法:ET-TCARBSSW
- 一步一步教你配置vnc
- POSIX操作系统的串口编程指南(1)
- Lucene2.9.1使用小结
- Tips: 使用 Fiddler 分析网页加载过程 | oneoo's 私家花园
- 用户体验(UE)