C# 分词算法,ChineseAnalyzer,源代码分析,其他地方的代码都是稀烂。。。。
来源:互联网 发布:sap hana数据库 编辑:程序博客网 时间:2024/05/16 14:45
1.引用文件下载地址:
http://www.piaoyi.org/upimg/file071127_08/02/ChineseAnalyzer.rar
2.引用一个Lucene.Net.dll文件
3.添加新类库文件 WordTree.cs
using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace A.SplitString
{
    /// <summary>
    /// Character trie built from the dictionary file. Every node is a
    /// <see cref="Hashtable"/> keyed by a one-character string whose value is the
    /// child node; the extra key "word" (value null) marks a node at which a
    /// dictionary entry ends.
    /// </summary>
    public class WordTree
    {
        // Dictionary file (sDict.txt) that must be added to the web application root.
        // NOTE: this runs in the type initializer and dereferences HttpContext.Current,
        // so the type has to be touched for the first time while a request is active.
        private static string DictPath = System.Web.HttpContext.Current.Server.MapPath("~/sDict.txt");

        /// <summary>Root node of the trie, shared by all instances.</summary>
        public static Hashtable chartable = new Hashtable();

        /// <summary>True once the dictionary file has been loaded into <see cref="chartable"/>.</summary>
        public static bool DictLoaded = false;

        /// <summary>Time spent by the last dictionary load, in seconds.</summary>
        public static double DictLoad_Span = 0.0;

        // Guards the one-time load so concurrent callers cannot write the shared trie together.
        private static readonly object DictLock = new object();

        public string strChinese = "[一-龥]";
        public string strNumber = "[0-9]";
        public string strEnglish = "[a-zA-Z]";

        /// <summary>
        /// Classifies <paramref name="Char"/> by testing it against the three pattern
        /// fields in the order Chinese, English, number.
        /// </summary>
        /// <param name="Char">Text to classify (the tokenizer passes one character).</param>
        /// <returns>0 = Chinese, 1 = English letter, 2 = digit, -1 = anything else.</returns>
        public int GetCharType(string Char)
        {
            // The static Regex.IsMatch overload reuses cached regex objects instead of
            // constructing a new Regex for every character.
            if (Regex.IsMatch(Char, this.strChinese))
            {
                return 0;
            }
            if (Regex.IsMatch(Char, this.strEnglish))
            {
                return 1;
            }
            if (Regex.IsMatch(Char, this.strNumber))
            {
                return 2;
            }
            return -1;
        }

        /// <summary>
        /// Loads the dictionary into <see cref="chartable"/> the first time it is called;
        /// later calls return immediately.
        /// </summary>
        public void LoadDict()
        {
            if (WordTree.DictLoaded)
            {
                return;
            }

            lock (DictLock)
            {
                // Re-check inside the lock: another caller may have finished loading
                // while this one was waiting.
                if (!WordTree.DictLoaded)
                {
                    this.BuidDictTree();
                    WordTree.DictLoaded = true;
                }
            }
        }

        /// <summary>
        /// Reads the dictionary file (UTF-8, one entry per line) and inserts every entry
        /// into the trie, then records the elapsed time in <see cref="DictLoad_Span"/>.
        /// </summary>
        private void BuidDictTree()
        {
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

            // The using block closes the file even when reading or inserting throws.
            using (StreamReader reader = new StreamReader(WordTree.DictPath, Encoding.UTF8))
            {
                // The root carries the "word" marker as well; ChineseTokenizer checks the
                // current node for it when it emits a token that never left the root.
                if (!WordTree.chartable.Contains("word"))
                {
                    WordTree.chartable.Add("word", null);
                }

                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Skip empty lines instead of treating the first one as end of file,
                    // so a stray blank line cannot silently truncate the dictionary.
                    if (line.Length == 0)
                    {
                        continue;
                    }

                    Hashtable node = WordTree.chartable;
                    for (int i = 0; i < line.Length; i++)
                    {
                        string key = line.Substring(i, 1);
                        if (!node.Contains(key))
                        {
                            node.Add(key, new Hashtable());
                        }
                        node = (Hashtable)node[key];
                    }

                    if (!node.Contains("word"))
                    {
                        node.Add("word", null);
                    }
                }
            }

            timer.Stop();
            WordTree.DictLoad_Span = timer.Elapsed.TotalSeconds;
        }
    }
}
4.添加cs文件 ChineseTokenizer.cs
using Lucene.Net.Analysis;
using System;
using System.Collections;
using System.IO;

namespace A.SplitString
{
    /// <summary>
    /// Tokenizer that reads the whole input up front and splits it by forward
    /// longest-match against the <see cref="WordTree"/> dictionary trie.
    /// Characters that do not start a dictionary entry are emitted as a single
    /// Chinese character, a lower-cased run of English letters, or a run of digits;
    /// whitespace and other symbols are skipped.
    /// </summary>
    internal class ChineseTokenizer : Tokenizer
    {
        // Offset in text at which the token currently being built starts.
        private int bufferIndex = 0;
        // Length of text.
        private int dataLen = 0;
        // Offset of the next character to examine.
        private int start;
        // Entire content of the reader.
        private string text;

        /// <summary>Reads <paramref name="reader"/> to the end and keeps the text for tokenizing.</summary>
        public ChineseTokenizer(TextReader reader)
        {
            this.input = reader;
            this.text = this.input.ReadToEnd();
            this.dataLen = this.text.Length;
        }

        /// <summary>
        /// Returns the next token, or null when the input is exhausted.
        /// Token offsets are character positions in the original text.
        /// </summary>
        public override Token Next()
        {
            WordTree wordTree = new WordTree();
            wordTree.LoadDict();

            Hashtable node = WordTree.chartable;   // current trie node, starting at the root
            string current = string.Empty;         // characters matched so far
            this.bufferIndex = this.start;

            // Longest acceptable prefix seen so far, used when the match has to be rolled back.
            string best = string.Empty;
            int bestEnd = this.start;              // index of the last character of best
            int bestStart = this.bufferIndex;      // start offset of best

            while (this.start < this.dataLen)
            {
                string ch = this.text.Substring(this.start, 1);
                bool isBlank = string.IsNullOrEmpty(ch.Trim());

                // Leading whitespace: nothing is being matched yet, so just move on.
                if (isBlank && current.Length == 0)
                {
                    this.start++;
                    this.bufferIndex = this.start;
                    continue;
                }

                // The character extends the current dictionary match.
                if (!isBlank && node.Contains(ch))
                {
                    current += ch;
                    node = (Hashtable)node[ch];

                    // A single character is always acceptable as a fallback token.
                    bool acceptable = node.Contains("word") || current.Length == 1;
                    if (acceptable)
                    {
                        best = current;
                        bestEnd = this.start;
                        bestStart = this.bufferIndex;
                    }

                    this.start++;
                    if (this.start != this.dataLen)
                    {
                        continue;
                    }

                    // End of input reached in the middle of a match.
                    if (acceptable)
                    {
                        return new Token(current, this.bufferIndex, this.bufferIndex + current.Length);
                    }
                    this.start = bestEnd + 1;
                    return new Token(best, bestStart, bestStart + best.Length);
                }

                // From here on the character ends the current match (or no match was started).
                if (current.Length == 0)
                {
                    int charType = wordTree.GetCharType(ch);
                    if (charType == -1)
                    {
                        // Symbol outside every category: skip it.
                        this.start++;
                        this.bufferIndex = this.start;
                        continue;
                    }

                    int end = this.start + 1;
                    if (charType == 0)
                    {
                        // Chinese character that starts no dictionary entry: emit it alone.
                        current = ch;
                    }
                    else
                    {
                        // Consume the whole run of letters (1) or digits (2).
                        while (end < this.dataLen
                            && wordTree.GetCharType(this.text.Substring(end, 1)) == charType)
                        {
                            end++;
                        }
                        current = this.text.Substring(this.start, end - this.start);
                        if (charType == 1)
                        {
                            // Culture-invariant so ASCII letters lower-case the same way
                            // on every server locale.
                            current = current.ToLowerInvariant();
                        }
                    }
                    this.start = end;
                }
                else if (isBlank || wordTree.GetCharType(ch) == -1)
                {
                    // Whitespace or a symbol terminates the match in progress; step over it.
                    // Previously whitespace was skipped here without ending the match, which
                    // shifted bufferIndex, let words match across blanks, and lost a matched
                    // word that was followed only by trailing whitespace.
                    this.start++;
                }

                if (node.Contains("word"))
                {
                    return new Token(current, this.bufferIndex, this.bufferIndex + current.Length);
                }

                // The match ended on a non-word node: fall back to the longest acceptable
                // prefix and resume scanning right after it.
                this.start = bestEnd + 1;
                return new Token(best, bestStart, bestStart + best.Length);
            }

            return null;
        }
    }
}
5.添加cs 文件 SplitAdapter.cs
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace A.SplitString
{
    /// <summary>
    /// Analyzer that chains <see cref="ChineseTokenizer"/>, <c>StandardFilter</c> and a
    /// <c>StopFilter</c> whose stop words are read from a text file.
    /// </summary>
    public class SplitAdapter : Analyzer
    {
        public static string[] CHINESE_ENGLISH_STOP_WORDS;

        /// <summary>
        /// Legacy fixed-size view of the most recently loaded stop words. Kept for
        /// existing callers; the analyzer itself uses its own per-instance list.
        /// </summary>
        public static readonly string[] Filter = new string[321];

        // Stop words loaded by this instance, sized to exactly what the file contains.
        private readonly string[] stopWords;

        /// <summary>
        /// Loads the stop words from <paramref name="path"/> (UTF-8, one word per line).
        /// </summary>
        /// <param name="path">Full path of the stop-word file.</param>
        public SplitAdapter(string path)
        {
            List<string> words = new List<string>();

            // The using block closes the file; the original reader was never closed.
            using (StreamReader reader = new StreamReader(path, Encoding.UTF8))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Skip empty lines rather than stopping at the first one.
                    if (line.Length == 0)
                    {
                        continue;
                    }
                    words.Add(line);
                }
            }

            this.stopWords = words.ToArray();

            // Mirror into the legacy array without running past its fixed length.
            // A file with more than Filter.Length lines used to throw
            // IndexOutOfRangeException at this point.
            int mirrored = Math.Min(this.stopWords.Length, SplitAdapter.Filter.Length);
            Array.Copy(this.stopWords, SplitAdapter.Filter, mirrored);
        }

        /// <summary>
        /// Builds the token stream for <paramref name="reader"/>; <paramref name="fieldName"/> is not used.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream tokenStream = new ChineseTokenizer(reader);
            tokenStream = new StandardFilter(tokenStream);

            // Pass the exact-size per-instance array: the fixed 321-slot Filter array holds
            // null entries whenever the file is shorter, which StopFilter presumably
            // rejects - TODO confirm against the Lucene.Net version in use.
            return new StopFilter(tokenStream, this.stopWords);
        }
    }
}
6.添加调用类 MatchingHelper(调用上面的分词器,返回拆分后的词汇列表)
using A.SplitString;
using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace A.Helper
{
    /// <summary>Convenience entry point for splitting a string into words.</summary>
    public class MatchingHelper
    {
        /// <summary>
        /// Tokenizes <paramref name="inputString"/> with <see cref="SplitAdapter"/>, using
        /// the stop-word file "~/sNoise.config" of the current web application.
        /// </summary>
        /// <param name="inputString">Text to split; null or empty yields an empty list.</param>
        /// <returns>The term text of every token, in order of appearance.</returns>
        public static List<string> GetMatchingList(string inputString)
        {
            List<string> resultList = new List<string>();

            // Nothing to tokenize: return early instead of letting StringReader throw
            // on null or reading the stop-word file for no reason.
            if (string.IsNullOrEmpty(inputString))
            {
                return resultList;
            }

            string snoisePath = System.Web.HttpContext.Current.Server.MapPath("~/sNoise.config");
            SplitAdapter analyzer = new SplitAdapter(snoisePath);
            TokenStream tokenStream = analyzer.TokenStream(null, new StringReader(inputString));

            try
            {
                for (Token token = tokenStream.Next(); token != null; token = tokenStream.Next())
                {
                    resultList.Add(token.TermText());
                }
            }
            finally
            {
                // Close the filter chain (and with it the underlying reader) even when
                // tokenizing throws.
                tokenStream.Close();
            }

            // This list holds the words the input was split into.
            return resultList;
        }
    }
}
1 0
- C# 分词算法,ChineseAnalyzer,源代码分析,其他地方的代码都是稀烂。。。。
- C#中文分词算法:ChineseAnalyzer
- 中文分词源代码分析
- 我不会代码设计--看着自己写的稀烂的代码 想哭了都。。
- C# 一个简单分词程序的思路和代码(六) 源代码 ,测试程序,词库下载地址
- baidu分词算法分析
- baidu分词算法分析
- baidu分词算法分析
- 百度分词算法分析
- C#中文分词算法:IKAnalyzerNet
- C# Lucene的使用详解及中文分词算法
- JDK源代码分析聚集篇-------Set分析(我们大家都是第一无二的)
- baidu分词算法分析之一
- baidu分词算法分析之一
- [原创]一个C#病毒源代码的分析
- 一个C#病毒源代码的分析
- 一个C#病毒源代码的分析
- 用mysql数据库写的分词算法代码
- Gym
- MFC 两个窗口 传递值
- HRBUST
- 斐波那契数列的四种简单实现方式
- Java OOP day03
- C# 分词算法,ChineseAnalyzer,源代码分析,其他地方的代码都是稀烂。。。。
- 蚂蚁感冒
- RHEL 5.7 Yum配置本地源[Errno 2] No such file or directory
- 海量数据处理问题之面试题
- ubuntu14.04搜狗输入法不能输入中文问题
- 设计模式(十五):原型模式
- Find the Duplicate Number
- hdu 2031 进制转换
- 申请人数大增美国大学录取条件看涨 备战雅思增强竞争力