C# 分词算法:ChineseAnalyzer 源代码分析(其他地方的代码都是稀烂……)

来源:互联网 发布:sap hana数据库 编辑:程序博客网 时间:2024/05/16 14:45

1.引用文件下载地址:

http://www.piaoyi.org/upimg/file071127_08/02/ChineseAnalyzer.rar

2.引用一个Lucene.Net.dll文件

3.添加新类库文件 WordTree.cs

using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace A.SplitString
{
    /// <summary>
    /// Character trie built from the dictionary file, shared by every tokenizer.
    /// Each node is a <see cref="Hashtable"/> keyed by a one-character string whose
    /// value is the child node; a node that also holds the key "word" marks the end
    /// of a complete dictionary entry.
    /// </summary>
    public class WordTree
    {
        // Dictionary file that must be deployed with the site: sDict.txt.
        // Resolved through HttpContext.Current, so the first access to the static
        // members of this type has to happen inside a web request.
        private static string DictPath = System.Web.HttpContext.Current.Server.MapPath("~/sDict.txt");

        // Serializes the one-time dictionary load across concurrent requests.
        private static readonly object DictLock = new object();

        public static Hashtable chartable = new Hashtable();
        public static bool DictLoaded = false;

        // NOTE(review): never assigned anywhere in this file; presumably meant to
        // hold the dictionary load time. Unit unknown, so it is left untouched.
        public static double DictLoad_Span = 0.0;

        public string strChinese = "[一-龥]";
        public string strNumber = "[0-9]";
        public string strEnglish = "[a-zA-Z]";

        /// <summary>
        /// Classifies a piece of text by the first pattern it matches.
        /// </summary>
        /// <param name="Char">Text to classify (callers pass a single character).</param>
        /// <returns>
        /// 0 when it matches <see cref="strChinese"/>, 1 for <see cref="strEnglish"/>,
        /// 2 for <see cref="strNumber"/>, -1 otherwise. Patterns are tried in that order.
        /// </returns>
        public int GetCharType(string Char)
        {
            // The static Regex.IsMatch overload reuses cached parsed patterns instead
            // of constructing up to three Regex objects per character, while still
            // honouring pattern fields a caller may have reassigned.
            if (Regex.IsMatch(Char, this.strChinese))
            {
                return 0;
            }

            if (Regex.IsMatch(Char, this.strEnglish))
            {
                return 1;
            }

            if (Regex.IsMatch(Char, this.strNumber))
            {
                return 2;
            }

            return -1;
        }

        /// <summary>
        /// Builds the shared trie on first use; later calls return immediately.
        /// </summary>
        public void LoadDict()
        {
            if (WordTree.DictLoaded)
            {
                return;
            }

            lock (DictLock)
            {
                // Re-check inside the lock: another request may have finished the
                // load while this one was waiting.
                if (!WordTree.DictLoaded)
                {
                    this.BuidDictTree();
                    WordTree.DictLoaded = true;
                }
            }
        }

        /// <summary>
        /// Reads the dictionary file line by line (UTF-8) and inserts every line into
        /// <see cref="chartable"/>, one character per trie level. Reading stops at the
        /// first empty line or at end of file.
        /// </summary>
        private void BuidDictTree()
        {
            // 'using' guarantees the file handle is released even if a read throws.
            using (StreamReader streamReader = new StreamReader(WordTree.DictPath, Encoding.UTF8))
            {
                // The root carries "word" as well; ChineseTokenizer relies on that to
                // emit tokens for text that is not in the dictionary.
                if (!WordTree.chartable.Contains("word"))
                {
                    WordTree.chartable.Add("word", null);
                }

                string line = streamReader.ReadLine();
                while (!string.IsNullOrEmpty(line))
                {
                    Hashtable node = WordTree.chartable;
                    for (int i = 0; i < line.Length; i++)
                    {
                        string key = line.Substring(i, 1);

                        // One-character keys always map to a child Hashtable (only the
                        // four-character "word" key maps to null), so a null lookup
                        // means the child does not exist yet.
                        Hashtable child = (Hashtable)node[key];
                        if (child == null)
                        {
                            child = new Hashtable();
                            node.Add(key, child);
                        }

                        node = child;
                    }

                    if (!node.Contains("word"))
                    {
                        node.Add("word", null);
                    }

                    line = streamReader.ReadLine();
                }
            }
        }
    }
}

4.添加cs文件 ChineseTokenizer.cs

using Lucene.Net.Analysis;
using System;
using System.Collections;
using System.IO;

namespace A.SplitString
{
    /// <summary>
    /// Dictionary-driven tokenizer: walks the shared <see cref="WordTree"/> trie over
    /// the input and emits the longest dictionary word found at each position.
    /// Text that is not in the dictionary is emitted as a single Chinese character,
    /// a lower-cased run of English letters, or a run of digits.
    /// </summary>
    internal class ChineseTokenizer : Tokenizer
    {
        // Created once per tokenizer instead of once per Next() call.
        private readonly WordTree wordTree = new WordTree();

        private int bufferIndex = 0;   // start offset of the token being assembled
        private int dataLen = 0;       // length of the buffered input
        private int start;             // index of the next character to examine
        private string text;           // the whole input, read eagerly

        /// <summary>Reads the entire <paramref name="reader"/> into memory.</summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        public ChineseTokenizer(TextReader reader)
        {
            this.input = reader;
            this.text = this.input.ReadToEnd();
            this.dataLen = this.text.Length;
        }

        /// <summary>
        /// Returns the next token, or null when the input is exhausted.
        /// </summary>
        public override Token Next()
        {
            this.wordTree.LoadDict();

            Hashtable node = WordTree.chartable;   // current trie position
            string current = string.Empty;         // characters matched so far
            this.bufferIndex = this.start;

            // Last position at which 'current' was a complete word; used to back
            // off when a longer match attempt dead-ends.
            string lastWord = string.Empty;
            int lastWordIndex = this.start;
            int lastWordOffset = this.bufferIndex;

            while (this.start < this.dataLen)
            {
                string ch = this.text.Substring(this.start, 1);

                // NOTE(review): whitespace is skipped and the token start is reset
                // even when 'current' already holds a partial match, so a match can
                // continue across the gap. Left as in the original - confirm intent.
                if (string.IsNullOrEmpty(ch.Trim()))
                {
                    this.start++;
                    this.bufferIndex = this.start;
                    continue;
                }

                if (node.Contains(ch))
                {
                    // Descend one trie level.
                    current += ch;
                    node = (Hashtable)node[ch];

                    // A single character always counts as a fallback word.
                    bool isWord = node.Contains("word") || current.Length == 1;
                    if (isWord)
                    {
                        lastWord = current;
                        lastWordIndex = this.start;
                        lastWordOffset = this.bufferIndex;
                    }

                    this.start++;
                    if (this.start != this.dataLen)
                    {
                        continue;
                    }

                    // End of input reached in the middle of a trie walk.
                    if (isWord)
                    {
                        return new Token(current, this.bufferIndex, this.bufferIndex + current.Length);
                    }

                    this.start = lastWordIndex + 1;
                    return new Token(lastWord, lastWordOffset, lastWordOffset + lastWord.Length);
                }

                if (current == string.Empty)
                {
                    // Nothing matched yet and the character does not begin any
                    // dictionary word: emit it by character class.
                    int runEnd = this.start + 1;
                    switch (this.wordTree.GetCharType(ch))
                    {
                        case 0:
                            current += ch;
                            break;
                        case 1:
                            runEnd = this.FindRunEnd(runEnd, 1);
                            // Invariant lower-casing keeps terms identical regardless
                            // of the server culture (ToLower() maps 'I' to a dotless
                            // i under Turkish/Azeri cultures).
                            current += this.text.Substring(this.start, runEnd - this.start).ToLowerInvariant();
                            break;
                        case 2:
                            runEnd = this.FindRunEnd(runEnd, 2);
                            current += this.text.Substring(this.start, runEnd - this.start);
                            break;
                        default:
                            // Punctuation or other symbol: skip it.
                            this.start++;
                            this.bufferIndex = this.start;
                            continue;
                    }

                    this.start = runEnd;
                }
                else if (this.wordTree.GetCharType(ch) == -1)
                {
                    // A symbol ends the current match and is consumed.
                    this.start++;
                }

                if (node.Contains("word"))
                {
                    return new Token(current, this.bufferIndex, this.bufferIndex + current.Length);
                }

                // Dead end: fall back to the last complete word and resume after it.
                this.start = lastWordIndex + 1;
                return new Token(lastWord, lastWordOffset, lastWordOffset + lastWord.Length);
            }

            return null;
        }

        /// <summary>
        /// Returns the index of the first character at or after <paramref name="from"/>
        /// whose character type differs from <paramref name="charType"/>, or the input
        /// length when the run extends to the end.
        /// </summary>
        private int FindRunEnd(int from, int charType)
        {
            int i = from;
            while (i < this.dataLen && this.wordTree.GetCharType(this.text.Substring(i, 1)) == charType)
            {
                i++;
            }

            return i;
        }
    }
}

5.添加cs文件 SplitAdapter.cs

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace A.SplitString
{
    /// <summary>
    /// Lucene analyzer that chains <see cref="ChineseTokenizer"/>, a
    /// <see cref="StandardFilter"/> and a <see cref="StopFilter"/> fed from a
    /// stop-word file.
    /// </summary>
    public class SplitAdapter : Analyzer
    {
        public static string[] CHINESE_ENGLISH_STOP_WORDS;

        // Shared stop-word table; the fixed capacity is part of the public surface.
        public static readonly string[] Filter = new string[321];

        /// <summary>
        /// Loads stop words from <paramref name="path"/> (UTF-8, one per line) into
        /// <see cref="Filter"/>. Reading stops at the first empty line, at end of
        /// file, or once <see cref="Filter"/> is full.
        /// </summary>
        /// <param name="path">Path of the stop-word file.</param>
        public SplitAdapter(string path)
        {
            // 'using' releases the file handle; the reader was previously left open.
            using (StreamReader streamReader = new StreamReader(path, Encoding.UTF8))
            {
                string line = streamReader.ReadLine();
                int index = 0;

                // Bounded by the table size: a file longer than the table used to
                // fail with IndexOutOfRangeException; extra lines are now ignored.
                while (!string.IsNullOrEmpty(line) && index < SplitAdapter.Filter.Length)
                {
                    SplitAdapter.Filter[index] = line;
                    line = streamReader.ReadLine();
                    index++;
                }
            }
        }

        /// <summary>
        /// Builds the token stream for <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Not used.</param>
        /// <param name="reader">Text to analyze.</param>
        /// <returns>The tokenizer output with stop words removed.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream tokenStream = new ChineseTokenizer(reader);
            tokenStream = new StandardFilter(tokenStream);

            // Slots the file did not fill are null and are not stop words, so only
            // the populated entries are handed to the filter.
            string[] stopWords = SplitAdapter.Filter.Where(word => word != null).ToArray();
            return new StopFilter(tokenStream, stopWords);
        }
    }
}

6.实现类库

using A.SplitString;
using Lucene.Net.Analysis;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace A.Helper
{
    /// <summary>
    /// Entry point for splitting a string into words with <see cref="SplitAdapter"/>.
    /// </summary>
    public class MatchingHelper
    {
        /// <summary>
        /// Tokenizes <paramref name="inputString"/> and returns the term text of every
        /// token in order.
        /// </summary>
        /// <param name="inputString">Text to split; null or empty yields an empty list.</param>
        /// <returns>The words produced by the analyzer; never null.</returns>
        public static List<string> GetMatchingList(string inputString)
        {
            List<string> resultList = new List<string>();

            // StringReader rejects null, and there is nothing to tokenize in an
            // empty string, so both cases short-circuit to an empty result.
            if (string.IsNullOrEmpty(inputString))
            {
                return resultList;
            }

            // SplitAdapter lives in A.SplitString, which is not an enclosing namespace
            // of A.Helper; the using directive above is what makes it resolve.
            string snoisePath = System.Web.HttpContext.Current.Server.MapPath("~/sNoise.config");
            SplitAdapter analyzer = new SplitAdapter(snoisePath);

            using (StringReader reader = new StringReader(inputString))
            {
                TokenStream tokenStream = analyzer.TokenStream(null, reader);
                for (Token token = tokenStream.Next(); token != null; token = tokenStream.Next())
                {
                    resultList.Add(token.TermText());
                }
            }

            // This list holds the words the input was split into.
            return resultList;
        }
    }
}
1 0