敏感词汇过滤DFA算法

来源:互联网 发布:sql怎么备份数据库 编辑:程序博客网 时间:2024/05/16 03:36
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SensitiveWordFilter
{
    /// <summary>
    /// Sensitive-word filter built on a character trie (the "DFA" model).
    /// </summary>
    /// <remarks>
    /// Every trie node is a <c>Dictionary&lt;char, object&gt;</c>. Ordinary keys map to child
    /// nodes; the reserved key <c>'$'</c> maps to the string "1" when a word ends at that node
    /// and "0" otherwise. For the words "ab" and "abc" the trie looks like:
    /// <code>
    /// a = { $ = "0", b = { $ = "1", c = { $ = "1" } } }
    /// </code>
    /// Because '$' is reserved it can never be part of a stored word: it is dropped from
    /// dictionary words and skipped over while scanning text.
    /// </remarks>
    public class SensitiveWord
    {
        // Reserved trie key that carries the end-of-word flag ("0" / "1").
        private static readonly char IsEndChar = '$';

        /// <summary>
        /// Loads the word list and builds the trie used by <see cref="SensitivewordFilter"/>.
        /// </summary>
        public class SensitiveWordInit
        {
            // Encoding name used to read the dictionary file.
            private static readonly String ENCODING = "UTF-8";

            // Dictionary file, one word per line, resolved against the current directory.
            private static readonly String DictionaryFile = "dic.txt";

            /// <summary>
            /// Builds the trie from the words stored in <c>dic.txt</c>.
            /// </summary>
            /// <returns>The root node of the trie.</returns>
            public Dictionary<char, object> initKeyWord()
            {
                return addSensitiveWordToHashMap(readSensitiveWordFile());
            }

            /// <summary>
            /// Builds the trie from an in-memory word list instead of the dictionary file.
            /// </summary>
            /// <param name="words">Words to filter; null entries are ignored.</param>
            /// <returns>The root node of the trie.</returns>
            /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
            public Dictionary<char, object> initKeyWord(IEnumerable<String> words)
            {
                if (words == null)
                {
                    throw new ArgumentNullException("words");
                }
                return addSensitiveWordToHashMap(new HashSet<String>(words));
            }

            /// <summary>
            /// Inserts every word of <paramref name="wordSet"/> into a fresh trie.
            /// </summary>
            /// <param name="wordSet">Words to insert.</param>
            /// <returns>The root node of the trie.</returns>
            private Dictionary<char, object> addSensitiveWordToHashMap(HashSet<String> wordSet)
            {
                // Pre-size the root by the number of words to limit rehashing.
                Dictionary<char, object> wordMap = new Dictionary<char, object>(wordSet.Count);
                foreach (String word in wordSet)
                {
                    if (word == null)
                    {
                        continue;
                    }

                    Dictionary<char, object> node = wordMap;
                    foreach (char keyChar in word)
                    {
                        // The reserved marker cannot be used as a child key.
                        if (keyChar == IsEndChar)
                        {
                            continue;
                        }

                        object child;
                        if (node.TryGetValue(keyChar, out child))
                        {
                            node = (Dictionary<char, object>)child;
                        }
                        else
                        {
                            // New nodes start as "not a word end".
                            Dictionary<char, object> newMap = new Dictionary<char, object>();
                            newMap.Add(IsEndChar, "0");
                            node.Add(keyChar, newMap);
                            node = newMap;
                        }
                    }

                    // Mark the end after the loop so that a word whose last character is the
                    // reserved marker is still terminated. Words that contributed no
                    // characters leave the cursor on the root, which must stay unmarked.
                    if (!Object.ReferenceEquals(node, wordMap))
                    {
                        node[IsEndChar] = "1";
                    }
                }
                return wordMap;
            }

            /// <summary>
            /// Reads the dictionary file into a set, one word per line.
            /// </summary>
            /// <returns>The distinct lines of the file, without blank lines.</returns>
            private HashSet<String> readSensitiveWordFile()
            {
                HashSet<String> wordSet = new HashSet<String>();
                foreach (String line in File.ReadLines(DictionaryFile, Encoding.GetEncoding(ENCODING)))
                {
                    // Blank lines carry no word; other lines are kept verbatim (no trimming).
                    if (!String.IsNullOrWhiteSpace(line))
                    {
                        wordSet.Add(line);
                    }
                }
                return wordSet;
            }
        }

        /// <summary>
        /// Detects, lists and masks sensitive words in text.
        /// </summary>
        public class SensitivewordFilter
        {
            // Matches shorter than this many characters are never reported.
            private const int MinWordLength = 2;

            private Dictionary<char, object> sensitiveWordMap = null;

            // Match rule: stop at the shortest word found at a position.
            public static int minMatchTYpe = 1;

            // Match rule: keep going and report the longest word found at a position.
            public static int maxMatchType = 2;

            // Lazily created shared instance (creation is not synchronized).
            private static SensitivewordFilter inst = null;

            /// <summary>
            /// Creates the filter from the dictionary file.
            /// </summary>
            private SensitivewordFilter()
            {
                sensitiveWordMap = new SensitiveWordInit().initKeyWord();
            }

            /// <summary>
            /// Creates a filter from an in-memory word list.
            /// </summary>
            /// <param name="words">Words to filter; null entries are ignored.</param>
            /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
            public SensitivewordFilter(IEnumerable<String> words)
            {
                sensitiveWordMap = new SensitiveWordInit().initKeyWord(words);
            }

            /// <summary>
            /// Returns the shared filter, creating it from the dictionary file on first use.
            /// </summary>
            public static SensitivewordFilter getInstance()
            {
                if (null == inst)
                {
                    inst = new SensitivewordFilter();
                }
                return inst;
            }

            /// <summary>
            /// Tells whether <paramref name="txt"/> contains at least one sensitive word.
            /// </summary>
            /// <param name="txt">Text to scan; null or empty yields false.</param>
            /// <param name="matchType"><see cref="minMatchTYpe"/> or <see cref="maxMatchType"/>.</param>
            public bool isContaintSensitiveWord(String txt, int matchType = 1)
            {
                if (String.IsNullOrEmpty(txt))
                {
                    return false;
                }
                for (int i = 0; i < txt.Length; i++)
                {
                    // The first hit settles the answer; no need to scan the rest.
                    if (CheckSensitiveWord(txt, i, matchType) > 0)
                    {
                        return true;
                    }
                }
                return false;
            }

            /// <summary>
            /// Collects the distinct sensitive words that occur in <paramref name="txt"/>.
            /// </summary>
            /// <param name="txt">Text to scan; null or empty yields an empty set.</param>
            /// <param name="matchType"><see cref="minMatchTYpe"/> or <see cref="maxMatchType"/>.</param>
            /// <returns>The matched substrings of <paramref name="txt"/>.</returns>
            public HashSet<String> getSensitiveWord(String txt, int matchType = 1)
            {
                HashSet<String> sensitiveWordList = new HashSet<String>();
                if (String.IsNullOrEmpty(txt))
                {
                    return sensitiveWordList;
                }
                for (int i = 0; i < txt.Length; i++)
                {
                    int length = CheckSensitiveWord(txt, i, matchType);
                    if (length > 0)
                    {
                        sensitiveWordList.Add(txt.Substring(i, length));
                        // Jump past the match; minus one because the loop increments i.
                        i = i + length - 1;
                    }
                }
                return sensitiveWordList;
            }

            /// <summary>
            /// Replaces every character of every sensitive word with <paramref name="replaceChar"/>.
            /// </summary>
            /// <param name="txt">Text to mask; null or empty is returned unchanged.</param>
            /// <param name="replaceChar">Text emitted once per masked character; may be any length.</param>
            /// <param name="matchType"><see cref="minMatchTYpe"/> or <see cref="maxMatchType"/>.</param>
            /// <returns>The masked text.</returns>
            public String replaceSensitiveWord(String txt, String replaceChar, int matchType = 1)
            {
                if (String.IsNullOrEmpty(txt))
                {
                    return txt;
                }

                // The result is appended piece by piece rather than edited in place, so the
                // scan offsets stay valid even when the replacement changes the text length.
                StringBuilder sb = new StringBuilder(txt.Length);
                int i = 0;
                while (i < txt.Length)
                {
                    int length = CheckSensitiveWord(txt, i, matchType);
                    if (length > 0)
                    {
                        sb.Append(getReplaceChars(replaceChar, length));
                        i += length;
                    }
                    else
                    {
                        sb.Append(txt[i]);
                        i++;
                    }
                }
                return sb.ToString();
            }

            /// <summary>
            /// Repeats <paramref name="replaceChar"/> <paramref name="length"/> times.
            /// </summary>
            private String getReplaceChars(String replaceChar, int length)
            {
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < length; i++)
                {
                    sb.Append(replaceChar);
                }
                return sb.ToString();
            }

            /// <summary>
            /// Looks for a sensitive word that starts exactly at <paramref name="beginIndex"/>.
            /// </summary>
            /// <param name="txt">Text to scan.</param>
            /// <param name="beginIndex">Index of the first character of the candidate word.</param>
            /// <param name="matchType">
            /// <see cref="minMatchTYpe"/> returns the shortest word; any other value returns the longest.
            /// </param>
            /// <returns>
            /// Number of characters of <paramref name="txt"/> covered by the match (reserved
            /// marker characters skipped inside the match are included), or 0 when no word of
            /// at least two characters starts at <paramref name="beginIndex"/>.
            /// </returns>
            public int CheckSensitiveWord(String txt, int beginIndex, int matchType)
            {
                if (String.IsNullOrEmpty(txt) || beginIndex < 0 || beginIndex >= txt.Length)
                {
                    return 0;
                }
                // No stored word contains the reserved marker, so none can start on it.
                if (txt[beginIndex] == IsEndChar)
                {
                    return 0;
                }

                Dictionary<char, object> node = sensitiveWordMap;
                int depth = 0;          // trie characters consumed so far
                int matchedLength = 0;  // span of the last complete word seen
                for (int i = beginIndex; i < txt.Length; i++)
                {
                    char current = txt[i];
                    // The marker key holds the end flag, not a child node; step over it.
                    if (current == IsEndChar)
                    {
                        continue;
                    }

                    object child;
                    if (!node.TryGetValue(current, out child))
                    {
                        break;
                    }
                    node = (Dictionary<char, object>)child;
                    depth++;

                    if (depth >= MinWordLength && isWordEnd(node))
                    {
                        // Remember the span so a longer path that later fails, or text that
                        // ends mid-path, still reports this word.
                        matchedLength = i - beginIndex + 1;
                        if (SensitivewordFilter.minMatchTYpe == matchType)
                        {
                            break;
                        }
                    }
                }
                return matchedLength;
            }

            // True when a dictionary word ends at this trie node.
            private static bool isWordEnd(Dictionary<char, object> node)
            {
                object marker;
                return node.TryGetValue(IsEndChar, out marker) && "1".Equals(marker as string);
            }
        }
    }
}

namespace SensitiveWordFilter
{
    /// <summary>
    /// Console demo: masks the sensitive words of a sample sentence using the words in dic.txt.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            SensitiveWord.SensitivewordFilter filter = SensitiveWord.SensitivewordFilter.getInstance();
            String txt = "$fuckfuck you你麻痹e菜太菜了fuckyou从飞啊 fuck you";
            String hou = filter.replaceSensitiveWord(txt, "*");
            Console.WriteLine("替换前的文字为:" + txt);
            Console.WriteLine("替换后的文字为:" + hou);
            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
    }
}
0 0