[Java Web]敏感词过滤算法

来源:互联网 发布:用c语言写脚本 编辑:程序博客网 时间:2024/04/29 19:30

1.DFA算法

DFA算法的原理可以参考这里,简单来说就是通过Map构造出一棵敏感词树,树中每一条由根节点到“结束”节点的路径构成一个敏感词,例如下图:


代码简单实现如下:

public class TextFilterUtil {    //日志    private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class);    //敏感词库    private static HashMap sensitiveWordMap = null;    //默认编码格式    private static final String ENCODING = "gbk";    //敏感词库的路径    private static final InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream("sensitive/keyWords.txt");    /**     * 初始化敏感词库     */    private static void init() {        //读取文件        Set<String> keyWords = readSensitiveWords();        //创建敏感词库        sensitiveWordMap = new HashMap<>(keyWords.size());        for (String keyWord : keyWords) {            createKeyWord(keyWord);        }    }    /**     * 构建敏感词库     *     * @param keyWord     */    private static void createKeyWord(String keyWord) {        if (sensitiveWordMap == null) {            LOG.error("sensitiveWordMap 未初始化!");            return;        }        Map nowMap = sensitiveWordMap;        for (Character c : keyWord.toCharArray()) {            Object obj = nowMap.get(c);            if (obj == null) {                Map<String, Object> childMap = new HashMap<>();                childMap.put("isEnd", "false");                nowMap.put(c, childMap);                nowMap = childMap;            } else {                nowMap = (Map) obj;            }        }        nowMap.put("isEnd", "true");    }    /**     * 读取敏感词文件     *     * @return     */    private static Set<String> readSensitiveWords() {        Set<String> keyWords = new HashSet<>();        BufferedReader reader = null;        try {            reader = new BufferedReader(new InputStreamReader(in, ENCODING));            String line;            while ((line = reader.readLine()) != null) {                keyWords.add(line.trim());            }        } catch (UnsupportedEncodingException e) {            LOG.error("敏感词库文件转码失败!");        } catch (FileNotFoundException e) {            LOG.error("敏感词库文件不存在!");        } catch (IOException e) {            
LOG.error("敏感词库文件读取失败!");        } finally {            if (reader != null) {                try {                    reader.close();                } catch (IOException e) {                    e.printStackTrace();                }                reader = null;            }        }        return keyWords;    }    /**     * 检查敏感词     *     * @return     */    private static List<String> checkSensitiveWord(String text) {        if (sensitiveWordMap == null) {            init();        }        List<String> sensitiveWords = new ArrayList<>();        Map nowMap = sensitiveWordMap;        for (int i = 0; i < text.length(); i++) {            Character word = text.charAt(i);            Object obj = nowMap.get(word);            if (obj == null) {                continue;            }            int j = i + 1;            Map childMap = (Map) obj;            while (j < text.length()) {                if ("true".equals(childMap.get("isEnd"))) {                    sensitiveWords.add(text.substring(i, j));                }                obj = childMap.get(text.charAt(j));                if (obj != null) {                    childMap = (Map) obj;                } else {                    break;                }                j++;            }        }        return sensitiveWords;    }}


2.TTMP算法

TTMP算法由网友原创,关于它的起源可以查看这里,TTMP算法的原理是将敏感词拆分成“脏字”的序列,只有待比对字符串完全由“脏字”组成时,才去判断它是否为敏感词,减少了比对次数。这个算法的简单实现如下:

public class TextFilterUtil {    //日志    private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class);    //默认编码格式    private static final String ENCODING = "gbk";    //敏感词库的路径    private static final InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream("sensitive/keyWords.txt");    //脏字库    private static Set<Character> sensitiveCharSet = null;    //敏感词库    private static Set<String> sensitiveWordSet = null;    /**     * 初始化敏感词库     */    private static void init() {        //初始化容器        sensitiveCharSet = new HashSet<>();        sensitiveWordSet = new HashSet<>();        //读取文件 创建敏感词库        readSensitiveWords();    }    /**     * 读取本地的敏感词文件     *     * @return     */    private static void readSensitiveWords() {        BufferedReader reader = null;        try {            reader = new BufferedReader(new InputStreamReader(in, ENCODING));            String line;            while ((line = reader.readLine()) != null) {                String word = line.trim();                sensitiveWordSet.add(word);                for (Character c : word.toCharArray()) {                    sensitiveCharSet.add(c);                }            }        } catch (UnsupportedEncodingException e) {            LOG.error("敏感词库文件转码失败!");        } catch (FileNotFoundException e) {            LOG.error("敏感词库文件不存在!");        } catch (IOException e) {            LOG.error("敏感词库文件读取失败!");        } finally {            if (reader != null) {                try {                    reader.close();                } catch (IOException e) {                    e.printStackTrace();                }                reader = null;            }        }        return;    }    /**     * 检查敏感词     *     * @return     */    private static List<String> checkSensitiveWord(String text) {        if (sensitiveWordSet == null || sensitiveCharSet == null) {            init();        }        List<String> sensitiveWords = new ArrayList<>();        for (int i = 0; i < 
text.length(); i++) {            Character word = text.charAt(i);            if (!sensitiveCharSet.contains(word)) {                continue;            }            int j = i;            while (j < text.length()) {                if (!sensitiveCharSet.contains(word)) {                    break;                }                String key = text.substring(i, j + 1);                if (sensitiveWordSet.contains(key)) {                    sensitiveWords.add(key);                }                j++;            }        }        return sensitiveWords;    }}

注:以上代码实现仅用于展示思路,在实际使用中还有很多地方可以优化。

原文地址:http://blog.csdn.net/sinat_19425927/article/details/42872129

原创粉丝点击