【Java】二分查找敏感字词过滤算法

来源:互联网 发布:unity3d 如何三维建模 编辑:程序博客网 时间:2024/03/29 04:25
package com.hz.yisheng.portal.mobi.goodman.util;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.Arrays;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Map.Entry;/** * 过滤敏感词,并把敏感词替换成* * * @author WWX */public class SensitiveWordUtils {private static StringENCODING= "GBK"; /* 字符编码 */static ArrayList<String>first= new ArrayList<String>();staticString[] sortFirst;static char[]charFirst;static HashMap<String, ArrayList<String> >map= new HashMap<String, ArrayList<String> >();static HashMap<String, String[]>sortMap = new HashMap<String, String[]>();static HashMap<String, char[]>charMap = new HashMap<String, char[]>();static ArrayList<String>temp;static Stringkey, value;intlength;/** * 带参数的构造函数 * * @param keys *            敏感词 * @param tContent *            需要过滤的内容 */public SensitiveWordUtils( List<String> keys, String tContent ){for ( String k : keys ){if ( !first.contains( k.substring( 0, 1 ) ) ){first.add( k.substring( 0, 1 ) );}length = k.length();for ( int i = 1; i < length; i++ ){key= k.substring( 0, i );value= k.substring( i, i + 1 );if ( i == 1 && !first.contains( key ) ){first.add( key );}/* 有,添加 */if ( map.containsKey( key ) ){if ( !map.get( key ).contains( value ) ){map.get( key ).add( value );}}/* 没有添加 */else {temp = new ArrayList<String>();temp.add( value );map.put( key, temp );}}}sortFirst = first.toArray( new String[first.size()] );Arrays.sort( sortFirst ); /* 排序 */charFirst = new char[first.size()];for ( int i = 0; i < charFirst.length; i++ ){charFirst[i] = first.get( i ).charAt( 0 );}Arrays.sort( charFirst ); /* 排序 */String[] sortValue;ArrayList<String>v;Map.Entry<String, ArrayList<String> >entry;Iterator < Entry < String, ArrayList < String >>> iter = map.entrySet() .iterator();while ( iter.hasNext() ){entry= (Map.Entry<String, ArrayList<String> >)iter.next();v= (ArrayList<String>)entry.getValue();sortValue= v.toArray( new String[v.size()] );Arrays.sort( sortValue ); /* 排序 */sortMap.put( entry.getKey(), sortValue );}char[] charValue;iter = map.entrySet().iterator();while ( iter.hasNext() ){entry= (Map.Entry<String, ArrayList<String> >)iter.next();v= (ArrayList<String>)entry.getValue();charValue= new char[v.size()];for ( int i = 0; i < charValue.length; i++ ){charValue[i] = v.get( i ).charAt( 0 );}Arrays.sort( charValue ); /* 排序 */charMap.put( entry.getKey(), charValue );}}/** * 把敏感词替换成* * * @param content *            需要过滤的内容 * @return 过滤完后的符合要求的内容 */public String replace( String content ){Stringr= null, f, c = content;Stringreplacedword= content;charg;char[] temps;int length = c.length();for ( int i = 0; i < length - 1; i++ ){g = c.charAt( i );/* 二分查找 */if ( Arrays.binarySearch( charFirst, g ) > -1 ){tag: for ( int j = i + 1; j < length; j++ ){f= c.substring( i, j );g= c.charAt( j );temps= charMap.get( f );if ( temps == null ) /* 找到了 */{ /* System.out.println("ok"); */r = f;String str = "";for ( int m = 1; m <= r.length(); m++ ){str = str + "*";}replacedword= c.replace( r, str );c= replacedword;break tag;}/* 二分查找 */if ( Arrays.binarySearch( temps, g ) > -1 ){if ( j == length - 1 ){/* * print("find!"); * System.out.println("find!"); */r = c.substring( i, j + 1 );String str = "";for ( int m = 1; m <= r.length(); m++ ){str = str + "*";}replacedword= c.replace( r, str );c= replacedword;break tag;}} else { /* 没有找到了 */break;}}}}return(replacedword);}@SuppressWarnings( "resource" )public static void main( String[] args ) throws Exception{String string = "太多的伤感情怀也许只局限于阿扁推翻 荧幕中的情节,臭狗屎尝试着去用某种方式渐渐的很潇洒地释自杀指南怀那些自己经历的政府无能。"+ "然后法轮功 我们的扮演的推翻政府就是跟随着主人公的喜红客联盟 怒哀乐而过于牵强的把自己的情感也附加于银幕情节中,然后感动就流泪,"+ "难过就躺在某一个人的怀里尽情的阐述心扉或者手机卡复制器一个人一杯红酒一部电影在夜三级片深人静的晚上,关上电话静静的发呆着。";System.out.println( "待检测语句字数:" + string.length() );long beginTime = System.currentTimeMillis();/* 读取敏感字文档,将其保存到列表中 */Filefile= new File( "../yisheng-static/userfiles/SensitiveWord.txt" );  /* 读取文件 */InputStreamReaderread= new InputStreamReader(new FileInputStream( file ), ENCODING );List<String> keys = null;try {if ( file.isFile() && file.exists() )                                                   /* 文件流是否存在 */{keys = new ArrayList<String>();BufferedReaderbufferedReader= new BufferedReader( read );Stringtxt= null;while ( (txt = bufferedReader.readLine() ) != null )                            /* 读取文件,将文件内容放入到set中 */{keys.add( txt );}} else {                                                                                /* 不存在抛出异常信息 */throw new Exception( "敏感词库文件不存在" );}} catch ( Exception e ) {throw e;} finally {read.close();                                                                           /* 关闭文件流 */}SensitiveWordUtils swUtil = new SensitiveWordUtils( keys, string );System.out.println( swUtil.replace( string ) );long endTime = System.currentTimeMillis();System.out.println( "总共消耗时间为:" + (endTime - beginTime) );}}


0 0
原创粉丝点击