文本标签过滤

来源:互联网 发布:百度大数据合作 编辑:程序博客网 时间:2024/05/22 14:44
/**
 * Regex-based helpers that strip HTML markup from a string.
 *
 * <p>All patterns are compiled once and are immutable; a fresh
 * {@link java.util.regex.Matcher} is created per call, so both methods are
 * safe to call from multiple threads.
 *
 * <p>This is plain text substitution, not an HTML parser or a sanitizer: it
 * must not be relied on to make untrusted markup safe for rendering.
 */
public class HtmlCleaner {

  /**
   * Matches a whole {@code <script ...>...</script>} element, body included.
   * Whitespace is tolerated after {@code <}, around {@code /} and before the
   * final {@code >}.
   */
  private static final java.util.regex.Pattern SCRIPT_PATTERN =
      java.util.regex.Pattern.compile(
          "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
          java.util.regex.Pattern.CASE_INSENSITIVE);

  /**
   * Matches a whole {@code <style ...>...</style>} element, body included,
   * with the same whitespace tolerance as {@link #SCRIPT_PATTERN}.
   */
  private static final java.util.regex.Pattern STYLE_PATTERN =
      java.util.regex.Pattern.compile(
          "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
          java.util.regex.Pattern.CASE_INSENSITIVE);

  /**
   * Matches an opening {@code <a ...>} tag. At least one whitespace character
   * is required after the {@code a}, so a bare {@code <a>}, a closing
   * {@code </a>} and other tags that merely start with "a" (for example
   * {@code <abbr>}) are not matched.
   */
  private static final java.util.regex.Pattern A_OPEN_PATTERN =
      java.util.regex.Pattern.compile(
          "<\\s*a\\s+([^>]*)\\s*>",
          java.util.regex.Pattern.CASE_INSENSITIVE);

  /** Matches any single tag: {@code <}, one or more non-{@code >} characters, {@code >}. */
  private static final java.util.regex.Pattern HTML_TAG_PATTERN =
      java.util.regex.Pattern.compile(
          "<[^>]+>",
          java.util.regex.Pattern.CASE_INSENSITIVE);

  /**
   * Removes opening anchor tags ({@code <a ...>}) from the input.
   *
   * <p>Only opening tags that carry at least one whitespace character after
   * the tag name are removed; the link text, closing {@code </a>} tags and
   * all other markup are left untouched.
   *
   * @param inputString
   *            the string to clean; may be {@code null}
   * @return the input without opening anchor tags, or an empty string when
   *         {@code inputString} is {@code null}
   */
  public static String filterHtmlToA(String inputString) {
    if (inputString == null) {
      // Same result the old catch-all handler produced for null input.
      return "";
    }
    return A_OPEN_PATTERN.matcher(inputString).replaceAll("");
  }

  /**
   * Removes all HTML tags from the input, leaving only the text between them.
   *
   * <p>{@code <script>} and {@code <style>} elements are removed together
   * with their bodies first, so their contents do not leak into the result;
   * every remaining tag is then removed while its surrounding text is kept.
   * Character entities such as {@code &amp;} are not decoded.
   *
   * @param inputString
   *            the string to clean; may be {@code null}
   * @return the input with all tags removed, or an empty string when
   *         {@code inputString} is {@code null}
   */
  public static String filterHtmlTag(String inputString) {
    if (inputString == null) {
      // Same result the old catch-all handler produced for null input.
      return "";
    }
    // Order matters: drop script/style bodies before the generic tag pass,
    // otherwise only their tags would go and the code/CSS text would remain.
    String text = SCRIPT_PATTERN.matcher(inputString).replaceAll("");
    text = STYLE_PATTERN.matcher(text).replaceAll("");
    return HTML_TAG_PATTERN.matcher(text).replaceAll("");
  }
}


                                             
0 0
原创粉丝点击