方法1:
/// <summary>
/// Removes HTML markup from <paramref name="HTML"/> and decodes a small,
/// fixed set of character entities.
/// </summary>
/// <param name="HTML">The HTML text to clean. Must not be null.</param>
/// <returns>
/// The input with script blocks, tags and comments removed, the listed
/// entities decoded, and finally every remaining '&lt;', '&gt;' and CR/LF
/// pair stripped.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// Thrown by <c>Regex.Replace</c> when <paramref name="HTML"/> is null.
/// </exception>
public static string ClearHTMLTags1(string HTML)
{
    // Each entry is { pattern, replacement }. Keeping the two together makes
    // it impossible for the lists to drift apart (the original parallel
    // arrays had 15 patterns but 16 replacements). Order matters: rules are
    // applied top to bottom.
    string[][] rules =
    {
        // Whole <script> elements. [\s\S] is used instead of '.' so that a
        // script body spanning several lines is removed as well.
        new[] { @"<script[^>]*?>[\s\S]*?</script>", "" },
        // Any opening/closing tag, including quoted attribute values.
        new[] { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
        // A line break together with the whitespace that follows it.
        new[] { @"([\r\n])[\s]+", "" },
        // Named / numeric entities that are decoded to a character.
        new[] { @"&(quot|#34);", "\"" },
        new[] { @"&(amp|#38);", "&" },
        new[] { @"&(lt|#60);", "<" },
        new[] { @"&(gt|#62);", ">" },
        new[] { @"&(nbsp|#160);", " " },
        new[] { @"&(iexcl|#161);", "\u00A1" },
        new[] { @"&(cent|#162);", "\u00A2" },
        new[] { @"&(pound|#163);", "\u00A3" },
        new[] { @"&(copy|#169);", "\u00A9" },
        // Every other numeric entity is dropped.
        new[] { @"&#(\d+);", "" },
        // Turning the comment terminator into a line break gives the next
        // rule a '\n' to stop at, so "<!-- ... -->" is removed as a unit.
        new[] { @"-->", "\r\n" },
        new[] { @"<!--.*\n", "" },
    };

    string cleaned = HTML;
    foreach (string[] rule in rules)
    {
        cleaned = new Regex(rule[0], RegexOptions.Multiline | RegexOptions.IgnoreCase)
            .Replace(cleaned, rule[1]);
    }

    // string.Replace returns a new string; the original code discarded these
    // results, so the final stripping never happened.
    cleaned = cleaned.Replace("<", "");
    cleaned = cleaned.Replace(">", "");
    cleaned = cleaned.Replace("\r\n", "");
    return cleaned;
}
方法2:
转载自:http://blog.sina.com.cn/cheneyblog
/// <summary>
/// Removes script/style blocks, HTML tags and a handful of entities from
/// <paramref name="htmlStream"/>, collapses blank lines and deletes all
/// spaces and tabs.
/// </summary>
/// <param name="htmlStream">The HTML text to clean.</param>
/// <returns>The cleaned text, trimmed at both ends.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="htmlStream"/> is null.
/// </exception>
public string RemoveHTMLTags(string htmlStream)
{
    if (htmlStream == null)
    {
        // ArgumentNullException derives from Exception, so callers that
        // catch the previously thrown base type keep working.
        throw new ArgumentNullException("htmlStream", "Your input html stream is null!");
    }

    // Possible improvement: collect every special HTML entity, map each one
    // to its Unicode character in a hash table, and replace them in one pass.

    // Script and style elements must be handled on their own, before the
    // generic tag rule, so that their content is removed and not just the
    // surrounding tags.
    htmlStream = Regex.Replace(
        htmlStream,
        "(<script)[\\s\\S]*?(</script>)|(<style)[\\s\\S]*?(</style>)",
        " ",
        RegexOptions.IgnoreCase);

    // Ordinary tags plus a few entities become a single space.
    // NOTE(review): the published snippet contained the already-decoded
    // characters (e.g. a bare '&' and a soft hyphen) where entity names are
    // assumed to have been intended; with those, "&nbsp;" turned into
    // " nbsp;". The entity names are matched first, and the literal
    // characters the old pattern matched are still replaced afterwards, so
    // inputs without entities behave as before.
    htmlStream = Regex.Replace(
        htmlStream,
        "<[^>]+>|&nbsp;|&amp;|&shy;|&#160;|&#173;|&bull;|&lt;|&gt;|[\u00A0\u00AD\u2022<>&]",
        " ",
        RegexOptions.IgnoreCase);

    // Collapse runs of blank lines into one line break. Inside a character
    // class '|' is a literal pipe, not an alternation, so it has been
    // removed from both classes.
    htmlStream = Regex.Replace(htmlStream, "(\r\n[\r\n\t ]*\r\n)|(\n[\r\n\t ]*\n)", "\r\n");

    // Delete every space and tab (the old class also deleted '|').
    htmlStream = Regex.Replace(htmlStream, "[\t ]+", "");

    return htmlStream.Trim();
}
方法3:
//除去所有在html元素中标记
/// <summary>
/// Removes every HTML tag (anything of the form <c>&lt;...&gt;</c> with at
/// least one character between the brackets) from the input.
/// </summary>
/// <param name="strhtml">The HTML text to strip. Must not be null.</param>
/// <returns>The input with all tags deleted; text between tags is kept.</returns>
/// <exception cref="System.ArgumentNullException">
/// Thrown by <c>Regex.Replace</c> when <paramref name="strhtml"/> is null.
/// </exception>
public static string striphtml(string strhtml)
{
    // "<[^>]+>" already matches closing tags, because '/' is just another
    // non-'>' character; the former "|</[^>]+>" alternative could never
    // match anything the first one did not.
    Regex tagPattern = new Regex(@"<[^>]+>");
    return tagPattern.Replace(strhtml, "");
}
方法4:
转载自:http://blog.csdn.net/hougelou/article/details/7901066
/// <summary>
/// Removes a fixed list of HTML tags, numeric entities, non-breaking spaces
/// and a few script-related keywords from <paramref name="Content"/>.
/// </summary>
/// <param name="Content">The HTML text to clean; null is treated as empty.</param>
/// <returns>The cleaned text (an empty string for null or empty input).</returns>
/// <remarks>
/// The tag patterns have no word boundary after the tag name, so for example
/// the "b" rule also removes &lt;br&gt; and &lt;body&gt;.
/// NOTE(review): this is a best-effort cleaner, not a sanitizer; removing one
/// tag can assemble another that an earlier rule has already passed over.
/// </remarks>
protected string ClearHtml(string Content)
{
    // Applied in this exact order, all case-insensitively, each match being
    // replaced by the empty string.
    string[] patterns =
    {
        // Numeric entities. ';' is excluded from the class so the match
        // stops at the entity's own terminator instead of greedily running
        // on to the last ';' in the text.
        "&#[^>;]*;",
        "</?marquee[^>]*>",
        "</?object[^>]*>",
        "</?param[^>]*>",
        "</?embed[^>]*>",
        "</?table[^>]*>",
        // NOTE(review): the published snippet had a single blank character
        // here, presumably an "&nbsp;" that was decoded when the page was
        // rendered. The entity and the literal no-break space are removed;
        // ordinary spaces are kept. Confirm against the original source.
        "&nbsp;|\u00A0",
        "</?tr[^>]*>",
        "</?th[^>]*>",
        "</?p[^>]*>",
        "</?a[^>]*>",
        "</?img[^>]*>",
        "</?tbody[^>]*>",
        "</?li[^>]*>",
        "</?span[^>]*>",
        "</?div[^>]*>",
        // Repeated on purpose to keep the original pass order; it also
        // catches a "th" tag that only forms after the removals above.
        "</?th[^>]*>",
        "</?td[^>]*>",
        "</?script[^>]*>",
        "(javascript|jscript|vbscript|vbs):",
        // Matches anywhere in the text, not only inside attributes.
        "on(mouse|exit|error|click|key)",
        "<\\?xml[^>]*>",
        "<\\/?[a-z]+:[^>]*>",
        "</?font[^>]*>",
        "</?b[^>]*>",
        "</?u[^>]*>",
        "</?i[^>]*>",
        "</?strong[^>]*>",
    };

    foreach (string pattern in patterns)
    {
        Content = Zxj_ReplaceHtml(pattern, "", Content);
    }

    return Content;
}

/// <summary>
/// Replaces every case-insensitive match of <paramref name="patrn"/> in
/// <paramref name="content"/> with <paramref name="strRep"/>.
/// </summary>
/// <param name="patrn">Regular expression pattern to search for.</param>
/// <param name="strRep">Replacement text.</param>
/// <param name="content">Text to process; null or empty is treated as "".</param>
/// <returns>The text after replacement.</returns>
private string Zxj_ReplaceHtml(string patrn, string strRep, string content)
{
    if (string.IsNullOrEmpty(content))
    {
        content = "";
    }

    Regex matcher = new Regex(patrn, RegexOptions.IgnoreCase);
    return matcher.Replace(content, strRep);
}
0 0