提取HTML代码中文字的C#函数(HTML to TEXT)

来源:互联网 发布:快云数据库 编辑:程序博客网 时间:2024/04/30 04:34
方法1:
using System;
using System.Text.RegularExpressions;

/// <summary>
/// Extracts the visible text from an HTML fragment (HTML to text) with regular expressions.
/// </summary>
/// <remarks>
/// This is a best-effort, regex-based stripper, not an HTML parser. Malformed markup
/// (for example an attribute value with an unbalanced quote) may leave residue in the output.
/// </remarks>
public class StripHTMLTest
{
    // CultureInvariant keeps case-insensitive matching independent of the current culture
    // (e.g. "SCRIPT" must still match "script" under Turkish casing rules).
    private const RegexOptions IgnoreCaseInvariant =
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // <script ...> ... </script>; Singleline lets '.' cross line breaks so multi-line scripts go too.
    private static readonly Regex ScriptBlock = new Regex(
        @"<script[^>]*>.*?</script\s*>", IgnoreCaseInvariant | RegexOptions.Singleline);

    // Complete comments, removed before tag stripping so markup inside a comment never leaks out.
    private static readonly Regex Comment = new Regex(
        @"<!--.*?-->", RegexOptions.Singleline);

    // An element-like tag: '<', optional '/', optional '!' or '?', a word character, then any mix of
    // quoted attribute values and other characters up to the closing '>'. The three alternatives
    // start with different characters, so the pattern cannot backtrack exponentially.
    private static readonly Regex Tag = new Regex(
        @"<(?:/\s*)?[!?]?\w(?:""[^""]*""|'[^']*'|[^'"">])*>");

    // A line break together with the whitespace that follows it.
    private static readonly Regex LineBreakRun = new Regex(@"[\r\n]\s+");

    // A comment opener that was never closed, through the end of its line.
    private static readonly Regex UnterminatedComment = new Regex(@"<!--.*");

    // Every entity the function knows about, matched in ONE pass so that the output of one
    // replacement is never decoded again ("&amp;lt;" must become "&lt;", not "<").
    private static readonly Regex Entity = new Regex(
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", IgnoreCaseInvariant);

    public static void Main()
    {
        string s = StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Removes HTML markup from <paramref name="strHtml"/> and decodes a small set of entities.
    /// </summary>
    /// <param name="strHtml">Source text that may contain HTML markup. May be null.</param>
    /// <returns>
    /// The text with script blocks, comments and tags removed, line breaks followed by whitespace
    /// collapsed, leftover angle brackets dropped and the supported entities decoded.
    /// Returns an empty string when <paramref name="strHtml"/> is null or empty.
    /// </returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string text = ScriptBlock.Replace(strHtml, string.Empty);
        text = Comment.Replace(text, string.Empty);
        text = Tag.Replace(text, string.Empty);
        text = LineBreakRun.Replace(text, string.Empty);

        // Leftovers of malformed comments.
        text = text.Replace("-->", string.Empty);
        text = UnterminatedComment.Replace(text, string.Empty);

        // String.Replace returns a new string, so the result has to be assigned.
        // This cleanup runs BEFORE entity decoding on purpose: it removes stray markup brackets
        // while the '<' and '>' characters produced from &lt; / &gt; below are kept.
        text = text.Replace("<", string.Empty)
                   .Replace(">", string.Empty)
                   .Replace("\r\n", string.Empty);

        return Entity.Replace(text, new MatchEvaluator(DecodeEntity));
    }

    // Maps one matched entity to its replacement text. Numeric references that are not in the
    // table are dropped, which is what the original "&#(\d+);" rule did.
    private static string DecodeEntity(Match match)
    {
        switch (match.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return " ";
            case "iexcl":
            case "#161":
                return "\u00A1";
            case "cent":
            case "#162":
                return "\u00A2";
            case "pound":
            case "#163":
                return "\u00A3";
            case "copy":
            case "#169":
                return "\u00A9";
            default:
                return string.Empty;
        }
    }
}

 方法2:

/// <summary>
/// Removes HTML markup from a string and decodes a small set of character entities.
/// </summary>
/// <remarks>
/// Best-effort, regex-based stripping rather than real HTML parsing: a tag is taken to be
/// everything from '&lt;' to the next '&gt;', so an attribute value containing '&gt;' is cut short.
/// </remarks>
/// <param name="Htmlstring">Text that may contain HTML markup. May be null.</param>
/// <returns>
/// The text with script blocks, comments and tags removed, line breaks followed by whitespace
/// collapsed, leftover angle brackets dropped and the supported entities decoded.
/// Returns an empty string when <paramref name="Htmlstring"/> is null or empty.
/// </returns>
public static string DelHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // CultureInvariant keeps case-insensitive matching independent of the current culture
    // (e.g. "SCRIPT" must still match "script" under Turkish casing rules).
    const System.Text.RegularExpressions.RegexOptions ignoreCase =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.CultureInvariant;
    const System.Text.RegularExpressions.RegexOptions singleLine =
        System.Text.RegularExpressions.RegexOptions.Singleline;

    string text = Htmlstring;

    // Script blocks; Singleline lets '.' cross line breaks so multi-line scripts are removed too.
    text = System.Text.RegularExpressions.Regex.Replace(
        text, @"<script[^>]*>.*?</script\s*>", string.Empty, ignoreCase | singleLine);

    // Complete comments, removed before tags so markup inside a comment never leaks out.
    text = System.Text.RegularExpressions.Regex.Replace(
        text, @"<!--.*?-->", string.Empty, singleLine);

    // Tags: everything from '<' up to the next '>'.
    text = System.Text.RegularExpressions.Regex.Replace(text, @"<[^>]+>", string.Empty);

    // A line break together with the whitespace that follows it.
    text = System.Text.RegularExpressions.Regex.Replace(text, @"[\r\n]\s+", string.Empty);

    // Leftovers of malformed comments: an orphan terminator, and an opener that was never
    // closed (removed through the end of its line).
    text = text.Replace("-->", string.Empty);
    text = System.Text.RegularExpressions.Regex.Replace(text, @"<!--.*", string.Empty);

    // String.Replace returns a new string, so the result has to be assigned.
    // This cleanup runs BEFORE entity decoding on purpose: it removes stray markup brackets
    // while the '<' and '>' characters produced from &lt; / &gt; below are kept.
    text = text.Replace("<", string.Empty)
               .Replace(">", string.Empty)
               .Replace("\r\n", string.Empty);

    // All supported entities are matched in ONE pass so the output of one replacement is never
    // decoded again ("&amp;lt;" must become "&lt;", not "<").
    return System.Text.RegularExpressions.Regex.Replace(
        text,
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        new System.Text.RegularExpressions.MatchEvaluator(DecodeHtmlEntity),
        ignoreCase);
}

// Maps one matched entity to its replacement text. Numeric references that are not in the
// table are dropped, which is what the original "&#(\d+);" rule did.
private static string DecodeHtmlEntity(System.Text.RegularExpressions.Match match)
{
    switch (match.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00A1";
        case "cent":
        case "#162":
            return "\u00A2";
        case "pound":
        case "#163":
            return "\u00A3";
        case "copy":
        case "#169":
            return "\u00A9";
        default:
            return string.Empty;
    }
}