提取HTML代码中文字的C#函数

来源:互联网 发布:淘宝卡罗德钢琴怎么样 编辑:程序博客网 时间:2024/04/30 03:20

方法1:  

        // A C# function that extracts the text from HTML code.
        // StripHTML removes the HTML markup from its input.
        //   strHtml: source text that includes HTML.
        //   returns: the text that is left once the markup has been removed.
        using   System;
        using   System.Text.RegularExpressions;
        public   class   StripHTMLTest{
              public   static   void   Main(){
                  string   s=StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
                  Console.WriteLine(s);
              }
              public   static   string   StripHTML(string   strHtml){
                  string   []   aryReg   ={
                              @"<script[^>]*?>.*?</script>",
                              @"<(///s*)?!?((/w+:)?/w+)(/w+(/s*=?/s*(([""'])(//[""'tbnr]|[^/7])*?/7|/w+)|.{0})|/s)*?(///s*)?>",
                              @"([/r/n])[/s]+",
                              @"&(quot|#34);",
                              @"&(amp|#38);",
                              @"&(lt|#60);",
                              @"&(gt|#62);",
                              @"&(nbsp|#160);",
                              @"&(iexcl|#161);",
                              @"&(cent|#162);",
                              @"&(pound|#163);",
                              @"&(copy|#169);",
                              @"&#(/d+);",
                              @"-->",
                              @"<!--.*/n"
                            };
                  string   []   aryRep   =   {
                                "",
                                "",
                                "",
                                "/"",
                                "&",
                                "<",
                                ">",
                                "   ",
                                "/xa1",//chr(161),
                                "/xa2",//chr(162),
                                "/xa3",//chr(163),
                                "/xa9",//chr(169),
                                "",
                                "/r/n",
                                ""
                              };

                  string   newReg   =aryReg[0];
                  string   strOutput=strHtml;
                  for(int   i   =   0;i<aryReg.Length;i++){
                      Regex   regex   =   new   Regex(aryReg[i],RegexOptions.IgnoreCase);
                      strOutput   =   regex.Replace(strOutput,aryRep[i]);
                  }
                  strOutput.Replace("<","");
                  strOutput.Replace(">","");
                  strOutput.Replace("/r/n","");
                  return   strOutput;
              }
          }

 

方法2:

        public static string DelHTML(string Htmlstring)//将HTML去除
                 {
                     #region
                     //删除脚本
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<script[^>]*?>.*?</script>","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     //删除HTML
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<(.[^>]*)>","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"([/r/n])[/s]+","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"-->","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<!--.*","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     //Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<A>.*</A>","");
                     //Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<[a-zA-Z]*=/.[a-zA-Z]*/?[a-zA-Z]+=/d&/w=%[a-zA-Z]*|[A-Z0-9]","");
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(quot|#34);","/"",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(amp|#38);","&",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(lt|#60);","<",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(gt|#62);",">",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(nbsp|#160);"," ",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(iexcl|#161);","/xa1",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring,@"&(cent|#162);","/xa2",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(pound|#163);","/xa3",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(copy|#169);","/xa9",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring, @"&#(/d+);","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                     Htmlstring.Replace("<","");
                     Htmlstring.Replace(">","");
                     Htmlstring.Replace("/r/n","");
                     //Htmlstring=HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
                     #endregion
                     return Htmlstring;
                 }