Asp.net 使用正则和网络编程抓取网页数据(实用)

来源:互联网 发布:程序框图软件 编辑:程序博客网 时间:2024/06/06 03:13

Asp.net 使用正则和网络编程抓取网页数据(实用)

Asp.net 使用正则和网络编程抓取网页数据(实用)

        /// <summary>
        /// Downloads the page at <paramref name="strUrl"/>, extracts the first span of text located
        /// between <paramref name="Begin"/> and <paramref name="End"/>, and returns it with HTML removed.
        /// </summary>
        /// <param name="strUrl">Address of the page to fetch.</param>
        /// <param name="Begin">Marker that precedes the wanted content.</param>
        /// <param name="End">Marker that follows the wanted content.</param>
        /// <returns>
        /// The extracted text after it has been passed through <c>NoHTML</c>; an empty string when
        /// the markers are not found.
        /// </returns>
        /// <remarks>
        /// NOTE(review): <paramref name="Begin"/> and <paramref name="End"/> are concatenated into the
        /// regular expression as-is, so regex metacharacters in them are interpreted as pattern syntax.
        /// Callers that want literal matching should pass them through <c>Regex.Escape</c> first.
        /// </remarks>
        private static String GetContent(String strUrl, String Begin, String End)
        {
            String html;
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

            // Dispose the response and its stream even when reading fails part-way through.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, System.Text.Encoding.Default))
            {
                html = reader.ReadToEnd();
            }

            // Lazily capture everything between the two markers (the markers themselves are excluded
            // by the look-behind / look-ahead). A failed match yields an empty Value.
            Match table = Regex.Match(html, "(?<=" + Begin + ")[\\s\\S]*?(?=" + End + ")", RegexOptions.IgnoreCase);
            return NoHTML(table.Value);
        }

        /// <summary>
        /// Strips HTML markup from a fragment and decodes a small set of character entities.
        /// </summary>
        /// <param name="Htmlstring">Source text that may contain HTML.</param>
        /// <returns>
        /// The text with markup removed. Only the part after the first remaining line feed and after
        /// the last apostrophe is kept. Returns an empty string for null or empty input.
        /// </returns>
        private static string NoHTML(string Htmlstring)
        {
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return string.Empty;
            }

            // Remove script elements together with their content. Singleline lets '.' cross line
            // breaks; without it a multi-line script body would survive and leak into the result.
            Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",
                RegexOptions.IgnoreCase | RegexOptions.Singleline);

            // Remove remaining tags.
            Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

            // Remove a line break together with the whitespace run that follows it.
            Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

            // Remove leftover comment terminators, then any comment opener up to the end of its line.
            Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

            // Entity pattern -> replacement text, applied in this order.
            string[,] entities =
            {
                { @"&(quot|#34);", "\"" },
                { @"&(amp|#38);", "&" },
                { @"&(lt|#60);", "<" },
                { @"&(gt|#62);", ">" },
                { @"&(nbsp|#160);", "   " },
                { @"&(iexcl|#161);", "\u00A1" },
                { @"&(cent|#162);", "\u00A2" },
                { @"&(pound|#163);", "\u00A3" },
                { @"&(copy|#169);", "\u00A9" },
                // Any other numeric entity is dropped.
                { @"&#(\d+);", "" }
            };
            for (int i = 0; i < entities.GetLength(0); i++)
            {
                Htmlstring = Regex.Replace(Htmlstring, entities[i, 0], entities[i, 1], RegexOptions.IgnoreCase);
            }

            // Drop any angle brackets that are still present (including ones just decoded above).
            Htmlstring = Htmlstring.Replace(">", "");
            Htmlstring = Htmlstring.Replace("<", "");
            Htmlstring = Htmlstring.Replace("\r\n", "");

            // Keep only what follows the first line feed (the whole string when there is none).
            Htmlstring = Htmlstring.Substring(Htmlstring.IndexOf('\n') + 1);

            // Keep only what follows the last apostrophe, if any.
            int lastQuote = Htmlstring.LastIndexOf('\'');
            if (lastQuote >= 0)
            {
                Htmlstring = Htmlstring.Substring(lastQuote + 1);
            }

            // The original also searched for "class='tdbk'" at this point, but the string can no
            // longer contain an apostrophe here, so that branch could never run and was removed.
            return Htmlstring;
        }


1 0
原创粉丝点击