Crawler Program

        // Required namespaces: System, System.IO, System.Net, System.Text,
        // System.Text.RegularExpressions, System.Collections.ObjectModel and System.Linq (for GetLinks below).

        /// <summary>
        /// Fetch the HTML source of the page at the given URL.
        /// </summary>
        /// <param name="url">URL to fetch</param>
        /// <param name="charSet">Encoding name; pass "" to auto-detect</param>
        /// <returns>The page source, or "Error:" plus the exception message on failure</returns>
        public string GetHttpSource(string url, string charSet = "")
        {
            try
            {
                string strWebData;
                StreamReader sr;
                var myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
                myHttpWebRequest.Proxy = null;
                myHttpWebRequest.Timeout = 15 * 1000; // connection timeout
                myHttpWebRequest.Accept = "*/*";
                myHttpWebRequest.UserAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/536.1";
                myHttpWebRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate; // transparently decompress gzip/deflate
                var myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse();
                var stream = myHttpWebResponse.GetResponseStream();
                //stream.ReadTimeout = 15 * 1000; // read timeout; not supported once AutomaticDecompression is set

                // First, try the charset declared in the Content-Type response header.
                var hchart = myHttpWebResponse.Headers["Content-Type"];
                var hchartm = Regex.Match(hchart ?? "", "charset=(.*)?", RegexOptions.IgnoreCase);
                var hchart1 = hchartm.Groups[1].Value;
                if (hchart1 != "" && stream != null)
                {
                    sr = new StreamReader(stream, Encoding.GetEncoding(hchart1));
                    strWebData = sr.ReadToEnd();
                    goto endthis;
                }

                // Buffer the body in a MemoryStream so it can be re-read with a different encoding.
                var ms = new MemoryStream();
                var buffer = new byte[1024];
                while (stream != null)
                {
                    var sz = stream.Read(buffer, 0, 1024);
                    if (sz == 0) break;
                    ms.Write(buffer, 0, sz);
                }

                // Read with the caller-supplied encoding, falling back to gb2312.
                ms.Position = 0; // rewind to the start of the stream
                if (charSet == "") charSet = "gb2312";
                sr = new StreamReader(ms, Encoding.GetEncoding(charSet));
                strWebData = sr.ReadToEnd();

                // Look for a charset declared in a <meta> tag and re-decode if it differs.
                var charSetMatch = Regex.Match(strWebData, "<meta([^>]*)charset=(\")?(.*)?\"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                var webCharSet = charSetMatch.Groups[3].Value.ToLower();
                if (webCharSet != "" && !Encoding.GetEncoding(webCharSet).Equals(Encoding.GetEncoding(charSet)))
                {
                    ms.Position = 0; // rewind and re-decode with the meta charset
                    sr = new StreamReader(ms, Encoding.GetEncoding(webCharSet));
                    strWebData = sr.ReadToEnd();
                }
                ms.Close();
            endthis:
                sr.Close();
                if (stream != null) stream.Close();
                myHttpWebResponse.Close();
                myHttpWebRequest.Abort();
                return strWebData;
            }
            catch (Exception ex) { return "Error:" + ex.Message; }
        }
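A minimal call site could look like the sketch below; the URL is only a placeholder, and the "Error:" prefix check simply follows the convention this method uses for failures.

        // Hypothetical usage sketch: fetch a page, let the method auto-detect the encoding,
        // and check for the "Error:" prefix returned on failure.
        var pageHtml = GetHttpSource("http://example.com/", "");
        if (pageHtml.StartsWith("Error:"))
        {
            Console.WriteLine("Download failed: " + pageHtml);
        }
        else
        {
            Console.WriteLine("Fetched " + pageHtml.Length + " characters");
        }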


        /// <summary>
        /// Strip HTML markup from the content.
        /// </summary>
        /// <param name="Content">HTML fragment to clean</param>
        /// <returns>The content with tags, entities, and script hooks removed</returns>
        public string ClearHtml(string Content)
        {
            Content = ReplaceHtml("&#[^>]*;", "", Content);
            Content = ReplaceHtml("</?marquee[^>]*>", "", Content);
            Content = ReplaceHtml("</?object[^>]*>", "", Content);
            Content = ReplaceHtml("</?param[^>]*>", "", Content);
            Content = ReplaceHtml("</?embed[^>]*>", "", Content);
            Content = ReplaceHtml("</?table[^>]*>", "", Content);
            Content = ReplaceHtml("&nbsp;", "", Content);
            Content = ReplaceHtml("</?tr[^>]*>", "", Content);
            Content = ReplaceHtml("</?th[^>]*>", "", Content);
            Content = ReplaceHtml("</?p[^>]*>", "", Content);
            Content = ReplaceHtml("</?a[^>]*>", "", Content);
            Content = ReplaceHtml("</?img[^>]*>", "", Content);
            Content = ReplaceHtml("</?tbody[^>]*>", "", Content);
            Content = ReplaceHtml("</?li[^>]*>", "", Content);
            Content = ReplaceHtml("</?span[^>]*>", "", Content);
            Content = ReplaceHtml("</?div[^>]*>", "", Content);
            Content = ReplaceHtml("</?td[^>]*>", "", Content);
            Content = ReplaceHtml("</?script[^>]*>", "", Content);
            Content = ReplaceHtml("(javascript|jscript|vbscript|vbs):", "", Content);
            Content = ReplaceHtml("on(mouse|exit|error|click|key)", "", Content);
            Content = ReplaceHtml("<\\?xml[^>]*>", "", Content);
            Content = ReplaceHtml("<\\/?[a-z]+:[^>]*>", "", Content);
            Content = ReplaceHtml("</?font[^>]*>", "", Content);
            Content = ReplaceHtml("</?b[^>]*>", "", Content);
            Content = ReplaceHtml("</?u[^>]*>", "", Content);
            Content = ReplaceHtml("</?i[^>]*>", "", Content);
            Content = ReplaceHtml("</?strong[^>]*>", "", Content);
            return Content;
        }
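ClearHtml depends on a ReplaceHtml helper that the post does not include. A minimal sketch, assuming it is nothing more than a case-insensitive Regex.Replace wrapper, could be:

        // Assumed helper (not shown in the original post): a thin wrapper around
        // Regex.Replace that matches the pattern without regard to case.
        public string ReplaceHtml(string pattern, string replacement, string content)
        {
            return Regex.Replace(content, pattern, replacement, RegexOptions.IgnoreCase);
        }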

        /// <summary>
        /// Extract the set of links to crawl from an HTML page.
        /// </summary>
        /// <param name="html">HTML source to search</param>
        /// <param name="url">URL the HTML came from, used to re-fetch if the source is incomplete</param>
        /// <param name="strReg">Regex pattern with a named group "key" that captures each link</param>
        /// <returns>The captured links</returns>
        public string[] GetLinks(string html, string url, string strReg)
        {
            // If the source lacks a closing </body> tag, assume the download was truncated and fetch it again.
            if (!html.Contains("</body>"))
            {
                var newContent = GetHttpSource(url, "");
                return GetLinks(newContent, url, strReg);
            }

            var urls = new Collection<string>();
            MatchCollection matches = new Regex(strReg, RegexOptions.Singleline).Matches(html);

            // Collect the value of the "key" group from every match.
            foreach (Match match in matches)
            {
                var regLink = match.Groups["key"].Value;
                urls.Add(regLink);
            }
            return urls.ToArray();
        }
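The pattern passed to GetLinks must contain a named group called "key". The sketch below, with a placeholder URL and a simple pattern that captures the href of each anchor tag, is only one possible way to call it:

        // Hypothetical usage sketch: the (?<key>...) group captures href values of <a> tags.
        var pattern = "<a[^>]+href=\"(?<key>[^\"]+)\"";
        var source = GetHttpSource("http://example.com/", "");
        string[] links = GetLinks(source, "http://example.com/", pattern);
        foreach (var link in links)
        {
            Console.WriteLine(link);
        }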

