网页内容为压缩形式的测试代码
来源:互联网 发布:2017淘宝权重规则 编辑:程序博客网 时间:2024/06/05 22:31
// Fetches strUrl with HttpWebRequest, decodes the response body to a string
// (transparently un-gzipping it when the server says it is gzip-compressed),
// strips CR/LF/TAB characters and returns the result.
//
// Reads from the enclosing scope:  strUrl, referer, cookies, cookieDomain, encoding.
// Writes to the enclosing scope:   req, resp, stream, read, crawlerState.
//
// On success crawlerState is set to Success and the HTML is returned from
// inside the try. On any exception the error is logged, crawlerState is set
// to Forbat (HTTP 503) or Error (anything else), and control falls through
// to whatever follows this statement in the enclosing method.
try
{
    req = (HttpWebRequest)HttpWebRequest.Create(strUrl);
    req.UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:6.0) Gecko/20100101 Firefox/6.0";
    req.Accept = "*/*";
    req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
    req.ContentType = "text/xml";
    req.Timeout = 20000; // milliseconds

    // Set the referer so the request is less likely to be flagged as
    // coming from outside the site; defaults to the target host.
    if (String.IsNullOrEmpty(referer))
    {
        req.Referer = req.RequestUri.Host;
    }
    else
    {
        req.Referer = referer;
    }

    // Attach the caller-supplied cookies (only name/value are copied).
    CookieContainer cc = new CookieContainer();
    req.CookieContainer = cc;
    if (cookies != null)
    {
        foreach (Cookie cook in cookies)
        {
            Cookie c = new Cookie(cook.Name, cook.Value);
            if (!String.IsNullOrEmpty(cookieDomain))
            {
                c.Domain = cookieDomain;
                cc.Add(c);
            }
            else
            {
                // CookieContainer.Add(Cookie) throws when Domain is empty,
                // so scope the cookie to the request URI instead.
                cc.Add(req.RequestUri, c);
            }
        }
    }

    resp = (HttpWebResponse)req.GetResponse();

    // Work around wrong charset detection: when the response reports
    // "ISO-8859-1" (or reports nothing at all) trust the caller-supplied
    // encoding; otherwise use the charset the response declares.
    Encoding enc;
    string declaredCharset = resp.CharacterSet;
    if (String.IsNullOrEmpty(declaredCharset) || "ISO-8859-1" == declaredCharset)
    {
        enc = encoding;
    }
    else
    {
        enc = Encoding.GetEncoding(declaredCharset);
    }

    // NOTE: an AJAX path (GetAjaxUseWebBrowser(strUrl), guarded by isAjax)
    // used to sit here and was already disabled in the original code.

    string sHTML = string.Empty;

    // Raw body stream from the server.
    stream = resp.GetResponseStream();

    // Ordinal, case-insensitive test: ToLower() is culture-sensitive and
    // would miss "GZIP" under cultures such as tr-TR.
    string contentEncoding = resp.ContentEncoding ?? string.Empty;
    if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        // Body is gzip-compressed: decompress while reading.
        stream = new GZipStream(stream, CompressionMode.Decompress);
        read = new StreamReader(stream, enc);
        sHTML = read.ReadToEnd();
    }
    else
    {
        read = new StreamReader(stream, enc);
        sHTML = read.ReadToEnd();

        // Look for a charset declaration inside the document itself
        // (e.g. a <meta> tag).
        Match charSetMatch = Regex.Match(sHTML, "charset=(?<code>[a-zA-Z0-9\\-]+)", RegexOptions.IgnoreCase);
        string sChartSet = charSetMatch.Groups["code"].Value;

        // If the decoded text contains no run of 2-4 CJK ideographs, assume
        // it was decoded with the wrong encoding and re-decode it using the
        // charset the document declares.
        // NOTE(review): the round trip through enc.GetBytes() is only
        // lossless if enc can represent every decoded char - confirm.
        Regex rx = new Regex("([\u4e00-\u9fa5]{2,4})");
        if (!rx.IsMatch(sHTML))
        {
            if (!string.IsNullOrEmpty(sChartSet.Trim()))
            {
                sHTML = Encoding.GetEncoding(sChartSet).GetString(enc.GetBytes(sHTML));
            }
        }
    }

    // Flatten the document: drop line breaks and tabs.
    sHTML = sHTML.Replace("\n", "").Replace("\r", "").Replace("\t", "");

    // Mark the crawl task as successful.
    crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Success;
    return sHTML;
}
catch (Exception ex)
{
    CommonFunction.logWirte(ex.ToString() + strUrl, LogGrade.Warning);

    // Detect HTTP 503 from the status code rather than from the exception
    // text, which is localized and only matches on a zh-CN runtime.
    bool serviceUnavailable = false;
    WebException webEx = ex as WebException;
    if (webEx != null)
    {
        HttpWebResponse errorResp = webEx.Response as HttpWebResponse;
        if (errorResp != null)
        {
            serviceUnavailable = errorResp.StatusCode == HttpStatusCode.ServiceUnavailable;
            // The error response holds the connection until it is closed.
            errorResp.Close();
        }
    }

    // Legacy fallback: keep recognizing the original localized message.
    if (!serviceUnavailable && ex.Message.ToString().IndexOf("远程服务器返回错误: (503) 服务器不可用") > -1)
    {
        serviceUnavailable = true;
    }

    if (serviceUnavailable)
    {
        crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Forbat;
    }
    else
    {
        crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Error;
    }
}
finally
{
    // Release the response, the (possibly gzip-wrapped) stream and the reader.
    if (resp != null)
    {
        resp.Close();
    }
    if (stream != null)
    {
        stream.Close();
    }
    if (read != null)
    {
        read.Close();
    }
}