网页内容为压缩形式的测试代码

来源:互联网 发布:2017淘宝权重规则 编辑:程序博客网 时间:2024/06/05 22:31
            // Fetches strUrl over HTTP, decodes the body (un-gzipping it when the response
            // declares gzip content encoding), strips CR/LF/TAB characters and returns the text.
            // On success crawlerState is set to Success. On any exception the error is logged,
            // crawlerState is set to Forbat (HTTP 503) or Error, and control falls through to
            // whatever follows this statement.
            try
            {
                req = (HttpWebRequest)HttpWebRequest.Create(strUrl);
                req.UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:6.0) Gecko/20100101 Firefox/6.0";
                req.Accept = "*/*";
                req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
                req.ContentType = "text/xml";
                req.Timeout = 20000;

                // Set the referer so the request does not look like it came from an outside source.
                if (String.IsNullOrEmpty(referer))
                {
                    req.Referer = req.RequestUri.Host;
                }
                else
                {
                    req.Referer = referer;
                }

                // Attach the caller's cookies to the request.
                CookieContainer cc = new CookieContainer();
                req.CookieContainer = cc;
                if (cookies != null)
                {
                    foreach (Cookie cook in cookies)
                    {
                        Cookie c = new Cookie(cook.Name, cook.Value);

                        // CookieContainer.Add(Cookie) rejects a cookie whose Domain is empty, so
                        // always assign one: the explicit override first, then the source cookie's
                        // own domain, and finally the host being requested.
                        if (!String.IsNullOrEmpty(cookieDomain))
                        {
                            c.Domain = cookieDomain;
                        }
                        else if (!String.IsNullOrEmpty(cook.Domain))
                        {
                            c.Domain = cook.Domain;
                        }
                        else
                        {
                            c.Domain = req.RequestUri.Host;
                        }

                        cc.Add(c);
                    }
                }

                resp = (HttpWebResponse)req.GetResponse();

                // Work around unreliable charset reporting: when the response reports
                // "ISO-8859-1", or reports nothing usable, use the caller-supplied encoding;
                // otherwise use the charset the response declares.
                Encoding enc = encoding;
                string declaredCharset = resp.CharacterSet;
                if (!String.IsNullOrEmpty(declaredCharset) && "ISO-8859-1" != declaredCharset)
                {
                    try
                    {
                        // Some servers quote the charset value; strip the quotes before lookup.
                        enc = Encoding.GetEncoding(declaredCharset.Trim().Trim('"', '\''));
                    }
                    catch (ArgumentException)
                    {
                        // Unknown or malformed charset name: keep the caller-supplied encoding.
                        enc = encoding;
                    }
                }

                // Previously disabled AJAX path, kept for reference:
                //if (isAjax)
                //{
                //    return GetAjaxUseWebBrowser(strUrl);
                //}

                string sHTML = string.Empty;

                // Obtain the response body stream.
                stream = resp.GetResponseStream();

                // Ordinal, case-insensitive match so the check does not depend on the current culture.
                string contentEncoding = resp.ContentEncoding ?? String.Empty;
                if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    // The body is gzip-compressed: decompress while reading.
                    stream = new GZipStream(stream, CompressionMode.Decompress);
                    read = new StreamReader(stream, enc);
                    sHTML = read.ReadToEnd();
                }
                else
                {
                    read = new StreamReader(stream, enc);
                    sHTML = read.ReadToEnd();

                    // Look for a charset declaration inside the downloaded text.
                    Match charSetMatch = Regex.Match(sHTML, "charset=(?<code>[a-zA-Z0-9\\-]+)", RegexOptions.IgnoreCase);
                    string sChartSet = charSetMatch.Groups["code"].Value.Trim();

                    // If the decoded text contains no run of 2-4 CJK characters, assume it was
                    // decoded with the wrong encoding and re-decode it using the in-page charset.
                    // NOTE(review): round-tripping through enc.GetBytes can be lossy when enc
                    // could not represent the original bytes; confirm this heuristic is wanted.
                    Regex rx = new Regex("([\u4e00-\u9fa5]{2,4})");
                    if (!rx.IsMatch(sHTML) && !string.IsNullOrEmpty(sChartSet))
                    {
                        try
                        {
                            sHTML = Encoding.GetEncoding(sChartSet).GetString(enc.GetBytes(sHTML));
                        }
                        catch (ArgumentException)
                        {
                            // The page declares a charset this runtime does not know;
                            // keep the text as already decoded instead of failing the fetch.
                        }
                    }
                }

                // Remove line breaks and tabs.
                sHTML = sHTML.Replace("\n", "").Replace("\r", "").Replace("\t", "");

                // Mark the crawl task as successful.
                crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Success;
                return sHTML;
            }
            catch (Exception ex)
            {
                CommonFunction.logWirte(ex.ToString() + strUrl, LogGrade.Warning);

                // Detect HTTP 503 from the status code so the check works regardless of the
                // runtime's UI language.
                bool serviceUnavailable = false;
                WebException webEx = ex as WebException;
                if (webEx != null)
                {
                    HttpWebResponse errorResp = webEx.Response as HttpWebResponse;
                    if (errorResp != null)
                    {
                        serviceUnavailable = errorResp.StatusCode == HttpStatusCode.ServiceUnavailable;
                        // The error response holds a connection; release it.
                        errorResp.Close();
                    }
                }

                // Fallback kept for backward compatibility: the localized message text that the
                // original code matched on.
                if (!serviceUnavailable
                    && ex.Message.IndexOf("远程服务器返回错误: (503) 服务器不可用", StringComparison.Ordinal) > -1)
                {
                    serviceUnavailable = true;
                }

                if (serviceUnavailable)
                {
                    crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Forbat;
                }
                else
                {
                    crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Error;
                }
            }
            finally
            {
                // Release from the outermost wrapper inwards: reader, then stream, then response.
                if (read != null)
                {
                    read.Close();
                }
                if (stream != null)
                {
                    stream.Close();
                }
                if (resp != null)
                {
                    resp.Close();
                }
            }