ASP.NET 抓取网页并自动识别编码

来源:互联网 发布:nginx rtmp 直播配置 编辑:程序博客网 时间:2024/04/27 11:58

      /// <summary>
        /// 获取页面编码
        /// </summary>
        /// <param name="response">HttpWebResponse</param>
        /// <returns></returns>
        private Encoding GetPageEncoding(HttpWebResponse response)
        {
            //如果发现content-type头
            string ctype = response.Headers["content-type"];
            string charset = string.Empty;
            if (!string.IsNullOrEmpty(ctype))
            {
                int s = ctype.IndexOf("charset=");

                if (s > -1)
                {
                    charset = ctype.Substring(s+8);//因为“charset=”长度为8位
                }
            }
            //如果没有发现content-type,只好从脚本中搜索了
            if (string.IsNullOrEmpty(charset))
            {
                HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create(url);
                HttpWebResponse all_codeResponse = (HttpWebResponse)all_codeRequest.GetResponse();

                if (all_codeResponse.StatusCode == HttpStatusCode.OK)
                {
                    StreamReader the_Reader = new StreamReader(all_codeResponse.GetResponseStream(), encoding);
                    ///所有的页面代码文本
                    string all_code = the_Reader.ReadToEnd();

                    if (!string.IsNullOrEmpty(all_code))
                    {
                        int s = all_code.IndexOf("charset=");
                        int e = -1;

                        if (s > -1)
                        {
                            s = s + 8;
                            e = all_code.IndexOf("/"", s + 1);

                            if (e > -1)
                            {
                                charset = all_code.Substring(s, e - s);

                                ///去掉开始位置的引号
                                charset = charset.TrimStart(new Char[] { '"' });
                                ///去掉结束位置的引号
                                charset = charset.TrimEnd(new Char[] { '>', '"' });

                            }
                        }

                    }
                    the_Reader.Close();
                    the_Reader.Dispose();
                }
                all_codeResponse.Close();

            }

            if (!string.IsNullOrEmpty(charset))
            {
                try
                {
                    encoding = Encoding.GetEncoding(charset);
                }
                catch (Exception)
                {
                    encoding = Encoding.UTF8;
                }
            }

            return encoding;
        }

        public void SaveFile()
        {
            string all_code = "";
            try
            {

                HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create(url);

                HttpWebResponse all_codeResponse = (HttpWebResponse)all_codeRequest.GetResponse();

                if (all_codeResponse.StatusCode == HttpStatusCode.OK)
                {

                    encoding = GetPageEncoding(all_codeResponse);


                    StreamReader the_Reader = new StreamReader(all_codeResponse.GetResponseStream(), encoding);

                    all_code = the_Reader.ReadToEnd();

                    FileStream fs = new FileStream("F://test.html", FileMode.Create);

                    StreamWriter sw = new StreamWriter(fs, encoding);

                    sw.WriteLine(all_code);

                    sw.Close();
                    fs.Close();
                    the_Reader.Close();
                    the_Reader.Dispose();
                    all_codeResponse.Close();
                }

            }
            catch (Exception)
            {
                throw;
            }
        }

原创粉丝点击