无视网页编码获得Html的一个方法

来源:互联网 发布:asp学生管理系统源码 编辑:程序博客网 时间:2024/06/11 03:30


最近在写一个比较简单的分布式爬虫,爬取的网页编码各式各样,所以写出了一套方法,用来获取Html



下面上代码



/// <summary>
/// Downloads a page with an HTTP GET and decodes it to a string regardless of the page's text encoding.
/// The encoding is taken from the HTTP response header first; if the HTML itself declares a different
/// charset in a <c>&lt;meta&gt;</c> tag, the page declaration wins and the body is decoded again.
/// </summary>
/// <param name="url">URL of the page to fetch.</param>
/// <param name="encode">Set to the encoding that was finally chosen to decode the page.</param>
/// <returns>The decoded HTML, or an empty string when the download (and the one-off retry) failed.</returns>
public static string GetDataFromUrl(string url, ref Encoding encode)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

        // HTTP request headers / options.
        request.AllowAutoRedirect = true;
        request.AllowWriteStreamBuffering = true;
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.162 Safari/535.19";
        request.Method = "GET";
        request.Timeout = 10 * 1000;

        string characterSet;
        byte[] body;

        // Buffer the raw bytes so the body can be decoded a second time if the page declares
        // another charset. The using blocks release the connection even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            characterSet = response.CharacterSet;
            using (Stream receiveStream = response.GetResponseStream())
            using (MemoryStream buffered = new MemoryStream())
            {
                receiveStream.CopyTo(buffered);
                body = buffered.ToArray();
            }
        }

        // First guess: the charset announced in the HTTP response header.
        if (string.IsNullOrEmpty(characterSet))
        {
            // CharacterSet may be null or empty when the server sends no usable Content-Type.
            characterSet = string.Empty;
            encode = Encoding.Default;
        }
        else
        {
            // Heuristic kept from the original: a reported "ISO-8859-1" is treated as gb2312.
            // NOTE(review): presumably because that value is what is reported when the server
            // names no charset at all and the crawled sites are mostly Chinese - confirm.
            if (characterSet == "ISO-8859-1")
            {
                characterSet = "gb2312";
            }

            // An unknown or unsupported header charset falls back to the default encoding
            // instead of aborting the whole download.
            encode = TryResolveEncoding(characterSet) ?? Encoding.Default;
        }

        string html = DecodeBody(body, encode);

        // Look for a charset declared inside a <meta> tag. Handles both
        //   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
        // and the HTML5 form
        //   <meta charset="utf-8">
        // The match is confined to a single tag and skips an optional opening quote.
        Match meta = Regex.Match(
            html,
            @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:\-]+)",
            RegexOptions.IgnoreCase);

        if (meta.Success)
        {
            string declaredCharSet = meta.Groups[1].Value;

            // If the page disagrees with the HTTP header, the page's own declaration wins:
            // decode the buffered bytes again with the declared encoding.
            if (!string.Equals(declaredCharSet, characterSet, StringComparison.OrdinalIgnoreCase))
            {
                Encoding declaredEncoding = TryResolveEncoding(declaredCharSet);
                if (declaredEncoding != null)
                {
                    encode = declaredEncoding;
                    html = DecodeBody(body, encode);
                }
            }
        }

        return html;
    }
    catch (Exception ex)
    {
        // Best-effort download: failures are reported, never propagated to the caller.
        // `s` is a flag declared elsewhere in the class; it is cleared here, so logging and
        // the retry happen only while it is still true.
        if (s)
        {
            Console.WriteLine(ex.Message);
            using (StreamWriter errorLog = new StreamWriter("err.dst", true))
            {
                errorLog.WriteLine(url);
                errorLog.WriteLine(ex.ToString());
            }

            s = false;

            // Retry once and hand the retry's result back to the caller
            // (the original discarded it and always returned "").
            return GetDataFromUrl(url, ref encode);
        }
    }

    return "";
}

/// <summary>
/// Resolves a charset name to an <see cref="Encoding"/>; returns null when the name is
/// empty or not supported on this platform instead of throwing.
/// </summary>
private static Encoding TryResolveEncoding(string charSetName)
{
    if (string.IsNullOrEmpty(charSetName))
    {
        return null;
    }

    // Some servers quote the charset value; strip whitespace and quotes before the lookup.
    string cleaned = charSetName.Trim().Trim('"', '\'');
    if (cleaned.Length == 0)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(cleaned);
    }
    catch (ArgumentException)
    {
        return null;
    }
}

/// <summary>
/// Decodes the buffered response bytes with the given encoding. A StreamReader is used,
/// as in the original code, so a byte-order mark at the start of the data is still honoured.
/// </summary>
private static string DecodeBody(byte[] body, Encoding encoding)
{
    using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
    {
        return reader.ReadToEnd();
    }
}


原创粉丝点击