asp.net 抓取网页 自动识别编码
来源:互联网 发布:nginx rtmp 直播配置 编辑:程序博客网 时间:2024/04/27 11:58
/// <summary>
/// Determines the text encoding of a page. The charset declared in the
/// Content-Type header of <paramref name="response"/> is used when present;
/// otherwise the page is requested again and its markup is searched for a
/// "charset=" declaration.
/// </summary>
/// <param name="response">The response whose headers are inspected. Its body stream is not read here.</param>
/// <returns>
/// The detected encoding; UTF-8 when the declared charset name is not recognised;
/// otherwise the current value of the <c>encoding</c> member (UTF-8 if that is null).
/// The result is also stored in the <c>encoding</c> member, which is declared outside this snippet.
/// </returns>
private Encoding GetPageEncoding(HttpWebResponse response)
{
    // Prefer the charset declared in the Content-Type response header.
    string charset = ExtractCharset(response.Headers["content-type"]);

    // No charset in the header: fall back to searching the page markup.
    if (string.IsNullOrEmpty(charset))
    {
        // A separate request is issued so the caller's response stream is left unread.
        HttpWebRequest probeRequest = (HttpWebRequest)WebRequest.Create(url);
        using (HttpWebResponse probeResponse = (HttpWebResponse)probeRequest.GetResponse())
        {
            if (probeResponse.StatusCode == HttpStatusCode.OK)
            {
                // Charset names are plain ASCII, so the encoding used for this probe read
                // only needs to be ASCII-compatible; UTF-8 is used if none has been set yet.
                using (StreamReader probeReader = new StreamReader(probeResponse.GetResponseStream(), encoding ?? Encoding.UTF8))
                {
                    charset = ExtractCharset(probeReader.ReadToEnd());
                }
            }
        }
    }

    if (!string.IsNullOrEmpty(charset))
    {
        try
        {
            encoding = Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name: fall back to UTF-8.
            encoding = Encoding.UTF8;
        }
    }

    // Never hand a null encoding back to the caller, which passes it to StreamReader.
    if (encoding == null)
    {
        encoding = Encoding.UTF8;
    }
    return encoding;
}

/// <summary>
/// Extracts the value that follows the first "charset=" (matched case-insensitively)
/// in <paramref name="text"/>, without surrounding quotes or trailing parameters.
/// </summary>
/// <param name="text">A Content-Type header value or page markup; may be null or empty.</param>
/// <returns>The charset name, or an empty string when none is found.</returns>
private static string ExtractCharset(string text)
{
    const string marker = "charset=";

    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }

    int start = text.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return string.Empty;
    }
    start += marker.Length;

    // Skip an opening quote (single or double) and any whitespace before the name.
    while (start < text.Length
        && (text[start] == '"' || text[start] == '\'' || char.IsWhiteSpace(text[start])))
    {
        start++;
    }

    // The name ends at a closing quote, a parameter separator, the end of the tag, or whitespace.
    char[] delimiters = { '"', '\'', ';', '>', '<', '/', ' ', '\t', '\r', '\n' };
    int end = text.IndexOfAny(delimiters, start);
    if (end < 0)
    {
        end = text.Length;
    }

    return text.Substring(start, end - start);
}
/// <summary>
/// Downloads the page at <c>url</c>, detects its encoding via
/// <c>GetPageEncoding</c>, and writes the decoded text to F://test.html
/// using that same encoding. Nothing is written when the response status is not OK.
/// Exceptions from the request or from file I/O propagate to the caller.
/// </summary>
public void SaveFile()
{
    HttpWebRequest pageRequest = (HttpWebRequest)WebRequest.Create(url);

    // "using" guarantees the response, reader and file are released even when an
    // exception is thrown part-way through, and also when the status is not OK.
    using (HttpWebResponse pageResponse = (HttpWebResponse)pageRequest.GetResponse())
    {
        if (pageResponse.StatusCode != HttpStatusCode.OK)
        {
            return;
        }

        // Detect the encoding before the response body is read.
        encoding = GetPageEncoding(pageResponse);

        string pageText;
        using (StreamReader pageReader = new StreamReader(pageResponse.GetResponseStream(), encoding))
        {
            pageText = pageReader.ReadToEnd();
        }

        using (FileStream output = new FileStream("F://test.html", FileMode.Create))
        using (StreamWriter writer = new StreamWriter(output, encoding))
        {
            // WriteLine appends a line terminator after the page text.
            writer.WriteLine(pageText);
        }
    }
}
- asp.net 抓取网页 自动识别编码
- asp.net 网页抓取
- ASP.NET 抓取网页
- ASP.NET 抓取网页内容
- ASP.NET 抓取网页源文件
- ASP.NET 抓取网页内容
- ASP.NET 抓取网页内容
- ASP.NET 抓取网页内容
- ASP.NET 抓取网页内容
- asp.net 网页抓取内容
- 自动识别网页编码
- ASP.NET 网页抓取-----抓取超链接
- ASP.NET自动识别GB2312与UTF-8编码的文件
- c#网页数据抓取/asp.net网页数据抓取
- asp.net C#抓取网页链接
- ASP.NET实现抓取网页中的链接
- ASP.NET抓取其他网页代码
- ASP.NET抓取其他网页代码
- libpcapz实例
- Linux下Socket编程中用send发送结构体
- ubuntu12.04安装Eclipse Indigo
- Window.open 打开的窗口标题中包含url
- 过程式编程-绘图程序
- asp.net 抓取网页 自动识别编码
- Web应用中的轻量级消息队列
- (单纯二维) Mobile phones (1195)
- android消息推送demo
- oracle有Boolean类型
- 欧拉图与欧拉路
- 我与GO.COM
- css如何让文本框中的输入的文字始终垂直居中
- pysvn-workbench是客户端