通过正则表达式提取网页

来源：互联网 | 发布：c语言德巴赫猜想 | 编辑：程序博客网 | 时间：2024/05/04 07:28

转帖地址：http://www.cnblogs.com/gaowenbin/articles/1777421.html

1，先看需求：我们需要设计一个文本框，输入网址后从该网址下载HTML源码。想一想，我们可以通过 HttpWebRequest 和 WebClient 进行处理

// Matches a charset declaration inside the markup, e.g.
//   <meta charset="gb2312">  or  content="text/html; charset=gb2312"
// and captures the charset label with any opening quote skipped.
private static readonly Regex MetaCharsetRegex = new Regex(
    @"charset\b\s*=\s*[""']?(?<charset>[\w\-.:]+)",
    RegexOptions.IgnoreCase);

/// <summary>
/// Downloads the HTML page at <paramref name="uri"/> and returns it as text.
/// The text is decoded with the charset declared in the page markup if there is one,
/// otherwise with the charset from the Content-Type response header,
/// otherwise as UTF-8.
/// </summary>
/// <param name="uri">Absolute address of the page to download.</param>
/// <returns>
/// The decoded page source, or <see cref="string.Empty"/> when the response is not
/// <c>text/html</c> or when any error occurs (this method is deliberately best-effort
/// and never throws).
/// </returns>
private string GetWebPage(string uri)
{
    try
    {
        Encoding headerEncoding;

        // First request: only the response headers are inspected.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
        HttpWebResponse response = (HttpWebResponse)request.GetResponse();
        try
        {
            string contentType = response.ContentType ?? string.Empty;

            // The media type is everything before the first ';' and is case-insensitive.
            string mediaType = contentType.Split(';')[0].Trim();
            if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
            {
                return string.Empty;
            }

            // Only the Content-Type charset parameter names a text encoding.
            // The Content-Encoding header (gzip, deflate, ...) describes compression
            // and must not be handed to Encoding.GetEncoding.
            headerEncoding = TryGetEncodingByName(ExtractCharsetFromContentType(contentType));
        }
        finally
        {
            // Abort before closing so the unread body is not downloaded here;
            // the finally block also releases the connection on the early return.
            request.Abort();
            response.Close();
        }

        // Second request: fetch the raw bytes so they can be decoded more than once.
        byte[] pageBytes;
        using (WebClient client = new WebClient())
        {
            pageBytes = client.DownloadData(uri);
        }

        Encoding firstPassEncoding = headerEncoding ?? Encoding.UTF8;
        string html = firstPassEncoding.GetString(pageBytes);

        // Charset labels are ASCII, so they can be located even if the first-pass
        // decoding used the wrong encoding for the rest of the text.
        Match metaMatch = MetaCharsetRegex.Match(html);
        if (metaMatch.Success)
        {
            Encoding declaredEncoding = TryGetEncodingByName(metaMatch.Groups["charset"].Value);

            // Re-decode only when the page declares a usable, different encoding.
            if (declaredEncoding != null && declaredEncoding.CodePage != firstPassEncoding.CodePage)
            {
                html = declaredEncoding.GetString(pageBytes);
            }
        }

        return html;
    }
    catch (Exception)
    {
        // Best-effort contract: callers receive an empty string on any failure
        // (invalid URI, network error, non-HTTP scheme, ...).
        return string.Empty;
    }
}

// Returns the value of the charset parameter of a Content-Type header value,
// without surrounding quotes, or an empty string when no charset parameter is present.
private static string ExtractCharsetFromContentType(string contentType)
{
    foreach (string part in contentType.Split(';'))
    {
        int equalsIndex = part.IndexOf('=');
        if (equalsIndex < 0)
        {
            continue;
        }

        string parameterName = part.Substring(0, equalsIndex).Trim();
        if (string.Equals(parameterName, "charset", StringComparison.OrdinalIgnoreCase))
        {
            return part.Substring(equalsIndex + 1).Trim().Trim('"', '\'');
        }
    }

    return string.Empty;
}

// Resolves a charset label to an Encoding; returns null for an empty or unknown
// label so that one bad label does not discard the whole page.
private static Encoding TryGetEncodingByName(string charsetName)
{
    if (string.IsNullOrEmpty(charsetName))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(charsetName);
    }
    catch (ArgumentException)
    {
        return null;
    }
}

2，当提取出HTML源码后，需要的就是对源码进行处理

// Removes a series of fixed substrings from the downloaded markup held in Html.
// NOTE(review): most of the search strings below are empty or a single blank; presumably
// the original literals (e.g. escape sequences or HTML entities) were lost when this
// snippet was published - recover them from the original source before using this code.
// string.Replace throws ArgumentException when the search string is empty, so the
// calls with "" as the first argument cannot run as written.
Html = Html.Replace(" ", "");
Html = Html.Replace("", "");
Html = Html.Replace(" ", "");
Html = Html.Replace("", "");
Html = Html.Replace("", "");
Html = Html.Replace("", "");
Html = Html.Replace("", "");
Html = Html.Replace("", "");
Html = Html.Replace("", "");
Html = Html.Replace(" ", "");
Html = Html.Replace("", "");
Html = Html.Replace(" ", "");

3，由于我们需要在TreeView中完整地将网页的数据显示出来，首先在页面上创建TreeView，然后创建节点

/// <summary>
/// Builds a tree node labelled <paramref name="Titles"/> and adds one child node for
/// every match that the class-level <c>regex</c> finds in <paramref name="inputString"/>.
/// Each child is built recursively from the match's third capture group.
/// </summary>
/// <param name="inputString">Text to scan with the class-level <c>regex</c>.</param>
/// <param name="Titles">Caption assigned to the returned node.</param>
/// <returns>
/// The populated node. If an exception occurs, a message box is shown and the node is
/// returned in whatever state it had reached.
/// </returns>
private TreeNode populateTagNode(string inputString, string Titles)
{
    TreeNode node = new TreeNode();

    try
    {
        // Run the match before setting the caption, so a failure here leaves the caption unset.
        MatchCollection tagMatches = regex.Matches(inputString);
        node.Text = Titles;

        foreach (Match tagMatch in tagMatches)
        {
            // Running total of matches, kept in a member declared outside this method.
            intMathchesMade += 1;

            // Presumably group 1 is the tag name, group 2 its attributes and group 3
            // the inner markup - verify against the regex definition.
            GroupCollection groups = tagMatch.Groups;
            string childTitle = string.Concat("<", groups[1].Value, " ", groups[2].Value, ">");

            TreeNode childNode = populateTagNode(groups[3].Value, childTitle);
            node.Nodes.Add(childNode);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("Error：" + ex.Message.ToString());
    }

    return node;
}