通过正则表达式提取网页

来源:互联网 发布:c语言德巴赫猜想 编辑:程序博客网 时间:2024/05/04 07:28

转帖地址:http://www.cnblogs.com/gaowenbin/articles/1777421.html

 

1,先看需求吧:我们需要设计一个文本框,输入网址后从该网址下载HTML源码。想一想,我们可以通过 HttpWebRequest / WebClient 进行处理。

 

// Matches the charset declared inside an HTML <meta> tag, in either form:
//   <meta charset="utf-8">
//   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
// An optional opening quote is skipped so the captured value is the bare charset name.
private static readonly Regex MetaCharsetRegex = new Regex(
    @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_.:\-]+)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Downloads the HTML document at <paramref name="uri"/> and decodes it to a string.
/// </summary>
/// <remarks>
/// The text encoding is chosen in this order: the charset declared in an HTML
/// &lt;meta&gt; tag, then the charset parameter of the Content-Type header, then
/// <c>HttpWebResponse.CharacterSet</c>, and finally UTF-8. Charset names the
/// runtime does not recognise are skipped instead of aborting the download.
/// The page is fetched once; the same bytes are re-decoded if the meta tag
/// names a different encoding than the header.
/// </remarks>
/// <param name="uri">Absolute HTTP/HTTPS address of the page to download.</param>
/// <returns>
/// The decoded HTML, or <see cref="string.Empty"/> when the response is not
/// <c>text/html</c> or when any error occurs (best-effort contract).
/// </returns>
private string GetWebPage(string uri)
{
    if (string.IsNullOrEmpty(uri))
    {
        return string.Empty;
    }

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

        // Let the framework transparently inflate compressed bodies. Content-Encoding
        // names a compression scheme, not a character set, so it is never passed to
        // Encoding.GetEncoding.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        byte[] body;
        Encoding headerEncoding;

        // 'using' guarantees the response is closed on every path, including the
        // early return for non-HTML content and any exception.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            string contentType = response.ContentType ?? string.Empty;
            string mediaType = contentType.Split(';')[0].Trim();
            if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
            {
                return string.Empty;
            }

            headerEncoding = ResolveEncoding(ExtractCharsetParameter(contentType))
                ?? ResolveEncoding(response.CharacterSet)
                ?? Encoding.UTF8;

            using (System.IO.Stream stream = response.GetResponseStream())
            {
                body = ReadResponseBytes(stream);
            }
        }

        string html = headerEncoding.GetString(body);

        // The page itself may declare a different encoding than the HTTP header;
        // if so, decode the already-downloaded bytes again with that encoding.
        Match meta = MetaCharsetRegex.Match(html);
        if (meta.Success)
        {
            Encoding declared = ResolveEncoding(meta.Groups["charset"].Value);
            if (declared != null && declared.CodePage != headerEncoding.CodePage)
            {
                html = declared.GetString(body);
            }
        }

        return html;
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers rely on an empty string for any failure
        // (bad URI, network error, non-HTTP scheme). Leave a trace for debugging.
        System.Diagnostics.Debug.WriteLine("GetWebPage failed for '" + uri + "': " + ex);
        return string.Empty;
    }
}

// Returns the value of the charset parameter of a Content-Type header
// (quotes and surrounding whitespace removed), or an empty string if absent.
private static string ExtractCharsetParameter(string contentType)
{
    foreach (string part in contentType.Split(';'))
    {
        int equalsIndex = part.IndexOf('=');
        if (equalsIndex < 0)
        {
            continue;
        }

        string name = part.Substring(0, equalsIndex).Trim();
        if (string.Equals(name, "charset", StringComparison.OrdinalIgnoreCase))
        {
            return part.Substring(equalsIndex + 1).Trim().Trim('"', '\'').Trim();
        }
    }

    return string.Empty;
}

// Maps a charset name to an Encoding, or returns null when the name is
// empty or not recognised by the runtime.
private static Encoding ResolveEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return null;
    }
}

// Reads a stream to its end and returns the bytes; a null stream yields an empty array.
private static byte[] ReadResponseBytes(System.IO.Stream stream)
{
    if (stream == null)
    {
        return new byte[0];
    }

    using (System.IO.MemoryStream buffer = new System.IO.MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }

        return buffer.ToArray();
    }
}

 

2, 提取出HTML源码后,接下来需要对源码进行处理(去掉换行和段落标签)

// Remove paragraph and line-break tags from the page source. The match is
// case-sensitive, so each upper-/lower-case spelling is listed explicitly;
// the tags are removed one after another in the order given.
foreach (string tagToStrip in new string[]
{
    "<BR>", "<P>", "<br>", "<p>",
    "</P>", "<P/>", "</p>", "<p/>",
    "</BR>", "<BR/>", "</br>", "<br/>"
})
{
    Html = Html.Replace(tagToStrip, "");
}

 

3, 由于我们需要在TreeView中完整的将网页的数据显示出来,首先在页面上创建TreeView,然后创建节点

 

/// <summary>
/// Builds a tree node whose text is <paramref name="Titles"/> and that has one
/// child node for every match of the shared <c>regex</c> in
/// <paramref name="inputString"/>. Each child is built by calling this method
/// again on the match's third capture group.
/// </summary>
/// <param name="inputString">Text to scan with <c>regex</c>.</param>
/// <param name="Titles">Label assigned to the returned node.</param>
/// <returns>
/// The populated node. If an exception occurs, a message box is shown and the
/// node is returned in whatever state it had reached.
/// </returns>
private TreeNode populateTagNode(string inputString, string Titles)
{
    TreeNode root = new TreeNode();

    try
    {
        // Extract the HTML fragments with the shared regular expression.
        MatchCollection tagMatches = regex.Matches(inputString);
        root.Text = Titles;

        foreach (Match tagMatch in tagMatches)
        {
            // Running total of matches processed across all recursion levels.
            intMathchesMade += 1;

            // NOTE(review): groups 1/2/3 are presumably tag name, attributes and
            // inner content — confirm against the definition of 'regex'.
            GroupCollection parts = tagMatch.Groups;
            string childTitle = "&lt;" + parts[1].Value + " " + parts[2].Value + "&gt;";

            root.Nodes.Add(populateTagNode(parts[3].Value, childTitle));
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("Error:" + ex.Message.ToString());
    }

    return root;
}