C#正则表达式 解析html+table tr td 内容

来源:互联网 发布:上海行知教育怎么样 编辑:程序博客网 时间:2024/05/16 23:39

// ASPX page usage example (from the original post; the post's snippet calls the
// parser "GetWorldexWyHtml", the definition below is named GetWWyHtml):
//
//   // Submit the query parameters together with the page's hidden field values.
//   var postUrl = "http://xxm.cn/glj/querydata/xxSearchOld.aspx";
//   List<KeyValuePair<String, String>> paramList = new List<KeyValuePair<String, String>>();
//   GetViewHiddenData(postUrl).ToList().ForEach(x => paramList.Add(new KeyValuePair<string, string>(x.Key, x.Value)));
//   paramList = (from p in paramList where !string.IsNullOrEmpty(p.Value) select p).ToList();
//   paramList.Add(new KeyValuePair<string, string>("txtBillNo", strBlNo));
//   paramList.Add(new KeyValuePair<string, string>("btnSearch", "查询"));
//   var s = HttpAspxPostMathHtml(postUrl, paramList);
//
//   var tbZHXX = GetWWyHtml(s, @"id=""tbZHXX""", @"class=""GridCommonItem""", "Worldex");

/// <summary>
/// Extracts the cell texts of every matching row (tr) inside a matching HTML table.
/// </summary>
/// <param name="regexInfo">HTML text to parse. Null or empty yields an empty result.</param>
/// <param name="regexParm">Regex fragment that must appear inside the opening table tag (e.g. its class or id attribute). It is spliced into the pattern unescaped.</param>
/// <param name="classParm">Regex fragment that must appear inside the opening tr tag (e.g. its class or id attribute). It is spliced into the pattern unescaped.</param>
/// <param name="companyCode">Selects the cell rule: "Worldex" reads td contents that are wrapped in CR/LF plus indentation and trims them; any other value reads the contents of font elements.</param>
/// <returns>One <see cref="WWyDetails"/> per matched row, in document order; empty when nothing matches.</returns>
public static List<WWyDetails> GetWWyHtml(string regexInfo, string regexParm, string classParm, string companyCode)
{
    List<WWyDetails> resultWdHtml = new List<WWyDetails>();
    if (string.IsNullOrEmpty(regexInfo))
    {
        return resultWdHtml;
    }

    // A row counts only when it sits after the wanted table's opening tag with no
    // other table tag (opening or closing) in between.
    string rowPattern = "(?is)(?<=<table[^>]*?" + regexParm + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?" + classParm + "[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>";
    Regex rowRegex = new Regex(rowPattern);

    // The cell rule depends only on companyCode, so choose it once instead of per row.
    bool isWorldex = companyCode == "Worldex";
    string cellPattern = isWorldex
        // td may carry a scope attribute; its content is surrounded by "\r\n" + whitespace.
        ? @"(?<=<td(\s+scope=[^>]+)?>)\r\n\s+.*?\r\n\s+(?=</td>)"
        // e.g. <font color="#333333">SNL7QDJL510757</font>
        : @"(?<=<font(\s+color=[^>]+)?>).*?(?=</font>)";

    foreach (Match row in rowRegex.Matches(regexInfo))
    {
        List<string> cells = new List<string>();
        foreach (Match cell in Regex.Matches(row.Value, cellPattern))
        {
            // Only the Worldex rule captures surrounding line breaks/indentation.
            cells.Add(isWorldex ? cell.Value.Trim() : cell.Value);
        }

        resultWdHtml.Add(new WWyDetails() { WSinotrans = cells });
    }

    return resultWdHtml;
}

/// <summary>
/// Cell texts extracted from one table row by <see cref="GetWWyHtml"/>.
/// </summary>
public class WWyDetails
{
    /// <summary>Cell texts of the row, in column order.</summary>
    public List<string> WSinotrans { get; set; }
}



// Classic ASP page usage example (fetch, then parse):
//
//   var sPuci = HttpAspPostMathHtml("http://xxx/index_dt_container.asp", "search=true&companyname=&companycode=&container_no=&bill_no=" + strBlNo + "&btn3.x=39&btn3.y=15", "gb2312");
//   var gdvContainer = GetSYDetails(sPuci, @"class=tableGrid", @"class=gridHeader", "铅封号", "YTWY");

/// <summary>
/// Finds the first table whose header row contains a keyword and returns the cell
/// texts of its rows, skipping the first (header) row.
/// </summary>
/// <param name="regexInfo">HTML text to parse. Null or empty yields an empty result.</param>
/// <param name="classTable">Regex fragment that must appear inside the opening table tag (class or id). Spliced into the patterns unescaped.</param>
/// <param name="classParm">Regex fragment that follows "&lt;tr " in the header row's opening tag (class or id). Spliced into the pattern unescaped.</param>
/// <param name="compareInfo">Keyword that the header row text must contain for the table to be selected.</param>
/// <param name="companyCode">Selects the cell rule: "YTWY" reads td elements that are bare or start with an align attribute; any other value reads td elements that are bare or start with a class attribute.</param>
/// <returns>One <see cref="SYDetails"/> per row after the header of the first matching table; empty when no table matches.</returns>
public static List<SYDetails> GetSYDetails(string regexInfo, string classTable, string classParm, string compareInfo, string companyCode)
{
    List<SYDetails> lstSTX = new List<SYDetails>();
    if (string.IsNullOrEmpty(regexInfo))
    {
        return lstSTX;
    }

    // Matches every candidate table: "<table.*? class=...>[\s\S]*?</table>".
    Regex tableRegex = new Regex(@"<table.*?" + classTable + "[^>]*?>[\\s\\S]*?<\\/table>");

    // Matches each tr of the selected table. It does not depend on the table being
    // inspected, so it is built once here rather than inside the loop.
    Regex rowRegex = new Regex(@"(?is)(?<=<table[^>]*?" + classTable + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>");

    string headerPattern = "(?is)<tr " + classParm + ">(?:\\s*<td[^>]*>(.*?)</td>)*\\s*((?!</tr>).)*";

    bool isYtwy = companyCode == "YTWY";
    string cellPattern = isYtwy
        ? @"(?is)(?<=<td(\s+align=[^>]+)?>).*?(?=\s*</td)"
        : @"(?is)(?<=<td(\s+class=[^>]+)?>).*?(?=\s*</td)";

    foreach (Match table in tableRegex.Matches(regexInfo))
    {
        // Check whether this table's header row contains the keyword.
        string header = Regex.Match(table.Value, headerPattern).Value.Trim();
        if (!header.Contains(compareInfo))
        {
            continue;
        }

        // Skip(1) drops the header row.
        foreach (Match row in rowRegex.Matches(table.Value).Cast<Match>().Skip(1))
        {
            // Every row gets its own list. Previously one list variable was shared
            // across rows, so a "YTWY" row holding the empty-result marker re-added
            // the previous row's cells; such a row now yields an empty cell list.
            List<string> cells = new List<string>();
            bool isEmptyMarker = isYtwy && row.Value.Contains("查询数据为空。");
            if (!isEmptyMarker)
            {
                foreach (Match cell in Regex.Matches(row.Value, cellPattern))
                {
                    cells.Add(cell.Value);
                }
            }

            lstSTX.Add(new SYDetails() { SYTwy = cells });
        }

        // Only the first table whose header matches is processed.
        break;
    }

    return lstSTX;
}

/// <summary>
/// Cell texts extracted from one table row by <see cref="GetSYDetails"/>.
/// </summary>
public class SYDetails
{
    /// <summary>Cell texts of the row, in column order.</summary>
    public List<string> SYTwy { get; set; }
}


#region ASP/ASPX pages: GET/POST requests and response retrieval

/// <summary>
/// Sends a form POST to a classic ASP page and returns the cleaned body element of the response.
/// </summary>
/// <param name="Url">URL to post to.</param>
/// <param name="postDataStr">Form data in "a=1&amp;b=2" form; null is sent as an empty body.</param>
/// <param name="encoding">Name of the page encoding (e.g. "gb2312"); used both to encode the request body and to decode the response.</param>
/// <returns>The first body element of the response after <see cref="DelHTML"/> clean-up, trimmed; an empty string when the response has no body element.</returns>
public static string HttpAspPostMathHtml(string Url, string postDataStr, string encoding)
{
    System.Text.Encoding pageEncoding = System.Text.Encoding.GetEncoding(encoding);

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    request.Method = "POST";
    request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
    request.ContentType = "application/x-www-form-urlencoded";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
    // Advertising gzip/deflate by hand left compressed responses undecoded; letting the
    // framework negotiate it also makes it decompress the response stream.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    // Encode the body with the page encoding rather than the machine default so the
    // bytes on the wire do not depend on the host's locale.
    byte[] bytes = pageEncoding.GetBytes(postDataStr ?? string.Empty);
    request.ContentLength = bytes.Length;
    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(bytes, 0, bytes.Length);
    }

    string retString;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, pageEncoding))
    {
        retString = reader.ReadToEnd();
    }

    return ExtractBodyHtml(retString);
}

/// <summary>
/// Sends a GET request to a classic ASP page and returns the raw response text.
/// </summary>
/// <param name="Url">URL to request, without a query string.</param>
/// <param name="postDataStr">Query string appended after "?".</param>
/// <param name="encoding">Name of the page encoding used to decode the response.</param>
/// <returns>The complete response text; no clean-up or body extraction is applied.</returns>
public static string HttpAspGetMathHtml(string Url, string postDataStr, string encoding)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + "?" + postDataStr);
    request.Method = "GET";
    request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
    // See HttpAspPostMathHtml: negotiate and decode gzip/deflate through the framework.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.GetEncoding(encoding)))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Downloads an ASPX page and collects its hidden input fields (view state and the
/// like) so they can be sent back in a later POST.
/// </summary>
/// <param name="Url">URL of the ASPX page.</param>
/// <returns>
/// Map from each hidden input's id attribute to its value attribute (empty string when no
/// value is found). Inputs without a recognisable id are skipped; when an id repeats, the
/// last occurrence wins.
/// </returns>
public Dictionary<string, string> GetViewHiddenData(string Url)
{
    string html;
    // using blocks release the client and response even when the request throws.
    using (HttpClient httpClient = new HttpClient())
    {
        httpClient.MaxResponseContentBufferSize = 256000;
        httpClient.DefaultRequestHeaders.Add("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36");
        using (HttpResponseMessage response = httpClient.GetAsync(new Uri(Url)).Result)
        {
            html = response.Content.ReadAsStringAsync().Result;
        }
    }

    Dictionary<string, string> returnHidden = new Dictionary<string, string>();
    foreach (Match hidden in Regex.Matches(html, @"<input type=""hidden""[^>]*?.*?\/>"))
    {
        string item = hidden.Value.Trim();

        // Pull the id and value attributes out of the hidden input.
        string key = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?id=""([\s\S]+?)""[^>]+>").Groups[1].Value;
        string value = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?value=""([\s\S]+?)""[^>]+>").Groups[1].Value;

        if (key.Length == 0)
        {
            continue;
        }

        // Indexer instead of Add: a repeated id must not abort the whole scrape.
        returnHidden[key] = value;
    }

    return returnHidden;
}

/// <summary>
/// Requests an ASPX page, then posts form data to it and returns the cleaned body
/// element of the POST response.
/// </summary>
/// <param name="Url">URL of the ASPX page.</param>
/// <param name="postDataStr">Form fields to post.</param>
/// <returns>The first body element of the response after <see cref="DelHTML"/> clean-up, trimmed; an empty string when the response has no body element.</returns>
public static string HttpAspxPostMathHtml(string Url, List<KeyValuePair<String, String>> postDataStr)
{
    string retString;
    using (HttpClient httpClient = new HttpClient())
    {
        Uri target = new Uri(Url);

        // The original flow issues a GET first and ignores its content; kept as is.
        // NOTE(review): presumably this picks up session cookies for the POST — confirm.
        using (HttpResponseMessage initial = httpClient.GetAsync(target).Result)
        {
        }

        using (HttpResponseMessage response = httpClient.PostAsync(target, new FormUrlEncodedContent(postDataStr)).Result)
        {
            retString = response.Content.ReadAsStringAsync().Result;
        }
    }

    return ExtractBodyHtml(retString);
}

/// <summary>
/// Cleans the HTML with <see cref="DelHTML"/> and returns its first body element
/// (tags included), trimmed; returns an empty string when there is none.
/// </summary>
private static string ExtractBodyHtml(string html)
{
    Match body = Regex.Match(DelHTML(html), @"(?is)<body[^>]*?>([\s\S].*?)</body>");
    return body.Success ? body.Value.Trim() : string.Empty;
}

#region Strip unneeded data from HTML

/// <summary>
/// Removes scripts, hidden inputs and comment markers from HTML and decodes a few
/// character entities.
/// </summary>
/// <param name="Htmlstring">HTML text to clean; null or empty yields an empty string.</param>
/// <returns>The cleaned HTML. Line breaks are preserved.</returns>
public static string DelHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove scripts. Singleline lets "." cross line breaks so multi-line script
    // blocks are removed as well.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove hidden inputs.
    Htmlstring = Regex.Replace(Htmlstring, @"<input type=""hidden""[^>]*?.*?\/>", "", RegexOptions.IgnoreCase);

    // Remove comment closers, then everything from a comment opener to the end of its line.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Entity replacements, applied in this order. &nbsp; is deliberately not decoded
    // (its replacement was already disabled in the original code). The last entry
    // drops every remaining numeric entity.
    string[][] entityReplacements =
    {
        new[] { @"&(quot|#34);", "\"" },
        new[] { @"&(amp|#38);", "&" },
        new[] { @"&(lt|#60);", "<" },
        new[] { @"&(gt|#62);", ">" },
        new[] { @"&(iexcl|#161);", "\xa1" },
        new[] { @"&(cent|#162);", "\xa2" },
        new[] { @"&(pound|#163);", "\xa3" },
        new[] { @"&(copy|#169);", "\xa9" },
        new[] { @"&#(\d+);", "" },
    };
    foreach (string[] entity in entityReplacements)
    {
        Htmlstring = Regex.Replace(Htmlstring, entity[0], entity[1], RegexOptions.IgnoreCase);
    }

    // The original ended with a Replace("\r\n", "") whose result was discarded, so line
    // breaks were never removed. That no-op is dropped rather than "fixed": the Worldex
    // cell pattern in GetWWyHtml matches on "\r\n" inside td elements.
    return Htmlstring;
}

#endregion

#endregion

0 0
原创粉丝点击