C#正则表达式 解析html+table tr td 内容
来源:互联网 发布:上海行知教育怎么样 编辑:程序博客网 时间:2024/05/16 23:39
// Fetching and parsing an ASPX page.
//
// Usage: collect the page's hidden form fields, append the query fields,
// post the form, then pull the rows out of the result table.
//
//     var postUrl = "http://xxm.cn/glj/querydata/xxSearchOld.aspx";
//     List<KeyValuePair<String, String>> paramList = new List<KeyValuePair<String, String>>();
//     GetViewHiddenData(postUrl).ToList().ForEach(x => paramList.Add(new KeyValuePair<string, string>(x.Key, x.Value)));
//     paramList = (from p in paramList where !string.IsNullOrEmpty(p.Value) select p).ToList();
//     paramList.Add(new KeyValuePair<string, string>("txtBillNo", strBlNo));
//     paramList.Add(new KeyValuePair<string, string>("btnSearch", "查询"));
//     var s = HttpAspxPostMathHtml(postUrl, paramList);
//     var tbZHXX = GetWWyHtml(s, @"id=""tbZHXX""", @"class=""GridCommonItem""", "Worldex");

/// <summary>
/// Extracts the cell texts of every matching row of one HTML table.
/// </summary>
/// <remarks>
/// <paramref name="regexParm"/> and <paramref name="classParm"/> are concatenated into the
/// row pattern verbatim, so they are interpreted as regular-expression fragments, not as
/// literal text.
/// </remarks>
/// <param name="regexInfo">HTML to parse. Null or empty yields an empty list.</param>
/// <param name="regexParm">Fragment that must occur inside the opening table tag (e.g. its class or id attribute).</param>
/// <param name="classParm">Fragment that must occur inside the opening tr tag (e.g. its class or id attribute).</param>
/// <param name="companyCode">
/// Selects the cell extraction rule: "Worldex" reads td contents that are wrapped in
/// CR/LF plus indentation; any other value reads the contents of font elements.
/// </param>
/// <returns>One <see cref="WWyDetails"/> per matched row, in document order.</returns>
public static List<WWyDetails> GetWWyHtml(string regexInfo, string regexParm, string classParm, string companyCode)
{
    List<WWyDetails> resultWdHtml = new List<WWyDetails>();
    if (string.IsNullOrEmpty(regexInfo))
    {
        return resultWdHtml;
    }

    // A row qualifies when it sits inside the requested table (look-behind, without
    // crossing another table tag) and its own opening tag carries classParm.
    var rowPattern = "(?is)(?<=<table[^>]*?" + regexParm + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?" + classParm + "[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>";
    Regex rowRegex = new Regex(rowPattern);

    foreach (Match row in rowRegex.Matches(regexInfo))
    {
        List<string> cells;
        if (companyCode == "Worldex")
        {
            // The td may be bare or carry a scope attribute; its text is expected on its
            // own line (CR/LF + whitespace before and after), which is trimmed away.
            cells = Regex.Matches(row.Value, @"(?<=<td(\s+scope=[^>]+)?>)\r\n\s+.*?\r\n\s+(?=</td>)")
                .Cast<Match>()
                .Select(mx => mx.Value.Trim())
                .ToList();
        }
        else
        {
            // Cell text is wrapped like <font color="#333333">SNL7QDJL510757</font>.
            cells = Regex.Matches(row.Value, @"(?<=<font(\s+color=[^>]+)?>).*?(?=</font>)")
                .Cast<Match>()
                .Select(mx => mx.Value)
                .ToList();
        }

        resultWdHtml.Add(new WWyDetails() { WSinotrans = cells });
    }

    return resultWdHtml;
}

/// <summary>
/// The cell texts of a single table row returned by <c>GetWWyHtml</c>.
/// </summary>
public class WWyDetails
{
    /// <summary>Cell texts of the row, in column order.</summary>
    public List<string> WSinotrans { get; set; }
}
// Fetching and parsing a classic ASP page.
//
// Usage:
//
//     var sPuci = HttpAspPostMathHtml("http://xxx/index_dt_container.asp", "search=true&companyname=&companycode=&container_no=&bill_no=" + strBlNo + "&btn3.x=39&btn3.y=15", "gb2312");
//     var gdvContainer = GetSYDetails(sPuci, @"class=tableGrid", @"class=gridHeader", "铅封号", "YTWY");

/// <summary>
/// Finds the first table whose header row contains a given keyword and returns the cell
/// texts of that table's data rows.
/// </summary>
/// <remarks>
/// <paramref name="classTable"/> and <paramref name="classParm"/> are concatenated into the
/// patterns verbatim, so they are interpreted as regular-expression fragments. Only the
/// first table whose header contains <paramref name="compareInfo"/> is processed.
/// </remarks>
/// <param name="regexInfo">HTML to parse. Null or empty yields an empty list.</param>
/// <param name="classTable">Fragment that must occur inside the opening table tag (e.g. its class or id attribute).</param>
/// <param name="classParm">Attribute text of the header row; the header is matched as <c>&lt;tr classParm&gt;</c>.</param>
/// <param name="compareInfo">Keyword that the header row must contain for the table to be selected.</param>
/// <param name="companyCode">
/// Selects the cell rule: "YTWY" reads td elements that are bare or carry an align
/// attribute and treats a row containing "查询数据为空。" as having no cells; any other
/// value reads td elements that are bare or carry a class attribute.
/// </param>
/// <returns>One <see cref="SYDetails"/> per data row (the first row of the table is skipped as the header).</returns>
public static List<SYDetails> GetSYDetails(string regexInfo, string classTable, string classParm, string compareInfo, string companyCode)
{
    List<SYDetails> lstSTX = new List<SYDetails>();
    if (string.IsNullOrEmpty(regexInfo))
    {
        return lstSTX;
    }

    // Matches every table whose opening tag contains classTable.
    Regex regTable = new Regex(@"<table.*?" + classTable + "[^>]*?>[\\s\\S]*?<\\/table>");

    foreach (Match mTable in regTable.Matches(regexInfo))
    {
        // Check whether this table's header row carries the keyword we are looking for.
        var header = Regex.Match(mTable.Value, "(?is)<tr " + classParm + ">(?:\\s*<td[^>]*>(.*?)</td>)*\\s*((?!</tr>).)*").Value.Trim();
        if (!header.Contains(compareInfo))
        {
            continue;
        }

        // Enumerate the rows of the selected table.
        Regex regTr = new Regex(@"(?is)(?<=<table[^>]*?" + classTable + "[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>");
        bool isYtwy = companyCode == "YTWY";
        string cellPattern = isYtwy
            ? @"(?is)(?<=<td(\s+align=[^>]+)?>).*?(?=\s*</td)"
            : @"(?is)(?<=<td(\s+class=[^>]+)?>).*?(?=\s*</td)";

        // Skip(1): the first row is the header row.
        foreach (Match mTr in regTr.Matches(mTable.Value).Cast<Match>().Skip(1))
        {
            // A fresh list per row. Previously one list variable was shared by all rows,
            // so a "no data" row was stored with the cells of the row before it.
            List<string> td;
            if (isYtwy && mTr.Value.Contains("查询数据为空。"))
            {
                td = new List<string>();
            }
            else
            {
                td = Regex.Matches(mTr.Value, cellPattern)
                    .Cast<Match>()
                    .Select(mx => mx.Value)
                    .ToList();
            }

            lstSTX.Add(new SYDetails() { SYTwy = td });
        }

        break;
    }

    return lstSTX;
}

/// <summary>
/// The cell texts of a single table row returned by <c>GetSYDetails</c>.
/// </summary>
public class SYDetails
{
    /// <summary>Cell texts of the row, in column order.</summary>
    public List<string> SYTwy { get; set; }
}
#region ASP/ASPX pages: fetch response data via GET/POST

/// <summary>
/// Posts a form-encoded body to a classic ASP page and returns the response's
/// body element after it has been cleaned by <see cref="DelHTML"/>.
/// </summary>
/// <param name="Url">URL to post to.</param>
/// <param name="postDataStr">Already form-encoded request body (<c>a=1&amp;b=2</c>).</param>
/// <param name="encoding">Name of the page encoding (e.g. "gb2312"); used for both the request body and the response text.</param>
/// <returns>
/// The trimmed <c>&lt;body&gt;…&lt;/body&gt;</c> markup of the cleaned response, or the whole
/// cleaned response when it contains no body element.
/// </returns>
public static string HttpAspPostMathHtml(string Url, string postDataStr, string encoding)
{
    System.Text.Encoding pageEncoding = System.Text.Encoding.GetEncoding(encoding);

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    request.Method = "POST";
    request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
    request.ContentType = "application/x-www-form-urlencoded";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
    // Advertise gzip/deflate AND have the framework decode it. Sending the
    // Accept-Encoding header by hand left compressed replies undecoded.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    // Write the POST body, encoded the same way the page itself is encoded.
    byte[] bytes = pageEncoding.GetBytes(postDataStr ?? string.Empty);
    request.ContentLength = bytes.Length;
    using (Stream stream = request.GetRequestStream())
    {
        stream.Write(bytes, 0, bytes.Length);
    }

    string retString;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responsestream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(responsestream, pageEncoding))
    {
        retString = sr.ReadToEnd();
    }

    return ExtractBody(DelHTML(retString));
}

/// <summary>
/// Sends a GET request to a classic ASP page and returns the raw response text.
/// </summary>
/// <param name="Url">URL to request, without a query string.</param>
/// <param name="postDataStr">Query string to append after "?" (without the "?"); may be null or empty.</param>
/// <param name="encoding">Name of the page encoding (e.g. "gb2312") used to decode the response.</param>
/// <returns>The response text exactly as received; no cleanup is applied.</returns>
public static string HttpAspGetMathHtml(string Url, string postDataStr, string encoding)
{
    string requestUrl = string.IsNullOrEmpty(postDataStr) ? Url : Url + "?" + postDataStr;

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUrl);
    request.Method = "GET";
    request.Accept = "text/html, application/xhtml+xml, image/jxr, */*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.7,ja;q=0.3");
    // See HttpAspPostMathHtml: let the framework negotiate and decode compression.
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko";

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responsestream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(responsestream, System.Text.Encoding.GetEncoding(encoding)))
    {
        return sr.ReadToEnd();
    }
}

/// <summary>
/// Downloads an ASPX page and collects its hidden input fields (view state and the
/// like) so they can be sent back with a subsequent POST.
/// </summary>
/// <param name="Url">URL of the ASPX page.</param>
/// <returns>
/// Map from each hidden input's <c>id</c> to its <c>value</c> ("" when no non-empty value
/// is found). Inputs without an id are skipped; when an id occurs more than once the
/// last occurrence wins.
/// </returns>
public Dictionary<string, string> GetViewHiddenData(string Url)
{
    string html;
    using (HttpClient httpClient = new HttpClient())
    {
        httpClient.MaxResponseContentBufferSize = 256000;
        httpClient.DefaultRequestHeaders.Add("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36");
        using (HttpResponseMessage response = httpClient.GetAsync(new Uri(Url)).Result)
        {
            html = response.Content.ReadAsStringAsync().Result;
        }
    }

    Dictionary<string, string> returnHidden = new Dictionary<string, string>();
    foreach (Match hiddenInput in Regex.Matches(html, @"<input type=""hidden""[^>]*?.*?\/>"))
    {
        string item = hiddenInput.Value.Trim();

        // Pull the id and the value attribute out of the hidden input.
        var key = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?id=""([\s\S]+?)""[^>]+>").Groups[1].Value;
        var value = Regex.Match(item, @"<input type=""hidden""[^>]*?[^>]+?value=""([\s\S]+?)""[^>]+>").Groups[1].Value;

        if (key.Length == 0)
        {
            // Nothing to post this field under.
            continue;
        }

        // Indexer rather than Add: a repeated id must not throw.
        returnHidden[key] = value;
    }

    return returnHidden;
}

/// <summary>
/// Posts form fields to an ASPX page and returns the response's body element after it
/// has been cleaned by <see cref="DelHTML"/>.
/// </summary>
/// <param name="Url">URL of the ASPX page.</param>
/// <param name="postDataStr">Form fields to send, URL-encoded by this method.</param>
/// <returns>
/// The trimmed <c>&lt;body&gt;…&lt;/body&gt;</c> markup of the cleaned response, or the whole
/// cleaned response when it contains no body element.
/// </returns>
public static string HttpAspxPostMathHtml(string Url, List<KeyValuePair<String, String>> postDataStr)
{
    Uri target = new Uri(Url);
    string retString;

    using (HttpClient httpClient = new HttpClient())
    {
        // The page is requested once before posting and that response is discarded.
        // NOTE(review): presumably this lets the client pick up session cookies for
        // the POST that follows - confirm against the target site.
        using (HttpResponseMessage warmUp = httpClient.GetAsync(target).Result)
        {
        }

        using (FormUrlEncodedContent form = new FormUrlEncodedContent(postDataStr))
        using (HttpResponseMessage response = httpClient.PostAsync(target, form).Result)
        {
            retString = response.Content.ReadAsStringAsync().Result;
        }
    }

    return ExtractBody(DelHTML(retString));
}

/// <summary>
/// Returns the first <c>&lt;body&gt;…&lt;/body&gt;</c> element of <paramref name="html"/>
/// (tags included, trimmed), or <paramref name="html"/> itself when there is none.
/// </summary>
private static string ExtractBody(string html)
{
    Match body = Regex.Match(html, @"(?is)<body[^>]*?>([\s\S].*?)</body>");
    return body.Success ? body.Value.Trim() : html;
}

#region Strip unneeded data from HTML

/// <summary>
/// Removes script blocks, hidden inputs and comments from HTML and decodes a small set
/// of character entities.
/// </summary>
/// <remarks>
/// Line breaks are deliberately left in place: callers such as <c>GetWWyHtml</c> match
/// on the CR/LF that surrounds cell contents.
/// </remarks>
/// <param name="Htmlstring">HTML to clean; null is treated as empty.</param>
/// <returns>The cleaned HTML.</returns>
public static string DelHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    const RegexOptions ignoreCase = RegexOptions.IgnoreCase;
    const RegexOptions spanningLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Script blocks, including ones that span several lines.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", spanningLines);

    // Hidden inputs.
    Htmlstring = Regex.Replace(Htmlstring, @"<input type=""hidden""[^>]*?.*?\/>", "", ignoreCase);

    // Complete comments first, so markup that follows a comment on the same line
    // survives; then the leftovers of malformed comments: a stray terminator, and an
    // unterminated opener together with the rest of its line.
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", spanningLines);
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", ignoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", ignoreCase);

    // Entity pattern -> replacement, applied in this order. &nbsp; is intentionally not
    // decoded; the final rule drops every numeric entity not handled above it.
    string[,] entities =
    {
        { @"&(quot|#34);", "\"" },
        { @"&(amp|#38);", "&" },
        { @"&(lt|#60);", "<" },
        { @"&(gt|#62);", ">" },
        { @"&(iexcl|#161);", "\u00A1" },
        { @"&(cent|#162);", "\u00A2" },
        { @"&(pound|#163);", "\u00A3" },
        { @"&(copy|#169);", "\u00A9" },
        { @"&#(\d+);", "" },
    };
    for (int i = 0; i < entities.GetLength(0); i++)
    {
        Htmlstring = Regex.Replace(Htmlstring, entities[i, 0], entities[i, 1], ignoreCase);
    }

    return Htmlstring;
}

#endregion

#endregion
0 0
- C#正则表达式 解析html+table tr td 内容
- java读取html页面并解析<table><tr><td>
- 表格<table> <tr><td>
- jQuery 遍历Table中tr中的td中的内容
- html table tr td 上下移动行实例源代码
- html table tr td br 什么意思 缩写
- html table tr td br 什么意思 缩写
- html table tr td br 什么意思 缩写
- html table tr td br 什么意思 缩写
- <html>table、tr/td、thead、tbody、tfoot、col、colgroup
- HTML中table的属性tr、th、td的使用
- HTML之表格子元素选择器table>tr>td
- 正则表达式。取Html中Table中的Td里面的值
- jq操作table tr td
- 遍历table中的tr td
- java根据 正则表达式解析html网页内容
- html 中的tr th td
- 解析html中的table内容
- pyinstaller的用法
- 使用left join连接带NULL值数据表案例
- Oracle ADF HelloWorld
- 整数转化
- javaSE_8系列博客——Java语言的特性(二)--高级语言的基础知识(7)-- 流程控制语句
- C#正则表达式 解析html+table tr td 内容
- windows下bat批处理实现守护进程(有日志)
- JavaScript语言基础7---函数的综合练习
- linux as4下安装oracle10g终结版
- 02 The TensorFlow Way(3)
- fopen后只能读到部分文件问题解决,文本方式vs二进制方式
- XML的生成与解析
- 开源软件expect软件安装
- Cocos2d-x:学习笔记(2017.05.12更新)