时间匹配小例子
来源:互联网 发布:java web war包下载 编辑:程序博客网 时间:2024/05/01 13:48
// Patterns used by GetArticleByHtml. They are applied to lower-cased HTML, so
// every literal inside them is lower case. Built once instead of once per item.
private static readonly Regex ArticleItemRegex = new Regex(@"<p class=""t4"">[\s\S]+?</p>");
private static readonly Regex CommentLinkRegex = new Regex("<a href=[^>]*>评论[^>]*</a>[^<]*<span");
private static readonly Regex TitleRegex = new Regex(@"<p class=""t4"">[\s\S]+?评论");
private static readonly Regex AuthorRegex = new Regex(@"<a href=""/t2/othdoc.do[^<]*</a>");
// Same text as the original "M月d日 H:m" pattern; the groups only expose the four numbers.
private static readonly Regex MonthDayTimeRegex = new Regex(@"([\d]{1,2})月([\d]{1,2})日 ([\d]{1,2}):([\d]{1,2})");
private static readonly Regex TimeElementRegex = new Regex(@"<span class=""time"">[\d]{1,2}[\s\S]+?</span>");
private static readonly Regex FullDateRegex = new Regex(@"[\d]{4}-[\d]{1,2}-[\d]{1,2}");
private static readonly Regex ClockRegex = new Regex(@"[\d]{1,2}:[\d]{1,2}");
private static readonly Regex MinutesAgoRegex = new Regex(@"([\d]{1,2}) 分钟前");
private static readonly Regex HoursAgoRegex = new Regex(@"([\d]{1,2}) 小时前");
private static readonly Regex DaysAgoRegex = new Regex(@"([\d]{1,2}) 天前");

/// <summary>
/// Builds the list of articles found in a page's HTML.
/// Every <c>&lt;p class="t4"&gt;</c> element that contains a "评论" (comments) link
/// yields one result carrying the URL, title, author, site name and, when it can
/// be recognised, the creation time. Elements without that link are skipped.
/// </summary>
/// <param name="HTMLContent">HTML of the page; null or empty yields an empty list.</param>
/// <param name="task_ID">Task identifier copied onto every result.</param>
/// <returns>The extracted results, in document order; never null.</returns>
protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
{
    List<CrawlerResult> arrayList = new List<CrawlerResult>();
    if (string.IsNullOrEmpty(HTMLContent))
    {
        return arrayList;
    }

    // NOTE(review): lower-casing the whole page also lower-cases the extracted
    // title, author and URL. Kept because the patterns rely on it; confirm that
    // downstream consumers expect lower-cased values.
    string normalizedHtml = HTMLContent.ToLowerInvariant();

    foreach (Match itemMatch in ArticleItemRegex.Matches(normalizedHtml))
    {
        string itemHtml = itemMatch.Value;

        // URL: an item without a comments link is not reported at all.
        Match linkMatch = CommentLinkRegex.Match(itemHtml);
        if (!linkMatch.Success)
        {
            continue;
        }

        CrawlerResult item = new CrawlerResult();
        item.Task_ID = task_ID;
        item.Url = "http://w.sohu.com" + GetURL(linkMatch.Value);

        // Title
        Match titleMatch = TitleRegex.Match(itemHtml);
        if (titleMatch.Success)
        {
            item.Title = CommonFunction.DeleteHTMLElement(titleMatch.Value);
        }

        // Author
        Match authorMatch = AuthorRegex.Match(itemHtml);
        if (authorMatch.Success)
        {
            item.Author = CommonFunction.DeleteHTMLElement(authorMatch.Value);
        }

        // Media / site name
        item.SiteName = "XXXXXX";

        // ---------------- time matching ----------------
        // CreateTime is left untouched when no timestamp can be recognised.
        DateTime createTime;
        if (TryGetCreateTime(itemHtml, out createTime))
        {
            item.CreateTime = createTime;
        }
        // ------------- end of time matching -------------

        arrayList.Add(item);
    }

    return arrayList;
}

/// <summary>
/// Tries the supported timestamp formats in order: "M月d日 H:m" anywhere in the
/// item, then inside the <c>&lt;span class="time"&gt;</c> element either a
/// "yyyy-M-d" date or a relative expression (分钟前 / 小时前 / 昨天 / 前天 / 天前).
/// </summary>
/// <param name="itemHtml">HTML of a single article element.</param>
/// <param name="createTime">The resolved time when the method returns true.</param>
/// <returns>True when a timestamp was recognised and converted.</returns>
private bool TryGetCreateTime(string itemHtml, out DateTime createTime)
{
    createTime = default(DateTime);

    Match monthDay = MonthDayTimeRegex.Match(itemHtml);
    if (monthDay.Success)
    {
        if (TryBuildMonthDayTime(monthDay, DateTime.Now, out createTime))
        {
            return true;
        }

        // As before, a malformed "M月d日 H:m" value is reported and no other format is tried.
        LogTimeWarning("抓取匹配时间出错:源是", itemHtml);
        return false;
    }

    Match timeElement = TimeElementRegex.Match(itemHtml);
    if (!timeElement.Success)
    {
        return false;
    }

    string time = CommonFunction.DeleteHTMLElement(timeElement.Value);
    if (string.IsNullOrEmpty(time))
    {
        return false;
    }

    Match fullDate = FullDateRegex.Match(time);
    if (fullDate.Success)
    {
        if (DateTime.TryParse(fullDate.Value, out createTime))
        {
            return true;
        }

        LogTimeWarning("抓取匹配时间出错:源是", time);
        return false;
    }

    return TryGetRelativeTime(time, itemHtml, out createTime);
}

/// <summary>
/// Builds a DateTime from the four captured numbers of a "M月d日 H:m" match.
/// The numbers are read directly so the result does not depend on the current
/// culture being able to parse "月"/"日".
/// </summary>
private static bool TryBuildMonthDayTime(Match match, DateTime now, out DateTime result)
{
    result = default(DateTime);

    int month;
    int day;
    int hour;
    int minute;
    if (!int.TryParse(match.Groups[1].Value, out month)
        || !int.TryParse(match.Groups[2].Value, out day)
        || !int.TryParse(match.Groups[3].Value, out hour)
        || !int.TryParse(match.Groups[4].Value, out minute))
    {
        return false;
    }

    try
    {
        result = new DateTime(now.Year, month, day, hour, minute, 0);
    }
    catch (ArgumentOutOfRangeException)
    {
        // e.g. month 13, day 31 in a 30-day month, or hour 25.
        return false;
    }

    // The text carries no year. A date more than a day ahead of "now" cannot be
    // a publication time, so it belongs to the previous year (e.g. a December
    // post crawled in January).
    if (result > now.AddDays(1))
    {
        result = result.AddYears(-1);
    }

    return true;
}

/// <summary>
/// Converts a relative expression found in <paramref name="time"/> into an
/// absolute time. The checks run in the original order so that "昨天" and "前天"
/// win over the generic "N 天前".
/// </summary>
private bool TryGetRelativeTime(string time, string itemHtml, out DateTime createTime)
{
    createTime = default(DateTime);
    DateTime now = DateTime.Now;

    if (time.IndexOf("分钟前", StringComparison.Ordinal) > 0)
    {
        createTime = now.AddMinutes(-ReadCount(MinutesAgoRegex, time));
        return true;
    }

    if (time.IndexOf("小时前", StringComparison.Ordinal) > 0)
    {
        createTime = now.AddHours(-ReadCount(HoursAgoRegex, time));
        return true;
    }

    if (time.IndexOf("昨天", StringComparison.Ordinal) > -1)
    {
        // Fixed one-day offset: there is no number to parse here.
        createTime = ResolveTimeOfDay(time, itemHtml, now).AddDays(-1);
        return true;
    }

    if (time.IndexOf("前天", StringComparison.Ordinal) > -1)
    {
        // Fixed two-day offset: there is no number to parse here.
        createTime = ResolveTimeOfDay(time, itemHtml, now).AddDays(-2);
        return true;
    }

    if (time.IndexOf("天前", StringComparison.Ordinal) > 0)
    {
        createTime = ResolveTimeOfDay(time, itemHtml, now).AddDays(-ReadCount(DaysAgoRegex, time));
        return true;
    }

    return false;
}

/// <summary>
/// Reads the number captured by <paramref name="pattern"/> from <paramref name="time"/>.
/// Returns 0 when the pattern does not match (the original best-effort behaviour,
/// which resolves to "now"); logs a warning and returns 0 when the captured text
/// is not a valid integer.
/// </summary>
private int ReadCount(Regex pattern, string time)
{
    Match match = pattern.Match(time);
    if (!match.Success)
    {
        return 0;
    }

    int count;
    if (int.TryParse(match.Groups[1].Value, out count))
    {
        return count;
    }

    LogTimeWarning("抓取匹配时间2出错:源是", time);
    return 0;
}

/// <summary>
/// Returns today's date at the "H:m" time of day found in the time text, falling
/// back to the whole item HTML, and to <paramref name="fallback"/> when no valid
/// time of day is present.
/// </summary>
private static DateTime ResolveTimeOfDay(string time, string itemHtml, DateTime fallback)
{
    // Prefer the time element itself so digits such as "3:2" in a title are not
    // mistaken for the time of day.
    Match clock = ClockRegex.Match(time);
    if (!clock.Success)
    {
        clock = ClockRegex.Match(itemHtml);
    }

    DateTime parsed;
    if (clock.Success && DateTime.TryParse(clock.Value, out parsed))
    {
        return parsed;
    }

    // Values such as "25:99" match the pattern but are not a time; do not throw.
    return fallback;
}

/// <summary>
/// Writes a warning-level log entry of the form SearchName + message + source text.
/// </summary>
private void LogTimeWarning(string message, string source)
{
    CommonFunction.logWirte(this.SearchName + message + source, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
}