时间匹配小例子

来源:互联网 发布:java web war包下载 编辑:程序博客网 时间:2024/05/01 13:48
// Patterns used by GetArticleByHtml. They are built once here instead of on every
// call / loop iteration; Regex instances are safe to share for matching.
private static readonly Regex ArticleBlockRegex = new Regex(@"<p class=""t4"">[\s\S]+?</p>");
private static readonly Regex CommentLinkRegex = new Regex("<a href=[^>]*>评论[^>]*</a>[^<]*<span");
private static readonly Regex TitleRegex = new Regex(@"<p class=""t4"">[\s\S]+?评论");
private static readonly Regex AuthorRegex = new Regex(@"<a href=""/t2/othdoc.do[^<]*</a>");

// Absolute "M月d日 H:m" stamp; groups 1-4 are month, day, hour and minute.
private static readonly Regex MonthDayTimeRegex = new Regex(@"([\d]{1,2})月([\d]{1,2})日 ([\d]{1,2}):([\d]{1,2})");

// Fallback: the <span class="time"> element.
// NOTE(review): this pattern only matches when the span text starts with a digit, so a span
// that starts with "昨天" / "前天" never reaches the relative-time branches - confirm against the site markup.
private static readonly Regex TimeElementRegex = new Regex(@"<span class=""time"">[\d]{1,2}[\s\S]+?</span>");
private static readonly Regex FullDateRegex = new Regex(@"[\d]{4}-[\d]{1,2}-[\d]{1,2}");
private static readonly Regex ClockRegex = new Regex(@"[\d]{1,2}:[\d]{1,2}");

// Relative stamps. The blank between the number and the unit is optional so that
// "5分钟前" is read the same way as "5 分钟前".
private static readonly Regex MinutesAgoRegex = new Regex(@"([\d]{1,2})\s*分钟前");
private static readonly Regex HoursAgoRegex = new Regex(@"([\d]{1,2})\s*小时前");
private static readonly Regex DaysAgoRegex = new Regex(@"([\d]{1,2})\s*天前");

/// <summary>
/// Builds the list of articles found in a listing page.
/// </summary>
/// <param name="HTMLContent">HTML of the listing page. It is lower-cased before matching, so every value extracted from it is lower-case as well.</param>
/// <param name="task_ID">Identifier copied to <c>Task_ID</c> of every result.</param>
/// <returns>One result per article block that contains a comment link; an empty list when nothing matches or the input is null or empty.</returns>
protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
{
    List<CrawlerResult> arrayList = new List<CrawlerResult>();
    if (string.IsNullOrEmpty(HTMLContent))
    {
        return arrayList;
    }

    // Invariant lower-casing keeps the ASCII tag/attribute names stable on every host culture.
    MatchCollection articleBlocks = ArticleBlockRegex.Matches(HTMLContent.ToLowerInvariant());
    foreach (Match articleBlock in articleBlocks)
    {
        string itemHtml = articleBlock.Value;

        // URL: a block without a comment link is not an article entry and is skipped.
        Match commentLink = CommentLinkRegex.Match(itemHtml);
        if (!commentLink.Success)
        {
            continue;
        }

        CrawlerResult item = new CrawlerResult();
        item.Task_ID = task_ID;
        item.Url = "http://w.sohu.com" + GetURL(commentLink.Value);

        // Title
        Match titleMatch = TitleRegex.Match(itemHtml);
        if (titleMatch.Success)
        {
            item.Title = CommonFunction.DeleteHTMLElement(titleMatch.Value);
        }

        // Author
        Match authorMatch = AuthorRegex.Match(itemHtml);
        if (authorMatch.Success)
        {
            item.Author = CommonFunction.DeleteHTMLElement(authorMatch.Value);
        }

        // Media / site name
        item.SiteName = "XXXXXX";

        // ---- Time matching ----
        // CreateTime is left untouched when no usable time stamp is found.
        DateTime createTime;
        if (TryParseCreateTime(itemHtml, out createTime))
        {
            item.CreateTime = createTime;
        }

        arrayList.Add(item);
    }

    return arrayList;
}

/// <summary>
/// Extracts the publication time of one article block. An absolute "M月d日 H:m" stamp is tried
/// first; otherwise the text of the time span is read as a full date or as a relative expression.
/// </summary>
/// <param name="itemHtml">Lower-cased HTML of a single article block.</param>
/// <param name="createTime">The parsed local time when the method returns true.</param>
/// <returns>True when a time could be determined; false otherwise (failures are logged as warnings).</returns>
private bool TryParseCreateTime(string itemHtml, out DateTime createTime)
{
    createTime = DateTime.MinValue;

    Match monthDay = MonthDayTimeRegex.Match(itemHtml);
    if (monthDay.Success)
    {
        // Normalise to "M-d H:m" and parse it with a fixed format so the result does not depend
        // on the culture of the host; the missing year defaults to the current year.
        string normalized = monthDay.Groups[1].Value + "-" + monthDay.Groups[2].Value + " "
            + monthDay.Groups[3].Value + ":" + monthDay.Groups[4].Value;
        if (DateTime.TryParseExact(normalized, "M-d H:m", System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None, out createTime))
        {
            return true;
        }

        CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + itemHtml, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
        return false;
    }

    Match timeElement = TimeElementRegex.Match(itemHtml);
    if (!timeElement.Success)
    {
        return false;
    }

    string timeText = CommonFunction.DeleteHTMLElement(timeElement.Value);

    Match fullDate = FullDateRegex.Match(timeText);
    if (fullDate.Success)
    {
        if (DateTime.TryParseExact(fullDate.Value, "yyyy-M-d", System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None, out createTime))
        {
            return true;
        }

        CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + timeText, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
        return false;
    }

    return TryParseRelativeTime(timeText, itemHtml, out createTime);
}

/// <summary>
/// Interprets relative expressions: "N 分钟前" (minutes ago), "N 小时前" (hours ago),
/// "昨天" (yesterday), "前天" (the day before yesterday) and "N 天前" (days ago).
/// </summary>
/// <param name="timeText">Text of the time span with the markup removed.</param>
/// <param name="itemHtml">Lower-cased HTML of the article block, searched for an "H:m" clock time in the day-based cases.</param>
/// <param name="createTime">The computed local time when the method returns true.</param>
/// <returns>True when one of the expressions was recognised.</returns>
private bool TryParseRelativeTime(string timeText, string itemHtml, out DateTime createTime)
{
    createTime = DateTime.MinValue;

    // The order of the checks is significant and is kept as it was.
    if (timeText.IndexOf("分钟前", StringComparison.Ordinal) > 0)
    {
        createTime = DateTime.Now.AddMinutes(-ReadRelativeCount(MinutesAgoRegex, timeText));
        return true;
    }

    if (timeText.IndexOf("小时前", StringComparison.Ordinal) > 0)
    {
        createTime = DateTime.Now.AddHours(-ReadRelativeCount(HoursAgoRegex, timeText));
        return true;
    }

    // "昨天" and "前天" carry no number, so nothing is parsed for them.
    if (timeText.IndexOf("昨天", StringComparison.Ordinal) > -1)
    {
        createTime = ResolveClockTime(itemHtml).AddDays(-1);
        return true;
    }

    if (timeText.IndexOf("前天", StringComparison.Ordinal) > -1)
    {
        createTime = ResolveClockTime(itemHtml).AddDays(-2);
        return true;
    }

    if (timeText.IndexOf("天前", StringComparison.Ordinal) > 0)
    {
        createTime = ResolveClockTime(itemHtml).AddDays(-ReadRelativeCount(DaysAgoRegex, timeText));
        return true;
    }

    return false;
}

/// <summary>
/// Reads the number in front of a relative-time unit.
/// </summary>
/// <param name="pattern">Pattern whose first group captures the number.</param>
/// <param name="timeText">Text of the time span.</param>
/// <returns>The captured number, or 0 when the pattern does not match or the number cannot be parsed.</returns>
private int ReadRelativeCount(Regex pattern, string timeText)
{
    Match countMatch = pattern.Match(timeText);
    if (!countMatch.Success)
    {
        return 0;
    }

    int count;
    if (int.TryParse(countMatch.Groups[1].Value, System.Globalization.NumberStyles.None,
        System.Globalization.CultureInfo.InvariantCulture, out count))
    {
        return count;
    }

    CommonFunction.logWirte(this.SearchName + "抓取匹配时间2出错:源是" + timeText, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
    return 0;
}

/// <summary>
/// Returns today's date combined with the first "H:m" clock time found in the block.
/// </summary>
/// <param name="itemHtml">Lower-cased HTML of the article block.</param>
/// <returns>Today at the matched time, or the current local time when no valid clock time is present.</returns>
private DateTime ResolveClockTime(string itemHtml)
{
    Match clock = ClockRegex.Match(itemHtml);
    if (clock.Success)
    {
        DateTime parsed;
        // A time-only format yields today's date; out-of-range values such as "25:70" are
        // rejected here instead of throwing and aborting the whole page.
        if (DateTime.TryParseExact(clock.Value, "H:m", System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None, out parsed))
        {
            return parsed;
        }

        CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + clock.Value, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
    }

    return DateTime.Now;
}

原创粉丝点击