简单的网页采集程序

来源：互联网　发布：win7网络连接不上　编辑：程序博客网　时间：2024/05/22 10:31

最近开发一个项目时，甲方提出要从另一个网站采集信息列表并支持跳转，于是就简单做了一个信息列表采集功能：

       /// <summary>        /// 采集视频列表        /// </summary>        /// <param name="sender"></param>        /// <param name="e"></param>        public static void getVideos(object sender, System.Timers.ElapsedEventArgs e)        {            try            {                string strHtml=GetWholeHtmlCode("<span style="font-family: Arial, Helvetica, sans-serif;">http://www.xxx.com/</span>");

<span style="white-space:pre"></span>//包含列表的内容截取                strHtml = strHtml.Substring(strHtml.IndexOf("<div class='x#xxd'>"));                strHtml = strHtml.Substring(0, strHtml.IndexOf("<div id=Footer>"));                Regex reg = new Regex(@"(?is)<a[^>]+?href=(['""]?)(?<url>[^'""\s>]+).+?title=(['""]?)(?<title>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");                string str = "";                MatchCollection mc = reg.Matches(strHtml);                foreach (Match m in mc)                {                    try                    {                        string url = m.Groups["url"].Value;                        string title = m.Groups["title"].Value;                        if (!url.Contains("http:/"))                        {                            url = "http://www.xxx.com/" + m.Groups["url"].Value;                        }                        // 如果url已经添加了，则中断循环                        string sql = " SELECT [Path]  FROM [ArticInfo] where [ArticPath]='"+url+"'";                        object obj = SqlHelper.ExecuteScalar(Art_conn, CommandType.Text, sql);                        if (obj != null)                        {                            continue;                        }

<span style="white-space:pre"></span>//添加到表中                        AddHtmlToArt(url, title);                    }                    catch                    {                        continue;                    }                }            }            catch            {            }        }

/// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body as a single line of text.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <returns>
        /// The response body decoded as UTF-8 with every CR and LF character removed.
        /// This method does not throw for request failures: if an exception occurs its message is
        /// returned instead of HTML, and if the response status is not 200 OK a fixed error
        /// sentence is returned. Callers therefore have to validate the returned text.
        /// </returns>
        public static string GetWholeHtmlCode(string url)
        {
            string strHtml;
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = 300000; // milliseconds, i.e. 5 minutes

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        // Previously this message was assigned and then unconditionally
                        // overwritten by the body read below; the else branch makes it effective.
                        strHtml = "Sorry, the web page is not run successful";
                    }
                    else
                    {
                        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
                        {
                            strHtml = reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Kept for backward compatibility: failures are reported through the return value.
                strHtml = e.Message;
            }

            // Flatten to one line so callers can search the markup with simple patterns.
            return strHtml.Replace("\r", "").Replace("\n", "");
        }

0 0