WinForms 简易网页爬取示例

来源:互联网 发布:泛鹏天地知乎 编辑:程序博客网 时间:2024/06/05 18:23

            获取某网页中的指定内容

        /// <summary>
        /// 获取加载页面的html
        /// </summary>
        /// <returns></returns>
        private string SendRequestData(string url)
        {

            Uri uri = new Uri(url, UriKind.Absolute);
            WebClient client = new WebClient();
            return client.DownloadString(uri);
        }

        /// <summary>
        /// 获取专家图片
        /// </summary>
        private void GetDocImage()
        {
            string baseUri = "http://www.zchospital.com/cms/RYLB.aspx?LMID=29";
            string htmls = SendRequestData(baseUri);
            if (string.IsNullOrEmpty(htmls))
            {
                return;
            }

            string[] splits = { "<div  style=\"float:left;width:24%;margin-right:5px;\">" };
            string[] dochtml = htmls.Split(splits, StringSplitOptions.RemoveEmptyEntries);//获取专家所在div
            if (dochtml.Length > 1)
            {
                string s = string.Empty;
                string url = string.Empty;
                string name = string.Empty;
                WebClient client = new WebClient();
                int num = 0;
                for (int i = 1; i < dochtml.Length; i++)
                {
                    s = dochtml[i].ToString();
                    Regex reg = new Regex("<img.+?/>");
                    Match m = reg.Match(s);//获取img的html

                    if (m != null)
                    {
                        string[] url1 = m.ToString().Split('\"');//获取img的src
                        if (url1.Length > 0)
                        {
                            foreach (string a in url1)
                            {
                                if (a.Contains("upfile"))
                                {
                                    url = "http://www.zchospital.com/" + a;
                                    reg = new Regex("<span.+?>(.|\n)+?</span>");
                                    m = reg.Match(s);//获取医生姓名span
                                    if (m != null)
                                    {
                                        name = m.ToString();
                                        reg = new Regex("<span.+?>");//匹配前面的span标签
                                        m = reg.Match(s);
                                        name = name.Replace(m.ToString(), "").Replace("</span>", "");//获取医生姓名
                                        //下载资源并重命名图片:专家姓名+图片名称
                                        string path = System.Windows.Forms.Application.StartupPath;
                                        DirectoryInfo logPath = new DirectoryInfo(path + "/docimage");
                                        if (!logPath.Exists)
                                        { //判断文件夹是否存在
                                            logPath.Create(); //不存在则创建文件夹
                                        }
                                        try
                                        {
                                            //该图片没有下载过

                                            if (!File.Exists(logPath + "\\" + name + url.Substring(url.LastIndexOf("/") + 1, url.Length - url.LastIndexOf("/") - 1)))
                                            {
                                                client.DownloadFile(url, logPath + "\\" + name + url.Substring(url.LastIndexOf("/") + 1, url.Length - url.LastIndexOf("/") - 1));
                                                num += 1;
                                                richTextBox1.Text += name + "图片下载成功!\r\n";

                                            }
                                        }
                                        catch (Exception ex)
                                        {
                                            WriteLog("下载" + name + url.Substring(url.LastIndexOf("/") + 1, url.Length - url.LastIndexOf("/") - 1) + "出错:" + ex.Message);
                                        }

                                    }
                                    break;
                                }

                            }

                        }
                    }
                    if (i == dochtml.Length - 1)
                    {
                        richTextBox1.Text += "专家图片下载全部完成!\r\n共下载" + num + "张专家图片";
                    }
                }
            }
        }

 

原创粉丝点击