读取淘宝页面字节流,提取宝贝图片地址、宝贝标题和宝贝价格

来源:互联网 发布:linux安装过程 编辑:程序博客网 时间:2024/04/28 02:03
/// <summary>
/// Helpers that download a Taobao item page and pull the item's image URL,
/// title and price out of the HTML with regular expressions.
/// Public member names are kept as-is for compatibility with existing callers.
/// </summary>
public static class taobao_message
{
    // The patterns never change, so build each Regex once instead of on every call.

    /// <summary>Matches the tag carrying the J_ImgBooth marker and captures its src value as "imgUrl".</summary>
    private static readonly Regex ImageRegex = new Regex(
        @"J_ImgBooth\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>");

    /// <summary>Matches an h3 element (optionally wrapping an anchor); group 2 is the text.</summary>
    private static readonly Regex TitleRegex = new Regex("<h3>(<a[^>]*>)?([^<]*)(</a>)?</h3>");

    /// <summary>Matches the element carrying the J_StrPrice marker; group 1 is the text.</summary>
    private static readonly Regex PriceRegex = new Regex("J_StrPrice[^>]*>([^<>]*)(</)");

    /// <summary>
    /// Downloads a page as raw bytes and decodes it to a string.
    /// </summary>
    /// <param name="url">Page address, with or without a leading "http://" or "https://".</param>
    /// <returns>The page content decoded as GB2312.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
    public static string webclinet_content(string url)
    {
        string address = NormalizeUrl(url);

        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            byte[] page = client.DownloadData(address);

            // Taobao pages are encoded as GB2312.
            // NOTE(review): on .NET Core / .NET 5+ this encoding presumably needs the
            // code-pages encoding provider registered first - confirm for the target runtime.
            return System.Text.Encoding.GetEncoding("GB2312").GetString(page);
        }
    }

    /// <summary>
    /// Reads the information of a Taobao item.
    /// </summary>
    /// <param name="url">Item page address.</param>
    /// <returns>An array laid out as { image URL, title, price }.</returns>
    public static string[] baobei_mess(string url)
    {
        string content = webclinet_content(url);
        return new string[]
        {
            get_taobao(content, 1),
            get_taobao(content, 2),
            get_taobao(content, 3)
        };
    }

    /// <summary>
    /// Extracts the content of a specific tag from the page HTML.
    /// When the pattern matches more than once, the captured values are concatenated
    /// in document order.
    /// </summary>
    /// <param name="content">The string to extract from.</param>
    /// <param name="type">What to extract: 0 = nothing; 1 = item image URL; 2 = item title; 3 = item price.</param>
    /// <returns>
    /// The extracted text, or an empty string when nothing matches, when
    /// <paramref name="content"/> is null or empty, or when <paramref name="type"/>
    /// is not 1, 2 or 3.
    /// </returns>
    public static string get_taobao(string content, int type)
    {
        // Nothing to search: return the same empty result as "no match" instead of
        // letting Regex.Matches throw on null.
        if (string.IsNullOrEmpty(content))
        {
            return "";
        }

        Regex pattern;
        int groupNumber;
        switch (type)
        {
            case 1:
                pattern = ImageRegex;
                groupNumber = ImageRegex.GroupNumberFromName("imgUrl");
                break;
            case 2:
                pattern = TitleRegex;
                groupNumber = 2;
                break;
            case 3:
                pattern = PriceRegex;
                groupNumber = 1;
                break;
            default:
                // Type 0 and any unknown type extract nothing.
                return "";
        }

        System.Text.StringBuilder result = new System.Text.StringBuilder();
        foreach (Match match in pattern.Matches(content))
        {
            result.Append(match.Groups[groupNumber].Value);
        }
        return result.ToString();
    }

    /// <summary>
    /// Ensures the address has a scheme: addresses already starting with "http://" or
    /// "https://" (case-insensitive) are returned unchanged, anything else gets "http://"
    /// prepended. Only the prefix is inspected, so "http://" inside a query string is
    /// left alone.
    /// </summary>
    private static string NormalizeUrl(string url)
    {
        if (url == null)
        {
            throw new System.ArgumentNullException("url");
        }

        if (url.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase) ||
            url.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }

        return "http://" + url;
    }
}
原创粉丝点击