程序中得到百度的搜索结果

来源:互联网 发布:yum openssl安装 编辑:程序博客网 时间:2024/05/01 23:22

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Net;
using System.IO;

namespace baiduRobot
{
    /// <summary>
    /// One search result scraped from a Baidu result page.
    /// Any field may be null when the corresponding markup was not found.
    /// </summary>
    struct BaiduEntry
    {
        public string title, brief, link;
    }

    /// <summary>
    /// Console program that queries Baidu for a keyword, scrapes the result
    /// tables out of the returned HTML with regular expressions and prints
    /// link, title and brief of every result.
    /// </summary>
    class Program
    {
        // Lazily matches one <table ...>...</table> pair, across line breaks.
        private static readonly Regex TableRegex =
            new Regex(@"<table\b[\s\S]*?</table>", RegexOptions.IgnoreCase);

        // Captures the inner HTML of a td node (greedy, single line: '.' does not match '\n').
        private static readonly Regex CellRegex =
            new Regex("<td.*?>(.*)</td>", RegexOptions.IgnoreCase);

        // Captures the href value of the first anchor.
        private static readonly Regex LinkRegex =
            new Regex("<a.*?href=\"(.*?)\".*?>", RegexOptions.IgnoreCase);

        // Greedy: spans from the first <font to the last </font> inside a cell.
        private static readonly Regex OuterFontRegex = new Regex("<font.*</font>");

        // Lazy: captures the content of a font tag up to the first </font>.
        private static readonly Regex InnerFontRegex = new Regex("<font.*?>(.*?)</font>");

        /// <summary>
        /// Downloads the Baidu result page for <paramref name="keyword"/>.
        /// </summary>
        /// <param name="keyword">Search keyword typed by the user.</param>
        /// <returns>
        /// The page decoded with code page 936. When the network fails, whatever
        /// was read before the failure (possibly an empty string) is returned
        /// after a message has been printed to the console.
        /// </returns>
        static string GetHtml(string keyword)
        {
            const string url = @"http://www.baidu.com/";

            // Baidu expects the query string in code page 936 (GBK), and the page
            // itself is decoded with the same code page below.
            Encoding encoding = Encoding.GetEncoding(936);
            string encodedKeyword = HttpUtility.UrlEncode(keyword, encoding);

            // The encoded keyword must be sent; the raw one breaks on non-ASCII
            // input and on reserved characters such as '&' or '#'.
            string query = "s?wd=" + encodedKeyword;

            StringBuilder sb = new StringBuilder();
            try
            {
                Console.WriteLine("正在读取网页{0}的内容……", url + query);
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url + query);

                // The request itself is inside the try block so that a connection
                // failure is reported instead of crashing the program, and every
                // disposable is released even when reading fails half-way.
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, encoding))
                {
                    // StreamReader keeps decoder state between reads, so a
                    // double-byte character split across two network reads is
                    // still decoded correctly.
                    char[] buf = new char[8192];
                    int count;
                    while ((count = reader.Read(buf, 0, buf.Length)) > 0)
                    {
                        sb.Append(buf, 0, count);
                    }
                }
            }
            catch (WebException)
            {
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            catch (IOException)
            {
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            return sb.ToString();
        }

        /// <summary>
        /// Prints every entry to the console, numbered from 1, followed by a separator line.
        /// </summary>
        /// <param name="entries">Entries to print; fields that are null are skipped.</param>
        static void PrintResult(List<BaiduEntry> entries)
        {
            int count = 0;
            foreach (BaiduEntry entry in entries)
            {
                count += 1;
                Console.WriteLine("找到了百度的第{0}条搜索结果:", count);
                if (entry.link != null)
                {
                    Console.WriteLine("找到了一条链接:");
                    Console.WriteLine(entry.link);
                }
                if (entry.title != null)
                {
                    Console.WriteLine("标题为:");
                    Console.WriteLine(entry.title);
                }
                if (entry.brief != null)
                {
                    Console.WriteLine("下面是摘要:");
                    Console.WriteLine(entry.brief);
                }
                Cut();
            }
        }

        /// <summary>
        /// Small manual demo: prints a fixed HTML snippet after tag removal.
        /// </summary>
        static void simpleOutput()
        {
            string html = "<table><tr><td><font>test</font><a>hello</a><br></td></tr></table>";
            Console.WriteLine(RemoveSomeTags(html));
        }

        /// <summary>
        /// Removes void tags (currently only the literal "&lt;br&gt;") from <paramref name="html"/>.
        /// </summary>
        static string RemoveVoidTag(string html)
        {
            string[] filter = { "<br>" };
            foreach (string tag in filter)
            {
                html = html.Replace(tag, "");
            }
            return html;
        }

        /// <summary>
        /// Strips the opening and closing a, em, b and font tags while keeping their content.
        /// </summary>
        static string ReleaseXmlTags(string html)
        {
            // \b keeps "<a" and "<font" from also swallowing the opening tag of
            // longer names such as <abbr> or <address>, which would leave their
            // closing tags orphaned.
            string[] filter = { "<a\\b.*?>", "</a>", "<em>", "</em>", "<b>", "</b>", "<font\\b.*?>", "</font>" };
            foreach (string tag in filter)
            {
                html = Regex.Replace(html, tag, "");
            }
            return html;
        }

        /// <summary>
        /// Removes the void tags and then the inline formatting tags from <paramref name="html"/>.
        /// </summary>
        internal static string RemoveSomeTags(string html)
        {
            html = RemoveVoidTag(html);
            html = ReleaseXmlTags(html);
            return html;
        }

        /// <summary>
        /// Prints a separator line.
        /// </summary>
        static void Cut()
        {
            Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        }

        /// <summary>
        /// Parses and prints the results in <paramref name="input"/>, with HTML tags stripped from briefs.
        /// </summary>
        static void MainProc(string input)
        {
            MainProc(input, false);
        }

        /// <summary>
        /// Parses the result tables in <paramref name="input"/> and prints every entry found.
        /// </summary>
        /// <param name="input">HTML fragment containing the result tables.</param>
        /// <param name="tagsForBrief">True to keep the HTML tags inside the brief text.</param>
        static void MainProc(string input, bool tagsForBrief)
        {
            PrintResult(ParseEntries(input, tagsForBrief));
        }

        /// <summary>
        /// Extracts the search results from the result tables in <paramref name="input"/>.
        /// </summary>
        /// <param name="input">HTML fragment containing the result tables; null or empty yields no entries.</param>
        /// <param name="tagsForBrief">True to keep the HTML tags inside the brief text.</param>
        /// <returns>One entry per td node that produced a title, or that contained unrecognised font markup.</returns>
        internal static List<BaiduEntry> ParseEntries(string input, bool tagsForBrief)
        {
            List<BaiduEntry> entries = new List<BaiduEntry>();
            if (string.IsNullOrEmpty(input))
            {
                return entries;
            }

            // Walk every (<table>, </table>) pair. The previous pattern "<table*</table>"
            // quantified the letter 'e' and therefore never matched a real table.
            for (Match table = TableRegex.Match(input); table.Success; table = table.NextMatch())
            {
                for (Match cell = CellRegex.Match(table.Value); cell.Success; cell = cell.NextMatch())
                {
                    // Group 1 is the inner HTML of the td node, which holds the actual result.
                    string html = cell.Groups[1].Value;
                    BaiduEntry baidu = new BaiduEntry();

                    Match link = LinkRegex.Match(html);
                    if (link.Success)
                    {
                        baidu.link = link.Groups[1].Value;
                    }

                    // The td holds font tags nested two levels deep; narrow the text
                    // down to the span covered by them.
                    html = OuterFontRegex.Match(html).Value;

                    Match contentMatch = InnerFontRegex.Match(html);
                    if (contentMatch.Success)
                    {
                        // First font tag: the title.
                        baidu.title = RemoveSomeTags(contentMatch.Groups[1].Value);

                        // Second font tag: the brief, cut off where a nested font tag starts.
                        contentMatch = contentMatch.NextMatch();
                        if (contentMatch.Success)
                        {
                            string brief = contentMatch.Groups[1].Value;
                            int splitIndex = brief.IndexOf("<font");
                            if (splitIndex > -1)
                            {
                                brief = brief.Substring(0, splitIndex);
                            }
                            // Strip the HTML tags unless the caller asked for a formatted brief.
                            if (!tagsForBrief)
                            {
                                brief = RemoveSomeTags(brief);
                            }
                            baidu.brief = brief;
                        }
                    }
                    else
                    {
                        // A cell without any font tag carries no result; skip it silently.
                        if (html == "")
                        {
                            continue;
                        }
                        Console.WriteLine("怪了,这里没有找到任何结果。");
                        Console.WriteLine("如果百度已经更改了页面的结构那么程序需要重新设计。");
                        Console.WriteLine("Mark:");
                        Console.WriteLine(html);
                        Cut();
                        Cut();
                        Cut();
                    }
                    entries.Add(baidu);
                }
            }
            return entries;
        }

        /// <summary>
        /// Entry point: reads a keyword, fetches the Baidu page, isolates the result
        /// tables and prints the parsed results.
        /// </summary>
        public static void Main(string[] args)
        {
            Console.WriteLine("请输入一个关键字。");
            string keyword = Console.ReadLine();
            Console.WriteLine("正在从百度上获取结果,请稍等……");
            string input = GetHtml(keyword);

            // Keep only the span from the first result table to the last closing table tag.
            Regex r = new Regex("<table.*class=\"result\"[\\s\\S]*</table>", RegexOptions.IgnoreCase);
            input = r.Match(input).Value;
            MainProc(input);
            Console.ReadKey(true);
        }
    }
}

原创粉丝点击