程序中得到百度的搜索结果
来源:互联网 发布:yum openssl安装 编辑:程序博客网 时间:2024/05/01 23:22
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

namespace baiduRobot
{
    /// <summary>
    /// One search hit scraped from a Baidu result page. Any field may be
    /// null when the corresponding piece was not found in the markup.
    /// </summary>
    struct BaiduEntry
    {
        public string title, brief, link;
    }

    /// <summary>
    /// Console tool: reads a keyword, downloads the Baidu result page for it,
    /// extracts link/title/brief triples with regular expressions and prints them.
    /// </summary>
    class Program
    {
        // Original author's note: Baidu expects the query string in code page 936
        // (GBK) rather than UTF-8. The same code page is used to decode the
        // response body.
        // NOTE(review): on runtimes where code page encodings are not built in,
        // Encoding.GetEncoding(936) needs an encoding provider to be registered
        // first -- confirm for the target runtime.
        private const int BaiduCodePage = 936;

        // Each table element is captured lazily so that consecutive tables are
        // returned one at a time. Nested tables are not handled.
        private static readonly Regex TableRegex =
            new Regex("<table[\\s\\S]*?</table>", RegexOptions.IgnoreCase);

        // Group 1 is the inner HTML of a td element; that is where a result lives.
        private static readonly Regex CellRegex =
            new Regex("<td.*?>(.*)</td>", RegexOptions.IgnoreCase);

        // Group 1 is the href value of the first anchor.
        private static readonly Regex LinkRegex =
            new Regex("<a.*?href=\"(.*?)\".*?>", RegexOptions.IgnoreCase);

        // Greedy: grabs everything from the first opening font tag to the last
        // closing one, i.e. the outer font wrapper around the nested font tags.
        private static readonly Regex OuterFontRegex = new Regex("<font.*</font>");

        // Lazy: successive matches give the title first, then the brief.
        private static readonly Regex InnerFontRegex = new Regex("<font.*?>(.*?)</font>");

        // Narrows the whole page down to the span that starts at the first
        // table carrying class="result" and ends at the last closing table tag.
        private static readonly Regex ResultAreaRegex =
            new Regex("<table.*class=\"result\"[\\s\\S]*</table>", RegexOptions.IgnoreCase);

        /// <summary>
        /// Downloads the Baidu result page for <paramref name="keyword"/>.
        /// </summary>
        /// <param name="keyword">Raw (unencoded) search keyword.</param>
        /// <returns>
        /// The page text decoded with code page 936. If the connection fails,
        /// a message is printed and whatever was read so far (possibly an
        /// empty string) is returned.
        /// </returns>
        static string GetHtml(string keyword)
        {
            string url = @"http://www.baidu.com/";
            Encoding encoding = Encoding.GetEncoding(BaiduCodePage);

            // The keyword must be percent-encoded before it goes into the URL;
            // the original code computed this value but then sent the raw keyword.
            string encodedKeyword = HttpUtility.UrlEncode(keyword, encoding);
            string query = "s?wd=" + encodedKeyword;

            StringBuilder sb = new StringBuilder();
            try
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url + query);
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (Stream stream = response.GetResponseStream())
                // A StreamReader keeps decoder state between reads, so a
                // double-byte character that straddles two network chunks is
                // decoded correctly. BOM detection is disabled so the body is
                // always interpreted as code page 936.
                using (StreamReader reader = new StreamReader(stream, encoding, false))
                {
                    Console.WriteLine("正在读取网页{0}的内容……", url + query);

                    char[] buf = new char[8192];
                    int count;
                    while ((count = reader.Read(buf, 0, buf.Length)) > 0)
                    {
                        sb.Append(buf, 0, count);
                    }
                }
            }
            catch (WebException)
            {
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            catch (IOException)
            {
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }

            return sb.ToString();
        }

        /// <summary>
        /// Prints every entry, numbered from 1, followed by a separator line.
        /// Fields that are null are skipped.
        /// </summary>
        static void PrintResult(List<BaiduEntry> entries)
        {
            int count = 0;
            foreach (BaiduEntry entry in entries)
            {
                count += 1;
                Console.WriteLine("找到了百度的第{0}条搜索结果:", count);
                if (entry.link != null)
                {
                    Console.WriteLine("找到了一条链接:");
                    Console.WriteLine(entry.link);
                }
                if (entry.title != null)
                {
                    Console.WriteLine("标题为:");
                    Console.WriteLine(entry.title);
                }
                if (entry.brief != null)
                {
                    Console.WriteLine("下面是摘要:");
                    Console.WriteLine(entry.brief);
                }
                Cut();
            }
        }

        /// <summary>
        /// Small manual check: strips the tags from a fixed sample and prints the result.
        /// </summary>
        static void simpleOutput()
        {
            string html = "<table><tr><td><font>test</font><a>hello</a><br></td></tr></table>";
            Console.WriteLine(RemoveSomeTags(html));
        }

        /// <summary>Removes void (self-contained) tags; currently only the literal br tag.</summary>
        static string RemoveVoidTag(string html)
        {
            string[] filter = { "<br>" };
            foreach (string tag in filter)
            {
                html = html.Replace(tag, "");
            }
            return html;
        }

        /// <summary>
        /// Removes the opening and closing tags of a, em, b and font elements
        /// while keeping the text between them. Matching is case-sensitive.
        /// </summary>
        static string ReleaseXmlTags(string html)
        {
            string[] filter = { "<a.*?>", "</a>", "<em>", "</em>", "<b>", "</b>", "<font.*?>", "</font>" };
            foreach (string tag in filter)
            {
                html = Regex.Replace(html, tag, "");
            }
            return html;
        }

        /// <summary>Applies <see cref="RemoveVoidTag"/> and then <see cref="ReleaseXmlTags"/>.</summary>
        static string RemoveSomeTags(string html)
        {
            html = RemoveVoidTag(html);
            html = ReleaseXmlTags(html);
            return html;
        }

        /// <summary>Prints a separator line.</summary>
        static void Cut()
        {
            Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        }

        /// <summary>Parses and prints results, stripping HTML tags from the briefs.</summary>
        static void MainProc(string input)
        {
            MainProc(input, false);
        }

        /// <summary>
        /// Extracts every table from <paramref name="input"/>, parses each table
        /// cell into a <see cref="BaiduEntry"/> and prints the collected entries.
        /// </summary>
        /// <param name="input">HTML fragment containing the result tables.</param>
        /// <param name="tagsForBrief">
        /// When true the brief keeps its HTML markup; when false the tags are stripped.
        /// </param>
        static void MainProc(string input, bool tagsForBrief)
        {
            List<BaiduEntry> entries = new List<BaiduEntry>();

            for (Match table = TableRegex.Match(input); table.Success; table = table.NextMatch())
            {
                for (Match cell = CellRegex.Match(table.Value); cell.Success; cell = cell.NextMatch())
                {
                    BaiduEntry entry;
                    if (TryParseEntry(cell.Groups[1].Value, tagsForBrief, out entry))
                    {
                        entries.Add(entry);
                    }
                }
            }

            PrintResult(entries);
        }

        /// <summary>
        /// Parses the inner HTML of one table cell into an entry.
        /// </summary>
        /// <returns>
        /// False when the cell contains no font block at all (such cells are
        /// silently skipped); true otherwise, including the case where the
        /// font block could not be split and a diagnostic was printed.
        /// </returns>
        private static bool TryParseEntry(string cellHtml, bool tagsForBrief, out BaiduEntry entry)
        {
            entry = new BaiduEntry();

            Match link = LinkRegex.Match(cellHtml);
            if (link.Success)
            {
                entry.link = link.Groups[1].Value;
            }

            // Reduce the cell to the outer font wrapper, which holds the condensed result.
            string html = OuterFontRegex.Match(cellHtml).Value;

            Match content = InnerFontRegex.Match(html);
            if (content.Success)
            {
                entry.title = RemoveSomeTags(content.Groups[1].Value);

                content = content.NextMatch();
                if (content.Success)
                {
                    string brief = content.Groups[1].Value;

                    // Anything from the next opening font tag onward is not part of the brief.
                    int splitIndex = brief.IndexOf("<font", StringComparison.Ordinal);
                    if (splitIndex > -1)
                    {
                        brief = brief.Substring(0, splitIndex);
                    }
                    if (!tagsForBrief)
                    {
                        // The caller does not want HTML formatting in the brief.
                        brief = RemoveSomeTags(brief);
                    }
                    entry.brief = brief;
                }
                return true;
            }

            if (html == "")
            {
                return false;
            }

            // A font block exists but does not have the expected shape: report it
            // so that a change in Baidu's page structure is noticed.
            Console.WriteLine("怪了,这里没有找到任何结果。");
            Console.WriteLine("如果百度已经更改了页面的结构那么程序需要重新设计。");
            Console.WriteLine("Mark:");
            Console.WriteLine(html);
            Cut();
            Cut();
            Cut();
            return true;
        }

        /// <summary>
        /// Entry point: prompts for a keyword, fetches the result page, trims it
        /// to the result tables, prints the parsed entries and waits for a key press.
        /// </summary>
        public static void Main(string[] args)
        {
            Console.WriteLine("请输入一个关键字。");
            string keyword = Console.ReadLine();

            Console.WriteLine("正在从百度上获取结果,请稍等……");
            string input = GetHtml(keyword);
            input = ResultAreaRegex.Match(input).Value;

            MainProc(input);
            Console.ReadKey(true);
        }
    }
}