C#抓取百度和谷歌的搜索结果(标题和链接) 代码整理

来源:互联网 发布:免费手机定位追踪软件 编辑:程序博客网 时间:2024/04/27 08:39

     这两天公司让我做一个小功能:抓取百度和谷歌的搜索结果,把搜索到的标题和链接逐一提取出来。页面本身很容易下载,主要的难点在于用正则表达式处理下载下来的页面。于是我在论坛上请教了大家,在大家的帮助下,这个功能的核心代码已经完成,现在整理出来,供有需要的人参考。

C# 代码:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using httpState;
using System.Text.RegularExpressions;
using System.Collections;

namespace test
{
    public partial class DownLoadTest : Form
    {
       
        /// <summary>
        /// Creates the form. The only work done here is the call to
        /// <c>InitializeComponent</c>, which is defined in another part of this
        /// partial class (not shown in this file).
        /// </summary>
        public DownLoadTest()
        {
            InitializeComponent();
        }
        /// <summary>
        /// 百度搜索
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        private void btnBaidu_Click(object sender, EventArgs e)
        {       
            int num = 20;//搜索条数
            string url = "http://www.baidu.com/s?wd=" + txtSearch.Text.Trim() + "&rn=" + num + "";
            string html=search(url,"gb2312");
            BaiduSearch baidu = new BaiduSearch();
            if (!string.IsNullOrEmpty(html))
            {
                int count = baidu.GetSearchCount(html);//搜索条数
                if (count > 0)
                {
                    List<Keyword> keywords = baidu.GetKeywords(html, txtSearch.Text.Trim());
                    dataGridView1.DataSource = keywords;
                }
              
            }
        }
        /// <summary>
        /// 谷歌搜索
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        private void button2_Click(object sender, EventArgs e)
        {
            int num=100;
            string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + txtSearch.Text.Trim() + "&aq=f&aqi=&aql=&oq=&num="+num+"";
            string html=search(url,"utf-8");
            if (!string.IsNullOrEmpty(html))
            {

                googleSearch google = new googleSearch();
                List<Keyword> keywords = google.GetKeywords(html, txtSearch.Text.Trim());
                    dataGridView1.DataSource = keywords;
               
            }
        }
        /// <summary>
        /// 搜索处理
        /// </summary>
        /// <param name="url">搜索网址</param>
        /// <param name="Chareset">编码</param>
        public string search(string url,string Chareset)
        {
            HttpState result = new HttpState();
            Uri uri = new Uri(url);
            HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
            myHttpWebRequest.UseDefaultCredentials = true;
            myHttpWebRequest.ContentType = "text/html";
            myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
            myHttpWebRequest.Method = "GET";
            myHttpWebRequest.CookieContainer = new CookieContainer();

            try
            {
                HttpWebResponse response = (HttpWebResponse)myHttpWebRequest.GetResponse();
                // 从 ResponseStream 中读取HTML源码并格式化 add by cqp
                result.Html = readResponseStream(response, Chareset);
                result.CookieContainer = myHttpWebRequest.CookieContainer;
                 return result.Html;                
            }
            catch (Exception ex)
            {
                return ex.ToString();
            }
           
        }
        public string readResponseStream(HttpWebResponse response, string Chareset)
        {
            string result = "";
            using (StreamReader responseReader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(Chareset)))
            {         
                result = formatHTML(responseReader.ReadToEnd());
            }

            return result;
        }
        /// <summary>
        /// 描述:格式化网页源码
        ///
        /// </summary>
        /// <param name="htmlContent"></param>
        /// <returns></returns>
        public string formatHTML(string htmlContent)
        {
            string result = "";

            result = htmlContent.Replace("&raquo;", "").Replace("&nbsp;", "")
                    .Replace("&copy;", "").Replace("/r", "").Replace("/t", "")
                    .Replace("/n", "").Replace("&amp;", "&");

            return result;
        }

        class BaiduSearch
        {
            protected string uri = "http://www.baidu.com/s?wd=";
            //protected string uri = "http://www.baidu.com/s?wd=software&pn=10&usm=2"; // 第二页
            protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
            protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");
            protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";
            public int GetSearchCount(string html)
            {
                int result = 0;
                string searchcount = string.Empty;

                Regex regex = new Regex(resultPattern);
                Match match = regex.Match(html);

                if (match.Success)
                {
                    searchcount = match.Value;
                }
                else
                {
                    searchcount = "0";
                }

                if (searchcount.IndexOf(",") > 0)
                {
                    searchcount = searchcount.Replace(",", string.Empty);
                }

                int.TryParse(searchcount, out result);

                return result;
            }

            public List<Keyword> GetKeywords(string html, string word)
            {
                int i=1;
                List<Keyword> keywords = new List<Keyword>();

                Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{1,2}|100)/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                //Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{2})/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                Regex regA = new Regex(@"(?is)<a/b[^>]*?href=(['""]?)(?<link>[^'""/s>]+)/1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);

                MatchCollection mcTable = regTable.Matches(html);
                foreach (Match mTable in mcTable)
                {
                    if (mTable.Success)
                    {
                        Match mA = regA.Match(mTable.Value);
                        if (mA.Success)
                        {

                            Keyword keyword = new Keyword();
                            keyword.ID=i++;
                            keyword.Link = mA.Groups["link"].Value;
                            keyword.Title = mA.Groups["title"].Value;
                            keywords.Add(keyword);
                        }
                    }
                }

                return keywords;
            }
        }
        class googleSearch
        {
            public List<Keyword> GetKeywords(string html, string word)
            {
                int i = 1;
                List<Keyword> keywords = new List<Keyword>();

                Regex regTable = new Regex(@"(?is)<h3[^>]*?>(?><h3[^>]*>(?<o>)|</h3>(?<-o>)|(?:(?!</?h3/b).)*)*(?(o)(?!))</h3>", RegexOptions.IgnoreCase);
                //Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{2})/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                Regex regA = new Regex(@"(?is)<a/b[^>]*?href=(['""]?)(?<link>[^'""/s>]+)/1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);

                MatchCollection mcTable = regTable.Matches(html);
                foreach (Match mTable in mcTable)
                {
                    if (mTable.Success)
                    {
                        Match mA = regA.Match(mTable.Value);
                        if (mA.Success)
                        {

                            Keyword keyword = new Keyword();
                            keyword.ID = i++;
                            keyword.Link = mA.Groups["link"].Value;
                            keyword.Title = mA.Groups["title"].Value;
                            keywords.Add(keyword);
                        }
                    }
                }

                return keywords;
            }
        }
        class Keyword
        {
            public int ID { get; set; }
            public string Title { get; set; }
            public string Link { get; set; }
            //private string title;
            //public string Title { get { return title; } set { title = value; } }
            //private string link;
            //public string Link { get { return link; } set { link = value; } }
        }
    }
}
HttpState:

using System.Net;
using System.Collections;

namespace httpState
{
    public class HttpState
    {

        // 获取与响应一起返回的状态说明。
        private string _statusDescription;

        public string StatusDescription
        {
            get { return _statusDescription; }
            set { _statusDescription = value; }
        }

        /// <summary>
        /// 回调 址址, 登陆测试中使用
        /// </summary>
        private string _callBackUrl;

        public string CallBackUrl
        {
            get { return _callBackUrl; }
            set { _callBackUrl = value; }
        }


        /// <summary>
        /// 网页网址 绝对路径格式
        /// </summary>
        private string _url;

        public string Url
        {
            get { return _url; }
            set { _url = value; }
        }

        /// <summary>
        /// 字符串的形式的Cookie信息
        /// </summary>
        private string _cookies;

        public string Cookies
        {
            get { return _cookies; }
            set { _cookies = value; }
        }

        /// <summary>
        /// Cookie信息
        /// </summary>
        private CookieContainer _cookieContainer = new CookieContainer();

        public CookieContainer CookieContainer
        {
            get { return _cookieContainer; }
            set { _cookieContainer = value; }
        }

        /// <summary>
        /// 网页源码
        /// </summary>
        private string _html;

        public string Html
        {
            get { return _html; }
            set { _html = value; }
        }

        /// <summary>
        /// 验证码临时文件(绝对路径)
        /// </summary>
        private string _tmpValCodePic;

        public string TmpValCodePic
        {
            get { return _tmpValCodePic; }
            set { _tmpValCodePic = value; }
        }

        /// <summary>
        /// 验证码临时文件名(相对路径)
        /// </summary>
        private string _tmpValCodeFileName = "emptyPic.gif";

        public string TmpValCodeFileName
        {
            get { return _tmpValCodeFileName; }
            set { _tmpValCodeFileName = value; }
        }

        /// <summary>
        /// 有验证码
        /// </summary>
        private bool _isValCode;

        public bool IsValCode
        {
            get { return _isValCode; }
            set { _isValCode = value; }
        }

        /// <summary>
        /// 验证码URL
        /// </summary>
        private string _valCodeURL;

        public string ValCodeURL
        {
            get { return _valCodeURL; }
            set { _valCodeURL = value; }
        }

        /// <summary>
        /// 验证码识别后的值
        /// </summary>
        private string _valCodeValue;

        public string ValCodeValue
        {
            get { return _valCodeValue; }
            set { _valCodeValue = value; }
        }

        /// <summary>
        /// 其它参数
        /// </summary>
        private Hashtable _otherParams = new Hashtable();

        public Hashtable OtherParams
        {
            get { return _otherParams; }
            set { _otherParams = value; }
        }

        // 重复添加处理 add by fengcj  09/11/19 PM
        public void addOtherParam(object key, object value)
        {
            if (!this.OtherParams.ContainsKey(key))
                this.OtherParams.Add(key, value);
            else
            {
                this.OtherParams[key] = value;
            }
        }

        public void removeOtherParam(object key)
        {
            this.OtherParams.Remove(key);
        }

        public object getOtherParam(object key)
        {
            return this.OtherParams[key];
        }
    }
}

 界面很简单:一个输入框、两个搜索按钮和一个 DataGridView。

原创粉丝点击