简单介绍:本程序用于抓取某个固定网站的动态数据内容。每个网站的动态网页都有各自不同的POST参数和传递方式,其response返回的JSON格式也不尽相同,所以无法做到通用的动态网页抓取。我所做的是把在中国汽车网上通过公告查询到的汽车信息提取出来,存储到MDB文件中。这个过程需要两次request:第一次获取查询到的车辆信息,从中提取车辆的tarid;第二次将tarid作为request的参数,获取该车辆的详细信息。这其中有一个问题:第一次request每次最多只返回100条数据,所以凡是超过100条的数据,就需要做分页处理。代码中有一处不足,就在于对第二次response所得数据的处理:我是每得到一条就插入MDB一次,这无形之中增加了程序的开销。好一点的做法是把每次得到的数据先放入DataTable中,等所有操作结束之后,再将DataTable中的数据一次性写入MDB。代码中的注释不太详细,各位看官只能自己去理解了。这也是我第一次做抓取程序,有不足之处还请指正。
工具:VS2010、谷歌浏览器。
1、界面的后台代码:
using System;
using System.Data;
using System.Drawing;
using System.Threading;
using System.Windows.Forms;
using System.ComponentModel;
using System.Collections.Generic;

namespace GetCarData
{
    /// <summary>
    /// Main window of the scraper. Reads the search condition from <c>txtBatch</c>,
    /// runs the download on a background thread and reports progress through
    /// <c>pbSchedule</c> and <c>lMessage</c>.
    /// </summary>
    public partial class Tool : Form
    {
        // Number of records per search page. GetCarData.Search posts "limit=100",
        // so the page count is derived from the reported total using this size.
        private const int PageSize = 100;

        public Tool()
        {
            InitializeComponent();
        }

        /// <summary>
        /// OK button: locks the input controls and starts the download on a background thread.
        /// </summary>
        private void btnOk_Click(object sender, EventArgs e)
        {
            btnOk.Enabled = false;
            txtBatch.Enabled = false;

            // Everything that touches a control is done here, on the UI thread, BEFORE
            // the worker starts. The worker must not read txtBatch.Text itself, and the
            // progress bar must not be modified here after the worker is running
            // (the worker sets Maximum once it knows the page count).
            string condition = txtBatch.Text;
            pbSchedule.Minimum = 0;
            pbSchedule.Value = 0;

            Thread thread = new Thread(new ThreadStart(delegate() { LoadData(condition); }));
            thread.IsBackground = true;
            thread.Start();
        }

        /// <summary>
        /// Cancel button: clears the inputs and restarts the application.
        /// </summary>
        private void btnCancel_Click(object sender, EventArgs e)
        {
            btnCancel.Enabled = false;
            txtCompanyName.Clear();
            txtBatch.Clear();
            Application.Restart();
            btnCancel.Enabled = true;
        }

        /// <summary>
        /// Worker-thread entry point. Runs the download, reports the outcome in the
        /// status label and always re-enables the input controls afterwards.
        /// </summary>
        /// <param name="condition">Search condition captured on the UI thread.</param>
        private void LoadData(string condition)
        {
            SetLableText("数据加载中...");
            try
            {
                GetData(condition);
                SetLableText("数据加载完成!");
            }
            catch (Exception ex)
            {
                // An unhandled exception on a worker thread would terminate the process;
                // surface the failure to the user instead.
                SetLableText("数据加载失败:" + ex.Message);
            }
            finally
            {
                this.BeginInvoke(new MethodInvoker(delegate()
                {
                    btnOk.Enabled = true;
                    txtBatch.Enabled = true;
                }));
            }
        }

        #region GetData
        /// <summary>
        /// Downloads every result page for <paramref name="condition"/> and hands each
        /// page to <c>GetCarData.CarData</c>, advancing the progress bar one step per page.
        /// </summary>
        /// <param name="condition">Search condition passed to <c>GetCarData.Search</c>.</param>
        /// <exception cref="InvalidOperationException">
        /// The first page could not be retrieved, or its total record count is not a
        /// non-negative integer.
        /// </exception>
        public void GetData(string condition)
        {
            GetCarData getData = new GetCarData();

            // First page. Row 0 of the returned table holds the total record count
            // (stored in the "tarid" column); the remaining rows are the ids of this page.
            DataTable firstPage = getData.Search(1, condition);
            if (firstPage == null || firstPage.Rows.Count == 0)
            {
                throw new InvalidOperationException("未能获取查询结果,请检查网络或查询条件");
            }

            string rawTotal = Convert.ToString(firstPage.Rows[0]["tarid"]);
            int total;
            if (!int.TryParse(rawTotal,
                              System.Globalization.NumberStyles.Integer,
                              System.Globalization.CultureInfo.InvariantCulture,
                              out total) || total < 0)
            {
                throw new InvalidOperationException("无法解析返回的数据总数:" + rawTotal);
            }

            // Round up so a partial last page is fetched, without requesting an extra
            // empty page when the total is an exact multiple of the page size.
            // At least one step so the progress bar can reach its maximum.
            int pageCount = Math.Max(1, (total + PageSize - 1) / PageSize);
            pbSchedule.Invoke(new pbMaxDelegate(SetpbMaximum), pageCount);

            getData.CarData(firstPage);
            SetPbValue(1);

            for (int page = 2; page <= pageCount; page++)
            {
                // Search returns null when the request failed (it logs the error itself);
                // skip that page rather than abort the remaining ones.
                DataTable pageTable = getData.Search(page, condition);
                if (pageTable != null)
                {
                    getData.CarData(pageTable);
                }
                SetPbValue(page);
            }
        }
        #endregion

        // Sets the progress bar maximum; invoked on the UI thread via pbSchedule.Invoke.
        delegate void pbMaxDelegate(int max);
        private void SetpbMaximum(int max)
        {
            pbSchedule.Maximum = max;
        }

        // Updates the status label, marshalling to the UI thread when required.
        delegate void labDelegate(string str);
        private void SetLableText(string str)
        {
            if (lMessage.InvokeRequired)
            {
                Invoke(new labDelegate(SetLableText), new object[] { str });
            }
            else
            {
                lMessage.Text = str;
            }
        }

        // Updates the progress bar value, marshalling to the UI thread when required.
        delegate void pbDelegate(int value);
        private void SetPbValue(int value)
        {
            if (pbSchedule.InvokeRequired)
            {
                Invoke(new pbDelegate(SetPbValue), new object[] { value });
            }
            else
            {
                pbSchedule.Value = value;
            }
        }
    }
}
2、功能实现代码:
using System;
using System.IO;
using System.Web;
using System.Net;
using System.Text;
using System.Data;
using System.Data.OleDb;
using System.Diagnostics;
using System.Windows.Forms;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace GetCarData
{
    /// <summary>
    /// Scrapes vehicle records from www.chinacar.com.cn in two steps: <see cref="Search"/>
    /// posts a query and extracts the record ids ("tarid") of one result page, and
    /// <see cref="CarData"/> posts each id to fetch that vehicle's details and inserts
    /// them into CarData.mdb. Failures are appended to ERROR.log and per-record timings
    /// to INFO.log, both in the application's startup directory.
    /// </summary>
    class GetCarData
    {
        private const string SearchUrl = "http://www.chinacar.com.cn/search_json.asp";
        private const string DetailUrl = "http://www.chinacar.com.cn/search_view.asp";

        // Two minutes, in milliseconds.
        private const int RequestTimeoutMs = 1000 * 60 * 2;

        // Row positions (within the table built by JsonToDataTable) whose values are
        // written to the seven columns of the INSERT statement, in column order.
        private static readonly int[] InsertedRowIndexes = { 0, 1, 2, 3, 9, 10, 43 };

        #region search
        /// <summary>
        /// First request: posts the search for one result page and extracts the ids.
        /// </summary>
        /// <param name="count">1-based page number; the request offset is (count - 1) * 100.</param>
        /// <param name="requirement">Search text, sent as the "s0" form field.</param>
        /// <returns>
        /// A one-column table ("tarid") whose row 0 holds the total record count reported
        /// by the server and whose remaining rows hold the ids of this page; or
        /// <c>null</c> when the request or the parsing failed (the error is logged).
        /// </returns>
        public DataTable Search(int count, string requirement)
        {
            DataTable dtTarid = null;
            try
            {
                string start = ((count - 1) * 100).ToString();

                // The search text is %uXXXX-escaped and then every '%' is escaped again
                // as "%25". NOTE(review): presumably the site expects this double
                // encoding; it is kept exactly as in the original request.
                string strPostData = "limit=100&page=" + count.ToString() + "&start=" + start + "&s20=1&s0="
                    + HttpUtility.UrlEncodeUnicode(requirement).Replace("%", "%25");

                string searchResult = PostForm(SearchUrl, strPostData);
                dtTarid = CatchTarid(searchResult);
            }
            catch (System.Exception ex)
            {
                // Best effort: the caller sees null and decides what to do with this page.
                LogError("错误代号:0" + "错误信息:" + ex.Message + "错误时间:" + DateTime.Now.ToString() + "\r\n");
            }
            return dtTarid;
        }
        #endregion

        #region CarData
        /// <summary>
        /// Second request: for every id in <paramref name="dtJson"/> (row 0 is skipped
        /// because it holds the total count, not an id) fetches the vehicle details and
        /// stores them in the database. A failure for one id is logged and does not stop
        /// the remaining ids.
        /// </summary>
        /// <param name="dtJson">Table returned by <see cref="Search"/>; <c>null</c> is ignored.</param>
        public void CarData(DataTable dtJson)
        {
            if (dtJson == null)
            {
                return;
            }

            for (int i = 1; i < dtJson.Rows.Count; i++)
            {
                // Time each record; the elapsed time goes to INFO.log.
                Stopwatch stopWatch = Stopwatch.StartNew();
                string tarid = dtJson.Rows[i][0].ToString();
                string carModel = null;

                try
                {
                    // Each post returns the details of one vehicle.
                    string strJson = PostForm(DetailUrl, "tarid=" + tarid + "&page=1&start=0&limit=25");
                    carModel = JsonToDataTable(strJson);
                }
                catch (System.Exception ex)
                {
                    // Log the id that actually failed (row i).
                    LogError("错误代号:2" + ",错误数据:" + tarid + ",错误信息:" + ex.Message + ",错误时间:" + DateTime.Now.ToString() + "\r\n");
                }

                stopWatch.Stop();
                LogInfo("完成时间:" + stopWatch.Elapsed.TotalSeconds.ToString() + "车辆型号:" + carModel + DateTime.Now.ToString() + "\r\n");
            }
        }
        #endregion

        #region Catch tarid
        /// <summary>
        /// Parses the response of the first request: the total record count and the
        /// "tarid" of every record. The parsing is positional string slicing, not a JSON
        /// parser, and throws if the response does not have the expected layout
        /// (<see cref="Search"/> catches and logs that).
        /// </summary>
        /// <param name="strJson">Raw response text.</param>
        /// <returns>Table with one "tarid" column: row 0 = total count, rows 1.. = ids.</returns>
        private DataTable CatchTarid(string strJson)
        {
            // Total count: take the text after the first comma (+2), then skip 13 more
            // characters and read up to just before the next comma.
            // NOTE(review): these offsets encode the site's response layout, which is
            // not visible here - confirm against a live response before changing them.
            string count = strJson.Substring(strJson.IndexOf(',') + 2);
            count = count.Substring(13, count.IndexOf(',') - 14);

            DataTable dtTarid = new DataTable();
            dtTarid.TableName = "tarid";
            dtTarid.Columns.Add("tarid");

            DataRow countRow = dtTarid.NewRow();
            countRow["tarid"] = count;
            dtTarid.Rows.Add(countRow);

            // Keep only what is between the first '[' and the first following ']',
            // then cut it into one piece per record at each '}'.
            string records = strJson.Substring(strJson.IndexOf('[') + 1);
            records = records.Substring(0, records.IndexOf(']'));
            string[] pieces = records.Split('}');

            // The text after the last '}' is not a record, hence Length - 1.
            for (int i = 0; i < pieces.Length - 1; i++)
            {
                string piece = pieces[i];

                // Skip 8 characters from the start of "tarid" and read at most 6
                // characters, then drop any quote/comma caught in that window.
                // The length is clamped so an id that sits at the very end of the
                // piece and is shorter than the window is still read, not rejected.
                int valueStart = Regex.Match(piece, "tarid").Index + 8;
                int valueLength = Math.Min(6, piece.Length - valueStart);

                DataRow idRow = dtTarid.NewRow();
                idRow["tarid"] = piece.Substring(valueStart, valueLength).Replace("\"", "").Replace(",", "");
                dtTarid.Rows.Add(idRow);
            }

            dtTarid.AcceptChanges();
            return dtTarid;
        }
        #endregion

        #region JsonToDataTable
        /// <summary>
        /// Parses the response of the second request into a one-column table
        /// ("carMessages") holding, for every record, its val1 text followed by its val2
        /// text, then inserts the vehicle into the database.
        /// </summary>
        /// <param name="strJson">Raw response text.</param>
        /// <returns>
        /// The value in row 1 of the parsed table (written to INFO.log by the caller as
        /// the vehicle model), or <c>null</c> when fewer than two rows were parsed.
        /// </returns>
        private string JsonToDataTable(string strJson)
        {
            // Keep only what is between the first '[' and the first following ']'.
            string records = strJson.Substring(strJson.IndexOf('[') + 1);
            records = records.Substring(0, records.IndexOf(']'));
            string[] pieces = records.Split('}');

            DataTable dtCarData = new DataTable();
            dtCarData.Columns.Add("carMessages");

            for (int i = 0; i < pieces.Length - 1; i++)
            {
                string piece = pieces[i];

                // val1: from 7 characters past the start of "val1" up to the next comma.
                DataRow first = dtCarData.NewRow();
                first["carMessages"] = piece.Substring(piece.IndexOf("val1", StringComparison.Ordinal) + 7).Split(',')[0];
                dtCarData.Rows.Add(first);

                // val2: from 7 characters past the start of "val2" to the end of the piece.
                DataRow second = dtCarData.NewRow();
                second["carMessages"] = piece.Substring(piece.IndexOf("val2", StringComparison.Ordinal) + 7);
                dtCarData.Rows.Add(second);
            }
            dtCarData.AcceptChanges();

            InsertIntoAccess(dtCarData);

            return dtCarData.Rows.Count > 1 ? dtCarData.Rows[1]["carMessages"].ToString() : null;
        }
        #endregion

        #region InsertIntoAccess
        /// <summary>
        /// Inserts one vehicle into the CarData table of CarData.mdb (in the application's
        /// startup directory). Only seven columns are populated; an earlier, commented-out
        /// version of the statement listed many more. Failures are logged with error
        /// code 4 and are not rethrown.
        /// </summary>
        /// <param name="dtData">Table built by <see cref="JsonToDataTable"/>.</param>
        private void InsertIntoAccess(DataTable dtData)
        {
            string carTarid = null; // quoted value list, kept only for the error log
            try
            {
                int highestIndex = InsertedRowIndexes[InsertedRowIndexes.Length - 1];
                if (dtData.Rows.Count <= highestIndex)
                {
                    throw new InvalidOperationException("返回的数据行数不足:" + dtData.Rows.Count);
                }

                string[] values = new string[InsertedRowIndexes.Length];
                for (int k = 0; k < InsertedRowIndexes.Length; k++)
                {
                    // Strip the JSON quotes and turn the line-break marker into '/'.
                    // NOTE(review): the marker literal is "<br\>" exactly as in the
                    // original code - confirm that is what the site really sends.
                    values[k] = dtData.Rows[InsertedRowIndexes[k]][0].ToString()
                        .Replace("\"", "")
                        .Replace("<br\\>", "/");
                }
                carTarid = "'" + string.Join("','", values) + "'";

                string connectionString = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source="
                    + Application.StartupPath + @"\CarData.mdb;Persist Security Info=False";

                // Scraped text is passed as parameters, never spliced into the SQL text:
                // a value containing an apostrophe would otherwise break (or alter) the
                // statement. OLE DB parameters are positional, so they are added in
                // column order.
                string sqlStr = "INSERT INTO CarData (车辆名称,车辆型号,车辆类别,中文品牌, 企业地址, 公告批次, 识别代号) VALUES (?, ?, ?, ?, ?, ?, ?)";

                using (OleDbConnection conn = new OleDbConnection(connectionString))
                using (OleDbCommand cmd = new OleDbCommand(sqlStr, conn))
                {
                    for (int k = 0; k < values.Length; k++)
                    {
                        cmd.Parameters.AddWithValue("@p" + k, values[k]);
                    }
                    conn.Open();
                    cmd.ExecuteNonQuery();
                }
            }
            catch (Exception e)
            {
                LogError("错误代号:4" + ",错误数据:" + carTarid + ",错误信息:" + e.Message + ",错误时间:" + DateTime.Now.ToString() + "\r\n");
            }
        }
        #endregion

        #region helpers
        // Posts a form-encoded body to url and returns the response text decoded as GB2312.
        // The request stream, response and reader are disposed on every path; on failure
        // the request is aborted and the exception is rethrown for the caller to log.
        private static string PostForm(string url, string postData)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            try
            {
                request.Timeout = RequestTimeoutMs;
                request.Method = "POST";
                request.Accept = "application/x-json;text/x-json;charset=GB2312";
                request.ContentType = "application/x-www-form-urlencoded; charset=GBK";

                // Same body encoding as the original code (the system default code page).
                byte[] buffer = Encoding.Default.GetBytes(postData);
                request.ContentLength = buffer.Length;
                using (Stream body = request.GetRequestStream())
                {
                    body.Write(buffer, 0, buffer.Length);
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("GB2312")))
                {
                    return reader.ReadToEnd();
                }
            }
            catch
            {
                request.Abort();
                throw;
            }
        }

        // Appends one entry to ERROR.log.
        private static void LogError(string entry)
        {
            AppendLog("ERROR.log", entry);
        }

        // Appends one entry to INFO.log.
        private static void LogInfo(string entry)
        {
            AppendLog("INFO.log", entry);
        }

        // Appends text verbatim to a log file in the startup directory. The text is
        // written with Write(string), not as a format string, so braces in scraped data
        // or exception messages cannot cause a FormatException. Logging is best effort:
        // a log file that cannot be written must not abort the scrape.
        private static void AppendLog(string fileName, string entry)
        {
            try
            {
                using (StreamWriter writer = File.AppendText(Path.Combine(Application.StartupPath, fileName)))
                {
                    writer.Write(entry);
                }
            }
            catch (IOException ex)
            {
                Debug.WriteLine("Could not write " + fileName + ": " + ex.Message);
            }
            catch (UnauthorizedAccessException ex)
            {
                Debug.WriteLine("Could not write " + fileName + ": " + ex.Message);
            }
        }
        #endregion
    }
}
3、效果图: