C#网页爬虫抓取行政区划

来源:互联网 发布:tortoisesvn mac版 编辑:程序博客网 时间:2024/05/03 18:50

借鉴《C#网页爬虫抓取行政区划》一文的思路,从国家统计局网站抓取了最新的行政区划数据。

以下为代码片段:

数据库实体类:

/// <summary>
/// One administrative-region record. Every member is a plain read/write
/// auto-property; the class carries no behaviour of its own.
/// </summary>
public class City
{
    /// <summary>Numeric identifier of the region.</summary>
    public decimal ID { get; set; }

    /// <summary>Display name of the region.</summary>
    public string Name { get; set; }

    /// <summary>Administrative code of the region.</summary>
    public string Code { get; set; }

    /// <summary>Hierarchy level of the region, stored as text.</summary>
    public string Org_Level { get; set; }

    /// <summary>Administrative code of the parent region.</summary>
    public string ParentCode { get; set; }

    /// <summary>Identifier of the parent region.</summary>
    public decimal ParentID { get; set; }

    /// <summary>Country the region belongs to (property name spelling is part of the public interface).</summary>
    public string Contry { get; set; }

    // NOTE(review): Loc_x / Loc_y look like map coordinates kept as text - confirm with the schema owner.
    /// <summary>X location value, stored as text.</summary>
    public string Loc_x { get; set; }

    /// <summary>Y location value, stored as text.</summary>
    public string Loc_y { get; set; }
}
获取网页帮助类:

  /// <summary>
  /// Helper for fetching the HTML body of a web page.
  /// </summary>
  public class HttpHelper {
    private static readonly ILog log = log4net.LogManager.GetLogger(typeof(HttpHelper));

    /// <summary>
    /// Downloads the page at <paramref name="url"/> with a GET request and decodes
    /// the response body using <paramref name="encod"/>.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="encod">Encoding used to decode the response stream.</param>
    /// <returns>
    /// The decoded page body, or <see cref="string.Empty"/> when the request or the
    /// read fails. Failures are logged and never propagated to the caller.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encod) {
      string html = string.Empty;
      try {
        // A direct cast makes a non-HTTP URL fail with a descriptive InvalidCastException
        // (logged below) instead of a NullReferenceException on the next line.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 10 * 1000; // 10 second timeout
        request.ContentType = "text/html;charset=utf-8";
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";

        using (HttpWebResponse resp = (HttpWebResponse)request.GetResponse()) {
          if (resp.StatusCode != HttpStatusCode.OK) {
            log.Fatal(string.Format("抓取{0}地址返回失败,response.StatusCode = {1}", url, resp.StatusCode));
          } else {
            try {
              // "using" guarantees the reader (and the underlying stream) is released
              // even when ReadToEnd throws part-way through the body.
              using (StreamReader reader = new StreamReader(resp.GetResponseStream(), encod)) {
                html = reader.ReadToEnd();
              }
            } catch (Exception e) {
              log.Fatal(string.Format("DownLoadHtml抓取html{0}保存失败", url), e);
            }
          }
        }
      } catch (Exception e) {
        // Best effort by design: log and fall through to return the empty string.
        log.Fatal(e);
      }
      return html;
    }
  }

数据库保存帮助类:

  /// <summary>
  /// Database helper that stores region records in the <c>base_city</c> table.
  /// </summary>
  public class SQLHelper {
    /// <summary>
    /// Inserts every entry of <paramref name="cityList"/> into <c>base_city</c>, one
    /// parameterized INSERT per entry, over a single connection built from the
    /// "DBContext" connection string.
    /// </summary>
    /// <param name="cityList">Records to insert. Null string properties are written as empty strings; the objects themselves are not modified.</param>
    /// <returns>Total number of rows reported as affected by the INSERT statements.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="cityList"/> is null.</exception>
    /// <remarks>
    /// A failure on one record is written to the console and the loop moves on to the
    /// next record; it does not abort the batch.
    /// </remarks>
    public static int ExecuteNonQueryForCity(List<City> cityList) {
      // Fail before touching the database rather than with a NullReferenceException afterwards.
      if (cityList == null) {
        throw new ArgumentNullException("cityList");
      }

      int count = 0;
      var connectionString = System.Configuration.ConfigurationManager.ConnectionStrings["DBContext"].ConnectionString;

      // Disposing the connection closes it, so no explicit Close() is needed.
      using (SqlConnection connection = new SqlConnection(connectionString))
      using (SqlCommand cmd = new SqlCommand()) {
        connection.Open();
        cmd.Connection = connection;
        cmd.CommandText = "insert into base_city(ID,name,Code,Contry,Loc_x,Loc_y,Org_Level,ParentCode,ParentID,state) values(@ID,@name,@Code,@Contry,@Loc_x,@Loc_y,@Org_Level,@ParentCode,@ParentID,@state)";

        foreach (var city in cityList) {
          try {
            cmd.Parameters.Add(new SqlParameter("@ID", city.ID));
            cmd.Parameters.Add(new SqlParameter("@name", city.Name ?? ""));
            cmd.Parameters.Add(new SqlParameter("@Code", city.Code ?? ""));
            cmd.Parameters.Add(new SqlParameter("@Contry", city.Contry ?? ""));
            cmd.Parameters.Add(new SqlParameter("@Loc_x", city.Loc_x ?? ""));
            cmd.Parameters.Add(new SqlParameter("@Loc_y", city.Loc_y ?? ""));
            cmd.Parameters.Add(new SqlParameter("@Org_Level", city.Org_Level ?? ""));
            cmd.Parameters.Add(new SqlParameter("@ParentCode", city.ParentCode ?? ""));
            cmd.Parameters.Add(new SqlParameter("@ParentID", city.ParentID));
            cmd.Parameters.Add(new SqlParameter("@state", "1"));

            int retval = cmd.ExecuteNonQuery();
            if (retval == 0) {
              Console.WriteLine("插入错误:");
            }
            count += retval;
          } catch (Exception e) {
            Console.WriteLine("插入错误:" + e.Message);
          } finally {
            // The command object is reused, so its parameters must be reset for the next record.
            cmd.Parameters.Clear();
          }
        }
      }
      return count;
    }
  }
抓取数据:

 /// <summary>
 /// Downloads the administrative-division page at <see cref="UrlStr"/>, parses its
 /// rows into <see cref="City"/> records held in <see cref="SaveList"/>, and can
 /// write them to the database through <see cref="Save"/>.
 /// </summary>
 /// <remarks>
 /// Levels are stored in <c>Org_Level</c>: "1" for the country root, "2" for a
 /// province, "3" for a city and "4" for a county. A record's parent is the most
 /// recently added record of the level above it, which relies on the page listing
 /// rows in province, city, county order.
 /// </remarks>
 public class 省市县数据抓取 {
     private ILog log = log4net.LogManager.GetLogger(typeof(省市县数据抓取));

     /// <summary>Address of the page that is downloaded and parsed.</summary>
     public const string UrlStr = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html";

     /// <summary>All records collected so far, in page order, starting with the country root.</summary>
     public List<City> SaveList = new List<City>();

     /// <summary>
     /// Downloads and parses the page immediately. Any failure is logged and rethrown.
     /// </summary>
     /// <exception cref="InvalidOperationException">The downloaded page contains no matching rows.</exception>
     public 省市县数据抓取() {
         try {
             log.Info("抓取数据");
             string html = HttpHelper.DownloadHtml(UrlStr, Encoding.UTF8);
             HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
             doc.LoadHtml(html);
             HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//p[@class='MsoNormal']");

             // Root record that every province hangs off.
             SaveList.Add(new City() {
                 ID = 1,
                 Name = "全国",
                 Code = "100000",
                 Contry = "China",
                 Org_Level = "1"
             });

             // SelectNodes yields null rather than an empty collection when nothing matches,
             // e.g. after a failed download that produced an empty document.
             if (rows == null) {
                 throw new InvalidOperationException("No p[@class='MsoNormal'] rows found in the page downloaded from " + UrlStr);
             }

             foreach (var item in rows) {
                 var firstNode = item.FirstChild;
                 if (firstNode == null) {
                     continue; // empty paragraph, nothing to classify
                 }

                 // NOTE(review): the row type is recognised by the exact leading-whitespace text of
                 // the first child; confirm these literals match the characters used by the live page.
                 if (firstNode.Name == "b") {
                     GetProvince(item);
                 } else if (firstNode.InnerText == " ") {
                     GetCity(item);
                 } else if (firstNode.InnerText == "  ") {
                     GetCounty(item);
                 }
             }
         } catch (Exception e) {
             // Guarded so that an empty list cannot raise a second exception that hides the real one.
             if (SaveList.Count > 0) {
                 log.Info("last child code:" + SaveList[SaveList.Count - 1].Code);
             }
             log.Info(e);
             throw; // rethrow without resetting the stack trace
         }
     }

     /// <summary>Adds a county (level "4") row; its parent is the last city added.</summary>
     private void GetCounty(HtmlNode item) {
         AddRegion(
             item.ChildNodes[1].InnerText.Replace(" ", "").Trim(),
             item.ChildNodes[2].InnerText.Trim(),
             "4",
             "3");
     }

     /// <summary>Adds a city (level "3") row; its parent is the last province added.</summary>
     private void GetCity(HtmlNode item) {
         AddRegion(
             item.ChildNodes[1].InnerText.Replace(" ", "").Trim(),
             item.ChildNodes[2].InnerText.Trim(),
             "3",
             "2");
     }

     /// <summary>Adds a province (level "2") row; its parent is the country root.</summary>
     private void GetProvince(HtmlNode item) {
         // Province rows wrap code and name in child elements, hence the extra FirstChild hop.
         AddRegion(
             item.ChildNodes[0].FirstChild.InnerText.Replace(" ", "").Trim(),
             item.ChildNodes[1].FirstChild.InnerText.Trim(),
             "2",
             "1");
     }

     /// <summary>
     /// Appends a record with the next sequential ID, linked to the most recently
     /// added record whose level equals <paramref name="parentLevel"/>.
     /// </summary>
     private void AddRegion(string code, string name, string level, string parentLevel) {
         City previous = SaveList.Last();
         City parent = SaveList.Last(i => i.Org_Level == parentLevel);
         SaveList.Add(new City() {
             ID = previous.ID + 1,
             Code = code,
             Name = name,
             Org_Level = level,
             ParentCode = parent.Code,
             ParentID = parent.ID,
             Contry = "China"
         });
     }

     /// <summary>Writes every collected record to the database.</summary>
     public void Save() {
         log.Info("保存数据");
         SQLHelper.ExecuteNonQueryForCity(SaveList);
     }
 }
全国 Org_Level = 1

省 Org_Level = 2

市 Org_Level = 3

县 Org_Level = 4

SaveList 首先添加了一个代表“全国”的根节点,Org_Level = 1。

因为网页数据是按 省 -> 市 -> 县 -> 省 -> 市 -> 县 的顺序循环读取的,所以在获取省、市、县的父级时,直接从 SaveList 中取最后一个上一级别的对象即可。

执行类:

// Constructing the crawler downloads and parses the page;
// Save() then writes the collected records to the database.
var crawler = new 省市县数据抓取();
crawler.Save();



获取的数据如下:




原创粉丝点击