C#网页爬虫抓取行政区划
来源:互联网 发布:tortoisesvn mac版 编辑:程序博客网 时间:2024/05/03 18:50
借鉴C#网页爬虫抓取行政区划,从国家统计局获取了最新行政区域数据。
以下为代码片段:
数据库类:
/// <summary>
/// One administrative division (nation, province, city or county) as stored in
/// the <c>base_city</c> table.
/// </summary>
public class City
{
    /// <summary>Numeric identifier of this row.</summary>
    public decimal ID { get; set; }

    /// <summary>Display name of the division.</summary>
    public string Name { get; set; }

    /// <summary>Administrative division code.</summary>
    public string Code { get; set; }

    /// <summary>
    /// Hierarchy level as text: "1" nation, "2" province, "3" city, "4" county.
    /// </summary>
    public string Org_Level { get; set; }

    /// <summary>Code of the parent division.</summary>
    public string ParentCode { get; set; }

    /// <summary>Identifier of the parent division.</summary>
    public decimal ParentID { get; set; }

    /// <summary>
    /// Country name. The spelling "Contry" matches the database column and is
    /// kept for compatibility.
    /// </summary>
    public string Contry { get; set; }

    /// <summary>Location X value. NOTE(review): presumably a coordinate; confirm the unit.</summary>
    public string Loc_x { get; set; }

    /// <summary>Location Y value. NOTE(review): presumably a coordinate; confirm the unit.</summary>
    public string Loc_y { get; set; }
}
获取网页帮助类:
/// <summary>
/// Helper for downloading the text of a web page over HTTP.
/// </summary>
public class HttpHelper
{
    private static readonly ILog log = log4net.LogManager.GetLogger(typeof(HttpHelper));

    /// <summary>
    /// Downloads the response body of <paramref name="url"/> and decodes it with
    /// <paramref name="encod"/>.
    /// </summary>
    /// <param name="url">Address of the page to fetch; must be an HTTP or HTTPS URL.</param>
    /// <param name="encod">Encoding used to decode the response stream.</param>
    /// <returns>
    /// The page text, or <see cref="string.Empty"/> when the request fails, the
    /// status is not 200 OK, or the body cannot be read. Failures are logged and
    /// never thrown to the caller.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encod)
    {
        string html = string.Empty;
        try
        {
            // WebRequest.Create returns a non-HTTP request type for schemes such as
            // file:// or ftp://; report that explicitly instead of failing later
            // with a NullReferenceException.
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                log.Fatal(string.Format("抓取{0}地址失败,不是HTTP(S)地址", url));
                return html;
            }

            request.Timeout = 10 * 1000; // 10 second timeout
            request.ContentType = "text/html;charset=utf-8";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";

            using (HttpWebResponse resp = (HttpWebResponse)request.GetResponse())
            {
                if (resp.StatusCode != HttpStatusCode.OK)
                {
                    log.Fatal(string.Format("抓取{0}地址返回失败,response.StatusCode = {1}", url, resp.StatusCode));
                }
                else
                {
                    try
                    {
                        // The using block closes the reader even when ReadToEnd throws.
                        using (StreamReader sr = new StreamReader(resp.GetResponseStream(), encod))
                        {
                            html = sr.ReadToEnd();
                        }
                    }
                    catch (Exception e)
                    {
                        log.Fatal(string.Format("DownLoadHtml抓取html{0}保存失败", url), e);
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Best effort by design: log and fall through to return what we have.
            log.Fatal(e);
        }

        return html;
    }
}
数据库保存帮助类:
/// <summary>
/// Database helper that persists scraped <see cref="City"/> rows.
/// </summary>
public class SQLHelper
{
    /// <summary>
    /// Inserts every entry of <paramref name="cityList"/> into the <c>base_city</c>
    /// table, one parameterized INSERT per entry, using the connection string
    /// named "DBContext".
    /// </summary>
    /// <param name="cityList">
    /// Rows to insert. Null text properties are written as empty strings; the
    /// objects themselves are not modified.
    /// </param>
    /// <returns>The sum of the affected-row counts of the inserts that succeeded.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="cityList"/> is null.</exception>
    /// <remarks>
    /// A failure on one row is written to the console and the remaining rows are
    /// still attempted; the rows are not wrapped in a transaction.
    /// </remarks>
    public static int ExecuteNonQueryForCity(List<City> cityList)
    {
        // Fail before touching the database rather than with a
        // NullReferenceException after the connection is already open.
        if (cityList == null)
        {
            throw new ArgumentNullException("cityList");
        }

        int count = 0;
        var connectionString = System.Configuration.ConfigurationManager.ConnectionStrings["DBContext"].ConnectionString;

        // Both using blocks dispose on exit, so no explicit Close() is needed.
        using (SqlConnection connection = new SqlConnection(connectionString))
        using (SqlCommand cmd = new SqlCommand())
        {
            connection.Open();

            cmd.Connection = connection;
            cmd.CommandText = "insert into base_city(ID,name,Code,Contry,Loc_x,Loc_y,Org_Level,ParentCode,ParentID,state) values(@ID,@name,@Code,@Contry,@Loc_x,@Loc_y,@Org_Level,@ParentCode,@ParentID,@state)";

            foreach (var city in cityList)
            {
                // The command is reused for every row; start each row with a clean
                // parameter list, even if the previous row failed half-way.
                cmd.Parameters.Clear();

                try
                {
                    // A SqlParameter whose value is null is not sent to the server,
                    // so null strings are replaced by empty strings.
                    cmd.Parameters.Add(new SqlParameter("@ID", city.ID));
                    cmd.Parameters.Add(new SqlParameter("@name", city.Name ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Code", city.Code ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Contry", city.Contry ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Loc_x", city.Loc_x ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Loc_y", city.Loc_y ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Org_Level", city.Org_Level ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@ParentCode", city.ParentCode ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@ParentID", city.ParentID));
                    cmd.Parameters.Add(new SqlParameter("@state", "1"));

                    int retval = cmd.ExecuteNonQuery();
                    if (retval == 0)
                    {
                        Console.WriteLine("插入错误:");
                    }

                    count += retval;
                }
                catch (Exception e)
                {
                    // Deliberate best effort: report the row and keep going.
                    Console.WriteLine("插入错误:" + e.Message);
                }
            }
        }

        return count;
    }
}
抓取数据:
/// <summary>
/// Scrapes the province / city / county list published at <see cref="UrlStr"/>
/// into <see cref="SaveList"/> and can persist it with <see cref="Save"/>.
/// </summary>
public class 省市县数据抓取
{
    // Org_Level values used throughout the list.
    private const string NationLevel = "1";
    private const string ProvinceLevel = "2";
    private const string CityLevel = "3";
    private const string CountyLevel = "4";

    private readonly ILog log = log4net.LogManager.GetLogger(typeof(省市县数据抓取));

    public const string UrlStr = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html";

    /// <summary>
    /// Scraped rows in page order, starting with the nation-level root entry.
    /// </summary>
    public List<City> SaveList = new List<City>();

    /// <summary>
    /// Downloads the page and fills <see cref="SaveList"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The page contains no <c>p.MsoNormal</c> rows (for example because the
    /// download failed), or a row appears before any row of its parent level.
    /// </exception>
    public 省市县数据抓取()
    {
        try
        {
            log.Info("抓取数据");
            string html = HttpHelper.DownloadHtml(UrlStr, Encoding.UTF8);

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);
            HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//p[@class='MsoNormal']");

            // Root entry that every province hangs off.
            SaveList.Add(new City
            {
                ID = 1,
                Name = "全国",
                Code = "100000",
                Contry = "China",
                Org_Level = NationLevel
            });

            // SelectNodes returns null, not an empty collection, when nothing matches.
            if (rows == null)
            {
                throw new InvalidOperationException("未能从页面解析出任何行政区划节点:" + UrlStr);
            }

            foreach (HtmlNode row in rows)
            {
                HtmlNode first = row.FirstChild;
                if (first == null)
                {
                    continue; // empty paragraph
                }

                if (first.Name == "b")
                {
                    GetProvince(row);
                }
                else if (IsBlank(first.InnerText) && row.ChildNodes.Count >= 3)
                {
                    // NOTE(review): the two whitespace literals that used to tell a
                    // city row from a county row are indistinguishable in this copy
                    // of the code, which left the county branch unreachable. The
                    // level is taken from the code instead, assuming the GB/T 2260
                    // layout (codes ending in "00" are city level). Confirm against
                    // the live page.
                    string code = CleanCode(row.ChildNodes[1].InnerText);
                    if (!IsDivisionCode(code))
                    {
                        continue; // indented paragraph that is not a data row
                    }

                    if (code.EndsWith("00", StringComparison.Ordinal))
                    {
                        GetCity(row);
                    }
                    else
                    {
                        GetCounty(row);
                    }
                }
            }
        }
        catch (Exception e)
        {
            // LastOrDefault keeps this handler from throwing and masking e.
            City last = SaveList.LastOrDefault();
            log.Info("last child code:" + (last == null ? string.Empty : last.Code));
            log.Info(e);
            throw; // rethrow without resetting the stack trace
        }
    }

    /// <summary>Adds a county row (level 4) under the most recent city.</summary>
    private void GetCounty(HtmlNode item)
    {
        AddDivision(
            CleanCode(item.ChildNodes[1].InnerText),
            CleanName(item.ChildNodes[2].InnerText),
            CountyLevel,
            CityLevel);
    }

    /// <summary>Adds a city row (level 3) under the most recent province.</summary>
    private void GetCity(HtmlNode item)
    {
        AddDivision(
            CleanCode(item.ChildNodes[1].InnerText),
            CleanName(item.ChildNodes[2].InnerText),
            CityLevel,
            ProvinceLevel);
    }

    /// <summary>Adds a province row (level 2) under the nation root.</summary>
    private void GetProvince(HtmlNode item)
    {
        // Province rows wrap code and name in two bold elements.
        AddDivision(
            CleanCode(item.ChildNodes[0].FirstChild.InnerText),
            CleanName(item.ChildNodes[1].FirstChild.InnerText),
            ProvinceLevel,
            NationLevel);
    }

    /// <summary>
    /// Appends a row whose parent is the last entry of <paramref name="parentLevel"/>.
    /// This works because the page lists each parent before its children.
    /// </summary>
    private void AddDivision(string code, string name, string level, string parentLevel)
    {
        decimal nextId = SaveList.Last().ID + 1;
        City parent = SaveList.Last(i => i.Org_Level == parentLevel);

        SaveList.Add(new City
        {
            ID = nextId,
            Code = code,
            Name = name,
            Org_Level = level,
            ParentCode = parent.Code,
            ParentID = parent.ID,
            Contry = "China"
        });
    }

    /// <summary>True when the text is empty or only whitespace once HTML entities are decoded.</summary>
    private static bool IsBlank(string raw)
    {
        return string.IsNullOrWhiteSpace(HtmlAgilityPack.HtmlEntity.DeEntitize(raw ?? string.Empty));
    }

    /// <summary>
    /// Decodes HTML entities and removes every whitespace character, so the result
    /// does not depend on which kind of space the page uses for padding.
    /// </summary>
    private static string CleanCode(string raw)
    {
        string decoded = HtmlAgilityPack.HtmlEntity.DeEntitize(raw ?? string.Empty);
        return new string(decoded.Where(ch => !char.IsWhiteSpace(ch)).ToArray());
    }

    /// <summary>Decodes HTML entities and trims surrounding whitespace.</summary>
    private static string CleanName(string raw)
    {
        return HtmlAgilityPack.HtmlEntity.DeEntitize(raw ?? string.Empty).Trim();
    }

    /// <summary>True for a six-digit division code.</summary>
    private static bool IsDivisionCode(string code)
    {
        return code.Length == 6 && code.All(char.IsDigit);
    }

    /// <summary>Writes every entry of <see cref="SaveList"/> to the database.</summary>
    public void Save()
    {
        log.Info("保存数据");
        SQLHelper.ExecuteNonQueryForCity(SaveList);
    }
}
全国 Org_Level =1
省 Org_Level =2
市 Org_Level =3
县 Org_Level =4
SaveList 首先添加一个代表“全国”的根节点(Org_Level = 1)。
因为网页数据是按 省 -> 市 -> 县 -> 省 -> 市 -> 县 的顺序循环排列的,所以在确定省、市、县的父级时,直接从 SaveList 中取最后一个上一级别的对象即可。
执行类:
// Constructing the scraper downloads and parses the page; Save() then writes
// the collected rows to the database.
var crawler = new 省市县数据抓取();
crawler.Save();
获取的数据如下:
阅读全文
0 0
- C#网页爬虫抓取行政区划
- C#网页爬虫抓取行政区划
- C# 爬虫,抓取网页数据
- C# 爬虫 、 网页数据抓取 随记
- 爬虫抓取网页图片
- 网页爬虫抓取URL
- 爬虫抓取网页图片
- 关于爬虫网页抓取
- 爬虫抓取网页
- Java抓取网页爬虫
- Java爬虫抓取网页
- C# 抓取网页Html源码 (网络爬虫)
- C# 使用 Abot 实现 爬虫 抓取网页信息 源码下载
- C# 爬虫 抓取小说
- C# 爬虫 抓取小说
- Python爬虫抓取网页图片
- linux C++ 爬虫抓取网页
- java抓取网页 --- 网络爬虫
- LinuxC/C++网络爬虫(1)
- 项目实训第三周(2)--信息维护模块编码之个人信息修改
- 对java String类型字符串的一些浅薄理解
- A Simple Math Problem(hdu5949)
- 师创杯”山东理工大学第九届ACM程序设计竞赛(网络同步赛)--I皮卡丘的梦想2
- C#网页爬虫抓取行政区划
- struts2--图片/Excel下载
- JAVA从基础到精通(字符串)
- 湖北民族学院oj 1802 (枚举)之 矩形问题
- CSS样式的展现方式和效果顺序
- 红黑树—BTree
- 侧栏工具条开发
- 123
- CGAL+VS2010环境配置