爬取中国行政区域数据

来源:互联网 发布:云计算和云存储的区别 编辑:程序博客网 时间:2024/05/01 10:27

实际工作需要县以上的行政区域数据,方便做地址数据的清洗。
原数据地址:国家统计局

原数据格式:
这里写图片描述

爬取后数据:
这里写图片描述

代码如下:

# Scrape China's administrative division codes (county level and above) from
# the National Bureau of Statistics page, split them into province / city /
# county tables, join them into one wide table, and export it to Excel.
library(rvest)
library(openxlsx)

url <- "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html"
web <- read_html(url)

## Scrape the raw data: the text of every <span> inside the MsoNormal
## paragraphs of the article body.
areadata <- web %>%
  html_nodes("div.TRS_PreAppend") %>%
  html_nodes("p.MsoNormal") %>%
  html_nodes("span") %>%
  html_text()

## Helper function
#' Count the surrounding whitespace or the remaining characters of strings.
#'
#' @param x Character vector.
#' @param type What to return: "trim_num" (default) gives the number of
#'   characters removed by `stringr::str_trim()`; "char_num" gives the number
#'   of characters left after trimming.
#' @return An integer vector the same length as `x`; `NULL` (invisibly) for
#'   any other `type`.
trim_compute <- function(x, type = "trim_num") {
  trimmed_len <- nchar(stringr::str_trim(x))
  if (type == "trim_num") {
    return(nchar(x) - trimmed_len)
  } else if (type == "char_num") {
    return(trimmed_len)
  }
}

# nchar() and str_trim() are vectorised, so the helper is applied to the whole
# vector at once instead of element by element.
trim_num <- trim_compute(areadata, type = "trim_num")
char_num <- trim_compute(areadata, type = "char_num")
rawdata <- data.frame(
  areadata = stringr::str_trim(areadata),
  trim_num,
  char_num,
  stringsAsFactors = FALSE
)
head(rawdata)

## char_num == 0 means the element holds only whitespace; drop those rows.
rawdata1 <- subset(rawdata, char_num != 0)
head(rawdata1)

## Split codes and names: the remaining values alternate code, name, code, ...
## The number of values is taken from the data instead of a hard-coded count,
## so the script keeps working when the published list changes size.
n_values <- nrow(rawdata1)
if (n_values %% 2 != 0) {
  stop(
    "Expected alternating code/name values, but found an odd number of ",
    "non-empty entries: ", n_values,
    call. = FALSE
  )
}
value_pos <- seq_len(n_values)
area_info <- data.frame(
  area_code = rawdata1$areadata[value_pos %% 2 == 1],
  area_name = rawdata1$areadata[value_pos %% 2 == 0],
  stringsAsFactors = FALSE
)
head(area_info)

## Classify province / city / county
#### area_code ending in "0000"                      -> province level
#### area_code ending in "00" but not in "0000"      -> prefecture-level city
code_last4 <- stringr::str_sub(area_info$area_code, -4, -1)
code_last2 <- stringr::str_sub(area_info$area_code, -2, -1)
is_province <- as.numeric(code_last4 == "0000")
is_city <- (code_last4 != "0000") * (code_last2 == "00")
# The first 2 / 4 characters of the code are used as join keys below.
province_index <- stringr::str_sub(area_info$area_code, 1, 2)
city_index <- stringr::str_sub(area_info$area_code, 1, 4)
area_info <- data.frame(
  area_info,
  is_province = is_province,
  is_city = is_city,
  province_index = province_index,
  city_index = city_index,
  stringsAsFactors = FALSE
)
head(area_info, n = 6)

## Province-level rows (provinces, municipalities, autonomous regions)
province_data <- subset(
  area_info,
  is_province == 1,
  select = c("area_code", "area_name", "province_index", "city_index")
)
colnames(province_data) <-
  c("province_code", "province_name", "province_index", "city_index")

## Prefecture-level city rows
city_data <- subset(
  area_info,
  is_city == 1,
  select = c("area_code", "area_name", "province_index", "city_index")
)
head(city_data)
colnames(city_data) <-
  c("city_code", "city_name", "province_index", "city_index")

## County / district rows (everything that is neither province nor city)
county_data <- subset(
  area_info,
  is_city != 1 & is_province != 1,
  select = c("area_code", "area_name", "city_index")
)
head(county_data)
colnames(county_data) <- c("county_code", "county_name", "city_index")

## Join provinces to cities on the 2-character province prefix. Both inputs
## carry a city_index column, so merge() suffixes them .x / .y; the city's own
## prefix (city_index.y) is kept as the key for the next join.
assistdata <- merge(
  x = province_data,
  y = city_data,
  by.x = "province_index",
  by.y = "province_index",
  all = TRUE
)
head(assistdata)
assistdata <- assistdata[
  ,
  c("province_code", "province_name", "city_code", "city_name", "city_index.y")
]

## all = TRUE keeps unmatched rows: e.g. Dongguan has no county-level
## districts and would otherwise be dropped from the result.
finish_data <- merge(
  assistdata,
  county_data,
  by.x = "city_index.y",
  by.y = "city_index",
  all = TRUE
)
head(finish_data)
# Drop the join key (first column) before exporting.
finish_data <- finish_data[, -1]

## Export to Excel
write.xlsx(finish_data, "china_areadata.xlsx")

2017-10-15 于杭州