sparkR-根据GPS坐标判断行政区域
来源:互联网 发布:java在线客服系统源码 编辑:程序博客网 时间:2024/05/01 07:10
sparkR-根据GPS坐标判断行政区域
根据GPS坐标判断其行政区域方法很多,本方法主要通过调用高德API接口,判断其行政区域
1.单机版,在数据量小的情况下可采用单机版,当大于10万条数据时,用时会超过20分钟
# Single-machine version: reverse-geocode GPS points into administrative
# regions (adcode / province / city / district) through the Amap REST API.
# NOTE: the workspace-wiping `rm(list = ls())` call was dropped on purpose; a
# script should not delete objects it does not own.
library(RCurl)

# Sample data: longitude / latitude pairs.
test <- data.frame(
  lon = c(115.5660583464, 116.0255749621, 116.3999056987, 116.4603790196,
          116.7411326333, 117.3849387344, 115.1680472056),
  lat = c(39.6489488846 * 0 + 40.1109818799, 39.8600058894, 39.9031383143,
          39.5424745878, 39.5418774826, 40.6195326408, 39.6489488846)
)

# Join coordinates into the "lon,lat|lon,lat|..." form used by the request.
#
# format1: vector of longitudes.
# format2: vector of latitudes, parallel to format1.
# x:       number of leading points to include (defaults to all of them).
# Returns a single string; "" when x is 0 (the recursive original never
# terminated in that case).
paste_lonlat <- function(format1, format2, x = length(format1)) {
  keep <- seq_len(x)
  paste(paste0(format1[keep], ",", format2[keep]), collapse = "|")
}

# Pull the text content of every <tag>...</tag> element out of the matched
# fragments. The match is anchored on the opening tag so that a value which
# merely contains the tag name cannot be picked up by mistake.
extract_tag <- function(values, tag) {
  hits <- grep(paste0("^<", tag, ">"), values, value = TRUE)
  gsub(paste0(".*<", tag, ">(.*?)</", tag, ">.*"), "\\1", hits)
}

# The API accepts at most 20 points per request.
sample_n <- 1
times <- sample_n * 20
k_value <- ceiling(nrow(test) / times)

url_head2 <- "http://restapi.amap.com/v3/geocode/regeo?location="
url_tail2 <- "&batch=true&output=xml&key=自己的key"
rule_text <- "<adcode>(.*?)</adcode>|<province>(.*?)</province>|<city>(.*?)</city>|<district>(.*?)</district>"

# Collect the matches of each batch in a preallocated list instead of growing
# a vector with c() on every iteration.
value_parts <- vector("list", k_value)
for (part in seq_len(k_value)) {
  first_row <- 1 + (part - 1) * times
  last_row <- min(part * times, nrow(test))  # the last batch may be short
  idx <- seq(first_row, last_row, by = sample_n)

  location <- paste_lonlat(test[idx, "lon"], test[idx, "lat"])
  url2 <- paste0(url_head2, location, url_tail2)
  str2 <- getURL(url2)

  rule <- gregexpr(rule_text, str2)
  value_parts[[part]] <- unlist(regmatches(str2, rule))
}
value_all <- unlist(value_parts)

adcode <- extract_tag(value_all, "adcode")
province <- extract_tag(value_all, "province")
city <- extract_tag(value_all, "city")
district <- extract_tag(value_all, "district")

# data.frame() recycles short vectors silently, which would attach regions to
# the wrong coordinates; fail loudly when the response is incomplete.
field_lengths <- vapply(
  list(adcode, province, city, district), length, integer(1)
)
if (any(field_lengths != nrow(test))) {
  stop(
    "Reverse-geocoding returned ", paste(field_lengths, collapse = "/"),
    " adcode/province/city/district values for ", nrow(test), " points",
    call. = FALSE
  )
}

# Return province, city, district and adcode next to the input coordinates.
result <- data.frame(
  test,
  adcode = adcode,
  province = province,
  city = city,
  district = district,
  stringsAsFactors = FALSE
)

# Where the city field is empty, fall back to the province name.
no_city <- result$city == ""
result$city[no_city] <- result$province[no_city]
2.sparkR版,之前版本为单机版,虽然能在SparkR上运行,但是遇到大量数据时跑得还是挺慢的,改为在Spark环境下跑数据,现在总数据43万,代码如下:
命名为user_gps_boudary_sparkR.R
# SparkR version: reverse-geocode the rows of the Hive table `user_gps` into
# administrative regions through the Amap REST API, one Spark partition at a
# time, then expose the result as the external Hive table `user_boundary`.
library(SparkR)
library(RCurl)

sc <- sparkR.init(appName = "boundary")
sqlContext <- sparkRSQL.init(sc)
hiveContext <- sparkRHive.init(sc)
SparkR:::includePackage(sqlContext, "RCurl")

## Load the data
user <- sql(hiveContext, "select * from user_gps ")
user_rdd <- SparkR:::toRDD(user)

## Process the RDD partition by partition
user_boundary <- SparkR:::lapplyPartition(user_rdd, function(x) {
  library("RCurl")

  # Join coordinates into the "lon,lat|lon,lat|..." form used by the request.
  # Vectorised; returns "" when x is 0 (the recursive original never
  # terminated in that case).
  paste_lonlat <- function(format1, format2, x = length(format1)) {
    keep <- seq_len(x)
    paste(paste0(format1[keep], ",", format2[keep]), collapse = "|")
  }

  # Pull the text content of every <tag>...</tag> element out of the matched
  # fragments; anchored on the opening tag to avoid accidental matches.
  extract_tag <- function(values, tag) {
    hits <- grep(paste0("^<", tag, ">"), values, value = TRUE)
    gsub(paste0(".*<", tag, ">(.*?)</", tag, ">.*"), "\\1", hits)
  }

  fields <- unlist(x)
  # An empty partition would otherwise fail in matrix(NULL, ...).
  if (length(fields) == 0) {
    return(list())
  }

  # Assumes every record has exactly three fields in the order id, lat, lon
  # (i.e. `user_gps` has these three columns) - TODO confirm against the table.
  user <- matrix(fields, floor(length(fields) / 3), ncol = 3, byrow = TRUE)
  user <- data.frame(user, stringsAsFactors = FALSE)
  names(user) <- c("id", "lat", "lon")

  # The API accepts at most 20 points per request.
  sample_n <- 1
  times <- sample_n * 20
  k_value <- ceiling(nrow(user) / times)

  url_head2 <- "http://restapi.amap.com/v3/geocode/regeo?location="
  url_tail2 <- "&batch=true&output=xml&key=自己的key"
  rule_text <- "<adcode>(.*?)</adcode>|<province>(.*?)</province>|<city>(.*?)</city>|<district>(.*?)</district>"

  # Collect the matches of each batch in a preallocated list instead of
  # growing a vector with c() on every iteration.
  value_parts <- vector("list", k_value)
  for (part in seq_len(k_value)) {
    first_row <- 1 + (part - 1) * times
    last_row <- min(part * times, nrow(user))  # the last batch may be short
    idx <- seq(first_row, last_row, by = sample_n)

    location <- paste_lonlat(user[idx, "lon"], user[idx, "lat"])
    url2 <- paste0(url_head2, location, url_tail2)
    str2 <- getURL(url2)

    rule <- gregexpr(rule_text, str2)
    value_parts[[part]] <- unlist(regmatches(str2, rule))
  }
  value_all <- unlist(value_parts)

  adcode <- extract_tag(value_all, "adcode")
  province <- extract_tag(value_all, "province")
  city <- extract_tag(value_all, "city")
  district <- extract_tag(value_all, "district")

  # data.frame() recycles short vectors silently, which would attach regions
  # to the wrong rows; fail loudly when the response is incomplete.
  field_lengths <- vapply(
    list(adcode, province, city, district), length, integer(1)
  )
  if (any(field_lengths != nrow(user))) {
    stop(
      "Reverse-geocoding returned ", paste(field_lengths, collapse = "/"),
      " adcode/province/city/district values for ", nrow(user), " rows",
      call. = FALSE
    )
  }

  # Return province, city, district and adcode next to the input columns.
  result <- data.frame(
    user,
    adcode = adcode,
    province = province,
    city = city,
    district = district,
    stringsAsFactors = FALSE
  )

  # Where the city field is empty, fall back to the province name.
  no_city <- result$city == ""
  result$city[no_city] <- result$province[no_city]

  # One list element (a one-row data frame) per input record.
  lapply(seq_len(nrow(result)), function(i) result[i, ])
})

# saveRDD -----------------------------------------------------------------
# Store the RDD on HDFS. Author's note: saveAsTextFile can only write strings,
# so any conversion to other column types has to be done in SQL afterwards.
SparkR:::saveAsTextFile(user_boundary, "/.../rspark/user_boundary ")

# Create the SQL table. `if exists` keeps the first run from failing when the
# table has not been created yet.
sql(hiveContext, "drop table if exists user_boundary ")
sql(hiveContext, "CREATE external TABLE user_boundary ( id string, lat string, lon string, adcode string, province string, city string, county string) ROW FORMAT delimited FIELDS TERMINATED BY ',' location 'hdfs://hadoop-namenode1:xxxx/.../rspark/user_boundary '")

sparkR.stop()
运行代码
./sparkR /…/r/user_gps_boudary_sparkR.R
40万数据运行6分钟,还是很给力的
参考链接https://github.com/amplab-extras/SparkR-pkg
0 0
- sparkR-根据GPS坐标判断行政区域
- sparkR-根据GPS坐标判断行政区域(二)
- 判断GPS坐标是否在中国
- (iOS)判断GPS坐标是否在中国
- 判断GPS坐标是否在中国
- 中国行政区域(县区级带坐标经纬度)
- 中国行政区域边界坐标(google)
- java根据GPS经纬度坐标计算两点的距离算法
- java根据GPS经纬度坐标计算两点的距离算法
- 根据GPS经纬度坐标计算两点的距离算法
- 如何获取行政区域的边界gps数据
- 手机GPS 搜星 根据信燥比 判断 信号强弱
- SparkR
- SparkR
- SparkR
- SparkR
- gps坐标转火星坐标
- 百度坐标转换GPS坐标
- 红岭创投黑名单批量导入
- git 忽略已经跟踪文件的改动
- CentOS安装rar功能
- Win32项目与Win32控制台项目互转方法
- Android 开启闪光灯比较完美的兼容方案
- sparkR-根据GPS坐标判断行政区域
- 购物车实现逻辑
- ofbiz开发环境的配置总结
- Fragment 和RadioButton 实现底部导航拦
- js页面跳转整理
- 前台登录账号改变时迅速清空记录的登录密码
- iOS中NSString自适应宽高
- 八种排序算法
- Windows下使用DOS命令进入MySQL数据库