sparkR-根据GPS坐标判断行政区域

来源：互联网　发布：java在线客服系统源码　编辑：程序博客网　时间：2024/05/01 07:10

sparkR-根据GPS坐标判断行政区域

根据GPS坐标判断所属行政区域的方法很多，本文的方法是调用高德API的逆地理编码接口来判断行政区域。

1. 单机版：数据量较小时可采用单机版；当数据超过10万条时，用时会超过20分钟。

# Single-machine version: look up the administrative region (adcode,
# province, city, district) of GPS points through the AMap reverse-geocoding
# web service.
#
# NOTE: the original script started with rm(list = ls()); that call was
# removed because it silently wipes the caller's workspace.
library(RCurl)

# Test data: seven longitude/latitude pairs.
test <- data.frame(
  lon = c(115.5660583464, 116.0255749621, 116.3999056987, 116.4603790196,
          116.7411326333, 117.3849387344, 115.1680472056),
  lat = c(40.1109818799, 39.8600058894, 39.9031383143, 39.5424745878,
          39.5418774826, 40.6195326408, 39.6489488846)
)

# Web-service key appended to every request URL; replace the placeholder
# with your own key.
api_key <- "自己的key"

# The API accepts at most 20 points per batch request.
batch_size <- 20L

# Join coordinates into the "lon,lat|lon,lat|..." string sent as the
# `location` query parameter.
#
# format1: vector of longitudes.
# format2: vector of latitudes (same length as format1).
# x:       number of leading elements to use (defaults to all of them).
# Returns a single string; "" when there is nothing to join.
paste_lonlat <- function(format1, format2, x = length(format1)) {
  if (x < 1) {
    # paste0() would turn zero-length input into ",", so short-circuit.
    return("")
  }
  idx <- seq_len(x)
  paste0(format1[idx], ",", format2[idx], collapse = "|")
}

# Return the text content of every <tag>...</tag> element found in `xml`,
# in document order (character(0) when the tag does not occur).
extract_tag <- function(xml, tag) {
  pattern <- paste0("<", tag, ">(.*?)</", tag, ">")
  hits <- unlist(regmatches(xml, gregexpr(pattern, xml)))
  sub(pattern, "\\1", hits)
}

# Reverse-geocode one batch of points (at most `batch_size` of them).
#
# Returns a data.frame with one row per input point and the character
# columns adcode, province, city and district. When the request fails or the
# response does not contain exactly one value per point for every field, the
# whole batch is filled with NA (with a warning) so that rows can never be
# shifted against their coordinates.
regeo_batch <- function(lon, lat, key) {
  n <- length(lon)
  url <- paste0(
    "http://restapi.amap.com/v3/geocode/regeo?location=",
    paste_lonlat(lon, lat),
    "&batch=true&output=xml&key=", key
  )
  xml <- tryCatch(
    getURL(url),
    error = function(e) {
      warning("regeo request failed: ", conditionMessage(e), call. = FALSE)
      ""
    }
  )

  tags <- c("adcode", "province", "city", "district")
  fields <- lapply(tags, function(tag) extract_tag(xml, tag))
  names(fields) <- tags

  if (any(vapply(fields, length, integer(1)) != n)) {
    warning("regeo response did not contain ", n,
            " values per field; filling this batch with NA", call. = FALSE)
    fields <- lapply(fields, function(v) rep(NA_character_, n))
  }
  as.data.frame(fields, stringsAsFactors = FALSE)
}

# Reverse-geocode all points, `batch_size` consecutive points per request.
# Returns a data.frame with length(lon) rows and the columns adcode,
# province, city, district.
lookup_regions <- function(lon, lat, key, batch_size = 20L) {
  rows <- seq_along(lon)
  batches <- unname(split(rows, ceiling(rows / batch_size)))
  parts <- lapply(batches, function(idx) regeo_batch(lon[idx], lat[idx], key))
  collect <- function(tag) {
    as.character(unlist(lapply(parts, `[[`, tag), use.names = FALSE))
  }
  data.frame(
    adcode = collect("adcode"),
    province = collect("province"),
    city = collect("city"),
    district = collect("district"),
    stringsAsFactors = FALSE
  )
}

# Coordinates together with adcode, province, city and district.
result <- data.frame(
  test,
  lookup_regions(test$lon, test$lat, api_key, batch_size),
  stringsAsFactors = FALSE
)

# Where the service returns an empty city, fall back to the province name.
# which() skips NA rows left behind by failed batches.
empty_city <- which(result$city == "")
result$city[empty_city] <- result$province[empty_city]

测试点位置

2. SparkR版：之前的版本为单机版，虽然也能在SparkR上运行，但是遇到大量数据时跑得还是挺慢的，因此改为在Spark环境下跑数据。现在总数据43万条，代码如下：

命名为user_gps_boudary_sparkR.R

# SparkR version: reverse-geocode the rows of the Hive table `user_gps`
# partition by partition, write the result to HDFS as text and expose it as
# the external Hive table `user_boundary`.
library(SparkR)
library(RCurl)

sc <- sparkR.init(appName = "boundary")
sqlContext <- sparkRSQL.init(sc)
hiveContext <- sparkRHive.init(sc)
SparkR:::includePackage(sqlContext, 'RCurl')

## Load the data
user <- sql(hiveContext, "select * from user_gps ")
user_rdd <- SparkR:::toRDD(user)

## Process the RDD: every partition is geocoded independently.
# The function receives the list of rows of one partition and returns a list
# with one 1-row data.frame per input row (id, lat, lon, adcode, province,
# city, district). All helpers are defined inside the closure so that it is
# self-contained.
user_boundary <- SparkR:::lapplyPartition(user_rdd, function(x) {
  library("RCurl")

  # Web-service key appended to every request URL; replace the placeholder
  # with your own key.
  api_key <- "自己的key"

  # The API accepts at most 20 points per batch request.
  batch_size <- 20L

  # Join coordinates into the "lon,lat|lon,lat|..." string sent as the
  # `location` query parameter; returns "" when there is nothing to join.
  paste_lonlat <- function(format1, format2, x = length(format1)) {
    if (x < 1) {
      # paste0() would turn zero-length input into ",", so short-circuit.
      return("")
    }
    idx <- seq_len(x)
    paste0(format1[idx], ",", format2[idx], collapse = "|")
  }

  # Text content of every <tag>...</tag> element in `xml`, in document order.
  extract_tag <- function(xml, tag) {
    pattern <- paste0("<", tag, ">(.*?)</", tag, ">")
    hits <- unlist(regmatches(xml, gregexpr(pattern, xml)))
    sub(pattern, "\\1", hits)
  }

  # Reverse-geocode one batch of points. Returns a data.frame with one row
  # per point (adcode, province, city, district); a failed request or a
  # response with the wrong number of values yields NA for the whole batch
  # (with a warning) instead of shifting rows against their coordinates.
  regeo_batch <- function(lon, lat, key) {
    n <- length(lon)
    url <- paste0(
      "http://restapi.amap.com/v3/geocode/regeo?location=",
      paste_lonlat(lon, lat),
      "&batch=true&output=xml&key=", key
    )
    xml <- tryCatch(
      getURL(url),
      error = function(e) {
        warning("regeo request failed: ", conditionMessage(e), call. = FALSE)
        ""
      }
    )

    tags <- c("adcode", "province", "city", "district")
    fields <- lapply(tags, function(tag) extract_tag(xml, tag))
    names(fields) <- tags

    if (any(vapply(fields, length, integer(1)) != n)) {
      warning("regeo response did not contain ", n,
              " values per field; filling this batch with NA", call. = FALSE)
      fields <- lapply(fields, function(v) rep(NA_character_, n))
    }
    as.data.frame(fields, stringsAsFactors = FALSE)
  }

  flat <- unlist(x)
  if (length(flat) == 0) {
    # Empty partition: matrix() cannot be built from NULL, so emit nothing.
    return(list())
  }

  # Rebuild the partition's rows as a 3-column table.
  # NOTE(review): the column order id, lat, lon is assumed to match the
  # schema of user_gps -- confirm against the table definition.
  points <- matrix(flat, floor(length(flat) / 3), ncol = 3, byrow = TRUE)
  points <- data.frame(points, stringsAsFactors = FALSE)
  names(points) <- c("id", "lat", "lon")

  # Geocode consecutive batches of `batch_size` rows.
  rows <- seq_len(nrow(points))
  batches <- unname(split(rows, ceiling(rows / batch_size)))
  parts <- lapply(batches, function(idx) {
    regeo_batch(points[idx, "lon"], points[idx, "lat"], api_key)
  })
  collect <- function(tag) {
    as.character(unlist(lapply(parts, `[[`, tag), use.names = FALSE))
  }

  # Input columns together with adcode, province, city and district.
  result <- data.frame(
    points,
    adcode = collect("adcode"),
    province = collect("province"),
    city = collect("city"),
    district = collect("district"),
    stringsAsFactors = FALSE
  )

  # Where the service returns an empty city, fall back to the province name.
  # which() skips NA rows left behind by failed batches.
  empty_city <- which(result$city == "")
  result$city[empty_city] <- result$province[empty_city]

  # One list element (a 1-row data.frame) per record.
  lapply(seq_len(nrow(result)), function(i) result[i, ])
})

# saveRDD -----------------------------------------------------------------
# Store the RDD on HDFS. Note: saveAsTextFile can only write strings; any
# conversion to proper column types has to be done in SQL afterwards.
# NOTE(review): the path literal ends with a space, as does the table
# location below -- confirm this is intended.
SparkR:::saveAsTextFile(user_boundary, "/.../rspark/user_boundary ")

# Create the SQL table on top of the saved files. "if exists" keeps the
# first run from failing when the table has not been created yet.
sql(hiveContext, "drop table if exists user_boundary ")
sql(hiveContext, paste0(
  "CREATE external TABLE user_boundary  (",
  "    id string,",
  "    lat string,",
  "    lon string,",
  "    adcode string,",
  "    province string,",
  "    city string,",
  "    county string)",
  " ROW FORMAT delimited FIELDS TERMINATED BY ','",
  " location 'hdfs://hadoop-namenode1:xxxx/.../rspark/user_boundary '"
))

sparkR.stop()

运行代码

./sparkR /…/r/user_gps_boudary_sparkR.R

SparkR运行情况
40万条数据运行6分钟，还是很给力的。

参考链接：https://github.com/amplab-extras/SparkR-pkg

0 0