sparkR程序
来源:互联网 发布:机房网络拓扑图 编辑:程序博客网 时间:2024/06/07 02:25
sparkR
在工作目录下创建behavior.R文件
# behavior.R -- SparkR (Spark 1.6) job that labels each (imei, tid, vid) with a
# car_behavior code: 2 when the most frequent start/end cluster pair matches
# one of the home/company pairs, 1 otherwise. The result is written to an
# external Hive table.
#
# NOTE(review): values shown as `xxxx` / `hdfs:...` are redacted placeholders in
# the original listing and must be filled in before the script can run.

# Load packages ----
library(SparkR)
library(magrittr)
library(plyr)

######## debugs spark 1.6 ########

# Initialise the Spark, SQL and Hive contexts.
sc <- sparkR.init(appName = "Behavior")
sqlContext <- sparkRSQL.init(sc)
hiveContext <- sparkRHive.init(sc)

######## import data ########

# Partition date used to build table names (redacted placeholder; replace it).
stat_date <- xxxx

# Build the SQL statement that joins the two source tables on deviceid.
sqlpart1 <- paste0(
  "select a.deviceid,a.tid,a.vid,a.sort_st,a.sort_en, ",
  "b.home1,b.home2,b.company1,b.company2"
)
data_name1 <- paste0("ubi_dw_cluster_point_", stat_date)
data_name2 <- paste0("ubi_dm_address_recognition_", stat_date)
sqlpart2 <- paste("from", data_name1, "as a inner join", data_name2, "as b",
                  sep = " ")
sqlpart3 <- "on a.deviceid = b.deviceid"
# NOTE(review): "limit 10000" caps the input rows; looks like a debugging
# leftover -- confirm before a production run.
trip <- sql(
  hiveContext,
  paste(sqlpart1, sqlpart2, sqlpart3, "limit 10000", sep = " ")
)

# Make plyr available to the worker-side closures. includePackage() is declared
# as function(sc, pkg), so pass the Spark context rather than the SQL context.
SparkR:::includePackage(sc, 'plyr')

# Add the placeholder columns that the per-user step fills in later.
trip <- trip %>%
  withColumn("behavior", lit("0")) %>%
  withColumn("trip1", lit("0")) %>%
  withColumn("trip2", lit("0")) %>%
  withColumn("trip3", lit("0")) %>%
  withColumn("trip4", lit("0")) %>%
  withColumn("trip5", lit("0")) %>%
  withColumn("trip6", lit("0")) %>%
  withColumn("trip7", lit("0")) %>%
  withColumn("trip8", lit("0"))

# Convert the DataFrame to an RDD.
trip_rdd <- SparkR:::toRDD(trip)

######## zip&groupBy with keys ########

# Column layout of one record: 9 selected columns + 9 added columns. Keeping
# the names in one place ties the record width (previously a literal 18 in
# three places) to the column names used below.
trip_cols <- paste0("trip", 1:8)
field_names <- c("imei", "tid", "vid", "sort_st", "sort_en",
                 "home1", "home2", "company1", "company2",
                 "behavior", trip_cols)
n_fields <- length(field_names)

# Key of every record: its first field (the device id).
list_rd <- SparkR:::map(trip_rdd, function(x) {
  values <- unlist(x)
  user <- matrix(values, floor(length(values) / n_fields),
                 ncol = n_fields, byrow = TRUE)
  user[1, 1]
})

# Value of every record: the record reshaped into a matrix with n_fields
# columns.
stat_rdd <- SparkR:::map(trip_rdd, function(x) {
  values <- unlist(x)
  matrix(values, floor(length(values) / n_fields),
         ncol = n_fields, byrow = TRUE)
})

# Pair keys with values and group all records of one device together.
rdd <- SparkR:::zipRDD(list_rd, stat_rdd)
parts <- SparkR:::groupByKey(rdd, 240L)
SparkR:::cache(parts)

# Per-device processing; the body is ordinary (local) R code.
user_behavior <- SparkR:::mapValues(parts, function(x) {
  library("plyr")

  # Rebuild a data frame from the grouped records of this device.
  values <- unlist(x)
  user_trip <- matrix(values, floor(length(values) / n_fields),
                      ncol = n_fields, byrow = TRUE)
  user_trip <- data.frame(user_trip, stringsAsFactors = FALSE)
  names(user_trip) <- field_names

  user_trip$imei <- gsub(" ", "", user_trip$imei)
  numeric_cols <- c("tid", "vid", "sort_st", "sort_en",
                    "home1", "home2", "company1", "company2")
  user_trip[numeric_cols] <- lapply(user_trip[numeric_cols], as.numeric)

  # All home<->company pairs, in both directions, as "<a>_<b>" strings.
  homecompany <- unique(
    user_trip[, c("imei", "tid", "vid",
                  "home1", "home2", "company1", "company2", trip_cols)]
  )
  homecompany$trip1 <- base::paste0(homecompany$home1, "_", homecompany$company1)
  homecompany$trip2 <- base::paste0(homecompany$home1, "_", homecompany$company2)
  homecompany$trip3 <- base::paste0(homecompany$home2, "_", homecompany$company1)
  homecompany$trip4 <- base::paste0(homecompany$home2, "_", homecompany$company2)
  homecompany$trip5 <- base::paste0(homecompany$company1, "_", homecompany$home1)
  homecompany$trip6 <- base::paste0(homecompany$company1, "_", homecompany$home2)
  homecompany$trip7 <- base::paste0(homecompany$company2, "_", homecompany$home1)
  homecompany$trip8 <- base::paste0(homecompany$company2, "_", homecompany$home2)

  # Normalise each trip to "<smaller>_<larger>". which() skips rows where the
  # comparison is NA, so those rows keep the initial "0".
  ascending <- user_trip$sort_st < user_trip$sort_en
  asc_idx <- which(ascending)
  desc_idx <- which(!ascending)
  user_trip$behavior[asc_idx] <-
    base::paste0(user_trip$sort_st, "_", user_trip$sort_en)[asc_idx]
  user_trip$behavior[desc_idx] <-
    base::paste0(user_trip$sort_en, "_", user_trip$sort_st)[desc_idx]

  # Count trips per pair, then keep the most frequent pair per (imei, tid, vid).
  # Inside summarise(), `nrow` is the count column produced by each(nrow).
  user_main <- ddply(user_trip, c("imei", "tid", "vid", "behavior"), each(nrow))
  user_main <- ddply(user_main, c("imei", "tid", "vid"), summarise,
                     behavior = behavior[which.max(nrow)])

  # Fix: the match flag is computed on the merged frame, so the labels must be
  # assigned to that same frame. The original indexed `user_main` with it,
  # which mislabels rows whenever merge() reorders them or changes the count.
  user <- merge(user_main, homecompany, by = c("imei", "tid", "vid"))
  homecompany_id <- Reduce(
    `|`,
    lapply(trip_cols, function(col) user$behavior == user[[col]]),
    rep(FALSE, nrow(user))
  )
  user$car_behavior <- as.integer(ifelse(homecompany_id, 2L, 1L))
  user$dim_month <- as.character(stat_date)

  result <- unique(
    user[, c("imei", "tid", "vid", "dim_month", "car_behavior")]
  )
  rownames(result) <- NULL
  result
})

# Extract the per-device results (drop the keys).
user_behavior_rdd <- SparkR:::values(user_behavior)

######## register dynamic.partitions table ########

# Convert back to a DataFrame and expose it to SQL as temp table "behavior".
user_behavior_df <- SparkR:::toDF(
  user_behavior_rdd,
  list("imei", 'tid', 'vid', "dim_month", 'car_behavior')
)
registerTempTable(user_behavior_df, "behavior")

# Write the result to an external table (Hive metadata + files on HDFS).
# NOTE(review): the original dropped "..._xxxx", created "..._xxxxx" and
# inserted into "..._<stat_date>01"; the three are unified on the insert's
# naming pattern here -- confirm this is the intended table name.
target_table <- paste0("ubi_dw_user_behavior_", stat_date, "01")
sql(hiveContext, paste("drop table if exists", target_table, sep = " "))
sql(
  hiveContext,
  paste0(
    "CREATE external TABLE ", target_table,
    " ( imei string, tid int, vid int, dim_month string, car_behavior int)",
    " ROW FORMAT delimited FIELDS TERMINATED BY '#'",
    # Redacted placeholder path; replace with the real HDFS location.
    " location 'hdfs:.../ubi_dw_user_behavior/stat_date=xxxx'"
  )
)
# Fix: read from the temp table registered above ("behavior"); the original
# selected from "behavior7", which is never registered.
sql(
  hiveContext,
  paste("insert overwrite table", target_table,
        "select imei,tid,vid,dim_month,car_behavior from behavior",
        sep = " ")
)

sparkR.stop()
运行代码
./sparkR /…/r/behavior.R
参考链接:https://github.com/amplab-extras/SparkR-pkg
0 0
- sparkR程序
- SparkR
- SparkR
- SparkR
- SparkR
- SparkR安装
- 折腾sparkR
- 安装SparkR
- SparkR初探
- SparkR Notebooks
- 安装SparkR
- SparkR安装注意事项
- sparkR介绍及安装
- SparkR的安装配置
- sparkR介绍及安装
- SparkR基本操作
- 【大数据处理架构】SparkR
- SparkR的编译安装
- Cent 7.2 搭建OpenVPN
- iOS常用宏定义总结 --Objective-C
- BootStrap datetimepicker使用
- 中国首家完美支持windows 2016 server系统的生产力级CMS
- jsp和servlet
- sparkR程序
- vue v-for 数据处理
- $_SERVER[]:包含一些诸如头信息,路径和脚本等信息的数组。
- 2005右键修改存储过程变成动态脚本及 2014 无法生产drop时的判断脚本
- Touch screen
- SpringDataJPA学习记录(一)--环境配置
- 第12周 oj 逆序输出
- Angular2 Directive 学习笔记-基础篇
- ListView的几种特殊属性