R--组比较:表和可视化
来源:互联网 发布:淘宝助理导入宝贝为0 编辑:程序博客网 时间:2024/06/06 02:27
小技巧:R的多行注释:先在RStudio中选定要注释的区域,然后按Ctrl+Shift+C即可进行多行注释
我们用自己构造的数据框来探索数据:
第一部分构造数据结构:
###### Data structure ######
# Names of the simulated variables, the distribution each one is drawn from,
# the segment names, and the number of observations per segment.
segVars <- c("age", "gender", "income", "kids", "ownHome", "subscribe")
segVargType <- c("norm", "binom", "norm", "pois", "binom", "binom")
segNames <- c("Suburb", "Urban hip", "Travelers", "Moving up")
segSize <- c(100, 50, 80, 70)

###### Segment parameters, assigned as matrices ######
# One row per segment (same order as segNames), one column per variable
# (same order as segVars). segMeans holds the mean / lambda / probability.
segMeans <- matrix(
  c(
    40, 0.5, 55000, 2, 0.5, 0.1,
    24, 0.7, 21000, 1, 0.2, 0.2,
    58, 0.5, 64000, 0, 0.7, 0.05,
    36, 0.3, 52000, 2, 0.3, 0.2
  ),
  ncol = length(segVars), byrow = TRUE
)
# Standard deviations; only the "norm" variables need one, the rest are NA.
segSDs <- matrix(
  c(
    5, NA, 12000, NA, NA, NA,
    2, NA, 5000, NA, NA, NA,
    8, NA, 21000, NA, NA, NA,
    4, NA, 1000, NA, NA, NA
  ),
  ncol = length(segVars), byrow = TRUE
)

# Scratch examples for seq(), for loops and ifelse(), left commented out:
# i.seq <- rep(sqrt(seq(from = 2.1, to = 6.2, by = 1.7)), 3)
# help("seq")
# for (i in i.seq) {
#   print(i)
# }
# for (i in c("hello", " welcome to", " R world!")) {
#   cat(i)
# }
# x <- 1:5
# ifelse(x > 1, "hi", "bye")

# Draw `n` values for one variable of one segment.
#   type: "norm", "pois" or "binom" (an entry of segVargType)
#   n:    number of observations to draw
#   mean: mean (norm), lambda (pois) or success probability (binom)
#   sd:   standard deviation; only used when type is "norm"
# Stops with an error naming `type` when it is not one of the three above.
drawSegVar <- function(type, n, mean, sd) {
  switch(type,
    norm = rnorm(n, mean = mean, sd = sd),
    pois = rpois(n, lambda = mean),
    binom = rbinom(n, size = 1, prob = mean),
    stop("Bad segment data type:", type)
  )
}

###### Build the final customer data frame ######
# seq_along(x) yields an index vector as long as x.
# Each segment is simulated into its own data frame; the frames are collected
# in a preallocated list and bound together once after the loop instead of
# growing seg.df with rbind() on every iteration.
seg.list <- vector("list", length(segNames))
set.seed(02554)
for (i in seq_along(segNames)) {
  cat(i, segNames[i], "\n")

  # Temporary frame for this segment: segSize[i] rows, one column per
  # variable (e.g. 100 rows x 6 columns for the first segment).
  this.seg <- data.frame(matrix(NA, nrow = segSize[i], ncol = length(segVars)))

  # Fill each column from the distribution declared for that variable.
  # Variables are drawn in column order so the random stream is consumed in
  # the same order on every run with the seed above.
  for (j in seq_along(segVars)) {
    this.seg[, j] <- drawSegVar(
      segVargType[j], segSize[i], segMeans[i, j], segSDs[i, j]
    )
  }

  seg.list[[i]] <- this.seg
}
# rbind() stacks the per-segment data frames by rows.
seg.df <- do.call(rbind, seg.list)
# Console output of the loop:
# 1 Suburb
# 2 Urban hip
# 3 Travelers
# 4 Moving up

# Name the columns.
names(seg.df) <- segVars

# rep(x, ...) replicates the values in x; see help("rep").
# Add the segment label, then convert each 0/1 variable into a factor.
# `levels` is given explicitly so the labels stay correct even if a column
# happens to contain only one of the two values.
# NOTE(review): "sunNo" and "male" look like typos for "subNo" / "Male"; they
# are kept because later code may compare against these exact labels.
seg.df$Segment <- factor(rep(segNames, times = segSize))
seg.df$ownHome <- factor(seg.df$ownHome,
  levels = c(0, 1), labels = c("ownNo", "ownYes")
)
seg.df$gender <- factor(seg.df$gender,
  levels = c(0, 1), labels = c("Female", "male")
)
seg.df$subscribe <- factor(seg.df$subscribe,
  levels = c(0, 1), labels = c("sunNo", "subYes")
)
summary(seg.df)
# > summary(seg.df)
#       age           gender        income            kids        ownHome      subscribe        Segment
#  Min.   :19.26   Female:157   Min.   : -5183   Min.   :0.00   ownNo :159   sunNo :260   Moving up: 70
#  1st Qu.:33.01   male  :143   1st Qu.: 43731   1st Qu.:0.00   ownYes:141   subYes: 40   Suburb   :100
#  Median :39.49                Median : 51997   Median :1.00                             Travelers: 80
#  Mean   :41.20                Mean   : 50707   Mean   :1.27                             Urban hip: 50
#  3rd Qu.:47.90                3rd Qu.: 59856   3rd Qu.:2.00
#  Max.   :80.49                Max.   :114278   Max.   :7.00

# Save the data frame to disk; it is explored further below.
save(seg.df, file = "F:\\segdf-Rintro-Ch5.RData")
# Group comparisons on seg.df: descriptive statistics, frequency tables and
# plots. Expects seg.df with the columns income, kids, ownHome, subscribe and
# Segment to exist already.
library(car) # some()
library(lattice) # histogram(), barchart(), bwplot()

###### Descriptive statistics per group ######
# Select the matching rows by hand: mean income of the "Moving up" segment,
# then narrow the selection further by adding a second condition with &.
mean(seg.df$income[seg.df$Segment == "Moving up"])
mean(seg.df$income[seg.df$Segment == "Moving up" & seg.df$subscribe == "sunNo"])

# Doing this for every group is tedious. A more general way is
# by(data, INDICES, FUN): it splits `data` into groups according to the
# index variable INDICES and applies FUN to each group.
by(seg.df$income, seg.df$subscribe, mean)
by(seg.df$income, seg.df$Segment, mean)
# > by(seg.df$income,seg.df$subscribe,mean)
# seg.df$subscribe: sunNo
# [1] 51389.52
# ------------------------------------------------------
# seg.df$subscribe: subYes
# [1] 46273.9
# > by(seg.df$income,seg.df$Segment,mean)
# seg.df$Segment: Moving up
# [1] 52109.1
# ------------------------------------------------------
# seg.df$Segment: Suburb
# [1] 55033.82
# ------------------------------------------------------
# seg.df$Segment: Travelers
# [1] 62213.94
# ------------------------------------------------------
# seg.df$Segment: Urban hip
# [1] 21681.93

# Several factors can be passed to by() together through list().
by(seg.df$income, list(seg.df$Segment, seg.df$subscribe), mean)

# aggregate() is usually nicer for this: it returns a data frame.
aggregate(seg.df$income, list(seg.df$Segment, seg.df$subscribe), mean)
# > aggregate(seg.df$income,list(seg.df$Segment,seg.df$subscribe),mean)
#     Group.1 Group.2        x
# 1 Moving up   sunNo 52163.37
# 2    Suburb   sunNo 54942.69
# 3 Travelers   sunNo 62746.11
# 4 Urban hip   sunNo 22082.11
# 5 Moving up  subYes 51891.99
# 6    Suburb  subYes 56461.41
# 7 Travelers  subYes 58488.77
# 8 Urban hip  subYes 20081.19

# Look up each observation's segment mean and store it as a new column
# segIncome. Indexing with the factor seg.df$Segment uses its integer level
# codes as row numbers, which lines up because seg.income.mean has one row
# per level, in level order.
seg.income.mean <- aggregate(seg.df$income, list(seg.df$Segment), mean)
seg.df$segIncome <- seg.income.mean[seg.df$Segment, 2]
some(seg.df)
# > some(seg.df)
#          age gender    income kids ownHome subscribe   Segment segIncome
# 71  36.22376 Female  53428.06    1   ownNo     sunNo    Suburb  55033.82
# 84  34.28161   male  49346.70    2  ownYes    subYes    Suburb  55033.82
# 86  35.87205   male  39442.72    1   ownNo     sunNo    Suburb  55033.82
# 91  45.19053   male  68937.20    5  ownYes     sunNo    Suburb  55033.82
# 171 64.85144 Female  78231.33    0  ownYes     sunNo Travelers  62213.94
# 173 64.70641   male  45517.15    0   ownNo    subYes Travelers  62213.94
# 184 68.05148   male 104312.45    0  ownYes     sunNo Travelers  62213.94
# 185 50.50749 Female  48946.07    0  ownYes    subYes Travelers  62213.94
# 202 64.63338   male 113456.80    0   ownNo     sunNo Travelers  62213.94
# 290 42.40763   male  52152.04    1   ownNo     sunNo Moving up  52109.10

# Exercise: run the next line and compare it with the some() output above.
seg.income.mean[seg.df$Segment, ]
# Drop the helper column again.
seg.df$segIncome <- NULL

###### Formula interface ######
# R's standard way to describe a relationship between variables is a formula:
# the ~ operator separates the response (left) from the explanatory
# variables (right), e.g. y ~ x, or y ~ x1 + x2 + ... for several factors.
# aggregate(formula, data, FUN) below reads: take income from seg.df, split
# it into groups by Segment, and apply mean() to each group.
aggregate(income ~ Segment, data = seg.df, mean)
aggregate(income ~ Segment + ownHome, data = seg.df, mean)
agg.data <- aggregate(income ~ Segment + ownHome, data = seg.df, mean)
agg.data[2, 3]
# > aggregate(income~Segment,data=seg.df,mean)
#     Segment   income
# 1 Moving up 52109.10
# 2    Suburb 55033.82
# 3 Travelers 62213.94
# 4 Urban hip 21681.93
# > aggregate(income~Segment+ownHome,data = seg.df,mean)
#     Segment ownHome   income
# 1 Moving up   ownNo 52249.77
# 2    Suburb   ownNo 54932.83
# 3 Travelers   ownNo 63188.42
# 4 Urban hip   ownNo 21337.59
# 5 Moving up  ownYes 51821.64
# 6    Suburb  ownYes 55143.21
# 7 Travelers  ownYes 61889.12
# 8 Urban hip  ownYes 23059.27

###### Frequency tables ######
# Number of observations for each combination of Segment and ownHome levels:
# table(factor1, factor2) computes one-way or multi-way counts.
with(seg.df, table(Segment, ownHome))
with(seg.df, table(kids, Segment))
# > with(seg.df,table(Segment,ownHome))
#            ownHome
# Segment     ownNo ownYes
#   Moving up    47     23
#   Suburb       52     48
#   Travelers    20     60
#   Urban hip    40     10
# > with(seg.df,table(kids,Segment))
#     Segment
# kids Moving up Suburb Travelers Urban hip
#    0        13     11        80        17
#    1        17     36         0        17
#    2        18     22         0        11
#    3        13     19         0         4
#    4         5      7         0         1
#    5         3      3         0         0
#    6         0      2         0         0
#    7         1      0         0         0

# Four equivalent ways to get the total number of kids in each segment.
# Method 1
xtabs(kids ~ Segment, data = seg.df)
# Method 2
aggregate(kids ~ Segment, data = seg.df, sum)
# Method 3: weight each row of the count table by the number of kids that row
# stands for. The weights come from the table's row names rather than a
# hard-coded 0:7, so this stays correct if the observed kid counts change.
seg.tab <- with(seg.df, table(kids, Segment))
kid.values <- as.numeric(rownames(seg.tab))
apply(seg.tab * kid.values, 2, sum)
# Method 4
colSums(seg.tab * kid.values)

###### Visualization by group ######
# Subscription status within each segment: histogram(formula, data, type).
# The three calls below produce figures 1, 2 and 3.
histogram(~ subscribe | Segment, data = seg.df)
histogram(~ subscribe | Segment,
  data = seg.df,
  type = "count", layout = c(4, 1), col = c("burlywood", "darkolivegreen")
)
histogram(~ subscribe | Segment + ownHome, data = seg.df)

# prop.table(table, margin) turns counts into proportions: of the whole table
# by default, of each row with margin = 1, of each column with margin = 2.
prop.table(table(seg.df$subscribe, seg.df$Segment), margin = 2)
# > prop.table(table(seg.df$subscribe,seg.df$Segment),margin = 2)
#
#          Moving up Suburb Travelers Urban hip
#   sunNo      0.800  0.940     0.875     0.800
#   subYes     0.200  0.060     0.125     0.200

# Figure 4: proportion of subscribers in each segment (second row of the
# proportion table above).
barchart(prop.table(table(seg.df$subscribe, seg.df$Segment), margin = 2)[2, ],
  xlab = "subscriber proportion by segment", col = "darkolivegreen"
)

# Combining the pieces above: mean income per segment as a bar chart.
seg.means <- aggregate(income ~ Segment, data = seg.df, mean)
barchart(income ~ Segment, data = seg.means, col = "grey")

# Figure 6: mean income per segment, grouped by home ownership.
seg.income.agg <- aggregate(income ~ Segment + ownHome, data = seg.df, mean)
barchart(income ~ Segment,
  data = seg.income.agg, groups = ownHome, auto.key = TRUE,
  par.settings = simpleTheme(col = terrain.colors(2))
)

# Figure 7: box plot of income by segment with a custom y axis in thousands.
boxplot(income ~ Segment, data = seg.df, yaxt = "n", ylab = "Incom($k)")
ax.seq <- seq(from = 0, to = 120000, by = 20000)
# paste0() glues the number and "k" together without a separator.
axis(side = 2, at = ax.seq, labels = paste0(ax.seq / 1000, "k"), las = 1)

# lattice offers a nicer box plot, bwplot(). Its formula is written the other
# way round from boxplot(): the grouping factor goes on the left-hand side.
bwplot(Segment ~ income, data = seg.df, horizontal = TRUE, xlab = "Income")
bwplot(Segment ~ income | ownHome,
  data = seg.df, horizontal = TRUE, xlab = "Income"
)
阅读全文
0 0
- R--组比较:表和可视化
- synchronized和volatile比较 (r)
- R语言文本挖掘和词云可视化实践
- R+GoogleVis数据可视化
- lecture R数据可视化
- R-ggplot 数据可视化
- R可视化(一)
- 用R可视化数据
- R语言_car_可视化
- R语言地图可视化
- R语言数据可视化
- 可视化篇:R可视化--map图
- 可视化篇:R可视化--迁徙/通勤图
- R语言:R-hive-mysql-php 可视化
- 【R 可视化】R 画关系网络图
- 【R 可视化】R语言画函数图
- R语言之数据可视化
- R语言之数据可视化
- JDBC_事务的使用
- 从零开始的web前端
- 光源选型的要素有哪些?
- 简单密码
- 机器学习——特征工程之概述
- R--组比较:表和可视化
- SharedPreferences实现数据存取
- oracle重新安装注意
- (springmvc+spring+mabatis)配置
- linux C/C++服务器后台开发面试题总结
- 栈-stack.peek和stack.pop的区别
- UVA 1476 A
- 分布式系统基础-远程过程调用(RPC)
- $.Deferred的使用