第12周课程：箱型图，热力图

来源：互联网　发布：北京少儿编程培训机构　编辑：程序博客网　时间：2024/04/30 23:26

一、箱型图

1. 箱型图

# Read the air-pollution data and draw one box per column.
# (The two statements were fused on one line, which does not parse in R.)
air <- read.csv("airpollution.csv")
boxplot(air, las = 1)  # las = 1: draw the axis tick labels horizontally

2. 收窄箱体宽度

# Narrow the boxes. boxwex is a scaling factor for the box width; per the
# original note, 0.2 does not mean "exactly 0.2 times the default width".
boxplot(
  x = air,
  las = 1,
  boxwex = 0.2
)

3. 指定箱体的宽度

# Give the two boxes widths of 1 and 2 respectively.
# NOTE(review): assumes `air` has exactly two columns - confirm against the CSV.
box_widths <- c(1, 2)
boxplot(air, width = box_widths)

4. 分组

# Read the metal-concentration data used by the remaining box-plot examples.
metals <- read.csv("metals.csv")

# One box per Source: Cu on the vertical axis, Source on the horizontal axis.
boxplot(
  Cu ~ Source,
  data = metals,
  main = "Summary of Copper (Cu) concentrations by Site"
)

# One box for every combination of Source and Expt.
boxplot(
  Cu ~ Source * Expt,
  data = metals,
  main = "Summary of Copper (Cu) concentrations by Site"
)

5. 观测值数量决定箱体宽度

# Let the number of observations in each group determine the box width.
boxplot(
  Cu ~ Source,
  data = metals,
  varwidth = TRUE,  # box width is driven by the group's observation count
  main = "Summary of Copper concentrations by Site"
)

6. 带notch（切口）的箱型图

# Notched box plot, still with observation-count-dependent box widths.
boxplot(
  Cu ~ Source,
  data = metals,
  varwidth = TRUE,
  notch = TRUE,  # draw a notch on each side of the boxes
  main = "Summary of Copper concentrations by Site"
)

7. 排除离群值

# Box plot of every column except the first, with outliers hidden.
boxplot(
  metals[, -1],
  outline = FALSE,  # do not draw outlier points (the default is TRUE)
  main = "Summary of metal concentrations by Site (without outliers)"
)

8. 水平放置

# Same data drawn as horizontal boxes.
boxplot(
  metals[, -1],
  horizontal = TRUE,  # lay the boxes out horizontally
  las = 1,            # keep the axis tick labels horizontal
  main = "Summary of metal concentrations by Site"
)

9. 改变箱型风格

# Restyled box plot: black boxes with white borders and small red outliers.
boxplot(
  metals[, -1],
  border = "white",     # box border colour
  col = "black",        # box fill colour
  boxwex = 0.3,         # box width scaling factor
  medlwd = 1,           # line width of the median line
  whiskcol = "black",   # whisker colour
  staplecol = "black",  # colour of the staples (whisker end caps)
  outcol = "red",       # outlier colour
  cex = 0.3,            # size of the outlier symbols
  outpch = 19,          # plotting symbol used for outliers
  main = "Summary of metal concentrations by Site"
)

# Add horizontal grid lines only.
grid(
  nx = NA,         # no vertical grid lines
  ny = NULL,       # horizontal grid lines at the default positions
  col = "gray",    # grid line colour
  lty = "dashed"   # grid line type
)

10. 延长须线

# Extend the whiskers to the true minimum and maximum of each column.
boxplot(
  metals[, -1],
  range = 0,  # whiskers reach the data extremes instead of the default limit
  border = "white",
  col = "black",
  boxwex = 0.3,
  medlwd = 1,
  whiskcol = "black",
  staplecol = "black",
  outcol = "red",
  cex = 0.3,
  outpch = 19,
  main = "Summary of metal concentrations by Site (range=0)"
)

11. 显示观测数量

# Draw the boxes without an x axis and keep the statistics boxplot() returns
# (b$names: group names, b$n: observations per group).
b <- boxplot(
  metals[, -1],
  xaxt = "n",  # suppress the default x axis; a custom one is added below
  border = "white",
  col = "black",
  boxwex = 0.3,
  medlwd = 1,
  whiskcol = "black",
  staplecol = "black",
  outcol = "red",
  cex = 0.3,
  outpch = 19,
  main = "Summary of metal concentrations by Site"
)

# Custom x axis: one tick per group, labelled "<name>\n(n=<count>)".
axis(
  side = 1,                                       # bottom of the plot
  at = seq_along(b$names),                        # one tick per group
  labels = paste0(b$names, "\n(n=", b$n, ")"),    # "\n" starts a second line
  mgp = c(3, 2, 0)                                # push labels away from the axis
)

12. 使用gplots包

# Install gplots only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("gplots", quietly = TRUE)) {
  install.packages("gplots")
}
library(gplots)

# boxplot2() takes the same arguments as boxplot() and annotates each box
# with its number of observations.
# NOTE(review): boxplot2() is used here as the renamed successor of the
# original boxplot.n() - confirm against the installed gplots version.
boxplot2(
  metals[, -1],
  border = "white",
  col = "black",
  boxwex = 0.3,
  medlwd = 1,
  whiskcol = "black",
  staplecol = "black",
  outcol = "red",
  cex = 0.3,
  outpch = 19,
  main = "Summary of metal concentrations by Site"
)

13. 分割数据

# Split the copper values into the ranges delimited by `cuts` and draw one
# box per range.
cuts <- c(0, 40, 80)

# findInterval() returns 0..length(cuts). Wrapping it in a factor with all
# levels keeps empty ranges (e.g. "Below 0") as empty slots, so the four axis
# labels below always line up with the right boxes.
interval_id <- factor(findInterval(metals$Cu, cuts), levels = 0:length(cuts))
Y <- split(x = metals$Cu, f = interval_id)

boxplot(
  Y,
  xaxt = "n",  # custom x axis is added below
  border = "white",
  col = "black",
  boxwex = 0.3,
  medlwd = 1,
  whiskcol = "black",
  staplecol = "black",
  outcol = "red",
  cex = 0.3,
  outpch = 19,
  main = "Summary of Copper concentrations",
  xlab = "Concentration ranges",
  las = 1
)

axis(
  1,
  at = seq_along(Y),
  labels = c("Below 0", "0 to 40", "40 to 80", "Above 80"),
  lwd = 0,        # do not draw the axis line between the ticks
  lwd.ticks = 1,  # line width of the tick marks
  col = "gray"    # colour of the tick marks
)

关于分割数据，请见: http://blog.csdn.net/zhuanzhu123/article/details/9343639

14. 函数化

# Draw a box plot of `y` split into the ranges delimited by `cuts`.
#
# Arguments:
#   y     numeric vector of observations (NA values are dropped by split()).
#   cuts  numeric vector of cut points in increasing order, length >= 1.
#   ...   further arguments forwarded to boxplot().
#   main  plot title; defaults to the title the function always used.
#
# Returns (invisibly) the value of the final axis() call.
boxplot.cuts <- function(y, cuts, ...,
                         main = "Summary of Copper concentrations") {
  n_cuts <- length(cuts)
  stopifnot(is.numeric(y), is.numeric(cuts), n_cuts >= 1L)

  # Keep every interval as a factor level so that empty ranges still occupy
  # a slot and the labels below cannot shift onto the wrong boxes.
  bins <- factor(findInterval(y, cuts), levels = 0:n_cuts)
  Y <- split(y, f = bins)

  boxplot(
    Y,
    xaxt = "n",
    border = "white", col = "black", boxwex = 0.3,
    medlwd = 1, whiskcol = "black", staplecol = "black",
    outcol = "red", cex = 0.3, outpch = 19,
    main = main,
    xlab = "Concentration ranges", las = 1,
    ...
  )

  # "Below c1", "c1 to c2", ..., "Above cn"; built without a 1:(n - 1) loop,
  # which would misbehave when only one cut point is given.
  clabels <- c(
    paste("Below", cuts[1]),
    if (n_cuts > 1L) paste(cuts[-n_cuts], "to", cuts[-1]),
    paste("Above", cuts[n_cuts])
  )

  axis(1, at = seq_along(clabels),
       labels = clabels, lwd = 0, lwd.ticks = 1, col = "gray")
}

# Call the function on the barium column; the title is overridden because the
# default one refers to copper.
boxplot.cuts(metals$Ba, c(20, 40, 80),
             main = "Summary of Barium (Ba) concentrations")

函数化的好处在于，你不需要重复以前的工作，只需将所需的代码写好，以后调用就行了，这样会节省大量的工作量和时间。

# Alternative implementation: bin `y` with cut() and label the boxes with the
# interval notation that cut() generates.
#
# Arguments:
#   y     numeric vector with at least two distinct non-NA values.
#   cuts  numeric cut points; only those strictly inside the range of `y`
#         are used, so a cut at or beyond the data extremes cannot create a
#         duplicate break or a spurious extra interval.
#
# Returns (invisibly) the value of the final axis() call.
boxplot.cuts <- function(y, cuts) {
  stopifnot(is.numeric(y), is.numeric(cuts), any(!is.na(y)))

  y_range <- range(y, na.rm = TRUE)
  if (y_range[1] == y_range[2]) {
    stop("'y' must contain at least two distinct values", call. = FALSE)
  }
  inner_cuts <- sort(unique(cuts[cuts > y_range[1] & cuts < y_range[2]]))

  f <- cut(
    y,
    breaks = c(y_range[1], inner_cuts, y_range[2]),
    include.lowest = TRUE,  # otherwise the minimum observation becomes NA
    ordered_result = TRUE   # the original 'ordered_results' was silently ignored
  )
  Y <- split(y, f = f)

  boxplot(
    Y,
    xaxt = "n",
    border = "white", col = "black", boxwex = 0.3,
    medlwd = 1, whiskcol = "black", staplecol = "black",
    outcol = "red", cex = 0.3, outpch = 19,
    main = "Summary of Copper concentrations",
    xlab = "Concentration ranges", las = 1
  )

  clabels <- levels(f)
  axis(1, at = seq_along(clabels),
       labels = clabels, lwd = 0, lwd.ticks = 1, col = "gray")
}

# Call the function on the copper column.
boxplot.cuts(metals$Cu, c(0, 40, 80))

15. 子集

# Plot only the observations whose copper concentration exceeds 40.
boxplot(
  Cu ~ Source,
  subset = Cu > 40,
  data = metals
)

二、热力图

1. 热力图

# Read the data.
sales <- read.csv("sales.csv")

# RColorBrewer supplies the palette; install it only when it is missing.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)

# Turn the data into a matrix: the first column becomes the row names and
# the remaining columns become the matrix.
rownames(sales) <- sales[, 1]
sales <- sales[, -1]
data_matrix <- data.matrix(sales)

# Palette and colour break points. The breaks are chosen to fit the actual
# data; there must be one more break than colours, in increasing order.
pal <- brewer.pal(7, "YlOrRd")
breaks <- seq(3000, 12000, 1500)
bin_cols <- pal[seq_len(length(breaks) - 1)]

# Two panels side by side: heat map (wide) and legend (narrow).
layout(matrix(data = c(1, 2), nrow = 1, ncol = 2), widths = c(8, 1),
       heights = c(1, 1))

# Margins for the heat map panel.
par(mar = c(5, 10, 4, 2),
    oma = c(0.2, 0.2, 0.2, 0.2),
    mex = 0.5)

## Heat map
image(
  x = 1:nrow(data_matrix),  # one column of cells per matrix row
  y = 1:ncol(data_matrix),  # one row of cells per matrix column
  z = data_matrix,
  axes = FALSE,
  xlab = "Month",
  ylab = "",
  col = bin_cols,
  breaks = breaks,
  main = "Sales Heat Map"
)

# x axis: labelled with the matrix row names.
axis(1,
     at = 1:nrow(data_matrix),
     labels = rownames(data_matrix),
     col = "white",  # colour of the axis line
     las = 1)

# y axis: labelled with the matrix column names.
axis(2,
     at = 1:ncol(data_matrix),
     labels = colnames(data_matrix),
     col = "white",
     las = 1)

# White separator lines between the cells.
abline(h = c(1:ncol(data_matrix)) + 0.5,
       v = c(1:nrow(data_matrix)) + 0.5,
       col = "white",
       lwd = 2,
       xpd = FALSE)  # FALSE: clip the lines to the plot region

## Legend
# Lower bound of every colour bin; used as legend values and labels.
breaks2 <- breaks[-length(breaks)]

par(mar = c(5, 1, 4, 7))  # margins for the legend panel

# A single column of cells, one per bin. The values are nudged up by 0.1% so
# that each falls inside the bin it is the lower bound of.
# The original wrote pal[1:length(breaks)-1], which parses as
# (1:length(breaks)) - 1 and only worked because index 0 is dropped.
image(
  x = 1,
  y = 0:length(breaks2),
  z = t(matrix(breaks2)) * 1.001,
  col = bin_cols,
  axes = FALSE,
  breaks = breaks,
  xlab = "",
  ylab = "",
  xaxt = "n"
)

# Legend labels on the right-hand side.
axis(4,
     at = 0:(length(breaks2) - 1),
     labels = breaks2,
     col = "white",
     las = 1)

# White separators between the legend cells.
abline(h = seq_along(breaks2),
       col = "white",
       lwd = 2,
       xpd = FALSE)

2. 相关热力图

# Read the data: per the original note, the file stores the correlation
# coefficients between genes.
genes <- read.csv("genes.csv")

# Use the column names as row names as well, then convert to a matrix.
rownames(genes) <- colnames(genes)
data_matrix <- data.matrix(genes)

# Palette and colour break points.
# NOTE(review): the breaks cover 0..1 only - confirm the file contains no
# negative correlations, which would fall outside every colour bin.
pal <- heat.colors(5)
breaks <- seq(0, 1, 0.2)

# Two panels side by side: heat map (wide) and legend (narrow).
layout(matrix(data = c(1, 2), nrow = 1, ncol = 2), widths = c(8, 1),
       heights = c(1, 1))
par(mar = c(3, 7, 12, 2), oma = c(0.2, 0.2, 0.2, 0.2), mex = 0.5)

# Heat map
image(x = 1:nrow(data_matrix), y = 1:ncol(data_matrix),
      z = data_matrix, xlab = "", ylab = "", breaks = breaks,
      col = pal, axes = FALSE)

# Horizontal-axis labels, drawn as rotated text above the plot.
text(x = 1:nrow(data_matrix) + 0.75,
     y = par("usr")[4] + 1.25,
     srt = 45,  # rotation angle in degrees
     adj = 1,   # right-justify each label at its anchor point
     labels = rownames(data_matrix),
     xpd = TRUE)

# Vertical-axis labels.
axis(2,
     at = 1:ncol(data_matrix),
     labels = colnames(data_matrix),
     col = "white",
     las = 1)

# White separator lines between the cells.
abline(h = c(1:ncol(data_matrix)) + 0.5,
       v = c(1:nrow(data_matrix)) + 0.5,
       col = "white",
       lwd = 2,
       xpd = FALSE)

title("Correlation between genes",
      line = 8,
      adj = 0)

## Legend
# Lower bound of every colour bin.
breaks2 <- breaks[-length(breaks)]

# Colour scale
par(mar = c(2.5, 1, 4, 4.7))

# Legend colours: a single column of cells, one per bin. The original wrote
# pal[1:length(breaks)-1], which parses as (1:length(breaks)) - 1 and only
# worked because index 0 is dropped.
image(x = 1,
      y = 0:length(breaks2),
      z = t(matrix(breaks2)) * 1.001,
      col = pal[seq_len(length(breaks) - 1)],
      axes = FALSE,
      breaks = breaks,
      xlab = "",
      ylab = "",
      xaxt = "n")

# Legend labels: one per break point.
axis(4,
     at = 0:(length(breaks2)),
     labels = breaks,
     col = "white",
     las = 1)

# White separators between the legend cells.
abline(h = seq_along(breaks2),
       col = "white",
       lwd = 2,
       xpd = FALSE)

3. 展现多变量数据

# Read the NBA statistics; the first column becomes the row names.
nba <- read.csv("nba.csv")
rownames(nba) <- nba[, 1]

# Standardise every remaining column with scale(), then transpose so that
# the original columns run along the x axis and the original rows along y.
data_matrix <- t(scale(data.matrix(nba[, -1])))

# Namespace-qualified so the snippet does not depend on an earlier library().
pal <- RColorBrewer::brewer.pal(6, "Blues")

# Labels for the x axis.
# NOTE(review): assumes the CSV has these 20 columns in this order - confirm.
statnames <- c("Games Played", "Minutes Played", "Total Points",
               "Field Goals Made", "Field Goals Attempted",
               "Field Goal Percentage", "Free Throws Made",
               "Free Throws Attempted", "Free Throw Percentage",
               "Three Pointers Made", "Three Pointers Attempted",
               "Three Point Percentage", "Offensive Rebounds",
               "Defensive Rebounds", "Total Rebounds", "Assists", "Steals",
               "Blocks", "Turnovers", "Fouls")

# Back to a single panel in case an earlier example left a layout() active.
layout(1)
par(mar = c(3, 14, 19, 2), oma = c(0.2, 0.2, 0.2, 0.2), mex = 0.5)

# Heat map
image(x = 1:nrow(data_matrix), y = 1:ncol(data_matrix),
      z = data_matrix, xlab = "", ylab = "", col = pal, axes = FALSE)

# X axis labels, drawn as rotated text above the plot
text(1:nrow(data_matrix), par("usr")[4] + 1,
     srt = 45, adj = 0, labels = statnames,
     xpd = TRUE, cex = 0.85)

# Y axis labels
axis(side = 2, at = 1:ncol(data_matrix),
     labels = colnames(data_matrix),
     col = "white", las = 1, cex.axis = 0.85)

# White separating lines
abline(h = c(1:ncol(data_matrix)) + 0.5,
       v = c(1:nrow(data_matrix)) + 0.5,
       col = "white", lwd = 1, xpd = FALSE)

# Graph title (typo "50corers" in the original string fixed)
text(par("usr")[1] + 5, par("usr")[4] + 12,
     "NBA per game performance of top 50 scorers",
     xpd = TRUE, font = 2, cex = 1.5)