GSE2603基于R语言对乳腺癌转移的数据挖掘

来源：互联网发布：淘宝千万不要搜索的字编辑：程序博客网时间：2024/04/27 18:34

乳腺癌是以局部病变为首发症状的全身性疾病，肿瘤细胞转移生物学过程的复杂性,决定了从基因组水平蹄选与转移表型相关的功能基因成为目前乳腺癌转移预后相关基因研究的主要途径。通常是比较转移与不转移的细胞系,或同一细胞系的高转移与低转移的亚克隆虽然细胞系的遗传单一,蹄选基因的稳
定性和可重复性高,但由于细胞体外长期传代,其遗传基因已发生改变,不能客观反应体内的生物学特征。为此,基于乳腺癌病例组织标本,比较转移癌与原发癌或高、低转移潜能原发癌之间的基因表达差异,成为目前蹄选乳腺癌转移相关基因或基因表达特征(gene expression signatures)及乳腺癌生物学特征研究的主要方法。我们利用一组相关的乳腺癌组织标本芯片表达谱数据,采用生物信息学工具蹄选乳腺癌转移相关基因,为阐明乳腺癌发生和转移机制提供线索。
从NCBI共享数据库GEO下载乳腺癌转移相关的基因芯片数据,登录号为GSE2603其芯片平台为GPL96。GPL96 芯片平台([HG-U133 A] Affymetrix Human Genome U133 A Array)共包含1,000,000条寡核苷酸片段,22283个核酸探针,覆盖39,000个转录变异体,代表33,000条人全长基因。截至2012年3月,GPL96平台总共包含了 911个GSE中的30948个样品芯片数据信息。
GSE2603芯片数据来源于美国纽约Memorial Sloan Kettering癌症中心分子生物学系的基因组学中心心实验室,包括22例乳腺癌细胞系样本和99例乳腺癌组织样本,同时提供肿瘤大小、淋巴结状态、ER和PR状态等临床信息。去除细胞样本和未提供转移信息的组织样本,本研究只选取其中82例乳腺癌组织样本。在质量控制过程中我们可根据各种箱线图和RNA降解曲线图筛除不合格的乳腺癌组织样本，进而得到71例乳腺癌组织样本，我们用这71例样本做预处理实验，从中选取差异表达基因。
%Quality Control%
#Load the installation package of R
library(CLL)
library(simpleaffy)
library(gcrma)
library(affyPLM)
library(RColorBrewer)
library(affy)
library(graph)
library(affycoretools)
library(limma)
library(tcltk)
library(annotate)
library(XML)
library(IRanges)
library(org.Hs.eg.db)
library(DBI)
library(pheatmap)
library(GOstats)
library(base)
library(GeneAnswers)

%Read the data from the computer
filters <- matrix(c("CEL file", ".[Cc][Ee][Ll]", "All", ".*"), ncol = 2, byrow = T)
cel.files <-tk_choose.files(caption = "Select CELs", multi = TRUE,filters = filters, index = 1)
data.raw <- ReadAffy(filenames = cel.files)
data.raw #读取第一次质量控制筛选出的71例合格样本进行第二次质量控制

%Quality Control report
Data.qc <- qc(data.raw) #获取质量分析报告
plot(Data.qc) #图形化显示报告

%the original、weights、residual、residual.sign figures of Microarray data
Pset <- fitPLM(data.raw) #对数据集做回归计算，结果是一个“PLMset”类型的对象 image(data.raw[,1]) #画第一个芯片数据的原始图
image(Pset,type="weights",which=1,main="Weights") #根据计算结果画权重图
image(Pset,type="resids",which=1,main="Residuals") #根据计算结果画残差图
image(Pset,type="sign.resids",which=1,main="Residuals.sign") #根据计算结果画残差符号图

%draw relative log expression boxplot and normalized unscaled standard errors boxplot.
colors <- brewer.pal(12,"Set3") #载入一组颜色
Mbox(Pset,ylim=c(-1,1),col=colors,main="RLE",las=3) #绘制RLE（相对对数表达）箱线图
boxplot(Pset,ylim=c(0.95,1.22),col=colors,main="NUSE",las=3) #绘制NUSE（相对标准差）箱线图，较RLE精确
%draw RNA degradation figure
data.deg <- AffyRNAdeg(data.raw) #获取降解数据
colors <- brewer.pal(12,"Set3") #载入一组颜色
plotAffyRNAdeg(data.deg,col=colors) #绘制RNA降解图
legend("topleft",rownames(pData(data.raw)),col=colors,lwd=1,inset=0.05,cex=0.5)
#在RNA降解图的左上部位加注图注

%draw the figures of clustering analysis
dgcrma <- gcrma(data.raw) #使用gcrma算法来预处理数据
eset <- exprs(dgcrma) #提取基因表达矩阵
pearson_cor <- cor(eset) #计算样品两两之间的pearson相关系数
dist.lower <- as.dist(1 - pearson_cor) #得到pearson距离的下三角矩阵
hc <- hclust(dist.lower,"ave") #聚类分析
plot(hc) #根据聚类结果画图

%Standardized treatment
dmas5 <- mas5(data.raw)
drma <- rma(data.raw)
dgcrma <- gcrma(data.raw)
#分别用MAS5，rma，gcrma算法来预处理数据，需进行算法比较，进而确定哪种算法最合适

%draw histogram
colors <- brewer.pal(12,"Set3")
hist(data.raw,main="original",col=colors)
legend("topright",rownames(pData(data.raw)),col=colors,lwd=1,inset=0.05,cex=0.5,ncol=3) hist(dmas5,main="MAS5.0",xlim=c(-150,2^10),col=colors)
hist(drma,main="RMA",col=colors)
hist(dgcrma,main="gcRMA",col=colors) #绘制各个算法预处理前后的直方图

%draw boxplot
boxplot(data.raw,col=colors,las=3,ylim=c(0,20),main="original")
boxplot(dmas5,col=colors,las=3,ylim=c(0,1000),main="MAS5.0")
boxplot(drma,col=colors,las=3,ylim=c(0,20),main="RMA")
boxplot(dgcrma,col=colors,las=3,ylim=c(0,20),main="gcRMA")
#绘制各个算法预处理前后的箱线图

%draw MAplot
MAplot(data.raw[,1:4],pairs=TRUE,plot.method="smoothScatter",cex=0.8,main="originalMAplot")
MAplot(dgcrma[,1:4],pairs=TRUE,plot.method="smoothScatter",cex=0.8,main="gcRMAMAplot") #绘制gcrma算法预处理前后的MA图

%差异表达基因的筛选%
s<-sampleNames(data.raw) #把71例样本名称排成一竖列，列名为s
as.character(s) #查看此列样本名称
data.frame(s) #生成一个矩阵形式的列表
a<-data.frame(s) #将此列表赋值为a
View(a)
#查看此列表，将此列表复制粘贴在一个excel表中，新增一列样本性质，在每例样本后标注0或1。由于样本中包含转移癌和原发癌样本，本实验设定原发癌样本为0，转移癌样本为1。
列表如下：
ID Disease
GSM50034.CEL 0
GSM50035.CEL 1
GSM50036.CEL 1
GSM50037.CEL 0
GSM50038.CEL 0
GSM50039.CEL 0
GSM50040.CEL 1
GSM50042.CEL 1
GSM50043.CEL 0
GSM50044.CEL 0
GSM50045.CEL 1
GSM50046.CEL 1
GSM50047.CEL 1
GSM50048.CEL 1
GSM50049.CEL 0
GSM50051.CEL 0
GSM50053.CEL 1
GSM50054.CEL 1
GSM50059.CEL 0
GSM50060.CEL 1
GSM50061.CEL 0
GSM50062.CEL 0
GSM50063.CEL 1
GSM50064.CEL 1
GSM50065.CEL 1
GSM50066.CEL 1
GSM50067.CEL 1
GSM50068.CEL 1
GSM50071.CEL 1
GSM50072.CEL 1
GSM50073.CEL 1
GSM50074.CEL 0
GSM50075.CEL 1
GSM50078.CEL 1
GSM50079.CEL 1
GSM50080.CEL 0
GSM50081.CEL 0
GSM50082.CEL 0
GSM50083.CEL 0
GSM50084.CEL 0
GSM50085.CEL 0
GSM50086.CEL 0
GSM50089.CEL 0
GSM50090.CEL 0
GSM50092.CEL 1
GSM50093.CEL 1
GSM50094.CEL 1
GSM50095.CEL 0
GSM50096.CEL 1
GSM50097.CEL 0
GSM50098.CEL 0
GSM50099.CEL 0
GSM50100.CEL 0
GSM50101.CEL 0
GSM50102.CEL 1
GSM50103.CEL 1
GSM50106.CEL 1
GSM50107.CEL 1
GSM50111.CEL 0
GSM50112.CEL 1
GSM50115.CEL 0
GSM50116.CEL 1
GSM50118.CEL 1
GSM50119.CEL 1
GSM50121.CEL 0
GSM50122.CEL 1
GSM50123.CEL 1
GSM50127.CEL 1
GSM50128.CEL 0
GSM50130.CEL 1
GSM50131.CEL 1

getwd() #查看默认路径
disease<-read.csv("disease_state.csv",header = T)
#将表格另存为.csv格式的文件，存在默认路径下，将表格中的数据读入R中。
disease<-factor(disease_state[,"Disease"]) #提取实验条件信息
design<-model.matrix(~-1+disease) #构建实验设计矩阵
View(design)
colnames(design) <- c("s1", "s2")
#s1列样本若是转移癌为0，原发癌癌为1；s2列样本若是转移癌为1，若是原发癌为0。
contrast.matrix <- makeContrasts(s1-s2, levels=design)
#构建对比模型，比较两个实验条件下表达数据
fit <- lmFit(eset, design) #线性模型拟合
fit1<- contrasts.fit(fit, contrast.matrix) #根据对比模型进行差值运算
fit2 <- eBayes(fit1) #贝叶斯检验
results<-decideTests(fit2, method="global", adjust.method="BH",p.value=0.01,lfc=1.5)
summary(results)
View(results)
dif<- topTable(fit2, coef=1, number=10000, adjust.method="BH", sort.by="B", resort.by="M") #生成所有基因的检验结果报表
dif<-dif[dif[,"P.Value"]<0.001,]
#根据P-Value对结果进行筛选，得到全部差异表达基因，本实验得到1763个差异表达基因。
View(dif)

%GO注释%
affydb<-annPkgName(data.raw@annotation,type="db") #获得基因芯片注释包名称
affydb #GSE2603的样本的affydb为hgu133a
library(affydb,character.only=TRUE) #设定character.only
dif$symbols<-getSYMBOL(rownames(dif),affydb)
#根据每个探针组的ID获取对应的基因Gene Symbol，并作为一个新的列，加到数据框dif最后
head(dif) #显示结果前六行
dif$EntrezID<-getEG(rownames(dif),affydb)
#根据每个探针组的ID获取对应的基因Entrez ID,同样加到数据框dif最后
head(dif)
write.csv(dif, "results.go.0610.csv")
#生成dif结果的excel表格文件，文件名为results.go.0610
dif表前5行如下：
  logFC AveExpr t P.Value   adj.P.Val B symbol EntrezID
205225_at 3.2094368268.65813606 7.9455668752.08E-11 2.69E-0715.61294778 ESR12099
209173_at 2.9501936758.989609208 6.0847399085.29E-08 1.57E-058.206298828 AGR210551
209604_s_at 2.61314694910.83987543 7.1920672735.16E-10 2.29E-0612.58171324 GATA32625
209602_s_at 2.520255667.198314138 6.4120538631.37E-08 7.09E-069.483412359 GATA32625

%GO的富集分析及可视化
entrezUniverse<-unique(unlist(mget(rownames(eset),hgu133aENTREZID)))
#提取HG-U133 A芯片中所有探针组对应的EntrezID，注意保证uniq
entrezSelected<-unique(dif[!is.na(dif$EntrezID),"EntrezID"])
#提取所有差异表达基因及其对应的EntrezID，注意保证uniq
params<-new("GOHyperGParams",geneIds=entrezSelected,univereGeneIds=entrezUniverse,annotation=affydb,ontology="BP",pvalueCutoff=0.001,conditional=FALSE,testDirection="over")
params <- new("GOHyperGParams", geneIds=entrezSelected, universeGeneIds=entrezUniverse, annotation=affydb, ontology="BP", pvalueCutoff=0.001, conditional=FALSE, testDirection="over")
#设置GO富集分析的所有参数
hgOver<-hyperGTest(params)
#对所有的GOterm根据params参数做超几何实验
bp<-summary(hgOver)
#生成所有GOterm的检验结果报表
head(bp)

htmlReport(hgOver,file="ALL_go.html")
htmlReport(hgOver,dir=getwd(),file="ALL_go.html")
#同时生成所有GOterm的检验结果文件，每个GOterm都有指向官方网站的链接，可以获得其详细信息。

%KEGG通路富集分析及可视化

humanGeneInput<-dif[,c("EntrezID","logFC","P.Value")]
#选取dif中的三列信息构成新的矩阵，第一列必须是EntrezID
humanExpr<-eset[match(rownames(dif),rownames(eset)),]
#获得humanGeneInput中基因的表达值
humanExpr<-cbind(humanGeneInput[,"EntrezID"],humanExpr)
前两个数据做合并，第一列必须是EntrezID
humanGeneInput<-humanGeneInput[!is.na(humanGeneInput[,1]),]
去除NA数据

y<-geneAnswersBuilder(humanGeneInput,"org.Hs.eg.db",categoryType="KEGG",testType="hyperG",pvalueT=0.01,OnProfile=humanExpr,verbose=FALSE)
getEnrichmentInfo(y)[1:6,]
#KEGG通路的超几何检验

%绘制显著富集的KEGG通路的关系图和热图

selected<-eset[rownames(dif),] #从基因表达矩阵中选取差异表达基因对应的数据

rownames(selected)<-dif$symbols
#将selected矩阵每行的名称由探针组ID转换为对应的基因symbol
pheatmap(selected[1:20,],color=colorRampPalette(c("green","black","red"))(100),fontsize_row=4,scale="row",border_color=NA) #画前20个基因的热图

biocLite("Rgraphviz")
library(Rgraphviz)
ghandle<-goDag(hgOver)
subGHandle<-subGraph(snodes=as.character(summary(hgOver)[,1]),graph=ghandle)
plot(subGHandle)
yy<-geneAnswersReadable(y,verbose=FALSE)
geneAnswersConceptNet(yy,colorValueColumn="logFC",centroidSize="pvalue",output="interactive")
yyy<-geneAnswersHeatmap(yyy)

1 0