展示如何用R处理稀疏矩阵和利用GLMNET包
来源:互联网 发布:刘备出装 知乎 编辑:程序博客网 时间:2024/05/19 09:15
展示如何用R处理稀疏矩阵和利用GLMNET包
# Build a simple data frame by reading a tab-separated file with a header row.
# The path uses a forward slash: the original "~\some_data.frame.txt" contains
# the sequence "\s", which R rejects as an unrecognized escape, so the line
# could not even be parsed.
some_dataframe <- read.table(
  "~/some_data.frame.txt",
  sep = "\t",
  header = TRUE
)
some_dataframe
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 1 2 7 0 0 0 0 0 0 0 0 0
## 2 0 0 3 0 0 0 0 0 0 0 0
## 3 0 0 0 6 1 0 0 0 0 0 0
## 4 0 0 0 2 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 12 0 1
## 6 0 0 0 0 0 25 0 0 0 0 1
## 7 1 0 0 0 2 0 0 0 0 0 0
## 8 0 0 0 2 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 14 0 1
## 10 0 0 0 0 0 21 0 0 0 0 1
## 11 0 0 0 0 0 0 28 0 0 0 1
## 12 0 0 0 0 0 0 0 35 0 0 1
## 13 0 0 0 0 0 0 0 0 42 0 1
## 14 0 0 0 0 0 0 0 0 0 49 1
# Convert the data frame into an ordinary (dense) matrix and display it.
some_matrix <- data.matrix(frame = some_dataframe)
some_matrix
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## [1,] 2 7 0 0 0 0 0 0 0 0 0
## [2,] 0 0 3 0 0 0 0 0 0 0 0
## [3,] 0 0 0 6 1 0 0 0 0 0 0
## [4,] 0 0 0 2 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 12 0 1
## [6,] 0 0 0 0 0 25 0 0 0 0 1
## [7,] 1 0 0 0 2 0 0 0 0 0 0
## [8,] 0 0 0 2 0 0 0 0 0 0 0
## [9,] 0 0 0 0 0 0 0 0 14 0 1
## [10,] 0 0 0 0 0 21 0 0 0 0 1
## [11,] 0 0 0 0 0 0 28 0 0 0 1
## [12,] 0 0 0 0 0 0 0 35 0 0 1
## [13,] 0 0 0 0 0 0 0 0 42 0 1
## [14,] 0 0 0 0 0 0 0 0 0 49 1
# The dense matrix looks much like the data.frame. To turn it into a sparse
# matrix, load the Matrix package and call Matrix() with sparse = TRUE.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
library(Matrix)
print(Matrix(some_matrix, sparse = TRUE))
## 14 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names 'c1', 'c2', 'c3' ... ]]
##
## [1,] 2 7 . . . . . . . . .
## [2,] . . 3 . . . . . . . .
## [3,] . . . 6 1 . . . . . .
## [4,] . . . 2 . . . . . . .
## [5,] . . . . . . . . 12 . 1
## [6,] . . . . . 25 . . . . 1
## [7,] 1 . . . 2 . . . . . .
## [8,] . . . 2 . . . . . . .
## [9,] . . . . . . . . 14 . 1
## [10,] . . . . . 21 . . . . 1
## [11,] . . . . . . 28 . . . 1
## [12,] . . . . . . . 35 . . 1
## [13,] . . . . . . . . 42 . 1
## [14,] . . . . . . . . . 49 1
# 在这里,它只保留了非零值。
# 接下来,让我们将data.frame数据分成两份:约70%做为训练集,其余约30%做为测试集。
# Fix the RNG seed so the random split below is reproducible.
set.seed(2)
# Draw floor(0.7 * number of rows) row indices for the training set.
split <- sample(
  nrow(some_dataframe),
  floor(0.7 * nrow(some_dataframe))
)
# Sampled rows form the training set; all remaining rows form the test set.
train <- some_dataframe[split, ]
test <- some_dataframe[-split, ]
train
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 3 0 0 0 6 1 0 0 0 0 0 0
## 10 0 0 0 0 0 21 0 0 0 0 1
## 7 1 0 0 0 2 0 0 0 0 0 0
## 2 0 0 3 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 42 0 1
## 9 0 0 0 0 0 0 0 0 14 0 1
## 11 0 0 0 0 0 0 28 0 0 0 1
## 6 0 0 0 0 0 25 0 0 0 0 1
## 14 0 0 0 0 0 0 0 0 0 49 1
test
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 1 2 7 0 0 0 0 0 0 0 0 0
## 4 0 0 0 2 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 12 0 1
## 8 0 0 0 2 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0 35 0 0 1
# Next, build sparse design matrices with sparse.model.matrix(). Only the
# first ten columns (the predictors c1..c10) are passed in; column 11
# (outcome) is left out. The formula ~ . uses every supplied column.
train_sparse <- sparse.model.matrix(~ ., data = train[, 1:10])
test_sparse <- sparse.model.matrix(~ ., data = test[, 1:10])
train_sparse
## 9 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names '(Intercept)', 'c1', 'c2' ... ]]
##
## 3 1 . . . 6 1 . . . . .
## 10 1 . . . . . 21 . . . .
## 7 1 1 . . . 2 . . . . .
## 2 1 . . 3 . . . . . . .
## 13 1 . . . . . . . . 42 .
## 9 1 . . . . . . . . 14 .
## 11 1 . . . . . . 28 . . .
## 6 1 . . . . . 25 . . . .
## 14 1 . . . . . . . . . 49
test_sparse
## 5 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names '(Intercept)', 'c1', 'c2' ... ]]
##
## 1 1 2 7 . . . . . . . .
## 4 1 . . . 2 . . . . . .
## 5 1 . . . . . . . . 12 .
## 8 1 . . . 2 . . . . . .
## 12 1 . . . . . . . 35 . .
library(glmnet)
## Loaded glmnet 1.9-8
# Fit a glmnet model along its whole lambda path: the sparse design matrix is
# x and column 11 of train (outcome) is the response.
fit <- glmnet(train_sparse, train[, 11])
# Predict for the test rows at every lambda on the fitted path. The original
# call passed test[, 11] as the third positional argument, which predict()
# takes as `s` (the lambda values), so the true test labels were used as
# penalties. type = "class" was dropped as well: it only applies to
# binomial/multinomial fits, not to the default gaussian family used here.
pred <- predict(fit, newx = test_sparse)
print(head(pred[, 1:5]))
# NOTE(review): the transcript that used to follow was produced by the buggy
# call (its five columns corresponded to s = 0, 0, 1, 0, 1, i.e. the test
# labels) and no longer applies; re-run this block to obtain current output.
# Use cv.glmnet (3-fold cross-validation) to find the best lambda/penalty.
cv <- cv.glmnet(x = train_sparse, y = train[, 11], nfolds = 3)
# Predict on the test rows with the earlier `fit`, evaluated at the
# cross-validated lambda.min.
pred <- predict(
  fit,
  newx = test_sparse,
  s = cv$lambda.min,
  type = "response"
)
print(names(cv))
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "name" "glmnet.fit" "lambda.min" "lambda.1se"
print(pred)
## 1
## 1 0.9898
## 4 0.8306
## 5 0.9898
## 8 0.8306
## 12 0.9898
# receiver operating characteristic (ROC curves)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## 下列对象被屏蔽了from 'package:glmnet':
##
## auc
##
## 下列对象被屏蔽了from 'package:stats':
##
## cov, smooth, var
# Build the ROC curve from the true test outcomes (column 11) and the
# predicted scores, then print the area under it. `pred` comes back from
# predict() as a one-column matrix, so it is flattened to a plain numeric
# vector before being handed to roc() as the predictor.
auc <- roc(response = test[, 11], predictor = as.numeric(pred))
print(auc$auc)
## Area under the curve: 0.833
# How does sparse.model.matrix deal with categorical data? Add a two-level
# `mood` feature. It is created as a factor explicitly: since R 4.0
# data.frame() keeps character vectors as character, in which case
# levels(cat_dataframe$mood) would be NULL instead of "happy" "sad".
cat_dataframe <- data.frame(
  some_dataframe,
  mood = factor(c(
    "happy", "happy", "happy", "happy", "sad", "sad", "happy", "happy",
    "sad", "sad", "sad", "sad", "sad", "sad"
  ))
)
# Reorder the columns so that mood sits between the ten predictors and outcome.
cat_dataframe <- cat_dataframe[
  , c(colnames(cat_dataframe)[1:10], "mood", "outcome")
]
# Display the sparse design matrix built from all columns of cat_dataframe.
sparse.model.matrix(~ ., cat_dataframe)
## 14 x 13 sparse Matrix of class "dgCMatrix"
## [[ suppressing 13 column names '(Intercept)', 'c1', 'c2' ... ]]
##
## 1 1 2 7 . . . . . . . . . .
## 2 1 . . 3 . . . . . . . . .
## 3 1 . . . 6 1 . . . . . . .
## 4 1 . . . 2 . . . . . . . .
## 5 1 . . . . . . . . 12 . 1 1
## 6 1 . . . . . 25 . . . . 1 1
## 7 1 1 . . . 2 . . . . . . .
## 8 1 . . . 2 . . . . . . . .
## 9 1 . . . . . . . . 14 . 1 1
## 10 1 . . . . . 21 . . . . 1 1
## 11 1 . . . . . . 28 . . . 1 1
## 12 1 . . . . . . . 35 . . 1 1
## 13 1 . . . . . . . . 42 . 1 1
## 14 1 . . . . . . . . . 49 1 1
# Show the levels of the mood column, then compare the dimensions of the data
# frame with those of the sparse design matrix built from it.
print(levels(cat_dataframe[["mood"]]))
## [1] "happy" "sad"
dim(cat_dataframe)
## [1] 14 12
dim(sparse.model.matrix(~ ., data = cat_dataframe))
## [1] 14 13
# Build a simple data frame by reading a tab-separated file with a header row.
# The path uses a forward slash: the original "~\some_data.frame.txt" contains
# the sequence "\s", which R rejects as an unrecognized escape, so the line
# could not even be parsed.
some_dataframe <- read.table(
  "~/some_data.frame.txt",
  sep = "\t",
  header = TRUE
)
some_dataframe
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 1 2 7 0 0 0 0 0 0 0 0 0
## 2 0 0 3 0 0 0 0 0 0 0 0
## 3 0 0 0 6 1 0 0 0 0 0 0
## 4 0 0 0 2 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 12 0 1
## 6 0 0 0 0 0 25 0 0 0 0 1
## 7 1 0 0 0 2 0 0 0 0 0 0
## 8 0 0 0 2 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 14 0 1
## 10 0 0 0 0 0 21 0 0 0 0 1
## 11 0 0 0 0 0 0 28 0 0 0 1
## 12 0 0 0 0 0 0 0 35 0 0 1
## 13 0 0 0 0 0 0 0 0 42 0 1
## 14 0 0 0 0 0 0 0 0 0 49 1
# Convert the data frame into an ordinary (dense) matrix and display it.
some_matrix <- data.matrix(frame = some_dataframe)
some_matrix
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## [1,] 2 7 0 0 0 0 0 0 0 0 0
## [2,] 0 0 3 0 0 0 0 0 0 0 0
## [3,] 0 0 0 6 1 0 0 0 0 0 0
## [4,] 0 0 0 2 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 12 0 1
## [6,] 0 0 0 0 0 25 0 0 0 0 1
## [7,] 1 0 0 0 2 0 0 0 0 0 0
## [8,] 0 0 0 2 0 0 0 0 0 0 0
## [9,] 0 0 0 0 0 0 0 0 14 0 1
## [10,] 0 0 0 0 0 21 0 0 0 0 1
## [11,] 0 0 0 0 0 0 28 0 0 0 1
## [12,] 0 0 0 0 0 0 0 35 0 0 1
## [13,] 0 0 0 0 0 0 0 0 42 0 1
## [14,] 0 0 0 0 0 0 0 0 0 49 1
# The dense matrix looks much like the data.frame. To turn it into a sparse
# matrix, load the Matrix package and call Matrix() with sparse = TRUE.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
library(Matrix)
print(Matrix(some_matrix, sparse = TRUE))
## 14 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names 'c1', 'c2', 'c3' ... ]]
##
## [1,] 2 7 . . . . . . . . .
## [2,] . . 3 . . . . . . . .
## [3,] . . . 6 1 . . . . . .
## [4,] . . . 2 . . . . . . .
## [5,] . . . . . . . . 12 . 1
## [6,] . . . . . 25 . . . . 1
## [7,] 1 . . . 2 . . . . . .
## [8,] . . . 2 . . . . . . .
## [9,] . . . . . . . . 14 . 1
## [10,] . . . . . 21 . . . . 1
## [11,] . . . . . . 28 . . . 1
## [12,] . . . . . . . 35 . . 1
## [13,] . . . . . . . . 42 . 1
## [14,] . . . . . . . . . 49 1
# 在这里,它只保留了非零值。
# 接下来,让我们将data.frame数据分成两份:约70%做为训练集,其余约30%做为测试集。
# Fix the RNG seed so the random split below is reproducible.
set.seed(2)
# Draw floor(0.7 * number of rows) row indices for the training set.
split <- sample(
  nrow(some_dataframe),
  floor(0.7 * nrow(some_dataframe))
)
# Sampled rows form the training set; all remaining rows form the test set.
train <- some_dataframe[split, ]
test <- some_dataframe[-split, ]
train
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 3 0 0 0 6 1 0 0 0 0 0 0
## 10 0 0 0 0 0 21 0 0 0 0 1
## 7 1 0 0 0 2 0 0 0 0 0 0
## 2 0 0 3 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 42 0 1
## 9 0 0 0 0 0 0 0 0 14 0 1
## 11 0 0 0 0 0 0 28 0 0 0 1
## 6 0 0 0 0 0 25 0 0 0 0 1
## 14 0 0 0 0 0 0 0 0 0 49 1
test
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 1 2 7 0 0 0 0 0 0 0 0 0
## 4 0 0 0 2 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 12 0 1
## 8 0 0 0 2 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0 35 0 0 1
# Next, build sparse design matrices with sparse.model.matrix(). Only the
# first ten columns (the predictors c1..c10) are passed in; column 11
# (outcome) is left out. The formula ~ . uses every supplied column.
train_sparse <- sparse.model.matrix(~ ., data = train[, 1:10])
test_sparse <- sparse.model.matrix(~ ., data = test[, 1:10])
train_sparse
## 9 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names '(Intercept)', 'c1', 'c2' ... ]]
##
## 3 1 . . . 6 1 . . . . .
## 10 1 . . . . . 21 . . . .
## 7 1 1 . . . 2 . . . . .
## 2 1 . . 3 . . . . . . .
## 13 1 . . . . . . . . 42 .
## 9 1 . . . . . . . . 14 .
## 11 1 . . . . . . 28 . . .
## 6 1 . . . . . 25 . . . .
## 14 1 . . . . . . . . . 49
test_sparse
## 5 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names '(Intercept)', 'c1', 'c2' ... ]]
##
## 1 1 2 7 . . . . . . . .
## 4 1 . . . 2 . . . . . .
## 5 1 . . . . . . . . 12 .
## 8 1 . . . 2 . . . . . .
## 12 1 . . . . . . . 35 . .
library(glmnet)
## Loaded glmnet 1.9-8
# Fit a glmnet model along its whole lambda path: the sparse design matrix is
# x and column 11 of train (outcome) is the response.
fit <- glmnet(train_sparse, train[, 11])
# Predict for the test rows at every lambda on the fitted path. The original
# call passed test[, 11] as the third positional argument, which predict()
# takes as `s` (the lambda values), so the true test labels were used as
# penalties. type = "class" was dropped as well: it only applies to
# binomial/multinomial fits, not to the default gaussian family used here.
pred <- predict(fit, newx = test_sparse)
print(head(pred[, 1:5]))
# NOTE(review): the transcript that used to follow was produced by the buggy
# call (its five columns corresponded to s = 0, 0, 1, 0, 1, i.e. the test
# labels) and no longer applies; re-run this block to obtain current output.
# Use cv.glmnet (3-fold cross-validation) to find the best lambda/penalty.
cv <- cv.glmnet(x = train_sparse, y = train[, 11], nfolds = 3)
# Predict on the test rows with the earlier `fit`, evaluated at the
# cross-validated lambda.min.
pred <- predict(
  fit,
  newx = test_sparse,
  s = cv$lambda.min,
  type = "response"
)
print(names(cv))
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "name" "glmnet.fit" "lambda.min" "lambda.1se"
print(pred)
## 1
## 1 0.9898
## 4 0.8306
## 5 0.9898
## 8 0.8306
## 12 0.9898
# receiver operating characteristic (ROC curves)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## 下列对象被屏蔽了from 'package:glmnet':
##
## auc
##
## 下列对象被屏蔽了from 'package:stats':
##
## cov, smooth, var
# Build the ROC curve from the true test outcomes (column 11) and the
# predicted scores, then print the area under it. `pred` comes back from
# predict() as a one-column matrix, so it is flattened to a plain numeric
# vector before being handed to roc() as the predictor.
auc <- roc(response = test[, 11], predictor = as.numeric(pred))
print(auc$auc)
## Area under the curve: 0.833
# How does sparse.model.matrix deal with categorical data? Add a two-level
# `mood` feature. It is created as a factor explicitly: since R 4.0
# data.frame() keeps character vectors as character, in which case
# levels(cat_dataframe$mood) would be NULL instead of "happy" "sad".
cat_dataframe <- data.frame(
  some_dataframe,
  mood = factor(c(
    "happy", "happy", "happy", "happy", "sad", "sad", "happy", "happy",
    "sad", "sad", "sad", "sad", "sad", "sad"
  ))
)
# Reorder the columns so that mood sits between the ten predictors and outcome.
cat_dataframe <- cat_dataframe[
  , c(colnames(cat_dataframe)[1:10], "mood", "outcome")
]
# Display the sparse design matrix built from all columns of cat_dataframe.
sparse.model.matrix(~ ., cat_dataframe)
## 14 x 13 sparse Matrix of class "dgCMatrix"
## [[ suppressing 13 column names '(Intercept)', 'c1', 'c2' ... ]]
##
## 1 1 2 7 . . . . . . . . . .
## 2 1 . . 3 . . . . . . . . .
## 3 1 . . . 6 1 . . . . . . .
## 4 1 . . . 2 . . . . . . . .
## 5 1 . . . . . . . . 12 . 1 1
## 6 1 . . . . . 25 . . . . 1 1
## 7 1 1 . . . 2 . . . . . . .
## 8 1 . . . 2 . . . . . . . .
## 9 1 . . . . . . . . 14 . 1 1
## 10 1 . . . . . 21 . . . . 1 1
## 11 1 . . . . . . 28 . . . 1 1
## 12 1 . . . . . . . 35 . . 1 1
## 13 1 . . . . . . . . 42 . 1 1
## 14 1 . . . . . . . . . 49 1 1
# Show the levels of the mood column, then compare the dimensions of the data
# frame with those of the sparse design matrix built from it.
print(levels(cat_dataframe[["mood"]]))
## [1] "happy" "sad"
dim(cat_dataframe)
## [1] 14 12
dim(sparse.model.matrix(~ ., data = cat_dataframe))
## [1] 14 13
0 0
- 展示如何用R处理稀疏矩阵和利用GLMNET包
- 线性回归建模–变量选择和正则化:R包glmnet
- LASSO-Logistic模型--基于R语言glmnet包
- LASSO和L1正则包liblinear,glmnet使用和对比
- matlab稀疏矩阵处理
- 如何用matlab和R语言画K线图
- 如何用R和API免费获取Web数据?
- R语言解决Lasso问题----glmnet包(广义线性模型)
- 如何用R发送邮件
- 稀疏矩阵处理与运用
- 利用Rcpp和RcppArmadillo创建R包
- 如何用Java处理和优化字符串常量表达式
- 如何用R做计量经济学
- 如何用R连接mysql数据库
- 利用R语言的DMwR包处理样本不平衡
- 稀疏矩阵存储和查找
- 利用线性表压缩稀疏矩阵
- 如何用cublas计算逆矩阵?
- mysql-5.6.4-m7 install in linux
- 分支-07. 比较大小(10)
- 企业移动安全战略中的十大基本要素
- 如何在linux下安装mysql
- php 定时执行任务
- 展示如何用R处理稀疏矩阵和利用GLMNET包
- Maven类包冲突终极解决小技若干
- simplexml 使用实例
- Mule ESB-3.Build a webservice proxy
- Android通过Intent.ACTION_CLOSE_SYSTEM_DIALOGS监听Home按键消息
- HTML5 Boilerplate - 让页面有个好的开始
- error:js程序无法运行
- Perl 箭头符号
- 短信