R Language Package: gbm


gbm performs about as well as randomForest, but uses less memory and supports multi-core cross-validation.
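For example, caret can parallelize the resampling loop once a parallel backend is registered. A minimal sketch (the worker count of 4 is an assumption; adjust it to your machine):

```r
# a sketch: register a parallel backend so caret trains
# cross-validation folds on multiple cores
library(doParallel)
cl <- makeCluster(4)   # 4 workers is an assumption; see parallel::detectCores()
registerDoParallel(cl)

# trainControl()'s allowParallel option defaults to TRUE, so any
# train() call made now will spread its resamples across the workers

# ... run the walkthrough below ...

stopCluster(cl)
```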

This walkthrough models the binary outcome with two methods:

  1. gbm (generalized boosted models)
  2. glmnet (generalized linear models with lasso/elastic-net regularization)

gbm fits boosted trees, while glmnet fits regularized regression.
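One quick way to see this difference is to ask caret what each model tunes; a minimal sketch:

```r
library(caret)
# gbm tunes tree-related knobs (n.trees, interaction.depth,
# shrinkage, n.minobsinnode); glmnet tunes the elastic-net
# penalty (alpha, lambda)
getModelInfo()$gbm$parameters
getModelInfo()$glmnet$parameters
```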

```r
# load libraries
library(caret)
library(pROC)

#################################################
# data prep
#################################################

# load data
titanicDF <- read.csv('http://math.ucdenver.edu/RTutorial/titanic.txt', sep='\t')

# derive a Title feature from the passenger name
titanicDF$Title <- ifelse(grepl('Mr ', titanicDF$Name), 'Mr',
                   ifelse(grepl('Mrs ', titanicDF$Name), 'Mrs',
                   ifelse(grepl('Miss', titanicDF$Name), 'Miss', 'Nothing')))

# impute missing ages with the median age
titanicDF$Age[is.na(titanicDF$Age)] <- median(titanicDF$Age, na.rm=TRUE)

# reorder columns so the outcome variable comes last
titanicDF <- titanicDF[c('PClass', 'Age', 'Sex', 'Title', 'Survived')]

# dummy variables for factors/characters
titanicDF$Title <- as.factor(titanicDF$Title)
titanicDummy <- dummyVars("~.", data=titanicDF, fullRank=FALSE)
titanicDF <- as.data.frame(predict(titanicDummy, titanicDF))
print(names(titanicDF))

# what is the proportion of your outcome variable?
prop.table(table(titanicDF$Survived))

# save the outcome for the glmnet model
tempOutcome <- titanicDF$Survived

# generalize outcome and predictor variables
outcomeName <- 'Survived'
predictorsNames <- names(titanicDF)[names(titanicDF) != outcomeName]

#################################################
# model it
#################################################

# get names of all caret-supported models
names(getModelInfo())

# caret needs valid R names as class labels for classification
titanicDF$Survived <- ifelse(titanicDF$Survived==1, 'yes', 'nope')

# pick model gbm and find out what type of model it is
getModelInfo()$gbm$type

# split data into training and testing chunks
set.seed(1234)
splitIndex <- createDataPartition(titanicDF[,outcomeName], p=0.75, list=FALSE, times=1)
trainDF <- titanicDF[ splitIndex,]
testDF  <- titanicDF[-splitIndex,]

# create caret trainControl object to control the number of cross-validations performed
objControl <- trainControl(method='cv', number=3, returnResamp='none',
                           summaryFunction=twoClassSummary, classProbs=TRUE)

# run model
objModel <- train(trainDF[,predictorsNames], as.factor(trainDF[,outcomeName]),
                  method='gbm',
                  trControl=objControl,
                  metric="ROC",
                  preProc=c("center", "scale"))

# find out variable importance
summary(objModel)

# find out model details
objModel

#################################################
# evaluate model
#################################################

# get predictions on your testing data

# class prediction
predictions <- predict(object=objModel, testDF[,predictorsNames], type='raw')
head(predictions)
postResample(pred=predictions, obs=as.factor(testDF[,outcomeName]))

# probabilities
predictions <- predict(object=objModel, testDF[,predictorsNames], type='prob')
head(predictions)
postResample(pred=predictions[[2]], obs=ifelse(testDF[,outcomeName]=='yes', 1, 0))

auc <- roc(ifelse(testDF[,outcomeName]=="yes", 1, 0), predictions[[2]])
print(auc$auc)

#################################################
# glmnet model
#################################################

# pick model glmnet and find out what type of model it is
getModelInfo()$glmnet$type

# restore the numeric outcome saved earlier for the glmnet model
titanicDF$Survived <- tempOutcome

# split data into training and testing chunks
set.seed(1234)
splitIndex <- createDataPartition(titanicDF[,outcomeName], p=0.75, list=FALSE, times=1)
trainDF <- titanicDF[ splitIndex,]
testDF  <- titanicDF[-splitIndex,]

# create caret trainControl object to control the number of cross-validations performed
objControl <- trainControl(method='cv', number=3, returnResamp='none')

# run model
objModel <- train(trainDF[,predictorsNames], trainDF[,outcomeName],
                  method='glmnet', metric="RMSE")

# get predictions on your testing data
predictions <- predict(object=objModel, testDF[,predictorsNames])

auc <- roc(testDF[,outcomeName], predictions)
print(auc$auc)
postResample(pred=predictions, obs=testDF[,outcomeName])

# find out variable importance
summary(objModel)
plot(varImp(objModel, scale=FALSE))

# find out model details
objModel

# display variable importance on a +/- scale
vimp <- varImp(objModel, scale=FALSE)
results <- data.frame(VariableName=rownames(vimp$importance),
                      Weight=vimp$importance$Overall)
results <- results[order(results$Weight),]
results <- results[(results$Weight != 0),]

par(mar=c(5,15,4,2)) # increase y-axis margin
xx <- barplot(results$Weight, width=0.85,
              main=paste("Variable Importance -", outcomeName), horiz=TRUE,
              xlab="< (-) importance >  < neutral >  < importance (+) >", axes=FALSE,
              col=ifelse((results$Weight > 0), 'blue', 'red'))
axis(2, at=xx, labels=results$VariableName, tick=FALSE, las=2, line=-0.3, cex.axis=0.6)

#################################################
# advanced stuff
#################################################

# boosted tree model (gbm): adjust tree depth, number of trees, and learning rate
# (current versions of caret also require n.minobsinnode in the gbm grid)
gbmGrid <- expand.grid(interaction.depth=c(1, 5, 9),
                       n.trees=50,
                       shrinkage=0.01,
                       n.minobsinnode=10)

# run model
objModel <- train(trainDF[,predictorsNames], trainDF[,outcomeName],
                  method='gbm', trControl=objControl,
                  tuneGrid=gbmGrid, verbose=FALSE)

# get predictions on your testing data
predictions <- predict(object=objModel, testDF[,predictorsNames])
auc <- roc(testDF[,outcomeName], predictions)
print(auc$auc)
```
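To see how the grid search played out, caret's plot method for train objects charts resampling performance across the parameter combinations. A minimal sketch, assuming objModel is the tuned gbm fit from the "advanced stuff" section above:

```r
# resampling profile: performance across the interaction.depth values tried
plot(objModel)

# the parameter combination caret selected
objModel$bestTune
```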

References

The DataExploration blog
