Machine Learning with R


I trust that anyone who has found this article already knows what the SMOTE algorithm is, so I will not spend much time introducing it and will go straight to applying it:
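For completeness, the core idea can still be sketched in a few lines: each synthetic minority example is an interpolation between a real minority observation and one of its k nearest minority neighbours. The snippet below is only an illustrative sketch with names of my own (smote_one, minority); it is not the DMwR implementation used later.

# Illustrative sketch of SMOTE's interpolation step (not the DMwR implementation):
# pick a random minority row, pick one of its k nearest minority neighbours,
# and create a synthetic point somewhere on the segment between the two.
smote_one<-function(minority,k=5){
  i<-sample(nrow(minority),1)                     # a random minority observation
  d<-as.matrix(dist(minority))[i,]                # distances to all minority rows
  nb<-minority[sample(order(d)[2:(k+1)],1),]      # one of its k nearest neighbours
  minority[i,]+runif(1)*(nb-minority[i,])         # interpolate between the two rows
}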

  # Preprocess the original data with the SMOTE algorithm

whitewine<-read.csv("C:/Users/Administrator/Desktop/实验数据集/whitewines.csv")

The path above points to the data stored on my own machine; adjust it to wherever your data lives rather than copy-pasting it verbatim.

  # Note: when the condition is a vector, the result of the comparison is also a vector, so the piecewise recoding can be written as follows:

summary(whitewine$quality)
y<-numeric(length(whitewine$quality))   # pre-allocate the new label vector
y[whitewine$quality>4]<-1               # quality above 4 -> class 1 (the majority)
y[whitewine$quality<=4]<-2              # quality 4 or below -> class 2 (the minority)
whitewine$quality<-y

Of course, an even simpler alternative is the one below (apply it to the original quality column, not on top of the recoding above):

 # Recode the label so the data becomes a two-class imbalanced data set

whitewine$quality<-ifelse(whitewine$quality>4,1,2)
summary(whitewine$quality)
table(whitewine$quality)
prop.table(table(whitewine$quality))


    # Briefly inspect the data; all attributes are numeric, so no dummy coding is needed, though normalization may be needed later
head(whitewine,3)
    # Now split the data set; two approaches are shown
set.seed(12345)
whitewine_rand<-whitewine[order(runif(nrow(whitewine))),]   # shuffle all 4898 rows
summary(whitewine_rand$quality)
whitewine_rand_train<-whitewine_rand[1:4000,]
whitewine_rand_test<-whitewine_rand[4001:4898,]
prop.table(table(whitewine_rand_test$quality))
table(whitewine_rand_train$quality)
class(whitewine_rand_train$quality)


    # The second approach, using caret's createDataPartition
library(caret)
set.seed(12345)   
splitIndex<-createDataPartition(whitewine$quality,times=1,p=0.817,list=FALSE)
trainSplit<-whitewine[splitIndex,]
testSplit<-whitewine[-splitIndex,]
table(testSplit$quality)
prop.table(table(trainSplit$quality))
class(trainSplit$quality)


    # Build a model using the first split
whitewine_rand_train$quality<-factor(whitewine_rand_train$quality)
library(C50)
whitewine_model_a<-C5.0(whitewine_rand_train[-12],whitewine_rand_train$quality)   # column 12 is the target, quality
summary(whitewine_model_a)
whitewine_pred_a<-predict(whitewine_model_a,whitewine_rand_test,type = "class")
library(gmodels)
CrossTable(whitewine_rand_test$quality,whitewine_pred_a,prop.chisq = F,prop.c = F,prop.r = F,
           dnn = c('actual quality','predict quality'))


    # Plot the ROC curve
library(pROC)
class(whitewine_pred_a)
whitewine_pred_a
whitewine_pred_a<-as.numeric(whitewine_pred_a)
auc<-roc(whitewine_rand_test$quality,whitewine_pred_a)   
print(auc)
plot(auc,ylim=c(0,1),print.thres=TRUE,main=paste('AUC',round(auc$auc[[1]],2)))
abline(h=1,col="blue",lwd=2)
abline(h=0,col="red",lwd=2)
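Note that the ROC above is built from the hard class labels (the 1/2 codes produced by as.numeric), which gives only a single operating point. A sketch of using the class probabilities from C5.0 instead, reusing the objects above (prob_a and auc_prob are names I introduce here):

# Sketch: ROC from class probabilities rather than hard labels.
# predict(...,type="prob") returns one column of probabilities per class; use the
# probability of the minority class "2" as the score for roc().
prob_a<-predict(whitewine_model_a,whitewine_rand_test,type="prob")
auc_prob<-roc(whitewine_rand_test$quality,prob_a[,"2"])
print(auc_prob)
plot(auc_prob,print.thres=TRUE,main=paste('AUC',round(auc_prob$auc[[1]],2)))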

Below I export the data in ARFF format so it can be used in Weka:

library(foreign)   # write.arff() is provided by the foreign package
write.arff(whitewine,file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_a.arff")

    
    # Preprocess the imbalanced data with SMOTE; the data has not been normalized here
library(DMwR)
class(whitewine_rand_train$quality)  # the target variable must be a factor
table(whitewine_rand_train$quality)
whitewine_rand_train<-SMOTE(quality~.,whitewine_rand_train,perc.over=600,perc.under=100)
prop.table(table(whitewine_rand_train$quality))
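A quick note on the two SMOTE parameters, as I understand the DMwR documentation: perc.over=600 creates perc.over/100 = 6 synthetic cases for every original minority case, and perc.under=100 then keeps perc.under/100 = 1 randomly sampled majority case for each synthetic case created. The helper below is my own, purely to illustrate the resulting class sizes from the minority count printed by table() above:

# Expected class sizes after DMwR::SMOTE, given the minority count before resampling.
# smote_sizes() is a hypothetical helper of mine, not part of DMwR.
smote_sizes<-function(n_min,perc.over=600,perc.under=100){
  synth<-n_min*perc.over/100                      # synthetic minority cases created
  c(minority=n_min+synth,                         # original + synthetic minority cases
    majority=synth*perc.under/100)                # majority cases sampled and kept
}
smote_sizes(n_min=200)   # e.g. if table() above reported 200 minority cases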


    # Build the model again on the first split, now using the SMOTE-processed training set
whitewine_rand_train$quality<-factor(whitewine_rand_train$quality)
library(C50)
whitewine_model_b<-C5.0(whitewine_rand_train[-12],whitewine_rand_train$quality)
summary(whitewine_model_b)
whitewine_pred_b<-predict(whitewine_model_b,whitewine_rand_test,type = "class")
library(gmodels)
CrossTable(whitewine_rand_test$quality,whitewine_pred_b,prop.chisq = F,prop.c = F,prop.r = F,
           dnn = c('actual quality','predict quality'))
    # Plot the ROC curve, then export the preprocessed training set and the untouched test set in ARFF format
library(pROC)
class(whitewine_pred_b)
whitewine_pred_b
whitewine_pred_b<-as.numeric(whitewine_pred_b)
auc<-roc(whitewine_rand_test$quality,whitewine_pred_b)   
print(auc)
plot(auc,ylim=c(0,1),print.thres=TRUE,main=paste('AUC',round(auc$auc[[1]],2)))
abline(h=1,col="blue",lwd=2)
abline(h=0,col="red",lwd=2)
write.arff(whitewine_rand_train,file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_train.arff")
# The test labels are already coded 1 and 2 by the earlier recoding, so no further recoding is needed before export
summary(whitewine_rand_test$quality)
write.arff(whitewine_rand_test,file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_test.arff")


    # Normalize the data, then apply SMOTE, and finally build and evaluate the model
    # (note: whitewine_rand_train was already resampled by SMOTE above; to truly start from
    #  the original data, re-run the split above to re-create whitewine_rand_train first)
normalize<-function(x){   # min-max rescaling of a numeric vector to [0,1]
  return((x-min(x))/(max(x)-min(x)))
}
whitewine_rand_train_n<-lapply(whitewine_rand_train[1:11],normalize) 
summary(whitewine_rand_train_n$alcohol)
whitewine_rand_train_n$quality<-whitewine_rand_train$quality
whitewine_rand_train_n<-as.data.frame(whitewine_rand_train_n)
summary(whitewine_rand_train_n$quality)
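As a quick sanity check of normalize() on a toy vector of my own:

normalize(c(10,20,30,40))   # the minimum maps to 0, the maximum to 1, the rest in between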


    # Preprocess the imbalanced data with SMOTE; this time the data has been normalized
library(DMwR)
class(whitewine_rand_train_n$quality)  # the target variable must be a factor
whitewine_rand_train_n$quality<-factor(whitewine_rand_train_n$quality)
table(whitewine_rand_train_n$quality)
whitewine_rand_train_n<-SMOTE(quality~.,whitewine_rand_train_n,perc.over=600,perc.under=100)
table(whitewine_rand_train_n$quality)
prop.table(table(whitewine_rand_train_n$quality))


    # Build the model again on the processed training set from the first split; note that the test set must also be normalized!!!
whitewine_rand_train_n$quality<-factor(whitewine_rand_train_n$quality)
library(C50)
whitewine_model_c<-C5.0(whitewine_rand_train_n[-12],whitewine_rand_train_n$quality)
summary(whitewine_model_c)
whitewine_rand_test_n<-lapply(whitewine_rand_test[1:11],normalize)
whitewine_rand_test_n$quality<-whitewine_rand_test$quality
whitewine_rand_test_n<-as.data.frame(whitewine_rand_test_n)
summary(whitewine_rand_test_n$quality)
whitewine_pred_c<-predict(whitewine_model_c,whitewine_rand_test_n,type = "class")
library(gmodels)
CrossTable(whitewine_rand_test_n$quality,whitewine_pred_c,prop.chisq = F,prop.c = F,prop.r = F,
           dnn = c('actual quality','predict quality'))
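One caveat with the block above: the test set is rescaled with its own column minima and maxima, so its columns are not on exactly the same scale as the training columns. A sketch of a safer variant that reuses the training minima/maxima (whitewine_rand_test_n2 is a name I introduce here):

# Sketch: rescale the test predictors with the TRAINING minima and maxima, so both
# sets share one scale. whitewine_rand_train is the (un-normalized) frame that
# whitewine_rand_train_n was built from.
train_min<-sapply(whitewine_rand_train[1:11],min)
train_max<-sapply(whitewine_rand_train[1:11],max)
whitewine_rand_test_n2<-as.data.frame(
  mapply(function(x,lo,hi)(x-lo)/(hi-lo),
         whitewine_rand_test[1:11],train_min,train_max,SIMPLIFY=FALSE))
whitewine_rand_test_n2$quality<-whitewine_rand_test$quality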


    # Plot the ROC curve, then export the preprocessed training set and the untouched test set in ARFF format
library(pROC)
class(whitewine_pred_c)
whitewine_pred_c
whitewine_pred_c<-as.numeric(whitewine_pred_c)
auc<-roc(whitewine_rand_test_n$quality,whitewine_pred_c)   
print(auc)
plot(auc,ylim=c(0,1),print.thres=TRUE,main=paste('AUC',round(auc$auc[[1]],2)))
abline(h=1,col="blue",lwd=2)
abline(h=0,col="red",lwd=2)
write.arff(whitewine_rand_train_n,file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_train_n.arff")
write.arff(whitewine_rand_test_n,file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_test_n.arff")


    # Now build a model with the second (caret) split
ctrl<-trainControl(method="cv",number=5)
library(ipred)
library(plyr)
class(trainSplit$quality)
tbmodel<-train(quality~.,data=trainSplit,method="treebag",
               trControl=ctrl)
predictors<-names(trainSplit)[names(trainSplit)!='quality']
pred<-predict(tbmodel$finalModel,testSplit[,predictors])
library(pROC)
auc<-roc(testSplit$quality,pred)   
print(auc)    # the AUC comes out around 0.8947, which may be misleading because the data set is imbalanced
plot(auc,ylim=c(0,1),print.thres=TRUE,main=paste('AUC',round(auc$auc[[1]],2)))
abline(h=1,col="blue",lwd=2)
abline(h=0,col="red",lwd=2)
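Since the comment above notes that AUC alone can be misleading on an imbalanced set, a class-wise view of the same predictions may help. The treebag model was fit as a regression on the numeric 1/2 labels, so I threshold its output at 1.5 (my own choice) to recover class labels and inspect the confusion matrix with caret:

# Sketch: per-class view of the treebag predictions. The 1.5 cut-off and the
# pred_class name are my own choices for illustration.
library(caret)   # already loaded above; repeated here for completeness
pred_class<-ifelse(pred>1.5,2,1)
confusionMatrix(factor(pred_class,levels=c(1,2)),
                factor(testSplit$quality,levels=c(1,2)),
                positive="2")   # treat the rare class "2" as the positive class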


    # Preprocess the imbalanced data with SMOTE; the data is not normalized here
library(DMwR)
class(trainSplit$quality)  # the target variable must be a factor
trainSplit$quality<-factor(trainSplit$quality)
table(trainSplit$quality)
trainSplit<-SMOTE(quality~.,trainSplit,perc.over=600,perc.under=100)
prop.table(table(trainSplit$quality))


    # Rebuild the model on the SMOTE-processed split, and export the data to ARFF for Weka
ctrl<-trainControl(method="cv",number=5)
library(ipred)
library(plyr)
class(trainSplit$quality)
trainSplit$quality<-as.numeric(trainSplit$quality)   # factor levels "1"/"2" back to numeric codes 1/2, so treebag fits a regression
tbmodel<-train(quality~.,data=trainSplit,method="treebag",
               trControl=ctrl)
predictors<-names(trainSplit)[names(trainSplit)!='quality']
pred<-predict(tbmodel$finalModel,testSplit[,predictors])
library(pROC)
auc<-roc(testSplit$quality,pred)   
print(auc)    
plot(auc,ylim=c(0,1),print.thres=TRUE,main=paste('AUC',round(auc$auc[[1]],2)))
write.arff(trainSplit,file = "C:/Users/Administrator/Desktop/实验数据集/trainsplit1.arff")

write.arff(testSplit,file = "C:/Users/Administrator/Desktop/实验数据集/testsplit1.arff")

This is my first blog post, and it is almost all code. If you are interested, feel free to contact me and I will walk through the process in more detail.

