adaBoost 源码

来源:互联网 发布:如何判断存在sql注入 编辑:程序博客网 时间:2024/04/27 04:24
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# AdaBoost built on single-feature decision stumps, plus helpers for loading
# tab-separated data sets and for drawing a ROC curve.
import numpy as np


def loadSimpData():
    """Return a tiny 5-sample, 2-feature toy data set.

    Returns:
        (datMat, classLabels): a 5x2 ``numpy.matrix`` of features and a list
        of the five class labels (+1.0 / -1.0).
    """
    datMat = np.matrix([[1.0, 2.1],
                        [2.0, 1.1],
                        [1.3, 1.0],
                        [1.0, 1.0],
                        [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels


def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample by thresholding a single feature.

    Args:
        dataMatrix: 2-D sample matrix (one row per sample).
        dimen: index of the feature (column) to threshold.
        threshVal: threshold value.
        threshIneq: ``'lt'`` labels samples with ``feature <= threshVal`` as
            -1; any other value labels samples with ``feature > threshVal``
            as -1.

    Returns:
        An ``(m, 1)`` ndarray of +1.0 / -1.0 predictions.
    """
    # Work on a plain 1-D column so the boolean mask selects rows directly,
    # whether the caller passed a matrix or an ndarray.
    column = np.asarray(dataMatrix)[:, dimen]
    if threshIneq == 'lt':
        negative = column <= threshVal
    else:
        negative = column > threshVal
    retArray = np.ones((column.shape[0], 1))  # everything starts as +1
    retArray[negative] = -1.0
    return retArray


def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Every feature is scanned at 12 evenly spaced thresholds (from one step
    below its minimum up to its maximum) with both inequality directions.

    Args:
        dataArr: 2-D samples (anything ``numpy.asmatrix`` accepts).
        classLabels: sequence of +1 / -1 labels, one per sample.
        D: ``(m, 1)`` column of sample weights.

    Returns:
        (bestStump, minError, bestClasEst): a dict with keys ``'dim'``,
        ``'thresh'`` and ``'ineq'``; the weighted error of that stump (a 1x1
        matrix); and its ``(m, 1)`` predictions.
    """
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    weights = np.asmatrix(D)
    m, n = np.shape(dataMatrix)
    numSteps = 10.0  # number of steps between a feature's min and max
    bestStump = {}
    bestClasEst = np.asmatrix(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 where the prediction is wrong, 0 where it is right.
                errArr = np.asmatrix((predictedVals != labelMat).astype(float))
                weightedError = weights.T * errArr
                # Strict '<' keeps the first stump found among ties.
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: 2-D training samples.
        classLabels: sequence of +1 / -1 labels.
        numIt: maximum number of boosting rounds; training stops early once
            the ensemble's training error reaches 0.

    Returns:
        (weakClassArr, aggClassEst): the list of stump dicts (each with
        ``'dim'``, ``'thresh'``, ``'ineq'`` and ``'alpha'``) and the ``(m, 1)``
        matrix of accumulated, alpha-weighted votes on the training set.
    """
    weakClassArr = []
    m = np.shape(dataArr)[0]
    labelCol = np.asmatrix(classLabels).T
    D = np.asmatrix(np.ones((m, 1)) / m)  # start from uniform sample weights
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D: ", D.T)
        # buildStump reports the error as a 1x1 matrix; pull out the scalar
        # explicitly instead of relying on float() of a 2-D array.
        err = np.asarray(error, dtype=float).item()
        # The 1e-16 floor avoids a division by zero for a perfect stump.
        alpha = float(0.5 * np.log((1.0 - err) / max(err, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("classEst: ", classEst.T)
        # Re-weight: misclassified samples grow, correct ones shrink.
        expon = np.multiply(-1 * alpha * labelCol, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst = aggClassEst + alpha * classEst
        print("aggClassEst: ", aggClassEst.T)
        aggErrors = np.multiply(np.sign(aggClassEst) != labelCol,
                                np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate, "\n")
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst


def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained list of weak classifiers.

    Args:
        datToClass: 2-D samples to classify.
        classifierArr: stump dicts as produced by ``adaBoostTrainDS``.

    Returns:
        An ``(m, 1)`` matrix holding the sign of the alpha-weighted vote.
        The running vote is printed after each weak classifier.
    """
    dataMatrix = np.asmatrix(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'],
                                 stump['thresh'], stump['ineq'])
        aggClassEst = aggClassEst + stump['alpha'] * classEst
        print(aggClassEst)
    return np.sign(aggClassEst)


def loadDataSet(fileName):
    """Load a tab-separated data file.

    The number of columns is taken from the first line. In every row all
    leading columns are features and the last column is the class label.
    Blank lines are skipped.

    Args:
        fileName: path of the file to read.

    Returns:
        (dataMat, labelMat): list of feature rows and list of labels, all
        converted to ``float``.
    """
    dataMat = []
    labelMat = []
    # Read once and close the handle deterministically.
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t')) - 1
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue  # tolerate blank / trailing empty lines
        curLine = stripped.split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def plotROC(predStrengths, classLabels):
    """Draw the ROC curve for the given scores and print/return its AUC.

    Args:
        predStrengths: one score per sample (a larger score means more likely
            positive); a 1 x m matrix or any array flattening to length m.
        classLabels: sequence of labels; 1.0 marks the positive class.

    Returns:
        The area under the curve as a float.

    Raises:
        ValueError: if the labels do not contain both classes, since the
            axis step sizes would be undefined.
    """
    import matplotlib.pyplot as plt

    numPosClas = int(np.sum(np.array(classLabels) == 1.0))
    numNegClas = len(classLabels) - numPosClas
    if numPosClas == 0 or numNegClas == 0:
        raise ValueError("plotROC needs at least one positive and one "
                         "negative sample")
    yStep = 1 / float(numPosClas)  # one positive moves the curve down by this
    xStep = 1 / float(numNegClas)  # one negative moves the curve left by this
    sortedIndicies = np.asarray(predStrengths).ravel().argsort()

    # The curve is traced from (1, 1) towards (0, 0), lowest score first.
    cur = (1.0, 1.0)
    xs = [cur[0]]
    ys = [cur[1]]
    ySum = 0.0  # sum of the curve heights at every horizontal step
    for index in sortedIndicies.tolist():
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum = ySum + cur[1]
        cur = (cur[0] - delX, cur[1] - delY)
        xs.append(cur[0])
        ys.append(cur[1])

    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    ax.plot(xs, ys, c='b')
    ax.plot([0, 1], [0, 1], 'b--')  # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    ax.axis([0, 1, 0, 1])
    plt.show()
    auc = ySum * xStep  # rectangles of width xStep under the curve
    print("the Area Under the Curve is: ", auc)
    return auc

原创粉丝点击