AdaBoost算法学习笔记

来源:互联网 发布:微机原理接口编程题 编辑:程序博客网 时间:2024/06/07 02:26

本文参考自《机器学习实战》

其中 adaBoostTrainDS() 函数的返回值要修改为同时返回 aggClassEst.T(即 return weakClassArr, aggClassEst.T),否则 plotROC() 得不到各样本的预测强度,ROC 曲线会画不出来


# coding=utf-8
"""AdaBoost built on single-level decision trees (decision stumps).

The module provides helpers to load data, train a boosted ensemble of
stumps, classify new samples with it and plot the ROC curve of the
training predictions.  Labels are expected to be +1.0 / -1.0.
"""
import numpy as np


def loadSimpData():
    """Return a tiny toy data set: a 5x2 matrix and its five +/-1 labels."""
    dataMat = np.matrix([[1.0, 2.1],
                         [2.0, 1.1],
                         [1.3, 1.0],
                         [1.0, 1.0],
                         [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataMat, classLabels


def loadDataSet(filename):
    """Load a tab-separated data file.

    The number of columns is taken from the first non-blank line.  On every
    row the last field is the label and the preceding fields are the
    features.  Blank lines are skipped.

    Args:
        filename: path of the tab-separated text file.

    Returns:
        (dataMat, labelMat): a list of feature lists and a list of labels,
        all values converted to float.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # A single ``with`` block replaces the two never-closed open() calls.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                continue
            if numFeat is None:
                # Counted on the raw line, exactly as the first-line probe
                # of the original implementation did.
                numFeat = len(line.split('\t'))
            currline = stripped.split('\t')
            dataMat.append([float(currline[i]) for i in range(numFeat - 1)])
            labelMat.append(float(currline[-1]))
    return dataMat, labelMat


def stumpClassify(dataMAtrix, dimen, threshVal, threshIneq):
    """Classify every sample by thresholding one feature.

    Args:
        dataMAtrix: 2-D data set (matrix or array), one sample per row.
        dimen: index of the feature column to test.
        threshVal: threshold value.
        threshIneq: 'lt' labels samples with value <= threshVal as -1;
            any other string labels samples with value > threshVal as -1.

    Returns:
        An (m, 1) float array of +1.0 / -1.0 predictions.
    """
    # Working on a plain 1-D column lets both np.matrix and ndarray inputs
    # share the same masking code.
    column = np.asarray(dataMAtrix)[:, dimen]
    retArray = np.ones((column.shape[0], 1))
    if threshIneq == 'lt':
        mask = column <= threshVal
    else:
        mask = column > threshVal
    retArray[mask, 0] = -1.0
    return retArray


def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Every feature is scanned with thresholds from one step below its minimum
    up to its maximum (10 steps across the range), in both inequality
    directions.

    Args:
        dataArr: data set, one sample per row.
        classLabels: sequence of +1.0 / -1.0 labels, one per sample.
        D: (m, 1) column of sample weights.

    Returns:
        (bestStump, minError, bestClassEnt): a dict with keys 'dim',
        'thresh' and 'ineq'; the weighted error of that stump as a 1x1
        matrix; and the stump's (m, 1) predictions.
    """
    dataMatrix = np.mat(dataArr)
    labelMat = np.mat(classLabels).T
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClassEnt = np.mat(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality direction.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 where the prediction is wrong, 0 where it is right.
                errArr = np.mat(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClassEnt = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEnt


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Training stops after numIt rounds or as soon as the aggregated
    training error reaches zero.  The error rate is printed every round.

    Args:
        dataArr: training data, one sample per row.
        classLabels: sequence of +1.0 / -1.0 labels, one per sample.
        numIt: maximum number of boosting rounds.

    Returns:
        (weakClassArr, aggClassEst.T): the list of stump dicts (each with an
        extra 'alpha' weight) and the aggregated prediction strengths on the
        training set as a 1 x m row matrix, which is what plotROC consumes.
    """
    weakClassArr = []
    m = np.shape(dataArr)[0]
    labelCol = np.mat(classLabels).T
    D = np.mat(np.ones((m, 1)) / m)  # start from uniform sample weights
    aggClassEst = np.mat(np.zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands back a 1x1 matrix; pull the scalar out explicitly
        # so the arithmetic below works on plain floats.
        err = float(np.asarray(error).item())
        # The floor of 1e-16 avoids a division by zero for a perfect stump.
        alpha = float(0.5 * np.log((1.0 - err) / max(err, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Re-weight: misclassified samples gain weight, correct ones lose it.
        expon = np.multiply(-1 * alpha * labelCol, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        aggErrors = np.multiply(np.sign(aggClassEst) != labelCol,
                                np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: %s \n" % errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst.T


# Quick sanity check on the toy data:
#   dataMat, classLabels = loadSimpData()
#   D = np.mat(np.ones((5, 1)) / 5)
#   bestStump, minError, bestClassEnt = buildStump(dataMat, classLabels, D)
#   classifierArray = adaBoostTrainDS(dataMat, classLabels, 9)


def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained list of stumps.

    Args:
        datToClass: one sample or a sequence of samples.
        classifierArr: list of stump dicts with keys 'dim', 'thresh',
            'ineq' and 'alpha', as produced by adaBoostTrainDS.

    Returns:
        An (m, 1) matrix holding the sign of the alpha-weighted vote.
    """
    dataMatrix = np.mat(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'],
                                 stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
    return np.sign(aggClassEst)


def plotROC(predStrengths, classLabels):
    """Plot the ROC curve and print the area under it.

    Args:
        predStrengths: prediction strength of every sample.  A 1 x m row
            matrix (as returned by adaBoostTrainDS), an m x 1 column or a
            1-D array are all accepted.
        classLabels: true labels; entries equal to 1.0 count as positive.

    Raises:
        ZeroDivisionError: if classLabels has no positive or no negative
            entries.
    """
    import matplotlib.pyplot as plt

    cur = (1.0, 1.0)  # the curve is drawn from (1, 1) down to (0, 0)
    ySum = 0.0
    numPosClas = np.sum(np.array(classLabels) == 1.0)
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(len(classLabels) - numPosClas)
    # Flattening first makes the ordering independent of the orientation
    # of predStrengths.
    sortedIndicies = np.asarray(predStrengths).ravel().argsort()
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    for index in sortedIndicies.tolist():
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            # Each step along x adds one rectangle of the current height.
            ySum += cur[1]
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print("the Area under the Curve is : %s" % (ySum * xStep))


if __name__ == '__main__':
    dataArr, labelArr = loadDataSet('horseColicTraining2.txt')
    classifierArray, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 10)
    testArr, testLabelArr = loadDataSet('horseColicTest2.txt')
    prediction10 = adaClassify(testArr, classifierArray)
    # Sized from the data instead of a hard-coded 67 test samples.
    errArr = np.mat(np.ones((len(testLabelArr), 1)))
    print(errArr[prediction10 != np.mat(testLabelArr).T].sum())
    plotROC(aggClassEst, labelArr)

原创粉丝点击