代码注释:机器学习实战第7章 利用AdaBoost元算法提高分类性能

来源:互联网 发布:淘宝a店是什么店 编辑:程序博客网 时间:2024/05/16 10:03

写在开头的话:在学习《机器学习实战》的过程中发现书中很多代码并没有注释,这对新入门的同学是一个挑战,特此贴出我对代码做出的注释,仅供参考,欢迎指正。

1、基于单层决策树构建弱分类器

# coding: gbk
"""Weak classifiers built from single-level decision trees (decision stumps).

Machine Learning in Action, Chapter 7: AdaBoost meta-algorithm.
"""
from numpy import *


def loadSimpData():
    """Return a tiny hard-coded data set for experimenting with stumps.

    Returns:
        datMat: 5x2 matrix of feature values.
        classLabels: list of class labels in {+1.0, -1.0}.
    """
    datMat = matrix([[1. , 2.1],
                     [2. , 1.1],
                     [1.3, 1. ],
                     [1. , 1. ],
                     [2. , 1. ]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels


def loadDataSet(fileName):
    """Load a tab-delimited text file of floats.

    The last column of each row is the class label; all preceding
    columns are features.

    Args:
        fileName: path to the tab-delimited data file.

    Returns:
        (dataMat, labelMat): list of feature rows and list of labels.
    """
    numFeat = len(open(fileName).readline().split('\t'))  # number of fields
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify samples by comparing a single feature against a threshold.

    Args:
        dataMatrix: m x n matrix of samples.
        dimen: index of the feature (column) to test.
        threshVal: threshold value.
        threshIneq: 'lt' labels samples with feature <= threshold as -1.0;
            any other value labels samples with feature > threshold as -1.0.

    Returns:
        m x 1 array of predicted labels in {+1.0, -1.0}.
    """
    retArray = ones((shape(dataMatrix)[0], 1))  # default every sample to +1.0
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray


def buildStump(dataArr, classLabels, D):
    """Find the best decision stump for the data under weight vector D.

    Args:
        dataArr: m x n data matrix (or nested list).
        classLabels: true labels in {+1.0, -1.0}.
        D: m x 1 column vector of per-sample weights.

    Returns:
        (bestStump, minError, bestClasEst): dict describing the best stump
        ('dim', 'thresh', 'ineq'), its D-weighted error, and its m x 1
        vector of predictions.
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)  # m samples, n features
    numSteps = 10.0  # number of threshold steps to try per feature
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # start at +infinity so any real error is an improvement
    for i in range(n):  # try every feature
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j runs from -1 to int(numSteps): thresholds sweep from just below
        # rangeMin up to rangeMax
        for j in range(-1, int(numSteps) + 1):
            for inequal in ['lt', 'gt']:  # try both inequality directions
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # errArr: 1 where misclassified, 0 where correct
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # weighted error under the sample-weight vector D
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst


2、完整AdaBoost算法的实现、测试算法及应用算法

def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
    """AdaBoost training using decision stumps (DS) as weak learners.

    Args:
        dataArr: m x n data matrix (or nested list).
        classLabels: true labels in {+1.0, -1.0}.
        numIt: maximum number of boosting iterations (default 40).

    Returns:
        (weakClassArr, aggClassEst): the list of trained stumps (each dict
        carries 'dim', 'thresh', 'ineq' and its vote weight 'alpha'), and
        the m x 1 aggregate class estimate over the training set.
    """
    weakClassArr = []  # the ensemble of weak classifiers being built
    m = shape(dataArr)[0]  # number of samples
    D = mat(ones((m, 1)) / m)  # start with uniform per-sample weights
    aggClassEst = mat(zeros((m, 1)))  # running weighted vote of all stumps
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # alpha is this stump's vote weight; max(error, 1e-16) guards
        # against division by zero when the stump is perfect
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Reweight samples: exponent is -alpha where the stump agreed with
        # the true label, +alpha where it disagreed, so misclassified
        # samples gain weight and correct ones lose it
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()  # renormalize to a probability distribution
        # Accumulate this stump's weighted vote into the ensemble estimate
        aggClassEst += alpha * classEst
        # sign(aggClassEst) is the ensemble's current prediction;
        # count how many samples it still gets wrong
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        if errorRate == 0.0:
            break  # training error reached zero; no need for more stumps
    return weakClassArr, aggClassEst


def adaClassify(datToClass, classifierArr):
    """Classify data with a trained ensemble of decision stumps.

    Args:
        datToClass: data to classify, one sample per row.
        classifierArr: list of stump dicts produced by adaBoostTrainDS.

    Returns:
        m x 1 matrix of predicted labels in {+1.0, -1.0} (0.0 only if the
        weighted votes cancel exactly).
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    # Sum each stump's prediction scaled by its vote weight alpha
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], \
                                 classifierArr[i]['thresh'], classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha'] * classEst
    # sign: -1 for negative totals, +1 for positive, 0 for exact ties
    return sign(aggClassEst)


3、ROC曲线的绘制及AUC计算函数

def plotROC(predStrengths, classLabels):
    """Plot the ROC curve for a classifier and print the AUC.

    Args:
        predStrengths: 1 x m row matrix of prediction strengths
            (e.g. aggClassEst.T from adaBoostTrainDS).
        classLabels: true labels in {+1.0, -1.0}.

    Returns:
        None. Shows a matplotlib figure and prints the area under the curve.
    """
    import matplotlib.pyplot as plt
    cur = (1.0, 1.0)  # start at (1, 1): initially every sample is predicted +1
    ySum = 0.0  # accumulates curve heights for AUC (rectangle rule)
    numPosClas = sum(array(classLabels) == 1.0)  # count of true positives
    yStep = 1 / float(numPosClas)  # true-positive-rate step per positive sample
    xStep = 1 / float(len(classLabels) - numPosClas)  # FPR step per negative sample
    sortedIndicies = predStrengths.argsort()  # indices, ascending strength
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    # Walk from weakest to strongest prediction, flipping one sample at a
    # time from "predicted +1" to "predicted -1"
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            # A true positive flips: TPR drops by one step, FPR unchanged
            delX = 0
            delY = yStep
        else:
            # A true negative flips: FPR drops by one step, TPR unchanged
            delX = xStep
            delY = 0
            # Add current height; the common width xStep is applied at the end
            ySum += cur[1]
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c = 'b')
        cur = (cur[0] - delX, cur[1] - delY)  # move to the next curve point
    ax.plot([0, 1], [0, 1], 'b--')  # diagonal: the random-guess baseline
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    ax.axis([0, 1, 0, 1])
    plt.show()
    # print() with a single %-formatted string works on both Python 2 and 3
    # (the original used the Python 2-only print statement)
    print("the Area Under the Curve is: %s" % (ySum * xStep))


0 0
原创粉丝点击