
来源:互联网 发布:上古卷轴5怪物数据 编辑:程序博客网 时间:2024/05/19 15:21

这里还有一种“新”分类算法,就是把多个分类器组合成一个分类器,主要有bagging 和boosting两种。
1. bagging每个训练集都不一样,而boosting每个训练集都一样。
2. bagging最终投票时,每个分类器权重都一样,而boosting最终投票时,每个分类器权重都不一样。
1. 通过训练数据训练出一个最优分类器。
2. 查看分类器的错误率,把错分类的样本数据提高一定权重,分类正确的样本降低一定权重。然后按每个数据样本的不同权重来训练新的最优分类器。
3. 最终投票结果由这些分类器按不同权重来投票决定,其中各分类器的权重,按其预测的准确性来决定。




from numpy import *  # noqa: F401,F403 - kept so names this module used to expose remain available
import numpy as np


def loadSimpData():
    """Return a tiny hard-coded data set for experimenting with the stumps.

    Returns:
        A ``(datMat, classLabels)`` pair: a 5x2 ``numpy.matrix`` of features
        and a list of five labels, each either ``1.0`` or ``-1.0``.
    """
    datMat = np.matrix([[1.0, 2.1],
                        [2.0, 1.1],
                        [1.3, 1.0],
                        [1.0, 1.0],
                        [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels


def loadDataSet(fileName):
    """Load a tab-separated numeric data file.

    Every non-blank line holds the feature values followed by the class
    label in the last column. The number of columns is taken from the first
    line of the file.

    Args:
        fileName: Path of the file to read.

    Returns:
        A ``(dataMat, labelMat)`` pair: a list of feature rows (lists of
        floats) and a list of float labels.
    """
    dataMat = []
    labelMat = []
    # Read once inside a context manager so the handle is always closed.
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t'))
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Blank lines (typically a trailing newline) carry no sample.
            continue
        curLine = stripped.split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample by thresholding a single feature.

    Args:
        dataMatrix: 2-D collection of samples (one row per sample).
        dimen: Index of the feature column to compare.
        threshVal: Threshold value.
        threshIneq: ``'lt'`` labels samples with ``feature <= threshVal`` as
            -1; any other value labels samples with ``feature > threshVal``
            as -1.

    Returns:
        An ``(m, 1)`` float array holding ``1.0`` or ``-1.0`` per sample.
    """
    column = np.asarray(dataMatrix)[:, dimen].reshape(-1, 1)
    if threshIneq == 'lt':
        negative = column <= threshVal
    else:
        negative = column > threshVal
    return np.where(negative, -1.0, 1.0)


def buildStump(dataArr, classLabels, D, verbose=True):
    """Find the single-feature decision stump with the lowest weighted error.

    For each feature, thresholds are scanned in ``numSteps`` equal steps
    across the feature's range (plus one step below the minimum), trying
    both inequality directions.

    Args:
        dataArr: 2-D collection of samples (one row per sample).
        classLabels: Sequence of labels, one per sample.
        D: ``(m, 1)`` column of sample weights.
        verbose: When true (the default), print every candidate split.

    Returns:
        ``(bestStump, minError, bestClasEst)``: a dict with the keys
        ``'dim'``, ``'thresh'`` and ``'ineq'``; the weighted error of that
        stump as a 1x1 matrix; and its ``(m, 1)`` predictions.
    """
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    weights = np.asmatrix(D)
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = np.asmatrix(np.zeros((m, 1)))
    minError = np.inf
    minErrorValue = np.inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):  # less than and greater than
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = np.asmatrix(
                    np.where(predictedVals == np.asarray(labelMat), 0.0, 1.0))
                weightedError = weights.T * errArr
                # Pull the scalar out explicitly instead of relying on the
                # implicit 1x1-matrix-to-float conversion.
                errorValue = weightedError[0, 0]
                if verbose:
                    print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" \
                          % (i, threshVal, inequal, errorValue))
                if errorValue < minErrorValue:
                    minErrorValue = errorValue
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst


def adaBoostTrainDS(dataArr, classLabels, numIt=40, verbose=True):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: 2-D collection of samples (one row per sample).
        classLabels: Sequence of labels, one per sample.
        numIt: Maximum number of boosting rounds.
        verbose: When true (the default), print the progress of each round.

    Returns:
        ``(weakClassArr, aggClassEst)``: the list of stump dicts (each with
        an added ``'alpha'`` weight) and the ``(m, 1)`` matrix of aggregated,
        alpha-weighted class estimates on the training data.
    """
    weakClassArr = []
    m = np.shape(dataArr)[0]
    labelMat = np.asmatrix(classLabels).T
    D = np.asmatrix(np.ones((m, 1)) / m)  # every sample starts with the same weight
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D,
                                                verbose=verbose)
        if verbose:
            print("D:", D.T)
        errorValue = float(np.asarray(error).reshape(-1)[0])
        # Clamp the denominator so a perfect stump does not divide by zero.
        denominator = errorValue if errorValue > 1e-16 else 1e-16
        alpha = float(0.5 * np.log((1.0 - errorValue) / denominator))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        if verbose:
            print("classEst: ", classEst.T)
        # Raise the weight of misclassified samples and lower the others.
        expon = np.multiply(-1 * alpha * labelMat, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()  # keep the weights summing to 1
        aggClassEst += alpha * classEst
        if verbose:
            print("aggClassEst: ", aggClassEst.T)
        aggErrors = np.multiply(np.sign(aggClassEst) != labelMat,
                                np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        if verbose:
            print("total error: ", errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst


def adaClassify(datToClass, classifierArr, verbose=True):
    """Classify samples with a trained list of weighted stumps.

    Args:
        datToClass: 2-D collection of samples (one row per sample).
        classifierArr: Stump dicts with the keys ``'dim'``, ``'thresh'``,
            ``'ineq'`` and ``'alpha'``.
        verbose: When true (the default), print the running aggregate
            estimate after each stump.

    Returns:
        An ``(m, 1)`` matrix holding the sign of the aggregated estimate.
    """
    dataMatrix = np.asmatrix(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'],
                                 stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
        if verbose:
            print(aggClassEst)
    return np.sign(aggClassEst)


def plotROC(predStrengths, classLabels):
    """Plot the ROC curve and print the area under it.

    Args:
        predStrengths: Prediction strengths, one per sample; any array
            shape is accepted and flattened.
        classLabels: Sequence of labels; ``1.0`` marks the positive class.

    Raises:
        ZeroDivisionError: If ``classLabels`` contains no positive or no
            non-positive sample.
    """
    import matplotlib.pyplot as plt

    cur = (1.0, 1.0)  # the cursor starts in the top-right corner
    ySum = 0.0
    numPosClas = int(np.sum(np.asarray(classLabels) == 1.0))
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(len(classLabels) - numPosClas)
    # Flatten so row vectors, column vectors and 1-D arrays all work.
    sortedIndicies = np.asarray(predStrengths).ravel().argsort()
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    for index in sortedIndicies.tolist():
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum += cur[1]  # accumulate the heights of the AUC rectangles
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0, 1, 0, 1])
    print("the Area Under the Curve is: ", ySum * xStep)
    plt.show()  # without this the figure is never displayed when run as a script


if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default data file.
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = 'C:\\Users\\Administrator\\Desktop\\data\\horseColicTraining2.txt'
    dataMat, classLabels = loadDataSet(filename)
    weakClassArr, aggClassEst = adaBoostTrainDS(dataMat, classLabels, 50)
    plotROC(aggClassEst.T, classLabels)


