Machine Learning in Action, Code Walkthrough (5): Logistic Regression

from numpy import *

# ---------- Load the data set ----------
def loadDataSet():
    dataMat = []; labelMat = []
    fr = open('testSet.txt')                 # read the samples
    for line in fr.readlines():
        lineArr = line.strip().split()       # strip leading/trailing whitespace, split on spaces
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])   # prepend x0 = 1.0 for the intercept term
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat                 # feature matrix and label list

# ---------- Sigmoid function ----------
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))

# ---------- Batch gradient ascent ----------
def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)              # convert to a NumPy matrix
    labelMat = mat(classLabels).transpose()  # convert to a NumPy matrix; transpose the row vector into a column vector
    m, n = shape(dataMatrix)                 # numbers of rows and columns
    alpha = 0.001                            # step size
    maxCycles = 500                          # number of iterations
    weights = ones((n, 1))                   # initialize the regression coefficients to 1
    for k in range(maxCycles):
        # w := w + alpha * grad f(w), where grad f(w) = dataMatrix.transpose() * error
        h = sigmoid(dataMatrix * weights)    # predicted values
        error = labelMat - h                 # difference between the true and predicted classes
        weights = weights + alpha * dataMatrix.transpose() * error   # update the coefficients with that difference
    return weights

# ---------- Plot the data set and the Logistic-regression best-fit line ----------
def plotBestFit(weights):
    import matplotlib.pyplot as plt
    weights = asarray(weights).flatten()     # accept a matrix from gradAscent or a 1-D array from the stochastic versions
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]   # the boundary w0 + w1*x1 + w2*x2 = 0, solved for x2
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()

# ---------- Stochastic gradient ascent ----------
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)
    for i in range(m):                       # online learning: update the coefficients one sample at a time
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights

# ---------- Improved stochastic gradient ascent, which damps back-and-forth oscillation of the coefficients ----------
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))           # a mutable list, so used samples can be removed below
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01 # alpha shrinks as the iterations proceed, easing the oscillations
            randIndex = int(random.uniform(0, len(dataIndex)))   # pick a random sample to break up periodic fluctuations
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del dataIndex[randIndex]         # drop the sample that was just used
    return weights

# ---------- Predicting the survival of horses with colic ----------
# Compute the sigmoid of the inner product of a feature vector and the weights
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5: return 1.0                # sigmoid value above 0.5 means class 1
    else: return 0.0                         # otherwise class 0

# Training and test routine for the Logistic-regression classifier
def colicTest():
    frTrain = open('horseColicTraining.txt') # open the training set
    frTest = open('horseColicTest.txt')      # open the test set
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():         # read the training set
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)   # train with stochastic gradient ascent
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():          # read the test set
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):   # count the misclassifications
            errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print('the error rate of this test is: %f' % errorRate)
    return errorRate

# Test harness: average the error rate over 10 runs
def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print('after %d iterations the average error rate is: %f' % (numTests, errorSum / float(numTests)))
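
To tie the pieces together, a minimal driver can be appended to the script. This is only a sketch: it assumes testSet.txt, horseColicTraining.txt, and horseColicTest.txt (the file names used in the listing) sit in the working directory.

if __name__ == '__main__':
    dataArr, labelMat = loadDataSet()
    weights = gradAscent(dataArr, labelMat)              # batch gradient ascent
    plotBestFit(weights)                                 # visualize the decision boundary
    weights = stocGradAscent1(array(dataArr), labelMat)  # improved stochastic version
    plotBestFit(weights)
    multiTest()                                          # 10-run average error on the colic data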

Summary

  1. The goal of Logistic regression is to find the best-fit parameters of a nonlinear function, the sigmoid; the search can be carried out by an optimization algorithm. The most commonly used one is gradient ascent, which in turn can be simplified to stochastic gradient ascent (the update rule is written out after this list).
  2. Stochastic gradient ascent performs about as well as batch gradient ascent while using far less computation. It is also an online algorithm: it can update the parameters as each new example arrives, without re-reading the entire data set for batch processing (see the sketch after this list).
  3. An important problem in machine learning is how to handle missing data. There is no standard answer; it depends on the needs of the application, and each of the existing solutions has its own pros and cons (one common policy is sketched below).
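
In symbols, the update that gradAscent implements is, with data matrix X, label column vector y, and step size alpha (the code's dataMatrix, labelMat, and alpha):

$$ \sigma(z) = \frac{1}{1 + e^{-z}}, \qquad w \leftarrow w + \alpha\, X^{\top}\bigl(y - \sigma(Xw)\bigr) $$

The stochastic variants apply the same rule with X restricted to a single row.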
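Point 2 is easy to see in code: the stochastic update touches one sample at a time, so a newly arrived example can be folded in without revisiting old data. A minimal sketch, reusing sigmoid from the listing above (onlineUpdate is a hypothetical helper, not part of the book's code):

def onlineUpdate(weights, x, label, alpha=0.01):
    h = sigmoid(sum(x * weights))             # prediction for the newly arrived sample
    return weights + alpha * (label - h) * x  # one stochastic gradient-ascent step

# as each labelled sample (x, y) streams in:
#     weights = onlineUpdate(weights, x, y)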
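For point 3, the policy used with this classifier in Machine Learning in Action is to replace a missing feature value with 0: since the stochastic update scales the correction by the feature value, a zero leaves the corresponding coefficient untouched, so the missing entry neither helps nor hurts training. A minimal NumPy sketch of that choice (imputeZero is a hypothetical helper):

import numpy as np

def imputeZero(X):
    X = np.asarray(X, dtype=float)
    X[np.isnan(X)] = 0.0    # a zero feature contributes nothing to the weight update
    return X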