logRegres source code (logistic regression, from *Machine Learning in Action*, ch. 5)

Source: scraped from the internet; original page metadata removed (aggregator listing, retrieved 2024/05/18 01:40).
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Logistic regression classifiers (Machine Learning in Action, ch. 5).

Provides batch gradient ascent, plain stochastic gradient ascent, an
improved stochastic variant with a decaying step size, a plotting helper
for the 2-D demo data set, and a horse-colic classification demo.

Fixes vs. the original:
  * Python 3 compatible: ``print()`` calls, and ``dataIndex`` is a real
    list so ``del dataIndex[i]`` works (``range`` is not deletable in
    Python 3).
  * File handles are closed via ``with`` (the original leaked them).
"""
from numpy import *


def loadDataSet():
    """Load the two-feature demo data from 'testSet.txt'.

    Returns:
        dataMat: list of [1.0, x1, x2] rows — the leading 1.0 is the X0
                 term that pairs with the bias weight W0.
        labelMat: list of int class labels (0 or 1).
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the handle is closed even on a parse error.
    with open('testSet.txt') as fr:
        for line in fr:
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Logistic sigmoid 1/(1+e^-x); operates elementwise on numpy input."""
    return 1.0 / (1 + exp(-inX))


def gradAscent(dataMatIn, classLabels):
    """Batch gradient ascent for logistic regression.

    Args:
        dataMatIn: m x n sample matrix; each row starts with the X0=1 term.
        classLabels: length-m sequence of 0/1 labels.

    Returns:
        n x 1 numpy matrix of learned weights (one per column, i.e.
        features plus the bias column).
    """
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    m, n = shape(dataMatrix)
    alpha = 0.001      # fixed learning rate
    maxCycles = 500    # fixed number of full passes over the data
    weights = ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)   # m x 1 vector of predictions
        error = labelMat - h                # log-likelihood gradient term
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


def plotBestFit(wei):
    """Plot the demo samples and the learned decision boundary.

    Args:
        wei: numpy *column matrix* of weights, e.g. called as
             ``plotBestFit(mat(weights).transpose())``.
    """
    import matplotlib.pyplot as plt
    weights = wei.getA()   # matrix -> plain ndarray for scalar indexing
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        # Split coordinates by class so each class gets its own marker.
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='^')
    ax.scatter(xcord2, ycord2, s=30, c='yellow')
    x = arange(-3.0, 3.0, 0.1)
    # Boundary: w0 + w1*x1 + w2*x2 = 0 (sigmoid = 0.5), solved for x2.
    y = (-weights[0] - x * weights[1]) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


def stocGradAscent0(dataMatrix, classLabels):
    """Stochastic gradient ascent: one pass, samples in order, fixed alpha.

    Args:
        dataMatrix: m x n numpy array of samples.
        classLabels: length-m sequence of 0/1 labels.

    Returns:
        length-n numpy array of weights.
    """
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)
    for i in range(m):
        # Elementwise product + sum == dot product of one sample row.
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    Samples are drawn in random order without replacement within each
    iteration, and alpha decays with the iteration count — this damps the
    oscillation of plain SGD and converges faster.

    Args:
        dataMatrix: m x n numpy array of samples.
        classLabels: length-m sequence of 0/1 labels.
        numIter: number of passes over the data (default 150).

    Returns:
        length-n numpy array of weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # Must be a list: indices are deleted as samples are consumed
        # (a bare range() is not deletable in Python 3).
        dataIndex = list(range(m))
        for i in range(m):
            # alpha shrinks as j and i grow but never reaches 0 (+0.01),
            # so late samples still contribute.
            alpha = 4 / (1.0 + i + j) + 0.01
            randIndex = int(random.uniform(0, len(dataIndex)))
            h = sigmoid(sum(dataMatrix[randIndex] * weights))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del dataIndex[randIndex]
    return weights


def classifyVector(inX, weights):
    """Classify one feature vector: 1.0 if sigmoid(inX . weights) > 0.5."""
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:   # 0.5 is the decision threshold
        return 1.0
    else:
        return 0.0


def colicTest():
    """Train on the horse-colic training set, evaluate on the test set.

    Returns:
        The error rate (float) observed on 'horseColicTest.txt'.
    """
    trainingSet = []
    trainingLabels = []
    with open('horseColicTraining.txt') as frTrain:
        for line in frTrain:
            currLine = line.strip().split('\t')
            lineArr = []
            for i in range(21):                    # 21 features: columns 0..20
                lineArr.append(float(currLine[i]))
            trainingSet.append(lineArr)
            trainingLabels.append(float(currLine[21]))   # column 21 = label
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 100)
    # trainWeights = gradAscent(array(trainingSet), trainingLabels)
    # (batch version would make every run deterministic)
    errorCount = 0
    numTestVec = 0.0
    with open('horseColicTest.txt') as frTest:
        for line in frTest:
            numTestVec = numTestVec + 1.0          # count test samples
            currLine = line.strip().split('\t')
            lineArr = []
            for i in range(21):
                lineArr.append(float(currLine[i]))
            # Count a mismatch between prediction and ground truth.
            if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
                errorCount = errorCount + 1
    errorRate = (float(errorCount) / numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate


def multiTest():
    """Run colicTest() ten times and report the average error rate.

    Each run differs because stocGradAscent1 samples randomly.
    """
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum = errorSum + colicTest()
    print("after %d iterations the average error rate is: %f"
          % (numTests, errorSum / float(numTests)))