svmMLiA 源码

来源:互联网 发布:guide软件 编辑:程序博客网 时间:2024/06/05 20:34
#!/usr/bin/python  # -*- coding:utf-8 -*-  from numpy import *def loadDataSet(fileName):dataMat = []labelMat = []fr = open(fileName)for line in fr.readlines():lineArr = line.strip().split('\t')dataMat.append([float(lineArr[0]),float(lineArr[1])])labelMat.append(float(lineArr[2]))return dataMat,labelMatdef selectJrand(i,m):#从0~m个alpha中选择除i外的另一个alphaj = iwhile j == i:j = int(random.uniform(0,m))return jdef clipAlpha(aj,H,L):#用于调整大于H或小于L的值if aj>H:aj = Hif L>aj:aj = Lreturn ajdef smoSimple(dataMatIn, classLabels, C, toler, maxIter):#求使得对偶问题有最优解的参数alpha,实际上只有支持向量的那些alpha是有值的dataMatrix = mat(dataMatIn)labelMat = mat(classLabels).transpose()#label转置为列向量m,n = shape(dataMatrix)b = 0 #初始化balphas = mat(zeros((m,1)))#初始化alpha为一个都是0的列矩阵iter = 0while iter<maxIter:alphaPairsChanged = 0for i in range(m):#下面计算分类的公式中,已经把w换为由alpha表示的对偶形式了fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T))+b #对第i个样本的预测值,multiply是两个数组对应的元素分别相乘组成一个新的数组,所以fXi是一个值Ei = fXi-float(labelMat[i])#预测值与真实值得误差,如果误差很大,就对alpha进行优化if ((labelMat[i]*Ei<-toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] >0 )):j = selectJrand(i,m)#选一个除第i个之外的另一个alphafXj = float(multiply(alphas,labelMat).T * (dataMatrix*dataMatrix[j,:].T))+bEj = fXj-float(labelMat[j])alphaIold = alphas[i].copy()#保存改变之前的alphai和alphaj的值alphaJold = alphas[j].copy()if labelMat[i] != labelMat[j]:#H,L是什么东西,分两种情况计算H和LL = max(0,alphas[j]-alphas[i])H = min(C,C+alphas[j]-alphas[i])else:L = max(0,alphas[j]+alphas[i]-C)H = min(C,alphas[j]+alphas[i])if L == H:print "L == H"#如果L与H相等就不会发生更新continueeta = 2.0*dataMatrix[i,:]*dataMatrix[j,:].T-dataMatrix[i,:]*dataMatrix[i,:].T-dataMatrix[j,:]*dataMatrix[j,:].Tif eta>=0:print "eta>=0"#如果最优修改量大于0,也不会发生更新continuealphas[j] = alphas[j]-labelMat[j]*(Ei-Ej)/etaalphas[j] = clipAlpha(alphas[j],H,L)if abs(alphas[j] - alphaJold)<0.00001:print "j not moving enough"#如果参数j变动的很小,也不会发生更新continuealphas[i] = alphas[i]+labelMat[j]*labelMat[i]*(alphaJold - alphas[j])b1 = b-Ei-labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].Tb2 = b-Ej-labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].Tif (0<alphas[i]) and (C>alphas[i]):b = b1elif (0<alphas[j]) and (C>alphas[j]):b = b2else:b = (b1+b2)/2.0alphaPairsChanged = alphaPairsChanged+1print "iter: %d i: %d, pairs changed %d" % (iter,i,alphaPairsChanged)if alphaPairsChanged == 0:iter = iter+1#只有在数据集上遍历了maxIter次,并不发生任何修改alpha之后才会停止迭代 else:iter = 0 #如果alpha发生了优化,就让iter变为0,重新开始迭代print "iteration number: %d" % iterreturn b,alphasclass optStruct:#存储关键参数的结构体def __init__(self, dataMatIn, classLabels, C, toler, kTup):self.X = dataMatInself.labelMat = classLabelsself.C = Cself.tol = tolerself.m = shape(dataMatIn)[0]self.alphas = mat(zeros((self.m,1)))self.b = 0self.eCache = mat(zeros((self.m,2)))#误差缓存self.K = mat(zeros((self.m,self.m)))for i in range(self.m):self.K[:,i] = kernelTrans(self.X,self.X[i,:],kTup)def calcEk(oS, k):#计算预测值与真实值误差的函数#fXk = float(multiply(oS.alphas,oS.labelMat).T*(oS.X*oS.X[k,:].T))+oS.bfXk = float(multiply(oS.alphas,oS.labelMat).T*oS.K[:,k]+oS.b)Ek = fXk-float(oS.labelMat[k])return Ekdef selectJ(i, oS, Ei):#选择第二个alpha的下标并输出它的误差值maxK = -1maxDeltaE = 0Ej = 0oS.eCache[i] = [1,Ei]validEcacheList = nonzero(oS.eCache[:,0].A)[0]if len(validEcacheList)>1:for k in validEcacheList:if k == i:continueEk = calcEk(oS,k)deltaE = abs(Ei-Ek)#步长就是选出的另一个样本的误差与这个误差的差值if deltaE>maxDeltaE:#选择具有最大步长的jmaxK = kmaxDeltaE = deltaEEj = Ekreturn maxK,Ejelse:j = selectJrand(i,oS.m)Ej = calcEk(oS,j)return j,Ejdef updateEk(oS, k):#将计算出的误差存入缓存中Ek = calcEk(oS,k)oS.eCache[k] = [1,Ek]def innerL(i, oS):#完整的内循环优化进程, 输入选出的第一个alpha和相关的参数,输出是否进行了参数的优化Ei = calcEk(oS,i)if ((oS.labelMat[i]*Ei<-oS.tol) and (oS.alphas[i]<oS.C)) or ((oS.labelMat[i]*Ei>oS.tol) and (oS.alphas[i]>0)):j,Ej = selectJ(i,oS,Ei)alphaIold = oS.alphas[i].copy()alphaJold = oS.alphas[j].copy()if oS.labelMat[i] != oS.labelMat[j]:L = max(0,oS.alphas[j]-oS.alphas[i])H = min(oS.C,oS.C+oS.alphas[j]-oS.alphas[i])else:L = max(0, oS.alphas[j]+oS.alphas[i]-oS.C)H = min(oS.C, oS.alphas[j]+oS.alphas[i])if L == H:print "l == H"return 0#eta = 2.0*oS.X[i,:]*oS.X[j,:].T - oS.X[i,:]*oS.X[i,:].T - oS.X[j,:]*oS.X[j,:].Teta = 2.0 * oS.K[i,j]-oS.K[i,i]-oS.K[j,j]if eta>=0:print "eta>=0"return 0oS.alphas[j] = oS.alphas[j]-oS.labelMat[j]*(Ei-Ej)/etaoS.alphas[j] = clipAlpha(oS.alphas[j],H,L)updateEk(oS,j)if(abs(oS.alphas[j]-alphaJold)<0.00001):print "j not moving enough"return 0oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold-oS.alphas[j])updateEk(oS,i)#b1 = oS.b- Ei-oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T-oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T#b2 = oS.b- Ej-oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T-oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].Tb1 = oS.b-Ei-oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i]-oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]b2 = oS.b-Ej-oS.labelMat[i]*(oS.alphas[i]-alphaJold)*oS.K[i,j]-oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]if (0<oS.alphas[i]) and (oS.C>oS.alphas[i]):oS.b = b1elif (0<oS.alphas[j]) and (oS.C>oS.alphas[j]):oS.b = b2else:oS.b = (b1+b2)/2.0return 1else:return 0def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup = ('lin', 0)):oS = optStruct(mat(dataMatIn), mat(classLabels).transpose(), C, toler, kTup)#构建数据结构来容纳所有的数据iter = 0entireSet = TruealphaPairsChanged = 0while (iter<maxIter) and ((alphaPairsChanged>0) or (entireSet)):#当循环次数大于最大迭代次数或遍历整个集合都未对任意alpha进行修改时就退出循环alphaPairsChanged = 0 #优化后的外部函数将一次迭代定义为一次循环过程,不管alpha是否发生的改变if entireSet:#之前的外部函数只将参数alpha不发生改变的一次循环计为一次迭代,所以迭代的很慢for i in range(oS.m):#遍历数据集中的每个ialphaPairsChanged = alphaPairsChanged+innerL(i,oS)print "fullSet, iter: %d i: %d, pairs changed %d" %(iter,i,alphaPairsChanged)#原书此处有错误,少了缩进iter = iter + 1else:#寻找不在边界0或C上的值nonBoundIs = nonzero((oS.alphas.A>0)*(oS.alphas.A<C))[0]for i in nonBoundIs:alphaPairsChanged = alphaPairsChanged+innerL(i,oS)print "non-bound, iter: %d i: %d, pairs changed %d" %(iter,i,alphaPairsChanged)iter = iter+1if entireSet:entireSet = Falseelif alphaPairsChanged == 0:entireSet = Trueprint "iteration number: %d" % iterreturn oS.b, oS.alphasdef calcWs(alphas,dataArr,classLabels):X = mat(dataArr)labelMat = mat(classLabels).transpose()m,n = shape(X)w = zeros((n,1))for i in range(m):w = w+multiply(alphas[i]*labelMat[i], X[i,:].T)return wdef kernelTrans(X, A, kTup):#根据所选用的核函数,初始化每两个样本之间的向量内积结果,kTup第一个参数是核函数的类型,第二个参数是核函数中的可选参数m,n = shape(X)K = mat(zeros((m,1)))if kTup[0] == 'lin':#线性的多项式核函数K = X * A.Telif kTup[0] == 'rbf':#径向基函数,高斯核函数for j in range(m):deltaRow = X[j,:]-AK[j] = deltaRow*deltaRow.T #K相当于分子,做了一个平方K = exp(K/(-1*kTup[1]**2))#kTup[1]相当于高斯核函数中的分母中的符号,numpy中的除号/,是对数组中的每个元素进行除法操作else:raise NameError('we have a problem kernel is not recognized')return Kdef testRbf(k1=1.3):dataArr, labelArr = loadDataSet('testSetRBF.txt')b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf',k1))datMat = mat(dataArr)labelMat = mat(labelArr).transpose()svInd = nonzero(alphas.A>0)[0]#值大于0的参数alpha的数量sVs = datMat[svInd]#构建支持向量矩阵labelSV = labelMat[svInd]#支持向量对应的分类print "there are %d Support Vectors" % shape(sVs)[0]m,n = shape(datMat)errorCount = 0for i in range(m):kernelEval = kernelTrans(sVs,datMat[i,:],('rbf',k1))#计算数据集中第i个样本与样本集合的kernel结果predict = kernelEval.T * multiply(labelSV,alphas[svInd])+b #计算第i个样本的预测结果if sign(predict)!=sign(labelArr[i]):errorCount = errorCount+1print "the training error rate is: %f" %(float(errorCount)/m)#先计算训练误差dataArr,labelArr = loadDataSet('testSetRBF2.txt')errorCount = 0datMat = mat(dataArr)labelMat = mat(labelArr).transpose()m,n = shape(datMat)for i in range(m):kernelEval = kernelTrans(sVs,datMat[i,:],('rbf',k1))predict = kernelEval.T * multiply(labelSV,alphas[svInd])+bif sign(predict)!=sign(labelArr[i]):errorCount = errorCount+1print "the test error rate is: %f" %(float(errorCount)/m)#再计算测试误差def img2vector(filename):#从图像转换为向量    returnVect = zeros((1,1024))    fr = open(filename)    for i in range(32):        lineStr = fr.readline()        for j in range(32):            returnVect[0,32*i+j] = int(lineStr[j])    return returnVectdef loadImages(dirName):#载入图片,返回图片的数据矩阵和分类    from os import listdir    hwLabels = []    trainingFileList = listdir(dirName)           #load the training set    m = len(trainingFileList)    trainingMat = zeros((m,1024))    for i in range(m):        fileNameStr = trainingFileList[i]        fileStr = fileNameStr.split('.')[0]     #take off .txt        classNumStr = int(fileStr.split('_')[0])        if classNumStr == 9: hwLabels.append(-1)#如果是9就是负类,另外的数字是正类,因为svm是二分类算法        else: hwLabels.append(1)        trainingMat[i,:] = img2vector('%s/%s' % (dirName, fileNameStr))    return trainingMat, hwLabels    def testDigits(kTup=('rbf', 10)):    dataArr,labelArr = loadImages('trainingDigits')    b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)    datMat=mat(dataArr)    labelMat = mat(labelArr).transpose()    svInd=nonzero(alphas.A>0)[0]    sVs=datMat[svInd] #组成支持向量的矩阵    labelSV = labelMat[svInd]#支持向量的分类    print "there are %d Support Vectors" % shape(sVs)[0]    m,n = shape(datMat)    errorCount = 0    for i in range(m):        kernelEval = kernelTrans(sVs,datMat[i,:],kTup)#计算支持向量与第i个样本之间的核函数结果        predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b #计算第i个样本的预测结果,只需要算与支持向量之间的结果,因为其他样本的alpha都为0        if sign(predict)!=sign(labelArr[i]):         errorCount += 1    print "the training error rate is: %f" % (float(errorCount)/m)    dataArr,labelArr = loadImages('testDigits')    errorCount = 0    datMat=mat(dataArr)    labelMat = mat(labelArr).transpose()    m,n = shape(datMat)    for i in range(m):        kernelEval = kernelTrans(sVs,datMat[i,:],kTup)        predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b        if sign(predict)!=sign(labelArr[i]): errorCount += 1        print "the test error rate is: %f" % (float(errorCount)/m)