【读书笔记】机器学习实战 6.3节 SVM-简化版SMO算法

来源：互联网发布：p正装照软件编辑：程序博客网时间：2024/06/01 10:37
机器学习实战

6.3节支持向量机SVM -简化版SMO算法
#!/usr/bin/python# -*- coding: utf-8 -*-import randomfrom numpy import *def loadDataSet(fileName):  # 打开文件，返回特征列表，标签列表    dataMat = [];labelMat = []    fr = open(fileName)    for line in fr.readlines():        lineArr = line.strip().split('\t')        dataMat.append([float(lineArr[0]),float(lineArr[1])])  # 特征        labelMat.append(float(lineArr[2]))  # 标签    return dataMat,labelMatdef selectJrand(i,m):  # 随机选择函数：返回选择索引    j = i    while (j==i):        j = int(random.uniform(0,m))    return jdef clipAlpha(aj,H,L):  # 限制aj范围    if aj>H:        aj = H    if L>aj:        aj =L    return ajdef smoSimple(dataMatIn, classLabels, C, toler, maxIter):  # 简化版smo算法 ，输入：样本列表，标签列表，C，容错率，最大迭代次数    dataMatrix = mat(dataMatIn); labelMat = mat(classLabels).transpose() # 样本列表，标签列表转化为numpy矩阵    b = 0; m,n = shape(dataMatrix)    alphas = mat(zeros((m,1)))  # alpha向量初始化为0    iter = 0                    # alpha保持没有改变状态的迭代次数统计    while (iter < maxIter):        alphaPairsChanged = 0   # flag：记录alpha是否被优化        for i in range(m):      # 对每个样本数据            tmp_1 =  multiply(alphas,labelMat)            tmp_2 = multiply(tmp_1,dataMatrix)            tmp_3 = tmp_2*dataMatrix[i,:].T            fXi = tmp_3.T*mat(ones((m,1))) + b                                              # 模型计算f(Xi)            fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b     # 预测样本类别f(Xi)，判定是否可以优化            Ei = fXi - float(labelMat[i])                                                   # 判断样本i误差Ei if checks if an example violates KKT conditions            if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):  # 若误差较大，且alpha没有到边界                j = selectJrand(i,m)                                                        # 随机选择另一个要优化的alpha                fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b # 预测f(Xj)                Ej = fXj - float(labelMat[j])                                               # 判断样本j误差Ej                alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy();                 # 深拷贝oldalpha值                # SMO算法参考：http://blog.csdn.net/luoshixian099/article/details/51227754                if (labelMat[i] != labelMat[j]):   # 参考 http://staff.ustc.edu.cn/~ketang/PPT/PRLec5.pdf                    L = max(0, alphas[j] - alphas[i])                    H = min(C, C + alphas[j] - alphas[i])                else:                    L = max(0, alphas[j] + alphas[i] - C)                    H = min(C, alphas[j] + alphas[i])                if L==H:                    print "L==H"                    continue                eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T  # 二阶导数 计算alphas[j]的最优修改量                if eta >= 0:                    print "eta>=0"                    continue                alphas[j] -= labelMat[j]*(Ei - Ej)/eta      # 修改alphas[j]的值                alphas[j] = clipAlpha(alphas[j],H,L)        # 限制alphas[j]的值                if (abs(alphas[j] - alphaJold) < 0.00001):  # 判定alphas[j]是否轻微改变                    print "j not moving enough"                    continue                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j]) #  相反方向修改alphas[i]，改变的大小一样  update i by the same amount as j                                                                        #the update is in the oppostie direction                b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T  # 确定偏移项b                b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T                if (0 < alphas[i]) and (C > alphas[i]):                    b = b1                elif (0 < alphas[j]) and (C > alphas[j]):                    b = b2                else:                    b = (b1 + b2)/2.0                alphaPairsChanged += 1  # 更新修改flag                print "iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged)        if (alphaPairsChanged == 0):  # 迭代统计            iter += 1        else:            iter = 0        print "iteration number: %d" % iter    return b,alphasdataArr,labelArr = loadDataSet('testSet.txt')b,alphas = smoSimple(dataArr,labelArr,0.6,0.001,40)tmp = shape(alphas[alphas>0])for i in range(100):    if alphas[i]>0.0:        print dataArr[i],labelArr[i]pass
0 1