机器学习基石 作业4 带Regularizer和Cross Validation的线性回归分类器

来源:互联网 发布:《java并发编程实战》 编辑:程序博客网 时间:2024/04/28 14:48
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'main.py'
__author__ = 'w1d2s'
__mtime__ = '2015/10/30'
"""
from numpy import *
import sys
import string


def Data_Pretreatment(path):
    """Load a whitespace-separated data file into feature/label arrays.

    Each line holds ``dataDim`` float features followed by one integer
    class label.  Returns ``(X, Y)`` where X has shape (N, dataDim) and
    Y has shape (N,).
    """
    rawData = open(path).readlines()
    dataNum = len(rawData)
    # Dimension = tokens on a line minus the trailing label column.
    # split() (no argument) tolerates runs of spaces/tabs, unlike split(' ').
    dataDim = len(rawData[0].strip().split()) - 1
    X = zeros([dataNum, dataDim])
    Y = zeros(dataNum)
    print(dataNum, dataDim)
    for dataIdx, line in enumerate(rawData):
        tempList = line.strip().split()
        # int() replaces string.atoi, which was removed in Python 3.
        Y[dataIdx] = int(tempList[dataDim])
        X[dataIdx, :] = tempList[0:dataDim]
    return (X, Y)


if __name__ == '__main__':
    # Project-local imports live inside the guard so this module can be
    # imported (e.g. for testing Data_Pretreatment) without them.
    from RidgeReg import *
    from Validation import *

    Xtrain, Ytrain = Data_Pretreatment('train.dat')
    Xtest, Ytest = Data_Pretreatment('test.dat')
    # Alternative: single-split validation instead of cross validation.
    #(Wt, p) = Validate(Xtrain, Ytrain, 120, False)
    (Wt, p) = Cross_Validate(Xtrain, Ytrain, 5)
    rate = 10 ** p
    # Retrain on the full training set with the selected lambda.
    W = Ridge_Regression(Xtrain, Ytrain, rate)
    Ein = Err_Counter(Xtrain, Ytrain, W)
    Eout = Err_Counter(Xtest, Ytest, W)
    # Normalize by the actual set sizes (was hard-coded 200 / 1000).
    print('** Ein : ' + str(float(Ein) / len(Ytrain)))
    print('** Eout : ' + str(float(Eout) / len(Ytest)))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'RidgeReg.py'
__author__ = 'w1d2s'
__mtime__ = '2015/10/30'
"""
from numpy import *
from scipy import linalg
import random


def Err_Counter(X, Y, W):
    """Count 0/1 classification errors of the linear model W on (X, Y).

    X is (N, d); a bias column of ones is prepended, so W has d+1
    entries.  Sample i is an error when Y[i] * (Z[i] . W) <= 0, i.e.
    the signs disagree (points exactly on the boundary count as errors).
    Returns the error count as an int.
    """
    (dataSize, dataDim) = X.shape
    Z = ones([dataSize, dataDim + 1])
    Z[:, 1:dataDim + 1] = X
    scores = dot(Z, W)
    # ravel() tolerates Y (and W/scores) arriving as (N,) or (N, 1) --
    # the cross-validation caller reshapes labels to column vectors.
    return int(count_nonzero(ravel(Y) * ravel(scores) <= 0))


def Ridge_Regression(X, Y, rate):
    """Closed-form ridge regression with bias term.

    Solves (Z'Z + rate*I) w = Z'y where Z is X with a leading column of
    ones.  Returns the weight vector w (length d+1), in the same
    row/column orientation as Y.
    """
    (dataSize, dataDim) = X.shape
    Z = ones([dataSize, dataDim + 1])
    Z[:, 1:dataDim + 1] = X
    Zt = transpose(Z)
    ZtZ = dot(Zt, Z)
    I = identity(len(ZtZ))
    # Solve the linear system directly rather than forming the explicit
    # inverse: cheaper and numerically more stable than inv() + dot().
    W = linalg.solve(ZtZ + rate * I, dot(Zt, Y))
    return W
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'Validation.py'
__author__ = 'w1d2s'
__mtime__ = '2015/10/30'
"""
from numpy import *
from RidgeReg import *


def Data_Spliter(X, Y, Num4Train):
    """Split (X, Y): first Num4Train rows for training, the rest for
    validation.  Returns [Xtrain, Ytrain, Xval, Yval]."""
    Xtrain = X[0:Num4Train, :]
    Ytrain = Y[0:Num4Train]
    Xval = X[Num4Train:, :]
    Yval = Y[Num4Train:]
    return [Xtrain, Ytrain, Xval, Yval]


def Validate(X, Y, Num4Train, IsEt):
    """Select log10(lambda) in [-10, 2] by single-split validation.

    IsEt=True selects by training error Et, otherwise by validation
    error Ev.  Returns (Wt, p): the winning weights and the chosen
    exponent p (so lambda = 10**p).
    """
    [Xt, Yt, Xv, Yv] = Data_Spliter(X, Y, Num4Train)
    nT = len(Yt)
    nV = len(Yv)
    # Start above any attainable error count so the first candidate wins
    # (was hard-coded 120 / 80, assuming a fixed 120/80 split).
    minEt = nT + 1
    minEv = nV + 1
    # Weight vector length is feature count + 1 bias (was Xt.ndim + 1,
    # which is always 3 for a 2-D array -- wrong for other dimensions).
    Wt = zeros(Xt.shape[1] + 1)
    p = 0
    # 'logLambda' instead of 'pow': avoid shadowing the builtin.
    for logLambda in range(-10, 3):
        rate = 10 ** logLambda
        W = Ridge_Regression(Xt, Yt, rate)
        Et = Err_Counter(Xt, Yt, W)
        Ev = Err_Counter(Xv, Yv, W)
        if IsEt:
            # '<=' keeps the LARGEST lambda among ties (later wins).
            if Et <= minEt:
                [Wt, minEt, p] = [W, Et, logLambda]
            print('== Et : ' + str(float(Et) / nT))
            print('== log lambda : ' + str(logLambda))
        else:
            if Ev <= minEv:
                [Wt, minEv, p] = [W, Ev, logLambda]
            print('== Ev : ' + str(float(Ev) / nV))
            print('== log lambda : ' + str(logLambda))
    Et = Err_Counter(Xt, Yt, Wt)
    Ev = Err_Counter(Xv, Yv, Wt)
    print('log lambda : ' + str(p))
    print('Et : ' + str(float(Et) / nT))
    print('Ev: ' + str(float(Ev) / nV))
    return (Wt, p)


def Data_Spliter2(X, Y, folds):
    """Cut (X, Y) into `folds` consecutive equal-size chunks.

    Assumes len(Y) is divisible by folds.  Returns (Xlist, Ylist).
    """
    dataSize = len(Y)
    # Floor division: '/' yields a float under Python 3 and breaks range().
    inc = dataSize // folds
    Xlist = []
    Ylist = []
    for idx in range(0, dataSize, inc):
        Xlist.append(X[idx:idx + inc, :])
        Ylist.append(Y[idx:idx + inc])
    return (Xlist, Ylist)


def Cross_Validate(X, Y, folds):
    """Select log10(lambda) in [-10, 2] by `folds`-fold cross validation.

    Returns (Wt, p) where p minimizes the average validation error Ecv
    and Wt are weights trained with lambda = 10**p (see NOTE below --
    the caller is expected to retrain on the full set using p).
    """
    (Xlist, Ylist) = Data_Spliter2(X, Y, folds)
    (foldSize, foldDim) = Xlist[0].shape
    # Training buffers hold folds-1 folds (was hard-coded 4, i.e. 5-fold only).
    Xt = zeros([foldSize * (folds - 1), foldDim])
    Yt = zeros([foldSize * (folds - 1), 1])
    Wt = zeros(foldDim + 1)
    p = 0
    minEcv = float('inf')   # was a magic 10000
    for logLambda in range(-10, 3):
        rate = 10 ** logLambda
        EcvSum = 0
        for V in range(0, folds):
            beg = 0
            for idx in range(0, folds):
                if idx == V:
                    # Fold V is held out for validation.
                    Xv = Xlist[idx]
                    Yv = Ylist[idx]
                else:
                    Xt[beg:beg + foldSize, :] = Xlist[idx]
                    # Force column shape so it slots into the (m, 1) buffer.
                    Ylist[idx].shape = (Ylist[idx].shape[0], 1)
                    Yt[beg:beg + foldSize] = Ylist[idx]
                    beg = beg + foldSize
            W = Ridge_Regression(Xt, Yt, rate)
            EcvSum = EcvSum + Err_Counter(Xv, Yv, W)
        if float(EcvSum) / folds <= minEcv:
            minEcv = float(EcvSum) / folds
            # NOTE: W is the model from the LAST fold split at this lambda,
            # not an average; only p is meant to be reused by the caller.
            (Wt, p) = (W, logLambda)
    print('log lambda: ' + str(p))
    print('Ecv : ' + str(minEcv))
    return (Wt, p)
0 0