机器学习基石 作业2 实现1维和多维Decision Stump

来源:互联网 发布:html点击按钮执行js 编辑:程序博客网 时间:2024/06/08 20:27

多维训练数据:https://d396qusza40orc.cloudfront.net/ntumlone%2Fhw2%2Fhw2_train.dat 

多维测试数据: https://d396qusza40orc.cloudfront.net/ntumlone%2Fhw2%2Fhw2_test.dat 


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'OneDimension.py'
__author__ = 'Administrator'
__mtime__ = '2015/10/10'

One-dimensional decision stump h(x) = s * sign(x - theta).

Provides a generator for noisy synthetic samples, an in-sample error counter,
the closed-form out-of-sample error that goes with the generator, and the
exhaustive stump search itself.
"""
import numpy as np


def _direction(sFlip):
    """Map the boolean direction flag to s: True -> +1, False -> -1."""
    return 1 if sFlip else -1


def Data_Generator(size):
    """Draw `size` noisy samples of the target f(x) = sign(x).

    X is uniform on [-1, 1); Y is sign(X) with every label independently
    flipped with probability 0.2.

    Args:
        size: number of samples to draw.

    Returns:
        (X, Y): two float arrays of length `size`.
    """
    # numpy's generator is named explicitly so there is no doubt about which
    # `random` module is producing the samples.
    X = np.random.uniform(-1, 1, size)
    Y = np.sign(X)
    flipped = np.random.uniform(0, 1, size) < 0.2
    Y[flipped] *= -1
    return (X, Y)


def Err_Counter(X, Y, sFlip, theta):
    """Count the samples misclassified by the stump (sFlip, theta).

    Args:
        X: 1-D sequence of feature values.
        Y: labels (expected to be +1 / -1), same length as X.
        sFlip: True for s = +1, False for s = -1.
        theta: threshold.

    Returns:
        Number of samples with h(x) * y < 0, as a plain int.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    predictions = _direction(sFlip) * np.sign(X - theta)
    # A sample lying exactly on theta gets sign 0, so the product is 0 and the
    # sample is not counted as an error whatever its label.
    return int(np.count_nonzero(predictions * Y < 0))


def ErrOut_Calculator(sFlip, theta):
    """Return the closed-form out-of-sample error of the stump (sFlip, theta).

    Evaluates 0.5 + 0.3 * s * (|theta| - 1), which is the expected error for
    x uniform on [-1, 1] with target sign(x) and 20% label noise, i.e. the
    distribution produced by Data_Generator.
    """
    return 0.5 + 0.3 * _direction(sFlip) * (abs(theta) - 1)


def Decision_Stump_1D(X, Y):
    """Find the stump with the lowest in-sample error by exhaustive search.

    Candidate thresholds are the midpoints between neighbouring sorted
    samples, plus one threshold on each outer side.  For every candidate both
    directions are tried (s = +1 first); only a strictly smaller error
    replaces the current best, so ties keep the earliest candidate.

    Args:
        X: 1-D sequence of feature values (must not be empty).
        Y: labels (expected to be +1 / -1), same length as X.

    Returns:
        ((sFlip, thetaBest), (ErrInBest, ErrOut)) where sFlip is True for
        s = +1 and False for s = -1, ErrInBest is the error *count* and
        ErrOut comes from ErrOut_Calculator.

    Raises:
        ValueError: if X is empty.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    dataSize = len(X)
    if dataSize == 0:
        raise ValueError('Decision_Stump_1D needs at least one sample')

    sortedX = np.sort(X)
    # The outer bounds are -1 and +1 while the data stays inside [-1, 1].
    # When a sample falls outside that range the bound is pushed past it, so
    # the outer thresholds still sit beyond every sample and the "all same
    # sign" stump remains among the candidates.
    lower = -1.0 if sortedX[0] >= -1.0 else sortedX[0] - 1.0
    upper = 1.0 if sortedX[-1] <= 1.0 else sortedX[-1] + 1.0
    edges = np.concatenate(([lower], sortedX, [upper]))
    candidates = (edges[1:] + edges[:-1]) * 0.5

    sFlip = True  # True for s = +1, False for s = -1
    thetaBest = 0
    ErrInBest = dataSize
    for theta in candidates:
        for direction in (True, False):
            ErrIn = Err_Counter(X, Y, direction, theta)
            if ErrIn < ErrInBest:
                sFlip, thetaBest, ErrInBest = direction, theta, ErrIn

    ErrOut = ErrOut_Calculator(sFlip, thetaBest)
    return ((sFlip, thetaBest), (ErrInBest, ErrOut))


#!/usr/bin/env python# -*- coding: utf-8 -*-"""__title__ = 'MultiDimension.py'__author__ = 'Administrator'__mtime__ = '2015/10/10'"""import randomfrom numpy import *from OneDimension import *def Decision_Stump_MD(X, Y):    (dataSize, dataDim) = X.shape    sFlipBest = []    thetaBest = []    ErrInBest = []    theta = 0    sFlip = True    for i in range(0, dataDim):        res = Decision_Stump_1D(X[:, i], Y)        sFlipBest.append(res[0][0])        thetaBest.append(res[0][1])        ErrInBest.append(res[1][0])    BestDim = argsort(ErrInBest)[0]    theta = thetaBest[BestDim]    sFlip = sFlipBest[BestDim]    print sFlipBest    print thetaBest    print ErrInBest    print ([sFlip, theta, BestDim], ErrInBest[BestDim])    return ([sFlip, theta, BestDim], ErrInBest[BestDim])def ErrOut_Estimator(X, Y, sFlip, theta, Dim):    ErrOut = Err_Counter(X[:, Dim], Y, sFlip, theta)    return ErrOut


#!/usr/bin/env python# -*- coding: utf-8 -*-"""__title__ = 'HW 2 main.py'__author__ = 'w1d2s'__mtime__ = '2015/10/10'"""from numpy import *from OneDimension import *from MultiDimension import *import sysimport stringdef Data_Pretreatment(path):    rawData = open(path).readlines()    print rawData    dataNum = len(rawData)    dataDim = len(rawData[0].strip().split(' ')) - 1    dataIdx = 0    X = zeros([dataNum, dataDim])    Y = zeros(dataNum)    print(dataNum, dataDim)    for line in rawData:        tempList = line.strip().split(' ')        Y[dataIdx] = string.atoi(tempList[dataDim])        X[dataIdx, :] = tempList[0: dataDim]        dataIdx += 1    return (X, Y)if __name__ == '__main__':    '''    print 'Hello world!'    ErrInCnt = 0    ErrOutCnt = 0    round = 5000    for i in range(0, round):        X, Y = Data_Generator(20)        res = Decision_Stump_1D(X, Y)        ErrInCnt = ErrInCnt + res[1][0]        ErrOutCnt = ErrOutCnt + res[1][1]    print 'Average ErrIn: ' + str(float(ErrInCnt / round))    print 'Average ErrOut: ' + str(float(ErrOutCnt / round))    '''    X, Y = Data_Pretreatment('train.txt')    X_t, Y_t = Data_Pretreatment('test.txt')    res = Decision_Stump_MD(X, Y)    sFlip = res[0][0]    theta = res[0][1]    Dim = res[0][2]    print ErrOut_Estimator(X_t, Y_t, sFlip, theta, Dim)




0 0