KNN cifar-10 L1 L2距离 交叉验证

来源:互联网 发布:datagridview数据滚动 编辑:程序博客网 时间:2024/06/03 19:41

K-NN k-Nearest Neighbor分类器

之前的近邻算法(NN)仅仅选择一个最近的图像标签,K-NN则是选出K个距离最小的图像标签,然后看哪个标签的数量多,就选用哪个标签作为预测值,这样就提高了泛化能力。

交叉验证。

有时候,训练集数量较小(因此验证集的数量更小),此时可以使用交叉验证:将训练集平均分成5份,其中4份用来训练,1份用来验证。然后我们循环着取其中4份来训练,1份来验证,最后取所有5次验证结果的平均值作为算法验证结果。
这里写图片描述

import pickle

import numpy as np

# Directory holding the extracted CIFAR-10 python batches.
file_path = "E:/cifar-10-python/cifar-10-batches-py/"


def unpickle(file):
    """Load one pickled CIFAR-10 batch file.

    :param file: path to a batch file (e.g. ``data_batch_1``)
    :return: dict with keys such as 'data' and 'labels'
    """
    with open(file, 'rb') as fo:
        # CIFAR-10 batches were pickled under Python 2, hence latin1.
        return pickle.load(fo, encoding='latin1')


def load_CIFAR10(file):
    """Load the CIFAR-10 training set (5 batches) and test set.

    :param file: directory containing data_batch_1..5 and test_batch
    :return: (dataTrain, labelTrain, dataTest, labelTest) as numpy arrays
    """
    batch = unpickle(file + "data_batch_1")
    dataTrain = batch['data']
    labelTrain = np.asarray(batch['labels'])
    for i in range(2, 6):
        batch = unpickle(file + "data_batch_" + str(i))
        dataTrain = np.vstack([dataTrain, batch['data']])
        labelTrain = np.hstack([labelTrain, batch['labels']])

    test = unpickle(file + "test_batch")
    return dataTrain, labelTrain, test['data'], np.asarray(test['labels'])


class KNearestNeighbor(object):
    """k-nearest-neighbour classifier supporting L1 and L2 distances."""

    def __init__(self):
        self.X_train = None
        self.y_train = None

    def train(self, X_train, y_train):
        """Memorise the training data (k-NN has no real training step).

        :param X_train: training samples, shape (num_train, D)
        :param y_train: integer training labels, shape (num_train,)
        """
        self.X_train = X_train
        self.y_train = y_train

    def compute_distances_L1(self, X_test):
        """Manhattan (L1) distance between each test and training sample.

        :param X_test: test samples, shape (num_test, D)
        :return: distance matrix, shape (num_test, num_train)
        """
        dists = np.zeros((X_test.shape[0], self.X_train.shape[0]))
        for i in range(X_test.shape[0]):
            dists[i] = np.sum(np.abs(self.X_train - X_test[i]), axis=1)
        return dists

    def compute_distances_L2(self, X_test):
        """Squared Euclidean (L2) distance, fully vectorised.

        Expands ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2.  The square root
        is omitted on purpose: it is monotonic, so nearest-neighbour
        rankings are unchanged.

        :param X_test: test samples, shape (num_test, D)
        :return: squared-distance matrix, shape (num_test, num_train)
        """
        cross = X_test.dot(self.X_train.T)
        # keepdims makes test_sq a column so broadcasting forms the full matrix
        test_sq = np.sum(np.square(X_test), axis=1, keepdims=True)
        train_sq = np.sum(np.square(self.X_train), axis=1)
        return test_sq - 2 * cross + train_sq

    def predict_label(self, dists, k):
        """Majority vote among the k nearest neighbours of each test row.

        :param dists: distance matrix, shape (num_test, num_train)
        :param k: number of neighbours that vote
        :return: predicted label vector, shape (num_test,)
        """
        y_pred = np.zeros(dists.shape[0])
        for i in range(dists.shape[0]):
            # Labels of the k closest training samples.
            closest_y = self.y_train[np.argsort(dists[i, :])[:k]]
            # bincount+argmax == most frequent label (ties -> smallest label).
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred

    def predict(self, X_test, k, L):
        """Predict labels for X_test using a k-neighbour majority vote.

        :param X_test: test samples, shape (num_test, D)
        :param k: number of neighbours
        :param L: 1 -> Manhattan (L1); any other value -> Euclidean (L2)
        :return: predicted label vector, shape (num_test,)
        """
        if L == 1:
            dists = self.compute_distances_L1(X_test)
        else:
            dists = self.compute_distances_L2(X_test)
        return self.predict_label(dists, k)


def Cross_validation(X_train, y_train):
    """5-fold cross-validation over a grid of k values, then plot results.

    :param X_train: training samples
    :param y_train: training labels
    """
    # Lazy import: matplotlib is only needed when this function is called,
    # so importing this module does not require matplotlib.
    import matplotlib.pyplot as plt

    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    k_accuracy = {}

    # Split the data into num_folds (possibly unequal) parts.
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)
    print("length of x_train_folds", len(X_train_folds))
    print("X_train shape", type(X_train[0]))
    print("X_train len", X_train_folds[0].shape)

    for k in k_choices:
        k_accuracy[k] = []
        # Each fold in turn is the validation set; the rest is training data.
        for index in range(num_folds):
            X_te = X_train_folds[index]
            y_te = y_train_folds[index]
            # concatenate handles unequal fold sizes correctly, unlike the
            # np.array(...)+reshape approach which assumes equal folds.
            X_tr = np.concatenate(X_train_folds[:index] + X_train_folds[index + 1:])
            y_tr = np.concatenate(y_train_folds[:index] + y_train_folds[index + 1:])

            classify = KNearestNeighbor()
            classify.train(X_tr, y_tr)
            y_te_pred = classify.predict(X_te, k, 2)
            k_accuracy[k].append(np.mean(y_te_pred == y_te))

    for k, accuracylist in k_accuracy.items():
        for accuracy in accuracylist:
            print("k = %d, accuracy = %.3f" % (k, accuracy))

    # Scatter the per-fold accuracies, then overlay mean +/- std error bars.
    for k in k_choices:
        accuracies = k_accuracy[k]
        plt.scatter([k] * len(accuracies), accuracies)
    accuracies_mean = np.array([np.mean(v) for k, v in sorted(k_accuracy.items())])
    accuracies_std = np.array([np.std(v) for k, v in sorted(k_accuracy.items())])
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()


def main():
    """Load CIFAR-10 and cross-validate k on a 1000-sample subset."""
    dataTrain, labelTrain, dataTest, labelTest = load_CIFAR10(file_path)
    Cross_validation(dataTrain[:1000, :], labelTrain[:1000])


# Guarded entry point: previously the script ran (and tried to read local
# CIFAR files) at import time, which made the module unimportable elsewhere.
if __name__ == "__main__":
    main()

可以看到每个K对应5个验证结果,这里使用的是L1距离(曼哈顿距离),下图是对应的可视化图像。相对于L2距离(欧氏距离),L1在交叉验证和总体验证时的效果要比L2好。
这里写图片描述
这里写图片描述
下图是L2距离的效果验证
这里写图片描述
这里写图片描述
参考

https://zhuanlan.zhihu.com/p/20900216

原创粉丝点击