cs231n assignment1 -- kNN


The main difficulties in this assignment:
- implementing the distance computation between the test set and the training set, in particular the fully vectorized version (a toy sketch of the trick follows below)
- the cross-validation part
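The heart of the fully vectorized version is the expansion ||x - t||^2 = ||x||^2 + ||t||^2 - 2 x·t, which turns all pairwise distances into one matrix product plus two broadcast sums. Here is a minimal toy sketch to convince yourself the trick is correct (the array names A and B are illustrative only, not part of the assignment):

```python
import numpy as np

A = np.random.randn(3, 5)   # stand-in for the test points
B = np.random.randn(4, 5)   # stand-in for the training points

# Reference: brute-force pairwise L2 distances via broadcasting.
ref = np.sqrt(np.sum(np.square(A[:, None, :] - B[None, :, :]), axis=2))

# Vectorized: ||a||^2 + ||b||^2 - 2*a.b, one matmul and two broadcast sums.
sq_a = np.sum(np.square(A), axis=1, keepdims=True)   # shape (3, 1)
sq_b = np.sum(np.square(B), axis=1, keepdims=True)   # shape (4, 1)
fast = np.sqrt(sq_a + sq_b.T - 2 * A.dot(B.T))       # shape (3, 4)

assert np.allclose(ref, fast)
```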

The completed cs231n/classifiers/k_nearest_neighbor.py:

```python
import numpy as np
from collections import Counter

class KNearestNeighbor(object):
  """ a kNN classifier with L2 distance """

  def __init__(self):
    pass

  def train(self, X, y):
    """
    Train the classifier. For k-nearest neighbors this is just
    memorizing the training data.

    Inputs:
    - X: A numpy array of shape (num_train, D) containing the training data
      consisting of num_train samples each of dimension D.
    - y: A numpy array of shape (num_train,) containing the training labels,
      where y[i] is the label for X[i].
    """
    self.X_train = X
    self.y_train = y

  def predict(self, X, k=1, num_loops=0):
    """
    Predict labels for test data using this classifier.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data consisting
         of num_test samples each of dimension D.
    - k: The number of nearest neighbors that vote for the predicted labels.
    - num_loops: Determines which implementation to use to compute distances
      between training points and testing points.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].
    """
    if num_loops == 0:
      dists = self.compute_distances_no_loops(X)
    elif num_loops == 1:
      dists = self.compute_distances_one_loop(X)
    elif num_loops == 2:
      dists = self.compute_distances_two_loops(X)
    else:
      raise ValueError('Invalid value %d for num_loops' % num_loops)

    return self.predict_labels(dists, k=k)

  # Two-loop version: the straightforward implementation.
  def compute_distances_two_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a nested loop over both the training data and the
    test data.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in xrange(num_test):
      for j in xrange(num_train):
        # L2 distance between the ith test point and the jth training point,
        # without looping over the dimension.
        dists[i, j] = np.sqrt(np.sum(np.square(self.X_train[j, :] - X[i, :])))
    return dists

  # One-loop version: relies on numpy broadcasting.
  def compute_distances_one_loop(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a single loop over the test data.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in xrange(num_test):
      # Broadcasting subtracts X[i, :] from every row of X_train.
      dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i, :]), axis=1))
    return dists

  # Fully vectorized version: no explicit loops.
  def compute_distances_no_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using no explicit loops.

    Input / Output: Same as compute_distances_two_loops
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    # Note: np.dot is a matrix product, while * is elementwise multiplication.
    # The idea: expand the squared distance (x1 - x2)^2 = x1^2 + x2^2 - 2*x1*x2
    # and compute the three terms separately.
    # Cross term x1*x2: shape (num_test, num_train), e.g. 500 x 5000
    M = np.dot(X, self.X_train.T)
    # Squared norms of the training points: shape (num_train, 1), e.g. 5000 x 1
    sqtr = np.sum(np.square(self.X_train), axis=1, keepdims=True)
    # Squared norms of the test points: shape (num_test, 1), e.g. 500 x 1
    sqte = np.sum(np.square(X), axis=1, keepdims=True)
    # Add the three terms; sqte + sqtr.T broadcasts
    # (num_test, 1) + (1, num_train) to (num_test, num_train).
    dists = np.sqrt(sqte + sqtr.T - 2 * M)
    return dists

  def predict_labels(self, dists, k=1):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.

    Inputs:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance between the ith test point and the jth training point.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].
    """
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test)
    for i in xrange(num_test):
      # Labels of the k nearest neighbors of the ith test point: argsort the
      # ith row of dists and look up the training labels of the first k indices.
      labels = self.y_train[np.argsort(dists[i, :])].flatten()
      closest_y = labels[0:k]
      # Majority vote among the k neighbors.
      c = Counter(closest_y)
      y_pred[i] = c.most_common(1)[0][0]
      # Alternative one-liner:
      # y_pred[i] = np.argmax(np.bincount(closest_y))
    return y_pred
```
The notebook then exercises the classifier:

```python
# Run some setup code for this notebook.
import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Load the raw CIFAR-10 data. (This cell was missing from the original post;
# it is restored here because X_train/y_train/X_test/y_test are used below.)
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# Visualize some examples from the dataset.
# We show a few examples of training images from each class.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7
for y, cls in enumerate(classes):
    idxs = np.flatnonzero(y_train == y)
    idxs = np.random.choice(idxs, samples_per_class, replace=False)
    for i, idx in enumerate(idxs):
        plt_idx = i * num_classes + y + 1
        plt.subplot(samples_per_class, num_classes, plt_idx)
        plt.imshow(X_train[idx].astype('uint8'))
        plt.axis('off')
        if i == 0:
            plt.title(cls)
plt.show()

# Subsample the data for more efficient code execution in this exercise
num_training = 5000
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]
num_test = 500
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print X_train.shape, X_test.shape

from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# Open cs231n/classifiers/k_nearest_neighbor.py and implement
# compute_distances_two_loops.
# Test your implementation:
dists = classifier.compute_distances_two_loops(X_test)
print dists.shape

# We can visualize the distance matrix: each row is a single test example and
# its distances to training examples
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
# We use k = 1 (which is Nearest Neighbor).
y_test_pred = classifier.predict_labels(dists, k=1)

# Compute and print the fraction of correctly predicted examples
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print 'Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy)

y_test_pred = classifier.predict_labels(dists, k=5)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print 'Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy)

# Now lets speed up distance matrix computation by using partial vectorization
# with one loop. Implement the function compute_distances_one_loop and run the
# code below:
dists_one = classifier.compute_distances_one_loop(X_test)

# To ensure that our vectorized implementation is correct, we make sure that it
# agrees with the naive implementation. There are many ways to decide whether
# two matrices are similar; one of the simplest is the Frobenius norm. In case
# you haven't seen it before, the Frobenius norm of two matrices is the square
# root of the squared sum of differences of all elements; in other words, reshape
# the matrices into vectors and compute the Euclidean distance between them.
difference = np.linalg.norm(dists - dists_one, ord='fro')
print 'Difference was: %f' % (difference, )
if difference < 0.001:
  print 'Good! The distance matrices are the same'
else:
  print 'Uh-oh! The distance matrices are different'

# Now implement the fully vectorized version inside compute_distances_no_loops
# and run the code
dists_two = classifier.compute_distances_no_loops(X_test)

# check that the distance matrix agrees with the one we computed before:
difference = np.linalg.norm(dists - dists_two, ord='fro')
print 'Difference was: %f' % (difference, )
if difference < 0.001:
  print 'Good! The distance matrices are the same'
else:
  print 'Uh-oh! The distance matrices are different'

# Let's compare how fast the implementations are
def time_function(f, *args):
  """
  Call a function f with args and return the time (in seconds) that it took to execute.
  """
  import time
  tic = time.time()
  f(*args)
  toc = time.time()
  return toc - tic

two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
print 'Two loop version took %f seconds' % two_loop_time

one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
print 'One loop version took %f seconds' % one_loop_time

no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
print 'No loop version took %f seconds' % no_loop_time

# you should see significantly faster performance with the fully vectorized implementation
```

Cross-validation

We have implemented the k-Nearest Neighbor classifier, but we set the value k = 5 arbitrarily. We will now determine the best value of this hyperparameter with cross-validation.

```python
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Split up the training data into folds. X_train_folds and y_train_folds are
# lists of length num_folds, where y_train_folds[i] is the label vector for
# the points in X_train_folds[i] (see numpy's array_split).
X_train_folds = np.array_split(X_train, num_folds, axis=0)
y_train_folds = np.array_split(y_train, num_folds, axis=0)
```
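As a quick aside on the hinted-at function: np.array_split chops an array into nearly equal chunks along an axis and returns a plain Python list, which is why the folds can later be sliced and combined with `+`. A toy example (shapes are illustrative only):

```python
import numpy as np

a = np.arange(20).reshape(10, 2)
folds = np.array_split(a, 5, axis=0)   # a list of 5 arrays, each of shape (2, 2)
print len(folds), folds[0].shape       # 5 (2, 2)

# Because the result is a plain list, list slicing and + combine folds,
# exactly as in the cross-validation loop below.
rest = folds[0:2] + folds[3:5]            # all folds except index 2
print np.concatenate(rest, axis=0).shape  # (8, 2)
```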
At first I had no idea how to implement the cross-validation part. After consulting various resources, I found that the training data is split into num_folds (here 5) parts; each part takes a turn as the validation set while the remaining parts are used for training, and the average of the five accuracies is the final result for that value of k.

```python
# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}

classifier = KNearestNeighbor()
for k in k_choices:
    k_to_accuracies[k] = []
    for j in xrange(num_folds):
        # All folds except the jth form the training set; the jth fold is the
        # validation set for this round.
        X_tr = X_train_folds[0:j] + X_train_folds[(j+1):num_folds]
        X_tr = np.reshape(X_tr, (X_train.shape[0] * (num_folds - 1) / num_folds, -1))
        y_tr = y_train_folds[0:j] + y_train_folds[(j+1):num_folds]
        y_tr = np.reshape(y_tr, (X_train.shape[0] * (num_folds - 1) / num_folds, -1))
        X_te = np.reshape(X_train_folds[j], (X_train.shape[0] / num_folds, -1))
        y_te = y_train_folds[j]

        classifier.train(X_tr, y_tr)
        y_test_pred = classifier.predict(X_te, k, 0)
        num_correct = np.sum(y_te == y_test_pred)
        num_test1 = np.shape(X_te)[0]
        accuracy = float(num_correct) / num_test1
        k_to_accuracies[k].append(accuracy)

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)

# plot the raw observations
for k in k_choices:
  accuracies = k_to_accuracies[k]
  plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation
accuracies_mean = np.array([np.mean(v) for k, v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k, v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()

# Based on the cross-validation results above, choose the best value for k,
# retrain the classifier using all the training data, and test it on the test
# data. You should be able to get above 28% accuracy on the test data.
best_k = 7
num_test = 500
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)

# Compute and display the accuracy
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print 'Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy)
```
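Note that the notebook hard-codes best_k = 7 even though 7 is not in k_choices. As a side note, the choice can also be read off the cross-validation results programmatically; a one-line sketch, assuming the k_to_accuracies dictionary built above:

```python
# Pick the k whose mean accuracy across the folds is highest.
best_k = max(k_to_accuracies, key=lambda k: np.mean(k_to_accuracies[k]))
```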