CS231n+assignment1（作业一）

来源：互联网发布：mysql group by 多列编辑：程序博客网时间：2024/05/23 13:56

一、第一部分是KNN的代码。这里的trick在于计算距离的三种方法（两层循环、一层循环、无循环），核心仍然是Python和machine learning中非常实用的向量化操作，可以大大地提高计算速度。

import numpy as np


class KNearestNeighbor(object):
    """A k-nearest-neighbor classifier using the L2 (Euclidean) distance."""

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just
        memorizing the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
          consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (num_train,) containing the training
          labels, where y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data
          consisting of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute
          distances between training points and testing points (0, 1 or 2).

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for
          the test data, where y[i] is the predicted label for the test point
          X[i].

        Raises:
        - ValueError: if num_loops is not 0, 1 or 2, or if k < 1.
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)

        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """
        Compute the distance between each test point in X and each training
        point in self.X_train using a nested loop over both the training data
        and the test data.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          is the Euclidean distance between the ith test point and the jth
          training point.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                # Vectorized over the feature dimension only.
                diff = self.X_train[j, :] - X[i, :]
                dists[i, j] = np.sqrt(np.sum(np.square(diff)))
        return dists

    def compute_distances_one_loop(self, X):
        """
        Compute the distance between each test point in X and each training
        point in self.X_train using a single loop over the test data.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            # Broadcasting subtracts the ith test point from every training row.
            diff = self.X_train - X[i, :]
            dists[i, :] = np.sqrt(np.sum(np.square(diff), axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        """
        Compute the distance between each test point in X and each training
        point in self.X_train using no explicit loops.

        Uses the expansion ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, i.e. one
        matrix multiplication plus two broadcast sums.

        Input / Output: Same as compute_distances_two_loops
        """
        cross = np.multiply(np.dot(X, self.X_train.T), -2)         # (num_test, num_train)
        test_sq = np.sum(np.square(X), axis=1, keepdims=True)      # (num_test, 1)
        train_sq = np.sum(np.square(self.X_train), axis=1)         # (num_train,)
        sq_dists = np.add(np.add(cross, test_sq), train_sq)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical points; clamp them so sqrt does not produce NaN.
        sq_dists = np.maximum(sq_dists, 0)
        return np.sqrt(sq_dists)

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance between the ith test point and the jth training
          point.
        - k: The number of nearest neighbors that vote (must be >= 1).

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for
          the test data, where y[i] is the predicted label for the test point
          X[i].

        Raises:
        - ValueError: if k < 1.
        """
        if k < 1:
            raise ValueError('k must be at least 1, got %d' % k)

        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # Labels of the k training points closest to the ith test point.
            nearest = np.argsort(dists[i, :])[:k]
            closest_y = self.y_train[nearest]
            # bincount + argmax picks the most common label; argmax returns
            # the first maximum, so ties are broken by the smaller label.
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred

测试和交叉验证代码：

#coding:utf-8
'''
Created on 2017

@author: 
'''
import random
import numpy as np
from assignment1.data_utils import load_CIFAR10
from  assignment1.classifiers.k_nearest_neighbor import KNearestNeighbor
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

X_train, y_train, X_test, y_test = load_CIFAR10('../datasets')

# As a sanity check, we print out the size of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

# Show a sample of the dataset:
# a few images for every class, one class per column.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7
for y, cls in enumerate(classes):
    idxs = np.flatnonzero(y_train == y)
    idxs = np.random.choice(idxs, samples_per_class, replace=False)
    for i, idx in enumerate(idxs):
        plt_idx = i * num_classes + y + 1
        plt.subplot(samples_per_class, num_classes, plt_idx)
        plt.imshow(X_train[idx].astype('uint8'))
        plt.axis('off')
        if i == 0:
            plt.title(cls)
plt.show()

# Keep only a subset of the samples so the assignment runs faster.
num_training = 5000
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the training and test data into rows (one flattened image per row).
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape)
print(X_test.shape)

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
# first with k=1, then with k=5.
for k in (1, 5):
    y_test_pred = classifier.predict_labels(dists, k=k)
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

#### Check the other two distance implementations against the two-loop result.
dists_one = classifier.compute_distances_one_loop(X_test)
difference = np.linalg.norm(dists - dists_one, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')

dists_two = classifier.compute_distances_no_loops(X_test)
difference = np.linalg.norm(dists - dists_two, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')


def time_function(f, *args):
    """
    Call a function f with args and return the time (in seconds) that it took to execute.
    """
    import time
    started = time.time()
    f(*args)
    finished = time.time()
    return finished - started


# Compare the running time of the three distance implementations.
two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
print('Two loop version took %f seconds' % two_loop_time)

one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
print('One loop version took %f seconds' % one_loop_time)

no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
print('No loop version took %f seconds' % no_loop_time)

# Cross-validation
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Split the training data into folds: X_train_folds and y_train_folds are
# lists of length num_folds, where y_train_folds[i] is the label vector for
# the points in X_train_folds[i].
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] is a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}

# For every candidate k, hold out each fold in turn as the validation set and
# train on the remaining folds.
for k in k_choices:
    fold_accuracies = []
    for i in range(num_folds):
        X_train_cv = np.vstack(X_train_folds[:i] + X_train_folds[i + 1:])
        y_train_cv = np.hstack(y_train_folds[:i] + y_train_folds[i + 1:])
        X_test_cv = X_train_folds[i]
        y_test_cv = y_train_folds[i]

        classifier.train(X_train_cv, y_train_cv)
        dists_cv = classifier.compute_distances_no_loops(X_test_cv)
        y_test_pred = classifier.predict_labels(dists_cv, k)

        num_correct = np.sum(y_test_pred == y_test_cv)
        fold_accuracies.append(float(num_correct) / y_test_cv.shape[0])
    k_to_accuracies[k] = fold_accuracies

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

# plot the raw observations
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation
sorted_results = sorted(k_to_accuracies.items())
accuracies_mean = np.array([np.mean(v) for k, v in sorted_results])
accuracies_std = np.array([np.std(v) for k, v in sorted_results])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()

testKNN.py

import numpy as np
import random
import os
import sys
sys.path.append('cs231n.classifiers')
from cs231n.classifiers.k_nearest_neighbor import KNearestNeighbor
from six.moves import cPickle as pickle
# NOTE(review): imread is unused in this script, and recent SciPy releases
# appear to no longer ship scipy.misc.imread -- confirm before running.
from scipy.misc import imread
from cs231n.data_utils import load_CIFAR10
# plt is used for the final plot below but was never imported.
import matplotlib.pyplot as plt

xtr, ytr, xte, yte = load_CIFAR10('data/cifar10')

# Work on a subset of the data so cross-validation finishes quickly.
num_training = 5000
mask = range(num_training)
X_train = xtr[mask]
y_train = ytr[mask]

num_test = 500
mask = range(num_test)
X_test = xte[mask]
y_test = yte[mask]

# Reshape the image data into rows
X_train = X_train.reshape(X_train.shape[0], 32 * 32 * 3)
X_test = X_test.reshape(X_test.shape[0], 32 * 32 * 3)
print('%s %s' % (X_train.shape, X_test.shape))

num_folds = 5
k_choices = [1, 5, 10, 15, 20, 50, 100]

X_train_folds = np.array_split(X_train, num_folds, axis=0)
y_train_folds = np.array_split(y_train, num_folds, axis=0)

# k_to_accuracies[k] is the list of per-fold validation accuracies for k.
k_to_accuracies = {}
nn = KNearestNeighbor()
for k in k_choices:
    k_to_accuracies[k] = []
    for j in range(num_folds):
        # Train on every fold except j. Concatenating (instead of stacking
        # and reshaping with a computed size) avoids float sizes from "/"
        # and also works when the folds are not all the same length.
        X_tr = np.concatenate(X_train_folds[0:j] + X_train_folds[(j + 1):], axis=0)
        y_tr = np.concatenate(y_train_folds[0:j] + y_train_folds[(j + 1):], axis=0)
        # Fold j is the validation set.
        X_te = X_train_folds[j]
        y_te = y_train_folds[j]

        nn.train(X_tr, y_tr)
        yte_pred = nn.predict(X_te, k, 1)

        num_correct = np.sum(y_te == yte_pred)
        num_test1 = np.shape(X_te)[0]
        accuracy = float(num_correct) / num_test1
        k_to_accuracies[k].append(accuracy)

for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

# Trend line with error bars that correspond to the standard deviation.
accuracies_mean = np.array([np.mean(v) for k, v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k, v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
# Save before show(): once the window opened by show() is closed the figure
# may already be released, which would leave "k.png" blank.
plt.savefig("k.png")
plt.show()
plt.clf()

二、softmax

同样需要完成naive（带循环）和vectorized（向量化）两种实现，并比较它们的速度。

import numpy as np


def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (with explicit loops).

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c
      means that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength.

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)  # same shape as W

    num_train = X.shape[0]
    num_class = W.shape[1]

    f = X.dot(W)  # N by C
    # Numeric stability: subtract the row-wise maximum score before
    # exponentiating so that exp() cannot overflow.
    f_max = np.reshape(np.max(f, axis=1), (num_train, 1))
    exp_f = np.exp(f - f_max)
    prob = exp_f / np.sum(exp_f, axis=1, keepdims=True)  # N by C

    for i in range(num_train):
        # L = -(1/N) * sum_i log(p_{i, y_i}) + 0.5 * reg * sum(W^2).
        # Only the true-class probability enters the loss; multiplying a
        # one-hot mask with log(prob) would evaluate 0 * log(0) = NaN when
        # another class's probability underflows to zero.
        loss += -np.log(prob[i, y[i]])
        for j in range(num_class):
            # d L_i / d W_j = (p_{i,j} - 1[j == y_i]) * x_i
            indicator = 1.0 if j == y[i] else 0.0
            dW[:, j] += -(indicator - prob[i, j]) * X[i, :]

    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)  # add the regularization term
    dW /= num_train
    dW += reg * W  # gradient of 0.5 * reg * sum(W^2)

    return loss, dW


def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version.

    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)  # D by C

    num_train = X.shape[0]
    rows = np.arange(num_train)

    f = X.dot(W)  # N by C
    # Numeric stability: shift every row by its maximum score.
    f_max = np.reshape(np.max(f, axis=1), (num_train, 1))  # N by 1
    exp_f = np.exp(f - f_max)
    prob = exp_f / np.sum(exp_f, axis=1, keepdims=True)  # N by C

    # Pick the true-class probabilities directly (see the note on
    # 0 * log(0) in softmax_loss_naive).
    loss += -np.sum(np.log(prob[rows, y])) / num_train + 0.5 * reg * np.sum(W * W)

    y_trueClass = np.zeros_like(prob)
    y_trueClass[rows, y] = 1.0  # N by C one-hot labels
    dW += -np.dot(X.T, y_trueClass - prob) / num_train + reg * W

    return loss, dW

三、SVM

import numpy as np


def svm_loss_naive(W, X, y, reg):
    """
    Structured SVM loss function, naive implementation (with loops).

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    dW = np.zeros(W.shape)  # initialize the gradient as zero

    # compute the loss and the gradient
    num_classes = W.shape[1]
    num_train = X.shape[0]
    loss = 0.0
    for i in range(num_train):
        scores = X[i].dot(W)
        correct_class_score = scores[y[i]]
        for j in range(num_classes):
            if j == y[i]:
                # The correct class does not contribute a margin term.
                continue
            margin = scores[j] - correct_class_score + 1  # note delta = 1
            if margin > 0:
                loss += margin
                # Every violated margin pulls the correct-class column
                # towards x_i and pushes the offending column away from it.
                dW[:, y[i]] += -X[i, :]
                dW[:, j] += X[i, :]

    # Right now the loss is a sum over all training examples, but we want it
    # to be an average instead so we divide by num_train.
    loss /= num_train
    dW /= num_train

    # Add regularization to the loss; the gradient of 0.5 * reg * sum(W^2)
    # is reg * W.
    loss += 0.5 * reg * np.sum(W * W)
    dW += reg * W

    return loss, dW


def svm_loss_vectorized(W, X, y, reg):
    """
    Structured SVM loss function, vectorized implementation.

    Inputs and outputs are the same as svm_loss_naive.
    """
    loss = 0.0
    dW = np.zeros(W.shape)  # initialize the gradient as zero

    num_train = X.shape[0]
    rows = np.arange(num_train)

    scores = X.dot(W)  # N by C
    scores_correct = np.reshape(scores[rows, y], (num_train, 1))  # N by 1

    margins = scores - scores_correct + 1.0  # N by C
    margins[rows, y] = 0.0        # the correct class contributes nothing
    margins[margins <= 0] = 0.0   # hinge: keep only violated margins

    loss += np.sum(margins) / num_train
    loss += 0.5 * reg * np.sum(W * W)

    # compute the gradient: turn margins into an indicator matrix, then put
    # minus the number of violations in each row's correct-class entry.
    margins[margins > 0] = 1.0
    row_sum = np.sum(margins, axis=1)  # length N
    margins[rows, y] = -row_sum
    dW += np.dot(X.T, margins) / num_train + reg * W  # D by C

    return loss, dW

SVM_test.py

#-*-coding:utf-8-*-
import random
import numpy as np
import matplotlib.pyplot as plt
from cs231n.data_utils import load_CIFAR10

# Load the raw CIFAR-10 data.
cifar10_dir = 'data/cifar10'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

'''
# As a sanity check, we print out the size of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
'''

'''
# Visualize some examples from the dataset.
# We show a few examples of training images from each class.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7
for y, cls in enumerate(classes):
    idxs = np.flatnonzero(y_train == y)
    idxs = np.random.choice(idxs, samples_per_class, replace=False)
    for i, idx in enumerate(idxs):
        plt_idx = i * num_classes + y + 1
        plt.subplot(samples_per_class, num_classes, plt_idx)
        plt.imshow(X_train[idx].astype('uint8'))
        plt.axis('off')
        if i == 0:
            plt.title(cls)
plt.show()
'''

# Split the data into train, val, and test sets. In addition we will
# create a small development set as a subset of the training data;
# we can use this for development so our code runs faster.
num_training = 49000
num_validation = 1000
num_test = 1000
num_dev = 500

# Our validation set will be num_validation points from the original training set.
mask = range(num_training, num_training + num_validation)
X_val = X_train[mask]
y_val = y_train[mask]

# Our training set will be the first num_train points from the original training set.
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]

# We will also make a development set, which is a small subset of the training set.
mask = np.random.choice(num_training, num_dev, replace=False)
X_dev = X_train[mask]
y_dev = y_train[mask]

# We use the first num_test points of the original test set as our test set.
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]

'''
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
'''

# Preprocessing: reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_val = np.reshape(X_val, (X_val.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))

'''
# As a sanity check, print out the shapes of the data
print('Training data shape: ', X_train.shape)
print('Validation data shape: ', X_val.shape)
print('Test data shape: ', X_test.shape)
print('dev data shape: ', X_dev.shape)
'''

# Preprocessing: subtract the mean image
# first: compute the image mean based on the training data
mean_image = np.mean(X_train, axis=0)

'''
print(mean_image[:10]) # print a few of the elements
plt.figure(figsize=(4,4))
plt.imshow(mean_image.reshape((32,32,3)).astype('uint8')) # visualize the mean image
plt.show()
'''

# second: subtract the mean image from train and test data
X_train -= mean_image
X_val -= mean_image
X_test -= mean_image
X_dev -= mean_image

# third: append the bias dimension of ones (i.e. bias trick) so that our SVM
# only has to worry about optimizing a single weight matrix W.
X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])
#print(X_train.shape, X_val.shape, X_test.shape, X_dev.shape)

# Evaluate the naive implementation of the loss we provided for you:
from cs231n.classifiers.linear_svm import svm_loss_naive
import time

# generate a random SVM weight matrix of small numbers
W = np.random.randn(3073, 10) * 0.0001
loss, grad = svm_loss_naive(W, X_dev, y_dev, 0.000005)
print('loss: %f' % (loss, ))  # output from the author's run: loss: 9.548658

# Compute the loss and its gradient at W.
loss, grad = svm_loss_naive(W, X_dev, y_dev, 0.0)

'''
# Numerically compute the gradient along several randomly chosen dimensions, and
# compare them with your analytically computed gradient. The numbers should match
# almost exactly along all dimensions.
from cs231n.gradient_check import grad_check_sparse
f = lambda w: svm_loss_naive(w, X_dev, y_dev, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad)
# do the gradient check once again with regularization turned on
# you didn't forget the regularization gradient did you?
loss, grad = svm_loss_naive(W, X_dev, y_dev, 5e1)
f = lambda w: svm_loss_naive(w, X_dev, y_dev, 5e1)[0]
grad_numerical = grad_check_sparse(f, W, grad)
'''

'''
# Next implement the function svm_loss_vectorized; for now only compute the loss;
# we will implement the gradient in a moment.
tic = time.time()
loss_naive, grad_naive = svm_loss_naive(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('Naive loss: %e computed in %fs' % (loss_naive, toc - tic))
from cs231n.classifiers.linear_svm import svm_loss_vectorized
tic = time.time()
loss_vectorized, _ = svm_loss_vectorized(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('Vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))
# The losses should match but your vectorized implementation should be much faster.
print('difference: %f' % (loss_naive - loss_vectorized))
'''

'''
# Complete the implementation of svm_loss_vectorized, and compute the gradient
# of the loss function in a vectorized way.
# The naive implementation and the vectorized implementation should match, but
# the vectorized version should still be much faster.
tic = time.time()
loss_naive, grad_naive = svm_loss_naive(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('Naive loss and gradient: computed in %fs' % (toc - tic))
tic = time.time()
#_, grad_vectorized = svm_loss_vectorized(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('Vectorized loss and gradient: computed in %fs' % (toc - tic))
# The loss is a single number, so it is easy to compare the values computed
# by the two implementations. The gradient on the other hand is a matrix, so
# we use the Frobenius norm to compare them.
#difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
#print('difference: %f' % difference)
'''

# In the file linear_classifier.py, implement SGD in the function
# LinearClassifier.train() and then run it with the code below.
from cs231n.classifiers.linear_classifier import LinearSVM
svm = LinearSVM()
tic = time.time()
loss_hist = svm.train(X_train, y_train, learning_rate=1e-7, reg=2.5e4,
                      num_iters=1500, verbose=True)
toc = time.time()
print('That took %fs' % (toc - tic))

'''
# A useful debugging strategy is to plot the loss as a function of
# iteration number:
plt.plot(loss_hist)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()
'''

# Write the LinearSVM.predict function and evaluate the performance on both the
# training and validation set
y_train_pred = svm.predict(X_train)
print('training accuracy: %f' % (np.mean(y_train == y_train_pred), ))
y_val_pred = svm.predict(X_val)
print('validation accuracy: %f' % (np.mean(y_val == y_val_pred), ))

# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). You should experiment with different ranges for the learning
# rates and regularization strengths; if you are careful you should be able to
# get a classification accuracy of about 0.4 on the validation set.
learning_rates = [1e-7, 5e-5]
regularization_strengths = [2.5e4, 5e4]

# results is dictionary mapping tuples of the form
# (learning_rate, regularization_strength) to tuples of the form
# (training_accuracy, validation_accuracy). The accuracy is simply the fraction
# of data points that are correctly classified.
results = {}
best_val = -1    # The highest validation accuracy that we have seen so far.
best_svm = None  # The LinearSVM object that achieved the highest validation rate.
iters = 1000
for lr in learning_rates:
    for rs in regularization_strengths:
        svm = LinearSVM()
        svm.train(X_train, y_train, learning_rate=lr, reg=rs, num_iters=iters)
        y_train_pred = svm.predict(X_train)
        acc_train = np.mean(y_train == y_train_pred)
        y_val_pred = svm.predict(X_val)
        acc_val = np.mean(y_val == y_val_pred)
        results[(lr, rs)] = (acc_train, acc_val)
        if best_val < acc_val:
            best_val = acc_val
            best_svm = svm

# Print out results. The single-argument print(...) form is used so that it
# matches the other print calls in this script and behaves the same whether
# print is a statement or a function.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (
        lr, reg, train_accuracy, val_accuracy))
print('best validation accuracy achieved during cross-validation: %f' % best_val)

'''
# Visualize the cross-validation results
import math
x_scatter = [math.log10(x[0]) for x in results]
y_scatter = [math.log10(x[1]) for x in results]
# plot training accuracy
marker_size = 100
colors = [results[x][0] for x in results]
plt.subplot(2, 1, 1)
plt.scatter(x_scatter, y_scatter, marker_size, c=colors)
plt.colorbar()
plt.xlabel('log learning rate')
plt.ylabel('log regularization strength')
plt.title('CIFAR-10 training accuracy')
# plot validation accuracy
colors = [results[x][1] for x in results] # default size of markers is 20
plt.subplot(2, 1, 2)
plt.scatter(x_scatter, y_scatter, marker_size, c=colors)
plt.colorbar()
plt.xlabel('log learning rate')
plt.ylabel('log regularization strength')
plt.title('CIFAR-10 validation accuracy')
plt.show()
'''

# Evaluate the best svm on test set
y_test_pred = best_svm.predict(X_test)
test_accuracy = np.mean(y_test == y_test_pred)
print('linear SVM on raw pixels final test set accuracy: %f' % test_accuracy)

# Visualize the learned weights for each class.
# Depending on your choice of learning rate and regularization strength, these may
# or may not be nice to look at.
w = best_svm.W[:-1, :]  # strip out the bias
w = w.reshape(32, 32, 3, 10)
w_min, w_max = np.min(w), np.max(w)
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
for i in range(10):
    plt.subplot(2, 5, i + 1)
    # Rescale the weights to be between 0 and 255
    wimg = 255.0 * (w[:, :, :, i].squeeze() - w_min) / (w_max - w_min)
    plt.imshow(wimg.astype('uint8'))
    plt.axis('off')
    plt.title(classes[i])
plt.show()

四、linear_classifier

　　从编程思路上来看，上面三个是不同的策略，确切地说是线性分类器的几种方法，所以我们用一个LinearClassifier类来调用它们。

import numpy as np

from linear_svm import *
from softmax import *


class LinearClassifier(object):
    """Base class for linear classifiers trained with minibatch SGD.

    Subclasses supply the loss by overriding ``loss``; the weight matrix
    ``W`` has shape (D, C) and is created lazily on the first ``train`` call.
    """

    def __init__(self):
        self.W = None

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
              batch_size=200, verbose=True):
        """
        Train this linear classifier using stochastic gradient descent.

        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are
          N training samples each of dimension D.
        - y: A numpy array of shape (N,) containing training labels; y[i] = c
          means that X[i] has label 0 <= c < C for C classes.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength.
        - num_iters: (integer) number of steps to take when optimizing.
        - batch_size: (integer) number of training examples to use at each
          step.
        - verbose: (boolean) If true, print progress during optimization.

        Outputs:
        A list containing the value of the loss function at each training
        iteration.
        """
        num_train, dim = X.shape
        # Assume y takes values 0...K-1 where K is the number of classes.
        num_classes = np.max(y) + 1
        if self.W is None:
            # Lazily initialize W with small random values.
            self.W = 0.001 * np.random.randn(dim, num_classes)

        # Run minibatch stochastic gradient descent to optimize W.
        loss_history = []
        for it in range(num_iters):
            # Sample with replacement: it is cheaper than sampling without
            # replacement and, unlike replace=False, it does not raise when
            # batch_size exceeds the number of training samples.
            sample_index = np.random.choice(num_train, batch_size,
                                            replace=True)
            X_batch = X[sample_index, :]   # (batch_size, D)
            y_batch = y[sample_index]      # (batch_size,)

            # Evaluate loss and gradient on the minibatch.
            loss, grad = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # Gradient step.
            self.W += -learning_rate * grad

            if verbose and it % 100 == 0:
                print('Iteration %d / %d: loss %f' % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels
        for data points.

        Inputs:
        - X: A numpy array of shape (N, D), one data point per row (the same
          layout that ``train`` takes). For backward compatibility a (D, N)
          array with one point per column is also accepted; it is detected by
          comparing its dimensions against W and transposed. When N == D the
          input is treated as (N, D).

        Returns:
        - y_pred: Predicted labels for the data in X. y_pred is a
          1-dimensional array of length N, and each element is an integer
          giving the predicted class.
        """
        num_features = self.W.shape[0]
        points = X
        if X.shape[1] != num_features and X.shape[0] == num_features:
            points = X.T               # column-major input -> (N, D)
        scores = points.dot(self.W)    # (N, C)
        # The predicted class is the one with the highest score in each row.
        return np.argmax(scores, axis=1)

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the loss function and its derivative.
        Subclasses will override this.

        Inputs:
        - X_batch: A numpy array of shape (N, D) containing a minibatch of N
          data points; each point has dimension D.
        - y_batch: A numpy array of shape (N,) containing labels for the
          minibatch.
        - reg: (float) regularization strength.

        Returns: A tuple containing:
        - loss as a single float
        - gradient with respect to self.W; an array of the same shape as W
        """
        pass


class LinearSVM(LinearClassifier):
    """ A subclass that uses the Multiclass SVM loss function """

    def loss(self, X_batch, y_batch, reg):
        return svm_loss_vectorized(self.W, X_batch, y_batch, reg)


class Softmax(LinearClassifier):
    """ A subclass that uses the Softmax + Cross-entropy loss function """

    def loss(self, X_batch, y_batch, reg):
        return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)

Softmax_test.py

#-*-coding:utf-8-*-from __future__ import print_functionimport randomimport numpy as npfrom cs231n.data_utils import load_CIFAR10import matplotlib.pyplot as plt#matplotlib inlineplt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plotsplt.rcParams['image.interpolation'] = 'nearest'plt.rcParams['image.cmap'] = 'gray'cifar10_dir = 'data/cifar10'X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)num_training=49000num_validation=1000num_test=1000num_dev=500mask = list(range(num_training,num_training+num_validation))x_val = X_train[mask]y_val = y_train[mask]mask = list(range(num_training))x_train = X_train[mask]y_train = y_train[mask]mask = list(range(num_test))x_test = X_test[mask]y_test = y_test[mask]mask = np.random.choice(num_training, num_dev, replace=False)x_dev = X_train[mask]y_dev = y_train[mask]#reshape the image date into rowsx_train = np.reshape(x_train,(x_train.shape[0],-1))x_test = np.reshape(x_test,(x_test.shape[0],-1))x_dev = np.reshape(x_dev,(x_dev.shape[0],-1))x_val = np.reshape(x_val,(x_val.shape[0],-1))# Normalize the data: subtract the mean imagemean_image = np.mean(x_train,axis=0)x_train -= mean_imagex_val -= mean_imagex_dev -= mean_imagex_test -= mean_image# add bias dimension and transform into columnsx_train = np.hstack([x_train,np.ones((x_train.shape[0],1))])x_test = np.hstack([x_test,np.ones((x_test.shape[0],1))])x_dev = np.hstack([x_dev,np.ones((x_dev.shape[0],1))])x_val = np.hstack([x_val,np.ones((x_val.shape[0],1))])from cs231n.classifiers.softmax import softmax_loss_naiveimport time# Generate a random softmax weight matrix and use it to compute the loss.w = np.random.randn(x_train.shape[1],10)*0.0001loss, grad = softmax_loss_naive(w, x_dev, y_dev, 0.0)'''# As we did for the SVM, use numeric gradient checking as a debugging tool.# The numeric gradient should be close to the analytic gradient.from cs231n.gradient_check import grad_check_sparsef = lambda w: softmax_loss_naive(w, x_dev, y_dev, 0.0)[0]grad_numerical = 
grad_check_sparse(f, w, grad, 10)# similar to SVM case, do another gradient check with regularizationloss, grad = softmax_loss_naive(w, x_dev, y_dev, 5e1)f = lambda w: softmax_loss_naive(w, x_dev, y_dev, 5e1)[0]grad_numerical = grad_check_sparse(f, w, grad, 10)''''''# Now that we have a naive implementation of the softmax loss function and its gradient,# implement a vectorized version in softmax_loss_vectorized.# The two versions should compute the same results, but the vectorized version should be# much faster.tic = time.time()loss_naive, grad_naive = softmax_loss_naive(w, x_dev, y_dev, 0.000005)toc = time.time()print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))from cs231n.classifiers.softmax import softmax_loss_vectorizedtic = time.time()loss_vectorized, grad_vectorized = softmax_loss_vectorized(w, x_dev, y_dev, 0.000005)toc = time.time()print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))# As we did for the SVM, we use the Frobenius norm to compare the two versions# of the gradient.grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))print('Gradient difference: %f' % grad_difference)'''# Use the validation set to tune hyperparameters (regularization strength and# learning rate). You should experiment with different ranges for the learning# rates and regularization strengths; if you are careful you should be able to# get a classification accuracy of over 0.35 on the validation set.from cs231n.classifiers import Softmaxresults = {}best_val = -1best_softmax = Nonelearning_rates = [1e-7, 5e-7]regularization_strengths = [2.5e4, 5e4]################################################################################# TODO:                                                                        ## Use the validation set to set the learning rate and regularization strength. 
## This should be identical to the validation that you did for the SVM; save    ## the best trained softmax classifer in best_softmax.                          #################################################################################iters= 1000for lr in learning_rates:    for rs in regularization_strengths:        softmax = Softmax()        softmax.train(x_train, y_train, learning_rate=lr, reg=rs, num_iters=iters)        y_train_pred = softmax.predict(x_train)        acc_train = np.mean(y_train == y_train_pred)        y_val_pred = softmax.predict(x_val)        acc_val = np.mean(y_val == y_val_pred)        results[(lr, rs)] = (acc_train, acc_val)        if best_val < acc_val:            best_val = acc_val            best_softmax = softmax#################################################################################                              END OF YOUR CODE                                ################################################################################## Print out results.for lr, reg in sorted(results):    train_accuracy, val_accuracy = results[(lr, reg)]    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (        lr, reg, train_accuracy, val_accuracy))print('best validation accuracy achieved during cross-validation: %f' % best_val)# evaluate on test set# Evaluate the best softmax on test sety_test_pred = best_softmax.predict(x_test)test_accuracy = np.mean(y_test == y_test_pred)print('softmax on raw pixels final test set accuracy: %f' % (test_accuracy, ))# Visualize the learned weights for each classw = best_softmax.W[:-1, :]  # strip out the biasw = w.reshape(32, 32, 3, 10)w_min, w_max = np.min(w), np.max(w)classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']for i in range(10):    plt.subplot(2, 5, i + 1)    # Rescale the weights to be between 0 and 255    wimg = 255.0 * (w[:, :, :, i].squeeze() - w_min) / (w_max - w_min)    plt.imshow(wimg.astype('uint8'))    plt.axis('off')    
plt.title(classes[i])plt.show()

五、简单的两层神经网络

这里只是一个简单的神经网络的写法，在下次作业会有一个很好很强大的神经网络等我们去构造。

BP可以看这幅图来理解：

# -*- coding:utf-8 -*-from __future__ import print_functionimport numpy as npimport matplotlib.pyplot as pltfrom past.builtins import xrangeclass TwoLayerNet(object):  """  A two-layer fully-connected neural network. The net has an input dimension of  N, a hidden layer dimension of H, and performs classification over C classes.  We train the network with a softmax loss function and L2 regularization on the  weight matrices. The network uses a ReLU nonlinearity after the first fully  connected layer.  In other words, the network has the following architecture:  input - fully connected layer - ReLU - fully connected layer - softmax  The outputs of the second fully-connected layer are the scores for each class.  """  def __init__(self, input_size, hidden_size, output_size, std=1e-4):    """    Initialize the model. Weights are initialized to small random values and    biases are initialized to zero. Weights and biases are stored in the    variable self.params, which is a dictionary with the following keys:    W1: First layer weights; has shape (D, H)    b1: First layer biases; has shape (H,)    W2: Second layer weights; has shape (H, C)    b2: Second layer biases; has shape (C,)    Inputs:    - input_size: The dimension D of the input data.    - hidden_size: The number of neurons H in the hidden layer.    - output_size: The number of classes C.    """    self.params = {}    self.params['W1'] = std * np.random.randn(input_size, hidden_size)    self.params['b1'] = np.zeros(hidden_size)    self.params['W2'] = std * np.random.randn(hidden_size, output_size)    self.params['b2'] = np.zeros(output_size)  def loss(self, X, y=None, reg=0.0):    """    Compute the loss and gradients for a two layer fully connected neural    network.    Inputs:    - X: Input data of shape (N, D). Each X[i] is a training sample.    - y: Vector of training labels. y[i] is the label for X[i], and each y[i] is      an integer in the range 0 <= y[i] < C. 
This parameter is optional; if it      is not passed then we only return scores, and if it is passed then we      instead return the loss and gradients.    - reg: Regularization strength.    Returns:    If y is None, return a matrix scores of shape (N, C) where scores[i, c] is    the score for class c on input X[i].    If y is not None, instead return a tuple of:    - loss: Loss (data loss and regularization loss) for this batch of training      samples.    - grads: Dictionary mapping parameter names to gradients of those parameters      with respect to the loss function; has the same keys as self.params.    """    # Unpack variables from the params dictionary    W1, b1 = self.params['W1'], self.params['b1']    W2, b2 = self.params['W2'], self.params['b2']    N, D = X.shape    # Compute the forward pass    scores = None    #############################################################################    # TODO: Perform the forward pass, computing the class scores for the input. #    # Store the result in the scores variable, which should be an array of      #    # shape (N, C).                                                             #    #############################################################################    # evaluate class scores, [N x K]    hidden_layer = np.maximum(0, np.dot(X, W1) + b1)  # ReLU activation    scores = np.dot(hidden_layer, W2) + b2    #############################################################################    #                              END OF YOUR CODE                             #    #############################################################################        # If the targets are not given then jump out, we're done    if y is None:      return scores    # Compute the loss    loss = None    #############################################################################    # TODO: Finish the forward pass, and compute the loss. This should include  #    # both the data loss and L2 regularization for W1 and W2. 
Store the result  #    # in the variable loss, which should be a scalar. Use the Softmax           #    # classifier loss.                                                          #    #############################################################################    # compute the class probabili。ties    # scores -= np.max(scores, axis = 1)[:, np.newaxis]    exp_scores = np.exp(scores - np.max(scores,axis=1,keepdims=True))    #exp_scores = np.exp(scores)    probs = exp_scores/np.sum(exp_scores,axis=1,keepdims=True)    correct_logprobs = np.log(probs[range(N), y])#find the num corrosd to y    data_loss = -np.sum(correct_logprobs)/N    reg_loss = 0.5*reg*(np.sum(W1*W1)+np.sum(W2*W2))    loss = data_loss + reg_loss    #############################################################################    #                              END OF YOUR CODE                             #    #############################################################################    # Backward pass: compute gradients    grads = {}    #############################################################################    # TODO: Compute the backward pass, computing the derivatives of the weights #    # and biases. Store the results in the grads dictionary. 
For example,       #    # grads['W1'] should store the gradient on W1, and be a matrix of same size #    #############################################################################    # compute the gradient on scores    dscores = probs    dscores[range(N), y] -= 1    dscores /= N    # backpropate the gradient to the parameters    # first backprop into parameters W2 and b2    dW2 = np.dot(hidden_layer.T, dscores)    db2 = np.sum(dscores, axis=0, keepdims=False)    # next backprop into hidden layer    dhidden = np.dot(dscores, W2.T) #上一层的损失*该层的权重    # backprop the ReLU non-linearity    dhidden[hidden_layer <= 0] = 0    # finally into W,b    dW1 = np.dot(X.T, dhidden)    db1 = np.sum(dhidden, axis=0, keepdims=False)    # add regularization gradient contribution    dW2 += reg * W2    dW1 += reg * W1    grads['W1'] = dW1    grads['W2'] = dW2    grads['b1'] = db1    grads['b2'] = db2    # print dW1.shape, dW2.shape, db1.shape, db2.shape    #############################################################################    #                              END OF YOUR CODE                             #    #############################################################################    return loss, grads  def train(self, X, y, X_val, y_val,            learning_rate=1e-3, learning_rate_decay=0.95,            reg=5e-6, num_iters=100,            batch_size=200, verbose=False):    """    Train this neural network using stochastic gradient descent.    Inputs:    - X: A numpy array of shape (N, D) giving training data.    - y: A numpy array f shape (N,) giving training labels; y[i] = c means that      X[i] has label c, where 0 <= c < C.    - X_val: A numpy array of shape (N_val, D) giving validation data.    - y_val: A numpy array of shape (N_val,) giving validation labels.    - learning_rate: Scalar giving learning rate for optimization.    - learning_rate_decay: Scalar giving factor used to decay the learning rate      after each epoch.    
- reg: Scalar giving regularization strength.    - num_iters: Number of steps to take when optimizing.    - batch_size: Number of training examples to use per step.    - verbose: boolean; if true print progress during optimization.    """    num_train = X.shape[0]    iterations_per_epoch = max(num_train / batch_size, 1)    # Use SGD to optimize the parameters in self.model    loss_history = []    train_acc_history = []    val_acc_history = []    for it in xrange(num_iters):      X_batch = None      y_batch = None      #########################################################################      # TODO: Create a random minibatch of training data and labels, storing  #      # them in X_batch and y_batch respectively.                             #      #########################################################################      sample_index = np.random.choice(num_train, batch_size, replace=True)           X_batch = X[sample_index, :]      y_batch = y[sample_index]      #########################################################################      #                             END OF YOUR CODE                          #      #########################################################################      # Compute loss and gradients using the current minibatch      loss, grads = self.loss(X_batch, y=y_batch, reg=reg)      loss_history.append(loss)      #########################################################################      # TODO: Use the gradients in the grads dictionary to update the         #      # parameters of the network (stored in the dictionary self.params)      #      # using stochastic gradient descent. You'll need to use the gradients   #      # stored in the grads dictionary defined above.                         
#      #########################################################################      dW1 = grads['W1']      dW2 = grads['W2']      db1 = grads['b1']      db2 = grads['b2']      self.params['W1'] -= learning_rate * dW1      self.params['W2'] -= learning_rate * dW2      self.params['b1'] -= learning_rate * db1      self.params['b2'] -= learning_rate * db2      #########################################################################      #                             END OF YOUR CODE                          #      #########################################################################      if verbose and it % 100 == 0:        print('iteration %d / %d: loss %f' % (it, num_iters, loss))      # Every epoch, check train and val accuracy and decay learning rate.      if it % iterations_per_epoch == 0:        # Check accuracy        train_acc = (self.predict(X_batch) == y_batch).mean()        val_acc = (self.predict(X_val) == y_val).mean()        train_acc_history.append(train_acc)        val_acc_history.append(val_acc)        # Decay learning rate        learning_rate *= learning_rate_decay    return {      'loss_history': loss_history,      'train_acc_history': train_acc_history,      'val_acc_history': val_acc_history,    }  def predict(self, X):    """    Use the trained weights of this two-layer network to predict labels for    data points. For each data point we predict scores for each of the C    classes, and assign each data point to the class with the highest score.    Inputs:    - X: A numpy array of shape (N, D) giving N D-dimensional data points to      classify.    Returns:    - y_pred: A numpy array of shape (N,) giving predicted labels for each of      the elements of X. For all i, y_pred[i] = c means that X[i] is predicted      to have class c, where 0 <= c < C.    """    y_pred = None    ###########################################################################    # TODO: Implement this function; it should be VERY simple!                
#    ###########################################################################    hidden_lay = np.maximum(0, np.dot(X, self.params['W1']) + self.params['b1'])    y_pred = np.argmax(np.dot(hidden_lay, self.params['W2']), axis=1)    ###########################################################################    #                              END OF YOUR CODE                           #    ###########################################################################    return y_pred

TwoLayersNet_test.py

from __future__ import print_functionimport numpy as npimport matplotlib.pyplot as pltfrom cs231n.classifiers.neural_net import TwoLayerNet'''return ralatives error'''def rel_error(x,y):    return np.max(np.abs(x-y)/(np.maximum(1e-8,np.abs(x)+np.abs(y))))'''#creat a small net and some toy data to check your implementationsinput_size = 4hidden_size = 10num_classes = 3num_inputs = 5def init_model():    np.random.seed(0)    return TwoLayerNet(input_size, hidden_size, num_classes, std=1e-1)def init_data():    np.random.seed(1)    x = 10*np.random.randn(num_inputs,input_size)    #y = ([0, 1, 2, 2, 1])    y = np.array([0,1,2,2,1])    return x,yx,y = init_data()net = init_model()scores = net.loss(x)print (scores)print('correct scores:')correct_scores = np.asarray([  [-0.81233741, -1.27654624, -0.70335995],  [-0.17129677, -1.18803311, -0.47310444],  [-0.51590475, -1.01354314, -0.8504215 ],  [-0.15419291, -0.48629638, -0.52901952],  [-0.00618733, -0.12435261, -0.15226949]])print(correct_scores)print('Difference between your scores and correct scores:')print(np.sum(np.abs(scores - correct_scores)))loss, _ = net.loss(x, y, reg=0.05)correct_loss = 1.30378789133print ('loss:%f' % loss)# should be very small, we get < 1e-12print('Difference between your loss and correct loss:')print(np.sum(np.abs(loss - correct_loss)))from cs231n.gradient_check import eval_numerical_gradient# Use numeric gradient checking to check your implementation of the backward pass.# If your implementation is correct, the difference between the numeric and# analytic gradients should be less than 1e-8 for each of W1, W2, b1, and b2.loss, grads = net.loss(x, y, reg=0.05)# these should all be less than 1e-8 or sofor param_name in grads:    f = lambda W: net.loss(x, y, reg=0.05)[0]    param_grad_num = eval_numerical_gradient(f, net.params[param_name], verbose=False)    print('%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name])))net = init_model()stats = net.train(x, y, x, y,  
          learning_rate=1e-1, reg=5e-6,            num_iters=100,  batch_size=4,verbose=False)print('Final training loss: ', stats['loss_history'][-1])# plot the loss historyplt.plot(stats['loss_history'])plt.xlabel('iteration')plt.ylabel('training loss')plt.title('Training Loss history')plt.show()'''from cs231n.data_utils import load_CIFAR10def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):    """    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare    it for the two-layer neural net classifier. These are the same steps as    we used for the SVM, but condensed to a single function.    """    # Load the raw CIFAR-10 data    cifar10_dir = 'data/cifar10'    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)    # Subsample the data    mask = list(range(num_training, num_training + num_validation))    X_val = X_train[mask]    y_val = y_train[mask]    mask = list(range(num_training))    X_train = X_train[mask]    y_train = y_train[mask]    mask = list(range(num_test))    X_test = X_test[mask]    y_test = y_test[mask]    # Normalize the data: subtract the mean image    mean_image = np.mean(X_train, axis=0)    X_train -= mean_image    X_val -= mean_image    X_test -= mean_image    # Reshape data to rows    X_train = X_train.reshape(num_training, -1)    X_val = X_val.reshape(num_validation, -1)    X_test = X_test.reshape(num_test, -1)    return X_train, y_train, X_val, y_val, X_test, y_test# Invoke the above function to get our data.x_train, y_train, x_val, y_val, x_test, y_test = get_CIFAR10_data()print('Train data shape: ', x_train.shape)print('Train labels shape: ', y_train.shape)print('Validation data shape: ', x_val.shape)print('Validation labels shape: ', y_val.shape)print('Test data shape: ', x_test.shape)print('Test labels shape: ', y_test.shape)input_size = 32*32*3hidden_size = 50num_classes = 10'''net = TwoLayerNet(input_size, hidden_size, num_classes)stats = net.train(x_train, y_train, x_val, y_val,     
             learning_rate=1e-4, learning_rate_decay=0.95, reg=0.25,                  num_iters = 10000, batch_size=200,verbose= True)# Predict on the validation setval_acc = (net.predict(x_val) == y_val).mean()print('Validation accuracy: ', val_acc)# Plot the loss function and train / validation accuraciesplt.subplot(2, 1, 1)plt.plot(stats['loss_history'])plt.title('Loss history')plt.xlabel('Iteration')plt.ylabel('Loss')plt.subplot(2, 1, 2)plt.plot(stats['train_acc_history'], label='train')plt.plot(stats['val_acc_history'], label='val')plt.title('Classification accuracy history')plt.xlabel('Epoch')plt.ylabel('Clasification accuracy')plt.show()from cs231n.vis_utils import visualize_grid# Visualize the weights of the networkdef show_net_weights(net):    W1 = net.params['W1']    W1 = W1.reshape(32, 32, 3, -1).transpose(3, 0, 1, 2)    plt.imshow(visualize_grid(W1, padding=3).astype('uint8'))    plt.gca().axis('off')    plt.show()show_net_weights(net)'''best_net = None # store the best model into this \best_acc = 0hidden_size_choice = [x*100+50 for x in xrange(11)]reg_choice = [0.1, 0.5, 5, 15, 50, 100, 1000]learning_rate_choice = [1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 1e-1, 1]batch_size_choice = [8, 40, 80, 160, 500, 1000]#hidden_size_choice = [400]#learning_rate_choice = [3e-3]#reg_choice = [0.02, 0.05, 0.1]#batch_size_choice =[500]num_iters_choice = [5000]for batch_size_curr in batch_size_choice:    for reg_cur in reg_choice:        for learning_rate_curr in learning_rate_choice:            for hidden_size_curr in hidden_size_choice:                for num_iters_curr in num_iters_choice:                    print                    print ("current training hidden_size:",hidden_size_curr)                    print ("current training learning_rate:",learning_rate_curr)                    print ("current training reg:",reg_cur)                    print ("current training batch_size:",batch_size_curr)                    net = TwoLayerNet(input_size, hidden_size_curr, num_classes) 
                   stats = net.train(x_train, y_train, x_val, y_val,                                           learning_rate=learning_rate_curr, learning_rate_decay=0.95,reg=reg_cur,                                           num_iters=num_iters_curr, batch_size=batch_size_curr, verbose=True)                    val_acc = (net.predict(x_val) == y_val).mean()                    print ("current val_acc:%f" % val_acc)                    if val_acc>best_acc:                        best_acc = val_acc                        best_net = net                        best_stats = stats                        best_learning_rate = learning_rate_curr                        best_reg = reg_cur                        best_batch_size = batch_size_curr                        print                        print ("best_acc:",best_acc)                        print ("best hidden_size:",best_net.params['W1'].shape[1])                        print ("best learning_rate:",best_learning_rate)                        print ("best reg:",best_reg)                        print ("best batch_size:",best_batch_size)                        print

阅读全文

0 0