1 参数


















8)学习率(learning rate):η


9)Regularization parameter:正则化参数λ





一般的参数调试流程为:隐藏层的层数 -> 学习率 η -> 正则化参数 λ(Regularization parameter)-> mini_batch_size -> 其他。其中"其他"指其余的超参数(hyper-parameter),包括激活函数(sigmoid、tanh、ReLU 等)、cost 函数(二次、cross-entropy 等),以及随机梯度下降的其他变种方法(如 Hessian 优化等)。

2 参数程序调试


2.1  network.py部分

"""
network.py
~~~~~~~~~~

A module to implement the stochastic gradient descent learning
algorithm for a feedforward neural network.  Gradients are calculated
using backpropagation.  Note that I have focused on making the code
simple, easily readable, and easily modifiable.  It is not optimized,
and omits many desirable features.
"""

#### Libraries
# Standard library
import random

# Third-party libraries
import numpy as np

# NOTE (from the article): the inline annotations were originally written in
# Chinese and had to be stripped before the file would run, because the
# interpreter sometimes failed to recognise the characters.  They are given in
# English here so the module can be copied as-is.


# Define the network class.
class Network(object):
    """Feedforward neural network trained with mini-batch SGD."""

    def __init__(self, sizes):
        """The list ``sizes`` contains the number of neurons in the
        respective layers of the network.  For example, if the list
        was [2, 3, 1] then it would be a three-layer network, with the
        first layer containing 2 neurons, the second layer 3 neurons,
        and the third layer 1 neuron.  The biases and weights for the
        network are initialized randomly, using a Gaussian
        distribution with mean 0, and variance 1.  Note that the first
        layer is assumed to be an input layer, and by convention we
        won't set any biases for those neurons, since biases are only
        ever used in computing the outputs from later layers."""
        self.num_layers = len(sizes)  # number of layers in the network
        self.sizes = sizes
        # Biases and weights are drawn from a standard normal distribution.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output of the network if ``a`` is input."""
        # Forward pass: every layer applies the sigmoid activation.
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    # Stochastic gradient descent.  Parameters: training set, number of
    # epochs, mini-batch size, learning rate, test set.
    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):
        """Train the neural network using mini-batch stochastic
        gradient descent.  The ``training_data`` is a list of tuples
        ``(x, y)`` representing the training inputs and the desired
        outputs.  The other non-optional parameters are
        self-explanatory.  If ``test_data`` is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out.  This is useful for
        tracking progress, but slows things down substantially.

        Any iterable of ``(x, y)`` pairs is accepted for both data
        sets; a non-list iterable is copied into a list first, while a
        list is shuffled in place exactly as before."""
        # ``len()`` and ``random.shuffle`` need a real sequence.
        if not isinstance(training_data, list):
            training_data = list(training_data)
        if test_data is not None and not isinstance(test_data, list):
            test_data = list(test_data)
        n_test = len(test_data) if test_data else 0
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            # Split the training data into chunks of ``mini_batch_size``.
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            if test_data:
                print("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        """Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
        is the learning rate."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # Force float division: with an integer ``eta`` Python 2 would floor
        # the step size (usually to 0) and the network would never learn.
        step = eta / float(len(mini_batch))
        self.weights = [w-step*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-step*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book.  Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """Return the number of test inputs for which the neural
        network outputs the correct result. Note that the neural
        network's output is assumed to be the index of whichever
        neuron in the final layer has the highest activation."""
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        r"""Return the vector of partial derivatives \partial C_x /
        \partial a for the output activations."""
        return (output_activations-y)


#### Miscellaneous functions
def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))


def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))

2.1.1 改变神经网络的层数:增加层数,运行看准确率

import mnist_loader
import network
import network2

training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

# Parameter settings: no hidden layer, 784 inputs feeding 10 outputs directly.
net = network.Network([784, 10])
net.SGD(training_data, epochs=30, mini_batch_size=10, eta=3.0,
        test_data=test_data)


# One hidden layer of 30 neurons between the 784 inputs and the 10 outputs.
net = network.Network([784, 30, 10])
net.SGD(training_data, epochs=30, mini_batch_size=10, eta=3.0,
        test_data=test_data)


# Two hidden layers of 30 neurons each.
net = network.Network([784, 30, 30, 10])
net.SGD(training_data, epochs=30, mini_batch_size=10, eta=3.0,
        test_data=test_data)

2.1.2 改变参数:学习率

# Same two-hidden-layer network, with the learning rate lowered to 0.1.
net = network.Network([784, 30, 30, 10])
net.SGD(training_data, epochs=30, mini_batch_size=10, eta=0.1,
        test_data=test_data)

2.2 network2.py 部分

本部分主要改变参数:cost代价函数,权重初始化,并都加入了Regularization。network2.py 的代码如下:

"""network2.py
~~~~~~~~~~~~~~

An improved version of network.py, implementing the stochastic
gradient descent learning algorithm for a feedforward neural network.
Improvements include the addition of the cross-entropy cost function,
regularization, and better initialization of network weights.  Note
that I have focused on making the code simple, easily readable, and
easily modifiable.  It is not optimized, and omits many desirable
features.

"""

#### Libraries
# Standard library
import json
import random
import sys

# Third-party libraries
import numpy as np


#### Define the quadratic and cross-entropy cost functions

class QuadraticCost(object):
    """Quadratic cost function."""

    @staticmethod
    def fn(a, y):
        """Return the cost associated with an output ``a`` and desired output
        ``y``.

        """
        return 0.5*np.linalg.norm(a-y)**2

    @staticmethod
    def delta(z, a, y):
        """Return the error delta from the output layer."""
        return (a-y) * sigmoid_prime(z)


class CrossEntropyCost(object):
    """Cross-entropy cost function."""

    @staticmethod
    def fn(a, y):
        """Return the cost associated with an output ``a`` and desired output
        ``y``.  Note that np.nan_to_num is used to ensure numerical
        stability.  In particular, if both ``a`` and ``y`` have a 1.0
        in the same slot, then the expression (1-y)*np.log(1-a)
        returns nan.  The np.nan_to_num ensures that that is converted
        to the correct value (0.0).

        """
        return np.sum(np.nan_to_num(-y*np.log(a)-(1-y)*np.log(1-a)))

    @staticmethod
    def delta(z, a, y):
        """Return the error delta from the output layer.  Note that the
        parameter ``z`` is not used by the method.  It is included in
        the method's parameters in order to make the interface
        consistent with the delta method for other cost classes.

        """
        # Derivative of the cross-entropy cost.
        return (a-y)


#### Main Network class
class Network(object):
    """Feedforward network with a pluggable cost and L2 regularization."""

    def __init__(self, sizes, cost=CrossEntropyCost):
        """The list ``sizes`` contains the number of neurons in the respective
        layers of the network.  For example, if the list was [2, 3, 1]
        then it would be a three-layer network, with the first layer
        containing 2 neurons, the second layer 3 neurons, and the
        third layer 1 neuron.  The biases and weights for the network
        are initialized randomly, using
        ``self.default_weight_initializer`` (see docstring for that
        method).

        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()
        self.cost=cost

    # Default initializer; unlike network.py it rescales the variance.
    def default_weight_initializer(self):
        """Initialize each weight using a Gaussian distribution with mean 0
        and standard deviation 1 over the square root of the number of
        weights connecting to the same neuron.  Initialize the biases
        using a Gaussian distribution with mean 0 and standard
        deviation 1.

        Note that the first layer is assumed to be an input layer, and
        by convention we won't set any biases for those neurons, since
        biases are only ever used in computing the outputs from later
        layers.

        """
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        # Dividing by sqrt(x) is the variance rescaling.
        self.weights = [np.random.randn(y, x)/np.sqrt(x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    # Same initialization as network.py; differs from the default above.
    def large_weight_initializer(self):
        """Initialize the weights using a Gaussian distribution with mean 0
        and standard deviation 1.  Initialize the biases using a
        Gaussian distribution with mean 0 and standard deviation 1.

        Note that the first layer is assumed to be an input layer, and
        by convention we won't set any biases for those neurons, since
        biases are only ever used in computing the outputs from later
        layers.

        This weight and bias initializer uses the same approach as in
        Chapter 1, and is included for purposes of comparison.  It
        will usually be better to use the default weight initializer
        instead.

        """
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    # The code below is the same as in network.py.
    def feedforward(self, a):
        """Return the output of the network if ``a`` is input."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    # This SGD differs from the earlier one by adding a regularization term.
    def SGD(self, training_data, epochs, mini_batch_size, eta,
            lmbda = 0.0,
            evaluation_data=None,
            monitor_evaluation_cost=False,
            monitor_evaluation_accuracy=False,
            monitor_training_cost=False,
            monitor_training_accuracy=False):
        """Train the neural network using mini-batch stochastic gradient
        descent.  The ``training_data`` is a list of tuples ``(x, y)``
        representing the training inputs and the desired outputs.  The
        other non-optional parameters are self-explanatory, as is the
        regularization parameter ``lmbda``.  The method also accepts
        ``evaluation_data``, usually either the validation or test
        data.  We can monitor the cost and accuracy on either the
        evaluation data or the training data, by setting the
        appropriate flags.  The method returns a tuple containing four
        lists: the (per-epoch) costs on the evaluation data, the
        accuracies on the evaluation data, the costs on the training
        data, and the accuracies on the training data.  All values are
        evaluated at the end of each training epoch.  So, for example,
        if we train for 30 epochs, then the first element of the tuple
        will be a 30-element list containing the cost on the
        evaluation data at the end of each epoch. Note that the lists
        are empty if the corresponding flag is not set.

        Any iterable of pairs is accepted for both data sets; a
        non-list iterable is copied into a list first, while a list is
        shuffled in place exactly as before.

        """
        # ``len()`` and ``random.shuffle`` need a real sequence.
        if not isinstance(training_data, list):
            training_data = list(training_data)
        if evaluation_data is not None and \
                not isinstance(evaluation_data, list):
            evaluation_data = list(evaluation_data)
        n_data = len(evaluation_data) if evaluation_data else 0
        n = len(training_data)
        evaluation_cost, evaluation_accuracy = [], []
        training_cost, training_accuracy = [], []
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(
                    mini_batch, eta, lmbda, len(training_data))
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Epoch %s training complete" % j)
            if monitor_training_cost:
                cost = self.total_cost(training_data, lmbda)
                training_cost.append(cost)
                print("Cost on training data: {}".format(cost))
            if monitor_training_accuracy:
                accuracy = self.accuracy(training_data, convert=True)
                training_accuracy.append(accuracy)
                print("Accuracy on training data: {} / {}".format(
                    accuracy, n))
            if monitor_evaluation_cost:
                cost = self.total_cost(evaluation_data, lmbda, convert=True)
                evaluation_cost.append(cost)
                print("Cost on evaluation data: {}".format(cost))
            if monitor_evaluation_accuracy:
                accuracy = self.accuracy(evaluation_data)
                evaluation_accuracy.append(accuracy)
                # Reuse the value just computed instead of running a second
                # full pass over the evaluation data.
                print("Accuracy on evaluation data: {} / {}".format(
                    accuracy, n_data))
            print("")
        return evaluation_cost, evaluation_accuracy, \
            training_cost, training_accuracy

    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        """Update the network's weights and biases by applying gradient
        descent using backpropagation to a single mini batch.  The
        ``mini_batch`` is a list of tuples ``(x, y)``, ``eta`` is the
        learning rate, ``lmbda`` is the regularization parameter, and
        ``n`` is the total size of the training data set.

        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # Force float division: under Python 2 an integer ``lmbda`` or
        # ``eta`` would be floored, silently switching off regularization
        # or learning.
        step = eta / float(len(mini_batch))
        decay = 1-eta*(lmbda/float(n))  # the regularization term
        self.weights = [decay*w-step*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-step*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = (self.cost).delta(zs[-1], activations[-1], y)
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book.  Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def accuracy(self, data, convert=False):
        """Return the number of inputs in ``data`` for which the neural
        network outputs the correct result. The neural network's
        output is assumed to be the index of whichever neuron in the
        final layer has the highest activation.

        The flag ``convert`` should be set to False if the data set is
        validation or test data (the usual case), and to True if the
        data set is the training data. The need for this flag arises
        due to differences in the way the results ``y`` are
        represented in the different data sets.  In particular, it
        flags whether we need to convert between the different
        representations.  It may seem strange to use different
        representations for the different data sets.  Why not use the
        same representation for all three data sets?  It's done for
        efficiency reasons -- the program usually evaluates the cost
        on the training data and the accuracy on other data sets.
        These are different types of computations, and using different
        representations speeds things up.  More details on the
        representations can be found in
        mnist_loader.load_data_wrapper.

        """
        if convert:
            results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                       for (x, y) in data]
        else:
            results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in data]
        return sum(int(x == y) for (x, y) in results)

    def total_cost(self, data, lmbda, convert=False):
        """Return the total cost for the data set ``data``.  The flag
        ``convert`` should be set to False if the data set is the
        training data (the usual case), and to True if the data set is
        the validation or test data.  See comments on the similar (but
        reversed) convention for the ``accuracy`` method, above.

        """
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            if convert: y = vectorized_result(y)
            cost += self.cost.fn(a, y)/len(data)
        # float() keeps an integer ``lmbda`` from being floored on Python 2.
        cost += 0.5*(lmbda/float(len(data)))*sum(
            np.linalg.norm(w)**2 for w in self.weights)
        return cost

    def save(self, filename):
        """Save the neural network to the file ``filename``."""
        data = {"sizes": self.sizes,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)}
        # ``with`` closes the file even if serialization raises.
        with open(filename, "w") as f:
            json.dump(data, f)


#### Loading a Network
def load(filename):
    """Load a neural network from the file ``filename``.  Returns an
    instance of Network.

    """
    with open(filename, "r") as f:
        data = json.load(f)
    # The cost class is looked up by name in this module.
    cost = getattr(sys.modules[__name__], data["cost"])
    net = Network(data["sizes"], cost=cost)
    net.weights = [np.array(w) for w in data["weights"]]
    net.biases = [np.array(b) for b in data["biases"]]
    return net


#### Miscellaneous functions
def vectorized_result(j):
    """Return a 10-dimensional unit vector with a 1.0 in the j'th position
    and zeroes elsewhere.  This is used to convert a digit (0...9)
    into a corresponding desired output from the neural network.

    """
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e


def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))


def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))


2.2.1 改变cost函数为cross-entropy函数和加入regularization


2.2.2 改变权重和偏向初始化和加入regularization


2.3 network3.py 部分

#### Libraries
# Standard library
import cPickle
import gzip

# Third-party libraries
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv
from theano.tensor.nnet import softmax
from theano.tensor import shared_randomstreams
from theano.tensor.signal import downsample

# Activation functions for neurons
def linear(z): return z
def ReLU(z): return T.maximum(0.0, z)
from theano.tensor.nnet import sigmoid
from theano.tensor import tanh


#### Constants
GPU = True  # run on the GPU
if GPU:
    print("Trying to run under a GPU.  If this is not desired, then modify "+\
        "network3.py\nto set the GPU flag to False.")
    # Best effort: the assignment may raise when the device is already set.
    # Catch Exception rather than everything, so Ctrl-C still works.
    try:
        theano.config.device = 'gpu'
    except Exception:
        pass # it's already set
    theano.config.floatX = 'float32'
else:
    print("Running with a CPU.  If this is not desired, then the modify "+\
        "network3.py to set\nthe GPU flag to True.")


#### Load the MNIST data
def load_data_shared(filename="../data/mnist.pkl.gz"):
    """Load the gzipped pickle ``filename`` and return a list
    ``[training, validation, test]``, where each entry is the pair
    returned by the nested ``shared`` helper for that data set.
    """
    # NOTE: unpickling can execute arbitrary code; only load data files
    # that come from a trusted source.
    # ``with`` closes the file even if unpickling fails.
    with gzip.open(filename, 'rb') as f:
        training_data, validation_data, test_data = cPickle.load(f)
    def shared(data):
        """Place the data into shared variables.  This allows Theano to copy
        the data to the GPU, if one is available.

        """
        shared_x = theano.shared(
            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, "int32")
    return [shared(training_data), shared(validation_data), shared(test_data)]


#### Main class used to construct and train networks
class Network(object):

    def __init__(self, layers, mini_batch_size):
        """Takes a list of `layers`, describing the network architecture, and
        a value for the `mini_batch_size` to be used during training
        by stochastic gradient descent.

        """
        self.layers = layers
        self.mini_batch_size = mini_batch_size
        self.params = [param for layer in self.layers for param in layer.params]
        self.x = T.matrix("x")
        self.y = T.ivector("y")
        init_layer = self.layers[0]
        init_layer.set_inpt(self.x, self.x, self.mini_batch_size)
        # Chain the layers: each one takes both the plain and the dropout
        # output of its predecessor.
        for j in range(1, len(self.layers)):
            prev_layer, layer  = self.layers[j-1], self.layers[j]
            layer.set_inpt(
                prev_layer.output, prev_layer.output_dropout, self.mini_batch_size)
        self.output = self.layers[-1].output
        self.output_dropout = self.layers[-1].output_dropout

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            validation_data, test_data, lmbda=0.0):
        """Train the network using mini-batch stochastic gradient descent.

        Prints progress while training and, at the end, the best
        validation accuracy seen.  The test accuracy is reported for
        the iteration that achieved it, when ``test_data`` is truthy.
        """
        training_x, training_y = training_data
        validation_x, validation_y = validation_data
        test_x, test_y = test_data

        # compute number of minibatches for training, validation and testing
        # (floor division, so the counts are ints usable as loop bounds)
        # NOTE(review): the counts use the ``mini_batch_size`` argument while
        # the slices below use ``self.mini_batch_size``; callers presumably
        # pass the same value for both -- confirm.
        num_training_batches = size(training_data)//mini_batch_size
        num_validation_batches = size(validation_data)//mini_batch_size
        num_test_batches = size(test_data)//mini_batch_size

        # define the (regularized) cost function, symbolic gradients, and updates
        l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers])
        cost = self.layers[-1].cost(self)+\
               0.5*lmbda*l2_norm_squared/num_training_batches
        grads = T.grad(cost, self.params)
        updates = [(param, param-eta*grad)
                   for param, grad in zip(self.params, grads)]

        # define functions to train a mini-batch, and to compute the
        # accuracy in validation and test mini-batches.
        i = T.lscalar() # mini-batch index
        train_mb = theano.function(
            [i], cost, updates=updates,
            givens={
                self.x:
                training_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
                self.y:
                training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
        validate_mb_accuracy = theano.function(
            [i], self.layers[-1].accuracy(self.y),
            givens={
                self.x:
                validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
                self.y:
                validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
        test_mb_accuracy = theano.function(
            [i], self.layers[-1].accuracy(self.y),
            givens={
                self.x:
                test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
                self.y:
                test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
        self.test_mb_predictions = theano.function(
            [i], self.layers[-1].y_out,
            givens={
                self.x:
                test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
        # Do the actual training
        best_validation_accuracy = 0.0
        # Pre-bind so the summary below cannot raise NameError when no epoch
        # completes or the test branch is never taken.
        best_iteration = None
        test_accuracy = None
        for epoch in range(epochs):
            for minibatch_index in range(num_training_batches):
                iteration = num_training_batches*epoch+minibatch_index
                if iteration % 1000 == 0:
                    print("Training mini-batch number {0}".format(iteration))
                cost_ij = train_mb(minibatch_index)
                if (iteration+1) % num_training_batches == 0:
                    validation_accuracy = np.mean(
                        [validate_mb_accuracy(j) for j in range(num_validation_batches)])
                    print("Epoch {0}: validation accuracy {1:.2%}".format(
                        epoch, validation_accuracy))
                    if validation_accuracy >= best_validation_accuracy:
                        print("This is the best validation accuracy to date.")
                        best_validation_accuracy = validation_accuracy
                        best_iteration = iteration
                        if test_data:
                            test_accuracy = np.mean(
                                [test_mb_accuracy(j) for j in range(num_test_batches)])
                            print('The corresponding test accuracy is {0:.2%}'.format(
                                test_accuracy))
        print("Finished training network.")
        print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format(
            best_validation_accuracy, best_iteration))
        if test_accuracy is not None:
            print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))


#### Define layer types

class ConvPoolLayer(object):
    """Used to create a combination of a convolutional and a max-pooling
    layer.  A more sophisticated implementation would separate the
    two, but for our purposes we'll always use them together, and it
    simplifies the code, so it makes sense to combine them.

    """

    def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
                 activation_fn=sigmoid):
        """`filter_shape` is a tuple of length 4, whose entries are the number
        of filters, the number of input feature maps, the filter height, and the
        filter width.

        `image_shape` is a tuple of length 4, whose entries are the
        mini-batch size, the number of input feature maps, the image
        height, and the image width.

        `poolsize` is a tuple of length 2, whose entries are the y and
        x pooling sizes.

        """
        self.filter_shape = filter_shape
        self.image_shape = image_shape
        self.poolsize = poolsize
        self.activation_fn=activation_fn
        # initialize weights and biases
        n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize))
        self.w = theano.shared(
            np.asarray(
                np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape),
                dtype=theano.config.floatX),
            borrow=True)
        self.b = theano.shared(
            np.asarray(
                np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)),
                dtype=theano.config.floatX),
            borrow=True)
        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        """Wire ``inpt`` through convolution, max-pooling and the
        activation function.  ``inpt_dropout`` and ``mini_batch_size``
        are accepted for interface parity with the other layers but
        are not used here."""
        self.inpt = inpt.reshape(self.image_shape)
        conv_out = conv.conv2d(
            input=self.inpt, filters=self.w, filter_shape=self.filter_shape,
            image_shape=self.image_shape)
        pooled_out = downsample.max_pool_2d(
            input=conv_out, ds=self.poolsize, ignore_border=True)
        self.output = self.activation_fn(
            pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
        self.output_dropout = self.output # no dropout in the convolutional layers


class FullyConnectedLayer(object):
    """Fully connected layer with optional dropout on its input."""

    def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0):
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = activation_fn
        self.p_dropout = p_dropout
        # Initialize weights and biases
        self.w = theano.shared(
            np.asarray(
                np.random.normal(
                    loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
                dtype=theano.config.floatX),
            name='w', borrow=True)
        self.b = theano.shared(
            np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)),
                       dtype=theano.config.floatX),
            name='b', borrow=True)
        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        """Build both outputs of the layer: ``output`` scales the
        weighted input by ``1-p_dropout``, while ``output_dropout``
        applies a dropout mask to ``inpt_dropout`` instead."""
        self.inpt = inpt.reshape((mini_batch_size, self.n_in))
        self.output = self.activation_fn(
            (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
        self.y_out = T.argmax(self.output, axis=1)
        self.inpt_dropout = dropout_layer(
            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
        self.output_dropout = self.activation_fn(
            T.dot(self.inpt_dropout, self.w) + self.b)

    def accuracy(self, y):
        "Return the accuracy for the mini-batch."
        return T.mean(T.eq(y, self.y_out))


class SoftmaxLayer(object):
    """Softmax output layer; weights and biases start at zero."""

    def __init__(self, n_in, n_out, p_dropout=0.0):
        self.n_in = n_in
        self.n_out = n_out
        self.p_dropout = p_dropout
        # Initialize weights and biases
        self.w = theano.shared(
            np.zeros((n_in, n_out), dtype=theano.config.floatX),
            name='w', borrow=True)
        self.b = theano.shared(
            np.zeros((n_out,), dtype=theano.config.floatX),
            name='b', borrow=True)
        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        """Build both outputs of the layer: ``output`` scales the
        weighted input by ``1-p_dropout``, while ``output_dropout``
        applies a dropout mask to ``inpt_dropout`` instead."""
        self.inpt = inpt.reshape((mini_batch_size, self.n_in))
        self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
        self.y_out = T.argmax(self.output, axis=1)
        self.inpt_dropout = dropout_layer(
            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
        self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b)

    def cost(self, net):
        "Return the log-likelihood cost."
        return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])

    def accuracy(self, y):
        "Return the accuracy for the mini-batch."
        return T.mean(T.eq(y, self.y_out))


#### Miscellanea
def size(data):
    "Return the size of the dataset `data`."
    return data[0].get_value(borrow=True).shape[0]


def dropout_layer(layer, p_dropout):
    """Return ``layer`` multiplied elementwise by a random 0/1 mask whose
    entries are 1 with probability ``1-p_dropout``."""
    srng = shared_randomstreams.RandomStreams(
        np.random.RandomState(0).randint(999999))
    mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape)
    return layer*T.cast(mask, theano.config.floatX)
import network3
from network3 import Network
from network3 import ConvPoolLayer,FullyConnectedLayer,SoftmaxLayer
# NOTE(review): this imported value is overwritten by the assignment to
# ``mini_batch_size`` below, and the name ``conv`` is rebound by the theano
# import a few lines down -- looks like a leftover; confirm before removing.
from conv import mini_batch_size
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv
from theano.tensor.nnet import softmax
from theano.tensor import shared_randomstreams
from theano.tensor.signal import downsample


def ReLU(z):
    """Rectified linear activation: elementwise max(0, z)."""
    return T.maximum(0.0, z)


training_data, validation_data, test_data = network3.load_data_shared()
mini_batch_size = 10
# Train on the expanded data set; its validation/test parts are discarded.
expanded_training_data, _, _ = network3.load_data_shared(
    "../data/mnist_expanded.pkl.gz")

# Two convolution + pooling stages, two fully connected layers with dropout,
# then a softmax output layer.
layers = [
    ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
                  filter_shape=(20, 1, 5, 5),
                  poolsize=(2, 2),
                  activation_fn=ReLU),
    ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12),
                  filter_shape=(40, 20, 5, 5),
                  poolsize=(2, 2),
                  activation_fn=ReLU),
    FullyConnectedLayer(
        n_in=40*4*4, n_out=1000, activation_fn=ReLU, p_dropout=0.5),
    FullyConnectedLayer(
        n_in=1000, n_out=1000, activation_fn=ReLU, p_dropout=0.5),
    SoftmaxLayer(n_in=1000, n_out=10, p_dropout=0.5),
]
net = Network(layers, mini_batch_size)
net.SGD(expanded_training_data, epochs=40, mini_batch_size=mini_batch_size,
        eta=0.03, validation_data=validation_data, test_data=test_data)

