Repeat the Wheel of Deep Neural Networks

来源：互联网发布：手机屏幕亮点修复软件编辑：程序博客网时间：2024/06/17 00:33

提要

在上一篇中，我们简单了解了代价函数。现在我们要构建一个基于反向传播算法的基本网络，这个网络优化的目标，就是我们上一节讨论过的代价函数。

基本结构

一个基本的神经网络层包含了线性计算和激活函数两个部分。
我们将这一层的输入记为 $X_{B \times F_I}$，其中 $B$ 为每次训练的 batch size，$F_I$ 为每个数据中 feature 的数目。该层输出记为 $Y_{B \times F_O}$，其中 $F_O$ 是输出的 feature 数目。
由于都是矩阵计算，下表将涉及到的矩阵大小做一个简单的归纳。

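| Symbol | Size | Notes |
| --- | --- | --- |
| $X$ | $B \times F_I$ | input data |
| $Y$ | $B \times F_O$ | output data |
| $W$ | $F_I \times F_O$ | weights |
| $b$ | $1 \times F_O$ | bias |
| $f(x)$ | NA | activation function |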

正向传播和反向传播

正向传播比较简单，就是线性计算和激活函数的级联。

$$
\begin{aligned}
S &= X \times W + b \\
Y &= f(S)
\end{aligned}
$$

反向传播是以 $dY$ 为输入、$dX$ 为输出的计算过程。

$$
\begin{aligned}
dS &= dY \circ f'(S) \\
dX &= dS \times W^{T}
\end{aligned}
$$

在反向传播的同时，更新本层的系数。下面以Batch Mode SGD为例，说明系数更新的过程。（$\alpha$ 为Learning Rate）

$$
\begin{aligned}
W &= W - \frac{\alpha}{B} \left( X^{T} \times dS \right) \\
b &= b - \frac{\alpha}{B} \left( I_{1 \times B} \times dS \right)
\end{aligned}
$$

上面的运算中，$A \times B$ 表示矩阵乘法（Matrix Multiplication），$A \circ B$ 表示元素乘法（Elementwise Multiplication），$I_{1 \times B}$ 表示长度为 $B$ 的全1行向量。

代码

初始化

from __future__ import divisionimport numpy as np

激活函数和代价函数

#%% activation functions

# Lower bound applied to probabilities before taking a logarithm, so that a
# saturated prediction (exactly 0) yields a large finite loss instead of
# 0 * log(0) = NaN.  Values at or above this bound are left untouched.
_LOG_EPS = 1e-12


def sigmoid(x):
    """Return the logistic sigmoid of ``x`` and its elementwise derivative.

    Returns:
        Tuple ``(y, g)`` with ``y = 1 / (1 + exp(-x))`` and ``g = y * (1 - y)``.
    """
    y = 1.0 / (1.0 + np.exp(-x))
    return y, y * (1.0 - y)


def softmax(x):
    """Return the row-wise softmax of a 2-D array and a placeholder gradient.

    The maximum of every row is subtracted before exponentiating.  This does
    not change the mathematical result but keeps ``np.exp`` from overflowing
    on large inputs.

    Returns:
        Tuple ``(y, g)``.  ``g`` is an array of ones shaped like ``y``; the
        softmax Jacobian is not computed here (``simple_nn_model.train``
        seeds back-propagation with ``y - y_`` instead).
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    s = np.exp(shifted)
    y = s / np.sum(s, axis=1, keepdims=True)
    return y, np.ones_like(y)


def relu(x):
    """Return ``max(x, 0)`` and its elementwise derivative.

    Returns:
        Tuple ``(y, g)`` where ``y`` is a copy of ``x`` with non-positive
        entries zeroed and ``g`` is 1 where ``x > 0`` and 0 elsewhere.
    """
    non_positive = x <= 0
    y = np.copy(x)
    g = np.ones_like(x)
    y[non_positive] = 0
    g[non_positive] = 0
    return y, g


def softmax_cross_entropy(y, y_):
    """Mean over rows of ``-sum(y_ * log(y))``.

    Args:
        y: predicted probabilities, one row per sample.
        y_: target distribution with the same shape as ``y``.
    """
    log_y = np.log(np.maximum(y, _LOG_EPS))
    return -np.mean(np.sum(y_ * log_y, axis=1))


def sigmoid_cross_entropy(y, y_):
    """Mean over rows of the summed binary cross-entropy.

    Args:
        y: predicted probabilities, one row per sample.
        y_: targets with the same shape as ``y``.
    """
    log_y = np.log(np.maximum(y, _LOG_EPS))
    log_not_y = np.log(np.maximum(1 - y, _LOG_EPS))
    return -np.mean(np.sum(y_ * log_y + (1 - y_) * log_not_y, axis=1))


def error_prob(y, y_):
    """Fraction of rows whose arg-max in ``y`` differs from that in ``y_``."""
    # np.mean of the boolean mismatch mask is a true ratio whether or not
    # ``from __future__ import division`` is in effect.
    return np.mean(np.argmax(y, axis=1) != np.argmax(y_, axis=1))


class activation_layer(object):
    """Activation stage of a network layer.

    Attributes:
        afunc: activation function returning ``(output, gradient)``.
        lfunc: matching loss function; set only for ``'sigmoid'`` and
            ``'softmax'`` (``'relu'`` has no associated loss).
        grad: elementwise derivative cached by the latest ``forward`` call.
    """

    def __init__(self, a_type):
        """Select the activation (and loss, if any) named by ``a_type``.

        Raises:
            ValueError: if ``a_type`` is not ``'sigmoid'``, ``'softmax'`` or
                ``'relu'``.
        """
        if a_type == 'sigmoid':
            self.afunc = sigmoid
            self.lfunc = sigmoid_cross_entropy
        elif a_type == 'softmax':
            self.afunc = softmax
            self.lfunc = softmax_cross_entropy
        elif a_type == 'relu':
            self.afunc = relu
        else:
            # Fail immediately: a layer without ``afunc`` would only break
            # later, inside forward(), with a less helpful error.
            raise ValueError('not supported %s' % (a_type))

    def forward(self, x_):
        """Apply the activation to a copy of ``x_`` and cache its derivative."""
        x = np.copy(x_)
        y, self.grad = self.afunc(x)
        return y

    def backward(self, x, gy):
        """Scale the upstream gradient ``gy`` by the cached derivative.

        ``x`` is accepted for interface parity with ``neural_layer.backward``
        and is not used.
        """
        return self.grad * gy

全连接层的定义

#%% fully connected layer
def weights_init(shape, cfg=None):
    """Draw Gaussian initial weights of the given shape.

    Args:
        shape: sequence of dimensions passed to ``np.random.randn``.
        cfg: optional dict; ``cfg['weight_sigma']`` sets the standard
            deviation, which defaults to 0.01 when the key is absent.
    """
    sigma = 0.01
    if cfg is not None and 'weight_sigma' in cfg:
        sigma = cfg['weight_sigma']
    w = np.random.randn(*shape)
    w *= sigma
    return w


def bias_init(shape, cfg=None):
    """Return an all-zero bias array of the given shape (``cfg`` is unused)."""
    return np.zeros(shape)


class neural_layer(object):
    """Fully connected (affine) layer trained with plain batch SGD.

    Attributes:
        w: weight matrix created by ``weights_init(w_shape)``.
        b: bias array created by ``bias_init(b_shape)``.
        cfg: configuration dict; ``cfg['alpha']`` is read as the learning
            rate on every update.
    """

    def __init__(self, w_shape, b_shape, cfg=None):
        # A fresh dict per instance: a ``dict()`` default would be created
        # once and silently shared by every layer built without ``cfg``.
        if cfg is None:
            cfg = {}
        self.w = weights_init(w_shape, cfg=cfg)
        self.b = bias_init(b_shape, cfg=cfg)
        # Keep the caller's dict by reference (not a copy) so that later
        # changes to cfg['alpha'] made by the caller take effect here.
        self.cfg = cfg

    def forward(self, x):
        """Return ``x . w + b``."""
        return np.dot(x, self.w) + self.b

    def backward(self, x, gy):
        """Back-propagate ``gy`` and update the parameters in place.

        Args:
            x: the input that was fed to ``forward``.
            gy: gradient with respect to this layer's output.

        Returns:
            Gradient with respect to ``x``, computed with the weights as
            they were before this call's update.
        """
        gx = np.dot(gy, self.w.T)
        self.optimize_basic_sgd(x, gy)
        return gx

    def optimize_basic_sgd(self, x, gy):
        """Apply one SGD step using gradients averaged over ``x.shape[0]`` rows.

        Raises:
            KeyError: if ``self.cfg`` has no ``'alpha'`` entry.
        """
        n = x.shape[0]
        alpha = self.cfg['alpha']
        grad_w = np.dot(x.T, gy) / n
        grad_b = np.sum(gy, axis=0) / n
        self.w -= alpha * grad_w
        self.b -= alpha * grad_b

定义一个简单的模型

#%% simple neural network model
class simple_nn_model(object):
    """Feed-forward network built from alternating linear and activation layers.

    Attributes:
        cfg: configuration dict handed to every linear layer.
        layers: flat list ``[linear_0, activation_0, linear_1, ...]``.
        data: set by ``evaluate``; ``data[0]`` is the input and ``data[k + 1]``
            is the output of ``layers[k]``.
        grad: set by ``train``; ``grad[k]`` is the gradient with respect to
            ``data[k]``.
    """

    def __init__(self, n_size, a_type, cfg=None):
        """Build the network.

        Args:
            n_size: node counts, input first; one entry more than ``a_type``.
            a_type: activation name for each linear/activation pair.
            cfg: optional configuration dict shared with every linear layer.
        """
        # A fresh dict per model: a ``dict()`` default would be created once
        # and shared by every model constructed without ``cfg``.
        if cfg is None:
            cfg = {}
        self.cfg = cfg
        self.layers = []
        assert len(n_size) == len(a_type) + 1, 'layer configuration wrong'
        for n_in, n_out, act in zip(n_size[:-1], n_size[1:], a_type):
            self.add_linear_activate_layer(n_in, n_out, act, cfg=cfg)

    def add_linear_activate_layer(self, n_in, n_out, a_type, cfg=None):
        """Append an ``n_in -> n_out`` linear layer followed by its activation."""
        if cfg is None:
            cfg = {}
        self.layers.append(neural_layer([n_in, n_out], [1, n_out], cfg))
        self.layers.append(activation_layer(a_type))

    def evaluate(self, x):
        """Run a forward pass on ``x``, storing every intermediate in ``self.data``."""
        self.data = [x]
        for layer in self.layers:
            self.data.append(layer.forward(self.data[-1]))

    def check(self, y_):
        """Score the most recent ``evaluate`` output against targets ``y_``.

        Returns:
            Tuple ``(pe, E)``: the arg-max error rate and the loss given by
            the last layer's ``lfunc``.
        """
        output = self.data[-1]
        E = self.layers[-1].lfunc(output, y_)
        pe = error_prob(output, y_)
        return pe, E

    def train(self, x, y_):
        """Run one forward/backward pass on the batch ``(x, y_)``.

        Each layer's ``backward`` is called from the second-to-last layer
        down to the first; the linear layers update their own parameters
        inside that call.
        """
        self.evaluate(x)
        n = len(self.layers)
        self.grad = [[] for _ in range(n)]
        # ``output - target`` is used directly as the gradient at the input
        # of the final activation layer, so that layer's backward() is
        # skipped and the loop below starts at index n - 2.
        self.grad[-1] = self.data[-1] - y_
        for k in range(n - 2, -1, -1):
            self.grad[k] = self.layers[k].backward(self.data[k], self.grad[k + 1])

调用

下面的代码将创建一个三层的神经网络，隐藏层的节点数目为100，输出层采用softmax函数。

# Train a 784-100-10 network (ReLU hidden layer, softmax output).
# NOTE(review): ``wh`` and ``mnist`` are not defined in this snippet; they
# are presumably the module holding the classes above and an MNIST data
# loader -- confirm against the surrounding project.
cfg = dict()
cfg['weight_sigma'] = 0.01
cfg['alpha'] = 1
m1 = wh.simple_nn_model([784, 100, 10], ['relu', 'softmax'], cfg=cfg)
for k in range(3000):
    bx, by = mnist.train.next_batch(100)
    m1.train(bx, by)
    if (k + 1) % 100 == 0:
        # Every 100 steps, report error rate and loss on the validation set.
        bx, by = mnist.validation.get()
        m1.evaluate(bx)
        pe, E = m1.check(by)
        # One pre-formatted argument prints "<error>% <loss>" on both
        # Python 2 and Python 3; ``print (...), E`` drops the loss on 3.
        print('%.2f%% %s' % (100 * pe, E))
    if (k + 1) % 1000 == 0:
        # Decay the learning rate tenfold; the layers hold a reference to
        # this same dict and read 'alpha' on every update.
        cfg['alpha'] *= 0.1

上面的三层神经网络在Cross Validation中能够实现96.7%的正确率，这自然是不够的。下面我们将引入更多的优化方法，包括

Regularization
Initialization
Batch Normalization
Optimization methods beyond SGD
Neural network with convolutional layer (CNN)

阅读全文

0 0