Summary of Forward and Backward Propagation Formulas for Multi-Layer Neural Networks

1. Parameter initialization:
# layer_dims -- python array (list) containing the dimensions of each layer in our network
# parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL"
import numpy as np  # np refers to NumPy in every snippet below

parameters = {}
L = len(layer_dims)  # number of layers in the network, including the input layer
for l in range(1, L):
    parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
    parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
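This loop is essentially what initialize_parameters_deep() (referenced in section 4) returns. A minimal, self-contained sketch, with hypothetical layer sizes chosen only for illustration:

import numpy as np

def initialize_parameters_deep(layer_dims):
    # layer_dims[0] is the input size; layer_dims[1:] are the hidden/output layer sizes.
    parameters = {}
    for l in range(1, len(layer_dims)):
        # Small random weights break symmetry; zero biases are fine.
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters

params = initialize_parameters_deep([5, 4, 3])   # hypothetical: 5 inputs, 4 hidden units, 3 outputs
print(params['W1'].shape, params['b1'].shape)    # (4, 5) (4, 1)
print(params['W2'].shape, params['b2'].shape)    # (3, 4) (3, 1)

In general, W^{[l]} has shape (layer_dims[l], layer_dims[l-1]) and b^{[l]} has shape (layer_dims[l], 1).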
2. Linear forward propagation:
# A -- activations from previous layer (or input data): (size of previous layer, number of examples)
# W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
# b -- bias vector, numpy array of shape (size of the current layer, 1)
Z = np.dot(W, A) + b  # W already has shape (current layer, previous layer), so no transpose is needed
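Written as a formula, the linear step for layer l is

Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}

where Z^{[l]} and A^{[l-1]} both have m columns (one per example), and NumPy broadcasting adds b^{[l]} to every column.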

3. Activation functions
# A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
if activation == "sigmoid":        Z, linear_cache = linear_forward(A_prev, W, b)        A, activation_cache = sigmoid(Z)                # sigmoid(z) = 1 / (1 + np.exp(- z))elif activation == "relu":       Z, linear_cache = linear_forward(A_prev, W, b)       A, activation_cache = relu(Z)   
4. The L-layer model
# X -- data, numpy array of shape (input size, number of examples)
# parameters -- output of initialize_parameters_deep()
caches = []
A = X
L = len(parameters) // 2  # number of layers in the neural network
for l in range(1, L):
    A_prev = A
    A, cache = linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation='relu')
    caches.append(cache)
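The loop above only runs the first L-1 ReLU layers. In the full forward pass the output layer is a final linear -> sigmoid step that produces the prediction AL used by sections 5 and 8, roughly:

# Final (L-th) layer: linear -> sigmoid, appended after the loop above.
AL, cache = linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], activation='sigmoid')
caches.append(cache)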
5. Cost function
# AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
# Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)
m = Y.shape[1]
cost = - np.sum(np.multiply(np.log(AL), Y) + np.multiply(np.log(1 - AL), 1 - Y)) / m
cost = np.squeeze(cost)
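These lines compute the cross-entropy cost

J = -\frac{1}{m}\sum_{i=1}^{m}\left[ y^{(i)} \log a^{[L](i)} + (1 - y^{(i)}) \log\left(1 - a^{[L](i)}\right) \right]

and np.squeeze ensures the cost comes back as a plain scalar rather than a 1x1 array.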
6. Linear backward propagation
# dZ -- Gradient of the cost with respect to the linear output (of current layer l)
# cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer
A_prev, W, b = cache
m = A_prev.shape[1]  
dW = np.dot(dZ, A_prev.T) / m
db = np.sum(dZ, axis=1, keepdims=True) / m
dA_prev = np.dot(W.T, dZ)
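These three lines correspond to the standard gradients of the linear step:

dW^{[l]} = \frac{1}{m}\, dZ^{[l]} A^{[l-1]\,T}, \qquad db^{[l]} = \frac{1}{m} \sum_{i=1}^{m} dZ^{[l](i)}, \qquad dA^{[l-1]} = W^{[l]\,T} dZ^{[l]}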


7. Activation backward propagation
linear_cache, activation_cache = cache
if activation == "relu":
    dZ = relu_backward(dA, activation_cache)
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
elif activation == "sigmoid":
    dZ = sigmoid_backward(dA, activation_cache)
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
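relu_backward and sigmoid_backward are also omitted from the summary; both compute dZ = dA * g'(Z) from the cached Z. A minimal sketch, assuming the activation cache stores Z as in section 3:

import numpy as np

def relu_backward(dA, activation_cache):
    Z = activation_cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0                   # relu'(z) = 1 if z > 0, else 0
    return dZ

def sigmoid_backward(dA, activation_cache):
    Z = activation_cache
    s = 1 / (1 + np.exp(-Z))
    return dA * s * (1 - s)          # sigmoid'(z) = s(z) * (1 - s(z))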
8. L-layer model backward propagation
grads = {}
L = len(caches)  # the number of layers
m = AL.shape[1]
Y = Y.reshape(AL.shape)  # after this line, Y is the same shape as AL
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))  # derivative of cost with respect to AL
current_cache = caches[L-1]
grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
for l in reversed(range(L - 1)):
    current_cache = caches[l]
    dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, activation="relu")
    grads["dA" + str(l + 1)] = dA_prev_temp
    grads["dW" + str(l + 1)] = dW_temp
    grads["db" + str(l + 1)] = db_temp
9. Updating the parameters
# parameters -- python dictionary containing your parameters
# grads -- python dictionary containing your gradients, output of L_model_backward
L = len(parameters) // 2  # number of layers in the neural network
for l in range(L):
    # Gradients are stored under keys "dW1" ... "dWL", so the index is l + 1, matching the parameter keys.
    parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
    parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
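The update rule is one step of plain gradient descent with learning rate \alpha (learning_rate):

W^{[l]} := W^{[l]} - \alpha \, dW^{[l]}, \qquad b^{[l]} := b^{[l]} - \alpha \, db^{[l]}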

