Deep Learning (DeepLearning.ai) Course Series Notes: 12. Optimization Algorithms in Practice


Note: some images were lost in reposting. For the original version, see:

http://www.missshi.cn/api/view/blog/59bbcae0e519f50d04000204




So far we have always optimized the cost function with plain (batch) gradient descent.

In this post we will use some more advanced optimization algorithms; they usually speed up convergence and often lead to a better final classification result.

Picture the cost function as a mountainous surface that we want to descend.

First, we need to import the relevant libraries:

import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets

from opt_utils import load_params_and_grads, initialize_parameters, forward_propagation, backward_propagation
from opt_utils import compute_cost, predict, predict_dec, plot_decision_boundary, load_dataset
from testCases import *

%matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

The helper functions from opt_utils are as follows:

def sigmoid(x):
    """Compute the sigmoid of x (activation helper used by forward_propagation)."""
    return 1 / (1 + np.exp(-x))

def relu(x):
    """Compute the ReLU of x (activation helper used by forward_propagation)."""
    return np.maximum(0, x)

def load_params_and_grads(seed=1):
    np.random.seed(seed)
    W1 = np.random.randn(2,3)
    b1 = np.random.randn(2,1)
    W2 = np.random.randn(3,3)
    b2 = np.random.randn(3,1)

    dW1 = np.random.randn(2,3)
    db1 = np.random.randn(2,1)
    dW2 = np.random.randn(3,3)
    db2 = np.random.randn(3,1)

    return W1, b1, W2, b2, dW1, db1, dW2, db2

def initialize_parameters(layer_dims):
    """
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)

    Tips:
    - For example: the layer_dims for the "Planar Data classification model" would have been [2,2,1].
    This means W1's shape was (2,2), b1 was (2,1), W2 was (1,2) and b2 was (1,1). Now you have to generalize it!
    - In the for loop, use parameters['W' + str(l)] to access Wl, where l is the iterative integer.
    """

    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)  # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

        assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1])
        assert parameters['b' + str(l)].shape == (layer_dims[l], 1)

    return parameters

def forward_propagation(X, parameters):
    """
    Implements the forward propagation.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"

    Returns:
    a3 -- output of the last activation (sigmoid)
    cache -- values needed for backward propagation
    """

    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)

    return a3, cache

def backward_propagation(X, Y, cache):
    """
    Implement the backward propagation.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
    cache -- cache output from forward_propagation()

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    dz3 = 1./m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    dz2 = np.multiply(da2, np.int64(a2 > 0))
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, np.int64(a1 > 0))
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}

    return gradients

def compute_cost(a3, Y):
    """
    Implement the cost function

    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3

    Returns:
    cost - value of the cost function
    """
    m = Y.shape[1]

    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    cost = 1./m * np.sum(logprobs)

    return cost

def predict(X, y, parameters):
    """
    This function is used to predict the results of an n-layer neural network.

    Arguments:
    X -- data set of examples you would like to label
    y -- true labels for X
    parameters -- parameters of the trained model

    Returns:
    p -- predictions for the given dataset X
    """

    m = X.shape[1]
    p = np.zeros((1,m), dtype=int)

    # Forward propagation
    a3, caches = forward_propagation(X, parameters)

    # convert probas to 0/1 predictions
    for i in range(0, a3.shape[1]):
        if a3[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0

    # print results
    #print ("predictions: " + str(p[0,:]))
    #print ("true labels: " + str(y[0,:]))
    print("Accuracy: " + str(np.mean((p[0,:] == y[0,:]))))

    return p

def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.

    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data of size (m, K)

    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """

    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions

def plot_decision_boundary(model, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()

def load_dataset():
    np.random.seed(3)
    train_X, train_Y = sklearn.datasets.make_moons(n_samples=300, noise=.2)
    # Visualize the data
    plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))

    return train_X, train_Y

 

Gradient Descent

The parameter-update function is implemented as follows:

def update_parameters_with_gd(parameters, grads, learning_rate):
    """
    Update parameters using one step of gradient descent

    Arguments:
    parameters -- python dictionary containing your parameters to be updated:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients to update each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    learning_rate -- the learning rate, scalar.

    Returns:
    parameters -- python dictionary containing your updated parameters
    """

    L = len(parameters) // 2  # number of layers in the neural networks

    # Update rule for each parameter
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads['dW' + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads['db' + str(l+1)]

    return parameters
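As a quick sanity check, here is one update step on tiny hand-made parameters (a minimal sketch; the toy values below are chosen only for illustration):

# Minimal sketch: one gradient-descent step on toy parameters (illustrative values only).
parameters = {"W1": np.array([[1.0, 2.0]]), "b1": np.array([[0.5]])}
grads      = {"dW1": np.array([[0.1, -0.2]]), "db1": np.array([[0.05]])}

parameters = update_parameters_with_gd(parameters, grads, learning_rate=0.1)
print(parameters["W1"])  # [[0.99 2.02]]
print(parameters["b1"])  # [[0.495]]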

Two variants of gradient descent are stochastic gradient descent (SGD) and mini-batch gradient descent.

For (batch) gradient descent, the training loop looks like this:

X = data_input
Y = labels
parameters = initialize_parameters(layers_dims)
for i in range(0, num_iterations):
    # Forward propagation
    a, caches = forward_propagation(X, parameters)
    # Compute cost.
    cost = compute_cost(a, Y)
    # Backward propagation.
    grads = backward_propagation(a, caches, parameters)
    # Update parameters.
    parameters = update_parameters(parameters, grads)

For stochastic gradient descent, the loop becomes:

X = data_input
Y = labels
parameters = initialize_parameters(layers_dims)
for i in range(0, num_iterations):
    for j in range(0, m):
        # Forward propagation
        a, caches = forward_propagation(X[:,j], parameters)
        # Compute cost
        cost = compute_cost(a, Y[:,j])
        # Backward propagation
        grads = backward_propagation(a, caches, parameters)
        # Update parameters.
        parameters = update_parameters(parameters, grads)

In stochastic gradient descent, each update uses only a single training example.

When the training set is large, SGD makes progress quickly, but the cost fluctuates noticeably from step to step.

In practice, a better choice is mini-batch gradient descent.

Mini-batch gradient descent is a compromise between batch gradient descent and stochastic gradient descent.

At each iteration it uses neither the whole training set nor a single example, but a small batch, e.g. 64 or 128 examples.

On the one hand this makes full use of vectorization (and GPU parallelism); on the other hand, each update remains cheap to compute. A sketch of the corresponding training loop is given below.
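A sketch of the mini-batch version of the two loops above, in the same pseudocode style (it assumes a random_mini_batches helper, which is implemented in the next section):

X = data_input
Y = labels
parameters = initialize_parameters(layers_dims)
for i in range(0, num_epochs):
    # Re-split the training set into random mini-batches at every epoch
    minibatches = random_mini_batches(X, Y, mini_batch_size=64)
    for minibatch_X, minibatch_Y in minibatches:
        # Forward propagation
        a, caches = forward_propagation(minibatch_X, parameters)
        # Compute cost
        cost = compute_cost(a, minibatch_Y)
        # Backward propagation
        grads = backward_propagation(a, caches, parameters)
        # Update parameters
        parameters = update_parameters(parameters, grads)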


Mini-Batch Gradient Descent

First, we need to learn how to split the training set into mini-batches.

This takes two steps:

Step 1: Shuffle.

Shuffle the input examples and their labels with the same random permutation, so that each mini-batch is a random sample of the training set.

Step 2: Partition.

Once the training set has been shuffled, slice it into batches. Typical batch sizes are 64, 128, 256, and so on.

def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    """
    Creates a list of random minibatches from (X, Y)

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)
    mini_batch_size -- size of the mini-batches, integer

    Returns:
    mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y)
    """
    np.random.seed(seed)  # To make your "random" minibatches the same as ours
    m = X.shape[1]        # number of training examples
    mini_batches = []

    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1,m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = math.floor(m/mini_batch_size)  # number of mini batches of size mini_batch_size in your partitionning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size : (k+1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size : (k+1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size : m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size : m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches
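A quick check of the resulting batch shapes on a toy array (the sizes below are chosen only for illustration):

# 2 features, 148 examples: expect two full batches of 64 plus a final batch of 20.
X_toy = np.random.randn(2, 148)
Y_toy = np.random.randint(0, 2, (1, 148))
mini_batches = random_mini_batches(X_toy, Y_toy, mini_batch_size=64)
print(len(mini_batches))                     # 3
print([mb[0].shape for mb in mini_batches])  # [(2, 64), (2, 64), (2, 20)]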


Momentum

When we use mini-batch gradient descent, each update is computed on only a subset of the training examples.

The computed gradient is therefore a noisy estimate of the true gradient.

Momentum can be used to smooth out this noise.

During the update, momentum takes the direction of previous steps into account; the accumulated "velocity" damps the up-and-down oscillations introduced by the noisy gradients.

The momentum update rule is as follows:
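For each layer $l$ (this is the standard momentum update, consistent with the implementation below):

$$v_{dW^{[l]}} = \beta\, v_{dW^{[l]}} + (1-\beta)\, dW^{[l]}, \qquad W^{[l]} = W^{[l]} - \alpha\, v_{dW^{[l]}}$$

$$v_{db^{[l]}} = \beta\, v_{db^{[l]}} + (1-\beta)\, db^{[l]}, \qquad b^{[l]} = b^{[l]} - \alpha\, v_{db^{[l]}}$$

where $\beta$ is the momentum hyperparameter and $\alpha$ is the learning rate. The implementation: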

def initialize_velocity(parameters):
    """
    Initializes the velocity as a python dictionary with:
    - keys: "dW1", "db1", ..., "dWL", "dbL"
    - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl

    Returns:
    v -- python dictionary containing the current velocity.
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}

    # Initialize velocity
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros(parameters['W' + str(l+1)].shape)
        v["db" + str(l+1)] = np.zeros(parameters['b' + str(l+1)].shape)

    return v

def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """
    Update parameters using Momentum

    Arguments:
    parameters -- python dictionary containing your parameters:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- python dictionary containing the current velocity:
                    v['dW' + str(l)] = ...
                    v['db' + str(l)] = ...
    beta -- the momentum hyperparameter, scalar
    learning_rate -- the learning rate, scalar

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- python dictionary containing your updated velocities
    """

    L = len(parameters) // 2  # number of layers in the neural networks

    # Momentum update for each parameter
    for l in range(L):
        # compute velocities
        v["dW" + str(l+1)] = beta * v["dW" + str(l+1)] + (1-beta) * grads['dW' + str(l+1)]
        v["db" + str(l+1)] = beta * v["db" + str(l+1)] + (1-beta) * grads['db' + str(l+1)]
        # update parameters
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * v["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v["db" + str(l+1)]

    return parameters, v
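A minimal usage sketch on the same kind of toy parameters as before (illustrative values only):

# Minimal sketch: one momentum step; velocities start at zero.
parameters = {"W1": np.array([[1.0, 2.0]]), "b1": np.array([[0.5]])}
grads      = {"dW1": np.array([[0.1, -0.2]]), "db1": np.array([[0.05]])}

v = initialize_velocity(parameters)
parameters, v = update_parameters_with_momentum(parameters, grads, v,
                                                beta=0.9, learning_rate=0.1)
print(v["dW1"])          # (1 - 0.9) * dW1 = [[ 0.01 -0.02]]
print(parameters["W1"])  # [[0.999 2.002]]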

Momentum introduces one hyperparameter, beta.

With beta = 0, momentum reduces to standard gradient descent without momentum.

The larger beta is, the stronger the smoothing effect: roughly speaking, the velocity averages over about the last 1/(1-beta) gradients. In practice, 0.9 is usually a good value.


The Adam Algorithm

Adam is one of the most effective algorithms for training neural networks.

It is a combination of RMSProp and Momentum.

Its update formulas are as follows:
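These are the standard Adam updates (consistent with the implementation below), written per layer $l$ at step $t$:

$$v_{dW^{[l]}} = \beta_1\, v_{dW^{[l]}} + (1-\beta_1)\, dW^{[l]}, \qquad v^{corrected}_{dW^{[l]}} = \frac{v_{dW^{[l]}}}{1-\beta_1^{\,t}}$$

$$s_{dW^{[l]}} = \beta_2\, s_{dW^{[l]}} + (1-\beta_2)\, \bigl(dW^{[l]}\bigr)^2, \qquad s^{corrected}_{dW^{[l]}} = \frac{s_{dW^{[l]}}}{1-\beta_2^{\,t}}$$

$$W^{[l]} = W^{[l]} - \alpha\, \frac{v^{corrected}_{dW^{[l]}}}{\sqrt{s^{corrected}_{dW^{[l]}}} + \varepsilon}$$

with the analogous updates for $b^{[l]}$, where $t$ counts the Adam steps taken so far and $\varepsilon$ prevents division by zero.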

The implementation is as follows:

def initialize_adam(parameters):
    """
    Initializes v and s as two python dictionaries with:
    - keys: "dW1", "db1", ..., "dWL", "dbL"
    - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl

    Returns:
    v -- python dictionary that will contain the exponentially weighted average of the gradient.
                    v["dW" + str(l)] = ...
                    v["db" + str(l)] = ...
    s -- python dictionary that will contain the exponentially weighted average of the squared gradient.
                    s["dW" + str(l)] = ...
                    s["db" + str(l)] = ...
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}
    s = {}

    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros(parameters['W' + str(l+1)].shape)
        v["db" + str(l+1)] = np.zeros(parameters['b' + str(l+1)].shape)
        s["dW" + str(l+1)] = np.zeros(parameters['W' + str(l+1)].shape)
        s["db" + str(l+1)] = np.zeros(parameters['b' + str(l+1)].shape)

    return v, s

def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    Update parameters using Adam

    Arguments:
    parameters -- python dictionary containing your parameters:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    t -- number of Adam update steps taken so far (used for bias correction)
    learning_rate -- the learning rate, scalar.
    beta1 -- Exponential decay hyperparameter for the first moment estimates
    beta2 -- Exponential decay hyperparameter for the second moment estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v_corrected = {}          # Initializing first moment estimate, python dictionary
    s_corrected = {}          # Initializing second moment estimate, python dictionary

    # Perform Adam update on all parameters
    for l in range(L):
        # Moving average of the gradients. Inputs: "v, grads, beta1". Output: "v".
        v["dW" + str(l+1)] = beta1 * v["dW" + str(l+1)] + (1-beta1) * grads['dW' + str(l+1)]
        v["db" + str(l+1)] = beta1 * v["db" + str(l+1)] + (1-beta1) * grads['db' + str(l+1)]

        # Compute bias-corrected first moment estimate. Inputs: "v, beta1, t". Output: "v_corrected".
        v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] / (1 - beta1 ** t)
        v_corrected["db" + str(l+1)] = v["db" + str(l+1)] / (1 - beta1 ** t)

        # Moving average of the squared gradients. Inputs: "s, grads, beta2". Output: "s".
        s["dW" + str(l+1)] = beta2 * s["dW" + str(l+1)] + (1-beta2) * (grads['dW' + str(l+1)] ** 2)
        s["db" + str(l+1)] = beta2 * s["db" + str(l+1)] + (1-beta2) * (grads['db' + str(l+1)] ** 2)

        # Compute bias-corrected second raw moment estimate. Inputs: "s, beta2, t". Output: "s_corrected".
        s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] / (1 - beta2 ** t)
        s_corrected["db" + str(l+1)] = s["db" + str(l+1)] / (1 - beta2 ** t)

        # Update parameters. Inputs: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * v_corrected["dW" + str(l+1)] / (s_corrected["dW" + str(l+1)] ** 0.5 + epsilon)
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v_corrected["db" + str(l+1)] / (s_corrected["db" + str(l+1)] ** 0.5 + epsilon)

    return parameters, v, s
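A minimal usage sketch (illustrative values only); note that t must start at 1 so the bias correction is well defined:

# Minimal sketch: one Adam step on toy parameters.
parameters = {"W1": np.array([[1.0, 2.0]]), "b1": np.array([[0.5]])}
grads      = {"dW1": np.array([[0.1, -0.2]]), "db1": np.array([[0.05]])}

v, s = initialize_adam(parameters)
parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t=1,
                                               learning_rate=0.01)
# With bias correction at t=1, each parameter moves by roughly the learning rate
# against the sign of its gradient.
print(parameters["W1"])  # approximately [[0.99 2.01]]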


Comparing Models with Different Optimization Algorithms

First, load the dataset:

train_X, train_Y = load_dataset()

The base model (a 3-layer neural network) is implemented as follows:

def model(X, Y, layers_dims, optimizer, learning_rate=0.0007, mini_batch_size=64, beta=0.9,
          beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True):
    """
    3-layer neural network model which can be run in different optimizer modes.

    Arguments:
    X -- input data, of shape (2, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)
    layers_dims -- python list, containing the size of each layer
    optimizer -- "gd", "momentum" or "adam"
    learning_rate -- the learning rate, scalar.
    mini_batch_size -- the size of a mini batch
    beta -- Momentum hyperparameter
    beta1 -- Exponential decay hyperparameter for the past gradients estimates
    beta2 -- Exponential decay hyperparameter for the past squared gradients estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates
    num_epochs -- number of epochs
    print_cost -- True to print the cost every 1000 epochs

    Returns:
    parameters -- python dictionary containing your updated parameters
    """

    L = len(layers_dims)  # number of layers in the neural networks
    costs = []            # to keep track of the cost
    t = 0                 # initializing the counter required for Adam update
    seed = 10             # For grading purposes, so that your "random" minibatches are the same as ours

    # Initialize parameters
    parameters = initialize_parameters(layers_dims)

    # Initialize the optimizer
    if optimizer == "gd":
        pass  # no initialization required for gradient descent
    elif optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    # Optimization loop
    for i in range(num_epochs):

        # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)

        for minibatch in minibatches:

            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch

            # Forward propagation
            a3, caches = forward_propagation(minibatch_X, parameters)

            # Compute cost
            cost = compute_cost(a3, minibatch_Y)

            # Backward propagation
            grads = backward_propagation(minibatch_X, minibatch_Y, caches)

            # Update parameters
            if optimizer == "gd":
                parameters = update_parameters_with_gd(parameters, grads, learning_rate)
            elif optimizer == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "adam":
                t = t + 1  # Adam counter
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s,
                                                               t, learning_rate, beta1, beta2, epsilon)

        # Print the cost every 1000 epoch
        if print_cost and i % 1000 == 0:
            print("Cost after epoch %i: %f" % (i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()

    return parameters


Mini-Batch Gradient Descent

layers_dims = [train_X.shape[0], 5, 2, 1]
parameters = model(train_X, train_Y, layers_dims, optimizer = "gd")

# Predict
predictions = predict(train_X, train_Y, parameters)

# Plot decision boundary
plt.title("Model with Gradient Descent optimization")
axes = plt.gca()
axes.set_xlim([-1.5,2.5])
axes.set_ylim([-1,1.5])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)


Mini-Batch Gradient Descent with Momentum

layers_dims = [train_X.shape[0], 5, 2, 1]
parameters = model(train_X, train_Y, layers_dims, beta = 0.9, optimizer = "momentum")

# Predict
predictions = predict(train_X, train_Y, parameters)

# Plot decision boundary
plt.title("Model with Momentum optimization")
axes = plt.gca()
axes.set_xlim([-1.5,2.5])
axes.set_ylim([-1,1.5])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)


Mini-Batch Gradient Descent with Adam

layers_dims = [train_X.shape[0], 5, 2, 1]
parameters = model(train_X, train_Y, layers_dims, optimizer = "adam")

# Predict
predictions = predict(train_X, train_Y, parameters)

# Plot decision boundary
plt.title("Model with Adam optimization")
axes = plt.gca()
axes.set_xlim([-1.5,2.5])
axes.set_ylim([-1,1.5])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

To summarize:

Comparing the three optimization methods on this dataset, we usually find that Adam gives the best results.


