Building a Deep Neural Network
In an earlier post, 《两层神经网络的设计与实现》 (designing and implementing a two-layer neural network), I explained how a two-layer network works. The same approach carries over directly to networks with more layers, so this post skips the derivations and goes straight to the code.
1. Defining the activation functions
import numpy as np

# Sigmoid activation function
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    assert(A.shape == Z.shape)
    cache = Z                  # cache Z for the backward pass
    return A, cache

# Rectified linear unit (ReLU) activation function
def relu(Z):
    A = np.maximum(0, Z)
    assert(A.shape == Z.shape)
    cache = Z                  # cache Z for the backward pass
    return A, cache
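As a quick sanity check (an illustrative snippet of my own, not part of the original code), both activations behave as expected on a small array:

# Illustrative check of the two activations on a tiny input
Z = np.array([[-1.0, 0.0, 2.0]])
A, _ = relu(Z)       # [[0. 0. 2.]]
A, _ = sigmoid(Z)    # approximately [[0.269 0.5 0.881]]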
2. Initializing the parameters
'''
Initialize the weights W and biases b for every layer;
the argument is a list of layer sizes, including the input layer.
'''
def initialize_parameters_deep(layer_dims):
    np.random.seed(3)          # fix the random seed for reproducibility
    L = len(layer_dims)        # depth of the network (one more than the usual layer count, since layer_dims includes the input layer)
    parameters = {}            # dictionary of parameters
    for l in range(1, L):      # create the weights W and biases b for each layer
        parameters['W'+str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])  # * 0.01
        parameters['b'+str(l)] = np.zeros((layer_dims[l], 1))
        assert(parameters['W'+str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(parameters['b'+str(l)].shape == (layer_dims[l], 1))
    return parameters
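For example (a small illustration added here, not in the original post), layer_dims = [5, 4, 3] produces one weight matrix and one bias vector per non-input layer, with W of shape (layer_dims[l], layer_dims[l-1]):

parameters = initialize_parameters_deep([5, 4, 3])
print(parameters['W1'].shape, parameters['b1'].shape)   # (4, 5) (4, 1)
print(parameters['W2'].shape, parameters['b2'].shape)   # (3, 4) (3, 1)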
Note: the weight initialization scheme is very important, because it determines how well the cost function converges. While training the week-2 example I found that once the network gets deep, scaling the random weights by a flat 0.01 makes the cost converge very slowly, whereas dividing by np.sqrt(layer_dims[l-1]) as above lets it converge nicely.
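The effect is easy to demonstrate (a sketch of my own, with arbitrary layer width and sample count): with the flat 0.01 scaling the activations shrink layer after layer, taking the gradients with them, while the 1/sqrt(n) scaling keeps them at a usable magnitude. (For ReLU layers, He initialization, which scales by sqrt(2/n), preserves the scale even better.)

# Compare the two weight scalings across a stack of 5 ReLU layers
np.random.seed(3)
n = 100                                  # units per layer (arbitrary)
A1 = A2 = np.random.randn(n, 1000)       # 1000 random inputs
for l in range(5):
    W1 = np.random.randn(n, n) * 0.01           # flat 0.01 scaling
    W2 = np.random.randn(n, n) / np.sqrt(n)     # 1/sqrt(n) scaling
    A1 = np.maximum(0, np.dot(W1, A1))
    A2 = np.maximum(0, np.dot(W2, A2))
    print('layer %d: 0.01 -> %.2e, 1/sqrt(n) -> %.2e' % (l + 1, A1.std(), A2.std()))
# the first column collapses toward zero; the second stays at a workable scale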
3. Defining the forward pass
# Linear part of a layer's forward propagation
def linear_forward(A, W, b):
    Z = np.dot(W, A) + b       # linear combination
    cache = (A, W, b)          # cache A, W, b for the backward gradient computation
    return Z, cache

# Linear step followed by the activation
def linear_activation_forward(A_prev, W, b, activation):
    Z, linear_cache = linear_forward(A_prev, W, b)   # linear part
    # pick the activation function as requested
    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    if activation == 'relu':
        A, activation_cache = relu(Z)
    cache = (linear_cache, activation_cache)
    return A, cache

# Forward pass through the whole network
def L_model_forward(X, parameters):
    A = X                      # X is the training data
    L = len(parameters) // 2   # network depth (integer division, so this also runs under Python 3); here it matches the usual layer count
    caches = []
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters['W'+str(l)], parameters['b'+str(l)], activation='relu')
        caches.append(cache)
    AL, cache = linear_activation_forward(A, parameters['W'+str(L)], parameters['b'+str(L)], activation='sigmoid')
    caches.append(cache)
    assert(AL.shape == (1, X.shape[1]))
    return AL, caches
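A quick shape check (illustrative only; the toy sizes are my own): the output AL has one row and one column per example, and there is one cache per layer:

np.random.seed(1)
X_demo = np.random.randn(4, 7)                 # 4 features, 7 examples
params_demo = initialize_parameters_deep([4, 3, 1])
AL, caches = L_model_forward(X_demo, params_demo)
print(AL.shape, len(caches))                   # (1, 7) 2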
4. Defining the cost function
# Cross-entropy loss averaged over the batch (the cost)
def compute_cost(AL, Y):
    m = Y.shape[1]             # number of examples
    cost = - np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True) / m
    cost = np.squeeze(cost)    # reduce the (1, 1) array to a scalar
    return cost
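As a worked example (mine, not from the original post), confident correct predictions give a small cost and confident wrong ones a large cost:

Y_demo = np.array([[1, 0]])
print(compute_cost(np.array([[0.9, 0.1]]), Y_demo))   # about 0.105
print(compute_cost(np.array([[0.1, 0.9]]), Y_demo))   # about 2.303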
Note: if you move the m in "cost = - np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True) / m" to the front, it must be written as "cost = - 1.0 / m * np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True)". Under Python 2, "1 / m" is integer division and evaluates to 0 whenever m > 1, which would silently make the cost 0; writing "1.0 / m" forces floating-point division.
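A two-line demonstration of the pitfall (run under Python 2 to see the difference; Python 3's / operator always returns a float):

m = 209
print(1 / m)      # Python 2: 0        Python 3: 0.004784688995215311
print(1.0 / m)    # both: 0.004784688995215311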
5. Defining the backward pass
# Linear part of a layer's backward propagation
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    return dA_prev, dW, db

# Gradient of the ReLU activation
def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)   # copy dA so the original is untouched
    dZ[Z <= 0] = 0                 # where Z <= 0, the gradient is 0
    assert (dZ.shape == Z.shape)
    return dZ

# Gradient of the sigmoid activation
def sigmoid_backward(dA, cache):
    Z = cache
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)
    assert (dZ.shape == Z.shape)
    return dZ

# Backward step for one layer: activation gradient, then linear gradient
def linear_activation_backward(dA, cache, activation='sigmoid'):
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

# Backward pass through the whole network
def L_model_backward(AL, Y, caches):
    grads = {}
    L = len(caches)                # number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)        # make Y the same shape as AL
    # gradient of the cost with respect to AL, the last layer's output
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # backward step through the last (sigmoid) layer
    current_cache = caches[L-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    # backward steps through the remaining (ReLU) layers
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l+2)], current_cache, activation="relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads
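Backpropagation bugs are easy to introduce and hard to spot, so a numerical gradient check is worth running once. Below is a minimal sketch of the standard centered finite-difference technique against one weight entry; the toy shapes and seed are my own:

# Compare one backprop gradient entry against a centered finite difference
np.random.seed(1)
X_chk = np.random.randn(4, 3)                 # 4 features, 3 examples
Y_chk = np.array([[1, 0, 1]])
params_chk = initialize_parameters_deep([4, 5, 1])

AL_chk, caches_chk = L_model_forward(X_chk, params_chk)
grads_chk = L_model_backward(AL_chk, Y_chk, caches_chk)

eps = 1e-7
params_chk['W1'][0, 0] += eps
cost_plus = compute_cost(L_model_forward(X_chk, params_chk)[0], Y_chk)
params_chk['W1'][0, 0] -= 2 * eps
cost_minus = compute_cost(L_model_forward(X_chk, params_chk)[0], Y_chk)
params_chk['W1'][0, 0] += eps                 # restore the original value
print((cost_plus - cost_minus) / (2 * eps))   # numerical gradient
print(grads_chk['dW1'][0, 0])                 # backprop gradient; should match closely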
6. Updating the parameters
# Gradient descent update of the parameters
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2   # number of layers
    for l in range(L):
        parameters['W'+str(l+1)] = parameters['W'+str(l+1)] - learning_rate * grads['dW'+str(l+1)]
        parameters['b'+str(l+1)] = parameters['b'+str(l+1)] - learning_rate * grads['db'+str(l+1)]
    return parameters
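Each parameter should move by exactly -learning_rate times its gradient; a tiny check (the values below are chosen by me for illustration):

params = {'W1': np.array([[1.0, 2.0]]), 'b1': np.array([[0.5]])}
grads = {'dW1': np.array([[0.1, -0.2]]), 'db1': np.array([[0.3]])}
new_params = update_parameters(params, grads, learning_rate=0.1)
print(new_params['W1'])    # [[0.99 2.02]]
print(new_params['b1'])    # [[0.47]]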
7. The full training loop
import matplotlib.pyplot as plt

def L_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
    np.random.seed(1)
    costs = []                                              # record the cost over time
    parameters = initialize_parameters_deep(layers_dims)    # initialize the parameters
    # gradient descent loop
    for i in range(0, num_iterations):
        AL, caches = L_model_forward(X, parameters)         # forward pass
        cost = compute_cost(AL, Y)                          # compute the cost
        grads = L_model_backward(AL, Y, caches)             # backward pass
        parameters = update_parameters(parameters, grads, learning_rate)   # update the parameters
        # print and record the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
            costs.append(cost)
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')                 # one point per 100 iterations
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
    return parameters
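Before pointing the training loop at the real dataset, it can be sanity-checked on a small synthetic problem (illustrative only; the toy data, layer sizes, and learning rate are my own choices):

# Train a tiny network on a 2-D problem whose label is the sign of x1*x2
np.random.seed(2)
X_toy = np.random.randn(2, 300)
Y_toy = (X_toy[0:1, :] * X_toy[1:2, :] > 0).astype(int)
params_toy = L_layer_model(X_toy, Y_toy, [2, 8, 1], learning_rate=0.1,
                           num_iterations=2000, print_cost=True)
# the printed cost should fall well below the ~0.693 of random guessing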
8. Training and testing
import h5py

def load_dataset():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:])    # train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:])    # train set labels
    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:])       # test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:])       # test set labels
    classes = np.array(test_dataset["list_classes"][:])             # the list of classes
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

def train_model(X, Y, layers_dims, learning_rate, num_iterations):
    parameters = L_layer_model(X, Y, layers_dims, learning_rate, num_iterations, print_cost=True)
    return parameters

def predict_data(X, Y, parameters):
    AL, caches = L_model_forward(X, parameters)
    AL[AL >= 0.5] = 1          # threshold the sigmoid outputs at 0.5
    AL[AL < 0.5] = 0
    print('accuracy:', np.sum(AL == Y) * 1.0 / Y.shape[1])

if __name__ == '__main__':
    train_data_x, train_data_y, test_data_x, test_data_y, classes = load_dataset()   # load the dataset
    X = train_data_x.reshape(209, 64*64*3).T * 1.0 / 255    # flatten the training images into a 2-D matrix: one row per feature, one column per example
    Y = train_data_y
    X2 = test_data_x.reshape(50, 64*64*3).T * 1.0 / 255     # same layout for the test data
    Y2 = test_data_y
    row_count = 64*64*3        # feature dimension of a single example
    examples_count = 209       # number of training examples
    layers_dims = [12288, 20, 7, 5, 1]   # e.g. [12288, 4, 1] for a shallower network
    parameters = train_model(X, Y, layers_dims, 0.0075, 1500)   # train the parameters
    print('train:')
    predict_data(X, Y, parameters)       # accuracy on the training set
    print('test:')
    predict_data(X2, Y2, parameters)     # accuracy on the test set
9. Summary
Testing shows that the weight initialization matters a great deal. It also shows that a deeper network is not automatically better, and neither are more iterations; past a certain point both stop helping.
10. Complete code
import numpy as np
import h5py
import matplotlib.pyplot as plt

'''
Initialize the weights W and biases b for every layer;
the argument is a list of layer sizes, including the input layer.
'''
def initialize_parameters_deep(layer_dims):
    np.random.seed(3)          # fix the random seed for reproducibility
    L = len(layer_dims)        # depth of the network (one more than the usual layer count)
    parameters = {}            # dictionary of parameters
    for l in range(1, L):      # create the weights W and biases b for each layer
        parameters['W'+str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])  # * 0.01
        parameters['b'+str(l)] = np.zeros((layer_dims[l], 1))
        assert(parameters['W'+str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(parameters['b'+str(l)].shape == (layer_dims[l], 1))
    return parameters

# Sigmoid activation function
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    assert(A.shape == Z.shape)
    cache = Z                  # cache Z for the backward pass
    return A, cache

# Rectified linear unit (ReLU) activation function
def relu(Z):
    A = np.maximum(0, Z)
    assert(A.shape == Z.shape)
    cache = Z                  # cache Z for the backward pass
    return A, cache

# Linear part of a layer's forward propagation
def linear_forward(A, W, b):
    Z = np.dot(W, A) + b       # linear combination
    cache = (A, W, b)          # cache A, W, b for the backward gradient computation
    return Z, cache

# Linear step followed by the activation
def linear_activation_forward(A_prev, W, b, activation):
    Z, linear_cache = linear_forward(A_prev, W, b)   # linear part
    # pick the activation function as requested
    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    if activation == 'relu':
        A, activation_cache = relu(Z)
    cache = (linear_cache, activation_cache)
    return A, cache

# Forward pass through the whole network
def L_model_forward(X, parameters):
    A = X                      # X is the training data
    L = len(parameters) // 2   # network depth (integer division, so this also runs under Python 3)
    caches = []
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters['W'+str(l)], parameters['b'+str(l)], activation='relu')
        caches.append(cache)
    AL, cache = linear_activation_forward(A, parameters['W'+str(L)], parameters['b'+str(L)], activation='sigmoid')
    caches.append(cache)
    assert(AL.shape == (1, X.shape[1]))
    return AL, caches

# Cross-entropy loss averaged over the batch (the cost)
def compute_cost(AL, Y):
    m = Y.shape[1]             # number of examples
    cost = - np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True) / m
    cost = np.squeeze(cost)    # reduce the (1, 1) array to a scalar
    return cost

# Linear part of a layer's backward propagation
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    return dA_prev, dW, db

# Gradient of the ReLU activation
def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)   # copy dA so the original is untouched
    dZ[Z <= 0] = 0                 # where Z <= 0, the gradient is 0
    assert (dZ.shape == Z.shape)
    return dZ

# Gradient of the sigmoid activation
def sigmoid_backward(dA, cache):
    Z = cache
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)
    assert (dZ.shape == Z.shape)
    return dZ

# Backward step for one layer: activation gradient, then linear gradient
def linear_activation_backward(dA, cache, activation='sigmoid'):
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

# Backward pass through the whole network
def L_model_backward(AL, Y, caches):
    grads = {}
    L = len(caches)                # number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)        # make Y the same shape as AL
    # gradient of the cost with respect to AL, the last layer's output
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # backward step through the last (sigmoid) layer
    current_cache = caches[L-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    # backward steps through the remaining (ReLU) layers
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l+2)], current_cache, activation="relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads

# Gradient descent update of the parameters
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2   # number of layers
    for l in range(L):
        parameters['W'+str(l+1)] = parameters['W'+str(l+1)] - learning_rate * grads['dW'+str(l+1)]
        parameters['b'+str(l+1)] = parameters['b'+str(l+1)] - learning_rate * grads['db'+str(l+1)]
    return parameters

def L_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
    np.random.seed(1)
    costs = []                                              # record the cost over time
    parameters = initialize_parameters_deep(layers_dims)    # initialize the parameters
    # gradient descent loop
    for i in range(0, num_iterations):
        AL, caches = L_model_forward(X, parameters)         # forward pass
        cost = compute_cost(AL, Y)                          # compute the cost
        grads = L_model_backward(AL, Y, caches)             # backward pass
        parameters = update_parameters(parameters, grads, learning_rate)   # update the parameters
        # print and record the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
            costs.append(cost)
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')                 # one point per 100 iterations
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
    return parameters

def load_dataset():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:])    # train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:])    # train set labels
    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:])       # test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:])       # test set labels
    classes = np.array(test_dataset["list_classes"][:])             # the list of classes
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

def train_model(X, Y, layers_dims, learning_rate, num_iterations):
    parameters = L_layer_model(X, Y, layers_dims, learning_rate, num_iterations, print_cost=True)
    return parameters

def predict_data(X, Y, parameters):
    AL, caches = L_model_forward(X, parameters)
    AL[AL >= 0.5] = 1          # threshold the sigmoid outputs at 0.5
    AL[AL < 0.5] = 0
    print('accuracy:', np.sum(AL == Y) * 1.0 / Y.shape[1])

if __name__ == '__main__':
    train_data_x, train_data_y, test_data_x, test_data_y, classes = load_dataset()   # load the dataset
    X = train_data_x.reshape(209, 64*64*3).T * 1.0 / 255    # flatten the training images into a 2-D matrix: one row per feature, one column per example
    Y = train_data_y
    X2 = test_data_x.reshape(50, 64*64*3).T * 1.0 / 255     # same layout for the test data
    Y2 = test_data_y
    row_count = 64*64*3        # feature dimension of a single example
    examples_count = 209       # number of training examples
    layers_dims = [12288, 20, 7, 5, 1]   # e.g. [12288, 4, 1] for a shallower network
    parameters = train_model(X, Y, layers_dims, 0.0075, 1500)   # train the parameters
    print('train:')
    predict_data(X, Y, parameters)       # accuracy on the training set
    print('test:')
    predict_data(X2, Y2, parameters)     # accuracy on the test set