theano-xnor-net code annotations: cifar10_train.py

# -*- coding: utf-8 -*-
import sys, os, time
import lasagne
import numpy as np
import theano
import theano.tensor as T
import cPickle
import xnor_net
import cnn_utils
from external import bnn_utils
import gzip
from collections import OrderedDict


def construct_cifar10_net(input_var, alpha, eps):
    ##################################################
    #
    # Architecture:
    #
    # Input layer
    #   |
    # Conv layer 1 + BN layer
    #   |
    # Conv layer 2 + max-pool layer 2 + BN layer
    #   |
    # Conv layer 3 + BN layer
    #   |
    # Conv layer 4 + max-pool layer 4 + BN layer
    #   |
    # Conv layer 5 + BN layer
    #   |
    # Conv layer 6 + max-pool layer 6 + BN layer
    #   |
    # Fully connected layer 1 + BN layer
    #   |
    # Fully connected layer 2 + BN layer
    #   |
    # Fully connected layer 3
    #   |
    # Output
    #
    ##################################################
    # Input layer.
    # The input conv layer is not binary. As the paper states, the computational savings
    # are very small when the conv layer has few input channels; the input here has only
    # the 3 RGB channels, so no XNOR convolution is used for the first layer.
    cnn = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input_var)

    # Conv layer 1
    cnn = xnor_net.Conv2DLayer(
        cnn,                                              # incoming layer
        xnor=False,                                       # whether this is an XNOR layer (if True, inputs and weights are binarized)
        num_filters=128,                                  # number of filters
        filter_size=(3, 3),                               # filter size
        pad=1,                                            # border padding (lasagne.layers.Conv2DLayer parameter)
        nonlinearity=lasagne.nonlinearities.identity)     # activation function (lasagne.layers.Conv2DLayer parameter)
    # Batch normalization layer
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=eps,                                  # BatchNormLayer parameters
            alpha=alpha)

    # Conv layer 2
    cnn = xnor_net.Conv2DLayer(
            cnn,
            xnor=True,
            num_filters=128,
            filter_size=(3, 3),
            pad=1,
            nonlinearity=lasagne.nonlinearities.identity)
    # Max-pool layer 2
    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
    # Batch normalization layer
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=eps,
            alpha=alpha)

    # Conv layer 3
    cnn = xnor_net.Conv2DLayer(
            cnn,
            xnor=True,
            num_filters=256,
            filter_size=(3, 3),
            pad=1,
            nonlinearity=lasagne.nonlinearities.identity)
    # Batch normalization layer
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=eps,
            alpha=alpha)

    # Conv layer 4
    cnn = xnor_net.Conv2DLayer(
            cnn,
            xnor=True,
            num_filters=256,
            filter_size=(3, 3),
            pad=1,
            nonlinearity=lasagne.nonlinearities.identity)
    # Max-pool layer 4
    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
    # Batch normalization layer
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=eps,
            alpha=alpha)

    # Conv layer 5
    cnn = xnor_net.Conv2DLayer(
            cnn,
            xnor=True,
            num_filters=512,
            filter_size=(3, 3),
            pad=1,
            nonlinearity=lasagne.nonlinearities.identity)
    # Batch normalization layer
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=eps,
            alpha=alpha)

    # Conv layer 6
    cnn = xnor_net.Conv2DLayer(
            cnn,
            xnor=True,
            num_filters=512,
            filter_size=(3, 3),
            pad=1,
            nonlinearity=lasagne.nonlinearities.identity)
    # Max-pool layer 6
    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
    # Batch normalization layer
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=eps,
            alpha=alpha)

    # Fully connected layer 1
    cnn = xnor_net.DenseLayer(
            cnn,
            xnor=True,
            nonlinearity=lasagne.nonlinearities.identity,
            num_units=1024)
    # Batch normalization layer
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=eps,
            alpha=alpha)

    # Fully connected layer 2
    cnn = xnor_net.DenseLayer(
            cnn,
            xnor=True,
            nonlinearity=lasagne.nonlinearities.identity,
            num_units=1024)
    # Batch normalization layer
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=eps,
            alpha=alpha)

    # Fully connected layer 3 (softmax output; not an XNOR layer)
    cnn = xnor_net.DenseLayer(
            cnn,
            xnor=False,
            nonlinearity=lasagne.nonlinearities.softmax,
            num_units=10)

    return cnn


if __name__ == '__main__':
    # theano.function(inputs, outputs=None, mode=None, updates=None, givens=None):
    #   compiles a function that computes outputs from inputs, where:
    #   inputs:  list of input variables
    #   outputs: list or dict of output variables; the mapping from inputs to outputs
    #       is expressed in how the outputs are defined
    #   updates: an iterable of (shared_variable, new_expression) pairs; on every call,
    #       each shared_variable is replaced by the value of its new_expression
    #
    # lasagne.updates.adam(loss_or_grads, params, learning_rate): builds parameter updates, where:
    #   loss_or_grads: loss expression or list of gradients
    #   params:        parameters to update
    #   learning_rate: update rate (learning rate)
    #
    # lasagne.layers.get_output(layer):            builds the symbolic output expression of the given network
    # lasagne.objectives.categorical_crossentropy(predictions, targets): cross-entropy (loss) between predictions and targets
    # lasagne.layers.get_all_params(layer):        returns a list of the Theano shared variables or expressions that parameterize the layer
    # bnn_utils.compute_grads(loss, network):      computes the gradients
    # bnn_utils.clipping_scaling(updates, network): clips/scales the parameters after the update
    # OrderedDict:                                 ordered dictionary class
    # dict.items():                                returns the dictionary's (key, value) pairs
    # tensor.neq(a, b):                            elementwise "a != b"
    # tensor.argmax():                             index of the maximum value along the given axis

    # This is XNOR net
    xnor = True

    # Model file name
    model_file = 'xnor_net_cifar10_nonxnor_first_lyr.npz'

    # Hyper parameters
    batch_size = 50                                       # mini-batch size
    alpha = 0.1                                           # BN layer parameter
    eps = 1e-4                                            # BN layer parameter
    no_epochs = 200                                       # number of training epochs

    # Learning rate: similar settings as in BinaryNet
    LR_start = 0.001                                      # initial learning rate
    LR_end = 0.0000003                                    # final learning rate
    LR_decay = (LR_end/LR_start)**(1./no_epochs)          # per-epoch multiplicative decay factor
    print('LR_start = {:f}\tLR_end = {:f}\tLR_decay = {:f}'.format(LR_start, LR_end, LR_decay))

    # Input data, target and learning rate as theano symbolic variables
    input_vars = T.tensor4('input')                       # input data, 4D tensor
    targets = T.fmatrix('target')                         # targets, matrix
    LR = T.scalar('LR', dtype=theano.config.floatX)       # learning rate, scalar

    # Construct the deep network (calls construct_cifar10_net() defined above)
    print('Constructing the network...')
    net = construct_cifar10_net(input_vars, alpha, eps)

    # Load the data
    print('Loading the data...')
    train_x, val_x, test_x, train_y, val_y, test_y = cnn_utils.load_data('cifar10')

    # Network output (used to build the training loss and the parameter updates)
    train_pred = lasagne.layers.get_output(net, deterministic=False)
    print('Constructed symbolic output')

    # Loss (used to build the training loss and the parameter updates)
    # As per the paper it is the negative log-likelihood of the softmax output
    loss = lasagne.objectives.categorical_crossentropy(train_pred, targets)
    # Mean loss across all images in the batch
    loss = T.mean(loss)
    print('Constructed symbolic training loss')

    # Define the update process
    # No need of weight clipping as in BinaryNet
    print('Defining the update process...')
    if xnor:
        # W updates (binary weight updates)
        W = lasagne.layers.get_all_params(net, xnor=True)  # weights of the XNOR layers
        W_grads = bnn_utils.compute_grads(loss, net)       # gradients
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = bnn_utils.clipping_scaling(updates, net) # clip/scale the weights after the update
        # Updates for the remaining (non-XNOR) parameters
        params = lasagne.layers.get_all_params(net, trainable=True, xnor=False)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss,
            params=params, learning_rate=LR).items())      # merge the remaining updates into updates
    else:
        # Plain parameter updates
        params = lasagne.layers.get_all_params(net, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

    # Test prediction and loss expressions (used to build the test loss function)
    print('Creating test prediction, loss and error expressions...')
    test_pred = lasagne.layers.get_output(net, deterministic=True)
    # Multi-class cross-entropy loss; the last layer here is a softmax classifier
    test_loss = T.mean(lasagne.objectives.categorical_crossentropy(test_pred, targets))
    test_err = T.mean(T.neq(T.argmax(test_pred, axis=1), T.argmax(targets, axis=1)), dtype=theano.config.floatX)

    # Construct the theano functions for the training and validation/testing process
    train_fn = theano.function([input_vars, targets, LR], loss, updates=updates)  # training loss + parameter updates
    # test_fn = theano.function([input_vars, targets], test_loss)
    test_fn = theano.function([input_vars, targets], [test_loss, test_err])       # test loss and error rate
    print('Created theano functions for training and validation...')

    # Train
    print('Training...')
    # new_loss = train_fn(train_x[0:50], train_y[0:50], LR_start)
    print('Trainset shape = ', train_x.shape, train_y.shape)
    print('Valset shape = ', val_x.shape, val_y.shape)
    print('Testset shape = ', test_x.shape, test_y.shape)
    # new_loss, new_err = test_fn(val_x[0:50], val_y[0:50])
    bnn_utils.train(
            train_fn, test_fn,                            # theano functions for training and testing
            net,                                          # the network
            batch_size,                                   # mini-batch size
            LR_start, LR_decay,                           # learning-rate schedule
            no_epochs,                                    # number of epochs
            train_x, train_y,                             # training data
            val_x, val_y,                                 # validation data
            test_x, test_y,                               # test data
            save_path=model_file,                         # where to save the model
            shuffle_parts=1)

# This should produce at most a 13.89% test error rate.
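
A note on the xnor_net layers: the repo's Conv2DLayer/DenseLayer wrappers are not shown here, but per the XNOR-Net paper (Rastegari et al., 2016) a binary-weight layer approximates W ≈ αB, with B = sign(W) and α the mean absolute value of each filter's weights. Below is a minimal NumPy sketch of that weight binarization; binarize_filters is a hypothetical helper illustrating the paper's scheme, not the repo's actual implementation:

import numpy as np

def binarize_filters(W):
    # W has shape (num_filters, in_channels, height, width)
    alpha = np.abs(W).mean(axis=(1, 2, 3), keepdims=True)  # per-filter scale: mean |w|
    B = np.sign(W)                                         # binary weights in {-1, +1} (exact zeros stay 0)
    return alpha * B                                       # the binary approximation of W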
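
The learning-rate schedule is geometric: LR is multiplied by LR_decay once per epoch, so after no_epochs epochs it falls from LR_start exactly to LR_end, i.e. LR_k = LR_start * (LR_end/LR_start)^(k/no_epochs). A quick sanity check with the constants above:

LR_start, LR_end, no_epochs = 0.001, 0.0000003, 200
LR_decay = (LR_end / LR_start) ** (1. / no_epochs)  # about 0.9602 per epoch
LR = LR_start
for _ in range(no_epochs):
    LR *= LR_decay
print(LR)  # ~3e-07, i.e. LR_end up to floating-point rounding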
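
train_fn and test_fn follow the standard Theano pattern summarized in the comment block: theano.function compiles a callable that evaluates the requested outputs and, on every call, applies each (shared_variable, new_expression) pair in updates. Here is a self-contained toy example of the same pattern, fitting one scalar weight by gradient descent (all names in it are illustrative, not from the repo):

import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.float32(0.0), name='w')         # a parameter, analogous to the network weights
x = T.fscalar('x')                                   # an input, analogous to input_vars
lr = T.fscalar('lr')                                 # the learning rate, analogous to LR
loss = T.sqr(w * x - 1.0)                            # toy loss: (w*x - 1)^2
grad = T.grad(loss, w)
# Each call returns the loss, then applies w <- w - lr * grad, just like train_fn.
step = theano.function([x, lr], loss, updates=[(w, w - lr * grad)])
for _ in range(100):
    step(np.float32(1.0), np.float32(0.1))
print(w.get_value())                                 # converges towards 1.0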