Deep Learning - User-Defined (Custom) Layers


In this post we look at how to define a custom layer and how to run gradient checks on it. Lao Wu is a strong advocate of gradient checking. Since many earlier posts already cover the surrounding material, repeated content is not explained again here; please refer to the previous articles.

public class CustomLayerExample {

    static {    //Static initializer block
        //Double precision for the gradient checks. See comments in the doGradientCheck() method
        // See also http://nd4j.org/userguide.html#miscdatatype
        DataTypeUtil.setDTypeForContext(DataBuffer.Type.DOUBLE);   //Make ND4J allocate double-precision buffers
    }

    public static void main(String[] args) throws IOException {
        runInitialTests();
        doGradientCheck();
    }

    private static void runInitialTests() throws IOException {
        /*
        This method shows the configuration and use of the custom layer.
        It also shows some basic sanity checks and tests for the layer.
        In practice, these tests should be implemented as unit tests; for simplicity, we are just printing the results
         */
        System.out.println("----- Starting Initial Tests -----");

        int nIn = 5;
        int nOut = 8;

        //Let's create a network with our custom layer
        MultiLayerConfiguration config = new NeuralNetConfiguration.Builder()
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1)
            .updater(Updater.RMSPROP).rmsDecay(0.95)    //RMSProp decay rate: damps large gradient swings during updates
            .weightInit(WeightInit.XAVIER)
            .regularization(true).l2(0.03)
            .list()
            .layer(0, new DenseLayer.Builder().activation("tanh").nIn(nIn).nOut(6).build())     //Standard DenseLayer
            .layer(1, new CustomLayer.Builder()         //The user-defined layer; the class itself is explained below
                .activation("tanh")                     //Property inherited from FeedForwardLayer
                .secondActivationFunction("sigmoid")    //Custom property we defined for our layer
                .nIn(6).nOut(7)                         //nIn and nOut also inherited from FeedForwardLayer
                .build())
            .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)    //Standard OutputLayer with multi-class cross-entropy loss
                .activation("softmax").nIn(7).nOut(nOut).build())
            .pretrain(false).backprop(true).build();

        //First: run some basic sanity checks on the configuration
        double customLayerL2 = config.getConf(1).getLayer().getL2();    //getConf(1) returns the config of layer index 1 (the custom layer); getL2() returns its regularization coefficient
        System.out.println("l2 coefficient for custom layer: " + customLayerL2);    //As expected: custom layer inherits the global L2 parameter configuration
        Updater customLayerUpdater = config.getConf(1).getLayer().getUpdater();     //Likewise, getUpdater() returns that layer's updater
        System.out.println("Updater for custom layer: " + customLayerUpdater);      //As expected: custom layer inherits the global Updater configuration

        //Second: We need to ensure that the JSON and YAML configuration works, with the custom layer
        // If there were problems with serialization, you'd get an exception during deserialization
        // ("No suitable constructor found..." for example)
        String configAsJson = config.toJson();
        String configAsYaml = config.toYaml();
        MultiLayerConfiguration fromJson = MultiLayerConfiguration.fromJson(configAsJson);
        MultiLayerConfiguration fromYaml = MultiLayerConfiguration.fromYaml(configAsYaml);

        System.out.println("JSON configuration works: " + config.equals(fromJson));
        System.out.println("YAML configuration works: " + config.equals(fromYaml));

        MultiLayerNetwork net = new MultiLayerNetwork(config);
        net.init();

        //Third: Let's run some more basic tests. First, check that the forward and backward pass methods don't throw any exceptions
        // To do this: we'll create some simple test data
        int minibatchSize = 5;
        INDArray testFeatures = Nd4j.rand(minibatchSize, nIn);      //Random values in [0,1) as test features
        INDArray testLabels = Nd4j.zeros(minibatchSize, nOut);      //Labels initialized to zero
        Random r = new Random(12345);
        for( int i=0; i<minibatchSize; i++ ){
            testLabels.putScalar(i, r.nextInt(nOut), 1);    //Random one-hot labels: putScalar writes a 1 at (row i, random column)
        }

        List<INDArray> activations = net.feedForward(testFeatures);    //Forward pass: testFeatures is activations index 0, then each layer's output in turn (indices 0-3 here)
        INDArray activationsCustomLayer = activations.get(2);          //Activations index 2: index 0 is input, index 1 is first layer, so this is the custom layer's output
        System.out.println("\nActivations from custom layer:");
        System.out.println(activationsCustomLayer);
        net.fit(new DataSet(testFeatures, testLabels));                 //Fit the network on the test data

        //Finally, let's check the model serialization process, using ModelSerializer:
        ModelSerializer.writeModel(net, new File("CustomLayerModel.zip"), true);    //true: also save the updater state
        MultiLayerNetwork restored = ModelSerializer.restoreMultiLayerNetwork(new File("CustomLayerModel.zip"));

        //Verify that the restored configuration and parameters match the originals
        System.out.println();
        System.out.println("Original and restored networks: configs are equal: " + net.getLayerWiseConfigurations().equals(restored.getLayerWiseConfigurations()));
        System.out.println("Original and restored networks: parameters are equal: " + net.params().equals(restored.params()));
    }

    private static void doGradientCheck(){
        /*
        Gradient checks are one of the most important components of implementing a layer
        They are necessary to ensure that your implementation is correct: without them, you could easily have a subtle
        error, and not even know it.

        Deeplearning4j comes with a gradient check utility that you can use to check your layers.
        This utility works for feed-forward layers, CNNs, RNNs etc.
        For more details on gradient checks, and some references, see the Javadoc for the GradientCheckUtil class:
        https://github.com/deeplearning4j/deeplearning4j/blob/master/deeplearning4j-core/src/main/java/org/deeplearning4j/gradientcheck/GradientCheckUtil.java

        There are a few things to note when doing gradient checks:
        1. It is necessary to use double precision for ND4J. Single precision (float - the default) isn't sufficiently
           accurate for reliably performing gradient checks
        2. It is necessary to set the updater to None, or equivalently use both the SGD updater and a learning rate of 1.0
           Reason: we are testing the raw gradients before they have been modified with learning rate, momentum, etc.
        */
        System.out.println("\n\n\n----- Starting Gradient Check -----");

        Nd4j.getRandom().setSeed(12345);    //Seed ND4J's random number generator
        int nIn = 3;
        int nOut = 2;

        MultiLayerConfiguration config = new NeuralNetConfiguration.Builder()
            .seed(12345)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1)
            .updater(Updater.NONE).learningRate(1.0)
            .weightInit(WeightInit.DISTRIBUTION).dist(new NormalDistribution(0,1))  //Larger weight init than normal can help with gradient checks
            .regularization(true).l2(0.03)
            .list()
            .layer(0, new DenseLayer.Builder().activation("tanh").nIn(nIn).nOut(3).build())    //Standard DenseLayer
            .layer(1, new CustomLayer.Builder()
                .activation("tanh")                     //Property inherited from FeedForwardLayer
                .secondActivationFunction("sigmoid")    //Custom property we defined for our layer
                .nIn(3).nOut(3)                         //nIn and nOut also inherited from FeedForwardLayer
                .build())
            .layer(2, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)    //Standard OutputLayer
                .activation("softmax").nIn(3).nOut(nOut).build())
            .pretrain(false).backprop(true).build();
        MultiLayerNetwork net = new MultiLayerNetwork(config);
        net.init();

        boolean print = true;                       //Whether to print status for each parameter during testing
        boolean return_on_first_failure = false;    //If true: terminate test on first failure
        double gradient_check_epsilon = 1e-8;       //Epsilon (perturbation size) used for gradient checks
        double max_relative_error = 1e-5;           //Maximum relative error allowable for each parameter
        double min_absolute_error = 1e-10;          //Minimum absolute error, to avoid failures on 0 vs 1e-30, for example

        //Create some random input data to use in the gradient check
        int minibatchSize = 5;
        INDArray features = Nd4j.rand(minibatchSize, nIn);      //Random features
        INDArray labels = Nd4j.zeros(minibatchSize, nOut);      //Labels initialized to zero, then made one-hot below
        Random r = new Random(12345);
        for( int i=0; i<minibatchSize; i++ ){
            labels.putScalar(i, r.nextInt(nOut), 1);    //Random one-hot labels data
        }

        //Print the number of parameters in each layer. This can help to identify the layer that any failing parameters
        // belong to.
        for( int i=0; i<3; i++ ){
            System.out.println("# params, layer " + i + ":\t" + net.getLayer(i).numParams());
        }

        //Run DL4J's built-in gradient check: network, epsilon, max relative error, min absolute error,
        // print flag, return-on-first-failure flag, features, labels
        GradientCheckUtil.checkGradients(net, gradient_check_epsilon, max_relative_error, min_absolute_error, print,
            return_on_first_failure, features, labels);
    }
}
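As a reminder of what the check itself computes: for each parameter, a gradient check compares the analytic gradient from backprop against a numerical estimate obtained by perturbing that single parameter by a small epsilon. The formulas below are a conceptual sketch of the standard central-difference check and the relative-error criterion that the gradient_check_epsilon, max_relative_error and min_absolute_error settings above correspond to; the exact bookkeeping inside GradientCheckUtil may differ in detail.

\[
g_i^{\text{num}} = \frac{L(\theta_i + \epsilon) - L(\theta_i - \epsilon)}{2\epsilon},
\qquad
\text{relError}_i = \frac{\left|\, g_i^{\text{bp}} - g_i^{\text{num}} \,\right|}{\left| g_i^{\text{bp}} \right| + \left| g_i^{\text{num}} \right|}
\]

A parameter passes when its relative error is at most max_relative_error, with min_absolute_error used to avoid spurious failures when both gradients are essentially zero (e.g. 0 vs 1e-30). This is also why double precision and a "raw" updater matter: the central difference is numerically delicate, and any learning-rate or momentum scaling would make the two sides incomparable.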
Next, let's look at the code for the custom layer configuration class, CustomLayer:
public class CustomLayer extends FeedForwardLayer {
    //CustomLayer extends FeedForwardLayer, which in turn extends the abstract Layer class, so several methods must be overridden.
    //The main addition here is a second activation function.

    private String secondActivationFunction;

    public CustomLayer() {
        //We need a no-arg constructor so we can deserialize the configuration from JSON or YAML format
        // Without this, you will likely get an exception like the following:
        //com.fasterxml.jackson.databind.JsonMappingException: No suitable constructor found for type [simple type, class org.deeplearning4j.examples.misc.customlayers.layer.CustomLayer]: can not instantiate from JSON object (missing default constructor or creator, or perhaps need to add/enable type information?)
    }

    private CustomLayer(Builder builder) {
        super(builder);     //Pass the inherited builder fields up to FeedForwardLayer
        this.secondActivationFunction = builder.secondActivationFunction;  //Copy the builder's custom field onto this configuration object
    }

    public String getSecondActivationFunction() {
        //We also need setter/getter methods for our layer configuration fields (if any) for JSON serialization
        return secondActivationFunction;
    }

    public void setSecondActivationFunction(String secondActivationFunction) {
        //We also need setter/getter methods for our layer configuration fields (if any) for JSON serialization
        this.secondActivationFunction = secondActivationFunction;
    }

    @Override
    public Layer instantiate(NeuralNetConfiguration conf, Collection<IterationListener> iterationListeners,
                             int layerIndex, INDArray layerParamsView, boolean initializeParams) {
        //The instantiate method is how we go from the configuration class (i.e., this class) to the implementation class
        // (i.e., a CustomLayerImpl instance)
        //For the most part, it's the same for each type of layer
        CustomLayerImpl myCustomLayer = new CustomLayerImpl(conf);
        myCustomLayer.setListeners(iterationListeners);             //Set the iteration listeners, if any
        myCustomLayer.setIndex(layerIndex);                         //Integer index of the layer

        //Parameter view array: In Deeplearning4j, the network parameters for the entire network (all layers) are
        // allocated in one big array. The relevant section of this parameter vector is extracted out for each layer,
        // (i.e., it's a "view" array in that it's a subset of a larger array)
        // This is a row vector, with length equal to the number of parameters in the layer
        myCustomLayer.setParamsViewArray(layerParamsView);

        //Initialize the layer parameters.
        // Note that the entries in paramTable (2 entries here: a weight array of shape [nIn,nOut] and biases of shape [1,nOut])
        // are in turn a view of the 'layerParamsView' array.
        Map<String, INDArray> paramTable = initializer().init(conf, layerParamsView, initializeParams);
        myCustomLayer.setParamTable(paramTable);
        myCustomLayer.setConf(conf);
        return myCustomLayer;
    }

    @Override
    public ParamInitializer initializer() {
        //This method returns the parameter initializer for this type of layer
        //In this case, we can use the DefaultParamInitializer, which is the same one used for DenseLayer
        // (DenseLayer, CustomLayer and OutputLayer all have FeedForwardLayer as a base class)
        //For more complex layers, you may need to implement a custom parameter initializer
        //See the various parameter initializers here:
        //https://github.com/deeplearning4j/deeplearning4j/tree/master/deeplearning4j-core/src/main/java/org/deeplearning4j/nn/params
        return DefaultParamInitializer.getInstance();
    }

    //Here's an implementation of a builder pattern, to allow us to easily configure the layer
    //Note that we are inheriting all of the FeedForwardLayer.Builder options: things like nIn and nOut
    //Builder is a static inner class of CustomLayer, which is what makes new CustomLayer.Builder() usable above
    public static class Builder extends FeedForwardLayer.Builder<Builder> {

        private String secondActivationFunction;    //This is an example of a custom property in the configuration

        /**
         * A custom property used in this custom layer example. See the CustomLayerExampleReadme.md for details
         *
         * @param secondActivationFunction Second activation function for the layer
         */
        public Builder secondActivationFunction(String secondActivationFunction) {
            this.secondActivationFunction = secondActivationFunction;
            return this;    //Return the builder itself so calls can be chained
        }

        @Override
        @SuppressWarnings("unchecked")  //To stop warnings about unchecked cast. Not required.
        public CustomLayer build() {
            return new CustomLayer(this);   //Construct the configuration from this builder
        }
    }
}
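To make the role of the builder and the getter/setter pair concrete, here is a minimal, hypothetical snippet (not part of the original example) that builds the layer on its own and reads the custom property back. The value set via secondActivationFunction(...) on the Builder is what ends up on the CustomLayer instance and, through the getter/setter, in the JSON/YAML configuration.

// Hypothetical standalone use of the builder defined above
CustomLayer layer = new CustomLayer.Builder()
        .activation("tanh")                     // option inherited from FeedForwardLayer.Builder
        .secondActivationFunction("sigmoid")    // the custom property added in this example
        .nIn(6).nOut(7)
        .build();

System.out.println(layer.getSecondActivationFunction());   // prints "sigmoid"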

CustomLayerImpl is the implementation class for the custom layer; its code is as follows:
public class CustomLayerImpl extends BaseLayer<CustomLayer> {   //Generic parameter here: the configuration class type
    //CustomLayerImpl extends the abstract BaseLayer, which in turn extends the abstract Layer class,
    // so the methods below are all overrides.

    public CustomLayerImpl(NeuralNetConfiguration conf) {
        super(conf);    //Pass the configuration to the base class
    }

    @Override
    public INDArray preOutput(INDArray x, boolean training) {
        /*
        The preOut method(s) calculate the activations (forward pass), before the activation function is applied.

        Because we aren't doing anything different to a standard dense layer, we can use the existing implementation
        for this. Other network types (RNNs, CNNs etc) will require you to implement this method.

        For custom layers, you may also have to implement methods such as calcL1, calcL2, numParams, etc.
         */
        return super.preOutput(x, training);
    }

    @Override
    public INDArray activate(boolean training) {
        /*
        The activate method is used for doing forward pass. Note that it relies on the pre-output method;
        essentially we are just applying the activation function (or, functions in this example).
        In this particular (contrived) example, we have TWO activation functions - one for the first half of the outputs
        and another for the second half.
         */
        INDArray output = preOutput(training);      //Pre-activation output
        int columns = output.columns();

        //get(...) returns views of 'output': NDArrayIndex.all() selects every row,
        // NDArrayIndex.interval(a, b) selects columns a (inclusive) to b (exclusive)
        INDArray firstHalf = output.get(NDArrayIndex.all(), NDArrayIndex.interval(0, columns / 2));
        INDArray secondHalf = output.get(NDArrayIndex.all(), NDArrayIndex.interval(columns / 2, columns));

        String activation1 = conf.getLayer().getActivationFunction();   //conf is a field inherited from BaseLayer
        String activation2 = ((CustomLayer) conf.getLayer()).getSecondActivationFunction();

        //Apply each activation function in place: getExecutioner() runs the op created by getOpFactory().createTransform(name, array).
        //Because firstHalf and secondHalf are views backed by the same memory as 'output',
        // 'output' now holds the two differently-activated halves.
        Nd4j.getExecutioner().exec(Nd4j.getOpFactory().createTransform(activation1, firstHalf));
        Nd4j.getExecutioner().exec(Nd4j.getOpFactory().createTransform(activation2, secondHalf));

        return output;
    }

    @Override
    public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
        /*
        The backprop gradient method here is very similar to the BaseLayer backprop gradient implementation
        The only major difference is the two activation functions we have added in this example.

        Note that epsilon is dL/da - i.e., the derivative of the loss function with respect to the activations.
        It has the exact same shape as the activation arrays (i.e., the output of preOut and activate methods)
        This is NOT the 'delta' commonly used in the neural network literature; the delta is obtained from the
        epsilon ("epsilon" is dl4j's notation) by doing an element-wise product with the activation function derivative.

        Note the following:
        1. It is very important that you use the gradientViews arrays for the results.
           Note the gradientViews.get(...) and the in-place operations here.
           This is because DL4J uses a single large array for the gradients for efficiency. Subsets of this array (views)
           are distributed to each of the layers for efficient backprop and memory management.
        2. The method returns two things, as a Pair:
           (a) a Gradient object (essentially a Map<String,INDArray> of the gradients for each parameter; again, these
               are views of the full network gradient array)
           (b) an INDArray. This INDArray is the 'epsilon' to pass to the layer below, i.e. it is the gradient with
               respect to the input to this layer
        */
        INDArray activationDerivative = preOutput(true);    //In the gradient-check setup (5 examples, 3 output units) this is a 5x3 matrix
        int columns = activationDerivative.columns();

        //With 3 columns, firstHalf is column 0 and secondHalf is columns 1-2
        INDArray firstHalf = activationDerivative.get(NDArrayIndex.all(), NDArrayIndex.interval(0, columns / 2));
        INDArray secondHalf = activationDerivative.get(NDArrayIndex.all(), NDArrayIndex.interval(columns / 2, columns));

        String activation1 = conf.getLayer().getActivationFunction();
        String activation2 = ((CustomLayer) conf.getLayer()).getSecondActivationFunction();

        //Same pattern as in activate(...), but with .derivative() so the ops compute each activation function's derivative
        Nd4j.getExecutioner().exec(Nd4j.getOpFactory().createTransform(activation1, firstHalf).derivative());
        Nd4j.getExecutioner().exec(Nd4j.getOpFactory().createTransform(activation2, secondHalf).derivative());

        //The remaining code for this method: just copy & pasted from BaseLayer.backpropGradient
        INDArray delta = epsilon.muli(activationDerivative);   //Element-wise product of epsilon and the activation derivative; also 5x3 here
        if (maskArray != null) {                                //If a mask array is present, apply it as well
            delta.muliColumnVector(maskArray);
        }

        Gradient ret = new DefaultGradient();

        //gradientViews is a Map field of BaseLayer with two keys here (weights and biases);
        // each value is that parameter's view of the full network gradient array
        INDArray weightGrad = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY);
        //Matrix multiply: weightGrad = 1.0 * op(input) * delta + 0.0 * weightGrad, where true/false mean
        // "transpose input" and "don't transpose delta"; the result is a 3x3 matrix here
        Nd4j.gemm(input, delta, weightGrad, true, false, 1.0, 0.0);
        INDArray biasGrad = gradientViews.get(DefaultParamInitializer.BIAS_KEY);
        biasGrad.assign(delta.sum(0));  //TODO: do this without the assign    //delta.sum(0) sums delta over the minibatch (column-wise)
        ret.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGrad);
        ret.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGrad);

        //params is also a BaseLayer field; the weight matrix (3x3) times delta transposed (3x5), transposed back,
        // gives the 5x3 epsilon to pass to the layer below
        INDArray epsilonNext = params.get(DefaultParamInitializer.WEIGHT_KEY).mmul(delta.transpose()).transpose();

        return new Pair<>(ret, epsilonNext);
    }
}
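In equation form, backpropGradient implements the usual dense-layer backward pass, just with the activation derivative f' applied block-wise using two different functions. Writing the pre-activations as Z = XW + b (X is the layer input, W the weights, b the biases, m the minibatch size) and the loss gradient w.r.t. the activations as epsilon:

\[
\delta = \epsilon \odot f'(Z), \qquad
\frac{\partial L}{\partial W} = X^{\top}\delta, \qquad
\frac{\partial L}{\partial b} = \sum_{i=1}^{m}\delta_{i,:}, \qquad
\epsilon_{\text{next}} = \frac{\partial L}{\partial X} = \delta\, W^{\top}
\]

This is exactly what the code computes: the gemm call produces X^T * delta, delta.sum(0) gives the column sums for the bias gradient, and W.mmul(delta.transpose()).transpose() is (W * delta^T)^T = delta * W^T. In the gradient-check configuration X is 5x3 and W is 3x3, so delta and epsilonNext are both 5x3, matching the input shape as required.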

