ConvnetJS Source Code Analysis, Part 3


The previous two articles covered the basic data structure Vol and the neural network components (the Layer classes). The two remaining files are convnet_net.js and convnet_trainers.js.

2.6 convnet_net.js

The second article already sketched what this file does: it takes an array of layer definitions and assembles them into a complete neural network. A short sketch of the desugaring step follows, and then the full code of convnet_net.js. My own annotations (originally in Chinese, translated here) are interleaved with the library author's English comments.
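To make the desugaring step concrete, here is a minimal illustration of my own (not from the original article): a small, hypothetical definition list and what makeLayers' desugar pass roughly turns it into, based on the code below.

// A hypothetical definition list passed to makeLayers:
var layer_defs = [
  {type:'input', out_sx:1, out_sy:1, out_depth:2},
  {type:'fc', num_neurons:20, activation:'relu'},
  {type:'softmax', num_classes:10}
];
// After desugar it becomes (roughly):
// [ {type:'input', ...},
//   {type:'fc', num_neurons:20, activation:'relu', bias_pref:0.1},
//   {type:'relu'},                 // activation layer appended automatically
//   {type:'fc', num_neurons:10},   // fc layer inserted automatically before the softmax
//   {type:'softmax', num_classes:10} ]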

(function(global) {
  "use strict";
  var Vol = global.Vol; // convenience
  var assert = global.assert;

  // Net manages a set of layers
  // For now constraints: Simple linear order of layers, first layer input last layer a cost layer
  var Net = function(options) {
    this.layers = []; // an array of Layer instances: the first is an input layer, the last a loss layer. layers is the central property of Net.
  }

  Net.prototype = {

    // takes a list of layer definitions and creates the network layer objects
    makeLayers: function(defs) {

      // few checks
      assert(defs.length >= 2, 'Error! At least one input layer and one loss layer are required.');
      assert(defs[0].type === 'input', 'Error! First layer must be the input layer, to declare size of inputs');

      // desugar performs some checks on the user-supplied layer definitions, inserts the implied
      // fully-connected layers, and appends an activation layer for each declared activation type.
      // This was covered in detail in the addendum to the previous article.
      // desugar layer_defs for adding activation, dropout layers etc
      var desugar = function() {
        var new_defs = [];
        for(var i=0;i<defs.length;i++) {
          var def = defs[i];

          if(def.type==='softmax' || def.type==='svm') {
            // add an fc layer here, there is no reason the user should
            // have to worry about this and we almost always want to
            new_defs.push({type:'fc', num_neurons: def.num_classes});
          }

          if(def.type==='regression') {
            // add an fc layer here, there is no reason the user should
            // have to worry about this and we almost always want to
            new_defs.push({type:'fc', num_neurons: def.num_neurons});
          }

          if((def.type==='fc' || def.type==='conv')
              && typeof(def.bias_pref) === 'undefined'){
            def.bias_pref = 0.0;
            if(typeof def.activation !== 'undefined' && def.activation === 'relu') {
              def.bias_pref = 0.1; // relus like a bit of positive bias to get gradients early
              // otherwise it's technically possible that a relu unit will never turn on (by chance)
              // and will never get any gradient and never contribute any computation. Dead relu.
            }
          }

          new_defs.push(def);

          if(typeof def.activation !== 'undefined') {
            if(def.activation==='relu') { new_defs.push({type:'relu'}); }
            else if (def.activation==='sigmoid') { new_defs.push({type:'sigmoid'}); }
            else if (def.activation==='tanh') { new_defs.push({type:'tanh'}); }
            else if (def.activation==='maxout') {
              // create maxout activation, and pass along group size, if provided
              var gs = def.group_size !== 'undefined' ? def.group_size : 2;
              new_defs.push({type:'maxout', group_size:gs});
            }
            else { console.log('ERROR unsupported activation ' + def.activation); }
          }
          if(typeof def.drop_prob !== 'undefined' && def.type !== 'dropout') {
            new_defs.push({type:'dropout', drop_prob: def.drop_prob});
          }

        }
        return new_defs;
      }
      defs = desugar(defs);

      // The lines below (shown in Figure 1 of the second article) turn the layer definitions into
      // actual layer instances and store them in the layers property.
      // create the layers
      this.layers = [];
      for(var i=0;i<defs.length;i++) {
        var def = defs[i];
        if(i>0) {
          var prev = this.layers[i-1];
          def.in_sx = prev.out_sx;
          def.in_sy = prev.out_sy;
          def.in_depth = prev.out_depth;
        }

        switch(def.type) {
          case 'fc': this.layers.push(new global.FullyConnLayer(def)); break;
          case 'lrn': this.layers.push(new global.LocalResponseNormalizationLayer(def)); break;
          case 'dropout': this.layers.push(new global.DropoutLayer(def)); break;
          case 'input': this.layers.push(new global.InputLayer(def)); break;
          case 'softmax': this.layers.push(new global.SoftmaxLayer(def)); break;
          case 'regression': this.layers.push(new global.RegressionLayer(def)); break;
          case 'conv': this.layers.push(new global.ConvLayer(def)); break;
          case 'pool': this.layers.push(new global.PoolLayer(def)); break;
          case 'relu': this.layers.push(new global.ReluLayer(def)); break;
          case 'sigmoid': this.layers.push(new global.SigmoidLayer(def)); break;
          case 'tanh': this.layers.push(new global.TanhLayer(def)); break;
          case 'maxout': this.layers.push(new global.MaxoutLayer(def)); break;
          case 'svm': this.layers.push(new global.SVMLayer(def)); break;
          default: console.log('ERROR: UNRECOGNIZED LAYER TYPE: ' + def.type);
        }
      }
    },

    /* Net's second method is forward propagation. Note that this forward() differs from the
       per-layer forward() seen earlier: a layer-level forward() maps that layer's in_act to its
       out_act (whether it is a weighted layer or an activation layer), whereas Net.forward()
       calls forward() on every layer in turn. A Vol input is transformed by the input layer into
       an out_act Vol, which then becomes the in_act of the next layer, and so on down to the last
       layer. By inspecting out_act at different layers we can therefore see what the data looks
       like at each stage of processing; the author's demos make use of this. */
    // forward prop the network. The trainer class passes is_training = true, but when this function is
    // called from outside (not from the trainer), it defaults to prediction mode
    forward: function(V, is_training) {
      if(typeof(is_training) === 'undefined') is_training = false;
      var act = this.layers[0].forward(V, is_training);
      for(var i=1;i<this.layers.length;i++) {
        act = this.layers[i].forward(act, is_training);
      }
      return act;
    },

    getCostLoss: function(V, y) {
      // loss here is the return value of the last layer's backward(); see convnet_layers_loss.js
      // for details. Calling this function returns the loss under the current weights.
      this.forward(V, false);
      var N = this.layers.length;
      var loss = this.layers[N-1].backward(y);
      return loss;
    },

    // backprop: compute gradients wrt all parameters. Backpropagation is carried out layer by layer.
    backward: function(y) {
      var N = this.layers.length;
      var loss = this.layers[N-1].backward(y); // last layer assumed to be loss layer
      for(var i=N-2;i>=0;i--) { // first layer assumed input
        this.layers[i].backward();
      }
      return loss;
    },

    getParamsAndGrads: function() {
      // accumulate parameters and gradients for the entire network
      var response = [];
      for(var i=0;i<this.layers.length;i++) {
        var layer_reponse = this.layers[i].getParamsAndGrads();
        for(var j=0;j<layer_reponse.length;j++) {
          response.push(layer_reponse[j]);
        }
      }
      return response;
    },

    getPrediction: function() {
      // this is a convenience function for returning the argmax
      // prediction, assuming the last layer of the net is a softmax
      var S = this.layers[this.layers.length-1];
      assert(S.layer_type === 'softmax', 'getPrediction function assumes softmax as last layer of the net!');

      var p = S.out_act.w;
      var maxv = p[0];
      var maxi = 0;
      for(var i=1;i<p.length;i++) {
        if(p[i] > maxv) { maxv = p[i]; maxi = i;}
      }
      return maxi; // return index of the class with highest class probability
    },

    toJSON: function() {
      var json = {};
      json.layers = [];
      for(var i=0;i<this.layers.length;i++) {
        json.layers.push(this.layers[i].toJSON());
      }
      return json;
    },

    fromJSON: function(json) {
      this.layers = [];
      for(var i=0;i<json.layers.length;i++) {
        var Lj = json.layers[i];
        var t = Lj.layer_type;
        var L;
        if(t==='input') { L = new global.InputLayer(); }
        if(t==='relu') { L = new global.ReluLayer(); }
        if(t==='sigmoid') { L = new global.SigmoidLayer(); }
        if(t==='tanh') { L = new global.TanhLayer(); }
        if(t==='dropout') { L = new global.DropoutLayer(); }
        if(t==='conv') { L = new global.ConvLayer(); }
        if(t==='pool') { L = new global.PoolLayer(); }
        if(t==='lrn') { L = new global.LocalResponseNormalizationLayer(); }
        if(t==='softmax') { L = new global.SoftmaxLayer(); }
        if(t==='regression') { L = new global.RegressionLayer(); }
        if(t==='fc') { L = new global.FullyConnLayer(); }
        if(t==='maxout') { L = new global.MaxoutLayer(); }
        if(t==='svm') { L = new global.SVMLayer(); }
        L.fromJSON(Lj);
        this.layers.push(L);
      }
    }
  }

  global.Net = Net;
})(convnetjs);
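As a quick orientation before moving on, here is a minimal usage sketch of my own for the Net class (the layer sizes are arbitrary and chosen only for illustration):

// build a tiny convolutional network ending in a softmax
var net = new convnetjs.Net();
net.makeLayers([
  {type:'input', out_sx:24, out_sy:24, out_depth:1},
  {type:'conv', sx:5, filters:8, stride:1, pad:2, activation:'relu'},
  {type:'pool', sx:2, stride:2},
  {type:'softmax', num_classes:10}
]);

var x = new convnetjs.Vol(24, 24, 1);   // a Vol filled with random values
var prob = net.forward(x);              // out_act of the last (softmax) layer
console.log('class 0 probability: ' + prob.w[0]);
console.log('predicted class: ' + net.getPrediction());

Note how desugar silently inserts the relu layer after the conv layer and a fully-connected layer of 10 neurons before the softmax, exactly as described above.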


2.7 convnet_trainers.js

The last file defines the Trainer class, which exposes only a single method, train(). A Trainer is constructed from the neural network built above plus a set of options specified at training time.

Below is the full code of convnet_trainers.js; my own annotations (originally in Chinese) are translated into English. From the analysis it is clear that the training procedure is plain backpropagation. This differs from the then-common recipe for deep networks, in which the network is first pre-trained in an unsupervised fashion, the resulting weights are used to initialize the network, and BP is then applied only for fine-tuning.

The author's default training method is Stochastic Gradient Descent (SGD). The per-parameter update combines the standard gradient with a momentum term and L1/L2 regularization terms.
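Before diving into the full file, here is a small self-contained sketch (my own simplification, not part of ConvnetJS) of that default update rule, i.e. the SGD-with-momentum branch with the L1/L2 decay folded into the gradient, as it appears in the code below:

// p: weights, g: gradients, gsum: momentum buffer (plain arrays of equal length)
function sgdMomentumStep(p, g, gsum, lr, momentum, l1_decay, l2_decay, batch_size) {
  for (var j = 0; j < p.length; j++) {
    var l1grad = l1_decay * (p[j] > 0 ? 1 : -1); // gradient of l1_decay * |p_j|
    var l2grad = l2_decay * p[j];                // gradient of l2_decay * p_j^2 / 2
    var gij = (l2grad + l1grad + g[j]) / batch_size; // raw batch gradient
    var dx = momentum * gsum[j] - lr * gij;      // momentum step
    gsum[j] = dx;                                // remember the step for the next iteration
    p[j] += dx;                                  // apply the update
    g[j] = 0.0;                                  // reset the accumulated gradient
  }
}

With momentum set to 0 this collapses to vanilla SGD, p[j] -= lr * gij.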

(function(global) {
  "use strict";
  var Vol = global.Vol; // convenience

  var Trainer = function(net, options) {

    this.net = net;

    var options = options || {};
    this.learning_rate = typeof options.learning_rate !== 'undefined' ? options.learning_rate : 0.01;
    this.l1_decay = typeof options.l1_decay !== 'undefined' ? options.l1_decay : 0.0;
    this.l2_decay = typeof options.l2_decay !== 'undefined' ? options.l2_decay : 0.0;
    this.batch_size = typeof options.batch_size !== 'undefined' ? options.batch_size : 1;
    this.method = typeof options.method !== 'undefined' ? options.method : 'sgd'; // sgd/adam/adagrad/adadelta/windowgrad/netsterov

    this.momentum = typeof options.momentum !== 'undefined' ? options.momentum : 0.9;
    this.ro = typeof options.ro !== 'undefined' ? options.ro : 0.95; // used in adadelta
    this.eps = typeof options.eps !== 'undefined' ? options.eps : 1e-8; // used in adam or adadelta
    this.beta1 = typeof options.beta1 !== 'undefined' ? options.beta1 : 0.9; // used in adam
    this.beta2 = typeof options.beta2 !== 'undefined' ? options.beta2 : 0.999; // used in adam

    this.k = 0; // iteration counter
    this.gsum = []; // last iteration gradients (used for momentum calculations)
    this.xsum = []; // used in adam or adadelta

    // check if regression is expected
    if(this.net.layers[this.net.layers.length - 1].layer_type === "regression")
      this.regression = true;
    else
      this.regression = false;
  }

  Trainer.prototype = {
    train: function(x, y) {

      var start = new Date().getTime();
      this.net.forward(x, true); // also set the flag that lets the net know we're just training
      var end = new Date().getTime();
      var fwd_time = end - start;

      var start = new Date().getTime();
      var cost_loss = this.net.backward(y);
      var l2_decay_loss = 0.0;
      var l1_decay_loss = 0.0;
      var end = new Date().getTime();
      var bwd_time = end - start;

      if(this.regression && y.constructor !== Array)
        console.log("Warning: a regression net requires an array as training output vector.");

      this.k++;
      if(this.k % this.batch_size === 0) {

        var pglist = this.net.getParamsAndGrads(); // parameters and gradients of every layer in the current net

        // initialize lists for accumulators. Will only be done once on first iteration
        if(this.gsum.length === 0 && (this.method !== 'sgd' || this.momentum > 0.0)) {
          // only vanilla sgd doesnt need either lists
          // momentum needs gsum
          // adagrad needs gsum
          // adam and adadelta needs gsum and xsum
          for(var i=0;i<pglist.length;i++) {
            this.gsum.push(global.zeros(pglist[i].params.length)); // accumulator gsum, same length as the parameter list
            if(this.method === 'adam' || this.method === 'adadelta') {
              this.xsum.push(global.zeros(pglist[i].params.length));
            } else {
              this.xsum.push([]); // conserve memory
            }
          }
        }

        // perform an update for all sets of weights
        for(var i=0;i<pglist.length;i++) {
          var pg = pglist[i]; // param, gradient, other options in future (custom learning rate etc)
          var p = pg.params; // p: the network weights
          var g = pg.grads;  // g: the corresponding gradients

          // learning rate for some parameters.
          var l2_decay_mul = typeof pg.l2_decay_mul !== 'undefined' ? pg.l2_decay_mul : 1.0;
          var l1_decay_mul = typeof pg.l1_decay_mul !== 'undefined' ? pg.l1_decay_mul : 1.0;
          var l2_decay = this.l2_decay * l2_decay_mul;
          var l1_decay = this.l1_decay * l1_decay_mul;

          var plen = p.length;
          for(var j=0;j<plen;j++) {
            l2_decay_loss += l2_decay*p[j]*p[j]/2; // accumulate weight decay loss
            l1_decay_loss += l1_decay*Math.abs(p[j]); // the two regularization choices for the weights: L1 and L2
            var l1grad = l1_decay * (p[j] > 0 ? 1 : -1);
            var l2grad = l2_decay * (p[j]); // gradients of the regularization terms; note how L1 and L2 differ

            var gij = (l2grad + l1grad + g[j]) / this.batch_size; // raw batch gradient (the mini-batch gradient formula)

            // selection among the different update rules; the default is SGD, handled in the final else branch
            var gsumi = this.gsum[i];
            var xsumi = this.xsum[i];
            if(this.method === 'adam') {
              // adam update
              gsumi[j] = gsumi[j] * this.beta1 + (1- this.beta1) * gij; // update biased first moment estimate
              xsumi[j] = xsumi[j] * this.beta2 + (1-this.beta2) * gij * gij; // update biased second moment estimate
              var biasCorr1 = gsumi[j] * (1 - Math.pow(this.beta1, this.k)); // correct bias first moment estimate
              var biasCorr2 = xsumi[j] * (1 - Math.pow(this.beta2, this.k)); // correct bias second moment estimate
              var dx =  - this.learning_rate * biasCorr1 / (Math.sqrt(biasCorr2) + this.eps);
              p[j] += dx;
            } else if(this.method === 'adagrad') {
              // adagrad update
              gsumi[j] = gsumi[j] + gij * gij;
              var dx = - this.learning_rate / Math.sqrt(gsumi[j] + this.eps) * gij;
              p[j] += dx;
            } else if(this.method === 'windowgrad') {
              // this is adagrad but with a moving window weighted average
              // so the gradient is not accumulated over the entire history of the run.
              // it's also referred to as Idea #1 in Zeiler paper on Adadelta. Seems reasonable to me!
              gsumi[j] = this.ro * gsumi[j] + (1-this.ro) * gij * gij;
              var dx = - this.learning_rate / Math.sqrt(gsumi[j] + this.eps) * gij; // eps added for better conditioning
              p[j] += dx;
            } else if(this.method === 'adadelta') {
              gsumi[j] = this.ro * gsumi[j] + (1-this.ro) * gij * gij;
              var dx = - Math.sqrt((xsumi[j] + this.eps)/(gsumi[j] + this.eps)) * gij;
              xsumi[j] = this.ro * xsumi[j] + (1-this.ro) * dx * dx; // yes, xsum lags behind gsum by 1.
              p[j] += dx;
            } else if(this.method === 'nesterov') {
              var dx = gsumi[j];
              gsumi[j] = gsumi[j] * this.momentum + this.learning_rate * gij;
              dx = this.momentum * dx - (1.0 + this.momentum) * gsumi[j];
              p[j] += dx;
            } else {
              // assume SGD
              if(this.momentum > 0.0) {
                // momentum update
                var dx = this.momentum * gsumi[j] - this.learning_rate * gij; // step
                gsumi[j] = dx; // back this up for next iteration of momentum
                p[j] += dx; // apply corrected gradient
              } else {
                // vanilla sgd
                p[j] +=  - this.learning_rate * gij;
              }
            }
            g[j] = 0.0; // zero out gradient so that we can begin accumulating anew
          }
        }
      }

      // appending softmax_loss for backwards compatibility, but from now on we will always use cost_loss
      // in future, TODO: have to completely redo the way loss is done around the network as currently
      // loss is a bit of a hack. Ideally, user should specify arbitrary number of loss functions on any layer
      // and it should all be computed correctly and automatically.
      return {fwd_time: fwd_time, bwd_time: bwd_time,
              l2_decay_loss: l2_decay_loss, l1_decay_loss: l1_decay_loss,
              cost_loss: cost_loss, softmax_loss: cost_loss,
              loss: cost_loss + l1_decay_loss + l2_decay_loss}
    }
  }

  global.Trainer = Trainer;
  global.SGDTrainer = Trainer; // backwards compatibility
})(convnetjs);
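For completeness, here is a minimal sketch of my own showing how the Trainer is typically driven. It assumes the net built earlier ends in a softmax layer, and train_data is a hypothetical array of {x: Vol, label: integer} pairs:

var trainer = new convnetjs.Trainer(net, {
  method: 'sgd', learning_rate: 0.01, momentum: 0.9,
  l2_decay: 0.001, batch_size: 10
});

for (var i = 0; i < train_data.length; i++) {
  var stats = trainer.train(train_data[i].x, train_data[i].label);
  // stats.loss = cost_loss + l1_decay_loss + l2_decay_loss, as returned above
}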

At this point, all of the basic API functionality has been analyzed. There is one more JS file in src, convnet_magicnet.js, which defines the MagicNet class. The author describes it as follows:

           The MagicNet class performs fully-automatic prediction on your data. You don't have to worry about anything except providing your data and letting it train for a while. Internally, the MagicNet tries out many different types of networks, performs n-fold cross-validations of network hyper-parameters across folds of your data, and creates a final classifier ensemble by model averaging the best architectures.

As the description shows, MagicNet is a higher-level wrapper around the basic API above, so users do not need to worry about the internal model selection. convnet_magicnet.js will be examined in detail in a later article covering the more polished ConvnetJS applications.


