深度学习-递归网络

来源：互联网　发布：金融数据上报客户端　编辑：程序博客网　时间：2024/04/30 01:33
先介绍一个简单的例子：把字符串的第一个字符输入网络，让递归网络（RNN）逐个字符地复述出后面的字符串。下面结合代码看看它是如何工作的。
public class BasicRNNExample {   // define a sentence to learn   public static final char[] LEARNSTRING = "Der Cottbuser Postkutscher putzt den Cottbuser Postkutschkasten.".toCharArray();//定义一个要学习的字符数组   // a list of all possible characters   public static final List<Character> LEARNSTRING_CHARS_LIST = new ArrayList<Character>();//再定义一个所有可能的字符列表，也就是字符集合   // RNN dimensions   public static final int HIDDEN_LAYER_WIDTH = 50;//定义RNN网络的维度，隐层宽度是50   public static final int HIDDEN_LAYER_CONT = 2;//隐层的数量是2，也就是2个隐层   public static final Random r = new Random(7894);//随机生成器   public static void main(String[] args) {      // create a dedicated list of possible chars in LEARNSTRING_CHARS_LIST      LinkedHashSet<Character> LEARNSTRING_CHARS = new LinkedHashSet<Character>();//创建一个专用的linkedhashset存放要学习字符串的可能字符      for (char c : LEARNSTRING)         LEARNSTRING_CHARS.add(c);//把要学习的字符数组中的唯一字符放入刚才创建的linkedhashset      LEARNSTRING_CHARS_LIST.addAll(LEARNSTRING_CHARS);//把刚才的linkedhashset放到定义的字符列表      // some common parameters      NeuralNetConfiguration.Builder builder = new NeuralNetConfiguration.Builder();//和cnn一样，定义网络，设置迭代次数，学习率，参数优化方法为随机梯度下降，随机种子，初始化偏差为0，不做批处理，也就是一次处理全部数据，参数更新方法为可变学习率的方法，初始化权重服从均值为0，方差为2.0/(fanIn + fanOut)的高斯分布，fanIn是上一层节点数，fanOut是当前层节点数      builder.iterations(10);      builder.learningRate(0.001);      builder.optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT);      builder.seed(123);      builder.biasInit(0);      builder.miniBatch(false);      builder.updater(Updater.RMSPROP);      builder.weightInit(WeightInit.XAVIER);      ListBuilder listBuilder = builder.list();//创建一个多层配置的列表，list方法相当于配置的数组，所以是多层配置列表      // first difference, for rnns we need to use GravesLSTM.Builder      for (int i = 0; i < HIDDEN_LAYER_CONT; i++) {//遍历隐层层数         GravesLSTM.Builder hiddenLayerBuilder = new GravesLSTM.Builder();//首先需要创建LSTM递归网的层配置，这里的LSTM是基于有监督序列标签的递归网         hiddenLayerBuilder.nIn(i == 0 ? 
LEARNSTRING_CHARS.size() : HIDDEN_LAYER_WIDTH);//如果是隐层的第一层，输入大小为字符唯一计数20，否则输入为隐层大小50         hiddenLayerBuilder.nOut(HIDDEN_LAYER_WIDTH);//输出大小也为隐层大小50         // adopted activation function from GravesLSTMCharModellingExample         // seems to work well with RNNs         hiddenLayerBuilder.activation("tanh");//设置tanh为激活函数         listBuilder.layer(i, hiddenLayerBuilder.build());//把递归网的层配置装入网络配置列表      }      // we need to use RnnOutputLayer for our RNN      RnnOutputLayer.Builder outputLayerBuilder = new RnnOutputLayer.Builder(LossFunction.MCXENT);//配置RNN的输出层，设定损失函数为交叉熵      // softmax normalizes the output neurons, the sum of all outputs is 1      // this is required for our sampleFromDistribution-function      outputLayerBuilder.activation("softmax");//设定输出层的激活函数为softmax,为了使输出概率和为1      outputLayerBuilder.nIn(HIDDEN_LAYER_WIDTH);//设定输出层的输入节点50      outputLayerBuilder.nOut(LEARNSTRING_CHARS.size());//设定输出层的输出节点20      listBuilder.layer(HIDDEN_LAYER_CONT, outputLayerBuilder.build());//装入网络配置列表末端      // finish builder      listBuilder.pretrain(false);//不进行预训练      listBuilder.backprop(true);//使用反向传播      // create network      MultiLayerConfiguration conf = listBuilder.build();//通过build方法变成多层配置，注意MultiLayerConfiguration和NeuralNetConfiguration的关系      MultiLayerNetwork net = new MultiLayerNetwork(conf);//把配置转入多层网络      net.init();//初始化网络      net.setListeners(new ScoreIterationListener(1));//设置监听器，每次迭代打印score      /*       * CREATE OUR TRAINING DATA       */      // create input and output arrays: SAMPLE_INDEX, INPUT_NEURON,      // SEQUENCE_POSITION      INDArray input = Nd4j.zeros(1, LEARNSTRING_CHARS_LIST.size(), LEARNSTRING.length);//创建全0输入向量，1代表1维数组，每个元素是一个20*64的矩阵，20是字符计数，64是要学习的字符串长度      INDArray labels = Nd4j.zeros(1, LEARNSTRING_CHARS_LIST.size(), LEARNSTRING.length);//创建全0标签向量，1代表1维数组，每个元素是一个20*64的矩阵      // loop through our sample-sentence      int samplePos = 0;//初始索引为0      for (char currentChar : LEARNSTRING) {//遍历要学习的字符串，构造输入数据         // small 
hack: when currentChar is the last, take the first char as         // nextChar - not really required//如果当前字符是最后一个字符，把第一个字符作为下一个字符，这个不是必须的         char nextChar = LEARNSTRING[(samplePos + 1) % (LEARNSTRING.length)];//当前索引对要学习的字符串总长度求余，取出的字符作为下一个字符         // input neuron for current-char is 1 at "samplePos"         input.putScalar(new int[] { 0, LEARNSTRING_CHARS_LIST.indexOf(currentChar), samplePos }, 1);//new int[] { 0, LEARNSTRING_CHARS_LIST.indexOf(currentChar), samplePos }本身是一个3个元素的数组，元素分别是0，当前字符的在字符集合中的索引，自增位置，1代表置这个元素为1，putScalar就是把数组位置索引的元素置为1         // output neuron for next-char is 1 at "samplePos"         labels.putScalar(new int[] { 0, LEARNSTRING_CHARS_LIST.indexOf(nextChar), samplePos }, 1);//同理new int[] { 0, LEARNSTRING_CHARS_LIST.indexOf(currentChar), samplePos }本身是一个3个元素的数组，元素分别是0，下一个字符的在字符集合中的索引，自增位置，1代表置这个元素为1，putScalar就是把数组位置索引的元素置为1         samplePos++;//位置自增      }      DataSet trainingData = new DataSet(input, labels);//由上面两个数组构成训练数据,最终输入数据有是一个长度为20的数组，有64组数据，这样做的目的是保证每次只输入一个字符，其他字符都是0，然后顺序的输入其他字符，这也符合递归或者流式的思想，标签一直都代表下一个要输入的字符来不断矫正输入的字符      // some epochs      for (int epoch = 0; epoch < 100; epoch++) {//按步数训练         System.out.println("Epoch " + epoch);         // train the data         net.fit(trainingData);//网络定型，这里的输入和cnn的不太一样，trainingData的列是输入样本，cnn的行是输入样本，很奇怪         // clear current stance from the last example         net.rnnClearPreviousState();//由于要重新训练一批数据，所以把清除之前RNN时间步的状态和相应存储状态激活值的反向时间步状态         // put the first caracter into the rrn as an initialisation         INDArray testInit = Nd4j.zeros(LEARNSTRING_CHARS_LIST.size());//搞一个全0的长度为20的测试数组         testInit.putScalar(LEARNSTRING_CHARS_LIST.indexOf(LEARNSTRING[0]), 1);//给测试数据赋值，把第一个字符在唯一字符索引中的位置对应的数组位置置为1         // run one step -> IMPORTANT: rnnTimeStep() must be called, not         // output()         // the output shows what the net thinks what should come next         INDArray output = 
net.rnnTimeStep(testInit);//运行一个时间步，这里testInit是一个输入，关于rnnTimeStep的说明：如果多层网络包含一个或多个RNN层，前向传递但使用之前存储的状态递归RNN层，最终时间步的激活值也存在RNN层用于下次rnnTimeStep的调用，这样一次可以产生一个时间步或多个时间步的输出而不必总是从t=0开始前向传播，本例的数据是流式数据，一次产生一个时间步输出并反向传播给网络作为输入，如果没有之前的状态，例如通过初始化或者调用rnnClearPreviousState产生保存状态，默认的初始化通常是0，支持小批处理，例如并行训练预测多个值，也支持简单例子，输入input是参数，可能是一个时间步也可能是多个时间步，输入维度是[miniBatchSize,inputSize]或者[miniBatchSize,inputSize,1],对于简单例子miniBatchSize=1,对于多时间步是[miniBatchSize,inputSize,inputTimeSerierLength],输出激活值是返回值，如果输出是rnn层，如果输入维度是[miniBatchSize,inputSize]的二维形式，输出的维度也是[miniBatchSize,outputSize]的2d形式，否则输出是[miniBatchSize,outputSize,inputTimeSeriesLength]的3d形式          // now the net sould guess LEARNSTRING.length mor characters//现在开始猜测字符串         for (int j = 0; j < LEARNSTRING.length; j++) {//一共猜测多少次，也就是要猜测字符串的长度            // first process the last output of the network to a concrete            // neuron, the neuron with the highest output cas the highest            // cance to get chosen//首先处理一个具体神经元的最后输出，选择输出最大的神经元            double[] outputProbDistribution = new double[LEARNSTRING_CHARS.size()];//预测输出概率的数组            for (int k = 0; k < outputProbDistribution.length; k++) {//给预测输出概率数组赋值               outputProbDistribution[k] = output.getDouble(k);            }            int sampledCharacterIdx = findIndexOfHighestValue(outputProbDistribution);//找出数组中的最大值索引            // print the chosen output            System.out.print(LEARNSTRING_CHARS_LIST.get(sampledCharacterIdx));//输出预测值            // use the last output as input            INDArray nextInput = Nd4j.zeros(LEARNSTRING_CHARS_LIST.size());//创建存储上次输出的数组            nextInput.putScalar(sampledCharacterIdx, 1);//把最大索引位置置为1            output = net.rnnTimeStep(nextInput);//以上次的输出为输入再运行一个时间步         }         System.out.print("\n");      }   }   private static int findIndexOfHighestValue(double[] distribution) {//这个很简单啦，找出最大数组最大值的索引      int maxValueIndex = 0;      double maxValue = 0;      for (int i = 0; i < distribution.length; i++) {         
if(distribution[i] > maxValue) {            maxValue = distribution[i];            maxValueIndex = i;         }      }      return maxValueIndex;   }}
0 0