
来源:互联网 发布:淘宝hd微淘 编辑:程序博客网 时间:2024/06/05 23:00

Part 2 




这次我们要建立一个字符级的语言模型来生成字符序列,类似 char-rnn(TensorFlow 中已有开源实现)。这次的任务比上次难很多。






"""Imports"""import numpy as npimport tensorflow as tfimport matplotlib.pyplot as pltimport timeimport osimport urllib.requestfrom tensorflow.models.rnn.ptb import reader

"""Load and process data, utility functions"""file_url = 'https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt'file_name = 'tinyshakespeare.txt'if not os.path.exists(file_name):    urllib.request.urlretrieve(file_url, file_name)with open(file_name,'r') as f:    raw_data = f.read()    print("Data length:", len(raw_data))vocab = set(raw_data)vocab_size = len(vocab)idx_to_vocab = dict(enumerate(vocab))vocab_to_idx = dict(zip(idx_to_vocab.values(), idx_to_vocab.keys()))data = [vocab_to_idx[c] for c in raw_data]del raw_datadef gen_epochs(n, num_steps, batch_size):    for i in range(n):        yield reader.ptb_iterator(data, batch_size, num_steps)def reset_graph():    if 'sess' in globals() and sess:        sess.close()    tf.reset_default_graph()def train_network(g, num_epochs, num_steps = 200, batch_size = 32, verbose = True, save=False):    tf.set_random_seed(2345)    with tf.Session() as sess:        sess.run(tf.initialize_all_variables())        training_losses = []        for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps, batch_size)):            training_loss = 0            steps = 0            training_state = None            for X, Y in epoch:                steps += 1                feed_dict={g['x']: X, g['y']: Y}                if training_state is not None:                    feed_dict[g['init_state']] = training_state                training_loss_, training_state, _ = sess.run([g['total_loss'],                                                      g['final_state'],                                                      g['train_step']],                                                             feed_dict)                training_loss += training_loss_            if verbose:                print("Average training loss for Epoch", idx, ":", training_loss/steps)            training_losses.append(training_loss/steps)        if isinstance(save, str):            g['saver'].save(sess, save)    return training_losses

Data length: 1115394



def build_basic_rnn_graph_with_list(
        state_size=100,
        num_classes=vocab_size,
        batch_size=32,
        num_steps=200,
        learning_rate=1e-4):
    """Build a single-layer basic RNN whose time steps are a Python list of tensors.

    The default graph is reset first. Inputs are one-hot encoded and split
    along the time axis, so the graph contains one set of ops per time step.

    Returns:
        dict with the placeholders ('x', 'y'), the recurrent state tensors
        ('init_state', 'final_state'), the scalar 'total_loss' and the Adam
        'train_step' op.
    """
    reset_graph()

    placeholder_shape = [batch_size, num_steps]
    x = tf.placeholder(tf.int32, placeholder_shape, name='input_placeholder')
    y = tf.placeholder(tf.int32, placeholder_shape, name='labels_placeholder')

    # [batch_size, num_steps, num_classes] -> num_steps tensors of
    # [batch_size, num_classes]
    x_one_hot = tf.one_hot(x, num_classes)
    input_slices = tf.split(1, num_steps, x_one_hot)
    rnn_inputs = [tf.squeeze(step, squeeze_dims=[1]) for step in input_slices]

    cell = tf.nn.rnn_cell.BasicRNNCell(state_size)
    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.rnn(cell, rnn_inputs, initial_state=init_state)

    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes],
                            initializer=tf.constant_initializer(0.0))

    # One logits tensor per time step.
    logits = []
    for step_output in rnn_outputs:
        logits.append(tf.matmul(step_output, W) + b)

    label_slices = tf.split(1, num_steps, y)
    y_as_list = [tf.squeeze(step, squeeze_dims=[1]) for step in label_slices]

    # Every time step contributes with weight 1.
    loss_weights = [tf.ones([batch_size]) for _ in range(num_steps)]
    losses = tf.nn.seq2seq.sequence_loss_by_example(logits, y_as_list, loss_weights)
    total_loss = tf.reduce_mean(losses)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

    return {
        'x': x,
        'y': y,
        'init_state': init_state,
        'final_state': final_state,
        'total_loss': total_loss,
        'train_step': train_step,
    }

# Measure how long it takes just to construct the list-based basic RNN graph.
start = time.time()
build_basic_rnn_graph_with_list()
elapsed = time.time() - start
print("It took", elapsed, "seconds to build the graph.")
It took 5.626644849777222 seconds to build the graph.
def build_multilayer_lstm_graph_with_list(
    state_size = 100,
    num_classes = vocab_size,
    batch_size = 32,
    num_steps = 200,
    num_layers = 3,
    learning_rate = 1e-4):
    """Build a stacked LSTM whose time steps are a Python list of tensors.

    The default graph is reset first. Characters are mapped through a learned
    embedding matrix and the embedded batch is split along the time axis, so
    the graph contains one set of ops per time step.

    Returns:
        dict with the placeholders ('x', 'y'), the recurrent state
        ('init_state', 'final_state'), the scalar 'total_loss' and the Adam
        'train_step' op.
    """
    reset_graph()

    x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
    y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')

    embeddings = tf.get_variable('embedding_matrix', [num_classes, state_size])

    # Squeeze only the time axis (dim 1). A bare tf.squeeze() removes every
    # size-1 dimension, which would also drop the batch or state axis when
    # batch_size == 1 or state_size == 1.
    rnn_inputs = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(1,
                                num_steps, tf.nn.embedding_lookup(embeddings, x))]

    cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.rnn(cell, rnn_inputs, initial_state=init_state)

    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))

    # One logits tensor per time step.
    logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]

    y_as_list = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(1, num_steps, y)]

    # Every time step contributes with weight 1.
    loss_weights = [tf.ones([batch_size]) for i in range(num_steps)]
    losses = tf.nn.seq2seq.sequence_loss_by_example(logits, y_as_list, loss_weights)
    total_loss = tf.reduce_mean(losses)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

    return dict(
        x = x,
        y = y,
        init_state = init_state,
        final_state = final_state,
        total_loss = total_loss,
        train_step = train_step
    )
# Measure how long it takes just to construct the list-based multilayer LSTM graph.
start = time.time()
build_multilayer_lstm_graph_with_list()
elapsed = time.time() - start
print("It took", elapsed, "seconds to build the graph.")
It took 25.640846967697144 seconds to build the graph.

def build_multilayer_lstm_graph_with_dynamic_rnn(
        state_size=100,
        num_classes=vocab_size,
        batch_size=32,
        num_steps=200,
        num_layers=3,
        learning_rate=1e-4):
    """Build a stacked LSTM driven by `tf.nn.dynamic_rnn` on a single 3-D input tensor.

    The default graph is reset first. The embedded inputs are passed to
    `tf.nn.dynamic_rnn` as one tensor instead of a per-step list, and all
    logits are computed with one matmul.

    Returns:
        dict with the placeholders ('x', 'y'), the recurrent state
        ('init_state', 'final_state'), the scalar 'total_loss' and the Adam
        'train_step' op.
    """
    reset_graph()

    placeholder_shape = [batch_size, num_steps]
    x = tf.placeholder(tf.int32, placeholder_shape, name='input_placeholder')
    y = tf.placeholder(tf.int32, placeholder_shape, name='labels_placeholder')

    embeddings = tf.get_variable('embedding_matrix', [num_classes, state_size])

    # The inputs are no longer a list but one tensor of dims
    # batch_size x num_steps x state_size.
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    lstm_cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
    cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers, state_is_tuple=True)
    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(
        cell, rnn_inputs, initial_state=init_state)

    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes],
                            initializer=tf.constant_initializer(0.0))

    # Flatten outputs and labels so the logits need only a single matmul.
    rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
    y_reshaped = tf.reshape(y, [-1])

    logits = tf.matmul(rnn_outputs, W) + b

    per_position_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_reshaped)
    total_loss = tf.reduce_mean(per_position_loss)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

    return {
        'x': x,
        'y': y,
        'init_state': init_state,
        'final_state': final_state,
        'total_loss': total_loss,
        'train_step': train_step,
    }

# Measure how long it takes just to construct the dynamic_rnn-based LSTM graph.
start = time.time()
build_multilayer_lstm_graph_with_dynamic_rnn()
elapsed = time.time() - start
print("It took", elapsed, "seconds to build the graph.")

It took 0.5314393043518066 seconds to build the graph.

# Train the list-based multilayer LSTM for 3 epochs and time the training only
# (graph construction happens before the clock starts).
g = build_multilayer_lstm_graph_with_list()
start = time.time()
train_network(g, 3)
elapsed = time.time() - start
print("It took", elapsed, "seconds to train for 3 epochs.")

Average training loss for Epoch 0 : 3.53323210245
Average training loss for Epoch 1 : 3.31435756163
Average training loss for Epoch 2 : 3.21755325109
It took 117.78161263465881 seconds to train for 3 epochs.

# Train the dynamic_rnn-based multilayer LSTM for 3 epochs and time the
# training only (graph construction happens before the clock starts).
g = build_multilayer_lstm_graph_with_dynamic_rnn()
start = time.time()
train_network(g, 3)
elapsed = time.time() - start
print("It took", elapsed, "seconds to train for 3 epochs.")

Average training loss for Epoch 0 : 3.55792756053
Average training loss for Epoch 1 : 3.3225021006
Average training loss for Epoch 2 : 3.28286816745
It took 96.69413661956787 seconds to train for 3 epochs.






# Three alternative single-layer cells. Each line rebinds `cell`, so in a real
# graph only one of them would be kept.
cell = tf.nn.rnn_cell.BasicRNNCell(state_size)
cell = tf.nn.rnn_cell.LSTMCell(state_size)
cell = tf.nn.rnn_cell.GRUCell(state_size)

LSTM存储了两个内部状态向量 c 和 h:c 是记忆单元(即"常数误差传送带",constant error carousel),h 是隐藏状态。默认情况下,它们被连接成一个单独的向量,但是这样做会触发下面的警告,因此应当改用 state_is_tuple=True:
WARNING:tensorflow:<tensorflow.python.ops.rnn_cell.LSTMCell object at 0x7faade1708d0>: Using a concatenated state is slower and will soon be deprecated.  Use state_is_tuple=True.




# Stack `num_layers` layers: one BasicRNNCell object is repeated in a list and
# the list is wrapped in a MultiRNNCell.
cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.BasicRNNCell(state_size)] * num_layers)

写一个定制版的RNN cell

简单变换。沿用 Cho et al. (2014) 的记法:我们不再只使用单一的 $Wx$,而是对多个输入变换 $W_1x, W_2x, \dots, W_nx$ 进行加权,用 $\sum_i \lambda_i W_i x$ 取代 $Wx$,其中各 $\lambda_i$ 之和为 1。
这个新的权重向量 $\lambda$ 按下式计算:$\lambda = \mathrm{softmax}(W_{avg}\,x_t + U_{avg}\,h_{t-1} + b)$。

为了写出这个定制单元的代码,我们需要扩展 tf.nn.rnn_cell.RNNCell。具体来说,需要实现三个抽象方法:state_size、output_size 和 __call__,并编写 __init__ 方法。下面先以 GRUCell 的实现为例:
class GRUCell(tf.nn.rnn_cell.RNNCell):
    """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).

    The new hidden state is also the cell output, so `state_size` and
    `output_size` are both `num_units`.
    """

    def __init__(self, num_units):
        self._num_units = num_units

    @property
    def state_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        """Run one GRU step; returns `(new_h, new_h)` as (output, new state)."""
        units = self._num_units
        with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
            with tf.variable_scope("Gates"):
                # A single linear map produces the reset and update gates.
                # The bias starts at 1.0 so that initially the cell neither
                # resets nor updates.
                gates = tf.nn.sigmoid(
                    tf.nn.rnn_cell._linear([inputs, state], 2 * units, True, 1.0))
                reset, update = tf.split(1, 2, gates)
            with tf.variable_scope("Candidate"):
                candidate = tf.nn.tanh(
                    tf.nn.rnn_cell._linear([inputs, reset * state], units, True))
            # Interpolate between the old state and the candidate.
            new_h = update * state + (1 - update) * candidate
        return new_h, new_h
def __init__(self, num_units, num_weights):
    """Store the number of hidden units and the number of weight matrices to mix."""
    self._num_units, self._num_weights = num_units, num_weights

class CustomCell(tf.nn.rnn_cell.RNNCell):
    """GRU-style cell whose candidate input transform is a softmax-weighted mix of matrices.

    The reset/update gates are computed as in a GRU. For the candidate, the
    single input projection is replaced by `num_weights` projections
    `matmul(inputs, W_i)` that are summed with per-example weights
    `lambda = softmax(linear([inputs, state]))`.

    Args:
        num_units: size of the hidden state (and of the output).
        num_weights: number of input weight matrices to mix.
    """

    def __init__(self, num_units, num_weights):
        self._num_units = num_units
        self._num_weights = num_weights

    @property
    def state_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        """Run one step; returns `(new_h, new_h)` as (output, new state)."""
        with tf.variable_scope(scope or type(self).__name__):  # "CustomCell"
            with tf.variable_scope("Gates"):  # Reset gate and update gate.
                # We start with bias of 1.0 to not reset and not update.
                ru = tf.nn.rnn_cell._linear([inputs, state],
                                        2 * self._num_units, True, 1.0)
                ru = tf.nn.sigmoid(ru)
                r, u = tf.split(1, 2, ru)
            with tf.variable_scope("Candidate"):
                # Mixing weights: one softmax over num_weights per row, split
                # into num_weights column tensors.
                lambdas = tf.nn.rnn_cell._linear([inputs, state], self._num_weights, True)
                lambdas = tf.split(1, self._num_weights, tf.nn.softmax(lambdas))

                Ws = tf.get_variable("Ws",
                        shape = [self._num_weights, inputs.get_shape()[1], self._num_units])
                # Drop only the leading split axis. A bare tf.squeeze() would
                # also remove the input or unit axis whenever it has size 1,
                # leaving a tensor that matmul cannot use.
                Ws = [tf.squeeze(i, squeeze_dims=[0])
                      for i in tf.split(0, self._num_weights, Ws)]

                candidate_inputs = [tf.matmul(inputs, W) * lambdas[idx]
                                    for idx, W in enumerate(Ws)]
                Wx = tf.add_n(candidate_inputs)

                # The recurrent part of the candidate gets its own linear map
                # under the "second" scope.
                c = tf.nn.tanh(Wx + tf.nn.rnn_cell._linear([r * state],
                                            self._num_units, True, scope="second"))
            new_h = u * state + (1 - u) * c
        return new_h, new_h
def build_multilayer_graph_with_custom_cell(
        cell_type=None,
        num_weights_for_custom_cell=5,
        state_size=100,
        num_classes=vocab_size,
        batch_size=32,
        num_steps=200,
        num_layers=3,
        learning_rate=1e-4):
    """Build a stacked dynamic_rnn graph with a selectable cell type.

    `cell_type` may be 'Custom', 'GRU' or 'LSTM'; any other value (including
    the default None) selects a BasicRNNCell. The default graph is reset first.

    Returns:
        dict with the placeholders ('x', 'y'), the recurrent state
        ('init_state', 'final_state'), the scalar 'total_loss' and the Adam
        'train_step' op.
    """
    reset_graph()

    placeholder_shape = [batch_size, num_steps]
    x = tf.placeholder(tf.int32, placeholder_shape, name='input_placeholder')
    y = tf.placeholder(tf.int32, placeholder_shape, name='labels_placeholder')

    embeddings = tf.get_variable('embedding_matrix', [num_classes, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    is_lstm = cell_type == 'LSTM'
    if cell_type == 'Custom':
        layer_cell = CustomCell(state_size, num_weights_for_custom_cell)
    elif cell_type == 'GRU':
        layer_cell = tf.nn.rnn_cell.GRUCell(state_size)
    elif is_lstm:
        layer_cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
    else:
        layer_cell = tf.nn.rnn_cell.BasicRNNCell(state_size)

    # Only the LSTM stack is created with tuple states.
    stack_kwargs = {'state_is_tuple': True} if is_lstm else {}
    cell = tf.nn.rnn_cell.MultiRNNCell([layer_cell] * num_layers, **stack_kwargs)

    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(
        cell, rnn_inputs, initial_state=init_state)

    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes],
                            initializer=tf.constant_initializer(0.0))

    # Flatten outputs and labels so the logits need only a single matmul.
    rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
    y_reshaped = tf.reshape(y, [-1])

    logits = tf.matmul(rnn_outputs, W) + b

    per_position_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_reshaped)
    total_loss = tf.reduce_mean(per_position_loss)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

    return {
        'x': x,
        'y': y,
        'init_state': init_state,
        'final_state': final_state,
        'total_loss': total_loss,
        'train_step': train_step,
    }
# Train the stacked GRU on 30-step sequences for 5 epochs and time the
# training only (graph construction happens before the clock starts).
g = build_multilayer_graph_with_custom_cell(cell_type='GRU', num_steps=30)
start = time.time()
train_network(g, 5, num_steps=30)
elapsed = time.time() - start
print("It took", elapsed, "seconds to train for 5 epochs.")
Average training loss for Epoch 0 : 2.92919953048
Average training loss for Epoch 1 : 2.35888109404
Average training loss for Epoch 2 : 2.21945820894
Average training loss for Epoch 3 : 2.12258511006
Average training loss for Epoch 4 : 2.05038544733
It took 284.6971204280853 seconds to train for 5 epochs.
# Train the stacked CustomCell on 30-step sequences for 5 epochs and time the
# training only (graph construction happens before the clock starts).
g = build_multilayer_graph_with_custom_cell(cell_type='Custom', num_steps=30)
start = time.time()
train_network(g, 5, num_steps=30)
elapsed = time.time() - start
print("It took", elapsed, "seconds to train for 5 epochs.")
Average training loss for Epoch 0 : 3.04418995892
Average training loss for Epoch 1 : 2.5172702761
Average training loss for Epoch 2 : 2.37068433601
Average training loss for Epoch 3 : 2.27533404217
Average training loss for Epoch 4 : 2.20167231745
It took 537.6112766265869 seconds to train for 5 epochs.



Dropout属于层间,不属于状态或者内部单元的连接部分。看一下Zaremba et al. (2015), 
Recurrent Neural Network Regularization, 想法就是只在非递归的部分增加dropout操作。
因此,我们需要包裹每个单元的输入或者输出来增加 dropout。在我们基于列表的 RNN 实现中,可以这样写:
# Apply dropout to every per-step input and output tensor of the list-based RNN.
rnn_inputs = [tf.nn.dropout(rnn_input, keep_prob) for rnn_input in rnn_inputs]
# The loop variable has to be `rnn_output`: the original iterated as
# `nn_output`, which left `rnn_output` undefined inside the comprehension.
rnn_outputs = [tf.nn.dropout(rnn_output, keep_prob) for rnn_output in rnn_outputs]
# Per-layer dropout: wrap a single LSTM cell with dropout on both its inputs
# and its outputs, then stack the wrapped cell num_layers times.
lstm_cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
dropout_cell = tf.nn.rnn_cell.DropoutWrapper(
    lstm_cell,
    input_keep_prob=input_dropout,
    output_keep_prob=output_dropout)
cell = tf.nn.rnn_cell.MultiRNNCell([dropout_cell] * num_layers, state_is_tuple=True)
# One shared keep probability: dropout on the input of every layer, plus
# dropout on the output of the whole stack.
lstm_cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
layer_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, input_keep_prob=global_dropout)
stacked_cell = tf.nn.rnn_cell.MultiRNNCell([layer_cell] * num_layers, state_is_tuple=True)
cell = tf.nn.rnn_cell.DropoutWrapper(stacked_cell, output_keep_prob=global_dropout)

Layer Normalization

层标准化(Layer Normalization)是 Lei Ba et al. (2016) 提出的一种技术,受 batch normalization 启发,可以提升 RNN 的表现。
Batch normalization 和 Layer Normalization 都可以缩短训练时间,并且获得更好的效果。

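初始的标准化变换为 $LN_{initial}: v \mapsto \dfrac{v - \mu_v}{\sqrt{\sigma_v^2 + \epsilon}}$。与 batch normalization 一样,在这个初始变换的基础上再增加缩放和位移参数,即缩放因子 $\alpha$ 和

位移因子 $\beta$,整个层标准化函数为:$LN: v \mapsto \alpha \odot \dfrac{v - \mu_v}{\sqrt{\sigma_v^2 + \epsilon}} + \beta$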

def ln(tensor, scope=None, epsilon=1e-5):
    """Layer-normalize a 2D tensor along its second axis.

    Each row is shifted and scaled by its own mean and variance, then
    multiplied by a learned 'scale' and offset by a learned 'shift'. Both
    variables live in the variable scope `<scope>layer_norm`; a non-string
    `scope` is treated as the empty prefix.
    """
    assert(len(tensor.get_shape()) == 2)

    mean, variance = tf.nn.moments(tensor, [1], keep_dims=True)

    scope_prefix = scope if isinstance(scope, str) else ''
    param_shape = [tensor.get_shape()[1]]
    with tf.variable_scope(scope_prefix + 'layer_norm'):
        scale = tf.get_variable('scale',
                                shape=param_shape,
                                initializer=tf.constant_initializer(1))
        shift = tf.get_variable('shift',
                                shape=param_shape,
                                initializer=tf.constant_initializer(0))

    # epsilon keeps the denominator away from zero for constant rows.
    normalized = (tensor - mean) / tf.sqrt(variance + epsilon)
    return normalized * scale + shift
然后按照 Lei Ba et al. (2016) 的做法把它应用到 LSTM 上:对 LSTM 单元内部每一个门(i、j、f、o)的输入,以及计算新隐藏状态时用到的单元状态,都进行层标准化。
class LayerNormalizedLSTMCell(tf.nn.rnn_cell.RNNCell):
    """
    LSTM cell with Layer Normalization, adapted from TF's BasicLSTMCell.

    The state is always an LSTMStateTuple (c, h), i.e. state_is_tuple is
    effectively always True.
    """

    def __init__(self, num_units, forget_bias=1.0, activation=tf.nn.tanh):
        self._num_units = num_units
        self._forget_bias = forget_bias
        self._activation = activation

    @property
    def state_size(self):
        return tf.nn.rnn_cell.LSTMStateTuple(self._num_units, self._num_units)

    @property
    def output_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        """Run one layer-normalized LSTM step; returns `(new_h, LSTMStateTuple(new_c, new_h))`."""
        with tf.variable_scope(scope or type(self).__name__):
            c, h = state

            # No bias in the linear map: the layer-norm shift plays that role.
            concat = tf.nn.rnn_cell._linear([inputs, h], 4 * self._num_units, False)
            gate_inputs = tf.split(1, 4, concat)

            # Layer-normalize each of the four gate pre-activations under its
            # own variable scope.
            gate_scopes = ('i/', 'j/', 'f/', 'o/')
            i, j, f, o = [ln(pre_activation, scope=gate_scope)
                          for pre_activation, gate_scope in zip(gate_inputs, gate_scopes)]

            kept_memory = c * tf.nn.sigmoid(f + self._forget_bias)
            written_memory = tf.nn.sigmoid(i) * self._activation(j)
            new_c = kept_memory + written_memory

            # The cell state is layer-normalized again before producing the
            # new hidden state.
            normalized_c = ln(new_c, scope='new_h/')
            new_h = self._activation(normalized_c) * tf.nn.sigmoid(o)

            new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)
            return new_h, new_state


def build_graph(
    cell_type = None,
    num_weights_for_custom_cell = 5,
    state_size = 100,
    num_classes = vocab_size,
    batch_size = 32,
    num_steps = 200,
    num_layers = 3,
    build_with_dropout=False,
    learning_rate = 1e-4):
    """Build the final stacked dynamic_rnn graph with a selectable cell type and optional dropout.

    `cell_type` may be 'Custom', 'GRU', 'LSTM' or 'LN_LSTM'; any other value
    (including the default None) selects a BasicRNNCell. The default graph is
    reset first.

    When `build_with_dropout` is True, dropout is applied to the input of
    every layer and to the output of the whole stack. The keep probability is
    the returned 'dropout' tensor: it evaluates to 1.0 (no dropout) unless a
    value is fed for it, so code that never feeds it behaves as before.

    Returns:
        dict with the placeholders ('x', 'y'), the recurrent state
        ('init_state', 'final_state'), 'total_loss', 'train_step', the softmax
        'preds', a 'saver' and the feedable 'dropout' keep probability.
    """
    reset_graph()

    x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
    y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')

    # A hard-coded tf.constant(1.0) made build_with_dropout a no-op that could
    # not be configured. A placeholder with the same default keeps the old
    # behaviour while letting callers feed a real keep probability.
    dropout = tf.placeholder_with_default(1.0, shape=[])

    embeddings = tf.get_variable('embedding_matrix', [num_classes, state_size])

    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    if cell_type == 'Custom':
        cell = CustomCell(state_size, num_weights_for_custom_cell)
    elif cell_type == 'GRU':
        cell = tf.nn.rnn_cell.GRUCell(state_size)
    elif cell_type == 'LSTM':
        cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
    elif cell_type == 'LN_LSTM':
        cell = LayerNormalizedLSTMCell(state_size)
    else:
        cell = tf.nn.rnn_cell.BasicRNNCell(state_size)

    # Dropout on the input of each layer ...
    if build_with_dropout:
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=dropout)

    # Both LSTM variants carry tuple states.
    if cell_type == 'LSTM' or cell_type == 'LN_LSTM':
        cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
    else:
        cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers)

    # ... and on the output of the whole stack.
    if build_with_dropout:
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=dropout)

    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)

    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))

    # Flatten outputs and labels so the logits need only a single matmul.
    rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
    y_reshaped = tf.reshape(y, [-1])

    logits = tf.matmul(rnn_outputs, W) + b

    predictions = tf.nn.softmax(logits)

    total_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_reshaped))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

    return dict(
        x = x,
        y = y,
        init_state = init_state,
        final_state = final_state,
        total_loss = total_loss,
        train_step = train_step,
        preds = predictions,
        saver = tf.train.Saver(),
        dropout = dropout
    )

# Train the stacked GRU on 80-step sequences for 20 epochs, save a checkpoint,
# and report the training time together with the last epoch's average loss.
g = build_graph(cell_type='GRU', num_steps=80)
start = time.time()
losses = train_network(g, 20, num_steps=80, save="saves/GRU_20_epochs")
elapsed = time.time() - start
print("It took", elapsed, "seconds to train for 20 epochs.")
final_loss = losses[-1]
print("The average loss on the final epoch was:", final_loss)
It took 1051.6652357578278 seconds to train for 20 epochs.
The average loss on the final epoch was: 1.75318197903
# Train the stacked LSTM on 80-step sequences for 20 epochs, save a checkpoint,
# and report the training time together with the last epoch's average loss.
g = build_graph(cell_type='LSTM', num_steps=80)
start = time.time()
losses = train_network(g, 20, num_steps=80, save="saves/LSTM_20_epochs")
elapsed = time.time() - start
print("It took", elapsed, "seconds to train for 20 epochs.")
final_loss = losses[-1]
print("The average loss on the final epoch was:", final_loss)
It took 614.4890048503876 seconds to train for 20 epochs.
The average loss on the final epoch was: 2.02813237837
# Train the stacked layer-normalized LSTM on 80-step sequences for 20 epochs,
# save a checkpoint, and report the training time together with the last
# epoch's average loss.
g = build_graph(cell_type='LN_LSTM', num_steps=80)
start = time.time()
losses = train_network(g, 20, num_steps=80, save="saves/LN_LSTM_20_epochs")
elapsed = time.time() - start
print("It took", elapsed, "seconds to train for 20 epochs.")
final_loss = losses[-1]
print("The average loss on the final epoch was:", final_loss)
It took 3867.550405740738 seconds to train for 20 epochs.
The average loss on the final epoch was: 1.71850851623
