Theano LSTM代码解析

来源：互联网发布：音悦台点歌机加歌软件编辑：程序博客网时间：2024/05/17 08:17

Theano官方LSTM代码解析

本文内容纯属个人观点，有误的地方敬请指正批评。

阅读本文需要一定的python,lstm和Theano的基础知识。

Theano是一种符号语言，它的优点在于它的自动求导机制以及GPU的透明性，缺点在于调试时极为不便。

基于Theano开发的Deep Learning代码一般分为四个部分：数据准备，模型建立，预训练和训练，测试。代码按照这个顺序进行分析。

Theano的LSTM代码见https://github.com/lisa-lab/DeepLearningTutorials

其中LSTM的代码见lstm.py和imdb.py，数据集为imdb.pkl。

代码如下：

if __name__ == '__main__':

train_lstm(max_epochs=100,test_size=500,)

/*我们从主函数下手开始分析，__main__函数调用了train_lstm()函数，LSTM的整个过程都是从这个函数开始。*/

def train_lstm(

dim_proj=128, /*word embedding的维数和隐藏层的维数，用默认值。（word embedding是一种将一个词转成一个向量的过程，这里不去深究）*/

patience=10, /*该参数用于early stop：如果连续10次验证的误差都没有降低，就提前停止训练*/

max_epochs=5000,/*最大迭代轮数（将训练集完整迭代一轮为一个epoch）*/

dispFreq=10,/*每更新10次显示一次训练进度，即打印当前的epoch、更新次数和cost*/

decay_c=0, /*参数U的正则权重，U为隐藏层h_t到输出层的参数*/

lrate=0.0001, /*sgd用的学习率*/

n_words=10000, /*词典大小，用于数据预处理部分，将词用该词在词典中的ID表示，超过10000的用1表示，仅仅用于数据，不做深究*/

optimizer=adadelta, /*优化方法，代码提供了sgd,adadelta和rmsprop三种方法，采用了adadelta*/

encoder='lstm', /*一个标识符，可以去掉，但是必须是lstm*/

saveto='lstm_model.npz', /*保存最好模型的文件，保存训练误差，验证误差和测试误差等等*/

validFreq=370,/*验证频率*/

saveFreq=1110,/*保存频率*/

maxlen=100,/*序列的最大长度，超出长度的数据被抛弃，见数据处理部分*/

batch_size=16,/*训练的batch大小*/

valid_batch_size=64,/*验证集和测试集用的batch大小*/

dataset='imdb',/*用于数据预处理的参数，全局变量datasets的key'imdb'的value为两个处理数据的函数*/

noise_std=0.,/*该参数在后面的代码中没有再出现过，作用不明，可以忽略*/

use_dropout=True,/*控制dropout，不用dropout的话运行速度较快，但是效果不好，dropout不太好解释，以一定的概率随机丢弃模型中的一些节点，这样可以综合多种模型的结果，进行投票。需要自行补充deeplearning的知识*/

reload_model=None,/*加载模型参数的文件，用于已训练好的模型，或保存的中间结果*/

test_size=-1,/*测试集大小，如果为正，就只用这么多测试样本*/

):

model_options = locals().copy()

print("model options", model_options)

/*首先将当前函数局部作用域中的参数copy到字典model_options中，后面的很多函数就以model_options作为参数进行参数传递。*/

load_data, prepare_data = get_dataset(dataset)# dataset=‘imdb’

print('Loading data')

train, valid, test = load_data(n_words=n_words,valid_portion=0.05,

maxlen=maxlen)

/*获取处理数据的函数，定义在全局作用域的get_dataset函数中

调用以下代码：*/

datasets = {'imdb':(imdb.load_data, imdb.prepare_data)}

def get_dataset(name):

return datasets[name][0], datasets[name][1]

/*返回了两个函数：load_data,prepare_data这两个函数定义在imdb.py中*/

/*数据已经事先存在了imdb.pkl中，这里用pickle方法load进来，第一项为训练数据，第二项为测试数据；

load_data函数将数据集读入，舍弃长度超过maxlen的数据，并按照参数valid_portion的比例将一部分训练集划为验证集。

而第二个函数prepare_data负责数据的转换，在训练和测试的时候先将训练数据和测试数据的横轴和纵轴调换，并使数据维度保持一致，后面详细讲*/

if test_size > 0:

# The testset is sorted by size, but we want to keep random

# sizeexample. So we must select a randomselection of the

# examples.

idx =numpy.arange(len(test[0]))

numpy.random.shuffle(idx)

idx =idx[:test_size]

test =([test[0][n] for n in idx], [test[1][n] for n in idx])

/*如果我们设置了test_size的大小，这个步骤就是从测试集中随机找test_size个作为测试数据，如果没有设置test_size,会用所有的测试集数据做测试。原来的测试数据是根据长度排列的（imdb数据自身特点），这里做了一次打散*/

ydim = numpy.max(train[1]) + 1

model_options['ydim'] = ydim

/*ydim为标签y的维数，因为是从0开始的，所以后面+1，并将它加入模型参数中*/

print('Building model')

# This create the initial parameters as numpy ndarrays.

# Dict name (string) -> numpy ndarray

params = init_params(model_options)

/*模型建立阶段，首先初始化各种参数，调用了全局作用域的函数init_params()*/

def init_params(options):

"""

Global (not LSTM) parameter. For the embedingand the classifier.

"""

/*根据源代码的注释，embedding、classifier和lstm层的参数不是一起初始化的，这里先初始化embedding和classifier的参数*/

params = OrderedDict()

/*将所有的参数放在一个名为params的OrderedDict中*/

# embedding

randn =numpy.random.rand(options['n_words'],

options['dim_proj'])

params['Wemb'] = (0.01 * randn).astype(config.floatX)

/* 随机生成embedding矩阵，这里为10000 * 128维的，因为词典大小是10000，也就是说，词的ID范围是0-9999，我们将每个词转换成一个128维的向量，所以这里生成了一个10000*128的矩阵，每个词转换成它的ID的那一行的128维向量。比如“我”这个词的ID是5，那么“我”这个词就用params['Wemb']矩阵的第5行表示，第5行就是一个128维的向量，这里用随机生成的矩阵表示，作为示例。（这是下边用到的，这里先给出解释）*/

params =get_layer(options['encoder'])[0](options,

params,

prefix=options['encoder'])

/*这里需要调用：

def get_layer(name):

fns = layers[name]

return fns

layers = {'lstm':(param_init_lstm, lstm_layer)}

我们还记得options['encoder']只能为lstm，这里返回了layers['lstm']的第一项param_init_lstm函数：

def param_init_lstm(options, params, prefix='lstm'):

"""

Init the LSTM parameter:

:see: init_params

"""

W =numpy.concatenate([ortho_weight(options['dim_proj']),

ortho_weight(options['dim_proj']),

ortho_weight(options['dim_proj']),

ortho_weight(options['dim_proj'])], axis=1)

params[_p(prefix, 'W')] = W

U =numpy.concatenate([ortho_weight(options['dim_proj']),

ortho_weight(options['dim_proj']),

ortho_weight(options['dim_proj']),

ortho_weight(options['dim_proj'])], axis=1)

params[_p(prefix,'U')] = U

b = numpy.zeros((4 * options['dim_proj'],))

params[_p(prefix,'b')] = b.astype(config.floatX)

return params

这里初始化了LSTM的参数，_p()这个函数是连接变量名的，

ortho_weight()函数用来生成正交的矩阵，先生成n*n的矩阵，做svd分解，这里不再细述。将4个矩阵列连接起来是为了方便运算，在计算门的时候，一步到位。*/

# classifier

params['U'] = 0.01 *numpy.random.randn(options['dim_proj'],

options['ydim']).astype(config.floatX)

params['b'] =numpy.zeros((options['ydim'],)).astype(config.floatX)

/*初始化softmax分类器的参数 */

return params

if reload_model:

load_params('lstm_model.npz', params)

# This create Theano Shared Variable from theparameters.

# Dict name (string) -> Theano Tensor SharedVariable

# params and tparams have different copy of theweights.

tparams = init_tparams(params)

/*如果是reload，将从保存的文件中加载参数,这里未使用，有需求时可以改动*/

/*init_tparams()将上一步初始化的参数转为Theano.shared类型。*/

(use_noise, x, mask,

y, f_pred_prob,f_pred, cost) = build_model(tparams, model_options)

/* 建立模型，代码：*/

def build_model(tparams, options):

trng= RandomStreams(SEED)

#随机数生成器

#Used for dropout.

use_noise= theano.shared(numpy_floatX(0.))

#是否用dropout

x= tensor.matrix('x', dtype='int64')

mask= tensor.matrix('mask', dtype=config.floatX)

y= tensor.vector('y', dtype='int64')

#为x,mask,y生成占位符号。

n_timesteps= x.shape[0]#x的行代表steps（经过变换）

n_samples= x.shape[1] #x的列代表不同的样本

emb= tparams['Wemb'][x.flatten()].reshape([n_timesteps,

n_samples, options['dim_proj']])

#将词用向量表示，如前所述

proj= get_layer(options['encoder'])[1](tparams, emb, options,

prefix=options['encoder'],

mask=mask)

/*隐藏层的计算，实现代码如下（黑体）：

def lstm_layer(tparams,state_below, options, prefix='lstm', mask=None):

nsteps= state_below.shape[0]

//state_below是输入x和w，b计算后的输入节点。同上，第一维代表step

if state_below.ndim == 3:

n_samples = state_below.shape[1]

else:

n_samples = 1

//如果输入三维的x，那么样本数就是第二维的长度，否则就是只有一个样本

assert mask is not None

// mask非None

def _slice(_x, n, dim):

if _x.ndim == 3:

return _x[:, :, n * dim:(n + 1) * dim]

return _x[:, n * dim:(n + 1) * dim]

// 切片，计算的时候是几个门一起计算，切片将各个门的值分开

def _step(m_, x_, h_, c_):

preact= tensor.dot(h_, tparams[_p(prefix, 'U')])

preact+= x_

i= tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))

f= tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))

o= tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))

c= tensor.tanh(_slice(preact, 3, options['dim_proj']))

c= f * c_ + i * c

c= m_[:, None] * c + (1. - m_)[:, None] * c_

h = o * tensor.tanh(c)

h= m_[:, None] * h + (1. - m_)[:, None] * h_

return h, c

/* 隐藏层计算，i：输入门；f：忘记门；o：输出门；c：cell。

h代表隐藏层的输出，h_和c_分别代表上一个时刻的隐藏层输出和cell，m_就是mask，它记录了变换后的x的非零值的位置。下文详细介绍

根据LSTM隐藏层的计算公式，state_below就是input_node。

c = m_[:, None] * c + (1. -m_)[:, None] * c_

h = m_[:, None] * h + (1. -m_)[:, None] * h_

这两句的意思是如果某个样本到一定的状态后没有值了，也就是mask矩阵对应位置的值为0了，那么它的c和h就保持上一个时刻的不变。结合prepare_data函数理解。*/

state_below = (tensor.dot(state_below,tparams[_p(prefix, 'W')]) +

tparams[_p(prefix, 'b')])

//这里相当于计算input_node

dim_proj = options['dim_proj']

rval, updates = theano.scan(_step,

sequences=[mask, state_below],

outputs_info=[tensor.alloc(numpy_floatX(0.),

n_samples,

dim_proj),

tensor.alloc(numpy_floatX(0.),

n_samples,

dim_proj)],

name=_p(prefix,'_layers'),

n_steps=nsteps)

/*scan函数，进行迭代的函数是_step，它的输入是m_, x_, h_, c_，迭代的是sequences中的mask，state_below,每次拿出他们的一行，作为输入的m_和x_，h_和c_的初始值设置为0（outputs_info设置初始值），每次计算，_step返回的h和c分别赋给下次迭代的h_和c_，迭代次数为nsteps，这样就实现了隐藏层节点的传递，最后函数返回h和c给rval。*/

return rval[0]

//整个函数最后返回rval[0]，也就是每个时刻的h（按时间步排列的全部隐藏层输出），而不只是最后一个时刻的h

上述即为隐藏层的计算过程，继续分析之前的代码*/

if options['encoder'] == 'lstm':

proj= (proj * mask[:, :, None]).sum(axis=0)

proj= proj / mask.sum(axis=0)[:, None]

/*计算样本的每个时刻的h的均值，对列求和，列就是样本的step，如果这个状态有值，那么相应的mask值为1，否则就是0，然后除以mask每列的和，也就是样本一共的step个数，求出平均值*/

if options['use_dropout']:

proj= dropout_layer(proj, use_noise, trng)

//如果dropout，就调用dropout_layer()随机丢弃一些隐藏层节点

pred= tensor.nnet.softmax(tensor.dot(proj, tparams['U'])

+ tparams['b'])

//预测就是隐藏层h的均值输入到softmax函数得到的

f_pred_prob= theano.function([x, mask], pred, name='f_pred_prob')

f_pred = theano.function([x, mask],

pred.argmax(axis=1),name='f_pred')

// 将预测输出编译成x和mask的函数

off = 1e-8

if pred.dtype == 'float16':

off= 1e-6

cost= -tensor.log(pred[tensor.arange(n_samples), y] + off).mean()

// 损失函数

return use_noise, x, mask, y, f_pred_prob, f_pred, cost

//模型建立完成，返回，回到train_lstm()函数

if decay_c > 0.:

decay_c = theano.shared(numpy_floatX(decay_c),name='decay_c')

weight_decay = 0.

weight_decay += (tparams['U'] ** 2).sum()

weight_decay *= decay_c

cost += weight_decay

/* 如果加入正则，损失函数加上参数U的L2正则项。*/

f_cost = theano.function([x, mask, y], cost,name='f_cost')

/*编译损失函数*/

grads = tensor.grad(cost, wrt=list(tparams.values()))

f_grad = theano.function([x, mask, y], grads,name='f_grad')

// 求导，并编译求导函数

lr = tensor.scalar(name='lr')

f_grad_shared, f_update = optimizer(lr, tparams, grads,

x, mask, y,cost)

//优化

print('Optimization')

kf_valid = get_minibatches_idx(len(valid[0]),valid_batch_size)

kf_test = get_minibatches_idx(len(test[0]),valid_batch_size)

/* 将验证集和测试集分成batches：get_minibatches_idx返回batch的ID和对应的样本序号，代码省略*/

print("%d train examples" % len(train[0]))

print("%d valid examples" % len(valid[0]))

print("%d test examples" % len(test[0]))

history_errs = []

best_p = None

bad_count = 0

//记录误差、最好的结果，和bad_count计数

if validFreq == -1:

validFreq = len(train[0]) // batch_size

if saveFreq == -1:

saveFreq = len(train[0]) // batch_size

/*如果未设置验证频率和保存频率，那么就设置为一个epoch，len（train[0]）/batch_size就是一个epoch*/

uidx = 0 #the number of update done 记录更新的次数

estop = False # early stop

start_time = time.time()

try:

for eidx in range(max_epochs):

n_samples = 0

# Get new shuffled index for thetraining set.

kf = get_minibatches_idx(len(train[0]),batch_size, shuffle=True)

//得到训练数据的mini_batchs

for _, train_index in kf:

uidx += 1 #更新次数+1

use_noise.set_value(1.) #设置drop_out

# Select the random examplesfor this minibatch

y = [train[1][t] for t in train_index]

x = [train[0][t]for t in train_index]

# Get the data in numpy.ndarrayformat

# This swap the axis!

# Return something of shape(minibatch maxlen, n samples)

x, mask, y = prepare_data(x, y)

/*这里需要注意prepare_data()函数：

def prepare_data(seqs, labels, maxlen=None):

"""Create the matrices from thedatasets.

This pad each sequence to the same length: the lengthof the

longuest sequence or maxlen.

if maxlen is set, we will cut all sequence to thismaximum

length.

This swap the axis!

"""

# x: a listof sentences

lengths = [len(s) for s in seqs]

/*如果设置了maxlen那么将扔掉所有step超过maxlen的数据*/

if maxlen is not None:

new_seqs = []

new_labels= []

new_lengths = []

for l, s, y in zip(lengths, seqs,labels):

if l < maxlen:

new_seqs.append(s)

new_labels.append(y)

new_lengths.append(l)

/*得到经过maxlen过滤的数据集*/

lengths = new_lengths

labels = new_labels

seqs = new_seqs

/*lengths 代表过滤后的数据每一行的长度，也就是每一个样本的steps，seqs保存数据矩阵。Labels保存标签*/

if len(lengths) < 1:

return None, None, None

n_samples = len(seqs)

maxlen = numpy.max(lengths)

/*原始的x（也就是seqs），每一行代表一个样本，同一行不同的列代表他们的先后次序，矩阵seqs的行数len(seqs)代表样本个数，maxlen为step最长的样本的step的长度*/

x = numpy.zeros((maxlen, n_samples)).astype('int64')

x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)

/*将x和x_mask定义为maxlen * n_samples的矩阵，这里行和列长度调换了*/

for idx, s in enumerate(seqs):

x[:lengths[idx], idx] = s

x_mask[:lengths[idx], idx] = 1.

/*这里，将原来的seqs中的每一行加入到新的x的相应的列中，如果原来的这行的长度（也就是step）小于maxlen（新矩阵x的行数，也是原来step最长的样本的step），那么seqs的那行转换成的x的列不足的部分就是0，相当于seqs中的行转置了一下，变成新的x的一列，长度不够后边就补0，而mask就记录了新的x矩阵每个位置是否有值，有值的话mask相应的位置就是1，没有的话就是0*/

return x, x_mask, labels

n_samples += x.shape[1]

cost = f_grad_shared(x, mask, y)

f_update(lrate)

//更新，在后边专门讲

if numpy.isnan(cost) or numpy.isinf(cost):

print('bad cost detected: ', cost)

return 1., 1., 1.

if numpy.mod(uidx, dispFreq) == 0:

print('Epoch ', eidx, 'Update ', uidx,'Cost ', cost)

//判断是否到了显示频率

if saveto and numpy.mod(uidx, saveFreq)== 0:

print('Saving...')

//判断是否到了保存频率，并保存params

if best_p is not None:

params = best_p

else:

params = unzip(tparams)

numpy.savez(saveto,history_errs=history_errs, **params)

pickle.dump(model_options, open('%s.pkl'% saveto, 'wb'), -1)

print('Done')

//判断是否到了验证频率，到了就计算各种误差，并更新best_p

if numpy.mod(uidx, validFreq) == 0:

use_noise.set_value(0.)

train_err = pred_error(f_pred,prepare_data, train, kf)

valid_err = pred_error(f_pred,prepare_data, valid,

kf_valid)

test_err = pred_error(f_pred,prepare_data, test, kf_test)

history_errs.append([valid_err, test_err])

if (best_p is None or

valid_err <=numpy.array(history_errs)[:,

0].min()):

best_p = unzip(tparams)

bad_counter = 0

print( ('Train ', train_err, 'Valid ',valid_err,

'Test ', test_err) )

if (len(history_errs) > patience and

valid_err >=numpy.array(history_errs)[:-patience,

0].min()):

bad_counter += 1

if bad_counter > patience:

print('EarlyStop!')

estop = True

break

/* 每验证一次，记录下验证误差和测试误差。如果已有的验证次数超过patience，并且当前的验证误差不小于“除最近patience次之外”的历史验证误差的最小值（也就是说误差没有降低），bad_counter += 1；如果bad_counter>patience，就early stop！*/

print('Seen %d samples' % n_samples)

if estop:

break

except KeyboardInterrupt:

print("Traininginterupted")

end_time = time.time()

if best_p is not None:

zipp(best_p, tparams)

else:

best_p = unzip(tparams)

use_noise.set_value(0.)

kf_train_sorted =get_minibatches_idx(len(train[0]), batch_size)

train_err = pred_error(f_pred,prepare_data, train, kf_train_sorted)

valid_err = pred_error(f_pred,prepare_data, valid, kf_valid)

test_err = pred_error(f_pred, prepare_data,test, kf_test)

print( 'Train ', train_err, 'Valid ',valid_err, 'Test ', test_err )

if saveto:

numpy.savez(saveto,train_err=train_err,

valid_err=valid_err,test_err=test_err,

history_errs=history_errs,**best_p)

print('The code run for %d epochs, with %fsec/epochs' % (

(eidx + 1), (end_time - start_time) /(1. * (eidx + 1))))

print( ('Training took %.1fs' %

(end_time - start_time)),file=sys.stderr)

return train_err, valid_err,test_err

//后边就是计算误差和执行时间了

至此，该LSTM的代码已经解析结束，优化方法就不详细讲解了，代码提供了3种不同的优化函数，基本思想都是通过cost的梯度和learning_rate更新参数啦，其中sgd需要设置learning_rate，另两种方法是自动选择学习率的。

1 0