TensorFlow 中文语音识别
来源:互联网 发布:网络编程聊天室 编辑:程序博客网 时间:2024/05/16 19:05
本文转载自
http://blog.topspeedsnail.com/archives/10696
数据集下载参见该文。
其中下面的代码进行了一些小小的调整。
其中包含缩进、版本方面(作者是python 3.5)、求wav_max_len方面(并行)、tf.nn.ctc_loss的参数顺序。
(具体改变可与原文链接中的代码进行对比)
并在程序中给出了一些小注解。
感谢作者将这么好的资料开源,吾辈只能望洋兴叹。
#coding: utf-8import tensorflow as tfimport numpy as npimport osfrom collections import Counterimport librosafrom joblib import Parallel, delayedwav_path = 'train'label_file = "train.word.txt"def get_wav_files(wav_path = wav_path): wav_files = [] for (dirpath, dirnames, filenames) in os.walk(wav_path): for filename in filenames: if filename.endswith(".wav") or filename.endswith(".WAV"): filename_path = os.sep.join([dirpath, filename]) if os.stat(filename_path).st_size < 240000: continue wav_files.append(filename_path) return wav_fileswav_files = get_wav_files()def get_wav_label(wav_files = wav_files, label_file = label_file): labels_dict = {} with open(label_file, "r", encoding='utf-8') as f: for label in f: label = label.strip("\n") label_id, label_text = label.split(' ', 1) labels_dict[label_id] = label_text labels = [] new_wav_files = [] for wav_file in wav_files: wav_id = os.path.basename(wav_file).split(".")[0] if wav_id in labels_dict: labels.append(labels_dict[wav_id]) new_wav_files.append(wav_file) return new_wav_files, labelsdef get_wav_length(wav): import numpy as np import librosa print(wav) wav, sr = librosa.load(wav) mfcc = np.transpose(librosa.feature.mfcc(wav, sr), [1, 0]) return len(mfcc)pointer = 0def get_next_batches(batch_size, wav_max_len): global pointer batches_wavs = [] batches_labels = [] for i in range(batch_size): wav, sr = librosa.load(wav_files[pointer]) mfcc = np.transpose(librosa.feature.mfcc(wav, sr), [1,0]) batches_wavs.append(mfcc.tolist()) batches_labels.append(labels_vector[pointer]) pointer += 1 # 取零补齐 # label append 0 , 0 对应的字符 # mfcc 默认的计算长度为20(n_mfcc of mfcc) 作为channel length for mfcc in batches_wavs: while len(mfcc) < wav_max_len: mfcc.append([0]*20) for label in batches_labels: while len(label) < label_max_len: label.append(0) return batches_wavs, batches_labelsconv1d_index = 0def conv1d_layer(input_tensor, size, dim, activation, scale, bias): global conv1d_index with tf.variable_scope("conv1d_" + str(conv1d_index)): W = tf.get_variable('W', (size, input_tensor.get_shape().as_list()[-1], dim), dtype=tf.float32, initializer=tf.random_uniform_initializer(minval=-scale, maxval=scale)) if bias: b = tf.get_variable('b', [dim], dtype = tf.float32, initializer=tf.constant_initializer(0)) out = tf.nn.conv1d(input_tensor, W, stride=1, padding='SAME') + (b if bias else 0) if not bias: beta = tf.get_variable('beta', dim, dtype=tf.float32, initializer=tf.constant_initializer(0)) gamma = tf.get_variable('gamma', dim, dtype=tf.float32, initializer=tf.constant_initializer(1)) mean_running = tf.get_variable('mean', dim, dtype=tf.float32, initializer=tf.constant_initializer(0)) variance_running = tf.get_variable('variance', dim, dtype=tf.float32, initializer=tf.constant_initializer(1)) mean, variance = tf.nn.moments(out, axes=list(range(len(out.get_shape()) - 1))) def update_running_stat(): decay = 0.99 # 定义了均值方差指数衰减 见 http://blog.csdn.net/liyuan123zhouhui/article/details/70698264 update_op = [mean_running.assign(mean_running * decay + mean * (1 - decay)), variance_running.assign(variance_running * decay + variance * (1 - decay))] # 指定先执行均值方差的更新运算 见 http://blog.csdn.net/u012436149/article/details/72084744 with tf.control_dependencies(update_op): return tf.identity(mean), tf.identity(variance) # 条件运算(https://applenob.github.io/tf_9.html) 按照作者这里的指定 是不进行指数衰减的 m, v = tf.cond(tf.Variable(False, trainable=False), update_running_stat,lambda: (mean_running, variance_running)) out = tf.nn.batch_normalization(out, m, v, beta, gamma, 1e-8) if activation == 'tanh': out = tf.nn.tanh(out) elif activation == 'sigmoid': out = tf.nn.sigmoid(out) conv1d_index += 1 return out# 极黑卷积层 https://www.zhihu.com/question/57414498# 其输入参数中要包含一个大于 1 的rate 输出 channels与输入相同aconv1d_index = 0def aconv1d_layer(input_tensor, size, rate, activation, scale, bias): global aconv1d_index with tf.variable_scope('aconv1d_' + str(aconv1d_index)): shape = input_tensor.get_shape().as_list() # 利用 2 维极黑卷积函数计算相应 1 维卷积,expand_dims squeeze做了相应维度处理 # 实际 上一个 tf.nn.conv1d 在之前的tensorflow版本中是没有的,其的一个实现也是经过维度调整后调用 tf.nn.conv2d W = tf.get_variable('W', (1, size, shape[-1], shape[-1]), dtype=tf.float32, initializer=tf.random_uniform_initializer(minval=-scale, maxval=scale)) if bias: b = tf.get_variable('b', [shape[-1]], dtype=tf.float32, initializer=tf.constant_initializer(0)) out = tf.nn.atrous_conv2d(tf.expand_dims(input_tensor, dim=1), W, rate = rate, padding='SAME') out = tf.squeeze(out, [1]) if not bias: beta = tf.get_variable('beta', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0)) gamma = tf.get_variable('gamma', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1)) mean_running = tf.get_variable('mean', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(0)) variance_running = tf.get_variable('variance', shape[-1], dtype=tf.float32, initializer=tf.constant_initializer(1)) mean, variance = tf.nn.moments(out, axes=list(range(len(out.get_shape()) - 1))) def update_running_stat(): decay = 0.99 update_op = [mean_running.assign(mean_running * decay + mean * (1 - decay)), variance_running.assign(variance_running * decay + variance * (1 - decay))] with tf.control_dependencies(update_op): return tf.identity(mean), tf.identity(variance) m, v = tf.cond(tf.Variable(False, trainable=False), update_running_stat,lambda: (mean_running, variance_running)) out = tf.nn.batch_normalization(out, m, v, beta, gamma, 1e-8) if activation == 'tanh': out = tf.nn.tanh(out) elif activation == 'sigmoid': out = tf.nn.sigmoid(out) aconv1d_index += 1 return outdef speech_to_text_network(n_dim = 128, n_blocks = 3): out = conv1d_layer(input_tensor=X, size=1, dim = n_dim, activation='tanh', scale=0.14, bias=False) def residual_block(input_sensor, size, rate): conv_filter = aconv1d_layer(input_tensor=input_sensor, size=size, rate=rate, activation='tanh', scale=0.03, bias=False) conv_gate = aconv1d_layer(input_tensor=input_sensor, size=size, rate=rate, activation='sigmoid', scale=0.03, bias=False) out = conv_filter * conv_gate out = conv1d_layer(out, size = 1, dim=n_dim, activation='tanh', scale=0.08, bias=False) return out + input_sensor, out skip = 0 for _ in range(n_blocks): for r in [1, 2, 4, 8, 16]: out, s = residual_block(out, size = 7, rate = r) skip += s logit = conv1d_layer(skip, size = 1, dim = skip.get_shape().as_list()[-1], activation='tanh', scale = 0.08, bias=False) # 最后卷积层输出是词汇表大小 logit = conv1d_layer(logit, size = 1, dim = words_size, activation = None, scale = 0.04, bias = True) return logit# 作者自己定义了优化器class MaxPropOptimizer(tf.train.Optimizer): def __init__(self, learning_rate=0.001, beta2=0.999, use_locking=False, name="MaxProp"): super(MaxPropOptimizer, self).__init__(use_locking, name) self._lr = learning_rate self._beta2 = beta2 self._lr_t = None self._beta2_t = None def _prepare(self): self._lr_t = tf.convert_to_tensor(self._lr, name="learning_rate") self._beta2_t = tf.convert_to_tensor(self._beta2, name="beta2") def _create_slots(self, var_list): for v in var_list: self._zeros_slot(v, "m", self._name) def _apply_dense(self, grad, var): lr_t = tf.cast(self._lr_t, var.dtype.base_dtype) beta2_t = tf.cast(self._beta2_t, var.dtype.base_dtype) if var.dtype.base_dtype == tf.float16: eps = 1e-7 else: eps = 1e-8 m = self.get_slot(var, "m") m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad))) g_t = grad / m_t var_update = tf.assign_sub(var, lr_t * g_t) return tf.group(*[var_update, m_t]) def _apply_sparse(self, grad, var): return self._apply_dense(grad, var)def train_speech_to_text_network(wav_max_len): logit = speech_to_text_network() # CTC loss indices = tf.where(tf.not_equal(tf.cast(Y, tf.float32), 0.)) target = tf.SparseTensor(indices=indices, values=tf.gather_nd(Y, indices) - 1, dense_shape=tf.cast(tf.shape(Y), tf.int64)) loss = tf.nn.ctc_loss(target, logit, sequence_len, time_major=False) # optimizer lr = tf.Variable(0.001, dtype=tf.float32, trainable=False) optimizer = MaxPropOptimizer(learning_rate=lr, beta2=0.99) var_list = [t for t in tf.trainable_variables()] gradient = optimizer.compute_gradients(loss, var_list=var_list) optimizer_op = optimizer.apply_gradients(gradient) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(tf.global_variables()) for epoch in range(16): sess.run(tf.assign(lr, 0.001 * (0.97 ** epoch))) global pointer pointer = 0 for batch in range(n_batch): batches_wavs, batches_labels = get_next_batches(batch_size, wav_max_len) train_loss, _ = sess.run([loss, optimizer_op], feed_dict={X: batches_wavs, Y: batches_labels}) print(epoch, batch, train_loss) if epoch % 5 == 0: saver.save(sess, r'/speech/module', global_step=epoch)# 训练#train_speech_to_text_network()# 语音识别# 把batch_size改为1def speech_to_text(wav_file): wav, sr = librosa.load(wav_file, mono=True) mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, sr), axis=0), [0,2,1]) logit = speech_to_text_network() saver = tf.train.Saver() with tf.Session() as sess: saver.restore(sess, tf.train.latest_checkpoint('.')) decoded = tf.transpose(logit, perm=[1, 0, 2]) decoded, _ = tf.nn.ctc_beam_search_decoder(decoded, sequence_len, merge_repeated=False) predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].shape, decoded[0].values) + 1 output = sess.run(decoded, feed_dict={X: mfcc}) print(output)if __name__ == "__main__": wav_files = get_wav_files() wav_files, labels = get_wav_label() print(u"样本数 :", len(wav_files)) all_words = [] for label in labels: # 字符分解 all_words += [word for word in label] counter = Counter(all_words) count_pairs = sorted(counter.items(), key=lambda x: -x[1]) words, _ = zip(*count_pairs) words_size = len(words) print(u"词汇表大小:", words_size) word_num_map = dict(zip(words, range(len(words)))) # 当字符不在已经收集的words中时,赋予其应当的num,这是一个动态的结果 to_num = lambda word: word_num_map.get(word, len(words)) # 将单个file的标签映射为num 返回对应list,最终all file组成嵌套list labels_vector = [list(map(to_num, label)) for label in labels] label_max_len = np.max([len(label) for label in labels_vector]) print(u"最长句子的字数:" + str(label_max_len)) # 下面仅仅计算了语音特征相应的最长的长度。 # 如果仅仅是计算长度是否需要施加变换后计算长度? parallel_read = False if parallel_read: wav_max_len = np.max(Parallel(n_jobs=7)(delayed(get_wav_length)(wav) for wav in wav_files)) else: wav_max_len = 673 print("最长的语音", wav_max_len) batch_size = 16 n_batch = len(wav_files) // batch_size X = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, 20]) # 实际mfcc中的元素并非同号,不严格的情况下如此得到序列长度也是可行的 sequence_len = tf.reduce_sum(tf.cast(tf.not_equal(tf.reduce_sum(X, reduction_indices=2), 0.), tf.int32), reduction_indices=1) Y = tf.placeholder(dtype=tf.int32, shape=[batch_size, None]) train_speech_to_text_network(wav_max_len)
阅读全文
1 0
- TensorFlow 中文语音识别
- Android TensorFlow 智能语音识别
- 中文语音识别代码
- TensorFlow15: 中文语音识别
- 使用Tensorflow搭建语音识别系统
- tensorflow练习11:语音识别程序
- 关于Tensorflow语音识别官方案例
- winform程序实现中文语音识别
- Unity3d中使用百度中文语音识别
- C#中英文语音合成与中文语音识别技术
- C#中英文语音合成与中文语音识别技术
- C#中英文语音合成与中文语音识别技术
- C#中英文语音合成与中文语音识别技术
- C#中英文语音合成与中文语音识别技术
- 如何用Tensorflow开发一个简单的语音识别器
- 基于Tensorflow的VCTK语音识别例子测试
- TensorFlow与中文手写汉字识别
- TensorFlow与中文手写汉字识别
- Tomcat性能调优
- caffe过程中的错误(一)
- hdu 1867 A + B for you again KMP
- Redis之aof日志持久化
- MongoDB windows下安装以及创建windows server
- TensorFlow 中文语音识别
- KVM管理
- uva 1586 分子量
- fork源码概要分析
- RSA加密算法原理及RES签名算法简介
- Java8 lambda表达式10个示例
- 扛水泥的技法与实践-----《相机篇2》
- POJ
- cocos中如何用砖块画出LED数字