TensorFlow17: “声音大挪移”

来源：互联网  发布：购买网络存储服务器  编辑：程序博客网  时间：2024/04/27 05:44

看见本帖标题，你可能会问：“声音大挪移”是什么鬼玩意，和张无忌有什么关系？

如果你没看过鬼畜，先温习两个：【元首】粉红的回忆、【圣地亚哥金曲】客官不可以。（本帖内容和鬼畜关系不大）

前文《实现谷歌Deep Dream》可生成带有艺术感的图片。其实，还有另一种合成图片的方式，洋文叫Style Transfer，这种方法需要用到两张图片，如下图：

+ =
皮特 + Style = 皮特 Style

Style Transfer的原始论文：https://arxiv.org/abs/1508.06576 ；基于Torch的代码实现：neural-style

本帖只是在音频上应用Style Transfer，应该能生成非常搞笑的玩意。

代码：

# Audio style transfer: re-synthesise the content clip so that the Gram
# statistics of its random-convolution features match those of the style clip.
import os
import subprocess

import librosa  # audio loading plus STFT / inverse STFT
import numpy as np
import tensorflow as tf

# Input recordings: the content track is rendered "in the style of" the
# style track.
content_audio = "./Traveling_Light.mp3"
style_audio = "./东风破.mp3"

N_FFT = 2048         # STFT window size
MAX_FRAMES = 430     # spectrogram frames kept per clip
N_FILTERS = 4096     # output channels of the random convolution
FILTER_WIDTH = 11    # width (in frames) of the convolution kernel
ALPHA = 0.01         # larger -> content dominates; 0 ignores content entirely
INIT_SCALE = 0.001   # scale of the random initial spectrogram
MAX_ITER = 300       # L-BFGS-B iteration cap
PHASE_ITERS = 500    # phase-reconstruction iterations


def cut_audio(filename, start_pos='00:00:00', lens=10):
    """Cut a clip out of *filename* with ffmpeg and return the new file name.

    The clip starts at *start_pos* and lasts *lens* seconds (10 by default;
    longer clips use too much memory downstream).  It is written to the
    current directory as ``<stem>_<lens>s.mp3``.  ffmpeg must be installed.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        OSError: if the ffmpeg executable cannot be started.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    newfile = stem + '_' + str(lens) + 's.mp3'
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through verbatim.  "-y" overwrites a clip left
    # over from a previous run instead of blocking on ffmpeg's prompt.
    cmd = ['ffmpeg', '-y', '-i', filename,
           '-ss', str(start_pos), '-t', str(lens), newfile]
    subprocess.check_call(cmd)
    return newfile


def read_audio(filename):
    """Load *filename* and return ``(log_spectrogram, sample_rate)``.

    The spectrogram is ``log(1 + |STFT|)`` truncated to at most MAX_FRAMES
    frames, i.e. the 1-D signal turned into a 2-D, image-like array.
    https://en.wikipedia.org/wiki/Short-time_Fourier_transform
    """
    samples, fs = librosa.load(filename)
    spectrum = librosa.stft(samples, n_fft=N_FFT)
    return np.log1p(np.abs(spectrum[:, :MAX_FRAMES])), fs


content_audio_10s = cut_audio(content_audio, start_pos='00:00:33')
style_audio_10s = cut_audio(style_audio, start_pos='00:00:38')

content_data, _ = read_audio(content_audio_10s)
style_data, fs = read_audio(style_audio_10s)

# Crop both spectrograms to a common shape so that either clip may be the
# shorter one.
samples_n = min(content_data.shape[1], style_data.shape[1])  # normally 430
channels_n = style_data.shape[0]                             # normally 1025
content_data = content_data[:channels_n, :samples_n]
style_data = style_data[:channels_n, :samples_n]

# Layout expected by conv2d: [batch, in_height, in_width, in_channels].
content_data_tf = np.ascontiguousarray(content_data.T[None, None, :, :])
style_data_tf = np.ascontiguousarray(style_data.T[None, None, :, :])

# Random, untrained filter bank shaped
# [filter_height, filter_width, in_channels, out_channels].
std = np.sqrt(2) * np.sqrt(2.0 / ((channels_n + N_FILTERS) * FILTER_WIDTH))
kernel = np.random.randn(1, FILTER_WIDTH, channels_n, N_FILTERS) * std


def _conv_relu(inputs):
    """Apply the fixed random convolution followed by ReLU to *inputs*."""
    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    conv = tf.nn.conv2d(inputs, kernel_tf, strides=[1, 1, 1, 1],
                        padding="VALID", name="conv")
    return tf.nn.relu(conv)


# Content and style features.
g = tf.Graph()
with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
    x = tf.placeholder('float32', [1, 1, samples_n, channels_n], name="x")
    net = _conv_relu(x)
    content_features = sess.run(net, feed_dict={x: content_data_tf})
    style_features = sess.run(net, feed_dict={x: style_data_tf})

features = np.reshape(style_features, (-1, N_FILTERS))
style_gram = np.matmul(features.T, features) / samples_n

# Optimise a spectrogram, starting from small random noise, so that its
# features match the content features and its Gram matrix matches the style's.
result = None
with tf.Graph().as_default():
    initial = np.random.randn(1, 1, samples_n, channels_n).astype(np.float32)
    x = tf.Variable(initial * INIT_SCALE, name="x")
    net = _conv_relu(x)

    content_loss = ALPHA * 2 * tf.nn.l2_loss(net - content_features)

    n_out = net.get_shape().as_list()[-1]
    feats = tf.reshape(net, (-1, n_out))
    gram = tf.matmul(tf.transpose(feats), feats) / samples_n
    style_loss = 2 * tf.nn.l2_loss(gram - style_gram)

    loss = content_loss + style_loss
    opt = tf.contrib.opt.ScipyOptimizerInterface(
        loss, method='L-BFGS-B', options={'maxiter': MAX_ITER})

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        opt.minimize(sess)
        result = x.eval()

# Turn the optimised log-spectrogram back into a waveform.
audio = np.zeros_like(content_data)
audio[:channels_n, :] = np.expm1(result[0, 0].T)  # inverse of log1p

# Only magnitudes were optimised, so start from random phases and repeatedly
# re-estimate them from the STFT of the current reconstruction.
p = 2 * np.pi * np.random.random_sample(audio.shape) - np.pi
for _ in range(PHASE_ITERS):
    S = audio * np.exp(1j * p)
    waveform = librosa.istft(S)
    p = np.angle(librosa.stft(waveform, n_fft=N_FFT))

# NOTE(review): write_wav presumably emits WAV-encoded data regardless of the
# extension, so "output.mp3" is likely a mislabelled WAV file; the name is
# kept so the output path stays the same — confirm before renaming.
librosa.output.write_wav("output.mp3", waveform, fs)

生成的文件：output.mp3