Learning Notes TF060: Combining Images and Language, Image Captioning


Professor Fei-Fei Li of the Stanford Artificial Intelligence Laboratory describes three elements for realizing artificial intelligence: syntax, semantics, and inference, applied to both language and vision. Syntax (parsing the grammar of language, parsing the three-dimensional structure of a visual scene) and semantics (the meaning of language, the meaning of objects and actions in a scene) serve as the model's training input; from these the model acquires the ability to infer, so that what it learns can be applied in practice and conclusions can be drawn from new data. See "The Syntax, Semantics and Inference Mechanism in Natural Language", http://www.aaai.org/Papers/Symposia/Fall/1996/FS-96-04/FS96-04-010.pdf .

The image captioning ("show and tell") model takes an image as input and produces a natural-language description of its content, telling a story about the picture. In effect it translates between image information and text information. https://github.com/tensorflow/models/tree/master/research/im2txt .

Principle. The model is an encoder-decoder framework: the image is encoded into a fixed-length intermediate vector, which is then decoded into a natural-language description. The encoder is the Inception V3 image recognition model; the decoder is an LSTM network. Let {s0, s1, ..., sn-1} be the caption words and {We s0, We s1, ..., We sn-1} their word-embedding vectors. At each step the LSTM outputs {p1, p2, ..., pn}, the probability distribution over the next word of the sentence, so {log p1(s1), log p2(s2), ..., log pn(sn)} are the log-likelihoods of the correct words at each step; the negative of their sum is the objective the model minimizes.
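
As a minimal sketch of that objective (not the im2txt code itself; the function name, shapes, and mask argument are illustrative), the per-step negative log-likelihoods can be computed as a masked softmax cross-entropy, which is essentially what show_and_tell_model.py does below:

import tensorflow as tf

def caption_loss(logits, target_ids, mask):
    """Negative sum of log p_t(s_t), normalized over the non-padded positions.

    logits:     float32 [batch, steps, vocab_size], next-word scores from the LSTM.
    target_ids: int32   [batch, steps], the correct next word s_t at each step.
    mask:       int32   [batch, steps], 1 for real caption words, 0 for padding.
    """
    # -log p_t(s_t) at every position.
    nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_ids,
                                                         logits=logits)
    weights = tf.to_float(mask)
    # Sum the negative log-likelihoods of real words and normalize by their count.
    return tf.reduce_sum(nll * weights) / tf.reduce_sum(weights)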

Best practice: the Microsoft COCO Caption dataset, http://mscoco.org/ . It builds on the Microsoft Common Objects in Context (COCO) dataset, which contains more than 300,000 images and 2 million labeled object instances. For roughly 330,000 images of the original COCO dataset, Amazon Mechanical Turk workers wrote at least 5 captions per image, more than 1.5 million caption sentences in total. There are 2014 and 2015 releases; the 2014 release has a training set of 82,783 images, a validation set of 40,504 images, and a test set of 40,775 images.
The TensorFlow-Slim image classification library: https://github.com/tensorflow/models/tree/master/research/inception/inception/slim .
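
im2txt stores each image-caption pair as a serialized tf.SequenceExample inside sharded TFRecord files; parse_sequence_example in the model code below reads them back. A rough sketch of that record layout (the feature names "image/data" and "image/caption_ids" follow the repo's default configuration, and the word ids in the usage comment are made up):

import tensorflow as tf

def make_sequence_example(encoded_jpeg, caption_word_ids):
    """Packs one JPEG image and one tokenized caption into a tf.SequenceExample."""
    context = tf.train.Features(feature={
        # The encoded image bytes live in the context.
        "image/data": tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[encoded_jpeg])),
    })
    feature_lists = tf.train.FeatureLists(feature_list={
        # One int64 feature per caption word id, in sentence order.
        "image/caption_ids": tf.train.FeatureList(feature=[
            tf.train.Feature(int64_list=tf.train.Int64List(value=[word]))
            for word in caption_word_ids]),
    })
    return tf.train.SequenceExample(context=context, feature_lists=feature_lists)

# Hypothetical usage: the ids come from the vocabulary built during preprocessing.
# writer.write(make_sequence_example(jpeg_bytes, [1, 57, 8, 23, 2]).SerializeToString())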

Building the model: show_and_tell_model.py.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from im2txt.ops import image_embedding
from im2txt.ops import image_processing
from im2txt.ops import inputs as input_ops


class ShowAndTellModel(object):
  """Image-to-text implementation based on http://arxiv.org/abs/1411.4555.

  "Show and Tell: A Neural Image Caption Generator"
  Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan
  """

  def __init__(self, config, mode, train_inception=False):
    """Basic setup.

    Args:
      config: Object containing configuration parameters.
      mode: "train", "eval" or "inference".
      train_inception: Whether the inception submodel variables are trainable.
    """
    assert mode in ["train", "eval", "inference"]
    self.config = config
    self.mode = mode
    self.train_inception = train_inception

    # Reader for the input data.
    self.reader = tf.TFRecordReader()

    # To match the "Show and Tell" paper we initialize all variables with a
    # random uniform initializer.
    self.initializer = tf.random_uniform_initializer(
        minval=-self.config.initializer_scale,
        maxval=self.config.initializer_scale)

    # A float32 Tensor with shape [batch_size, height, width, channels].
    self.images = None

    # An int32 Tensor with shape [batch_size, padded_length].
    self.input_seqs = None

    # An int32 Tensor with shape [batch_size, padded_length].
    self.target_seqs = None

    # An int32 0/1 Tensor with shape [batch_size, padded_length].
    self.input_mask = None

    # A float32 Tensor with shape [batch_size, embedding_size].
    self.image_embeddings = None

    # A float32 Tensor with shape [batch_size, padded_length, embedding_size].
    self.seq_embeddings = None

    # A float32 scalar Tensor; the total loss for the trainer to optimize.
    self.total_loss = None

    # A float32 Tensor with shape [batch_size * padded_length].
    self.target_cross_entropy_losses = None

    # A float32 Tensor with shape [batch_size * padded_length].
    self.target_cross_entropy_loss_weights = None

    # Collection of variables from the inception submodel.
    self.inception_variables = []

    # Function to restore the inception submodel from checkpoint.
    self.init_fn = None

    # Global step Tensor.
    self.global_step = None

  def is_training(self):
    """Returns true if the model is built for training mode."""
    return self.mode == "train"

  def process_image(self, encoded_image, thread_id=0):
    """Decodes and processes an image string.

    Args:
      encoded_image: A scalar string Tensor; the encoded image.
      thread_id: Preprocessing thread id used to select the ordering of color
        distortions.

    Returns:
      A float32 Tensor of shape [height, width, 3]; the processed image.
    """
    return image_processing.process_image(encoded_image,
                                          is_training=self.is_training(),
                                          height=self.config.image_height,
                                          width=self.config.image_width,
                                          thread_id=thread_id,
                                          image_format=self.config.image_format)

  def build_inputs(self):
    """Input prefetching, preprocessing and batching.

    Outputs:
      self.images
      self.input_seqs
      self.target_seqs (training and eval only)
      self.input_mask (training and eval only)
    """
    if self.mode == "inference":
      # In inference mode, images and inputs are fed via placeholders.
      image_feed = tf.placeholder(dtype=tf.string, shape=[], name="image_feed")
      input_feed = tf.placeholder(dtype=tf.int64,
                                  shape=[None],  # batch_size
                                  name="input_feed")

      # Process image and insert batch dimensions.
      images = tf.expand_dims(self.process_image(image_feed), 0)
      input_seqs = tf.expand_dims(input_feed, 1)

      # No target sequences or input mask in inference mode.
      target_seqs = None
      input_mask = None
    else:
      # Prefetch serialized SequenceExample protos.
      input_queue = input_ops.prefetch_input_data(
          self.reader,
          self.config.input_file_pattern,
          is_training=self.is_training(),
          batch_size=self.config.batch_size,
          values_per_shard=self.config.values_per_input_shard,
          input_queue_capacity_factor=self.config.input_queue_capacity_factor,
          num_reader_threads=self.config.num_input_reader_threads)

      # Image processing and random distortion. Split across multiple threads
      # with each thread applying a slightly different distortion.
      assert self.config.num_preprocess_threads % 2 == 0
      images_and_captions = []
      for thread_id in range(self.config.num_preprocess_threads):
        serialized_sequence_example = input_queue.dequeue()
        encoded_image, caption = input_ops.parse_sequence_example(
            serialized_sequence_example,
            image_feature=self.config.image_feature_name,
            caption_feature=self.config.caption_feature_name)
        image = self.process_image(encoded_image, thread_id=thread_id)
        images_and_captions.append([image, caption])

      # Batch inputs.
      queue_capacity = (2 * self.config.num_preprocess_threads *
                        self.config.batch_size)
      images, input_seqs, target_seqs, input_mask = (
          input_ops.batch_with_dynamic_pad(images_and_captions,
                                           batch_size=self.config.batch_size,
                                           queue_capacity=queue_capacity))

    self.images = images
    self.input_seqs = input_seqs
    self.target_seqs = target_seqs
    self.input_mask = input_mask

  def build_image_embeddings(self):
    """Builds the image model subgraph and generates image embeddings.

    Inputs:
      self.images

    Outputs:
      self.image_embeddings
    """
    inception_output = image_embedding.inception_v3(
        self.images,
        trainable=self.train_inception,
        is_training=self.is_training())
    self.inception_variables = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope="InceptionV3")

    # Map inception output into embedding space.
    with tf.variable_scope("image_embedding") as scope:
      image_embeddings = tf.contrib.layers.fully_connected(
          inputs=inception_output,
          num_outputs=self.config.embedding_size,
          activation_fn=None,
          weights_initializer=self.initializer,
          biases_initializer=None,
          scope=scope)

    # Save the embedding size in the graph.
    tf.constant(self.config.embedding_size, name="embedding_size")

    self.image_embeddings = image_embeddings

  def build_seq_embeddings(self):
    """Builds the input sequence embeddings.

    Inputs:
      self.input_seqs

    Outputs:
      self.seq_embeddings
    """
    with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
      embedding_map = tf.get_variable(
          name="map",
          shape=[self.config.vocab_size, self.config.embedding_size],
          initializer=self.initializer)
      seq_embeddings = tf.nn.embedding_lookup(embedding_map, self.input_seqs)

    self.seq_embeddings = seq_embeddings

  def build_model(self):
    """Builds the model.

    Inputs:
      self.image_embeddings
      self.seq_embeddings
      self.target_seqs (training and eval only)
      self.input_mask (training and eval only)

    Outputs:
      self.total_loss (training and eval only)
      self.target_cross_entropy_losses (training and eval only)
      self.target_cross_entropy_loss_weights (training and eval only)
    """
    # This LSTM cell has biases and outputs tanh(new_c) * sigmoid(o), but the
    # modified LSTM in the "Show and Tell" paper has no biases and outputs
    # new_c * sigmoid(o).
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(
        num_units=self.config.num_lstm_units, state_is_tuple=True)
    if self.mode == "train":
      lstm_cell = tf.contrib.rnn.DropoutWrapper(
          lstm_cell,
          input_keep_prob=self.config.lstm_dropout_keep_prob,
          output_keep_prob=self.config.lstm_dropout_keep_prob)

    with tf.variable_scope("lstm", initializer=self.initializer) as lstm_scope:
      # Feed the image embeddings to set the initial LSTM state.
      zero_state = lstm_cell.zero_state(
          batch_size=self.image_embeddings.get_shape()[0], dtype=tf.float32)
      _, initial_state = lstm_cell(self.image_embeddings, zero_state)

      # Allow the LSTM variables to be reused.
      lstm_scope.reuse_variables()

      if self.mode == "inference":
        # In inference mode, use concatenated states for convenient feeding and
        # fetching.
        tf.concat(axis=1, values=initial_state, name="initial_state")

        # Placeholder for feeding a batch of concatenated states.
        state_feed = tf.placeholder(dtype=tf.float32,
                                    shape=[None, sum(lstm_cell.state_size)],
                                    name="state_feed")
        state_tuple = tf.split(value=state_feed, num_or_size_splits=2, axis=1)

        # Run a single LSTM step.
        lstm_outputs, state_tuple = lstm_cell(
            inputs=tf.squeeze(self.seq_embeddings, axis=[1]),
            state=state_tuple)

        # Concatentate the resulting state.
        tf.concat(axis=1, values=state_tuple, name="state")
      else:
        # Run the batch of sequence embeddings through the LSTM.
        sequence_length = tf.reduce_sum(self.input_mask, 1)
        lstm_outputs, _ = tf.nn.dynamic_rnn(cell=lstm_cell,
                                            inputs=self.seq_embeddings,
                                            sequence_length=sequence_length,
                                            initial_state=initial_state,
                                            dtype=tf.float32,
                                            scope=lstm_scope)

    # Stack batches vertically.
    lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])

    with tf.variable_scope("logits") as logits_scope:
      logits = tf.contrib.layers.fully_connected(
          inputs=lstm_outputs,
          num_outputs=self.config.vocab_size,
          activation_fn=None,
          weights_initializer=self.initializer,
          scope=logits_scope)

    if self.mode == "inference":
      tf.nn.softmax(logits, name="softmax")
    else:
      targets = tf.reshape(self.target_seqs, [-1])
      weights = tf.to_float(tf.reshape(self.input_mask, [-1]))

      # Compute losses.
      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                              logits=logits)
      batch_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)),
                          tf.reduce_sum(weights),
                          name="batch_loss")
      tf.losses.add_loss(batch_loss)
      total_loss = tf.losses.get_total_loss()

      # Add summaries.
      tf.summary.scalar("losses/batch_loss", batch_loss)
      tf.summary.scalar("losses/total_loss", total_loss)
      for var in tf.trainable_variables():
        tf.summary.histogram("parameters/" + var.op.name, var)

      self.total_loss = total_loss
      self.target_cross_entropy_losses = losses  # Used in evaluation.
      self.target_cross_entropy_loss_weights = weights  # Used in evaluation.

  def setup_inception_initializer(self):
    """Sets up the function to restore inception variables from checkpoint."""
    if self.mode != "inference":
      # Restore inception variables only.
      saver = tf.train.Saver(self.inception_variables)

      def restore_fn(sess):
        tf.logging.info("Restoring Inception variables from checkpoint file %s",
                        self.config.inception_checkpoint_file)
        saver.restore(sess, self.config.inception_checkpoint_file)

      self.init_fn = restore_fn

  def setup_global_step(self):
    """Sets up the global step Tensor."""
    global_step = tf.Variable(
        initial_value=0,
        name="global_step",
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    self.global_step = global_step

  def build(self):
    """Creates all ops for training and evaluation."""
    self.build_inputs()  # Build the input data pipeline.
    self.build_image_embeddings()  # Inception V3 image model; outputs the image embedding vector.
    self.build_seq_embeddings()  # Build the input sequence embeddings.
    self.build_model()  # Chain the CNN and the LSTM into the complete model.
    self.setup_inception_initializer()  # Load the pretrained Inception V3 weights.
    self.setup_global_step()  # Track the global training step.
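
Once build() has run in inference mode, the graph exposes a handful of named tensors (image_feed, input_feed, lstm/state_feed, lstm/initial_state, lstm/state, softmax) that the inference utilities feed and fetch. A rough sketch of a single-image greedy decoding loop against those names (greedy argmax instead of the beam search used in caption_generator.py; start/end token handling is simplified):

import numpy as np

def greedy_caption(sess, encoded_image, start_id, end_id, max_steps=20):
    """Greedily decodes one caption from the inference graph built above."""
    # Feed the image once; its embedding determines the initial LSTM state.
    state = sess.run("lstm/initial_state:0",
                     feed_dict={"image_feed:0": encoded_image})
    word, caption = start_id, []
    for _ in range(max_steps):
        softmax, state = sess.run(
            ["softmax:0", "lstm/state:0"],
            feed_dict={"input_feed:0": [word], "lstm/state_feed:0": state})
        word = int(np.argmax(softmax[0]))  # most probable next word
        if word == end_id:  # stop at the end-of-sentence token
            break
        caption.append(word)
    return caption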

Training the model: train.py.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from im2txt import configuration
from im2txt import show_and_tell_model

FLAGS = tf.app.flags.FLAGS

tf.flags.DEFINE_string("input_file_pattern", "",
                       "File pattern of sharded TFRecord input files.")
tf.flags.DEFINE_string("inception_checkpoint_file", "",
                       "Path to a pretrained inception_v3 model.")
tf.flags.DEFINE_string("train_dir", "",
                       "Directory for saving and loading model checkpoints.")
tf.flags.DEFINE_boolean("train_inception", False,
                        "Whether to train inception submodel variables.")
tf.flags.DEFINE_integer("number_of_steps", 1000000, "Number of training steps.")
tf.flags.DEFINE_integer("log_every_n_steps", 1,
                        "Frequency at which loss and global step are logged.")

tf.logging.set_verbosity(tf.logging.INFO)


def main(unused_argv):
  assert FLAGS.input_file_pattern, "--input_file_pattern is required"
  assert FLAGS.train_dir, "--train_dir is required"

  model_config = configuration.ModelConfig()
  model_config.input_file_pattern = FLAGS.input_file_pattern
  model_config.inception_checkpoint_file = FLAGS.inception_checkpoint_file
  training_config = configuration.TrainingConfig()

  # Create the directory where training checkpoints are stored.
  train_dir = FLAGS.train_dir
  if not tf.gfile.IsDirectory(train_dir):
    tf.logging.info("Creating training directory: %s", train_dir)
    tf.gfile.MakeDirs(train_dir)

  # Build the TensorFlow graph.
  g = tf.Graph()
  with g.as_default():
    # Build the model.
    model = show_and_tell_model.ShowAndTellModel(
        model_config, mode="train", train_inception=FLAGS.train_inception)
    model.build()

    # Set up the learning rate.
    learning_rate_decay_fn = None
    if FLAGS.train_inception:
      learning_rate = tf.constant(training_config.train_inception_learning_rate)
    else:
      learning_rate = tf.constant(training_config.initial_learning_rate)
      if training_config.learning_rate_decay_factor > 0:
        num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                 model_config.batch_size)
        decay_steps = int(num_batches_per_epoch *
                          training_config.num_epochs_per_decay)

        def _learning_rate_decay_fn(learning_rate, global_step):
          return tf.train.exponential_decay(
              learning_rate,
              global_step,
              decay_steps=decay_steps,
              decay_rate=training_config.learning_rate_decay_factor,
              staircase=True)

        learning_rate_decay_fn = _learning_rate_decay_fn

    # Set up the training ops.
    train_op = tf.contrib.layers.optimize_loss(
        loss=model.total_loss,
        global_step=model.global_step,
        learning_rate=learning_rate,
        optimizer=training_config.optimizer,
        clip_gradients=training_config.clip_gradients,
        learning_rate_decay_fn=learning_rate_decay_fn)

    # Set up the Saver for saving and restoring model checkpoints.
    saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep)

  # Run training.
  tf.contrib.slim.learning.train(
      train_op,
      train_dir,
      log_every_n_steps=FLAGS.log_every_n_steps,
      graph=g,
      global_step=model.global_step,
      number_of_steps=FLAGS.number_of_steps,
      init_fn=model.init_fn,
      saver=saver)


if __name__ == "__main__":
  tf.app.run()
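
A typical way to launch training, assuming the sharded TFRecord files and the pretrained Inception V3 checkpoint already exist (all paths and the shard pattern below are placeholders for your own setup):

python im2txt/train.py \
  --input_file_pattern="${MSCOCO_DIR}/train-?????-of-00256" \
  --inception_checkpoint_file="${INCEPTION_CHECKPOINT}" \
  --train_dir="${MODEL_DIR}/train" \
  --train_inception=false \
  --number_of_steps=1000000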

Generating captions (inference): run_inference.py.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import os

import tensorflow as tf

from im2txt import configuration
from im2txt import inference_wrapper
from im2txt.inference_utils import caption_generator
from im2txt.inference_utils import vocabulary

FLAGS = tf.flags.FLAGS

tf.flags.DEFINE_string("checkpoint_path", "",
                       "Model checkpoint file or directory containing a "
                       "model checkpoint file.")
tf.flags.DEFINE_string("vocab_file", "", "Text file containing the vocabulary.")
tf.flags.DEFINE_string("input_files", "",
                       "File pattern or comma-separated list of file patterns "
                       "of image files.")

tf.logging.set_verbosity(tf.logging.INFO)


def main(_):
  # Build the inference graph.
  g = tf.Graph()
  with g.as_default():
    model = inference_wrapper.InferenceWrapper()
    restore_fn = model.build_graph_from_config(configuration.ModelConfig(),
                                               FLAGS.checkpoint_path)
  g.finalize()

  # Create the vocabulary.
  vocab = vocabulary.Vocabulary(FLAGS.vocab_file)

  filenames = []
  for file_pattern in FLAGS.input_files.split(","):
    filenames.extend(tf.gfile.Glob(file_pattern))
  tf.logging.info("Running caption generation on %d files matching %s",
                  len(filenames), FLAGS.input_files)

  with tf.Session(graph=g) as sess:
    # Load the model from checkpoint.
    restore_fn(sess)

    # Prepare the caption generator. Here we are implicitly using the default
    # beam search parameters. See caption_generator.py for a description of the
    # available beam search parameters.
    generator = caption_generator.CaptionGenerator(model, vocab)

    for filename in filenames:
      with tf.gfile.GFile(filename, "r") as f:
        image = f.read()
      captions = generator.beam_search(sess, image)
      print("Captions for image %s:" % os.path.basename(filename))
      for i, caption in enumerate(captions):
        # Ignore begin and end words.
        sentence = [vocab.id_to_word(w) for w in caption.sentence[1:-1]]
        sentence = " ".join(sentence)
        print("  %d) %s (p=%f)" % (i, sentence, math.exp(caption.logprob)))


if __name__ == "__main__":
  tf.app.run()
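
A typical invocation against a trained checkpoint (the paths are placeholders; the vocabulary file is produced during data preparation):

python im2txt/run_inference.py \
  --checkpoint_path="${MODEL_DIR}/train" \
  --vocab_file="${MSCOCO_DIR}/word_counts.txt" \
  --input_files="/path/to/your/image.jpg"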

References:
《TensorFlow技术解析与实战》
