1. mnist.py

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at##     http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.# =============================================================================="""Builds the MNIST network.Implements the inference/loss/training pattern for model building.1. inference() - Builds the model as far as required for running the networkforward to make predictions.2. loss() - Adds to the inference model the layers required to generate loss.3. training() - Adds to the loss model the Ops required to generate andapply gradients.This file is used by the various "fully_connected_*.py" files and not meant tobe run."""from __future__ import absolute_importfrom __future__ import divisionfrom __future__ import print_functionimport mathimport tensorflow as tf# The MNIST dataset has 10 classes, representing the digits 0 through 9.NUM_CLASSES = 10# The MNIST images are always 28x28 pixels.IMAGE_SIZE = 28IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZEdef inference(images, hidden1_units, hidden2_units):  """Build the MNIST model up to where it may be used for inference.  Args:    images: Images placeholder, from inputs().    hidden1_units: Size of the first hidden layer.    hidden2_units: Size of the second hidden layer.  Returns:    softmax_linear: Output tensor with the computed logits.  
"""  # Hidden 1  with tf.name_scope('hidden1'):    weights = tf.Variable(        tf.truncated_normal([IMAGE_PIXELS, hidden1_units], #初始函数将根据所得到的均值和标准差,生成一个随机分布                            stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),        name='weights')    biases = tf.Variable(tf.zeros([hidden1_units]),                         name='biases')    hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)  # Hidden 2  with tf.name_scope('hidden2'):    weights = tf.Variable(        tf.truncated_normal([hidden1_units, hidden2_units],                            stddev=1.0 / math.sqrt(float(hidden1_units))),        name='weights')    biases = tf.Variable(tf.zeros([hidden2_units]),                         name='biases')    hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)  # Linear  with tf.name_scope('softmax_linear'):    weights = tf.Variable(        tf.truncated_normal([hidden2_units, NUM_CLASSES],                            stddev=1.0 / math.sqrt(float(hidden2_units))),        name='weights')    biases = tf.Variable(tf.zeros([NUM_CLASSES]),                         name='biases')    logits = tf.matmul(hidden2, weights) + biases  return logitsdef loss(logits, labels):  """Calculates the loss from the logits and the labels.  Args:    logits: Logits tensor, float - [batch_size, NUM_CLASSES].    labels: Labels tensor, int32 - [batch_size].  Returns:    loss: Loss tensor of type float.  """  labels = tf.to_int64(labels)  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(      labels=labels, logits=logits, name='xentropy')  return tf.reduce_mean(cross_entropy, name='xentropy_mean')def training(loss, learning_rate):  """Sets up the training Ops.  Creates a summarizer to track the loss over time in TensorBoard.  Creates an optimizer and applies the gradients to all trainable variables.  The Op returned by this function is what must be passed to the  `sess.run()` call to cause the model to train.  Args:    loss: Loss tensor, from loss().    
learning_rate: The learning rate to use for gradient descent.  Returns:    train_op: The Op for training.  """  # Add a scalar summary for the snapshot loss.  tf.summary.scalar('loss', loss)  # Create the gradient descent optimizer with the given learning rate.  optimizer = tf.train.GradientDescentOptimizer(learning_rate)  # Create a variable to track the global step.  global_step = tf.Variable(0, name='global_step', trainable=False)  # Use the optimizer to apply the gradients that minimize the loss  # (and also increment the global step counter) as a single training step.  train_op = optimizer.minimize(loss, global_step=global_step)  return train_opdef evaluation(logits, labels):  """Evaluate the quality of the logits at predicting the label.  Args:    logits: Logits tensor, float - [batch_size, NUM_CLASSES].    labels: Labels tensor, int32 - [batch_size], with values in the      range [0, NUM_CLASSES).  Returns:    A scalar int32 tensor with the number of examples (out of batch_size)    that were predicted correctly.  """  # For a classifier model, we can use the in_top_k Op.  # It returns a bool tensor with shape [batch_size] that is true for  # the examples where the label is in the top k (here k=1)  # of all logits for that example.  correct = tf.nn.in_top_k(logits, labels, 1)  # Return the number of true entries.  return tf.reduce_sum(tf.cast(correct, tf.int32))

2. fully_connected_feed.py

# NOTE(review): the head of this listing (license header, module docstring,
# imports, and the `def placeholder_inputs` line with the opening of its
# docstring) is missing from the page. The imports, `FLAGS = None` and the
# `placeholder_inputs` signature below are reconstructed from how the names
# are used in the rest of the file -- confirm against the upstream file.
import argparse
import os
import sys
import time

import tensorflow as tf

# NOTE(review): package path assumed for the TF 1.x tutorials -- confirm.
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.examples.tutorials.mnist import mnist

# Parsed command-line flags; populated in the `__main__` section below.
FLAGS = None


def placeholder_inputs(batch_size):
  """Generate placeholder variables to represent the input tensors.

  These placeholders are used as inputs by the rest of the model building
  code and will be fed from the downloaded data in the .run() loop, below.

  Args:
    batch_size: The batch size will be baked into both placeholders.

  Returns:
    images_placeholder: Images placeholder.
    labels_placeholder: Labels placeholder.
  """
  # Note that the shapes of the placeholders match the shapes of the full
  # image and label tensors, except the first dimension is now batch_size
  # rather than the full size of the train or test data sets.
  images_placeholder = tf.placeholder(
      tf.float32, shape=(batch_size, mnist.IMAGE_PIXELS))
  # A one-element tuple makes the rank-1 shape explicit; `(batch_size)` is
  # just a parenthesised int.
  labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size,))
  return images_placeholder, labels_placeholder


def fill_feed_dict(data_set, images_pl, labels_pl):
  """Fills the feed_dict for training the given step.

  A feed_dict takes the form of:
  feed_dict = {
      <placeholder>: <tensor of values to be passed for placeholder>,
      ....
  }

  Args:
    data_set: The set of images and labels, from input_data.read_data_sets()
    images_pl: The images placeholder, from placeholder_inputs().
    labels_pl: The labels placeholder, from placeholder_inputs().

  Returns:
    feed_dict: The feed dictionary mapping from placeholders to values.
  """
  # Create the feed_dict for the placeholders filled with the next
  # `batch size` examples.
  images_feed, labels_feed = data_set.next_batch(FLAGS.batch_size,
                                                 FLAGS.fake_data)
  feed_dict = {
      images_pl: images_feed,
      labels_pl: labels_feed,
  }
  return feed_dict


def do_eval(sess,
            eval_correct,
            images_placeholder,
            labels_placeholder,
            data_set):
  """Runs one evaluation against the full epoch of data.

  Args:
    sess: The session in which the model has been trained.
    eval_correct: The Tensor that returns the number of correct predictions.
    images_placeholder: The images placeholder.
    labels_placeholder: The labels placeholder.
    data_set: The set of images and labels to evaluate, from
      input_data.read_data_sets().
  """
  # And run one epoch of eval.
  true_count = 0  # Counts the number of correct predictions.
  # Only whole batches are evaluated; a trailing partial batch is dropped.
  steps_per_epoch = data_set.num_examples // FLAGS.batch_size
  num_examples = steps_per_epoch * FLAGS.batch_size
  for _ in range(steps_per_epoch):
    feed_dict = fill_feed_dict(data_set,
                               images_placeholder,
                               labels_placeholder)
    true_count += sess.run(eval_correct, feed_dict=feed_dict)
  # Guard against a data set smaller than one batch, which would otherwise
  # raise ZeroDivisionError.
  precision = float(true_count) / num_examples if num_examples else 0.0
  print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' %
        (num_examples, true_count, precision))


def _eval_all_splits(sess, eval_correct, images_placeholder,
                     labels_placeholder, data_sets):
  """Prints an evaluation for the training, validation and test splits."""
  splits = (
      ('Training Data Eval:', data_sets.train),
      ('Validation Data Eval:', data_sets.validation),
      ('Test Data Eval:', data_sets.test),
  )
  for heading, data_set in splits:
    print(heading)
    do_eval(sess,
            eval_correct,
            images_placeholder,
            labels_placeholder,
            data_set)


def run_training():
  """Train MNIST for a number of steps."""
  # Get the sets of images and labels for training, validation, and
  # test on MNIST.
  data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)

  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # Generate placeholders for the images and labels.
    images_placeholder, labels_placeholder = placeholder_inputs(
        FLAGS.batch_size)

    # Build a Graph that computes predictions from the inference model.
    logits = mnist.inference(images_placeholder,
                             FLAGS.hidden1,
                             FLAGS.hidden2)

    # Add to the Graph the Ops for loss calculation.
    loss = mnist.loss(logits, labels_placeholder)

    # Add to the Graph the Ops that calculate and apply gradients.
    train_op = mnist.training(loss, FLAGS.learning_rate)

    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = mnist.evaluation(logits, labels_placeholder)

    # Build the summary Tensor based on the TF collection of Summaries.
    summary = tf.summary.merge_all()

    # Add the variable initializer Op.
    init = tf.global_variables_initializer()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()

    # Create a session for running Ops on the Graph. The context manager
    # closes the session even if training raises.
    with tf.Session() as sess:
      # Instantiate a SummaryWriter to output summaries and the Graph.
      summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
      try:
        # And then after everything is built:
        # Run the Op to initialize the variables.
        sess.run(init)

        # Start the training loop.
        for step in range(FLAGS.max_steps):
          start_time = time.time()

          # Fill a feed dictionary with the actual set of images and labels
          # for this particular training step.
          feed_dict = fill_feed_dict(data_sets.train,
                                     images_placeholder,
                                     labels_placeholder)

          # Run one step of the model.  The return values are the
          # activations from the `train_op` (which is discarded) and the
          # `loss` Op.  To inspect the values of your Ops or variables, you
          # may include them in the list passed to sess.run() and the value
          # tensors will be returned in the tuple from the call.
          _, loss_value = sess.run([train_op, loss],
                                   feed_dict=feed_dict)

          duration = time.time() - start_time

          # Write the summaries and print an overview fairly often.
          if step % 100 == 0:
            # Print status to stdout.
            print('Step %d: loss = %.2f (%.3f sec)' %
                  (step, loss_value, duration))
            # Update the events file.
            summary_str = sess.run(summary, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, step)
            summary_writer.flush()

          # Save a checkpoint and evaluate the model periodically.
          if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
            # The checkpoint can later be reloaded with saver.restore().
            saver.save(sess, checkpoint_file, global_step=step)
            # Evaluate against the training, validation and test sets.
            _eval_all_splits(sess, eval_correct, images_placeholder,
                             labels_placeholder, data_sets)
      finally:
        # Flush and release the events file handle.
        summary_writer.close()


def main(_):
  """Resets the log directory, then trains the model."""
  if tf.gfile.Exists(FLAGS.log_dir):
    tf.gfile.DeleteRecursively(FLAGS.log_dir)
  tf.gfile.MakeDirs(FLAGS.log_dir)
  run_training()


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--learning_rate',
      type=float,
      default=0.01,
      help='Initial learning rate.'
  )
  parser.add_argument(
      '--max_steps',
      type=int,
      default=2000,
      help='Number of steps to run trainer.'
  )
  parser.add_argument(
      '--hidden1',
      type=int,
      default=128,
      help='Number of units in hidden layer 1.'
  )
  parser.add_argument(
      '--hidden2',
      type=int,
      default=32,
      help='Number of units in hidden layer 2.'
  )
  parser.add_argument(
      '--batch_size',
      type=int,
      default=100,
      help='Batch size.  Must divide evenly into the dataset sizes.'
  )
  parser.add_argument(
      '--input_data_dir',
      type=str,
      default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                           'tensorflow/mnist/input_data'),
      help='Directory to put the input data.'
  )
  parser.add_argument(
      '--log_dir',
      type=str,
      default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                           'tensorflow/mnist/logs/fully_connected_feed'),
      help='Directory to put the log data.'
  )
  parser.add_argument(
      '--fake_data',
      default=False,
      help='If true, uses fake data for unit testing.',
      action='store_true'
  )

  # Flags argparse does not recognise are forwarded to tf.app.run.
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

本篇教程的目的,是向大家展示如何利用TensorFlow使用(经典)MNIST数据集训练并评估一个用于识别手写数字的简易前馈神经网络(feed-forward neural network)。我们的目标读者,是有兴趣使用TensorFlow的资深机器学习人士。





| 文件 | 目的 |
| --- | --- |
| fully_connected_feed.py | 利用下载的数据集训练构建好的MNIST模型的主要代码,以数据反馈字典(feed dictionary)的形式作为输入模型。 |
| mnist.py | 构建一个完全连接(fully connected)的MNIST模型所需的代码。 |


python fully_connected_feed.py




更多相关信息,请查阅Chris Olah对MNIST的可视化探索。



data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)


| 数据集 | 目的 |
| --- | --- |
| data_sets.train | 55000个图像和标签(labels),作为主要训练集。 |
| data_sets.validation | 5000个图像和标签,用于迭代验证训练准确度。 |
| data_sets.test | 10000个图像和标签,用于最终测试训练准确度(trained accuracy)。 |

了解更多数据有关信息,请查阅此系列教程的数据下载 部分.

输入与占位符(Inputs and Placeholders)


# Placeholders for one batch of flattened images and their integer labels.
images_placeholder = tf.placeholder(tf.float32, shape=(batch_size,
                                                       IMAGE_PIXELS))
labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size))

在训练循环(training loop)的后续步骤中,传入的整个图像和标签数据集会被切片,以符合每一个操作所设置的batch_size值,占位符操作将会填补以符合这个batch_size值。然后使用feed_dict参数,将数据传入sess.run()函数。

构建图表 (Build the Graph)

在为数据创建占位符之后,就可以按照mnist.py文件中的三阶段模式函数——inference()、loss()和training()——来构建图表。

1.inference() —— 尽可能地构建好图表,满足促使神经网络向前反馈并做出预测的要求。

2.loss() —— 往inference图表中添加生成损失(loss)所需要的操作(ops)。

3.training() —— 往损失图表中添加计算并应用梯度(gradients)所需的操作。


inference()函数会尽可能地构建图表,做到返回包含了预测结果(output prediction)的Tensor。

它接受图像占位符为输入,在此基础上借助ReLu(Rectified Linear Units)激活函数,构建一对完全连接层(layers),以及一个有着十个节点(node)、指明了输出logits模型的线性层。


with tf.name_scope('hidden1') as scope:


# Weights start from a truncated normal scaled by 1/sqrt(fan-in); biases
# start at zero.
weights = tf.Variable(
    tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
                        stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
    name='weights')
biases = tf.Variable(tf.zeros([hidden1_units]),
                     name='biases')


每个变量在构建时,都会获得初始化操作(initializer ops)。

在这种最常见的情况下,通过tf.truncated_normal函数初始化权重变量,给赋予的shape则是一个二维tensor,其中第一个维度代表该层中权重变量所连接(connect from)的单元数量,第二个维度代表该层中权重变量所连接到的(connect to)单元数量。对于名叫hidden1的第一层,相应的维度则是[IMAGE_PIXELS, hidden1_units],因为权重变量将图像输入连接到了hidden1层。tf.truncated_normal初始函数将根据所得到的均值和标准差,生成一个随机分布。

然后,通过tf.zeros函数初始化偏差变量(biases),确保所有偏差的起始值都是0,而它们的shape则是其在该层中所接到的(connect to)单元数量。


hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
logits = tf.matmul(hidden2, weights) + biases




首先,labels_placeholder中的值,将被编码为一个含有1-hot values的Tensor。例如,如果类标识符为“3”,那么该值就会被转换为:
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

# NOTE(review): this snippet uses pre-1.0 TensorFlow calls (`tf.concat` with
# the axis first, `tf.pack`); the loss() in the mnist.py listing uses
# tf.nn.sparse_softmax_cross_entropy_with_logits on the integer labels instead.
batch_size = tf.size(labels)
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
# Pair each row index with its label to address one cell per example.
concated = tf.concat(1, [indices, labels])
# Scatter 1.0 into those cells of a [batch_size, NUM_CLASSES] tensor of 0.0.
onehot_labels = tf.sparse_to_dense(
    concated, tf.pack([batch_size, NUM_CLASSES]), 1.0, 0.0)

之后,又添加一个tf.nn.softmax_cross_entropy_with_logits操作,用来比较inference()函数与1-hot标签所输出的logits Tensor。

cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits,                                                        onehot_labels,                                                        name='xentropy')

然后,使用tf.reduce_mean函数,计算batch维度(第一维度)下交叉熵(cross entropy)的平均值,将该值作为总损失。

loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')


注意:交叉熵是信息理论中的概念,可以让我们描述如果基于已有事实,相信神经网络所做的推测最坏会导致什么结果。更多详情,请查阅博文《可视化信息理论》(<a rel="nofollow" href="http://colah.github.io/posts/2015-09-Visual-Information/">http://colah.github.io/posts/2015-09-Visual-Information/</a>)。


training()函数添加了通过梯度下降(gradient descent)将损失最小化所需的操作。

首先,该函数从loss()函数中获取损失Tensor,将其交给tf.scalar_summary,后者在与SummaryWriter(见下文)配合使用时,可以向事件文件(events file)中生成汇总值(summary values)。在本篇教程中,每次写入汇总值时,它都会输出损失Tensor的当前值(snapshot value)。

tf.scalar_summary(loss.op.name, loss)

接下来,我们实例化一个tf.train.GradientDescentOptimizer,负责按照所要求的学习率(learning rate)应用梯度(gradients)。

optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)

之后,我们生成一个变量用于保存全局训练步骤(global training step)的数值,并使用minimize()函数同时完成两件事:更新系统中的可训练权重(trainable weights),以及增加全局步骤的计数。根据惯例,这个操作被称为 train_op,是TensorFlow会话(session)触发一个完整训练步骤所必须运行的操作(见下文)。

# Counter of completed training steps; excluded from gradient updates.
global_step = tf.Variable(0, name='global_step', trainable=False)
# minimize() receives global_step so the counter is incremented as part of
# the same training op.
train_op = optimizer.minimize(loss, global_step=global_step)

最后,程序返回包含了训练操作(training op)输出结果的Tensor。





with tf.Graph().as_default():





sess = tf.Session()


with tf.Session() as sess:



# tf.global_variables_initializer() is the current name of the deprecated
# tf.initialize_all_variables(); it matches the full listing of
# fully_connected_feed.py.
init = tf.global_variables_initializer()
sess.run(init)





for step in xrange(max_steps):    sess.run(train_op)



执行每一步时,我们的代码会生成一个反馈字典(feed dictionary),其中包含对应步骤中训练所要使用的例子,这些例子的哈希键就是其所代表的占位符操作。


images_feed, labels_feed = data_set.next_batch(FLAGS.batch_size)


feed_dict = {    images_placeholder: images_feed,    labels_placeholder: labels_feed,}



在运行sess.run函数时,要在代码中明确其需要获取的两个值:[train_op, loss]

for step in xrange(FLAGS.max_steps):
    # Build the feed for this step from the training split.
    feed_dict = fill_feed_dict(data_sets.train,
                               images_placeholder,
                               labels_placeholder)
    # Fetch both ops; only the loss value is kept.
    _, loss_value = sess.run([train_op, loss],
                             feed_dict=feed_dict)

因为要获取这两个值,sess.run()会返回一个有两个元素的元组。其中每一个Tensor对象,对应了返回的元组中的numpy数组,而这些数组中包含了当前这步训练中对应Tensor的值。由于train_op并不会产生输出,其在返回的元组中的对应元素就是None,所以会被抛弃。但是,如果模型在训练中发散(diverge),loss Tensor的值可能会变成NaN,所以我们要获取它的值,并记录下来。


# Report progress every 100 steps; print() is called as a function, as in
# the full listing.
if step % 100 == 0:
    print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))


为了生成TensorBoard所使用的事件文件(events file),所有的汇总数据(summaries,在这里只有一个)都要在图表构建阶段合并至一个操作(op)中。

summary_op = tf.merge_all_summaries()


summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,                                        graph_def=sess.graph_def)


# Evaluate the merged summary op and append the result to the events file,
# tagged with the current step.
summary_str = sess.run(summary_op, feed_dict=feed_dict)
summary_writer.add_summary(summary_str, step)


MNIST TensorBoard



为了得到可以用来后续恢复模型以进一步训练或评估的检查点文件(checkpoint file),我们实例化一个tf.train.Saver

saver = tf.train.Saver()


saver.save(sess, FLAGS.train_dir, global_step=step)


saver.restore(sess, FLAGS.train_dir)



# Evaluate against the training, validation and test sets in turn; print()
# is called as a function, as in the full listing.
print('Training Data Eval:')
do_eval(sess,
        eval_correct,
        images_placeholder,
        labels_placeholder,
        data_sets.train)
print('Validation Data Eval:')
do_eval(sess,
        eval_correct,
        images_placeholder,
        labels_placeholder,
        data_sets.validation)
print('Test Data Eval:')
do_eval(sess,
        eval_correct,
        images_placeholder,
        labels_placeholder,
        data_sets.test)

注意,更复杂的使用场景通常是,先隔绝data_sets.test测试数据集,只有在大量的超参数优化调整(hyperparameter tuning)之后才进行检查。但是,由于MNIST问题比较简单,我们在这里一次性评估所有的数据。

构建评估图表(Eval Graph)


test_all_images, test_all_labels = get_data(train=False)


eval_correct = mnist.evaluation(logits, labels_placeholder)

evaluation函数会生成tf.nn.in_top_k 操作,如果在K个最有可能的预测中可以发现真实标签,那么这个操作就会将模型输出标记为正确。在本文中,我们把K的值设置为1,也就是只有在预测结果就是真实标签时,才判定它是正确的。

eval_correct = tf.nn.in_top_k(logits, labels, 1)

评估图表的输出(Eval Output)


for step in xrange(steps_per_epoch):
    feed_dict = fill_feed_dict(data_set,
                               images_placeholder,
                               labels_placeholder)
    # Accumulate the number of correct predictions across batches.
    true_count += sess.run(eval_correct, feed_dict=feed_dict)


# Fraction of evaluated examples that were predicted correctly; print() is
# called as a function, as in the full listing.
precision = float(true_count) / float(num_examples)
print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.02f' % (
    num_examples, true_count, precision))