tensorflow 数据读取

来源：互联网发布：java工程师工作累不累编辑：程序博客网时间：2024/06/04 19:03

tf支持三种方式读取数据：

Feeding：训练或测试的时候通过placeholder提供数据给计算图。
文件读取：通过管道在训练开始的时候读取数据。
预加载数据：预先加载到图中，适用于少量数据，使用的比较少。

Feeding

在计算图中定义placeholder，通过feed_dict将数据填充到placeholder。

# Placeholders are graph inputs whose values are supplied at run time.
xs = tf.placeholder(tf.float32, [None, 1])
ys = tf.placeholder(tf.float32, [None, 1])

# ... (model definition elided in the original article; it must define
# `loss`, and `x_data` / `y_data` must hold the training arrays)

train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

# tf.initialize_all_variables() is deprecated; use the same initializer as
# the other examples in this document.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for i in range(10000):
    # feed_dict binds concrete arrays to the placeholders for this step.
    sess.run(train_step, feed_dict={xs: x_data, ys: y_data})

文件读取

tf内建了三个文件数据读取类：
1. tf.TFRecordReader：读取TFRecord文件
2. tf.FixedLengthRecordReader：读取固定长度格式文件
3. tf.TextLineReader：读取文本文件，如csv

从文件读取的方式不同于使用feed_dict，batch数据的生成是在graph中完成的。

tf.TFRecordReader

TFRecord的写入和读取

def tf_writter(img_collection, save_to, resize_to=None):
    """Convert a list of labelled images into a single TFRecord file.

    Args:
        img_collection: Path to a text file with one sample per line in the
            form ``<image path> <integer label>`` (fields separated by a
            single space).
        save_to: Path of the TFRecord file to create.
        resize_to: Optional size passed to ``Image.resize``; when None the
            images keep their original size.

    Images that cannot be opened are reported on stdout and skipped.

    NOTE(review): ``_int64_feature`` and ``_bytes_feature`` are not defined
    in this excerpt; presumably they wrap a value in a ``tf.train.Feature``.
    """
    # Read the listing up front so the file handle is closed deterministically.
    with open(img_collection) as listing:
        lines = listing.readlines()

    writer = tf.python_io.TFRecordWriter(save_to)
    try:
        for idx, line in enumerate(lines):
            if not line.strip():
                # Blank line (e.g. a trailing newline): nothing to write.
                continue
            splited_lines = line.strip('\n').split(' ')
            img_path = splited_lines[0]
            img_label = splited_lines[1]
            try:
                image = Image.open(img_path)
            except Exception as x:
                # Best effort: report the unreadable image and keep going.
                print(str(x) + str(line))
                continue
            if resize_to is not None:
                image = image.resize(resize_to)
            # The record below declares depth 3, and tf_reader reshapes to
            # [height, width, depth]; force three channels so grayscale,
            # palette or RGBA inputs do not produce a mismatched buffer.
            image = image.convert('RGB')
            image_raw = image.tobytes()
            label = int(img_label)
            example = tf.train.Example(features=tf.train.Features(feature={
                'height': _int64_feature(image.height),
                'width': _int64_feature(image.width),
                'depth': _int64_feature(3),  # 3 for rgb image
                'label': _int64_feature(label),
                'image_raw': _bytes_feature(image_raw)
            }))
            writer.write(example.SerializeToString())
            if idx % 1000 == 0:
                print(str(idx) + " writed: " + splited_lines[0])
    finally:
        # Close the writer even if a record fails half way through.
        writer.close()


def tf_reader(record_path, image_save_to, count_record):
    """Read records back from a TFRecord file and save them as JPEG files.

    Args:
        record_path: Path of the TFRecord file written by ``tf_writter``.
        image_save_to: Prefix prepended to every output file name; files are
            named ``<image_save_to><index>_label_<label>.jpg``.
        count_record: Number of records to read and save.
    """
    filenames = [record_path]
    filename_queue = tf.train.string_input_producer(filenames)
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since all keys are required.
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'height': tf.FixedLenFeature([], tf.int64),
            'depth': tf.FixedLenFeature([], tf.int64)
        })
    image_raw = tf.decode_raw(features['image_raw'], tf.uint8)
    label = tf.cast(features['label'], tf.int32)
    height = tf.cast(features['height'], tf.int32)
    width = tf.cast(features['width'], tf.int32)
    depth = tf.cast(features['depth'], tf.int32)
    # Rebuild the image from the flat byte vector using the stored shape.
    image = tf.reshape(image_raw, [height, width, depth])

    with tf.Session() as session:
        # The original called tf.initialize_all_tables(), which only touches
        # lookup tables; initialize variables the same way as the MNIST
        # example in this document.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        session.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            for i in range(count_record):
                example_image, example_label = session.run([image, label])
                img = Image.fromarray(example_image, 'RGB')
                img.save(image_save_to + str(i) + '_label_'
                         + str(example_label) + '.jpg')
                print(example_label)
        finally:
            # Always stop and join the queue threads, even when a read fails.
            coord.request_stop()
            coord.join(threads)

使用TFRecord作为训练数据（参考mnist官方代码）

def read_and_decode(filename_queue):
    """Parse one serialized MNIST example taken from the filename queue.

    Args:
        filename_queue: Queue of TFRecord file names to read from.

    Returns:
        A tuple (image, label): image is a float32 tensor of shape
        [mnist.IMAGE_PIXELS] scaled into [-0.5, 0.5]; label is an int32
        scalar tensor.
    """
    record_reader = tf.TFRecordReader()
    _, serialized = record_reader.read(filename_queue)

    # Both keys are required, so no default values are given.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized, features=feature_spec)

    # The scalar byte string becomes a flat uint8 vector of
    # mnist.IMAGE_PIXELS entries. No reshape to 28x28 is done because the
    # next stage expects a flattened vector and no distortions are applied.
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels.set_shape([mnist.IMAGE_PIXELS])

    # Rescale [0, 255] -> [-0.5, 0.5] floats.
    scaled = tf.cast(pixels, tf.float32) * (1. / 255) - 0.5
    return scaled, tf.cast(parsed['label'], tf.int32)


def inputs(train, batch_size, num_epochs):
    """Build the queued input pipeline that reads the data num_epochs times.

    Args:
        train: True selects the training file, False the validation file.
        batch_size: Number of examples per returned batch.
        num_epochs: Number of passes over the input data; 0 or None means
            the data is read forever.

    Returns:
        A tuple (images, labels):
        * images is a float tensor of shape [batch_size, mnist.IMAGE_PIXELS]
          in the range [-0.5, 0.5].
        * labels is an int32 tensor of shape [batch_size] holding the true
          label, a number in the range [0, mnist.NUM_CLASSES).

        A tf.train.QueueRunner is added to the graph, so the caller must
        start it, e.g. with tf.train.start_queue_runners().
    """
    # A falsy epoch count (0 or None) means "no limit".
    epoch_limit = num_epochs or None
    record_path = os.path.join(FLAGS.train_dir,
                               TRAIN_FILE if train else VALIDATION_FILE)

    with tf.name_scope('input'):
        # The filename queue is shared even when several threads read from it.
        filename_queue = tf.train.string_input_producer(
            [record_path], num_epochs=epoch_limit)
        single_image, single_label = read_and_decode(filename_queue)

        # Two threads fill the shuffling queue so batching does not become
        # the bottleneck; min_after_dequeue guarantees a minimum amount of
        # shuffling.
        image_batch, label_batch = tf.train.shuffle_batch(
            [single_image, single_label],
            batch_size=batch_size,
            num_threads=2,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000)
        return image_batch, label_batch


def run_training():
    """Build the MNIST graph and run training steps until the input ends.

    The loop stops when the coordinator requests a stop or when the input
    queue raises OutOfRangeError after the configured number of epochs.
    """
    with tf.Graph().as_default():
        # Input pipeline, model, loss and training op, in that order.
        image_batch, label_batch = inputs(train=True,
                                          batch_size=FLAGS.batch_size,
                                          num_epochs=FLAGS.num_epochs)
        logits = mnist.inference(image_batch, FLAGS.hidden1, FLAGS.hidden2)
        loss = mnist.loss(logits, label_batch)
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Local variables are initialized too: the epoch counter is one.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        session = tf.Session()
        session.run(init_op)

        # Start the threads that feed the input queues.
        coord = tf.train.Coordinator()
        workers = tf.train.start_queue_runners(sess=session, coord=coord)

        steps_done = 0
        try:
            while not coord.should_stop():
                tic = time.time()
                # The value returned for train_op is discarded; only the
                # loss is kept for reporting.
                _, loss_value = session.run([train_op, loss])
                elapsed = time.time() - tic

                if steps_done % 100 == 0:
                    print('Step %d: loss = %.2f (%.3f sec)' % (steps_done,
                                                               loss_value,
                                                               elapsed))
                steps_done += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs,
                                                              steps_done))
        finally:
            # Ask the input threads to stop, whatever ended the loop.
            coord.request_stop()

        # Wait for the threads to finish before releasing the session.
        coord.join(workers)
        session.close()

tf.FixedLengthRecordReader ：（参考 cifar10的官方代码）

cifar10_single_gpu_train.py:

def train():
    """Build the CIFAR-10 training graph: inputs, model, loss and train op."""
    with tf.Graph().as_default():
        step_counter = tf.contrib.framework.get_or_create_global_step()

        # The input pipeline is pinned to CPU:0 so its ops do not
        # occasionally land on the GPU and slow training down.
        with tf.device('/cpu:0'):
            image_batch, label_batch = cifar10.distorted_inputs()

        # Logits from the inference model, then the loss on those logits.
        predictions = cifar10.inference(image_batch)
        total_loss = cifar10.loss(predictions, label_batch)

        # Op that trains the model on one batch and updates the parameters.
        train_op = cifar10.train(total_loss, step_counter)

cifar10.py

def distorted_inputs():
    """Build the distorted CIFAR training input using the Reader ops.

    Returns:
        images: 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
        labels: 1D tensor of [batch_size] size.

    Raises:
        ValueError: If FLAGS.data_dir is not set.
    """
    if not FLAGS.data_dir:
        raise ValueError('Please supply a data_dir')

    binaries_dir = os.path.join(FLAGS.data_dir, 'cifar-10-batches-bin')
    image_batch, label_batch = cifar10_input.distorted_inputs(
        data_dir=binaries_dir, batch_size=FLAGS.batch_size)

    if not FLAGS.use_fp16:
        return image_batch, label_batch

    # Half-precision mode: both tensors are cast to float16.
    image_batch = tf.cast(image_batch, tf.float16)
    label_batch = tf.cast(label_batch, tf.float16)
    return image_batch, label_batch

cifar10_input.py

def distorted_inputs(data_dir, batch_size):
    """Construct distorted input for CIFAR training using the Reader ops.

    Args:
        data_dir: Path to the CIFAR-10 data directory.
        batch_size: Number of images per batch.

    Returns:
        images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
        labels: Labels. 1D tensor of [batch_size] size.

    Raises:
        ValueError: If one of the five data_batch_N.bin files is missing.
    """
    # range() instead of the Python-2-only xrange(); five elements only.
    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
                 for i in range(1, 6)]
    for f in filenames:
        if not tf.gfile.Exists(f):
            raise ValueError('Failed to find file: ' + f)

    # Create a queue that produces the filenames to read.
    filename_queue = tf.train.string_input_producer(filenames)

    # Read examples from files in the filename queue.
    read_input = read_cifar10(filename_queue)
    reshaped_image = tf.cast(read_input.uint8image, tf.float32)

    height = IMAGE_SIZE
    width = IMAGE_SIZE

    # Image processing for training the network. Note the many random
    # distortions applied to the image.

    # Randomly crop a [height, width] section of the image.
    distorted_image = tf.random_crop(reshaped_image, [height, width, 3])

    # Randomly flip the image horizontally.
    distorted_image = tf.image.random_flip_left_right(distorted_image)

    # Because these operations are not commutative, consider randomizing
    # the order of their operation.
    # NOTE: since per_image_standardization zeros the mean and makes
    # the stddev unit, this likely has no effect see tensorflow#1458.
    distorted_image = tf.image.random_brightness(distorted_image,
                                                 max_delta=63)
    distorted_image = tf.image.random_contrast(distorted_image,
                                               lower=0.2, upper=1.8)

    # Subtract off the mean and divide by the standard deviation of the
    # pixels.
    float_image = tf.image.per_image_standardization(distorted_image)

    # Set the shapes of tensors.
    float_image.set_shape([height, width, 3])
    read_input.label.set_shape([1])

    # Ensure that the random shuffling has good mixing properties.
    min_fraction_of_examples_in_queue = 0.4
    min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
                             min_fraction_of_examples_in_queue)
    print('Filling queue with %d CIFAR images before starting to train. '
          'This will take a few minutes.' % min_queue_examples)

    # Generate a batch of images and labels by building up a queue of examples.
    return _generate_image_and_label_batch(float_image, read_input.label,
                                           min_queue_examples, batch_size,
                                           shuffle=True)


def read_cifar10(filename_queue):
    """Reads and parses examples from CIFAR10 data files.

    Recommendation: if you want N-way read parallelism, call this function
    N times.  This will give you N independent Readers reading different
    files & positions within those files, which will give better mixing of
    examples.

    Args:
        filename_queue: A queue of strings with the filenames to read from.

    Returns:
        An object representing a single example, with the following fields:
            height: number of rows in the result (32)
            width: number of columns in the result (32)
            depth: number of color channels in the result (3)
            key: a scalar string Tensor describing the filename & record
                number for this example.
            label: an int32 Tensor with the label in the range 0..9.
            uint8image: a [height, width, depth] uint8 Tensor with the
                image data
    """

    class CIFAR10Record(object):
        pass

    result = CIFAR10Record()

    # Dimensions of the images in the CIFAR-10 dataset.
    # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
    # input format.
    label_bytes = 1  # 2 for CIFAR-100
    result.height = 32
    result.width = 32
    result.depth = 3
    image_bytes = result.height * result.width * result.depth
    # Every record consists of a label followed by the image, with a
    # fixed number of bytes for each.
    record_bytes = label_bytes + image_bytes

    # Read a record, getting filenames from the filename_queue.  No
    # header or footer in the CIFAR-10 format, so we leave header_bytes
    # and footer_bytes at their default of 0.
    reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
    result.key, value = reader.read(filename_queue)

    # Convert from a string to a vector of uint8 that is record_bytes long.
    record_bytes = tf.decode_raw(value, tf.uint8)

    # The first bytes represent the label, which we convert from uint8->int32.
    result.label = tf.cast(
        tf.strided_slice(record_bytes, [0], [label_bytes]), tf.int32)

    # The remaining bytes after the label represent the image, which we reshape
    # from [depth * height * width] to [depth, height, width].
    depth_major = tf.reshape(
        tf.strided_slice(record_bytes, [label_bytes],
                         [label_bytes + image_bytes]),
        [result.depth, result.height, result.width])
    # Convert from [depth, height, width] to [height, width, depth].
    result.uint8image = tf.transpose(depth_major, [1, 2, 0])

    return result


def _generate_image_and_label_batch(image, label, min_queue_examples,
                                    batch_size, shuffle):
    """Construct a queued batch of images and labels.

    Args:
        image: 3-D Tensor of [height, width, 3] of type.float32.
        label: 1-D Tensor of type.int32
        min_queue_examples: int32, minimum number of samples to retain
            in the queue that provides of batches of examples.
        batch_size: Number of images per batch.
        shuffle: boolean indicating whether to use a shuffling queue.

    Returns:
        images: Images. 4D tensor of [batch_size, height, width, 3] size.
        labels: Labels. 1D tensor of [batch_size] size.
    """
    # Create a queue that shuffles the examples, and then
    # read 'batch_size' images + labels from the example queue.
    num_preprocess_threads = 16
    if shuffle:
        images, label_batch = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            num_threads=num_preprocess_threads,
            capacity=min_queue_examples + 3 * batch_size,
            min_after_dequeue=min_queue_examples)
    else:
        images, label_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            num_threads=num_preprocess_threads,
            capacity=min_queue_examples + 3 * batch_size)

    # Display the training images in the visualizer.
    tf.summary.image('images', images)

    # Labels arrive as [batch_size, 1]; flatten them to [batch_size].
    return images, tf.reshape(label_batch, [batch_size])

tf.TextLineReader

csv文件中既可以保存特征，也可以保存图像地址。对于图像而言，如果将二进制图像直接保存在csv文件中，会使得csv文件过于庞大。下面的例子从csv文件中获取图像路径，再读取图像：

def read_from_csv(data_dir, csv_collection_file):
    """Build a shuffled batch pipeline from a listing of image paths.

    Each line of the listing file holds ``<image path> <label>`` separated
    by a single space. The images are decoded as 3-channel PNGs.

    Args:
        data_dir: Directory that contains the listing file.
        csv_collection_file: Name of the listing file inside ``data_dir``.

    Returns:
        A tuple (images, labels_batch) produced by tf.train.shuffle_batch.

    NOTE(review): ``IMAGE_SIZE`` and ``batch_size`` are not defined in this
    excerpt; presumably they are module-level values -- confirm.
    """
    filename = os.path.join(data_dir, csv_collection_file)

    # TextLineReader takes *file names* from its queue and yields one line of
    # that file per read. The original queued the lines themselves, which the
    # reader would then have tried to open as files.
    filename_queue = tf.train.string_input_producer([filename], shuffle=True)
    reader = tf.TextLineReader()
    _, value = reader.read(filename_queue)

    # The original bound the second field to `labels` but then read `label`
    # (NameError), and used `dir`, which shadows the builtin.
    image_path, label = tf.decode_csv(
        records=value, record_defaults=[["string"], [""]], field_delim=" ")
    label = tf.string_to_number(label, tf.int32)

    imagecontent = tf.read_file(image_path)
    image = tf.image.decode_png(imagecontent, channels=3, dtype=tf.uint8)
    image = tf.cast(image, tf.float32)

    # Rescale each channel so that its mean becomes 128: divide by the
    # per-channel mean (taken over height and width), then multiply by 128.
    rshape = tf.reshape(tf.reduce_mean(image, [0, 1]), [1, 1, 3])
    image = image / rshape * 128

    image = tf.random_crop(image, [IMAGE_SIZE, IMAGE_SIZE, 3])
    images, labels_batch = tf.train.shuffle_batch(
        [image, label], batch_size=batch_size, num_threads=6,
        capacity=3 * batch_size + 3000, min_after_dequeue=3000)
    return images, labels_batch

综上，从文件中读取batch数据，总体的流程分为如下几步：
1. 创建输入管道，向计算图中添加queue和对应的QueueRunner，可以通过如下几个函数实现：

tf.train.match_filenames_once
tf.train.limit_epochs
tf.train.input_producer
tf.train.range_input_producer
tf.train.slice_input_producer
tf.train.string_input_producer

2. 提供解析图像数据和label的op，如下面代码中的 read_my_file_format(filename_queue) 方法，此时可以对图像做一些预处理操作。
3. 生成批量数据以供训练和预测，可以使用如下api：

tf.train.batch
tf.train.maybe_batch
tf.train.batch_join
tf.train.maybe_batch_join
tf.train.shuffle_batch
tf.train.maybe_shuffle_batch
tf.train.shuffle_batch_join
tf.train.maybe_shuffle_batch_join

4. 使用QueueRunner创建多线程加载数据。第一步和第三步中的很多tf.train方法会向graph中添加tf.train.QueueRunner对象，因此需要在训练之前执行tf.train.start_queue_runners，否则程序将挂起一直等待。最好的方式是配合tf.train.Coordinator一起使用。

def read_my_file_format(filename_queue):
    """Template: read and decode one example from the filename queue.

    ``tf.SomeReader``, ``tf.some_decoder`` and ``some_processing`` look like
    stand-ins to be replaced with a concrete reader, decoder and
    preprocessing step -- they are not defined in this document.

    Args:
        filename_queue: Queue of file names to read from.

    Returns:
        A tuple (processed_example, label).
    """
    reader = tf.SomeReader()
    key, record_string = reader.read(filename_queue)
    example, label = tf.some_decoder(record_string)
    processed_example = some_processing(example)
    return processed_example, label


def input_pipeline(filenames, batch_size, num_epochs=None):
    """Build a shuffled, batched input pipeline over the given files.

    Args:
        filenames: List of file names to read.
        batch_size: Number of examples per batch.
        num_epochs: Passed to tf.train.string_input_producer as the epoch
            limit; defaults to None.

    Returns:
        A tuple (example_batch, label_batch).
    """
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs, shuffle=True)
    example, label = read_my_file_format(filename_queue)
    # min_after_dequeue defines how big a buffer we will randomly sample
    #   from -- bigger means better shuffling but slower start up and more
    #   memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    #   determines the maximum we will prefetch.  Recommendation:
    #   min_after_dequeue + (num_threads + a small safety margin) * batch_size
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * batch_size
    example_batch, label_batch = tf.train.shuffle_batch(
        [example, label], batch_size=batch_size, capacity=capacity,
        min_after_dequeue=min_after_dequeue)
    return example_batch, label_batch


# Create the graph, etc. (`train_op` must be defined by that graph.)
# The epoch counter created for num_epochs is a *local* variable, so local
# variables are initialized together with the global ones, as in the MNIST
# example earlier in this document.
init_op = tf.group(tf.global_variables_initializer(),
                   tf.local_variables_initializer())

# Create a session for running operations in the Graph.
sess = tf.Session()

# Initialize the variables (like the epoch counter).
sess.run(init_op)

# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)

try:
    while not coord.should_stop():
        # Run training steps or whatever
        sess.run(train_op)
except tf.errors.OutOfRangeError:
    print('Done training -- epoch limit reached')
finally:
    # When done, ask the threads to stop.
    coord.request_stop()

# Wait for threads to finish.
coord.join(threads)
sess.close()

预加载

适用于数据量少，可以将数据全部导入内存的情况。

# Preloading: the whole (small) data set is held in memory and wrapped in
# graph constants. The `...` stand for the actual data and labels.
training_data = ...
training_labels = ...
with tf.Session():
  # Wrap the in-memory data and labels as constant tensors.
  input_data = tf.constant(training_data)
  input_labels = tf.constant(training_labels)

参考资料：

https://www.tensorflow.org/api_guides/python/reading_data#feeding
https://www.tensorflow.org/programmers_guide/threading_and_queues

阅读全文

0 0