JPG、PNG与MNIST数据集之间的转换

来源:互联网 发布:郑秀妍 知乎 编辑:程序博客网 时间:2024/06/06 07:44

最近用到手写识别,想起 TensorFlow 的 tutorial 里有个手写识别的教程,正好可以拿来用。但问题也很明显:教程里手写数据集的前期处理是自动完成的,如果想输入自己的手写图片,该如何做前期处理呢?

TensorFlow 教程用到的数据集是 MNIST。在数据集的官网上可以看到,MNIST 把图片和标签分别存成了 IDX 格式的二进制文件:图片文件后缀为 idx3-ubyte,标签文件后缀为 idx1-ubyte

同时在简书上找到这篇文章:利用 python 解析MNIST数据集(侵删),这篇文章的整体思路是用 python 自带的 struct 包来进行二进制文件的操作,代码如下:

"""Decode the MNIST handwritten-digit IDX files into NumPy arrays and preview them.

See the MNIST website and the comments below for the format details.

========================
IDX file format rules:
========================
THE IDX FILE FORMAT

the IDX file format is a simple format for vectors and multidimensional
matrices of various numerical types.

The basic format is

    magic number
    size in dimension 0
    size in dimension 1
    size in dimension 2
    .....
    size in dimension N
    data

The magic number is an integer (MSB first). The first 2 bytes are always 0.

The third byte codes the type of the data:
    0x08: unsigned byte
    0x09: signed byte
    0x0B: short (2 bytes)
    0x0C: int (4 bytes)
    0x0D: float (4 bytes)
    0x0E: double (8 bytes)

The 4-th byte codes the number of dimensions of the vector/matrix:
1 for vectors, 2 for matrices....

The sizes in each dimension are 4-byte integers (MSB first, high endian,
like in most non-Intel processors).

The data is stored like in a C array, i.e. the index in the last dimension
changes the fastest.
"""
import numpy as np
import struct

# Training-set image file
train_images_idx3_ubyte_file = '../../data/mnist/bin/train-images.idx3-ubyte'
# Training-set label file
train_labels_idx1_ubyte_file = '../../data/mnist/bin/train-labels.idx1-ubyte'

# Test-set image file
test_images_idx3_ubyte_file = '../../data/mnist/bin/t10k-images.idx3-ubyte'
# Test-set label file
test_labels_idx1_ubyte_file = '../../data/mnist/bin/t10k-labels.idx1-ubyte'


def decode_idx3_ubyte(idx3_ubyte_file):
    """
    Generic parser for idx3 (image) files.

    :param idx3_ubyte_file: path of the idx3 file
    :return: float array of shape (num_images, num_rows, num_cols)
    """
    # Read the whole file; the context manager closes the handle promptly.
    with open(idx3_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number, image count, rows per image, columns per image
    # (four big-endian 32-bit integers).
    offset = 0
    fmt_header = '>iiii'
    magic_number, num_images, num_rows, num_cols = struct.unpack_from(fmt_header, bin_data, offset)
    print('魔数:%d, 图片数量: %d张, 图片大小: %d*%d' % (magic_number, num_images, num_rows, num_cols))

    # Pixel data: one unsigned byte per pixel, image after image.
    image_size = num_rows * num_cols
    offset += struct.calcsize(fmt_header)
    # Compile the per-image format once instead of re-parsing it per image.
    image_struct = struct.Struct('>' + str(image_size) + 'B')
    images = np.empty((num_images, num_rows, num_cols))
    for i in range(num_images):
        if (i + 1) % 10000 == 0:
            print('已解析 %d' % (i + 1) + '张')
        images[i] = np.array(image_struct.unpack_from(bin_data, offset)).reshape((num_rows, num_cols))
        offset += image_struct.size
    return images


def decode_idx1_ubyte(idx1_ubyte_file):
    """
    Generic parser for idx1 (label) files.

    :param idx1_ubyte_file: path of the idx1 file
    :return: float array of shape (num_items,)
    """
    # Read the whole file; the context manager closes the handle promptly.
    with open(idx1_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number and item count (two big-endian 32-bit integers).
    offset = 0
    fmt_header = '>ii'
    magic_number, num_images = struct.unpack_from(fmt_header, bin_data, offset)
    print('魔数:%d, 图片数量: %d张' % (magic_number, num_images))

    # Label data: one unsigned byte per item.
    offset += struct.calcsize(fmt_header)
    label_struct = struct.Struct('>B')
    labels = np.empty(num_images)
    for i in range(num_images):
        if (i + 1) % 10000 == 0:
            print('已解析 %d' % (i + 1) + '张')
        labels[i] = label_struct.unpack_from(bin_data, offset)[0]
        offset += label_struct.size
    return labels


def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
    """
    TRAINING SET IMAGE FILE (train-images-idx3-ubyte):
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000803(2051) magic number
    0004     32 bit integer  60000            number of images
    0008     32 bit integer  28               number of rows
    0012     32 bit integer  28               number of columns
    0016     unsigned byte   ??               pixel
    0017     unsigned byte   ??               pixel
    ........
    xxxx     unsigned byte   ??               pixel
    Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).

    :param idx_ubyte_file: path of the idx file
    :return: n*row*col np.array, where n is the number of images
    """
    return decode_idx3_ubyte(idx_ubyte_file)


def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
    """
    TRAINING SET LABEL FILE (train-labels-idx1-ubyte):
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000801(2049) magic number (MSB first)
    0004     32 bit integer  60000            number of items
    0008     unsigned byte   ??               label
    0009     unsigned byte   ??               label
    ........
    xxxx     unsigned byte   ??               label
    The labels values are 0 to 9.

    :param idx_ubyte_file: path of the idx file
    :return: length-n np.array, where n is the number of items
    """
    return decode_idx1_ubyte(idx_ubyte_file)


def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
    """
    TEST SET IMAGE FILE (t10k-images-idx3-ubyte):
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000803(2051) magic number
    0004     32 bit integer  10000            number of images
    0008     32 bit integer  28               number of rows
    0012     32 bit integer  28               number of columns
    0016     unsigned byte   ??               pixel
    0017     unsigned byte   ??               pixel
    ........
    xxxx     unsigned byte   ??               pixel
    Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).

    :param idx_ubyte_file: path of the idx file
    :return: n*row*col np.array, where n is the number of images
    """
    return decode_idx3_ubyte(idx_ubyte_file)


def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
    """
    TEST SET LABEL FILE (t10k-labels-idx1-ubyte):
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000801(2049) magic number (MSB first)
    0004     32 bit integer  10000            number of items
    0008     unsigned byte   ??               label
    0009     unsigned byte   ??               label
    ........
    xxxx     unsigned byte   ??               label
    The labels values are 0 to 9.

    :param idx_ubyte_file: path of the idx file
    :return: length-n np.array, where n is the number of items
    """
    return decode_idx1_ubyte(idx_ubyte_file)


def run():
    """Load the training set and display its first ten images with their labels."""
    # Imported here so the decoders above stay usable without matplotlib
    # or a display backend.
    import matplotlib.pyplot as plt

    train_images = load_train_images()
    train_labels = load_train_labels()
    # test_images = load_test_images()
    # test_labels = load_test_labels()

    # Show the first ten samples and their labels to check the decoding.
    for i in range(10):
        print(train_labels[i])
        plt.imshow(train_images[i], cmap='gray')
        plt.show()
    print('done')


if __name__ == '__main__':
    run()

代码整体不难,主要用的是 struct.unpack_from() 这个函数,函数的参数可以根据二进制文件的格式做调整

通过这篇文章,知道了可以通过 struct.pack() 和 struct.unpack() 分别来编码和解码二进制文件,提供了一种转换的思路,但仍然还不够好

随后在 GitHub 上发现了 JPG-PNG-to-MNIST-NN-Format 这个代码,用 PIL 里的 Image 模块来进行处理,代码如下:

import os
import struct
import sys
from array import *
from random import shuffle

from PIL import Image

# Convert directories of labelled PNG images into MNIST-style IDX files.
#
# Expected layout: <source dir>/<integer label>/<anything>.png
# Each entry below is [source directory, output file prefix].
Names = [['./training-images', 'train'], ['./test-images', 'test']]

for name in Names:

    data_image = array('B')
    data_label = array('B')

    # Collect every .png below the per-label sub-directories. Stray files
    # such as .DS_Store are skipped by the directory check rather than by
    # dropping the first listing entry, which would lose a real class
    # directory whenever no such file exists.
    FileList = []
    for dirname in sorted(os.listdir(name[0])):
        path = os.path.join(name[0], dirname)
        if not os.path.isdir(path):
            continue
        for filename in os.listdir(path):
            if filename.endswith(".png"):
                FileList.append(os.path.join(path, filename))

    if not FileList:
        # Without at least one image the row/column header fields are unknown.
        sys.stderr.write('No .png files found under %s; skipping\n' % name[0])
        continue

    shuffle(FileList)  # Useful for further segmenting the validation set

    width = height = None
    for filename in FileList:
        # The label is the name of the directory holding the image; taking
        # it from the path components works with any path separator.
        label = int(os.path.basename(os.path.dirname(filename)))

        with Image.open(filename) as Im:
            # Force 8-bit grayscale so every pixel is exactly one byte.
            gray = Im.convert('L')
            if width is None:
                width, height = gray.size
            elif gray.size != (width, height):
                # IDX stores a single image size for the whole file.
                raise ValueError('Image %s is %dx%d, expected %dx%d'
                                 % (filename, gray.size[0], gray.size[1], width, height))
            # tobytes() yields the pixels row by row, the order IDX requires.
            data_image.extend(bytearray(gray.tobytes()))

        data_label.append(label)  # labels start (one unsigned byte each)

    # IDX headers: big-endian 32-bit fields. Using full 32-bit fields removes
    # the 256-pixel and 65535-file limits of hand-packed single bytes.
    num_items = len(FileList)
    # 0x00000801: unsigned-byte data, 1 dimension (labels).
    label_header = struct.pack('>II', 0x00000801, num_items)
    # 0x00000803: unsigned-byte data, 3 dimensions (count, rows, columns).
    image_header = struct.pack('>IIII', 0x00000803, num_items, height, width)

    with open(name[1] + '-images.idx3-ubyte', 'wb') as output_file:
        output_file.write(image_header)
        data_image.tofile(output_file)

    with open(name[1] + '-labels.idx1-ubyte', 'wb') as output_file:
        output_file.write(label_header)
        data_label.tofile(output_file)

# gzip resulting files

# for name in Names:
#     os.system('gzip ' + name[1] + '-images.idx3-ubyte')
#     os.system('gzip ' + name[1] + '-labels.idx1-ubyte')

效果还不错,代码也蛮简单的,于是根据以上的代码做了自己的修改:

import os
import struct
import sys
from array import *
from random import shuffle

from PIL import Image


def changeFile(fileName):
    """
    Convert one image file into a single-image MNIST-style idx3-ubyte file.

    The image is converted to 8-bit grayscale and written next to the input,
    using the input path with its extension replaced by ``.idx3-ubyte``.

    :param fileName: path of the image to convert (any format PIL can open)
    :return: None
    """
    imageData = array('B')

    with Image.open(fileName) as Im:
        # Force 8-bit grayscale so every pixel is exactly one byte.
        gray = Im.convert('L')
        width, height = gray.size
        # tobytes() yields the pixels row by row, the order IDX requires.
        imageData.extend(bytearray(gray.tobytes()))

    # IDX header, four big-endian 32-bit fields:
    #   0x00000803 - magic number (unsigned-byte data, 3 dimensions)
    #   1          - number of images in the file
    #   height     - number of rows
    #   width      - number of columns
    # Full 32-bit fields remove the 256-pixel limit of single-byte packing.
    header = struct.pack('>IIII', 0x00000803, 1, height, width)

    # splitext only strips the final extension, so paths such as './img.png'
    # or 'scan.v2.png' keep their directory and stem intact.
    outPutPath = os.path.splitext(fileName)[0] + '.idx3-ubyte'
    with open(outPutPath, 'wb') as outPutFile:
        outPutFile.write(header)
        imageData.tofile(outPutFile)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: %s IMAGE_FILE' % sys.argv[0])
    fileName = sys.argv[1]
    changeFile(fileName)

就有了一个转换单张图片的 demo,后续如果有更多需求的话会再做修改

原创粉丝点击