Udacity Assignment: TensorFlow notMNIST Code and Output (Udacity Study Notes)
In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt  # plotting module
import numpy as np               # array/matrix module
import os
import sys
import tarfile                   # archive-extraction module
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression  # logistic-regression classifier
from six.moves.urllib.request import urlretrieve     # download helper
from six.moves import cPickle as pickle              # serialization module
# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
In [2]:
num_classes = 10
np.random.seed(133)

# Build the list of per-class folder names
def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  data_folders = [
      os.path.join(root, d) for d in sorted(os.listdir(root))
      if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
        'Expected %d folders, one per class. Found %d instead.' % (
            num_classes, len(data_folders)))
  return data_folders

# Local folders that hold the notMNIST data
train_filename = '/home/zlong/workspace/udacity/notMNIST/notMNIST_large'
test_filename = '/home/zlong/workspace/udacity/notMNIST/notMNIST_small'
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)
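tarfile and urlretrieve are imported in the first cell but never used here, because the archives were downloaded and unpacked by hand into the paths above. For reference, a minimal sketch of the skipped extraction step (the .tar.gz file names are assumptions, matching the course's archive names):

def extract(tar_path, dest='.'):
  # unpack e.g. notMNIST_large.tar.gz into dest
  with tarfile.open(tar_path) as tar:
    tar.extractall(dest)

# extract('notMNIST_large.tar.gz')
# extract('notMNIST_small.tar.gz')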
In [3]:
# Problem 1: Display a sample of the images that we just downloaded
nums_image_show = 2  # number of images to show per class
for index_class in range(num_classes):  # index_class runs from 0 to 9
  imagename_list = os.listdir(train_folders[index_class])
  imagename_list_indice = imagename_list[0:nums_image_show]
  for index_image in range(nums_image_show):
    path = train_folders[index_class] + '/' + imagename_list_indice[index_image]
    display(Image(filename=path))
In [4]:
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
  """Load the data for a single letter label."""
  image_files = os.listdir(folder)
  dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                       dtype=np.float32)
  print(folder)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)
    try:
      image_data = (ndimage.imread(image_file).astype(float) -
                    pixel_depth / 2) / pixel_depth
      if image_data.shape != (image_size, image_size):
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      dataset[num_images, :, :] = image_data
      num_images = num_images + 1
    except IOError as e:
      print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

  dataset = dataset[0:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (num_images, min_num_images))

  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset

def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  dataset_names = []
  for folder in data_folders:
    set_filename = folder + '.pickle'
    dataset_names.append(set_filename)
    if os.path.exists(set_filename) and not force:
      # You may override by setting force=True.
      print('%s already present - Skipping pickling.' % set_filename)
    else:
      print('Pickling %s.' % set_filename)
      dataset = load_letter(folder, min_num_images_per_class)
      try:
        with open(set_filename, 'wb') as f:
          pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
      except Exception as e:
        print('Unable to save data to', set_filename, ':', e)
  return dataset_names

train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)
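A portability note: scipy.ndimage.imread, used above, was deprecated in SciPy 1.0 and removed in later releases. On a newer stack the same normalized read can be written with the imageio package (an assumption here: imageio is installed), keeping the centering and scaling from load_letter:

import imageio

def read_normalized(image_file, pixel_depth=255.0):
  # same centering/scaling as load_letter, via imageio instead of scipy.ndimage
  return (imageio.imread(image_file).astype(float) - pixel_depth / 2) / pixel_depth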
In [5]:
# Problem 2: Display a sample of the labels and images from the ndarray
# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
import matplotlib.pyplot as plt

def load_and_displayImage_from_pickle(data_filename_set, NumClass, NumImage):
  if NumImage <= 0:
    print('NumImage <= 0')
    return
  plt.figure('subplot')
  for index, pickle_file in enumerate(data_filename_set):
    with open(pickle_file, 'rb') as f:
      data = pickle.load(f)
      ImageList = data[0:NumImage, :, :]
      # NumClass is the number of classes, one row per class;
      # NumImage is the number of images shown per class
      for i, img in enumerate(ImageList):
        plt.subplot(NumClass, NumImage, index * NumImage + i + 1)
        plt.imshow(img)

# Show 10 classes, 5 images per class
load_and_displayImage_from_pickle(train_datasets, 10, 5)
load_and_displayImage_from_pickle(test_datasets, 10, 5)
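One presentation detail: the pickled arrays are 28x28 floats normalized to [-0.5, 0.5], so plt.imshow renders them with matplotlib's default colormap rather than as grayscale glyphs. Passing cmap explicitly fixes that; a small hypothetical helper:

def show_gray(img):
  # img: a 28x28 float array normalized to [-0.5, 0.5]
  plt.imshow(img, cmap='gray')
  plt.axis('off')
  plt.show()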
In [6]:
def show_sum_of_different_class(data_filename_set):
  plt.figure(1)
  # read each .pickle file and record the number of images per class
  sumofdifferentclass = []
  for pickle_file in data_filename_set:
    with open(pickle_file, 'rb') as f:
      data = pickle.load(f)
      print(len(data))
      sumofdifferentclass.append(len(data))

  # show the per-class counts as a bar chart
  x = range(10)
  plt.bar(x, sumofdifferentclass)
  plt.show()

print('train_datasets:\n')
show_sum_of_different_class(train_datasets)
print('test_datasets:\n')
show_sum_of_different_class(test_datasets)
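Eyeballing the bar chart works, but a numeric summary makes "balanced" concrete. A minimal sketch, assuming show_sum_of_different_class were changed to return its sumofdifferentclass list:

def balance_report(counts):
  counts = np.asarray(counts)
  # the classes are near-balanced when std is small relative to mean
  print('min=%d max=%d mean=%.1f std=%.1f' %
        (counts.min(), counts.max(), counts.mean(), counts.std()))

balance_report([10, 12, 11])  # trivial self-test with toy counts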
In [7]:
def make_arrays(nb_rows, img_size):
  if nb_rows:
    dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
    labels = np.ndarray(nb_rows, dtype=np.int32)
  else:
    dataset, labels = None, None
  return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes

  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class + tsize_per_class
  for label, pickle_file in enumerate(pickle_files):
    try:
      with open(pickle_file, 'rb') as f:
        letter_set = pickle.load(f)
        # let's shuffle the letters to have random validation and training set
        np.random.shuffle(letter_set)
        if valid_dataset is not None:
          valid_letter = letter_set[:vsize_per_class, :, :]
          valid_dataset[start_v:end_v, :, :] = valid_letter
          valid_labels[start_v:end_v] = label
          start_v += vsize_per_class
          end_v += vsize_per_class

        train_letter = letter_set[vsize_per_class:end_l, :, :]
        train_dataset[start_t:end_t, :, :] = train_letter
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise

  return valid_dataset, valid_labels, train_dataset, train_labels

train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)
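A cheap sanity check on the merge: np.bincount over each label array should show ten nearly equal counts. This runs as-is once the cell above has executed:

print('train:', np.bincount(train_labels))
print('valid:', np.bincount(valid_labels))
print('test: ', np.bincount(test_labels))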
In [8]:
def randomize(dataset, labels):
  permutation = np.random.permutation(labels.shape[0])
  shuffled_dataset = dataset[permutation, :, :]
  shuffled_labels = labels[permutation]
  return shuffled_dataset, shuffled_labels

train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)
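Because randomize applies one shared permutation to both arrays, image/label pairs stay aligned after shuffling. A tiny self-contained check, using identity-indexed toy labels so each label points back at its own image:

demo_data = np.arange(6).reshape(3, 1, 2)  # three fake "images"
demo_labels = np.array([0, 1, 2])
d, l = randomize(demo_data, demo_labels)
for img, lab in zip(d, l):
  assert (img == demo_data[lab]).all()  # each image still carries its own label
print('pairing preserved')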
In [9]:
'''Problem 4: Convince yourself that the data is still good after shuffling!'''
# data_set is the dataset; NumImage is the number of images to display
def displayImage_from_dataset(data_set, NumImage):
  if NumImage <= 0:
    print('NumImage <= 0')
    return
  plt.figure('subplot')
  ImageList = data_set[0:NumImage, :, :]
  for index, img in enumerate(ImageList):
    # lay the images out five per row
    plt.subplot(NumImage // 5 + 1, 5, index + 1)
    plt.imshow(img)
  plt.show()

displayImage_from_dataset(train_dataset, 50)
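The images alone cannot show whether the labels still match after shuffling. A hedged variant (assuming train_labels is aligned with train_dataset, and using the notMNIST convention that classes 0 through 9 are letters A through J) titles each subplot with its label:

def display_with_labels(data_set, labels, num_image=10):
  plt.figure('labeled')
  for index in range(num_image):
    plt.subplot(num_image // 5 + 1, 5, index + 1)
    plt.imshow(data_set[index])
    plt.title(chr(ord('A') + int(labels[index])))  # label 0 -> 'A', ..., 9 -> 'J'
    plt.axis('off')
  plt.show()

display_with_labels(train_dataset, train_labels)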
In [10]:
data_root = '.'  # Change me to store data elsewhere
print(data_root)
pickle_file = os.path.join(data_root, 'notMNIST.pickle')
print(pickle_file)

try:
  f = open(pickle_file, 'wb')
  save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'valid_dataset': valid_dataset,
    'valid_labels': valid_labels,
    'test_dataset': test_dataset,
    'test_labels': test_labels,
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise
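The next cell prints the on-disk size of the sanitized pickle; the same check, plus a round-trip load to confirm the file is readable, applies to notMNIST.pickle and runs as-is:

statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
print(sorted(save.keys()))  # the six dataset/label arrays stored above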
In [11]:
# Problem 5: find overlaps using hashing
import hashlib

# SHA-256 maps each 2-D image array to a digest, so two arrays can be tested
# for equality just by comparing their hashes
def extract_overlap_hash_where(dataset_1, dataset_2):
  dataset_hash_1 = np.array([hashlib.sha256(img).hexdigest() for img in dataset_1])
  dataset_hash_2 = np.array([hashlib.sha256(img).hexdigest() for img in dataset_2])
  overlap = {}
  for i, hash1 in enumerate(dataset_hash_1):
    duplicates = np.where(dataset_hash_2 == hash1)
    if len(duplicates[0]):
      overlap[i] = duplicates[0]
  return overlap

# display the overlap
def display_overlap(overlap, source_dataset, target_dataset):
  overlap = {k: v for k, v in overlap.items() if len(v) >= 3}
  item = np.random.choice(list(overlap.keys()))
  imgs = np.concatenate(([source_dataset[item]], target_dataset[overlap[item][0:7]]))
  plt.suptitle(item)
  for i, img in enumerate(imgs):
    plt.subplot(2, 4, i + 1)
    plt.axis('off')
    plt.imshow(img)
  plt.show()

# sanitize: drop from dataset_1 every image that also appears in dataset_2
def sanitize(dataset_1, dataset_2, labels_1):
  dataset_hash_1 = np.array([hashlib.sha256(img).hexdigest() for img in dataset_1])
  dataset_hash_2 = np.array([hashlib.sha256(img).hexdigest() for img in dataset_2])
  overlap = []
  for i, hash1 in enumerate(dataset_hash_1):
    duplicates = np.where(dataset_hash_2 == hash1)
    if len(duplicates[0]):
      overlap.append(i)
  return np.delete(dataset_1, overlap, 0), np.delete(labels_1, overlap, None)

overlap_test_train = extract_overlap_hash_where(test_dataset, train_dataset)
print('Number of overlaps:', len(overlap_test_train.keys()))
display_overlap(overlap_test_train, test_dataset, train_dataset)

test_dataset_sanit, test_labels_sanit = sanitize(test_dataset, train_dataset, test_labels)
print('Overlapping images removed from test_dataset:', len(test_dataset) - len(test_dataset_sanit))
valid_dataset_sanit, valid_labels_sanit = sanitize(valid_dataset, train_dataset, valid_labels)
print('Overlapping images removed from valid_dataset:', len(valid_dataset) - len(valid_dataset_sanit))

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset_sanit.shape, valid_labels_sanit.shape)
print('Testing:', test_dataset_sanit.shape, test_labels_sanit.shape)

pickle_file_sanit = 'notMNIST_sanit.pickle'
try:
  f = open(pickle_file_sanit, 'wb')
  save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'valid_dataset': valid_dataset_sanit,
    'valid_labels': valid_labels_sanit,
    'test_dataset': test_dataset_sanit,
    'test_labels': test_labels_sanit,
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file_sanit, ':', e)
  raise

statinfo = os.stat(pickle_file_sanit)
print('Compressed pickle size:', statinfo.st_size)
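extract_overlap_hash_where and sanitize both rescan dataset_hash_2 with np.where for every image in dataset_1, which is O(n*m) in the number of hashes. A set-based variant is near-linear and, under the same exact-match assumption, finds the same overlap:

def overlap_indices_fast(dataset_1, dataset_2):
  # hash every image once, then test membership in O(1) per image
  hashes_2 = set(hashlib.sha256(img).hexdigest() for img in dataset_2)
  return [i for i, img in enumerate(dataset_1)
          if hashlib.sha256(img).hexdigest() in hashes_2]

# e.g. np.delete(test_dataset, overlap_indices_fast(test_dataset, train_dataset), 0)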
In [12]:
def train_and_predict(sample_size):
  regr = LogisticRegression()
  X_train = train_dataset[:sample_size].reshape(sample_size, 784)
  y_train = train_labels[:sample_size]
  regr.fit(X_train, y_train)

  X_test = test_dataset.reshape(test_dataset.shape[0], 28 * 28)
  y_test = test_labels
  pred_labels = regr.predict(X_test)
  print('Accuracy:', regr.score(X_test, y_test), 'when sample_size=', sample_size)

for sample_size in [50, 100, 1000, 5000, len(train_dataset)]:
  train_and_predict(sample_size)
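Two caveats on this cell: LogisticRegression() with all defaults was fine on older scikit-learn, but newer releases warn unless the solver and iteration budget are explicit; and it scores against the unsanitized test set, so the train/test overlap found in Problem 5 can inflate accuracy. A hedged variant addressing both (the solver and max_iter values are illustrative, not from the original run):

def train_and_predict_sanit(sample_size):
  # explicit solver/max_iter for newer scikit-learn releases
  regr = LogisticRegression(solver='lbfgs', max_iter=1000)
  X_train = train_dataset[:sample_size].reshape(sample_size, 784)
  regr.fit(X_train, train_labels[:sample_size])
  # score on the sanitized test set so overlap cannot inflate accuracy
  X_test = test_dataset_sanit.reshape(test_dataset_sanit.shape[0], 784)
  print('Sanitized accuracy:', regr.score(X_test, test_labels_sanit),
        'when sample_size=', sample_size)

train_and_predict_sanit(1000)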
More study notes: http://blog.csdn.net/jidebingfeng/article/details/78014366