【读书笔记】机器学习实战-2.3节
来源:互联网 发布:百度人工智能负责人 编辑:程序博客网 时间:2024/06/05 02:10
机器学习实战
第二章 2.3节 KNN-手写数字识别
#!/usr/bin/python
# -*- coding: utf-8 -*-
# k-nearest-neighbour (kNN) examples: a toy data set, a dating-site
# classifier and a handwritten-digit recogniser.
# The code runs unchanged on Python 2 and Python 3.
import operator
from os import listdir

from numpy import array, where, zeros

try:
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is only needed by the optional scatter-plot snippet at the
    # bottom of this file; the classifiers themselves do not use it.
    matplotlib = None
    plt = None

try:
    _readInput = raw_input  # Python 2
except NameError:
    _readInput = input  # Python 3: input() returns the typed text as-is


def createDataSet():
    """Return a toy data set: four 2-D points and their class labels."""
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: feature vector to classify (sequence or 1-D array).
        dataSet: 2-D numpy array of training samples, one row per sample.
        labels: sequence of training labels, ``labels[i]`` belongs to row i.
        k: number of neighbours that vote.

    Returns:
        The label with the most votes among the ``k`` closest rows
        (Euclidean distance).

    Raises:
        IndexError: if ``k`` is larger than the number of training rows.
    """
    # Broadcasting subtracts inX from every row; the sign of the difference
    # is irrelevant because it is squared immediately.
    diffMat = dataSet - inX
    distances = (diffMat ** 2).sum(axis=1) ** 0.5  # axis=1: sum each row
    # argsort() yields row indices ordered from the smallest distance to
    # the largest, so the first k entries are the nearest neighbours.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        # dict.get() only supplies the default; the assignment stores it.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first entry with the highest count, which is the
    # same winner a stable descending sort would put in front.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def file2matrix(filename):
    """Load a tab-separated data file.

    Each non-blank line must hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: path of the text file to read.

    Returns:
        ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array with the
        first three columns and a list with the ``n`` integer labels.
    """
    with open(filename) as fr:  # closes the handle even on errors
        # Strip surrounding whitespace and ignore blank lines so that a
        # trailing empty line does not abort the whole load.
        arrayOLines = [line.strip() for line in fr]
    arrayOLines = [line for line in arrayOLines if line]

    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly to the range [0, 1].

    Args:
        dataSet: 2-D numpy array, one row per sample.

    Returns:
        ``(normDataSet, ranges, minVals)`` where ``ranges`` is the per-column
        ``max - min`` and ``minVals`` the per-column minimum. ``ranges`` is
        returned unmodified and therefore may contain zeros.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # A constant column has range 0; divide by 1 there so the column
    # normalises to 0 instead of NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.1, k=3):
    """Hold-out test of the dating classifier; prints each result and the rate.

    Args:
        filename: data file in the format accepted by ``file2matrix``.
        hoRatio: fraction of the rows (taken from the top) used for testing.
        k: number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # the first numTestVecs rows are the tests
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with: %d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # With too few rows there is nothing to test; report 0 rather than
    # dividing by zero.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is :%f" % errorRate)


def classifyPerson():
    """Ask for three features on stdin and print the predicted liking level."""
    resultList = ['not at all ', 'in small doses ', 'in large doses ']
    percetTates = float(_readInput('percent of time spent playing video game?'))
    ffMiles = float(_readInput('frequent flier miles earned per year?'))
    iceCream = float(_readInput('liters of ice cream consumed per year?'))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Build the query vector and normalise it exactly like the training
    # data (same zero-range guard as autoNorm).
    inArr = array([ffMiles, percetTates, iceCream])
    inArr = (inArr - minVals) / where(ranges == 0, 1, ranges)
    classifierResult = classify0(inArr, normMat, datingLabels, 3)
    # Labels are 1-based, resultList is 0-based.
    print('you will like this person: %s' % resultList[classifierResult - 1])


def img2vector(filename):
    """Read a 32x32 text image of digit characters into a flat vector.

    Args:
        filename: path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        A numpy array of shape ``(1024,)`` holding the image row by row.
    """
    returnVect = zeros(1024)
    with open(filename) as fr:  # closes the handle even on errors
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[32 * i + j] = int(lineStr[j])  # char -> number
    return returnVect


def _labelFromFileName(fileNameStr):
    """Return the integer class encoded in a ``<label>_<index>.txt`` name."""
    fileStr = fileNameStr.split('.')[0]  # take off .txt
    return int(fileStr.split('_')[0])


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits',
                         k=3):
    """Train on ``trainingDir``, classify ``testDir`` and print the error rate.

    Args:
        trainingDir: directory with training images named
            ``<label>_<index>.txt``.
        testDir: directory with test images using the same naming scheme.
        k: number of neighbours passed to ``classify0``.
    """
    trainingFileList = listdir(trainingDir)
    hwLabels = []
    trainingMat = zeros((len(trainingFileList), 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_labelFromFileName(fileNameStr))  # true label
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    testFileList = listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _labelFromFileName(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    # An empty test directory means no errors; avoid dividing by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate)


if __name__ == '__main__':
    # Only run the experiment when executed as a script, so importing this
    # module has no side effects.
    handwritingClassTest()

# Other experiments that can be run instead:
# classifyPerson()
# datingClassTest()
#
# Scatter plot of the dating data, sized and coloured by label:
# datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
# normMat, ranges, minVals = autoNorm(datingDataMat)
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
#            15.0*array(datingLabels), 15.0*array(datingLabels))
# fig.show()  # the window closes as soon as the script exits
# plt.show()  # blocks until the window is closed
#
# Toy example:
# group, labels = createDataSet()
# result = classify0([0, 0], group, labels, 3)
0 0
- 【读书笔记】机器学习实战-2.3节
- 《机器学习实战》读书笔记
- 《机器学习实战》读书笔记
- 《机器学习实战》读书笔记
- 《机器学习实战》读书笔记
- 【读书笔记】机器学习实战-2.2节
- 《机器学习实战》完整读书笔记
- 机器学习实战读书笔记-决策树
- 【读书笔记】机器学习实战-4.6节 朴素贝叶斯过滤垃圾邮件
- 【读书笔记】机器学习实战 5.2节 logistics回归
- 【读书笔记】机器学习实战-kNN(1)
- 【读书笔记】机器学习实战-kNN(2)
- 【读书笔记】机器学习实战-决策树(1)
- 【读书笔记】机器学习实战-决策树(2)
- 机器学习实战读书笔记(序)
- 《机器学习实战》——读书笔记1
- 《机器学习实战》——读书笔记1
- 机器学习实战读书笔记-kNN分类算法
- 深入理解Python(二)
- Codeforces Round #411 (Div. 2) D. Minimum number of steps
- SPI和IIC通信区别
- Pycharm首次安装
- 剑指offer 27. 二叉搜索树与双向链表
- 【读书笔记】机器学习实战-2.3节
- 利用Java操作FTP文件上传,下载,删除
- Dos 命令(一)
- 按同学身高从高到矮依次输出人名
- params.h
- Python处理数据--sort,sorted
- Python入门(四)——函数概述,参数,可变参数,关键字参数,组合参数,递归函数
- eclipse不更新web项目文件
- jxls使用模板生成excel文件