1、KNN 学习笔记

来源：互联网　发布：魔方矩阵有什么用　编辑：程序博客网　时间：2024/05/10 22:23

K-Nearest Neighbors（K 近邻算法）

优点：精度高，对异常值不敏感，无数据输入假定

缺点：计算复杂度高，空间复杂度高

一般算法流程：

1、收集、准备和分析数据

2、训练数据(不适用 KNN 算法)

3、测试数据

4、使用算法

准备： createDataSet()

KNN 分类算法的流程（共 5 步）：计算待分类点与训练集中每个点的距离；按距离从小到大排序；选取距离最小的 k 个点；统计这 k 个点中各类别出现的次数；返回出现次数最多的类别

classify0()

约会网站上使用 KNN 算法流程：

file2matrix() 准备数据 autoNorm() 归一化

matplotlib 通过散点图分析数据

datingClassTest() 测试代码

classifyPerson() 约会网站预测函数，使用算法

fig.add_subplot(111)

ax.scatter(x, y)

A.min(0)

A.max(0)

测试代码：

datingClassTest()

完整系统

classifyPerson()

手写体识别

img2vector

handwritingClassTest

#! /usr/bin/python
# -*- coding: utf8 -*-
'''
2017/8/16
KNN在约会网站和手写体识别系统中的应用
'''
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
import os

#数据准备
def createDataSet():
    """Build the four-point toy data set used to try out the classifier.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4x2 array holding one
        2-D point per row, and ``labels`` is the list of class labels, one
        per row of ``group``.
    """
    group = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

# KNN算法
def classify0(inX, dataMat, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataMat``.

    Args:
        inX: Feature vector to classify (sequence or 1-D array).
        dataMat: Training samples, one row per sample (array or nested list).
        labels: Class label of each row of ``dataMat``, in the same order.
        k: Number of neighbours that vote. A value larger than the number of
            training samples is clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label whose first vote
        came from the nearer neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataMat`` has no rows.
    """
    from collections import Counter

    dataMat = asarray(dataMat, dtype=float)
    sampleCount = dataMat.shape[0]
    if k < 1 or sampleCount == 0:
        raise ValueError("classify0 needs k >= 1 and at least one training sample")
    if k > sampleCount:
        # Asking for more neighbours than exist would index past the end.
        k = sampleCount

    # Broadcasting subtracts inX from every row; no explicit tile() needed.
    diffData = dataMat - asarray(inX, dtype=float)
    distances = (diffData ** 2).sum(axis=1) ** 0.5

    # A stable sort keeps equidistant samples in their original row order,
    # so the result is deterministic.
    nearest = distances.argsort(kind='stable')[:k]

    # Counter orders equal counts by first occurrence, which matches the
    # nearest-first order in which the votes are fed in.
    votes = Counter(labels[i] for i in nearest)
    return votes.most_common(1)[0][0]

# group,labels = createDataSet()
# print(classify0([0,0],group,labels,3))

#将文本结构化
def file2matrix(path):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric feature columns and
    an integer class label in its last column. Blank lines are skipped.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, label)``: ``returnMat`` is an ``(n, 3)`` float
        array with the first three columns of each record, and ``label`` is
        the list of the ``n`` integer labels.
    """
    # The context manager closes the file even if parsing fails below.
    with open(path) as fr:
        # Skip blank lines (e.g. a trailing newline at EOF); they carry no
        # record and would otherwise break the numeric conversion.
        records = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(records), 3))
    label = []
    for index, fields in enumerate(records):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        label.append(int(fields[-1]))
    return returnMat, label

#数据归一化处理
def autoNorm(dataMat):
    """Rescale every column of ``dataMat`` linearly so it spans 0 to 1.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    all zeros instead of dividing by zero.

    Args:
        dataMat: 2-D NumPy array with one sample per row.

    Returns:
        A tuple ``(minVals, maxVals, ranges, normalDataSet)``: the per-column
        minima, maxima and ``max - min`` ranges (a constant column keeps its
        true range of 0), followed by the normalised copy of ``dataMat``.
    """
    minVals = dataMat.min(0)
    maxVals = dataMat.max(0)
    ranges = maxVals - minVals

    # Divide by 1 where the range is 0 so a constant column yields 0, not NaN.
    safeRanges = where(ranges == 0, 1.0, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normalDataSet = (dataMat - minVals) / safeRanges
    return minVals, maxVals, ranges, normalDataSet

# 分析数据（使用Matplotlib创建散点图）
# dataMat,labels = file2matrix('./../data/datingTestSet2.txt')
# fig = plt.figure()
# ax = fig.add_subplot(111)
# # ax.scatter(dataMat[:,1],dataMat[:,2])# 只有一种颜色
# ax.scatter(dataMat[:,1],dataMat[:,2],15.0*array(labels),15.0*array(labels))
# plt.xlabel('玩视频游戏所耗时间百分比')
# plt.ylabel('每周消费的冰淇淋公升数')
# plt.show()
# print('测试')

# 把集合分成两部分进行测试

# def datingClassTest(path):
# hoRatio = 0.10
# dataMat,labels = file2matrix(path)
# length = dataMat.shape[0]
# testLength =int(length * hoRatio)
# dataMin,dataMax,ranges,newDataMat = autoNorm(dataMat)
# errorCount = 0
# for i in range(testLength):
# classifierResult = classify0(newDataMat[i,:],newDataMat[testLength:length,:],labels[testLength:length],3)
# print("the classifier came back with %d,the real answer is %d" %(classifierResult,labels[i]))
# if (classifierResult != labels[i]):  # 预测结果与真实标签不一致时才计为错误
# errorCount += 1
# print(" the error ratio is %f" %(errorCount/float(testLength)))
#
# datingClassTest('./../data/datingTestSet2.txt')

# #完整约会预测系统
# def classifierPerson(path):
# resultList = ['not at all','in a small does','in large does']
# percentTats = float(input("percentage of time spent playing video games?"))
# ffMiles = float(input("frequent flier miles earned per year?"))
# iceCream = float(input("litters of icecream consumed per year?"))
# dataMat,labels = file2matrix(path)
# minEle,maxEle,ranges,newDataMat = autoNorm(dataMat)
# inX = ([percentTats,ffMiles,iceCream])
# newInX = (inX - minEle)/ranges
# resultLabel = classify0(newInX,newDataMat,labels,3)
# print(resultList[resultLabel - 1])  # 假设 datingTestSet2.txt 中标签取值为 1~3，而列表下标从 0 开始
# classifierPerson('./../data/datingTestSet2.txt')

# 使用KNN进行手写体识别

#首先将二进制文件转换成向量的结构化结果
def img2vector(filename):
    """Flatten a 32x32 text image into a 1x1024 row vector.

    The file is read as 32 lines; the first 32 characters of each line are
    converted to numbers and stored row after row.

    Args:
        filename: Path of the text image to read.

    Returns:
        A ``(1, 1024)`` float array where element ``i * 32 + j`` holds the
        character at line ``i``, column ``j``.
    """
    returnMat = zeros((1, 1024))
    # This runs once per image, so the handle must be released promptly;
    # the context manager closes it even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            line = fr.readline()
            for j in range(32):
                returnMat[0, i * 32 + j] = int(line[j])
    return returnMat

def handwritingClassTest():
    """Run the kNN digit classifier over the test set and print the error rate.

    Every file in the training directory becomes one row of the training
    matrix; its label is the part of the file name before the first ``_``.
    Each file in the test directory is then classified with ``k = 3`` and the
    prediction is compared with the label parsed from its own file name. One
    line is printed per test file, followed by a summary line.
    """
    traindir = './../data//trainingDigits//'
    testdir = './../data//testDigits//'

    # Load the training set: one flattened image per row.
    trainFiles = os.listdir(traindir)
    dataMat = zeros((len(trainFiles), 1024))
    labels = []
    for i, name in enumerate(trainFiles):
        labels.append(name.split('_')[0])
        dataMat[i, :] = img2vector(traindir + name)

    # Classify the test files one at a time; only the current vector is
    # needed, so no test matrix is kept in memory.
    testFiles = os.listdir(testdir)
    testlength = len(testFiles)
    errorCount = 0
    for name in testFiles:
        realLabel = int(name.split('_')[0])
        testVector = img2vector(testdir + name)[0]
        result = int(classify0(testVector, dataMat, labels, 3))
        print("real value: %d, predicted value:%d" % (realLabel, result))
        if realLabel != result:
            errorCount += 1

    # An empty test directory would otherwise divide by zero.
    ratio = errorCount / float(testlength) if testlength else 0.0
    print("errorRatio is %d , testlength is %d , ratio is %f" % (errorCount, testlength, ratio))


# Run the test only when executed as a script, not when imported.
if __name__ == "__main__":
    handwritingClassTest()

阅读全文

1 0