kNN算法实现和基础应用

来源:互联网 发布:监控用网络光端机 编辑:程序博客网 时间:2024/06/06 01:57

ml的学习笔记。

之前学的知识都较为零散,现在需要系统的学习各种算法和思想,目前主要练习内容是基于《机器学习实战》。

在学习过程中,除了学习算法本身之外,对Python库的运用和矩阵运算的代码风格是另外一个学习的重心:C风格的编程习惯如果放到Python里,远不如矩阵运算和内置函数的效率高。学习过程中的一些记录在代码里以注释方式表示。

kNN(k近邻)为有监督学习算法(训练数据必须带有标签),原理是:对每个测试数据向量,计算它与训练数据中每个点的距离,选出距离最小的前k个点,统计这k个点的标签,并把其中出现频率最高的标签赋给该测试数据。


kNN.py

# NOTE: the wildcard import is kept on purpose -- the rest of this file uses
# numpy names (array, tile, zeros, shape) without a module prefix.
from numpy import *
import operator
from os import listdir


# Build a small sample matrix and its label vector for quick experiments
def createDataSet():
    """Return a tiny hand-made data set for trying out the classifier.

    Returns:
        A tuple ``(group, labels)``. ``group`` is a 4x2 array holding four
        two-feature sample points; ``labels`` is a list with the class label
        of each row of ``group``, in the same order.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
# kNN classifier
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: feature vector to classify (array-like, one value per feature).
        dataSet: 2-D array of training samples, one row per sample.
        labels: class labels; ``labels[i]`` belongs to row ``i`` of
            ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label occurring most often among the ``k`` training samples with
        the smallest Euclidean distance to ``inX``. When several labels share
        the highest count, the one that received a vote from the closest
        neighbour first wins.

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (dataSetSize, k)
        )

    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = asarray(inX) - dataSet
    # axis=1 sums each row, giving one squared distance per training sample.
    distances = sqrt((diffMat ** 2).sum(axis=1))
    # argsort sorts ascending and returns the original row indices.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for idx in sortedDistIndicies[:k]:
        voteIlabel = labels[idx]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first entry with the highest count, i.e. the same
    # winner a stable descending sort of the items would put in front.
    return max(classCount.items(), key=operator.itemgetter(1))[0]
# Min-max normalisation: newValue = (oldValue - min) / (max - min)
def autoNorm(dataSet):
    """Scale every feature column of ``dataSet`` into the range [0, 1].

    Args:
        dataSet: 2-D array-like, one row per sample and one column per
            feature.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``:
        ``normDataSet`` is the rescaled data with the same shape as the
        input, ``ranges`` holds ``max - min`` of each column and ``minVals``
        holds the minimum of each column.
    """
    dataSet = asarray(dataSet)
    # min(0) / max(0) reduce over rows, yielding one value per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide it by 1 instead so its values
    # become 0 rather than NaN. The returned ``ranges`` stays untouched.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# Load a tab-separated data file into an in-memory matrix
def file2matrix(filename, numFeatures=3):
    """Parse a tab-separated text file into a feature matrix and labels.

    Every non-blank line must hold at least ``numFeatures`` numeric fields
    followed by an integer class label in the last field.

    Args:
        filename: path of the text file to read.
        numFeatures: how many leading fields of each line are stored as
            features (defaults to 3).

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is a float
        array with one row per data line and ``numFeatures`` columns;
        ``classLabelVector`` is the list of integer labels in file order.
    """
    # The with-block guarantees the file is closed even if parsing fails.
    with open(filename) as fr:
        # strip() removes the trailing newline; blank lines are skipped.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), numFeatures))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(v) for v in listFromLine[:numFeatures]]
        # Index -1 picks the last field, which carries the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=3):
    """Hold-out test of the kNN classifier, printing the error rate.

    The first ``hoRatio`` share of the rows is classified against the
    remaining rows, and each prediction is printed next to the real label.

    Args:
        filename: data file to load with ``file2matrix``.
        hoRatio: fraction of the rows used as test vectors.
        k: number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Avoid dividing by zero below when the file is too small.
        print("not enough samples to run the hold-out test")
        return

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(
            normMat[i, :],
            normMat[numTestVecs:m, :],
            datingLabels[numTestVecs:m],
            k,
        )
        print("the classifier came back with :%d,the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is : %f" % (errorCount / float(numTestVecs)))


# Run the hold-out test only when executed as a script, so that importing
# this module (for example to reuse file2matrix) has no side effects.
if __name__ == '__main__':
    datingClassTest()


# ---------------------------------------------------------------------------
# Disabled scratch experiments kept from the study notes
# ---------------------------------------------------------------------------
#
# # Try the kNN classifier on the toy data set
# group, labels = createDataSet()
# print(classify0([0, 0], group, labels, 3))
#
# # Element-wise matrix arithmetic and row sums
# data = array([[1, 1, 3], [1, 2, 3], [1, 0, 3]])
# data1 = array([[3, 2, 1], [3, 2, 1], [1, 0, 3]])
# diffmat = data - data1
# print(diffmat)
# print(diffmat**2)
# sumdiffmat = diffmat**2
# sumvec = sumdiffmat.sum(axis=1)  # add up each row of the matrix
# print(sumvec)
# print(sumvec.argsort())
#
# # Using a dict as a counter
# classCount = {}
# a = 'A'
# classCount[a] = 1
# classCount[a] = classCount[a] + 1
# classCount[a] = classCount.get(a, 0) + 1
# classCount['b'] = 2
# classCount['c'] = 1
# # python2
# # sortedDscendCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
# # python3
# sortedDscendCount = sorted(classCount.items(), key=lambda d: d[1], reverse=True)
# print(sortedDscendCount)
# sortedAscendCount = sorted(classCount.items(), key=lambda d: d[1], reverse=False)
# print(sortedAscendCount)
#
# group, labels = createDataSet()
# print(group)
# print(labels)


plot.py 测试Matplotlib创建散点图

import matplotlib
import matplotlib.pyplot as plt
import ch1.kNN as knn

# Load the data set and draw a scatter plot of the second feature column
# (x axis) against the third feature column (y axis).
returnMat, classLabelVector = knn.file2matrix("datingTestSet2.txt")
fig = plt.figure()
ax = fig.add_subplot(111)  # a single subplot: 1 row, 1 column, first cell
ax.scatter(returnMat[:, 1], returnMat[:, 2])
plt.show()

test.py 学习的一些库函数的知识

# In Python 3.2.3 input and raw_input have been merged: raw_input no longer
# exists, and input returns a string.
#
# '''
# Integer string to the corresponding integer
# int('12')
# Decimal string to the corresponding float
# float('12.34')
# Number to string
# str(123.45)
# ASCII code to the corresponding character
# chr(97)
# Character to the corresponding ASCII code
# ord('a')
# '''
#
# '''
# # Try listdir: it returns the names of all files and folders one level
# # below the given folder
# from os import listdir
# filelist = listdir("D:/testdata")
# m = len(filelist)
# print(m)
# for i in filelist:
#     print(i)
#     fileStr = i.split('.')
#     print(fileStr[0])
#     #print(fileStr[1])
# '''
#
# '''
# a = input("hello :")
# print(a)
# c=2+int(a)
# print(c)
# '''