Machine Learning in Action: kNN Notes

import numpy as np
import operator

'''kNN classification algorithm from "Machine Learning in Action", with notes'''
# Create a toy dataset
def createDataSet():
    group = np.array([[1.0, 1.1], [0.9, 1.0], [0.1, 0.2], [0.2, 0.3]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
# np.tile repeats [1, 2] in a 3-by-4 tiling (3 rows, 4 copies per row)
tile_result = np.tile([1, 2], (3, 4))
print(tile_result)
[[1 2 1 2 1 2 1 2]
 [1 2 1 2 1 2 1 2]
 [1 2 1 2 1 2 1 2]]
shape_result = tile_result.shape
print(shape_result)
print(shape_result[0])
print(shape_result[1])
(3, 8)
3
8
# kNN classification function
def classify0(inX, dataSet, labels, k):
    # inX is the input vector to classify; dataSet holds n samples of m features (shape n*m)
    dataSetSize = dataSet.shape[0]
    # Difference between inX and every sample, giving an n*m array
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    # np.sum(axis=0/1); the built-in sum(sequence, [start]) sums the sequence and then adds start
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # np.argsort(x) returns the indices that sort x in ascending order.
    # http://blog.csdn.net/maoersong/article/details/21875705
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteLabel = labels[sortedDistIndicies[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # http://www.cnblogs.com/zle1992/p/6271105.html
    # http://blog.csdn.net/dongtingzhizi/article/details/12068205
    # key=operator.itemgetter(1) would sort by the second field of each (label, count) pair
    # In Python 3 use dict.items(); dict.iteritems() no longer exists
    # key = operator.itemgetter(1)
    sortedClassCount = sorted(classCount.items(), key=lambda x: x[1], reverse=True)
    # sortedClassCount[0] is the (label, count) pair with the most votes; [0][0] extracts the label
    return sortedClassCount[0][0]
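A minimal usage sketch of classify0 on the toy data from createDataSet; the query point [0.95, 1.05] is made up for illustration:

group, labels = createDataSet()
# the query sits closest to the two 'A' samples, so 'A' is the expected vote with k = 3
print(classify0([0.95, 1.05], group, labels, 3))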
# Convert a text (txt) file into a NumPy matrix plus a label vector
def file2matrix(filename):
    with open(filename, 'r') as fr:
        # readlines() reads the whole file at once
        arrayLines = fr.readlines()
        numberOfLines = len(arrayLines)
        # Pre-allocate a zeros matrix with one row per line
        returnMat = np.zeros((numberOfLines, 3))
        classLabelVector = []
        index = 0
        for line in arrayLines:
            # Strip leading/trailing whitespace
            line = line.strip()
            # Split the three fields on '\t'
            listFromLine = line.split('\t')
            # Alternative: "for index, line in enumerate(arrayLines):" supplies index directly (see the sketch below)
            returnMat[index, :] = listFromLine[0:3]
            # Collect the class label from the last field
            classLabelVector.append(int(listFromLine[-1]))
            index += 1
        return returnMat, classLabelVector
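As noted in the comment above, the manual index counter can be replaced with enumerate. A minimal sketch of that variant (file2matrix_enum is a hypothetical name, assuming the same tab-separated three-features-plus-label layout):

def file2matrix_enum(filename):
    with open(filename, 'r') as fr:
        arrayLines = fr.readlines()
    returnMat = np.zeros((len(arrayLines), 3))
    classLabelVector = []
    # enumerate yields the row index directly, so no separate counter is needed
    for index, line in enumerate(arrayLines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector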
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
datingDataMat
array([[  4.09200000e+04,   8.32697600e+00,   9.53952000e-01],
       [  1.44880000e+04,   7.15346900e+00,   1.67390400e+00],
       [  2.60520000e+04,   1.44187100e+00,   8.05124000e-01],
       ...,
       [  2.65750000e+04,   1.06501020e+01,   8.66627000e-01],
       [  4.81110000e+04,   9.13452800e+00,   7.28045000e-01],
       [  4.37570000e+04,   7.88260100e+00,   1.33244600e+00]])
datingLabels[0:20]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]
# Normalize feature values: newValue = (oldValue - min) / (max - min)
def autoNorm(dataSet):
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # np.shape(a) works on both lists and arrays; array.shape also works, but a plain list has no shape attribute
    normDataSet = np.zeros(np.shape(dataSet))
    m = dataSet.shape[0]
    # Subtract the per-column minimum from every row
    normDataSet = dataSet - np.tile(minVals, (m, 1))
    # Divide by the ranges to get the normalized data
    normDataSet = normDataSet / np.tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
normMat, ranges, minVals = autoNorm(datingDataMat)
normMat
array([[ 0.44832535,  0.39805139,  0.56233353],
       [ 0.15873259,  0.34195467,  0.98724416],
       [ 0.28542943,  0.06892523,  0.47449629],
       ...,
       [ 0.29115949,  0.50910294,  0.51079493],
       [ 0.52711097,  0.43665451,  0.4290048 ],
       [ 0.47940793,  0.3768091 ,  0.78571804]])
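The ranges and minVals returned by autoNorm are what a new, unseen sample must be scaled with before it can be classified against normMat. A minimal sketch, where the raw feature values of newSample are made up:

newSample = np.array([30000, 5.0, 1.0])         # hypothetical raw input
newSampleNorm = (newSample - minVals) / ranges  # same newValue = (oldValue - min)/(max - min) rule
print(classify0(newSampleNorm, normMat, datingLabels, 3))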
def datingClassTest(k):
    # Fraction of the data held out as test inputs
    hoRatio = 0.1
    # 'datingTestSet2.txt' works here; 'datingTestSet.txt' does not (file2matrix expects an integer label in the last column)
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Index that splits test vectors from training vectors
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # The first numTestVecs rows are test inputs; rows numTestVecs..m are the training set
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with: {0}, the real answer is: {1}".format(
            classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: {0}".format(errorCount / float(numTestVecs)))
datingClassTest(3)
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
...
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.05
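Because k is a parameter of datingClassTest, comparing error rates for several values of k only takes a small loop; a minimal sketch (the candidate values are arbitrary, and each call prints its full per-sample output):

for k in (1, 3, 5, 7):
    print("k =", k)
    datingClassTest(k)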