机器学习实战_kNN算法python3.6实现与理解

来源:互联网 发布:js获取标签的id 编辑:程序博客网 时间:2024/06/06 05:42

机器学习实战_kNN算法python3.6实现与理解

标签(空格分隔): kNN算法


# k-nearest-neighbours (kNN) classifier and its application to the
# dating-site data set ('datingTestSet2.txt').
#
# Importing this module only defines functions; the interactive predictor
# runs when the file is executed as a script.

import operator
from os import listdir  # not used by the code below; kept from the original import list

import numpy as np


def createDataSet():
    """Return a tiny four-point, two-class sample data set.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array of points
        and ``labels`` is the matching list of class names ('A' / 'B').
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean. When several classes receive the same number
    of votes, the class whose vote was cast first (i.e. by the nearest
    neighbour among the tied classes) wins.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D array-like of training samples, one row per sample.
        labels: Sequence of class labels, ``labels[i]`` belonging to row ``i``.
        k: Number of neighbours that vote; must satisfy ``1 <= k <= len(dataSet)``.

    Returns:
        The winning class label.

    Raises:
        IndexError: If ``k`` is outside ``1..len(dataSet)``.
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if not 0 < k <= dataSetSize:
        # IndexError is what the unchecked indexing raised before the check
        # was added, so callers catching it keep working.
        raise IndexError(
            "k must be between 1 and %d, got %r" % (dataSetSize, k))

    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = dataSet - np.asarray(inX)
    distances = np.sqrt((diffMat ** 2).sum(axis=1))
    nearest = distances.argsort()[:k]

    classCount = {}
    for sampleIndex in nearest:
        voteLabel = labels[sampleIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first maximal entry in insertion order, which is the
    # same winner a stable descending sort by vote count would put first.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line supplies three numeric features in its first three
    fields and an integer class label in its last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array and ``classLabelVector`` a list of ``n`` ints.
    """
    # Read once and close the handle deterministically.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


def plotDatingData(datingDataMat, datingLabels):
    """Show a scatter plot of the first two feature columns.

    Marker size and colour are both ``15.0 * label`` so the classes are
    visually distinct. matplotlib is imported lazily so the rest of the
    module works without it.

    Args:
        datingDataMat: 2-D array with at least two feature columns.
        datingLabels: Sequence of numeric class labels, one per row.
    """
    import matplotlib.pyplot as plt

    fig = plt.figure()
    ax = fig.add_subplot(111)
    markers = 15.0 * np.array(datingLabels)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1], markers, markers)
    plt.show()


def autoNorm(dataSet):
    """Rescale every column linearly using its own minimum and range.

    Computes ``(value - column_min) / (column_max - column_min)`` per column.
    A column whose values are all equal has a zero range and therefore
    triggers a division by zero.

    Args:
        dataSet: 2-D array-like, one row per sample.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` - the rescaled data, the
        per-column ranges and the per-column minima.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)  # column-wise minimum
    maxVals = dataSet.max(0)  # column-wise maximum
    ranges = maxVals - minVals
    # Element-wise arithmetic; broadcasting applies the per-column vectors
    # to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def datingClassTest(hoRatio=0.50, filename='datingTestSet2.txt'):
    """Print the hold-out error of the 3-NN classifier on the dating data.

    The first ``hoRatio`` fraction of the rows is used as test vectors and
    the remaining rows as the training set.

    Args:
        hoRatio: Fraction of rows held out for testing (default 0.50).
        filename: Data file readable by ``file2matrix``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(
            normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        print("分类器划分的类型: %d, 实际的类型是: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("错误率: %f" % (errorCount / float(numTestVecs)))
    print("错误的总是:%d" % (errorCount))


def classifyPerson():
    """Prompt for three feature values and print the predicted label.

    Reads 'datingTestSet2.txt', normalises it with ``autoNorm``, scales the
    user's input with the same minima and ranges, and classifies it with
    ``classify0`` using k = 3.
    """
    # NOTE(review): the first entry may be missing a negation
    # ('一点都不喜欢'); left unchanged because it is user-facing output.
    resultList = ['一点都喜欢','喜欢一点','很喜欢']
    percentTats = float(input("打电子游戏的时间:?"))
    ffMiles = float(input("坐飞机的时间:?"))
    iceCream = float(input("吃多少冰淇淋:?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Presumably this order matches the column order of the data file -
    # verify against the data set.
    inArr = np.array([ffMiles, percentTats, iceCream])
    classifyResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are used 1-based here, hence the -1 when indexing resultList.
    print("你的喜欢程度:",resultList[classifyResult -1 ])


if __name__ == '__main__':
    # To inspect the data visually first, call:
    #     plotDatingData(*file2matrix('datingTestSet2.txt'))
    classifyPerson()

数据集下载:链接:https://pan.baidu.com/s/1eSvpjLS 密码:e1ni

阅读全文
0 0