KNN-Dating完整代码

来源:互联网 发布:淘宝上的大拿韩代 编辑:程序博客网 时间:2024/05/16 01:08
#coding:utf-8from numpy import *import operatorimport matplotlibimport matplotlib.pyplot as plt#建立初始数据def createDataSet():    group = array([[1.0,1.1],[1.0,1.0],[0,0.2],[0,0.1]])    labels = ['A','B','C','B']    return group,labels#KNN算法def classify0(inX,dataSet,labels,k):    dataSetSize = dataSet.shape[0]#    print dataSetSize    diffMat = tile(inX,(dataSetSize,1))-dataSet#    print diffMat    sqDiffMat = diffMat**2   # print sqDiffMat    sqDistances = sqDiffMat.sum(axis=1)    #print sqDistances    distances = sqDistances**0.5    sortedDistIndicies = distances.argsort()    classCount = {}    for i in range(k):        voteIlabel = labels[sortedDistIndicies[i]]        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1    #    print voteIlabel    sortedClassCount = sorted(classCount.iteritems(),                              key = operator.itemgetter(1),reverse = True)    #print sortedClassCount    return sortedClassCount[0][0]#解析数据def file2matrix(filename):    #创建指向文件的指针    fr = open(filename)    #获取文件所有列表:list    arrayOLines = fr.readlines()    #获取文件中所有的行数    numberOfLines = len(arrayOLines)    #创建numberOfLines行3列的数组    returnMat = zeros((numberOfLines,3))    classLabelVector = []    index = 0    #print arrayOLines    for line in arrayOLines:        #去掉所有的回车字符'\n'        line = line.strip()        #使用tab字符'\t'将上一步得到的整行数据分割成一个元素列表:list        listFromLine = line.split('\t')        #将列表listFromLine中的第一个元素到第三个元素存储到特征矩阵中        returnMat[index,:] = listFromLine[0:3]        #print returnMat[index,:]        #print listFromLine[-1]        #将列表中左后一个元素存储到数组classLabelVector中        classLabelVector.append(int(listFromLine[-1]))        index += 1    return returnMat,classLabelVector#画图#datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')#fig = plt.figure()#ax = fig.add_subplot(111)#ax.scatter(datingDataMat[:,0],datingDataMat[:,1],#           15.0*array(datingLabels),15.0*array(datingLabels))#plt.show()#归一化数据def autoNorm(dataSet):    minVals = dataSet.min(0)    maxVals = dataSet.max(0)    ranges = maxVals - minVals    normDataSet = zeros(shape(dataSet))    m = dataSet.shape[0]    normDataSet = dataSet - tile(minVals,(m,1))    normDataSet = normDataSet/tile(ranges,(m,1))    return normDataSet,ranges,minVals#normMat,ranges,minVals = autoNorm(datingDataMat)#print normMat#测试代码def datingClassTest():    hoRatio = 0.10    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')    normMat,ranges,minVals = autoNorm(datingDataMat)    m = normMat.shape[0]    numTestVecs = int(m*hoRatio)    errorCount = 0.0    for i in range(numTestVecs):        classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],\                datingLabels[numTestVecs:m],3)        print "The classifier came back with: %d,the real answer is: %d"\                % (classifierResult,datingLabels[i])        if(classifierResult != datingLabels[i]):            errorCount += 1.0        print "The total error rate is: %f" %(errorCount/float(numTestVecs))def classifyPerson():    resultList = ['not at all','in small doses','in large doses']    percentTats = float(raw_input("percentage of time spent playing video games?"))    ffMiles = float(raw_input("frequent flier miles earned per year?"))    iceCream = float(raw_input("liters of ice cream consumed per year?"))    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')    normMat,ranges,minVals = autoNorm(datingDataMat)    inArr = array([ffMiles,percentTats,iceCream])    classifierResult = classify0((inArr-minVals)/ranges,normMat,datingLabels,3)    print "You will probably like this person:",resultList[classifierResult-1]for i in range(10):    classifyPerson()    print('\n')