示例:在约会网站上使用k-近邻算法

来源:互联网 发布:历年粮食进出口数据 编辑:程序博客网 时间:2024/05/21 08:00

1,收集数据:提供文本文件

2,准备数据:使用Python解析文本文件

3,分析数据:使用Matplotlib画二维散点图

4,训练算法:此步骤不适用于该算法

5,测试算法

6,使用算法:产生简单的命令行程序,然后输入一些特征数据以判断对方是否为自己喜欢的类型

# k-nearest-neighbours example: classifying dating-site profiles.
#
# The star import is kept on purpose: every numpy name the original module
# exposed (array, tile, zeros, ...) is still available to existing callers.
from numpy import *  # noqa: F401,F403
import operator  # provides itemgetter for sorting the vote table


def createDataSet():
    """Return a tiny hand-made training set.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array of points and
        ``labels`` is the list of class names, one per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: feature vector to classify (one value per column of ``dataSet``).
        dataSet: 2-D array of training samples, one row per sample.
        labels: sequence of class labels, ``labels[i]`` belonging to row ``i``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes. On a tie the label that was seen
        first (i.e. belongs to the nearer neighbour) wins, because
        ``sorted`` is stable.

    Raises:
        IndexError: if ``k`` is larger than the number of training rows.
    """
    # Euclidean distance from inX to every training row; broadcasting
    # subtracts inX from each row without building a tiled copy.
    diffMat = asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # row-wise sum of squares
    distances = sqDistances ** 0.5
    # Indices of the training rows ordered from nearest to farthest.
    sortedDistIndices = distances.argsort()

    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # dict.items() replaces the Python-2-only iteritems(); sort the
    # (label, votes) pairs by vote count, largest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]


def filesmatrix(filename):
    """Parse a tab-separated text file into a feature matrix and labels.

    Each non-blank line must hold tab-separated fields: the first three are
    read as floating-point features and the last one as an integer label.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an ``N x 3`` float array
        and a list of ``N`` integer labels, where ``N`` is the number of
        non-blank lines.

    Raises:
        ValueError: if a field cannot be converted to a number.
    """
    # The context manager closes the file even if parsing fails; blank
    # lines (e.g. a trailing newline at the end of the file) are skipped.
    with open(filename) as fr:
        stripped = (rawLine.strip() for rawLine in fr)
        arrayOLines = [line for line in stripped if line]

    numberOfLines = len(arrayOLines)
    returnMat = zeros((numberOfLines, 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Args:
        dataSet: 2-D numpy array, one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` triple, where ``ranges`` is the
        per-column ``max - min`` and ``minVals`` the per-column minimum, so
        that new samples can be normalised with ``(x - minVals) / ranges``.

    Note:
        A column whose values are all equal has a range of 0 and therefore
        triggers a division by zero for that column.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Broadcasting applies the per-column shift and scale to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def datingClassTest():
    """Hold out the first 10% of 'datingTestSet2.txt' and report the error rate.

    The remaining 90% of the (normalised) rows serve as the training set for
    a 3-nearest-neighbour vote. Each prediction, the overall error rate, the
    number of test rows and the full label list are printed.
    """
    hoRatio = 0.10  # fraction of the data held out for testing
    datingDataMat, datingLabels = filesmatrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is :%f" % (errorCount / float(numTestVecs)))
    print(numTestVecs)
    print(datingLabels)


def classifyPerson():
    """Ask for three features on the command line and print the prediction.

    The answers are normalised with the statistics of
    'datingTestSet2.txt' and classified with a 3-nearest-neighbour vote.
    """
    # Indexed with ``label - 1`` below, so labels are presumably 1, 2, 3.
    resultList = ['not at all', 'in small doses', 'in large doses']
    # input() replaces the Python-2-only raw_input().
    percentTats = float(input(
        "percenrage of time spent playing video games?"))
    ffMiles = float(input("frequent filer miles earnde per year?"))
    iceCreams = float(input("liters of ice cream consumed per year?"))
    # Load the training data once (the original read the file twice).
    datingDataMat, datingLabels = filesmatrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCreams])
    # Normalise the query with the training set's min and range
    # (ranges = maxVals - minVals) before measuring distances.
    classifierResult = classify0((inArr - minVals) / ranges,
                                 normMat, datingLabels, 3)
    print("you will probably like this person:",
          resultList[classifierResult - 1])

0 0