机器学习实战之k-近邻算法(5)--- 完整版约会网站数据分类

来源:互联网 发布:汽车销售app软件 编辑:程序博客网 时间:2024/05/20 09:08
from numpy import *import operator#创建数据集def createDataSet():    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])    labels = ['A', 'A', 'B', 'B']    return group, labels#根据输入测试实例进行k-近邻分类def classify0(inX, dataSet, labels, k):    dataSetSize = dataSet.shape[0]    diffMat = tile(inX, (dataSetSize, 1)) - dataSet    sqDiffMat = diffMat ** 2    sqDistances = sqDiffMat.sum(axis=1)    distances = sqDistances**0.5    sortedDistIndicies = distances.argsort()    classCount = {}    for i in range(k):        voteIlabel = labels[sortedDistIndicies[i]]        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1    sortedClassCount = sorted(classCount.iteritems(), key = operator.itemgetter(1), reverse=True)    return sortedClassCount[0][0]#处理输入格式问题,从txt文件中读取数据def file2matrix(filename, dim2):    fr = open(filename)    arrayOLines = fr.readlines()    numberOfLines = len(arrayOLines)    returnMat = zeros((numberOfLines, dim2))    classLabelVector = []    index = 0    for line in arrayOLines:        line = line.strip()        listFromLine = line.split('\t')        returnMat[index, :] = listFromLine[0:dim2]        classLabelVector.append(int(listFromLine[-1]))        index += 1    return returnMat, classLabelVector#归一化特征值def autoNorm(dataSet):    minVals = dataSet.min(0)    maxVals = dataSet.max(0)    ranges = maxVals - minVals    normDataSet = zeros(shape(dataSet))    m = dataSet.shape[0]    normDataSet = dataSet - tile(minVals, (m, 1))    normDataSet = normDataSet / tile(ranges, (m, 1))    return normDataSet, ranges, minVals#分类器针对约会网站的测试代码 hoRatio是测试样本占总样本数的比例def datingClassTest(hoRatio):    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt', 3)    normMat, ranges, minVals = autoNorm(datingDataMat)    m = normMat.shape[0]    numTestVecs = int(m * hoRatio)    errorCount = 0.0    for i in range(numTestVecs):        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)        print "the classifier came back with: %d, the real answer is: %d" %(classifierResult, datingLabels[i])        if(classifierResult != datingLabels[i]):    errorCount += 1.0    print "the total error rate is: %f" %(errorCount/float(numTestVecs))

当把测试数据占的比例设为0.1是,错误率仅为0.05

>>> import kNN>>> reload(kNN)
>>> kNN.datingClassTest(0.1)the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 3, the real answer is: 3the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 3, the real answer is: 3the classifier came back with: 3, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 3the classifier came back with: 1, the real answer is: 1the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 3the classifier came back with: 3, the real answer is: 3the classifier came back with: 2, the real answer is: 2the classifier came back with: 1, the real answer is: 1the classifier came back with: 3, the real answer is: 1the total error rate is: 0.050000


0 0
原创粉丝点击