## prepare: importing data with python
# NOTE: the star import is kept because the rest of this module (and code that
# star-imports this module) relies on the bare numpy names (array, zeros, ...).
# It shadows builtins such as sum/min/max, so those builtins are deliberately
# not used below.
from numpy import *
import operator
import os

# Default data locations used by the demo functions; pass other paths to the
# functions to run them elsewhere.
DATING_DATA_FILE = r'c:/Users/ll/Documents/datingTestSet2.txt'
TRAINING_DIGITS_DIR = r'c:/Users/ll/Documents/trainingDigits'
TEST_DIGITS_DIR = r'c:/Users/ll/Documents/testDigits'

# raw_input only exists on Python 2; input is its Python 3 equivalent.
try:
    _readInput = raw_input
except NameError:
    _readInput = input


def createDataSet():
    """Return a tiny hard-coded training set.

    Returns:
        (group, labels): a 4x2 array of points and the list of their
        class labels ('A' or 'B').
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


group, labels = createDataSet()


## k-Nearest Neighbors algorithm
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training rows.

    Args:
        inX: the sample to classify (sequence or array with one value per
            column of dataSet).
        dataSet: 2-D array with one training sample per row.
        labels: sequence of class labels, one per row of dataSet.
        k: number of nearest neighbours that vote. Values larger than the
            number of training rows are clamped to that number.

    Returns:
        The label with the most votes among the k nearest neighbours.
    """
    dataSetSize = dataSet.shape[0]
    # Broadcasting subtracts inX from every row; no need to tile it first.
    diffMat = asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # row-wise sum of squared differences
    distances = sqDistances ** 0.5  # Euclidean distance to each training row
    sortedDistIndicies = distances.argsort()  # row indices, nearest first
    # Clamp k so a small training set does not raise IndexError.
    neighbours = k if k < dataSetSize else dataSetSize
    classCount = {}  # label -> number of votes
    for i in range(neighbours):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sorted() leaves classCount untouched; itemgetter(1) sorts by vote count.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


## text record to numpy parsing code
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line must hold at least three numeric fields followed by
    an integer class label in the last field.

    Args:
        filename: path of the text file to read.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array built from the
        first three fields of each line, and the list of n integer labels.
    """
    # Read once and close the file; blank lines (e.g. a trailing newline)
    # are skipped instead of crashing the parser.
    with open(filename) as fr:
        records = [line.strip() for line in fr]
    records = [line for line in records if line]
    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, line in enumerate(records):
        listFromLine = line.split('\t')
        # Fields 0, 1 and 2 (the slice end is exclusive).
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


##create scatter plot with matplotlib
def plotDatingData(datingDataMat):
    """Show a scatter plot of column 1 against column 2 of datingDataMat.

    matplotlib is imported here rather than at module level so that
    importing this module neither requires matplotlib nor opens a window.
    """
    import matplotlib
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)  # 1 row, 1 column, first subplot
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    plt.show()


## data-normalizing code
def autoNorm(dataSet):
    """Rescale every column of dataSet linearly onto [0, 1].

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        (normDataSet, ranges, minVals): the normalised array, the per-column
        range (max - min) and the per-column minimum. A constant column has
        range 0 and is normalised to 0 rather than NaN; note that the
        returned ranges still contains that 0.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Avoid 0/0 for constant columns: divide those by 1 instead.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


##classifier testing code for dating site
def datingClassTest(filename=DATING_DATA_FILE, hoRatio=0.10):
    """Hold out the first hoRatio of the dating data and print the error rate.

    Args:
        filename: path of the dating data file read by file2matrix.
        hoRatio: fraction of the rows (taken from the top of the file) used
            as test samples; the remaining rows are the training set.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of held-out test rows
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("%d  the classifier came back with: %d, the real answer is: %d"
              % (i, classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))


##dating site predictor function
def classifyPerson(filename=DATING_DATA_FILE):
    """Prompt for three features of a person and print the predicted class.

    Args:
        filename: path of the dating data file used as the training set.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    personTats = float(_readInput("percentage of time spent playing video games?"))
    ffMiles = float(_readInput("frequent flier miles earned per year?"))
    iceCream = float(_readInput("liters of ice cream consued per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, personTats, iceCream])
    # The new sample is normalised with the training set's minimum and range.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are 1-based, resultList is 0-based.
    print("You will probably like this person:  %s" % resultList[classifierResult - 1])


def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    Args:
        filename: path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        A (1, 1024) float array holding the image row by row.
    """
    returnVect = zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):  # every image is 32 rows by 32 columns
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def _digitFromFileName(fileNameStr):
    """Return the integer before the first '_' of a name such as '7_12.txt'."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


##handwritten digits testing code
def handwritingClassTest(trainingDir=TRAINING_DIGITS_DIR, testDir=TEST_DIGITS_DIR):
    """Classify every image in testDir with kNN (k=3) and print the error rate.

    File names must start with the true digit followed by '_', e.g.
    '7_12.txt'; that prefix is used as the class label.

    Args:
        trainingDir: directory holding the training images.
        testDir: directory holding the images to classify.
    """
    hwLabels = []
    trainingFileList = os.listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        hwLabels.append(_digitFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))
    # The evaluation must run inside the function: it reads trainingMat and
    # hwLabels, which only exist in this scope.
    testFileList = os.listdir(testDir)
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of error is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
