K-means 聚类代码的详细注释

来源：互联网 | 发布：淮南腾讯大数据 | 编辑：程序博客网 | 时间：2024/06/05 15:28

# coding: utf-8
"""K-means clustering demo with detailed annotations.

The module provides a small k-means implementation (``kmeans``), its helpers
(``euclDistance``, ``initCentroids``), a 2-D plotting routine (``showCluster``)
and a script entry point that loads tab-separated samples, clusters them into
four groups and plots the result.

The code runs unchanged on Python 2 and Python 3.
"""
import time  # kept from the original listing; not used by the code below

import numpy as np


def euclDistance(vector1, vector2):
    """Return the Euclidean distance between two vectors.

    d = sqrt((x1 - x2)^2 + (y1 - y2)^2 + ...): the element-wise differences
    are squared, summed over all elements, and the square root is taken.

    Args:
        vector1: array-like of numbers.
        vector2: array-like of numbers, broadcastable against ``vector1``.

    Returns:
        The distance as a NumPy float scalar.
    """
    diff = np.asarray(vector1, dtype=float) - np.asarray(vector2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))


def initCentroids(dataSet, k):
    """Pick ``k`` random samples of ``dataSet`` as the initial centroids.

    Args:
        dataSet: 2-D array-like with one sample per row (numSamples x dim).
        k: number of centroids to draw.

    Returns:
        A new ``(k, dim)`` float ndarray whose rows are copies of samples.

    Raises:
        ValueError: if ``dataSet`` is not 2-D or contains no samples.
    """
    data = np.asarray(dataSet, dtype=float)
    # numSamples is the number of rows; the number of columns is not needed
    # because whole rows are copied.
    numSamples = data.shape[0]
    if data.ndim != 2 or numSamples == 0:
        raise ValueError("dataSet must be a non-empty 2-D array")
    # Draw distinct rows whenever possible: two centroids seeded from the same
    # sample would leave one cluster empty from the first iteration on.
    indices = np.random.choice(numSamples, size=k, replace=k > numSamples)
    return data[indices]  # fancy indexing copies the selected rows


def kmeans(dataSet, k):
    """Cluster the rows of ``dataSet`` into ``k`` groups (Lloyd's algorithm).

    Args:
        dataSet: 2-D array-like with one sample per row (numSamples x dim).
        k: number of clusters, at least 1.

    Returns:
        A tuple ``(centroids, clusterAssment)``:
        * ``centroids`` is a ``(k, dim)`` ndarray of cluster centres.
        * ``clusterAssment`` is a ``(numSamples, 2)`` matrix; column 0 holds
          the index of the cluster each sample belongs to, column 1 the
          squared distance between the sample and that cluster's centroid.

    Raises:
        ValueError: if ``k`` < 1 or ``dataSet`` is not a non-empty 2-D array.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    data = np.asarray(dataSet, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("dataSet must be a non-empty 2-D array")
    numSamples = data.shape[0]

    # First column stores which cluster the sample belongs to, second column
    # stores the squared error between the sample and its centroid.
    assignment = np.zeros((numSamples, 2))
    # -1 means "not assigned yet": the first pass therefore always counts as
    # a change, which guarantees one more pass against the updated centroids.
    assignment[:, 0] = -1

    # step 1: init centroids
    centroids = initCentroids(data, k)

    rows = np.arange(numSamples)
    clusterChanged = True
    while clusterChanged:
        # step 2: find the closest centroid of every sample.  sqDists[i, j]
        # is the squared distance from sample i to centroid j; argmin keeps
        # the lowest index on ties, and no fixed upper bound on the distance
        # is assumed.
        diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        sqDists = np.sum(diffs ** 2, axis=2)
        nearest = np.argmin(sqDists, axis=1)

        # step 3: update the assignment.  The squared error is refreshed on
        # every pass, not only when a sample switches cluster, so it always
        # refers to the centroids that were used for this assignment.
        clusterChanged = bool(np.any(nearest != assignment[:, 0]))
        assignment[:, 0] = nearest
        assignment[:, 1] = sqDists[rows, nearest]

        # step 4: move every centroid to the mean of its members.
        for j in range(k):
            pointsInCluster = data[nearest == j]
            # An empty cluster keeps its previous centroid; the mean of zero
            # points would be NaN.
            if len(pointsInCluster) > 0:
                centroids[j, :] = pointsInCluster.mean(axis=0)

    print('Congratulatons, cluster complete!')
    # Returned as a matrix to keep the return type of the original listing.
    return centroids, np.asmatrix(assignment)


def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot the samples and centroids of a clustering; 2-D data only.

    Args:
        dataSet: 2-D array-like of samples (numSamples x 2).
        k: number of clusters; at most 10 marker styles are available.
        centroids: ``(k, 2)`` array of cluster centres.
        clusterAssment: 2-D array/matrix whose column 0 holds the cluster
            index of every sample (as returned by ``kmeans``).

    Returns:
        1 if nothing was drawn (data is not 2-D or ``k`` is too large),
        otherwise None after the figure window has been shown.
    """
    data = np.asarray(dataSet)
    numSamples, dim = data.shape
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry your k is too large! Please contact Zouxy")
        return 1

    # Imported here so that the clustering functions above stay usable on
    # machines without a plotting backend.
    import matplotlib.pyplot as plt

    # draw all samples
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(data[i, 0], data[i, 1], mark[markIndex])

    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # draw the centroids
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)

    # Show once, after everything is drawn.  To write the figure to disk
    # instead, call plt.savefig('foo1.png') before plt.show().
    plt.show()


def loadDataSet(fileName):
    """Read tab-separated 2-D samples from a text file.

    Args:
        fileName: path of a file with two tab-separated numbers per line.

    Returns:
        A list of ``[x, y]`` float pairs, one per non-blank line.
    """
    dataSet = []
    with open(fileName) as fileIn:
        for line in fileIn:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            lineArr = line.strip().split('\t')
            dataSet.append([float(lineArr[0]), float(lineArr[1])])
    return dataSet


def main():
    """Load the sample file, cluster it into four groups and plot the result."""
    ## step 1: load data
    print("step 1: load data...")
    dataSet = loadDataSet('/home/amos/machine_learning')

    ## step 2: clustering...
    print("step 2: clustering...")
    dataSet = np.asarray(dataSet)
    k = 4
    centroids, clusterAssment = kmeans(dataSet, k)

    ## step 3: show the result
    print("step 3: show the result...")
    showCluster(dataSet, k, centroids, clusterAssment)


if __name__ == '__main__':
    main()

# "Step 2" can instead be implemented by calling KMeans from the scikit-learn
# library (sklearn.cluster) directly:
#
#     print "step 2: clustering..."
#     dataSet = mat(dataSet)
#     k = 4
#     ############################################################
#     # by importing KMeans from sklearn.cluster
#     # call KMeans and get the same result as kmeans.py
#     kmeans = KMeans(n_clusters=k, random_state=0).fit(dataSet)
#     centroids = kmeans.cluster_centers_
#     clusterAssment = kmeans.labels_
#     ############################################################
#
# In "step 3", the "# draw all samples" part of
# showCluster(dataSet, k, centroids, clusterAssment) must then change
#     markIndex = int(clusterAssment[i, 0])
# to
#     markIndex = int(clusterAssment[i])
# because kmeans.labels_ is a one-dimensional array of cluster indices.

0 0