K-Means算法

来源:互联网 发布:自媒体是什么 知乎 编辑:程序博客网 时间:2024/06/04 18:56
from numpy import *def loadDataSet(fileName):    dataSet = []    fr = open(fileName)    for line in fr.readlines():        curLine = line.strip().split('\t')        fltLine = map(float, curLine)        dataSet.append(fltLine)    return dataSetdef distEclud(vecA, vecB):    return sqrt(sum(power(vecA-vecB, 2)))def randCent(dataSet, k):    n = shape(dataSet)[1]    centroids = mat(zeros((k, n)))    for j in range(n):        minJ = min(dataSet[:, 0])        rangeJ = float(max(dataSet[:, j]) - minJ)        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)    return centroidsdef kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):    m = shape(dataSet)[0]    clusterAssment = mat(zeros((m, 2)))    centroids = createCent(dataSet, k)    clusterChanged = True    while clusterChanged:        clusterChanged = False        for i in range(m):            minDist = inf; minIndex = -1            for j in range(k):                distJI = distMeas(centroids[j, :], dataSet[i, :])                if distJI < minDist:                    minDist = distJI; minIndex = j            if clusterAssment[i, 0] != minIndex:                clusterChanged = True            clusterAssment[i, :] = minIndex, minDist ** 2        # print centroids        for cent in  range(k):            ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]            # print nonzero(clusterAssment[:, 0].A == cent)[0]            centroids[cent,:] = mean(ptsInClust)    return centroids, clusterAssmentdef biKmeans(dataSet, k, distMeans=distEclud):    m = shape(dataSet)[0]    clusterAssment = mat(zeros((m, 2)))    centroid0 = mean(dataSet, axis = 0).tolist()[0]    centList = [centroid0]    for j in range(m):        clusterAssment[j, 1] = distMeans(mat(centroid0), dataSet[j, :]) ** 2    while(len(centList) < k):        lowestSSE = inf        for i in range(len(centList)):            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0],:]            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeans)            sseSplit = sum(splitClustAss[:, 1])            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])            print "sseSplit, and notSplit: ",sseSplit,sseNotSplit            if (sseSplit + sseNotSplit) < lowestSSE:                bestCentToSplit = i                bestNewCents = centroidMat                bestClustAss = splitClustAss.copy()                lowestSSE = sseSplit + sseNotSplit        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit        print 'the bestCentToSplit is: ',bestCentToSplit        print 'the len of bestClustAss is: ', len(bestClustAss)        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]        centList.append(bestNewCents[1,:].tolist()[0])        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss    return mat(centList), clusterAssment
0 0
原创粉丝点击