[Algorithm 3] The k-means Algorithm


In interviews, when you ask a candidate which algorithms they are familiar with, the usual answer is "clustering, classification, that sort of thing." And when you follow up with "which clustering algorithms do you know?", the answer is almost always k-means...


Just how deeply rooted is k-means? Among the ten classic machine learning algorithms it ranks second, which says a lot.


Of course, clustering is not just k-means. K-means is simple, easy to understand, and fast to compute, but it has limitations: for example, the number of clusters k must be fixed in advance. When the number of clusters is unknown, you cannot reliably determine k just by enumerating candidate values and comparing an F-score, so hierarchical clustering is a natural alternative.


I will write a hierarchical-clustering demo later. Today I wrote a k-means Python class that uses the distance functions from [Algorithm 2] (http://blog.csdn.net/whzhcahzxh/article/details/24933585): it supports cosine similarity, Manhattan distance, Chebyshev distance (probably not very useful here, but kept anyway), and Euclidean distance. The choice of distance determines the shape of the resulting clusters (Euclidean gives round clusters, Manhattan gives diamond-shaped ones, and so on).
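Since the distanceCalculate class lives in the [Algorithm 2] post, here is a minimal sketch of what it might look like, so that the demo below is self-contained. The method names and the (list, list) -> float signatures are assumptions inferred from how the k-means class calls them; the cosine variant is written as 1 minus cosine similarity so that a smaller value still means "closer".

# Hedged sketch of the distanceCalculate class imported by the demo below;
# the real implementation is in the [Algorithm 2] post.
import math

class distanceCalculate(object):

    def euclideanDistance(self, a, b):
        # straight-line distance: sqrt(sum((a_i - b_i)^2))
        return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

    def manhattanDistance(self, a, b):
        # city-block distance: sum(|a_i - b_i|)
        return sum(abs(x - y) for x, y in zip(a, b))

    def chebyshevDistance(self, a, b):
        # maximum coordinate difference: max(|a_i - b_i|)
        return max(abs(x - y) for x, y in zip(a, b))

    def cosineDistance(self, a, b):
        # 1 - cosine similarity, so argmin over the distance matrix
        # still picks the most similar centroid
        dot = sum(x * y for x, y in zip(a, b))
        normA = math.sqrt(sum(x * x for x in a))
        normB = math.sqrt(sum(y * y for y in b))
        if normA == 0 or normB == 0:
            return 1.0
        return 1.0 - dot / (normA * normB)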


The k-means algorithm, step by step:

Input: the number of clusters k and a dataset of n objects.

Output: a set of k clusters.

Method:

  1. Arbitrarily pick k points as the initial cluster centers;
  2. Based on the cluster means, assign each object to the most similar cluster;
  3. Update the cluster means, i.e. recompute the mean of the objects in each cluster;
  4. Repeat steps 2 and 3, stopping after a fixed number of iterations or once the assignments no longer change (a small numpy sketch of one such iteration follows this list).
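To make steps 2 and 3 concrete, here is a minimal sketch of one assignment-and-update iteration in plain numpy; the toy data and the initial centroids are made up.

import numpy

# toy data: five points in three dimensions, and k = 2 made-up starting centroids
data = numpy.array([[7, 2, 5], [2, 4, 7], [2, 5, 7], [6, 8, 9], [9, 2, 3]], dtype=float)
centroids = numpy.array([[1.0, 1.0, 1.0], [8.0, 8.0, 8.0]])

# step 2: assign every point to the nearest centroid (Euclidean distance)
distances = numpy.array([[numpy.linalg.norm(p - c) for c in centroids] for p in data])
labels = distances.argmin(axis=1)

# step 3: move every centroid to the mean of the points assigned to it
for j in range(len(centroids)):
    members = data[labels == j]
    if len(members) > 0:        # an empty cluster keeps its old location
        centroids[j] = members.mean(axis=0)

# steps 2 and 3 are repeated until the assignments (or centroids) stop changing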

The k-means result depends on k and on the randomly chosen initial centers, so running k-means several times and comparing the runs helps avoid getting stuck in a poor local optimum (a restart sketch using the class below follows the demo).

python demo
#coding:UTF-8
'''
Created on 2014-05-04

@author: hao
'''
from random import uniform

import numpy

from distance.distanceClasses import distanceCalculate


class kmeansClustering():

    def __init__(self, dataset, clusterNumber, iterationTime=5, distanceClass=1):
        '''
        distanceClass = 2: cosine distance
        distanceClass = 3: Manhattan distance
        distanceClass = 4: Chebyshev distance
        anything else (1): Euclidean distance
        '''
        # check clusterNumber
        if clusterNumber < 1 or int(clusterNumber) != clusterNumber:
            raise ValueError('Error: clusterNumber should be a positive integer!')
        self.clusterNumber = clusterNumber

        # check iterationTime (-1 means "iterate until convergence")
        if iterationTime != -1:
            if iterationTime < 1 or int(iterationTime) != iterationTime:
                raise ValueError('Error: iterationTime should be a positive integer, or -1 for no limit!')
        self.iterationTime = iterationTime

        # distance functions from the [Algorithm 2] post
        self.distanceClass = distanceCalculate()

        # check dataset
        if not isinstance(dataset, numpy.ndarray):
            raise ValueError('Error: input data should be a numpy.ndarray, '
                             'e.g. numpy.array([[1,2],[2,4],[6,7]]) for three 2-d points')

        self.dataset = numpy.matrix(dataset)

        # a ragged input produces an object array whose elements are lists
        if isinstance(self.dataset.getA1()[0], list):
            raise ValueError('Error: input data should be rectangular, e.g. 300x200')

        self.datasetShape = self.dataset.shape
        self.clusterDimension = self.datasetShape[1]
        self.dataSetNum = self.datasetShape[0]

        self.kmeansClasses = list()

        # initial centroids: random locations inside the data's value range
        maxValue = self.dataset.max()
        minValue = self.dataset.min()
        tempCenters = [[uniform(minValue, maxValue) for j in range(self.clusterDimension)]
                       for i in range(self.clusterNumber)]
        self.centroids = numpy.matrix(numpy.array(tempCenters))
        self.tempcentroids = numpy.matrix(numpy.array(tempCenters))

        self.clustering(iterationTime, distanceClass=distanceClass)

    def getDataSet(self):
        '''get the matrix-type dataset'''
        return self.dataset

    def getCentroids(self):
        '''get the final centroid locations'''
        return self.centroids

    def getClasses(self):
        '''get the final cluster index of every point'''
        return self.kmeansClasses

    def clustering(self, iterationTime, distanceClass=1):
        '''alternate between assigning points and recomputing centroids'''
        if distanceClass == 2:
            distance = self.distanceClass.cosineDistance
        elif distanceClass == 3:
            distance = self.distanceClass.manhattanDistance
        elif distanceClass == 4:
            distance = self.distanceClass.chebyshevDistance
        else:
            distance = self.distanceClass.euclideanDistance

        iteration = 0
        # iterationTime == -1 means: keep going until the centroids stop moving
        while iterationTime == -1 or iteration < iterationTime:
            iteration += 1

            # distance from every point to every centroid
            distanceMatrix = numpy.matrix([[distance(list(centroid.getA1()), list(data.getA1()))
                                            for centroid in self.centroids]
                                           for data in self.dataset])
            # assign each point to its nearest centroid
            tempClusterIndex = distanceMatrix.argmin(1)

            # accumulate the coordinate sums and the member count of each cluster
            centerChanged = [0] * self.clusterNumber
            newCentroids = [[0] * self.clusterDimension for j in range(self.clusterNumber)]
            for j in range(self.dataSetNum):
                idx = tempClusterIndex.A1[j]
                newCentroids[idx] = [x + y for (x, y) in zip(newCentroids[idx], self.dataset[j].getA1())]
                centerChanged[idx] += 1

            # new centroid = mean of its members; empty clusters keep their old location
            for j in range(self.clusterNumber):
                if centerChanged[j] != 0:
                    self.centroids[j] = [float(x) / centerChanged[j] for x in newCentroids[j]]

            self.kmeansClasses = tempClusterIndex.A1

            # stop early once the centroids no longer move
            if list(self.tempcentroids.A1) == list(self.centroids.A1):
                break
            self.tempcentroids = self.centroids.copy()


if __name__ == '__main__':
    data = numpy.array([[7, 2, 5], [2, 4, 7], [2, 5, 7], [6, 8, 9], [9, 2, 3]])

    testKmeans = kmeansClustering(dataset=data, clusterNumber=2, iterationTime=201, distanceClass=1)

    print testKmeans.getCentroids()
    print testKmeans.getClasses()
    print testKmeans.getDataSet()
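As mentioned above, a common way to soften the dependence on the random initial centers is to restart k-means several times and keep the best run. The class does not expose a quality score, so the sketch below (an addition, not part of the original demo) computes the within-cluster sum of squared Euclidean distances from getDataSet()/getCentroids()/getClasses() and keeps the run with the smallest value; it assumes the kmeansClustering class above is in scope.

import numpy

def withinClusterSSE(model):
    # sum over all points of the squared Euclidean distance to their centroid
    data = model.getDataSet().getA()
    centroids = model.getCentroids().getA()
    labels = model.getClasses()
    total = 0.0
    for point, label in zip(data, labels):
        diff = point - centroids[int(label)]
        total += float(numpy.dot(diff, diff))
    return total

data = numpy.array([[7, 2, 5], [2, 4, 7], [2, 5, 7], [6, 8, 9], [9, 2, 3]])

bestModel, bestScore = None, None
for run in range(10):  # 10 restarts, each starting from fresh random centers
    model = kmeansClustering(dataset=data, clusterNumber=2, iterationTime=201, distanceClass=1)
    score = withinClusterSSE(model)
    if bestScore is None or score < bestScore:
        bestModel, bestScore = model, score

print bestModel.getCentroids()
print bestScore

The same score can be used to compare different values of k, keeping in mind that it always decreases as k grows, so it cannot pick k on its own.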


