k均值聚类算法

来源:互联网 发布:人工智能的优点英语 编辑:程序博客网 时间:2024/05/16 15:34

import time

import numpy as np
from numpy import *  # noqa: F401,F403 - kept so names other code may rely on stay in scope

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Nothing below draws a plot, so a missing matplotlib must not stop
    # the clustering code from being imported.
    plt = None


def _as_2d_float(dataset):
    """Return *dataset* as a 2-D float ndarray of shape (samples, features).

    Accepts ndarrays, ``numpy.matrix`` objects and nested sequences.

    Raises:
        ValueError: if the converted data is not two-dimensional.
    """
    data = np.asarray(dataset, dtype=np.float64)
    if data.ndim != 2:
        raise ValueError("dataset must be 2-D (samples x features)")
    return data


def e_distance(v1, v2):
    """Return the Euclidean distance between two points.

    The squared differences are summed over every element, so 1-D vectors
    and 1 x d matrix rows are both accepted.
    """
    diff = np.asarray(v2, dtype=np.float64) - np.asarray(v1, dtype=np.float64)
    return np.sqrt(np.sum(np.power(diff, 2)))


def initcentroids(dataset, k):
    """Pick *k* samples at random to serve as the initial centroids.

    Distinct samples are drawn whenever the dataset has at least *k* rows,
    so two centroids only start on the same sample when that is unavoidable.

    Args:
        dataset: 2-D array-like, one sample per row.
        k: number of centroids to pick (>= 1).

    Returns:
        A (k, dim) float ndarray whose rows are copies of dataset rows.

    Raises:
        ValueError: if *k* < 1, the dataset is empty, or it is not 2-D.
    """
    data = _as_2d_float(dataset)
    numsamples = data.shape[0]  # ``shape`` is an attribute, not a method
    if k < 1:
        raise ValueError("k must be at least 1")
    if numsamples == 0:
        raise ValueError("dataset must contain at least one sample")
    # Fall back to sampling with replacement only when there are fewer
    # samples than requested centroids.
    picks = np.random.choice(numsamples, size=k, replace=k > numsamples)
    return data[picks, :]


def kmeans(dataset, k):
    """Cluster *dataset* into *k* groups with the k-means algorithm.

    Args:
        dataset: 2-D array-like, one sample per row.
        k: number of clusters (>= 1).

    Returns:
        A ``(centroids, clu_ass)`` tuple. ``centroids`` is a (k, dim) float
        ndarray. ``clu_ass`` is a (numsamples, 2) ``numpy.matrix`` whose
        first column holds each sample's cluster index and whose second
        column holds the squared distance to that cluster's centroid as
        measured during the final assignment pass.

    Raises:
        ValueError: if *k* < 1, the dataset is empty, or it is not 2-D.
    """
    data = _as_2d_float(dataset)
    numsamples = data.shape[0]
    centroids = initcentroids(data, k)

    # Start every label at -1 so the first pass always registers as a
    # change; starting at 0 would look like a real assignment to cluster 0
    # and could end the loop before the centroids have settled.
    labels = np.full(numsamples, -1, dtype=np.int64)
    sq_dists = np.zeros(numsamples)
    rows = np.arange(numsamples)

    clu_change = True
    while clu_change:
        # Squared distance from every sample to every centroid: (n, k).
        diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        all_sq = np.sum(diffs ** 2, axis=2)
        # argmin returns the first minimum, so ties go to the lowest
        # cluster index, and no distance sentinel is needed.
        nearest = np.argmin(all_sq, axis=1)

        clu_change = bool(np.any(nearest != labels))
        labels = nearest
        sq_dists = all_sq[rows, nearest]

        for j in range(k):
            members = data[labels == j]
            # An empty cluster keeps its previous centroid; averaging zero
            # rows would turn it into NaN.
            if members.shape[0] > 0:
                centroids[j, :] = np.mean(members, axis=0)

    print('mission complete')
    # ``asmatrix`` keeps the matrix return type; the ``mat`` alias is gone
    # from NumPy 2.0.
    clu_ass = np.asmatrix(np.column_stack((labels.astype(np.float64), sq_dists)))
    return centroids, clu_ass



目前还没有给出运算结果，也没有数据可视化的程序。关于数据可视化，后面会单独写一篇笔记。