k-Means聚类算法实现--基于西瓜数据4.0

来源:互联网 发布:java文件上传原理 编辑:程序博客网 时间:2024/05/21 17:17

本文是k均值聚类算法源代码

语言:Python;数据集:西瓜数据4.0.xlsx

使用的库:

import numpy as npimport mathimport xlrdimport randomimport matplotlib.pyplot as plt
从xlsx中导入数据:

def loadData(filename):
    """Load every row of the first worksheet of an Excel workbook.

    Args:
        filename: Path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        A list with one entry per worksheet row; each entry is the list of
        cell values returned by ``row_values`` for that row.
    """
    # NOTE(review): xlrd 2.0+ reads only legacy .xls files; if the data set
    # is a .xlsx workbook, confirm the installed xlrd version can open it.
    workbook = xlrd.open_workbook(filename)
    table = workbook.sheets()[0]
    # A single pre-formatted argument prints identical text under both
    # Python 2 and Python 3 (the original was a Python-2-only statement).
    print("table:  %s" % (table,))
    return [table.row_values(i) for i in range(table.nrows)]
计算欧氏距离:

def lengthcalc(inX, inY):
    """Return the Euclidean distance between two vectors.

    Args:
        inX: First vector; a NumPy array or any array-like of numbers.
        inY: Second vector with the same number of elements as ``inX``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    # asarray accepts plain lists/tuples as well as arrays, and ravel lets
    # row vectors of shape (1, n) work the same as flat vectors of shape (n,).
    first = np.asarray(inX, dtype=float).ravel()
    second = np.asarray(inY, dtype=float).ravel()
    diff = first - second
    # The dot product of a 1-D vector with itself is the squared norm.
    return np.sqrt(np.dot(diff, diff))

选择最小距离,返回距离样本最近的簇的索引:

def minlength(inX, cluster):
    """Return the index of the cluster centre closest to a sample.

    Distances are Euclidean. When several centres are equally close, the
    lowest index is returned.

    Args:
        inX: The sample, as an array-like of numbers.
        cluster: Sequence of cluster centres, each with the same number of
            elements as ``inX``.

    Returns:
        The integer position in ``cluster`` of the nearest centre.

    Raises:
        ValueError: If ``cluster`` contains no centres.
    """
    centres = np.asarray(cluster, dtype=float)
    sample = np.asarray(inX, dtype=float).ravel()
    if centres.shape[0] == 0:
        # The original returned np.inf here, which is not a usable index.
        raise ValueError("cluster must contain at least one centre")
    # TODO: features are compared on their raw scale; normalise the data
    # before computing distances.
    offsets = centres.reshape(centres.shape[0], -1) - sample
    distances = np.sqrt((offsets ** 2).sum(axis=1))
    return int(np.argmin(distances))

k均值聚类实现:

def kMeans(data, k, max_iter=500):
    """Partition samples into ``k`` clusters with the k-means algorithm.

    Initial centres are ``k`` distinct rows drawn at random from ``data``.
    Each iteration assigns every sample to its nearest centre (Euclidean
    distance) and then moves each centre to the mean of its samples. The
    loop stops when the assignment no longer changes or after ``max_iter``
    iterations.

    Args:
        data: 2-D array-like of numbers, one sample per row.
        k: Number of clusters to form.
        max_iter: Upper bound on the number of iterations.

    Returns:
        A tuple ``(cluster, dic)``. ``cluster`` is a list of ``k`` centre
        vectors. ``dic`` maps a cluster index to the list of samples (each
        a plain list) assigned to it in the last iteration; a cluster that
        received no samples has no key in ``dic``.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of samples.
    """
    dataset = np.asarray(data, dtype=float)
    sample_count = dataset.shape[0]
    if not 1 <= k <= sample_count:
        raise ValueError("k must be between 1 and the number of samples")

    # Draw row indices rather than rows: random.sample requires a real
    # sequence on recent Python versions, and an ndarray is not one.
    cluster = [dataset[i].copy() for i in random.sample(range(sample_count), k)]

    dic = {}
    previous = {}
    for _ in range(max_iter):
        centres = np.array(cluster)
        # distances[s, c] is the distance from sample s to centre c.
        gaps = dataset[:, np.newaxis, :] - centres[np.newaxis, :, :]
        distances = np.sqrt((gaps ** 2).sum(axis=2))
        nearest = distances.argmin(axis=1)

        dic = {}
        for row, index in zip(dataset, nearest):
            dic.setdefault(int(index), []).append(row.tolist())

        for index in range(k):
            members = dic.get(index)
            # A cluster that received no samples keeps its previous centre
            # (the original raised KeyError in this case).
            if members:
                cluster[index] = np.array(members).mean(axis=0)

        if dic and dic == previous:
            break
        previous = dic
    return cluster, dic

聚类完成后,通过散点图看聚类结果:

def figplot(dic):
    """Draw a scatter plot of clustered samples, one colour per cluster.

    Args:
        dic: Mapping from integer cluster index to that cluster's samples.
            Column 0 of each sample is plotted on the x axis and column 1
            on the y axis.
    """
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    col = ['r', 'b', 'g', 'k', 'y', 'm']
    for key, members in dic.items():
        points = np.array(members)
        # Wrap around the palette so that more than six clusters reuse
        # colours instead of raising IndexError.
        ax.scatter(points[:, 0], points[:, 1], color=col[key % len(col)])
    fig.show()

为了自动选择簇的个数,使用DBI(Davies-Bouldin指数,值越小说明聚类效果越好)作为衡量标准。关于DBI,请参考周志华《机器学习》和文章 http://blog.sina.com.cn/s/blog_65c8baf901016flh.html

def DBIcalc(cul, dic):
    """Compute the Davies-Bouldin index (DBI) of a clustering.

    For every cluster the largest ``similar`` value against any other
    cluster is taken, and the index is the mean of those maxima.

    Args:
        cul: Sequence of cluster centres, indexable by cluster index.
        dic: Mapping from cluster index to that cluster's samples; the keys
            are expected to be ``0 .. len(dic) - 1``.

    Returns:
        The index as a float; ``0.0`` when there is only one cluster.
    """
    k = len(dic)
    total = 0.0
    for i in range(k):
        worst = 0
        for j in range(k):
            if j == i:
                continue
            sim = similar(dic[i], dic[j], cul[i], cul[j])
            if sim > worst:
                worst = sim
        total += worst
    return 1.0 / k * total


def avgC(data):
    """Return the mean pairwise Euclidean distance within one cluster.

    Args:
        data: 2-D array-like holding the cluster's samples, one per row.

    Returns:
        ``2 / (n * (n - 1))`` times the sum of the distances over all
        unordered pairs of the ``n`` samples; ``0.0`` when the cluster has
        fewer than two samples.
    """
    samples = np.asarray(data, dtype=float)
    count = samples.shape[0]
    if count < 2:
        # No pairs exist; this also avoids dividing by n * (n - 1) == 0.
        return 0.0
    gaps = samples[:, np.newaxis, :] - samples[np.newaxis, :, :]
    pair_distances = np.sqrt((gaps ** 2).sum(axis=2))
    # Take only the strict upper triangle so each unordered pair is counted
    # once (the original summed ordered pairs and doubled the result).
    upper = np.triu_indices(count, k=1)
    return 2.0 / (count * (count - 1)) * pair_distances[upper].sum()


def similar(dataX, dataY, inX, inY):
    """Return the DBI similarity ratio of two clusters.

    Args:
        dataX: Samples of the first cluster.
        dataY: Samples of the second cluster.
        inX: Centre of the first cluster.
        inY: Centre of the second cluster.

    Returns:
        ``(avgC(dataX) + avgC(dataY))`` divided by the Euclidean distance
        between the two centres.
    """
    centre_gap = (np.asarray(inX, dtype=float).ravel()
                  - np.asarray(inY, dtype=float).ravel())
    centre_distance = np.sqrt(np.dot(centre_gap, centre_gap))
    return float(avgC(dataX) + avgC(dataY)) / centre_distance

选择最佳簇个数的函数:

def bestCluster(data, candidates=(2, 3, 4)):
    """Pick the number of clusters that gives the lowest DBI.

    Runs ``kMeans`` once for every candidate cluster count, scores the
    result with ``DBIcalc`` and keeps the count with the smallest score.

    Args:
        data: The samples, passed unchanged to ``kMeans``.
        candidates: Cluster counts to try, in order.

    Returns:
        The candidate with the lowest DBI (the earliest one on a tie), or
        ``None`` if no candidate produced a DBI below infinity.
    """
    best_k = None
    best_dbi = np.inf
    for k in candidates:
        cul, dic = kMeans(data, k)
        currdbi = DBIcalc(cul, dic)
        if currdbi < best_dbi:
            best_dbi = currdbi
            # Remember which k achieved the score; the original returned the
            # loop variable, i.e. always the last candidate tried.
            best_k = k
    return best_k