谱聚类--SpectralClustering

来源：互联网发布：淘宝全屏导航不固定编辑：程序博客网时间：2024/06/09 19:41

谱聚类一般会先对两两样本间求相似度，然后根据相似度矩阵求出拉普拉斯矩阵，再将每个样本映射到拉普拉斯矩阵的特征向量空间中，最后使用k-means聚类。

scikit-learn开源包中已经有现成的接口可以使用，具体见

http://scikit-learn.org/dev/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering

写了一个测试例子

构造二维空间样本点，并计算两两样本间的相似度：

#!/usr/bin/env python
"""Generate the sample data for the spectral-clustering example.

Two tab-separated files are written to the current directory:

* ``points.txt``   -- one line per point: index, x, y.
* ``sim_pnts.txt`` -- one line per ordered pair of points (each point is
  also paired with itself): index1, index2, similarity.
"""
import random
import numpy as np
import math


def build_points():
    """Return the sample points as a list of ``(index, (x, y))`` tuples.

    Three rectangular grids are generated one after another; indices are
    consecutive integers starting at 0, in generation order.
    """
    # (x values, y values) of each grid, in the order they are emitted.
    regions = [
        (np.arange(0.1, 10., 0.5), np.arange(0., 10., 0.1)),
        (np.arange(-10.0, -0.1, 0.5), np.arange(0., 10., 0.1)),
        (np.arange(-10.0, -0.1, 0.5), np.arange(-10.0, 0., 0.1)),
    ]
    pointlist = []
    for xs, ys in regions:
        for x in xs:
            for y in ys:
                pointlist.append((len(pointlist), (x, y)))
    return pointlist


def get_dist(pnt1, pnt2):
    """Return the Euclidean distance between two ``(index, (x, y))`` points."""
    return math.sqrt((pnt1[1][0] - pnt2[1][0])**2 + (pnt1[1][1] - pnt2[1][1])**2)


def format_similarity(pnt1, pnt2):
    """Return the similarity of two points as the string written to disk.

    The similarity is the inverse distance.  Points closer than 0.00001
    (in practice: a point and itself) get the fixed value ``"10"`` so that
    no division by zero happens.
    """
    dist = get_dist(pnt1, pnt2)
    if dist <= 0.00001:
        return "10"
    return str(1.0 / dist)


def write_points(pointlist, path):
    """Write ``pointlist`` to ``path`` as ``index\\tx\\ty`` lines."""
    with open(path, 'w') as fd:
        for index, (x, y) in pointlist:
            # fd.write() instead of the Python 2-only ``print >> fd`` form,
            # which raises TypeError under Python 3.
            fd.write(str(index) + '\t' + str(x) + '\t' + str(y) + '\n')


def write_similarities(pointlist, path):
    """Write the similarity of every ordered pair of points to ``path``.

    Each line is ``index1\\tindex2\\tsimilarity``.  The number of lines is
    the square of ``len(pointlist)``.
    """
    with open(path, 'w') as simfd:
        for pnt1 in pointlist:
            for pnt2 in pointlist:
                simfd.write(str(pnt1[0]) + "\t" + str(pnt2[0]) + "\t"
                            + format_similarity(pnt1, pnt2) + "\n")


def main():
    """Create ``points.txt`` and ``sim_pnts.txt`` in the current directory."""
    pointlist = build_points()
    write_points(pointlist, "points.txt")
    write_similarities(pointlist, "sim_pnts.txt")


if __name__ == "__main__":
    main()

使用谱聚类：

#!/usr/bin/env python
# Authors:  Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
#           Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
"""Cluster the generated points with scikit-learn's spectral clustering.

Reads ``points.txt`` and ``sim_pnts.txt`` from the current directory,
splits the points into three clusters and writes one ``cluster_<k>`` file
per cluster, each line holding the ``(x, y)`` tuple of one member point.
"""
import sys
import numpy as np
from scipy.sparse import coo_matrix


def load_points(path):
    """Read a points file and return ``{point id: (x, y)}``.

    Each line is expected to be ``id\\tx\\ty``.  Lines with fewer than three
    fields are skipped, matching how the similarity file is read.  If an id
    occurs more than once, the first occurrence wins.
    """
    fid2fname = {}
    with open(path) as fd:
        for line in fd:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            fid2fname.setdefault(int(fields[0]),
                                 (float(fields[1]), float(fields[2])))
    return fid2fname


def load_similarity_graph(path, fid2fname):
    """Build the sparse affinity matrix from a similarity file.

    Each line of ``path`` is expected to be ``id1\\tid2\\tsimilarity``; lines
    with fewer than three fields are skipped.  An extra entry of 1.0 is
    appended on the diagonal for every id in ``fid2fname``.  The result is
    an N x N ``coo_matrix`` with N = ``len(fid2fname)``, so ids are assumed
    to be the integers 0..N-1.
    """
    rowlist = []
    collist = []
    datalist = []
    with open(path) as fd:
        for line in fd:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            f1, f2, sim = fields[:3]
            rowlist.append(int(f1))
            collist.append(int(f2))
            datalist.append(float(sim))
    for fid in fid2fname:
        rowlist.append(int(fid))
        collist.append(int(fid))
        datalist.append(1.0)
    n_points = len(fid2fname)
    row = np.array(rowlist)
    col = np.array(collist)
    data = np.array(datalist)
    return coo_matrix((data, (row, col)), shape=(n_points, n_points))


def group_by_label(labels):
    """Return ``{label: [positions]}`` for a sequence of cluster labels."""
    cluster2fid = {}
    for index, lab in enumerate(labels):
        cluster2fid.setdefault(lab, []).append(index)
    return cluster2fid


def write_clusters(cluster2fid, fid2fname):
    """Write one ``cluster_<k>`` file per cluster into the current directory.

    ``k`` is the position of the cluster in the iteration order of
    ``cluster2fid``, not the label itself.  Every line is the ``str()`` of a
    member's ``(x, y)`` tuple.
    """
    for index, lab in enumerate(cluster2fid):
        # ``with`` makes sure every output file is flushed and closed;
        # fd.write() replaces the Python 2-only ``print >> fd`` form.
        with open("cluster_%s" % index, "w") as fd:
            for fid in cluster2fid[lab]:
                fd.write(str(fid2fname[fid]) + "\n")


def main():
    """Run the clustering pipeline on the files in the current directory."""
    # Imported here so the file helpers above stay importable without
    # scikit-learn installed.
    from sklearn.cluster import spectral_clustering

    fid2fname = load_points("points.txt")
    graph = load_similarity_graph("sim_pnts.txt", fid2fname)

    # Force the solver to be arpack, since amg is numerically
    # unstable on this example
    labels = spectral_clustering(graph, n_clusters=3, eigen_solver='arpack')

    write_clusters(group_by_label(labels), fid2fname)


if __name__ == "__main__":
    main()

将聚类后的样本可视化：

#!/usr/bin/env python
"""Plot the ``cluster_0`` .. ``cluster_2`` files, one marker style each."""

# (file name, matplotlib format string) for every cluster that is drawn.
CLUSTER_STYLES = [
    ("cluster_0", 'or'),
    ("cluster_1", 'xb'),
    ("cluster_2", '+g'),
]


def parse_point(line):
    """Parse one ``(x, y)`` line of a cluster file into a float pair.

    The line is split on the comma; the first character of the left part
    and the last character of the right part (the parentheses) are dropped
    before converting to float.
    """
    fields = line.strip().split(',')
    x = float(fields[0][1:].strip())
    y = float(fields[1][:-1].strip())
    return x, y


def load_cluster(path):
    """Read a cluster file and return ``(xs, ys)`` as two parallel lists."""
    xs = []
    ys = []
    with open(path) as fd:
        for line in fd:
            x, y = parse_point(line)
            xs.append(x)
            ys.append(y)
    return xs, ys


def plot_clusters():
    """Draw every cluster listed in ``CLUSTER_STYLES`` and show the figure."""
    # Imported here so the parsing helpers stay importable without matplotlib.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 6))
    for path, style in CLUSTER_STYLES:
        xs, ys = load_cluster(path)
        plt.plot(xs, ys, style)
    plt.show()


if __name__ == "__main__":
    plot_clusters()

不同颜色代表不同的聚类，可以看到聚类效果还是不错的。

0 0