使用 Python 进行聚类（scikit-learn、scipy）

来源：互联网　发布：智百威软件800　编辑：程序博客网　时间：2024/05/19 08:26

使用 Python 进行聚类（scikit-learn、scipy）

[python] view plain copy
 print?
用于聚类的数据集  

[python] view plain copy
 print?
%matplotlib inline  
import scipy.io as sio  
import matplotlib.pyplot as plt  
  
  
''''' 
各种聚类数据 
'''  
#two_cluster  
def two_cluster():
    """Load the two-cluster sample set.

    Reads ``cluster_data/two_cluster.mat`` (path relative to the current
    working directory) and returns the transpose of the array stored under
    the key ``'X'``, so each row of ``X`` becomes a column of the result.
    """
    mat_contents = sio.loadmat(u'cluster_data/two_cluster.mat')
    return mat_contents['X'].T
#three_cluster  
def three_cluster():
    """Load the three-cluster sample set.

    Reads ``cluster_data/three_cluster.mat`` (path relative to the current
    working directory) and returns the transpose of the array stored under
    the key ``'X'``, so each row of ``X`` becomes a column of the result.
    """
    mat_contents = sio.loadmat(u'cluster_data/three_cluster.mat')
    return mat_contents['X'].T
#five_cluster  
def five_cluster():
    """Load the five-cluster sample set as one array.

    Reads ``cluster_data/five_cluster.mat`` (path relative to the current
    working directory), stacks the array stored under ``'y'`` below the one
    stored under ``'x'`` and returns the transpose of the stacked array.

    Returns:
        A 2-D array whose leading columns come from ``'x'`` and whose
        trailing column(s) come from ``'y'``.
    """
    # Imported locally because this snippet only imports scipy.io and
    # matplotlib at module level; without it ``np`` is an undefined name.
    import numpy as np

    mat_contents = sio.loadmat(u'cluster_data/five_cluster.mat')
    x = mat_contents['x']  # per the original author's note: 2 rows x n columns
    y = mat_contents['y']  # per the original author's note: 1 row x n columns
    # Stack vertically first, then transpose so every sample is one row.
    return np.vstack((x, y)).T
#spiral  
def spiral():
    """Load a thinned copy of the spiral data set with its columns reordered.

    Reads the array stored under ``'spiral'`` in ``cluster_data/spiral.mat``
    (path relative to the current working directory), keeps every third row
    and returns an array whose columns are the original columns 1, 2 and 0,
    in that order.
    """
    samples = sio.loadmat(u'cluster_data/spiral.mat')['spiral']
    samples = samples[0::3, :]  # keep rows 0, 3, 6, ...
    # Integer-list column indexing yields the same values as building
    # np.array([col1, col2, col0]).T, but does not need the ``np`` name,
    # which this snippet never imports.
    return samples[:, [1, 2, 0]]
#spiral_unbalance  
def spiral_unbalance():
    """Load a thinned copy of the unbalanced spiral set, columns reordered.

    Reads the array stored under ``'spiral_unbalance'`` in
    ``cluster_data/spiral_unbalance.mat`` (path relative to the current
    working directory), keeps every third row and returns an array whose
    columns are the original columns 1, 2 and 0, in that order.
    """
    samples = sio.loadmat(u'cluster_data/spiral_unbalance.mat')['spiral_unbalance']
    samples = samples[0::3, :]  # keep rows 0, 3, 6, ...
    # Integer-list column indexing yields the same values as building
    # np.array([col1, col2, col0]).T, but does not need the ``np`` name,
    # which this snippet never imports.
    return samples[:, [1, 2, 0]]
#ThreeCircles  
def ThreeCircles():
    """Load a thinned copy of the three-circles set, columns reordered.

    Reads the array stored under ``'ThreeCircles'`` in
    ``cluster_data/ThreeCircles.mat`` (path relative to the current working
    directory), keeps every third row and returns an array whose columns are
    the original columns 1, 2 and 0, in that order.
    """
    samples = sio.loadmat(u'cluster_data/ThreeCircles.mat')['ThreeCircles']
    samples = samples[0::3, :]  # keep rows 0, 3, 6, ...
    # Integer-list column indexing yields the same values as building
    # np.array([col1, col2, col0]).T, but does not need the ``np`` name,
    # which this snippet never imports.
    return samples[:, [1, 2, 0]]
#Twomoons  
def Twomoons():
    """Load a thinned copy of the two-moons set, reorder columns and plot it.

    Reads the array stored under ``'Twomoons'`` in
    ``cluster_data/Twomoons.mat`` (path relative to the current working
    directory), keeps every third row and reorders the columns to the
    original columns 1, 2 and 0. The result is scatter-plotted on the
    current matplotlib axes (column 0 against column 1, coloured by
    column 2) and then returned.
    """
    samples = sio.loadmat(u'cluster_data/Twomoons.mat')['Twomoons']
    samples = samples[0::3, :]  # keep rows 0, 3, 6, ...
    # Integer-list column indexing yields the same values as building
    # np.array([col1, col2, col0]).T, but does not need the ``np`` name,
    # which this snippet never imports.
    data = samples[:, [1, 2, 0]]
    # NOTE(review): this loader draws as a side effect on every call, unlike
    # the other loaders; presumably leftover debugging - confirm before
    # removing, since callers may rely on the plot.
    plt.scatter(data[:, 0], data[:, 1], c=data[:, 2])
    return data
#Twomoons1  
def Twomoons1():
    """Load a thinned copy of the two-moons set, columns reordered, no plot.

    Reads the array stored under ``'Twomoons'`` in
    ``cluster_data/Twomoons.mat`` (path relative to the current working
    directory), keeps every third row and returns an array whose columns are
    the original columns 1, 2 and 0, in that order.
    """
    samples = sio.loadmat(u'cluster_data/Twomoons.mat')['Twomoons']
    samples = samples[0::3, :]  # keep rows 0, 3, 6, ...
    # Integer-list column indexing yields the same values as building
    # np.array([col1, col2, col0]).T, but does not need the ``np`` name,
    # which this snippet never imports.
    return samples[:, [1, 2, 0]]
def test():
    """Print the literal string ``test`` followed by a newline."""
    # Single-argument parenthesised form: identical output under the
    # Python 2 print statement and the Python 3 print function.
    print('test')
  
  
def show_all():
    """Load all eight sample sets and draw them in a 2x4 grid of scatter plots.

    Every loaded array is appended to the module-level list ``data_list``,
    which must already exist when this function is called. Each subplot
    shows column 0 against column 1 of one array, coloured by column 2.
    """
    plt.figure(figsize=(16, 8))
    # Direct function references instead of eval() on name strings: same
    # loaders, same order, without evaluating strings as code.
    loaders = [two_cluster, three_cluster, five_cluster, spiral,
               spiral_unbalance, ThreeCircles, Twomoons, Twomoons1]
    # Load everything before creating any subplot: Twomoons() draws on the
    # current axes as a side effect, so interleaving loading and plotting
    # would put that extra scatter into one of the grid cells.
    for loader in loaders:
        data_list.append(loader())
    # Plot the first len(loaders) entries of data_list, as the original
    # index-based loop did.
    for position, data in enumerate(data_list[:len(loaders)], 1):
        plt.subplot(2, 4, position)
        plt.scatter(data[:, 0], data[:, 1], c=data[:, 2])
      
# show_all() appends every loaded array to this module-level list, so the
# list has to be created before the call.
data_list = []  
show_all()  

[python] view plain copy
 print?
使用scikit的kmeans进行聚类  

[python] view plain copy
 print?
%matplotlib inline  
import scipy.io as sio  
# Path of the MATLAB file. Named two_cluster_path (not two_cluster) so the
# loader function two_cluster() defined earlier is not overwritten by a string.
two_cluster_path = u'cluster_data/two_cluster.mat'
# loadmat returns a dict; its 'X' entry is read by the following cell.
data = sio.loadmat(two_cluster_path)
# Parenthesised single-argument print: same output on Python 2 and 3.
print(data)

[python] view plain copy
 print?
%matplotlib inline  
import matplotlib.pyplot as plt  
# Array stored under 'X' in the .mat file loaded by the previous cell.
x = data['X']  
# Row 2 is used as the per-point colour value (presumably the reference
# cluster label - confirm against the data set).
cValue = x[2]  
# Rows 0 and 1 are plotted as the two coordinates of every point.
plt.scatter(x[0],x[1],c=cValue)  

[python] view plain copy
 print?
from sklearn import cluster, datasets  
# x is already an ndarray returned by loadmat, so it can be transposed
# directly; this avoids np.array(), whose ``np`` name has not been imported
# at this point of the article. Only the first two columns (the
# coordinates) are used as clustering features.
b = x.T[:, 0:2]

# Fixed random_state keeps the k-means result reproducible.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(b)

cValue = x[2]
# Same points as the previous plot, coloured by the predicted cluster.
plt.scatter(x[0], x[1], c=y_pred)

数据集下载

scikit-learn教程

[python] view plain copy
 print?
%matplotlib inline  
import scipy.io as sio  
# Path of the MATLAB file. Named spiral_path rather than two_cluster: the
# file is the spiral set, and the old name overwrote the two_cluster()
# loader function defined earlier.
spiral_path = u'cluster_data/spiral.mat'
# NOTE: from here on the module-level name ``spiral`` is this array, not the
# spiral() loader function defined earlier; the following cells read it.
spiral = sio.loadmat(spiral_path)['spiral']
spiral = spiral[0::3, :]  # keep every third row
# Format string instead of a two-argument print statement: prints
# "<rows> <columns>" identically on Python 2 and 3.
print('%d %d' % (len(spiral), len(spiral[0])))
cValue = spiral[:, 0]
print(cValue.shape)
# Column 0 is used as an index into this two-colour list (assumes its values
# are 0 or 1 - TODO confirm against the data set).
color = ['b', 'y']
cValue = [color[int(i)] for i in list(cValue)]
plt.scatter(spiral[:, 1], spiral[:, 2], c=cValue)

使用kmeans结果

[python] view plain copy
 print?
from sklearn import cluster, datasets  
  
# Run k-means with two clusters on columns 1 and 2 of the module-level
# ``spiral`` array; the fixed random_state keeps the result reproducible.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(spiral[:,1:3])  
  
# Plot the same two columns, coloured by the predicted cluster.
plt.scatter(spiral[:,1],spiral[:,2],c=y_pred)  

使用scipy进行聚类效果

[python] view plain copy
 print?
# -*- coding: utf8 -*-  
%matplotlib inline  
import scipy.io as sio  
import matplotlib.pyplot as plt  
import scipy.cluster.hierarchy as hcluster  
from sklearn.cluster import AgglomerativeClustering  
import numpy.random as random    
import numpy as np    
import numpy.core.fromnumeric    
  
  
def loadData():
    """Load the thinned spiral data set, print its size, plot and return it.

    Reads the array stored under ``'spiral'`` in ``cluster_data/spiral.mat``
    (path relative to the current working directory), keeps every third row,
    prints the row and column counts and the shape of column 0, and
    scatter-plots column 1 against column 2 coloured by column 0.

    Returns:
        The thinned array. The original version discarded it and returned
        ``None``, so the loaded data could not be used by any caller.
    """
    spiral_path = u'cluster_data/spiral.mat'
    spiral = sio.loadmat(spiral_path)['spiral']
    spiral = spiral[0::3, :]  # keep every third row
    # Format string instead of a two-argument print statement: prints
    # "<rows> <columns>" identically on Python 2 and 3.
    print('%d %d' % (len(spiral), len(spiral[0])))
    labels = spiral[:, 0]
    print(labels.shape)
    # Column 0 is used as an index into this two-colour list (assumes its
    # values are 0 or 1 - TODO confirm against the data set).
    color = ['b', 'y']
    point_colors = [color[int(i)] for i in labels]
    plt.scatter(spiral[:, 1], spiral[:, 2], c=point_colors)
    return spiral
  
  
def spiralSample():
    """Show the spiral data next to two hierarchical clusterings of it.

    Reads the module-level array ``spiral`` (it must be defined before the
    call) and draws three side-by-side scatter plots of column 1 against
    column 2: coloured by column 0, by scipy's flat hierarchical clusters,
    and by scikit-learn's agglomerative (Ward) clusters. Ends by calling
    ``plt.show()``.
    """
    xs = spiral[:, 1]
    ys = spiral[:, 2]
    features = spiral[:, 1:3]

    # Left panel: the data coloured by its own column 0.
    plt.subplot(131)
    plt.title(u'origal data')
    plt.scatter(xs, ys, c=spiral[:, 0])

    # Middle panel: scipy hierarchical clustering. With criterion='maxclust'
    # the argument t is the maximum number of flat clusters to form (2 here),
    # not a distance threshold.
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster
    scipy_labels = hcluster.fclusterdata(features, t=2, criterion='maxclust')
    plt.subplot(132)
    plt.title(u'use scipy to hierarchy cluster')
    plt.scatter(xs, ys, c=scipy_labels)

    # Right panel: scikit-learn agglomerative clustering with Ward linkage.
    plt.subplot(133)
    plt.title(u'use scikit to hierarchy cluster')
    sklearn_labels = AgglomerativeClustering(n_clusters=2, linkage='ward').fit_predict(features)
    plt.scatter(xs, ys, c=sklearn_labels)
    plt.show()
# spiralSample() reads the module-level ``spiral`` array, so a cell that
# defines it must have been run before this call.
spiralSample()  

转自：http://blog.csdn.net/yan456jie/article/details/52214815

0 0