机器学习实战 Kmeans

来源:互联网 发布:教程网网站源码php 编辑:程序博客网 时间:2024/05/17 09:16

    
# K-means and bisecting K-means clustering on NumPy matrices, plus a map
# plotting demo. The whole NumPy namespace is star-imported, so names such as
# ``sum``, ``inf``, ``zeros`` and ``random`` below are NumPy's.
from numpy import *

# Directory holding the sample data files; used only for default arguments
# and for the demo at the bottom of the file.
_CH10_DIR = 'C:/Users/xuwei/Desktop/机器学习/机器学习实战(pdf版+源码)/machinelearninginaction/Ch10/'


def loadDataSet(filename):
    """Read a tab-separated file of numbers into a list of float rows.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one ``list[float]`` per non-blank line.

    Raises:
        ValueError: If a field cannot be converted with ``float``.
    """
    dataMat = []
    # ``with`` guarantees the handle is closed even if a conversion fails.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would make float('') raise.
                continue
            dataMat.append([float(s) for s in line.split('\t')])
    return dataMat


def distEclud(vecA, vecB):
    """Return the Euclidean distance between two equally shaped vectors."""
    return sqrt(sum(power(vecA - vecB, 2)))


def randCent(dataSet, k):
    """Create ``k`` random centroids inside the bounding box of ``dataSet``.

    Args:
        dataSet: 2-D NumPy matrix, one row per point.
        k: Number of centroids to generate.

    Returns:
        A ``k`` x ``n`` matrix (``n`` = number of columns of ``dataSet``)
        whose column ``j`` is drawn uniformly between the minimum and maximum
        of column ``j`` of ``dataSet``.
    """
    n = shape(dataSet)[1]
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        # Use the array methods so the result is a plain scalar regardless of
        # whether the builtin or the NumPy ``min``/``max`` is in scope.
        minJ = dataSet[:, j].min()
        rangeJ = float(dataSet[:, j].max() - minJ)
        centroids[:, j] = asmatrix(minJ + rangeJ * random.rand(k, 1))  # k random values
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with K-means.

    Args:
        dataSet: 2-D NumPy matrix, one row per point.
        k: Number of clusters.
        distMeas: Callable ``(centroid_row, point_row) -> distance``.
        createCent: Callable ``(dataSet, k) -> k x n matrix`` giving the
            initial centroids.

    Returns:
        ``(centroids, clusterAssment)``: the ``k`` x ``n`` centroid matrix and
        an ``m`` x 2 matrix holding, for every point, the index of its
        cluster and the squared distance to that cluster's centroid.

    The current centroids are printed once per iteration.
    """
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    # Start with "unassigned" (-1) rather than 0; otherwise a first pass that
    # puts every point in cluster 0 is mistaken for convergence.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            # Find the centroid closest to point i.
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # The mean of zero rows is NaN; leave an empty cluster's centroid
            # where it is instead of poisoning later distance computations.
            if shape(ptsInClust)[0] > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment


# Bisecting K-means algorithm
def biKmeans(dataSet, k, distMeas=distEclud):
    """Cluster the rows of ``dataSet`` with bisecting K-means.

    Starts from a single cluster containing every point and repeatedly
    2-means-splits the cluster whose split gives the lowest total squared
    error, until ``k`` clusters exist.

    Args:
        dataSet: 2-D NumPy matrix, one row per point.
        k: Desired number of clusters.
        distMeas: Callable ``(centroid_row, point_row) -> distance``.

    Returns:
        ``(centroids, clusterAssment)`` in the same layout as ``kMeans``.
        Fewer than ``k`` centroids are returned only when no cluster can be
        split (for instance when ``dataSet`` has no rows).

    Progress information is printed for every candidate split.
    """
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    # Convert the 1 x n mean matrix to a plain list: the centroid of all data.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):  # squared error of every point w.r.t. the initial centroid
        clusterAssment[j, 1] = distMeas(asmatrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = inf
        # Reset per round so a stale choice from a previous round is never reused.
        bestCentToSplit = None
        for i in range(len(centList)):
            # Points currently in cluster i (initially all points, since every
            # assignment starts at 0).
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
            if shape(ptsInCurrCluster)[0] == 0:
                # An empty cluster cannot be split (randCent would fail on it).
                continue
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = sum(splitClustAss[:, 1])  # error of the points just split
            # Error of all points that are not in cluster i.
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit,and notSplit:", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # Nothing could be split this round; stop instead of looping forever.
            break
        # Renumber the two halves: one keeps the index of the cluster that was
        # split, the other gets the next free index, len(centList). The "1"
        # half must be relabelled first so the two updates cannot collide.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is:', bestCentToSplit)
        print('the len of bestClustAss is:', len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]  # update the split cluster's centroid
        centList.append(bestNewCents[1, :].tolist()[0])  # register the new cluster
        # Write the new assignments and errors back into the global table.
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return asmatrix(centList), clusterAssment


def distSLC(vecA, vecB):
    """Return the spherical-law-of-cosines distance between two points.

    Args:
        vecA: 1 x 2 matrix; element ``[0, 1]`` is used as the latitude and
            element ``[0, 0]`` as the longitude, both in degrees.
        vecB: Second point, same layout as ``vecA``.

    Returns:
        The central angle between the points multiplied by 6371.0
        (presumably the mean Earth radius in kilometres).
    """
    a = sin(vecA[0, 1] * pi / 180) * sin(vecB[0, 1] * pi / 180)
    b = (cos(vecA[0, 1] * pi / 180) * cos(vecB[0, 1] * pi / 180)
         * cos(pi * (vecB[0, 0] - vecA[0, 0]) / 180))
    # Rounding can push a + b marginally outside [-1, 1] (e.g. for identical
    # points), which would make arccos return NaN.
    return arccos(clip(a + b, -1.0, 1.0)) * 6371.0


def clusterClubs(numClust=5,
                 placesFile=_CH10_DIR + 'places.txt',
                 mapImage=_CH10_DIR + 'Portland.png'):
    """Cluster the places listed in ``placesFile`` and plot them over a map.

    Args:
        numClust: Number of clusters passed to ``biKmeans``.
        placesFile: Tab-separated text file; columns 4 and 3 (0-based) of each
            line are read as the point's first and second coordinate
            (presumably longitude and latitude, matching ``distSLC``).
        mapImage: Image file drawn as the plot background.

    Opens a matplotlib window and blocks until it is closed.
    """
    # Imported here so the clustering functions above stay usable when
    # matplotlib is not installed.
    import matplotlib.pyplot as plt

    datList = []
    with open(placesFile) as placesFh:
        for line in placesFh:
            if not line.strip():
                continue  # tolerate blank lines
            lineArr = line.split('\t')
            datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = asmatrix(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p',
                      'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    # Bottom axes: the background image, without ticks.
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread(mapImage)
    ax0.imshow(imgP)
    # Top axes: transparent frame carrying the scatter plots.
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:, 0].A == i)[0], :]
        # Cycle through the marker list when there are more clusters than markers.
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0],
                    ptsInCurrCluster[:, 1].flatten().A[0],
                    marker=markerStyle, s=90)
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0],
                marker='+', s=300)
    plt.show()


if __name__ == "__main__":
    # Demo code runs only when the file is executed as a script, so importing
    # the module no longer needs the hard-coded data files.
    datMat = asmatrix(loadDataSet(_CH10_DIR + 'testSet.txt'))

    # Plain K-means demo (disabled in the original script):
    # myCentroids, clustAssing = kMeans(datMat, 4)
    # print(myCentroids)
    # print(clustAssing)

    # Bisecting K-means test (disabled in the original script):
    # datMat3 = asmatrix(loadDataSet(_CH10_DIR + 'testSet2.txt'))
    # centList, myNewAssments = biKmeans(datMat3, 3)
    # print(centList)

    clusterClubs()

原创粉丝点击