机器学习实战 Kmeans
来源:互联网 发布:教程网网站源码php 编辑:程序博客网 时间:2024/05/17 09:16
"""k-means and bisecting k-means clustering.

Reconstructed from "Machine Learning in Action", Chapter 10.  Provides:
  - loadDataSet : parse a tab-delimited numeric text file
  - distEclud   : Euclidean distance between two row vectors
  - randCent    : random centroid initialisation within the data's bounds
  - kMeans      : plain Lloyd's k-means
  - biKmeans    : bisecting k-means (repeatedly splits the cluster whose
                  split yields the lowest total SSE)
  - distSLC     : great-circle distance for (longitude, latitude) points
  - clusterClubs: demo that clusters geo points and plots them on a map
"""
# NOTE: the original also did `from blaze import inf`; that third-party
# dependency is unnecessary -- numpy's star import already provides `inf`.
from numpy import *


def loadDataSet(filename):
    """Parse *filename* (tab-separated numbers, one record per line).

    Returns a list of lists of floats, one inner list per line.
    """
    dataMat = []
    # `with` guarantees the file is closed even if float() raises.
    with open(filename) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            fltLine = [float(s) for s in curLine]
            dataMat.append(fltLine)
    return dataMat


def distEclud(vecA, vecB):
    """Return the Euclidean distance between two row vectors (matrices)."""
    return sqrt(sum(power(vecA - vecB, 2)))


def randCent(dataSet, k):
    """Create *k* random centroids, each coordinate drawn uniformly
    within [min, max] of the corresponding data column.

    dataSet: (m, n) matrix of samples.  Returns a (k, n) matrix.
    """
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        # k random values spread across this column's observed range
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Lloyd's k-means.

    dataSet   : (m, n) matrix of samples.
    k         : number of clusters.
    distMeas  : distance function taking two row vectors.
    createCent: centroid-initialisation function (dataSet, k) -> (k, n).

    Returns (centroids, clusterAssment) where clusterAssment is an (m, 2)
    matrix of [assigned cluster index, squared distance to its centroid].
    """
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach each point to its nearest centroid.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # Guard: mean() of an empty slice would yield NaN centroids;
            # an empty cluster simply keeps its previous position.
            if len(ptsInClust) > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment


def biKmeans(dataSet, k, distMeas=distEclud):
    """Bisecting k-means: start with one cluster, repeatedly 2-split the
    cluster whose split minimises the total SSE until *k* clusters exist.

    Returns (centroids, clusterAssment) with the same layout as kMeans.
    """
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))
    # Initial single centroid: the mean of the whole data set
    # (.tolist()[0] converts the 1-row matrix into a plain list).
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    # Seed the error column with each point's squared distance to centroid0.
    for j in range(m):
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :]) ** 2
    while (len(centList) < k):
        lowestSSE = inf  # was `Inf`; that alias was removed in NumPy 2.0
        for i in range(len(centList)):
            # Points currently belonging to cluster i (initially: all points).
            ptsIncurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClustAss = kMeans(ptsIncurrCluster, 2, distMeas)
            sseSplit = sum(splitClustAss[:, 1])  # SSE of the trial split
            # SSE of all the points NOT involved in this split.
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit,and notSplit:", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Renumber the two halves: half 1 becomes a brand-new cluster id,
        # half 0 keeps the id of the cluster that was split.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is:', bestCentToSplit)
        print('the len of bestClustAss is:', len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]  # replace split centroid
        centList.append(bestNewCents[1, :].tolist()[0])             # append the new one
        # Fold the split's assignments/errors back into the global table.
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return mat(centList), clusterAssment


def distSLC(vecA, vecB):
    """Great-circle distance (km) between two (longitude, latitude) row
    vectors, via the spherical law of cosines; 6371.0 is Earth's radius."""
    a = sin(vecA[0, 1] * pi / 180) * sin(vecB[0, 1] * pi / 180)
    b = cos(vecA[0, 1] * pi / 180) * cos(vecB[0, 1] * pi / 180) * \
        cos(pi * (vecB[0, 0] - vecA[0, 0]) / 180)
    return arccos(a + b) * 6371.0


def clusterClubs(numClust=5,
                 places_path='C:/Users/xuwei/Desktop/机器学习/机器学习实战(pdf版+源码)/machinelearninginaction/Ch10/places.txt',
                 img_path='C:/Users/xuwei/Desktop/机器学习/机器学习实战(pdf版+源码)/machinelearninginaction/Ch10/Portland.png'):
    """Cluster geo points from *places_path* with bisecting k-means and
    plot them (colour-coded by cluster) over the map image *img_path*.

    numClust: number of clusters to produce.
    The data/image paths were hard-coded in the original; they are now
    parameters whose defaults preserve the original behaviour.
    """
    # Imported lazily so that merely importing this module does not
    # require matplotlib to be installed.
    import matplotlib
    import matplotlib.pyplot as plt

    datList = []
    with open(places_path) as fh:
        for line in fh.readlines():
            lineArr = line.split('\t')
            # Columns 4 and 3 hold longitude and latitude respectively.
            datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = mat(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p',
                      'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread(img_path)
    ax0.imshow(imgP)
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:, 0].A == i)[0], :]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0],
                    ptsInCurrCluster[:, 1].flatten().A[0],
                    marker=markerStyle, s=90)
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0],
                marker='+', s=300)
    plt.show()


if __name__ == "__main__":
    # Demo code from the original script.  It referenced absolute local
    # paths and ran at import time; it is now guarded so importing this
    # module has no side effects.
    # datMat = mat(loadDataSet('.../Ch10/testSet.txt'))
    # myCentroids, clustAssing = kMeans(datMat, 4)
    # print(myCentroids); print(clustAssing)
    # datMat3 = mat(loadDataSet('.../Ch10/testSet2.txt'))
    # centList, myNewAssments = biKmeans(datMat3, 3)
    # print(centList)
    clusterClubs()
阅读全文
0 0
- 机器学习实战--kMeans
- 机器学习实战 Kmeans
- 机器学习实战之KMeans
- 机器学习实战笔记8(kmeans)
- Scikit-learn机器学习实战之Kmeans
- 机器学习实战--kmeans实例讲解
- 机器学习实战学习笔记7——Kmeans
- 机器学习实战之KMeans算法pandas实现
- 机器学习之Kmeans
- 机器学习Kmeans实现
- 机器学习算法-kMeans
- 【机器学习】--Kmeans聚类
- 机器学习:KMeans学习笔记
- python机器学习-聚类KMeans
- 机器学习(9)-kmeans
- 机器学习Matlab实战之图像压缩————Kmeans算法
- 《机器学习实战》kMeans算法(K均值聚类算法)
- 《机器学习实战》二分-kMeans算法(二分K均值聚类)
- mac/linux conda:command no found
- 1032. 挖掘机技术哪家强(20) PAT乙级真题
- BZOJ1090(SCOI2003)[字符串折叠]--区间DP
- CodeForces
- 1052. 卖个萌 (20)
- 机器学习实战 Kmeans
- 关于操作系统进程
- 深入理解 Neutron -- OpenStack 网络实现:VXLAN 模式
- ubuntu开机自动挂载硬盘分区
- 守护进程中/dev/null
- [转载]字符编码
- Legal or Not
- 1053. 住房空置率 (20)
- 微信小程序-项目案例 (二)配置 tabBar