k均值聚类,密度聚类,层次聚类
来源:互联网 发布:广州数据库开发工程师 编辑:程序博客网 时间:2024/04/28 06:54
聚类是机器学习中无监督学习方法的重要一种。近来看了周志华老师的《机器学习》,专门研究了有关聚类的一章,收获很多,对于其中的算法也动手实现了一下。主要实现的包括比较常见的k均值聚类、密度聚类和层次聚类,这三种聚类方法在原理上都不难,算法过程也很清晰明白。有关原理可以参阅周志华老师的《机器学习》第九章,这里只做一下代码的实现。
运行环境是Python2.7+numpy,说实话,numpy坑还是挺多的,其实用Matlab可能会更简单。
k均值聚类,核心是不断更新簇样本的质心。
[python] view plain copy
- #encoding=utf-8
- __author__ = 'freedom'
- from numpy import*
- import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Load a tab-separated numeric data file into a matrix.

    Every non-blank line is split on tab characters and each field is
    converted with ``float``.

    :param fileName: path of the data file
    :return: the data set as a numpy matrix, one row per non-blank line
    :raises ValueError: if a field cannot be converted to float
    """
    dataSet = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines such as a trailing newline
            # Build a real list: on Python 3 ``map`` is lazy and the matrix
            # would end up holding map objects instead of numbers.
            dataSet.append([float(field) for field in stripped.split('\t')])
    # ``asmatrix`` is the spelling that exists in every numpy release;
    # the ``mat`` alias was removed in NumPy 2.0.
    return asmatrix(dataSet)
def getDistance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    :param vecA: vector A
    :param vecB: vector B
    :return: square root of the summed squared element-wise differences
    """
    squaredDiff = power(vecA - vecB, 2)
    # ``sum`` is numpy's here (brought in by the star import), so it adds
    # up every element of the difference regardless of its layout.
    total = sum(squaredDiff)
    return sqrt(total)
def randCent(dataSet, k):
    """Generate k random centroids inside the bounding box of the data.

    For every feature (column) the centroid coordinate is drawn uniformly
    between that column's minimum and maximum.

    :param dataSet: data set in matrix form, one sample per row
    :param k: number of centroids to generate
    :return: k x n matrix of random centroids, n being the feature count
    """
    n = shape(dataSet)[1]  # number of features
    centRoids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # Use the array methods rather than the builtin min/max: the
        # builtins iterate the column as 1x1 matrices and the later
        # float() conversion of such a matrix is deprecated in numpy.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # random.rand(k, 1) yields a k x 1 block of values in [0, 1)
        centRoids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centRoids
def kMeans(dataSet, k, disMens=getDistance, createCent=randCent):
    """Cluster the rows of dataSet into k groups with k-means.

    Samples are repeatedly assigned to their nearest centroid and every
    centroid is then moved to the mean of its samples, until no sample
    changes cluster.

    :param dataSet: data set in matrix form, one sample per row
    :param k: number of clusters
    :param disMens: distance function taking two row vectors
    :param createCent: function (dataSet, k) returning the initial centroids
    :return: (centRoids, clusterAssment); clusterAssment has one row per
             sample holding the cluster index and the squared distance to
             that cluster's centroid
    """
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    # Start from an impossible cluster index so that the first pass always
    # counts as a change; with a zero start, a first pass that puts every
    # sample in cluster 0 would end the loop before centroids settle.
    clusterAssment[:, 0] = -1
    centRoids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # assignment step: nearest centroid per sample
            minDist = inf
            minIndex = -1
            for j in range(k):
                disJI = disMens(centRoids[j, :], dataSet[i, :])
                if disJI < minDist:
                    minDist = disJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # Always record the current squared distance, even when the
            # assignment itself did not change.
            clusterAssment[i, :] = minIndex, minDist ** 2
        for cent in range(k):  # update step: move centroids to the mean
            # ``.A == cent`` gives a boolean array; nonzero(...)[0] are the
            # row numbers of the samples currently in this cluster.
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster keeps its previous centroid; taking the
            # mean of zero rows would turn it into NaN.
            if shape(ptsInClust)[0] > 0:
                centRoids[cent, :] = mean(ptsInClust, axis=0)
    return centRoids, clusterAssment
def plotKmens(dataSet, k, clusterMeans):
    """Run a clustering routine and show its result as a 2-D scatter plot.

    Samples are drawn in blue using their first two columns; the returned
    centroids are drawn as red '+' markers.

    :param dataSet: data set in matrix form
    :param k: number of clusters
    :param clusterMeans: clustering function called as clusterMeans(dataSet, k)
                         and returning (centroids, assignments)
    :return: None
    """
    # The original bound the result to ``centPoids`` but then plotted
    # ``centRoids``, which raised NameError.
    centRoids, _ = clusterMeans(dataSet, k)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Flatten the matrix columns to plain 1-D arrays before plotting.
    ax.scatter(asarray(dataSet[:, 0]).ravel(),
               asarray(dataSet[:, 1]).ravel(), c='blue')
    ax.scatter(asarray(centRoids[:, 0]).ravel(),
               asarray(centRoids[:, 1]).ravel(),
               c='red', marker='+', s=70)
    plt.show()
def binKMeans(dataSet, k, distMeas=getDistance):
    """Cluster dataSet with bisecting k-means.

    Starts from one cluster containing every sample and repeatedly splits,
    with a 2-means run, the cluster whose split gives the lowest total SSE,
    until k clusters exist or no cluster can be split any further.

    :param dataSet: data set in matrix form, one sample per row
    :param k: requested number of clusters
    :param distMeas: distance function taking two row vectors
    :return: (centroids, clusterAssment); clusterAssment has one row per
             sample holding the cluster index and the squared distance
    """
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    # tolist() on the 1 x n mean matrix gives a nested list; [0] is the row.
    centRoids0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centRoids0]
    for j in range(m):  # initial error: squared distance to the one centroid
        clusterAssment[j, 1] = distMeas(asmatrix(centRoids0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = inf
        bestCent2Split = None
        for i in range(len(centList)):
            rowsInCluster = nonzero(clusterAssment[:, 0].A == i)[0]
            # A cluster with fewer than two samples cannot be bisected,
            # and running kMeans on an empty slice would raise.
            if len(rowsInCluster) < 2:
                continue
            ptsCurrCluster = dataSet[rowsInCluster, :]
            centroidsMat, splitClusterAss = kMeans(ptsCurrCluster, 2, distMeas)
            sseSplit = sum(splitClusterAss[:, 1])  # SSE of the split cluster
            # SSE of all samples outside cluster i
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCent2Split = i
                bestNewCents = centroidsMat
                bestClustAss = splitClusterAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCent2Split is None:
            break  # nothing left to split (e.g. k exceeds the sample count)
        # Renumber the two halves: sub-cluster 1 becomes a brand-new cluster
        # index, sub-cluster 0 keeps the index of the cluster that was split.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCent2Split
        centList[bestCent2Split] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCent2Split)[0], :] = bestClustAss
    return asmatrix(centList), clusterAssment
def biKmeans(dataSet, k, distMeas=getDistance):
    """Cluster dataSet with bisecting k-means, printing progress.

    Starts from a single cluster and repeatedly bisects, with a 2-means
    run, the cluster whose split yields the lowest total SSE, until k
    clusters exist or no cluster can be split any further.

    :param dataSet: data set in matrix form, one sample per row
    :param k: requested number of clusters
    :param distMeas: distance function taking two row vectors
    :return: (centroids, clusterAssment); clusterAssment has one row per
             sample holding the cluster index and the squared distance
    """
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]  # start with a single centroid
    for j in range(m):  # initial error of every sample
        clusterAssment[j, 1] = distMeas(asmatrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = inf
        bestCentToSplit = None
        for i in range(len(centList)):
            rowsInCluster = nonzero(clusterAssment[:, 0].A == i)[0]
            # Fewer than two samples cannot be bisected; an empty slice
            # would make kMeans raise.
            if len(rowsInCluster) < 2:
                continue
            ptsInCurrCluster = dataSet[rowsInCluster, :]
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = sum(splitClustAss[:, 1])
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            # %-formatting keeps the same text under Python 2 and Python 3.
            print("sseSplit, and notSplit:  %s %s" % (sseSplit, sseNotSplit))
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            break  # nothing left to split (e.g. k exceeds the sample count)
        # Sub-cluster 1 gets a new index, sub-cluster 0 keeps the old one.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print("the bestCentToSplit is:  %s" % (bestCentToSplit,))
        print("the len of bestClustAss is:  %s" % (len(bestClustAss),))
        # Replace the split centroid by the two new ones.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Write the new cluster indices and errors back.
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return asmatrix(centList), clusterAssment
[python] view plain copy
- #encoding=utf-8
- import numpy as np
- import kmeans as km
- import matplotlib.pyplot as plt
def createDisMat(dataMat):
    """Build the symmetric matrix of pairwise distances between samples.

    Distances are computed with ``km.getDistance``.

    :param dataMat: data set in matrix form, one sample per row
    :return: m x m matrix whose entry [i, j] is the distance between
             samples i and j; the diagonal is zero
    """
    m = dataMat.shape[0]
    distMat = np.asmatrix(np.zeros((m, m)))  # diagonal stays at zero
    for i in range(m):
        # Only visit j > i: each pair is computed once and mirrored,
        # instead of being evaluated twice as in a full m x m sweep.
        for j in range(i + 1, m):
            dist = km.getDistance(dataMat[i, :], dataMat[j, :])
            distMat[i, j] = dist
            distMat[j, i] = dist
    return distMat
def findCore(dataMat, delta, minPts):
    """Return the row indices of the core samples.

    A sample is a core sample when at least minPts entries of its row in
    the pairwise distance matrix are strictly smaller than delta (its own
    zero distance on the diagonal is part of that row).

    :param dataMat: data set in matrix form, one sample per row
    :param delta: neighbourhood radius
    :param minPts: minimum neighbour count for a core sample
    :return: list of core sample indices in increasing order
    """
    pairwise = createDisMat(dataMat)
    return [
        row
        for row in range(dataMat.shape[0])
        # the boolean row marks neighbours; summing it counts them
        if np.sum(pairwise[row, :] < delta, 1) >= minPts
    ]
def DBSCAN(dataMat, delta, minPts):
    """Density-based clustering of the rows of dataMat.

    A random unclustered core sample seeds each cluster, which is then
    grown breadth-first through the neighbourhoods of the core samples it
    reaches. Samples never reached from any core sample appear in no
    cluster.

    :param dataMat: data set in matrix form, one sample per row
    :param delta: neighbourhood radius (distances strictly below it count)
    :param minPts: minimum neighbour count for a core sample
    :return: list of clusters, each a list of sample row indices
    """
    from collections import deque

    m = dataMat.shape[0]
    distMat = createDisMat(dataMat)
    # Derive neighbourhoods and core samples from this one distance matrix;
    # going through findCore would build the whole matrix a second time.
    neighbours = np.asarray(distMat < delta)
    counts = neighbours.sum(axis=1)
    core = [i for i in range(m) if counts[i] >= minPts]
    unVisit = [1] * m  # 1 = not visited yet, 0 = visited
    ck = []
    while core:
        unVisitOld = unVisit[:]  # snapshot to find this round's new samples
        seed = np.random.choice(core)
        queue = deque([seed])  # deque gives O(1) pops from the front
        unVisit[seed] = 0
        while queue:
            current = queue.popleft()
            if counts[current] >= minPts:  # only core samples expand
                for j in range(m):
                    if unVisit[j] == 1 and neighbours[current, j]:
                        queue.append(j)
                        unVisit[j] = 0
        cluster = []
        for index in range(m):
            # Everything visited during this round belongs to the cluster.
            if unVisitOld[index] == 1 and unVisit[index] == 0:
                cluster.append(index)
                if index in core:  # clustered core samples cannot seed again
                    core.remove(index)
        ck.append(cluster)
    return ck
def plotAns(dataSet, ck):
    """Show the clusters as a 2-D scatter plot, one colour per cluster.

    Works for any number of clusters; the first four keep the colours
    blue, red, green and yellow, and the palette repeats once exhausted.

    :param dataSet: data set in matrix form; the first two columns are drawn
    :param ck: list of clusters, each a list of sample row indices
    :return: None
    """
    palette = ['blue', 'red', 'green', 'yellow',
               'cyan', 'magenta', 'black', 'orange']
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Loop instead of indexing ck[0]..ck[3]: fewer than four clusters no
    # longer raise IndexError and further clusters are no longer dropped.
    for number, members in enumerate(ck):
        if not members:
            continue
        ax.scatter(np.asarray(dataSet[members, 0]).ravel(),
                   np.asarray(dataSet[members, 1]).ravel(),
                   c=palette[number % len(palette)])
    plt.show()
if __name__ == '__main__':
    # Demo: cluster testSet.txt with radius 2 and minPts 15, print the
    # clusters and their count, then plot them.
    dataMat = km.loadDataSet("testSet.txt")
    ck = DBSCAN(dataMat, 2, 15)
    # Single-argument print calls behave the same on Python 2 and 3.
    print(ck)
    print(len(ck))
    plotAns(dataMat, ck)
层次聚类,核心是定义了簇之间的距离衡量,不断寻找距离最近的簇归为一簇。
[python] view plain copy
- #encoding=utf-8
- import numpy as np
- import DBSCAN as db
- import kmeans as km
def calcDistByMin(dataMat, ck1, ck2):
    """Cluster distance defined as the smallest pairwise sample distance.

    :param dataMat: data set in matrix form, one sample per row
    :param ck1: row indices of the first cluster
    :param ck2: row indices of the second cluster
    :return: minimum of km.getDistance over all cross-cluster pairs;
             np.inf when there is no pair
    """
    closest = np.inf
    pairs = ((a, b) for a in ck1 for b in ck2)
    for a, b in pairs:
        gap = km.getDistance(dataMat[a, :], dataMat[b, :])
        if gap <= closest:
            closest = gap
    return closest
def calcDistByMax(dataMat, ck1, ck2):
    """Cluster distance defined as the largest pairwise sample distance.

    :param dataMat: data set in matrix form, one sample per row
    :param ck1: row indices of the first cluster
    :param ck2: row indices of the second cluster
    :return: maximum of km.getDistance over all cross-cluster pairs;
             0 when there is no pair
    """
    farthest = 0
    pairs = ((a, b) for a in ck1 for b in ck2)
    for a, b in pairs:
        gap = km.getDistance(dataMat[a, :], dataMat[b, :])
        if gap >= farthest:
            farthest = gap
    return farthest
def createDistMat(dataMat, calcDistType=calcDistByMin):
    """Build the initial cluster distance matrix, one sample per cluster.

    :param dataMat: data set in matrix form, one sample per row
    :param calcDistType: cluster distance function called as
                         calcDistType(dataMat, indicesA, indicesB)
    :return: symmetric m x m matrix of distances between the singletons
    """
    m = dataMat.shape[0]
    distMat = np.asmatrix(np.zeros((m, m)))
    for i in range(m):
        # Visit j <= i only and mirror the value: every pair is evaluated
        # once instead of twice. The argument order (larger index first)
        # matches the value a full m x m sweep left in both cells.
        for j in range(i + 1):
            # the distance function expects lists of row indices
            dist = calcDistType(dataMat, [i], [j])
            distMat[i, j] = dist
            distMat[j, i] = dist
    return distMat
def findMaxLoc(distMat, q):
    """Locate the smallest off-diagonal entry in the leading q x q block.

    Despite its name this searches for the minimum. Ties resolve to the
    first entry met in row-major order; (0, 0) is returned when no
    off-diagonal entry is below infinity.

    :param distMat: distance matrix indexable as distMat[i, j]
    :param q: number of leading rows and columns to search
    :return: (row, column) of the smallest off-diagonal entry
    """
    best = np.inf
    bestRow = bestCol = 0
    for row in range(q):
        for col in range(q):
            candidate = distMat[row, col]
            if row != col and candidate < best:
                best, bestRow, bestCol = candidate, row, col
    return bestRow, bestCol
def ANGES(dataMat, k, calcDistType=calcDistByMax):
    """Agglomerative hierarchical clustering down to k clusters.

    Every sample starts as its own cluster; the two closest clusters are
    merged repeatedly until only k clusters remain.

    :param dataMat: data set in matrix form, one sample per row
    :param k: number of clusters to stop at
    :param calcDistType: cluster distance function called as
                         calcDistType(dataMat, indicesA, indicesB)
    :return: list of clusters, each a list of sample row indices
    """
    m = dataMat.shape[0]
    ck = [[i] for i in range(m)]
    distMat = createDistMat(dataMat, calcDistType)
    q = m  # current number of clusters
    # ``q > 1`` keeps k < 1 from merging a cluster with itself.
    while q > k and q > 1:
        i, j = findMaxLoc(distMat, q)
        if i == j:
            break  # no finite distance between distinct clusters is left
        if i > j:
            i, j = j, i  # keep i < j so the higher-numbered cluster is removed
        ck[i].extend(ck[j])  # merge cluster j into cluster i
        del ck[j]
        # np.delete returns a new matrix, so the result must be rebound.
        distMat = np.delete(distMat, j, 0)
        distMat = np.delete(distMat, j, 1)
        q -= 1
        for index in range(q):  # refresh distances from the merged cluster
            if index == i:
                continue
            dist = calcDistType(dataMat, ck[i], ck[index])
            # Write the new value to both cells. The original assigned
            # distMat[index, i] back into distMat[i, index], which threw
            # the freshly computed distance away.
            distMat[i, index] = dist
            distMat[index, i] = dist
    return ck
if __name__ == '__main__':
    # Demo: merge testSet.txt down to four clusters, print and plot them.
    dataMat = km.loadDataSet("testSet.txt")
    ck = ANGES(dataMat, 4)
    # A single-argument print call behaves the same on Python 2 and 3.
    print(ck)
    db.plotAns(dataMat, ck)
0 0
- k均值聚类,密度聚类,层次聚类
- K-均值聚类
- K均值聚类
- k均值聚类
- k均值聚类
- K均值聚类
- K均值聚类
- k均值聚类
- K-均值聚类
- K-均值聚类
- K均值聚类
- K均值聚类
- k均值聚类
- k-均值聚类
- k均值聚类
- k均值聚类和二分k均值聚类
- [C#]K均值聚类
- K-均值聚类算法
- 解释器模式——我最懂你
- JavaScript对象介绍
- WebView笔记
- 2017 京东校招编程题 进制转换
- Ubuntu 14.04利用byzanz制作gif格式动画
- k均值聚类,密度聚类,层次聚类
- Java的抽象类和接口
- java关键字:fianl的一些简单的用法
- jquery判断某个元素是否包含某个类名
- cookie的生命周期
- error C2470: “main()”: 看起来像函数定义,但没有参数列表;跳过明显的函数体
- 手机蓝牙各类服务对应的UUID(常用的几个已通过验证)
- 使用share prefernces实现轻量级数据存储
- hdu 2846-字典树