Python Data Analysis for KDD (Part 2)

Working with the data obtained in the previous post, the goal is clustering: finding distinct groups of students and seeing what characterizes each one. The clustering algorithm is the simplest bisecting variant, which keeps splitting one cluster into two.

#coding: utf-8
'''
Created on Feb 16, 2011
k Means Clustering for Ch10 of Machine Learning in Action
@author: Peter Harrington
'''
from numpy import *

import edXnew

def loadDataSet(fileName):      #general function to parse tab-delimited floats
    dataMat = []                #assume last column is target value
    fileName = unicode(fileName, 'utf8')
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float, curLine)    #map all elements to float()
        dataMat.append(fltLine)
    return dataMat

def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))    #la.norm(vecA-vecB)

def randCent(dataSet, k):
    n = shape(dataSet)[1]
    centroids = mat(zeros((k,n)))    #create centroid mat
    for j in range(n):    #create random cluster centers, within bounds of each dimension
        minJ = min(dataSet[:,j])
        # print minJ
        rangeJ = float(max(dataSet[:,j]) - minJ)
        centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
    # print centroids
    return centroids

def kMeans1(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))    #assignment matrix: column 0 holds the cluster index,
                                          #column 1 the error (squared distance to the centroid)
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):    #for each data point assign it to the closest centroid
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j,:], dataSet[i,:])    #distance from point i to centroid j
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i,0] != minIndex:    #any change of assignment means another pass is needed
                clusterChanged = True
            clusterAssment[i,:] = minIndex, minDist**2
        for cent in range(k):    #recalculate centroids
            ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]    #all points in cluster cent
            # if ptsInClust != []:
            centroids[cent,:] = mean(ptsInClust, axis=0)    #centroid = mean of the points assigned to it
            # else:
            #     continue
    return centroids, clusterAssment
def biKmeans(dataSet, k, distMeas=distEclud):
    print 'start kmeans'
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]    #create a list with one centroid
    for j in range(m):        #calc initial error
        clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
    while (len(centList) < k):
        lowestSSE = inf
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]    #the data points currently in cluster i
            if len(ptsInCurrCluster) > 0:    #skip empty clusters; comparing a matrix to [] is unreliable
                centroidMat, splitClustAss = kMeans1(ptsInCurrCluster, 2, distMeas)
                sseSplit = sum(splitClustAss[:,1])    #compare the SSE to the current minimum
                sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            else:
                continue
            # print "sseSplit, and notSplit: ", sseSplit, sseNotSplit
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)    #change 1 to 3, 4, or whatever
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
        # print 'the bestCentToSplit is: ', bestCentToSplit
        # print 'the len of bestClustAss is: ', len(bestClustAss)
        print 'assign cluster'
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]    #replace the split centroid with the two new ones
        centList.append(bestNewCents[1,:].tolist()[0])
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:] = bestClustAss    #reassign new clusters and SSE
    print 'complete kMeans!'
    return mat(centList), clusterAssment
The code was downloaded; I no longer remember from where (the docstring identifies it as the k-means code from Chapter 10 of Machine Learning in Action). The entry point is biKmeans.
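For a quick sanity check, here is a minimal usage sketch; it assumes the file above is saved as kMeans.py (which the import in the next script suggests), and testSet.txt is a hypothetical tab-delimited numeric file:

#coding: utf-8
# minimal usage sketch; 'testSet.txt' is a hypothetical tab-delimited feature file
from numpy import mat
import kMeans

dataMat = mat(kMeans.loadDataSet('testSet.txt'))
centroids, clustAssment = kMeans.biKmeans(dataMat, 4)   # bisect down to k=4 clusters
print centroids              # one row per cluster centroid
print clustAssment[:5, :]    # per point: [cluster index, squared distance to its centroid]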
The code adapted to my own data:

#coding: utf-8
import numpy
from numpy import *
import kMeans

class dataMining():
    #note: these are class-level attributes, shared across all instances
    FeatureDict = {}
    FeatureVec = []
    FeatureVecSVM = []
    enrollment = []
    labelSVM = []
    id = []
    def getDataSet2(self, filename, trainNum1, trainNum2):
        self.grade = []
        self.explore = []
        f = open(filename)
        i = 0
        for line in f.readlines():
            i = i + 1
            if i < trainNum2 and i >= trainNum1:    #only the rows in [trainNum1, trainNum2)
                cols = line.split(",")
                if cols[0] != 'id':                 #skip the header row
                    features = [float(cols[1]), float(cols[2]), float(cols[3]), float(cols[4]), float(cols[5]), float(cols[6]), float(cols[7])]
                    # features=[float(cols[4]),float(cols[5])]
                    self.FeatureVec.append(features)
                    self.enrollment.append(cols[8][0])    #first character of the enrollment (dropout) label
        f.close()
        return self.FeatureVec, self.enrollment
    def getDataSet3(self, filename, trainNum1, trainNum2):
        self.grade = []
        self.explore = []
        f = open(filename)
        i = 0
        for line in f.readlines():
            i = i + 1
            if i < trainNum2 and i >= trainNum1:
                cols = line.split(",")
                if cols[0] != 'id':
                    features = [float(cols[1]), float(cols[2]), float(cols[3]), float(cols[4]), float(cols[5]), float(cols[6]), float(cols[7]), float(cols[8]), float(cols[9]), float(cols[10])]
                    # features=[float(cols[1])]
                    self.FeatureVec.append(features)
                    self.enrollment.append(cols[3][0])
        f.close()
        return self.FeatureVec, self.enrollment
    def getDataSet4(self, filename, trainNum1, trainNum2):
        self.grade = []
        self.explore = []
        f = open(filename)
        i = 0
        for line in f.readlines():
            i = i + 1
            if i < trainNum2 and i >= trainNum1:
                cols = line.split(",")
                if cols[0] != 'id':
                    features = [float(cols[1]), float(cols[2]), float(cols[3]), float(cols[4]), float(cols[5]), float(cols[6]), float(cols[7])]
                    # features=[float(cols[1])]
                    self.id.append(cols[0])
                    self.FeatureVec.append(features)
                    self.enrollment.append(cols[8][0])
        f.close()
        return self.id, self.FeatureVec, self.enrollment
    def getDataSet5(self, filename, trainNum1, trainNum2):
        self.grade = []
        self.explore = []
        f = open(filename)
        i = 0
        for line in f.readlines():
            i = i + 1
            if i < trainNum2 and i >= trainNum1:
                cols = line.split(",")
                if cols[0] != 'id':
                    features = [float(cols[1]), float(cols[2]), float(cols[3]), float(cols[4]), float(cols[5]), float(cols[6]), float(cols[7])]
                    self.id.append(cols[0])
                    self.FeatureVec.append(features)
                    self.enrollment.append(cols[50][0])
        f.close()
        return self.id, self.FeatureVec, self.enrollment
    def PCA(self, filename, trainNum1, trainNum2):
        dataSet, labels = self.getDataSet2(filename, trainNum1, trainNum2)
        dataMat = mat(dataSet).transpose()
        n, m = dataMat.shape
        rowMean = dataMat.mean(axis=1)    #centre each feature (row) on its mean
        for i in range(0,n):
            for j in range(0,m):
                dataMat[i,j] = dataMat[i,j] - rowMean[i]
        C = (1/float(m))*dataMat*(dataMat.transpose())    #covariance matrix of the features
        featureValue, featureVec = numpy.linalg.eig(C)
        print featureValue, featureVec
        sumFeatureValue = 0
        listValue = []
        listVec = {}
        for l in range(0,n):
            listValue.append(featureValue[l])
            sumFeatureValue = sumFeatureValue + listValue[l]
            listVec[featureValue[l]] = featureVec[:,l].T    #eigenvectors are the columns of featureVec
        listValue.sort()    #ascending, so the largest eigenvalues sit at the end
        # print listValue,listVec
        tributeFeature = listValue[-1] + listValue[-2] + listValue[-3]    #three largest eigenvalues
        tribute = float(tributeFeature)/float(sumFeatureValue)    #variance contribution of the top three components
        # print featureValue
        # print featureVec
        print float(tributeFeature)/float(sumFeatureValue)
        P = numpy.vstack([listVec[listValue[-1]], listVec[listValue[-2]], listVec[listValue[-3]]])
        newDataMat = (P*dataMat).transpose()
        X = {}
        newlist = []
        for p in xrange(m):
            newlist.append(tuple(newDataMat[p].tolist()[0]))    #tuples are hashable, so they can key X
        for i in xrange(m):
            X[newlist[i]] = dataSet[i]
        return newDataMat, newlist, labels, X, tribute
    def PCA2(self, filename, trainNum1, trainNum2):
        dataSet, labels = self.getDataSet2(filename, trainNum1, trainNum2)
        dataMat = mat(dataSet)
        n, m = dataMat.shape
        print n, m
        #eig needs a square matrix, so decompose the covariance of the features
        #rather than the raw (samples x features) data matrix
        featureValue, featureVec = numpy.linalg.eig(cov(dataMat, rowvar=0))
        return featureValue, featureVec
    def scatterKMeans1(self, filename, dataNum1, dataNum2, k):
        print 'please wait...'
        # dataMat,newlist,labels,X,tribute=self.PCA(filename,dataNum1,dataNum2)
        dataSet, labels = self.getDataSet3(filename, dataNum1, dataNum2)
        dataMat = mat(dataSet)
        F = []
        filenamejia = r'E:\new'
        f1 = open(filenamejia+'\\cluster1.csv','w')
        f2 = open(filenamejia+'\\cluster2.csv','w')
        f3 = open(filenamejia+'\\cluster3.csv','w')
        f4 = open(filenamejia+'\\cluster4.csv','w')
        f5 = open(filenamejia+'\\cluster5.csv','w')
        f6 = open(filenamejia+'\\cluster6.csv','w')
        f7 = open(filenamejia+'\\cluster7.csv','w')
        f8 = open(filenamejia+'\\cluster8.csv','w')
        f9 = open(filenamejia+'\\cluster9.csv','w')
        f10 = open(filenamejia+'\\cluster10.csv','w')
        f11 = open(filenamejia+'\\result.txt','w')
        # f11.write('the feature tribute is :'+str(tribute)+'\n'+'\n')
        F.append(f1)
        F.append(f2)
        F.append(f3)
        F.append(f4)
        F.append(f5)
        F.append(f6)
        F.append(f7)
        F.append(f8)
        F.append(f9)
        F.append(f10)
        listtotal = []
        listlabel = []
        # initialDataMat=mat(dataSet)
        for i2 in range(0,k):
            # F[i2].write('access->access,access->wiki,access->discussion,discussion->discussion,discussion->access,discussion->wiki,wiki->wiki,wiki->discussion,wiki->access,others,enrollment'+'\n')
            listtotal.append(0)
            listlabel.append(0)
        # dataMat2=dataMat[:30000]
        Centroids, clustAssing = kMeans.biKmeans(dataMat, k, kMeans.distEclud)
        numSamples, dim = dataMat.shape
        # Centroids=(P*Centroids).transpose()
        mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
        # draw all samples
        for i in xrange(numSamples):
            markIndex = int(clustAssing[i, 0])
            for i3 in range(0,k):
                # print markIndex,i3
                if markIndex == i3:
                    # for k1 in range(0,10):
                    #     F[i3].write(str(X.get(newlist[i])[k1])+',')
                    F[i3].write(str(dataMat[i,0])+','+labels[i]+'\n')
                    # F[i3].write(labels[i]+'\n')
                    # F[i3].write(str(dataMat[i,0])+','+str(dataMat[i,1])+','+str(dataMat[i,2])+','+str(dataMat[i,3])+','+str(dataMat[i,4])+','+str(dataMat[i,5])+','+str(dataMat[i,6])+','+labels[i]+'\n')
                    listtotal[i3] = listtotal[i3] + 1
                    listlabel[i3] = int(labels[i]) + listlabel[i3]
        for i4 in range(0,k):
            F[i4].close()
            f11.write('cluster:'+str(i4+1)+'\n'+'DropOut rate:'+str(float(listlabel[i4])/float(listtotal[i4]))+'\n')
        f11.close()
        # Centroids2,clustAssing2=kMeans.biKmeans(dataMat2,k,kMeans.distEclud)
        # for i in range(k):
        #     matplotlib.pyplot.plot(Centroids2[i, 0], Centroids2[i, 1], marker='*', c='white', markersize=20)
        # matplotlib.pyplot.savefig('E:\experiment data\TsingHuaKMeans\TsingHua.png')
        # matplotlib.pyplot.show()
    def scatterKMeans2(self, filename, dataNum1, dataNum2, k):
        print 'please wait...'
        # dataMat,newlist,labels,X,tribute=self.PCA(filename,dataNum1,dataNum2)
        id, dataSet, labels = self.getDataSet4(filename, dataNum1, dataNum2)
        dataMat = mat(dataSet)
        self.F = []
        title = ["navigate", "access", "page_close", "discussion", "wiki", "video", "problem"]
        # title=['navigate->navigate', 'navigate->access', 'navigate->page_close', 'navigate->discussion', 'navigate->wiki', 'navigate->video', 'navigate->problem', 'access->navigate', 'access->access', 'access->page_close', 'access->discussion', 'access->wiki', 'access->video', 'access->problem', 'page_close->navigate', 'page_close->access', 'page_close->page_close', 'page_close->discussion', 'page_close->wiki', 'page_close->video', 'page_close->problem', 'discussion->navigate', 'discussion->access', 'discussion->page_close', 'discussion->discussion', 'discussion->wiki', 'discussion->video', 'discussion->problem', 'wiki->navigate', 'wiki->access', 'wiki->page_close', 'wiki->discussion', 'wiki->wiki', 'wiki->video', 'wiki->problem', 'video->navigate', 'video->access', 'video->page_close', 'video->discussion', 'video->wiki', 'video->video', 'video->problem', 'problem->navigate', 'problem->access', 'problem->page_close', 'problem->discussion', 'problem->wiki', 'problem->video', 'problem->problem']
        # filenamejia=r'E:\experiment data\TimeSequence\OneStepTrans\\'+str(k0+1)+'\\'+str(k)
        filenamejia = r'E:\experiment data\TimeSequence\week classify\2ndweek\\'+str(k)
        for i10 in range(0,k):
            self.F.append(i10)    #placeholders, replaced by file handles below
        for i9 in range(0,k):
            self.F[i9] = open(filenamejia+'\\cluster'+str(i9+1)+'.csv','w')
            print self.F[i9]
            self.F[i9].write('id,')
            for i90 in range(0,len(title)):
                self.F[i9].write(title[i90]+',')
            self.F[i9].write('enrollment,\n')
        # f11=open(r'E:\experiment data\TimeSequence\OneStepTrans\\'+str(k0+1)+'\\'+str(k)+'\\result.txt','w')
        self.f11 = open(r'E:\experiment data\TimeSequence\week classify\2ndweek\\'+str(k)+'\\result.txt','w')
        # f11.write('the feature tribute is :'+str(tribute)+'\n'+'\n')
        self.listtotal = []
        self.listlabel = []
        # initialDataMat=mat(dataSet)
        for i2 in range(0,k):
            # F[i2].write('access->access,access->wiki,access->discussion,discussion->discussion,discussion->access,discussion->wiki,wiki->wiki,wiki->discussion,wiki->access,others,enrollment'+'\n')
            self.listtotal.append(0)
            self.listlabel.append(0)
        Centroids, clustAssing = kMeans.biKmeans(dataMat, k, kMeans.distEclud)
        numSamples, dim = dataMat.shape
        # Centroids=(P*Centroids).transpose()
        mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
        # draw all samples
        for i in xrange(numSamples):
            markIndex = int(clustAssing[i, 0])
            for i3 in range(0,k):
                # print markIndex,i3
                if markIndex == i3:
                    # for k1 in range(0,10):
                    #     F[i3].write(str(X.get(newlist[i])[k1])+',')
                    # F[i3].write(str(dataMat[i,0])+','+labels[i]+'\n')
                    # F[i3].write(labels[i]+'\n')
                    self.F[i3].write(id[i]+','+str(dataMat[i,0])+','+str(dataMat[i,1])+','+str(dataMat[i,2])+','+str(dataMat[i,3])+','+str(dataMat[i,4])+','+str(dataMat[i,5])+','+str(dataMat[i,6])+','+labels[i]+',\n')
                    self.listtotal[i3] = self.listtotal[i3] + 1
                    self.listlabel[i3] = int(labels[i]) + self.listlabel[i3]
        for i4 in range(0,k):
            self.F[i4].flush()
            self.F[i4].close()
            self.f11.write('cluster:'+str(i4+1)+'\n'+'DropOut rate:'+str(float(self.listlabel[i4])/float(self.listtotal[i4]))+'\n')
        self.f11.flush()
        self.f11.close()
    def scatterKMeans3(self, filename, dataNum1, dataNum2, k0, k):
        print 'please wait...'
        # dataMat,newlist,labels,X,tribute=self.PCA(filename,dataNum1,dataNum2)
        id, dataSet, labels = self.getDataSet4(filename, dataNum1, dataNum2)
        dataMat = mat(dataSet)
        F = []
        # title=['navigate->navigate', 'navigate->access', 'navigate->page_close', 'navigate->discussion', 'navigate->wiki', 'navigate->video', 'navigate->problem', 'access->navigate', 'access->access', 'access->page_close', 'access->discussion', 'access->wiki', 'access->video', 'access->problem', 'page_close->navigate', 'page_close->access', 'page_close->page_close', 'page_close->discussion', 'page_close->wiki', 'page_close->video', 'page_close->problem', 'discussion->navigate', 'discussion->access', 'discussion->page_close', 'discussion->discussion', 'discussion->wiki', 'discussion->video', 'discussion->problem', 'wiki->navigate', 'wiki->access', 'wiki->page_close', 'wiki->discussion', 'wiki->wiki', 'wiki->video', 'wiki->problem', 'video->navigate', 'video->access', 'video->page_close', 'video->discussion', 'video->wiki', 'video->video', 'video->problem', 'problem->navigate', 'problem->access', 'problem->page_close', 'problem->discussion', 'problem->wiki', 'problem->video', 'problem->problem']
        filenamejia = r'E:\experiment data\TimeSequence\1_2week\\'+str(k0+1)+'\\'+str(k)
        for i10 in range(0,k):
            F.append(i10)    #placeholders, replaced by file handles below
        for i9 in range(0,k):
            F[i9] = open(filenamejia+'\\cluster'+str(i9+1)+'.csv','w')
            print F[i9]
            F[i9].write('id,navigate,access,page_close,discussion,wiki,video,problem,enrollment,\n')
        f11 = open(r'E:\experiment data\TimeSequence\1_2week\\'+str(k0+1)+'\\'+str(k)+'\\result.txt','w')
        # f11.write('the feature tribute is :'+str(tribute)+'\n'+'\n')
        listtotal = []
        listlabel = []
        # initialDataMat=mat(dataSet)
        for i2 in range(0,k):
            # F[i2].write('access->access,access->wiki,access->discussion,discussion->discussion,discussion->access,discussion->wiki,wiki->wiki,wiki->discussion,wiki->access,others,enrollment'+'\n')
            listtotal.append(0)
            listlabel.append(0)
        Centroids, clustAssing = kMeans.biKmeans(dataMat, k, kMeans.distEclud)
        numSamples, dim = dataMat.shape
        # Centroids=(P*Centroids).transpose()
        mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
        # draw all samples
        for i in xrange(numSamples):
            markIndex = int(clustAssing[i, 0])
            for i3 in range(0,k):
                # print markIndex,i3
                if markIndex == i3:
                    # for k1 in range(0,10):
                    #     F[i3].write(str(X.get(newlist[i])[k1])+',')
                    # F[i3].write(str(dataMat[i,0])+','+labels[i]+'\n')
                    # F[i3].write(labels[i]+'\n')
                    F[i3].write(id[i]+','+str(dataMat[i,0])+','+str(dataMat[i,1])+','+str(dataMat[i,2])+','+str(dataMat[i,3])+','+str(dataMat[i,4])+','+str(dataMat[i,5])+','+str(dataMat[i,6])+','+labels[i]+',\n')
                    listtotal[i3] = listtotal[i3] + 1
                    listlabel[i3] = int(labels[i]) + listlabel[i3]
        for i4 in range(0,k):
            F[i4].close()
            f11.write('cluster:'+str(i4+1)+'\n'+'DropOut rate:'+str(float(listlabel[i4])/float(listtotal[i4]))+'\n')
        f11.close()

This covers the machine-learning parts I used: KNN and a support vector machine for classification, plus the application of the clustering algorithm. Reading through it again should be enough to bring it all back.
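For completeness, a minimal sketch of how the class above might be driven; it assumes the class is saved as dataMining.py, the input CSV path is a hypothetical stand-in, and the hard-coded output folders already exist:

#coding: utf-8
# minimal driver sketch; the CSV path below is a hypothetical stand-in
import dataMining    # assuming the class above is saved as dataMining.py

dm = dataMining.dataMining()
# cluster rows 1..30000 of the feature file into k=5 groups; this writes
# cluster1.csv .. cluster5.csv plus result.txt with the per-cluster dropout rates
dm.scatterKMeans2(r'E:\experiment data\features.csv', 1, 30000, 5)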
The clustering evaluation below is my own code, written with reference to a document whose source I also no longer remember. I clearly need to organize my material better.

#coding: utf-8
from kMeans import distEclud
from numpy import *
#for each cluster, compute the mean pairwise distance between its points, stored in the
#classinnerdis array (k values in total); and the mean point-to-point distance from one
#cluster to each other cluster, stored in classdis (k*k values in total)
def classinner(filename):
    distance = 0.0
    # distanceother=[]
    # distanceother2=[]
    classinnerdis = []
    disClass = 0.0
    averdis = 0.0
    averCen = []
    averClass = []
    sumdis = 0
    classdis1 = []
    classdis2 = []
    classdis = []
    for k0 in range(0,len(filename)):
        # o1=0
        distanceCen = 0.0    #per-cluster total, so distances do not accumulate across clusters
        point = []
        f = open(filename[k0])
        classinnerdis.append(0.0)
        classdis.append(0.0)
        averCen.append(0.0)
        print 'compute '+filename[k0]
        for line in f.readlines():
            # o1=o1+1
            # if o1<100:
            cols = line.split(',')
            # if not(cols[0].__contains__('->')):
            if not(cols[0]=='id'):
                point.append([float(cols[1]),float(cols[2]),float(cols[3]),float(cols[4]),float(cols[5]),float(cols[6]),float(cols[7])])
        point = mat(point)
        # print point
        classdis[k0] = point.mean(axis=0)    #despite the name, this holds the cluster centroid
        print classdis[k0]
        if len(point) > 1:
            for i in range(0,len(point)):
                distanceCen = distEclud(point[i,:], classdis[k0]) + distanceCen
            #     for j in range(0,len(point)):
            #         if i!=j:
            #             distance=distEclud(point[i,:],point[j,:])+distance
            # classinnerdis[k0]=float(distance)/float((len(point))**(len(point)-1))
        averCen[k0] = (float(distanceCen)/float(len(point)))**(1./2)
        print 'completed a cluster point computation!'
    for i01 in range(0,len(classinnerdis)):
        sumdis = sumdis + classinnerdis[i01]
    averdis = float(sumdis)/float(len(classinnerdis))
    print 'completed all cluster computations'
    return classinnerdis, averdis, classdis, averCen

def autoNorm(dataSet):
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    normDataSet = dataSet - tile(minVals,(m,1))
    normDataSet = normDataSet/tile(ranges,(m,1))
    return normDataSet

def getInter(filename1, k, b):
    print 'start compute...'
    Cmp = 0.0
    Sep = 0.0
    classdis11 = []
    point1 = []
    distanceCen11 = 0.0
    averdis11 = 0.0
    disCluster = 0.0
    f1 = open(filename1)
    # o2=0
    ocq = 0.0
    for line2 in f1.readlines():
        # o2=o2+1
        # if o2<100:
        cols1 = line2.split(',')
        if not(cols1[0]=='id'):
            point1.append([float(cols1[1]),float(cols1[2]),float(cols1[3]),float(cols1[4]),float(cols1[5]),float(cols1[6]),float(cols1[7])])
    point1 = mat(point1)
    classdis11 = point1.mean(axis=0)
    print 'centroid of the whole data set:'
    if len(point1) > 1:
        for i in range(0,len(point1)):
            distanceCen11 = distEclud(point1[i,:], classdis11) + distanceCen11
    averdis11 = (float(distanceCen11)/float(len(point1)))**(1./2)
    print 'completed whole-data-set point computation!'
    filename = []
    filenamejia = r'E:\experiment data\TimeSequence\week classify\4week\\'+str(k)
    for i00 in range(0,k):
        filename.append(filenamejia+'\\cluster'+str(i00+1)+'.csv')
    classinnerdis, averdis, classdis, averCen = classinner(filename)
    for j1 in range(0,k):
        Cmp = Cmp + float(averCen[j1])/float(averdis11)    #compactness: cluster spread relative to overall spread
    Cmp = float(Cmp)/float(k)
    for j3 in range(0,len(classdis)):
        for j4 in range(0,len(classdis)):
            if j3 != j4:
                disCluster = disCluster + exp(-((distEclud(classdis[j3],classdis[j4]))/float(1000)))
    Sep = (float(disCluster))/float(k*(k-1))    #separation: decays as the centroids move apart
    print Sep
    ocq = 1 - (b*Cmp + (1-b)*Sep)
    return Cmp, Sep, ocq

f0 = open(r'E:\experiment data\TimeSequence\week classify\4week\clusterassess.csv','w')
f0.write('Cmp,Sep,ocq,\n')
for k in range(2,11):
    Cmp, Sep, ocq = getInter(r'E:\experiment data\TimeSequence\autoNormData\4\TsingHuaAnalysis4thWeek.csv', k, 0.5)
    f0.write(str(Cmp)+','+str(Sep)+','+str(ocq)+',\n')
f0.close()

This includes writing out the results, and it computes the evaluation for every set of clustering results automatically. The loop runs from 2 to 11 because the k values I used in the earlier clustering were 2 through 10.
Roughly speaking, it computes intra-cluster and inter-cluster distances and combines them into a single score, ocq, used as the clustering-quality measure: the larger the ocq, the better the clustering for that value of k.
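As a worked example of ocq = 1 - (b*Cmp + (1-b)*Sep): with b = 0.5, Cmp = 0.3 and Sep = 0.4 give ocq = 1 - (0.15 + 0.2) = 0.65. And a small sketch for picking the best k out of the output file, assuming the clusterassess.csv written above, whose data rows correspond to k = 2..10 in order:

#coding: utf-8
# pick the k whose row has the largest ocq; a sketch, not part of the original post
f0 = open(r'E:\experiment data\TimeSequence\week classify\4week\clusterassess.csv')
rows = f0.readlines()[1:]    # skip the 'Cmp,Sep,ocq,' header
f0.close()
bestK, bestOcq = None, float('-inf')
for offset, row in enumerate(rows):
    ocq = float(row.split(',')[2])    # third column is ocq
    if ocq > bestOcq:
        bestK, bestOcq = offset + 2, ocq    # first data row corresponds to k=2
print 'best k:', bestK, 'with ocq:', bestOcq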
