Text Clustering with K-means


The previous two articles used the Naive Bayes and KNN algorithms to classify the newsgroup document set; this article uses the K-means algorithm to cluster the same documents.


1. Text Preprocessing

Text preprocessing was already covered in the previous two articles, so it is omitted here.
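For readers landing here first, below is a minimal sketch of the kind of preprocessing the rest of the pipeline assumes: each raw document is tokenized, lower-cased, stripped of stop words, and written out one word per line, which is the format the vectorization code expects. The class name, the tiny stop-word list, and the file layout are illustrative only; the real preprocessing from the earlier articles also applies stemming.

package com.datamine.kmeans;

import java.io.*;
import java.util.*;

/**
 * Minimal preprocessing sketch (hypothetical): tokenize, lower-case,
 * drop stop words, and emit one word per line.
 */
public class SimplePreprocessor {

    // Illustrative stop-word list; a real run would use a full list
    private final Set<String> stopWords = new HashSet<String>(
            Arrays.asList("the", "a", "an", "of", "to", "and", "in", "is", "it"));

    public void preprocess(String srcFile, String desFile) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(srcFile));
             FileWriter writer = new FileWriter(desFile)) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Split on anything that is not a letter; keep non-empty tokens
                for (String token : line.toLowerCase().split("[^a-z]+")) {
                    if (!token.isEmpty() && !stopWords.contains(token))
                        writer.write(token + "\n");
                }
            }
        }
    }
}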


2. Text Vectorization

package com.datamine.kmeans;

import java.io.*;
import java.util.*;

/**
 * Computes the attribute vector of every document, i.e. vectorizes the corpus.
 * @author Administrator
 */
public class ComputeWordsVector {

    /**
     * Computes the TF-IDF attribute vector of every document.
     * @param testSampleDir directory holding the preprocessed clustering test set
     * @return the attribute vectors of all test samples: Map<file name, <term, TF-IDF value>>
     * @throws IOException
     */
    public Map<String, Map<String, Double>> computeTFMultiIDF(String testSampleDir) throws IOException {
        String word;
        Map<String, Map<String, Double>> allTestSampleMap = new TreeMap<String, Map<String, Double>>();
        Map<String, Double> idfPerWordMap = computeIDF(testSampleDir);
        Map<String, Double> tfPerDocMap = new TreeMap<String, Double>();
        File[] samples = new File(testSampleDir).listFiles();
        System.out.println("the total number of test files is " + samples.length);
        for (int i = 0; i < samples.length; i++) {
            tfPerDocMap.clear();
            BufferedReader samBR = new BufferedReader(new FileReader(samples[i]));
            double wordSumPerDoc = 0.0; // total number of words in this document
            while ((word = samBR.readLine()) != null) {
                if (!word.isEmpty()) {
                    wordSumPerDoc++;
                    if (tfPerDocMap.containsKey(word))
                        tfPerDocMap.put(word, tfPerDocMap.get(word) + 1.0);
                    else
                        tfPerDocMap.put(word, 1.0);
                }
            }
            samBR.close();
            // Turn raw counts into TF-IDF weights:
            // (term count / document length) * log(total docs / docs containing the term)
            for (Map.Entry<String, Double> me : tfPerDocMap.entrySet()) {
                double IDF = Math.log(samples.length / idfPerWordMap.get(me.getKey()));
                me.setValue((me.getValue() / wordSumPerDoc) * IDF);
            }
            TreeMap<String, Double> tempMap = new TreeMap<String, Double>();
            tempMap.putAll(tfPerDocMap);
            allTestSampleMap.put(samples[i].getName(), tempMap);
        }
        printTestSampleMap(allTestSampleMap);
        return allTestSampleMap;
    }

    /**
     * Dumps the test sample map to a file, for debugging.
     * @param allTestSampleMap
     * @throws IOException
     */
    private void printTestSampleMap(Map<String, Map<String, Double>> allTestSampleMap) throws IOException {
        File outPutFile = new File("E:/DataMiningSample/KmeansClusterResult/allTestSampleMap.txt");
        FileWriter outPutFileWriter = new FileWriter(outPutFile);
        for (Map.Entry<String, Map<String, Double>> me : allTestSampleMap.entrySet()) {
            outPutFileWriter.append(me.getKey() + " ");
            for (Map.Entry<String, Double> vme : me.getValue().entrySet())
                outPutFileWriter.append(vme.getKey() + " " + vme.getValue() + " ");
            outPutFileWriter.append("\n");
            outPutFileWriter.flush();
        }
        outPutFileWriter.close();
    }

    /**
     * Counts the total occurrences of every word; the words occurring more than
     * n times form the final attribute dictionary.
     * @param strDir absolute path of the preprocessed newsgroup directory
     * @param wordMap records every word seen so far
     * @return newWordMap: the words occurring more than n times
     * @throws IOException
     */
    public SortedMap<String, Double> countWords(String strDir, Map<String, Double> wordMap) throws IOException {
        File sampleFile = new File(strDir);
        File[] sample = sampleFile.listFiles();
        String word;
        for (int i = 0; i < sample.length; i++) {
            if (!sample[i].isDirectory()) {
                BufferedReader samBR = new BufferedReader(new FileReader(sample[i]));
                while ((word = samBR.readLine()) != null) {
                    if (word.isEmpty())
                        continue; // skip blank lines instead of counting them
                    if (wordMap.containsKey(word))
                        wordMap.put(word, wordMap.get(word) + 1);
                    else
                        wordMap.put(word, 1.0);
                }
                samBR.close();
            } else {
                countWords(sample[i].getCanonicalPath(), wordMap);
            }
        }
        /*
         * After stop-word removal, features are selected with the DF method for
         * now; other feature-selection algorithms can be plugged in later.
         */
        SortedMap<String, Double> newWordMap = new TreeMap<String, Double>();
        for (Map.Entry<String, Double> me : wordMap.entrySet()) {
            if (me.getValue() > 100) // DF-based dimensionality reduction
                newWordMap.put(me.getKey(), me.getValue());
        }
        return newWordMap;
    }

    /**
     * Computes the document frequencies used for IDF, i.e. in how many
     * documents each dictionary word appears.
     * @param testSampleDir directory of the clustering test samples
     * @return Map<word, number of documents containing the word>
     * @throws IOException
     */
    public Map<String, Double> computeIDF(String testSampleDir) throws IOException {
        Map<String, Double> IDFPerWordMap = new TreeMap<String, Double>();
        // words of the current document that have already been counted
        Set<String> alreadyCountWord = new HashSet<String>();
        String word;
        File[] samples = new File(testSampleDir).listFiles();
        for (int i = 0; i < samples.length; i++) {
            alreadyCountWord.clear();
            BufferedReader tsBR = new BufferedReader(new FileReader(samples[i]));
            while ((word = tsBR.readLine()) != null) {
                if (!alreadyCountWord.contains(word)) {
                    if (IDFPerWordMap.containsKey(word))
                        IDFPerWordMap.put(word, IDFPerWordMap.get(word) + 1.0);
                    else
                        IDFPerWordMap.put(word, 1.0);
                    alreadyCountWord.add(word);
                }
            }
            tsBR.close();
        }
        return IDFPerWordMap;
    }

    /**
     * Creates the clustering test set: filters every document down to its
     * feature words and writes the result into a single directory.
     * @param srcDir source directory: preprocessed documents, not yet filtered
     * @param desDir destination directory: the clustering test set
     * @return the array of feature words (the attribute dictionary)
     * @throws IOException
     */
    public String[] createTestSamples(String srcDir, String desDir) throws IOException {
        SortedMap<String, Double> wordMap = new TreeMap<String, Double>();
        wordMap = countWords(srcDir, wordMap);
        System.out.println("special words map sizes:" + wordMap.size());
        String word, testSampleFile;
        File[] sampleDir = new File(srcDir).listFiles();
        for (int i = 0; i < sampleDir.length; i++) {
            File[] sample = sampleDir[i].listFiles();
            for (int j = 0; j < sample.length; j++) {
                testSampleFile = desDir + sampleDir[i].getName() + "_" + sample[j].getName();
                BufferedReader samBR = new BufferedReader(new FileReader(sample[j]));
                FileWriter tsWriter = new FileWriter(new File(testSampleFile));
                while ((word = samBR.readLine()) != null) {
                    if (wordMap.containsKey(word))
                        tsWriter.append(word + "\n");
                }
                tsWriter.flush();
                tsWriter.close();
                samBR.close();
            }
        }
        // Return the attribute dictionary as an array
        String[] terms = new String[wordMap.size()];
        int i = 0;
        for (Map.Entry<String, Double> me : wordMap.entrySet())
            terms[i++] = me.getKey();
        return terms;
    }
}
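In other words, the weight that computeTFMultiIDF assigns to term t in document d is the standard length-normalized TF-IDF (this restates the code above, it adds nothing new):

w_{t,d} = \frac{n_{t,d}}{\sum_{t'} n_{t',d}} \times \log\frac{N}{\mathrm{df}(t)}

where n_{t,d} is the number of occurrences of t in d, N is the total number of documents (samples.length), and df(t) is the number of documents containing t (the value stored in idfPerWordMap).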

3. The K-means Algorithm

K-means is a classic clustering algorithm. The main steps: first choose K initial points (for example at random) as the initial cluster centers, then compute the distance from every other point to each of the K centers and assign each point to its nearest cluster. After the assignment, recompute the center of each cluster; since the centers have moved, update them, recompute the distances, and reassign all points. The centers move again, and the algorithm iterates this way until the assignment stabilizes.
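Formally, this loop can be read as coordinate descent on the usual K-means objective (stated here for orientation; the code below measures distance as 1 − similarity on TF-IDF vectors rather than squared Euclidean distance):

J = \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \mu_k \rVert^2, \qquad \mu_k = \frac{1}{\lvert C_k \rvert} \sum_{x_i \in C_k} x_i

The assignment step minimizes J over cluster memberships with the centers fixed, and the update step (computeNewMean below) minimizes it over the centers with the memberships fixed. For squared Euclidean distance this guarantees J never increases; with other distance measures convergence is not guaranteed, which is one reason the implementation also caps the number of iterations.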


Initial center selection: random choice, evenly spaced sampling, the max-min method, etc.

Distance measure: one minus a similarity score, where the similarity is either (1) the cosine of the angle between the two vectors or (2) their raw inner product; the code below uses the inner product, which performs comparably here and is faster (a minimal sketch follows this list).

Stopping condition: evaluate a criterion function and/or cap the maximum number of iterations.

Empty clusters: K-means can produce empty clusters; handle them explicitly, or they will crash the program.
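As a minimal standalone sketch of the two similarity options (mirroring the computeSim method in the class below, but with the cosine variant written out instead of commented):

import java.util.Map;

/** Similarity measures between two sparse TF-IDF vectors. */
public class SimilaritySketch {

    /** Inner product of two sparse vectors: sum over the shared keys. */
    public static double innerProduct(Map<String, Double> v1, Map<String, Double> v2) {
        double mul = 0;
        for (Map.Entry<String, Double> e : v1.entrySet()) {
            Double w2 = v2.get(e.getKey());
            if (w2 != null)
                mul += e.getValue() * w2;
        }
        return mul;
    }

    /** Cosine similarity: inner product divided by the two vector norms. */
    public static double cosine(Map<String, Double> v1, Map<String, Double> v2) {
        double norm1 = 0, norm2 = 0;
        for (double w : v1.values()) norm1 += w * w;
        for (double w : v2.values()) norm2 += w * w;
        double denom = Math.sqrt(norm1) * Math.sqrt(norm2);
        return denom == 0 ? 0 : innerProduct(v1, v2) / denom;
    }

    /** Distance as used by the clustering code: 1 - similarity. */
    public static double distance(Map<String, Double> v1, Map<String, Double> v2) {
        return 1 - innerProduct(v1, v2);
    }
}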


package com.datamine.kmeans;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

/**
 * Implementation of the K-means clustering algorithm; clusters the newsgroup
 * document set into 10, 20 or 30 clusters.
 * Stopping condition: the algorithm ends when every point's nearest cluster
 * center is already the center of the cluster it belongs to.
 * @author Administrator
 */
public class KmeansCluster {

    /**
     * Main K-means procedure.
     * @param allTestSampleMap vectorized test samples: <file name, <term, TF-IDF value>>
     * @param k number of clusters
     * @return clustering result: <file name, assigned cluster id>
     */
    private Map<String, Integer> doProcess(Map<String, Map<String, Double>> allTestSampleMap, int k) {

        // 0. Collect all file names of allTestSampleMap, in order, into an array
        String[] testSampleNames = new String[allTestSampleMap.size()];
        int count = 0, tsLength = allTestSampleMap.size();
        for (Map.Entry<String, Map<String, Double>> me : allTestSampleMap.entrySet())
            testSampleNames[count++] = me.getKey();

        // 1. Initial centers can be chosen at random or spread evenly; we use the latter
        Map<Integer, Map<String, Double>> meansMap = getInitPoint(allTestSampleMap, k);
        double[][] distance = new double[tsLength][k]; // distance[i][j]: distance from point i to center j

        // 2. Initialize the k clusters
        int[] assignMeans = new int[tsLength]; // cluster id of every point, initially all 0
        Map<Integer, Vector<Integer>> clusterMember = new TreeMap<Integer, Vector<Integer>>(); // member point ids per cluster
        int iterNum = 0; // iteration counter
        while (true) {
            System.out.println("Iteration No." + (iterNum++) + "-------------------------");

            // 3. Compute the distance from every point to every cluster center
            for (int i = 0; i < tsLength; i++)
                for (int j = 0; j < k; j++)
                    distance[i][j] = getDistance(allTestSampleMap.get(testSampleNames[i]), meansMap.get(j));

            // 4. Find the nearest cluster center for every point
            int[] nearestMeans = new int[tsLength];
            for (int i = 0; i < tsLength; i++)
                nearestMeans[i] = findNearestMeans(distance, i);

            // 5. If every point is already assigned to its nearest cluster, or the
            //    maximum number of iterations has been reached, stop
            int okCount = 0;
            for (int i = 0; i < tsLength; i++)
                if (nearestMeans[i] == assignMeans[i])
                    okCount++;
            System.out.println("okCount = " + okCount);
            if (okCount == tsLength || iterNum >= 10)
                break;

            // 6. Otherwise reassign: update each cluster's members and each point's cluster id
            clusterMember.clear();
            for (int i = 0; i < tsLength; i++) {
                assignMeans[i] = nearestMeans[i];
                if (clusterMember.containsKey(nearestMeans[i])) {
                    clusterMember.get(nearestMeans[i]).add(i);
                } else {
                    Vector<Integer> tempMem = new Vector<Integer>();
                    tempMem.add(i);
                    clusterMember.put(nearestMeans[i], tempMem);
                }
            }

            // 7. Recompute the center of every cluster
            for (int i = 0; i < k; i++) {
                if (!clusterMember.containsKey(i)) // K-means may produce empty clusters
                    continue;
                Map<String, Double> newMean = computeNewMean(clusterMember.get(i), allTestSampleMap, testSampleNames);
                Map<String, Double> tempMean = new TreeMap<String, Double>();
                tempMean.putAll(newMean);
                meansMap.put(i, tempMean);
            }
        }

        // 8. Build and return the clustering result
        Map<String, Integer> resMap = new TreeMap<String, Integer>();
        for (int i = 0; i < tsLength; i++)
            resMap.put(testSampleNames[i], assignMeans[i]);
        return resMap;
    }

    /**
     * Computes the new center of a cluster as the average of its member vectors.
     * @param clusterM ids of the points in this cluster
     * @param allTestSampleMap all test samples: <file name, vector>
     * @param testSampleNames array of all test sample names
     * @return the new center vector
     */
    private Map<String, Double> computeNewMean(Vector<Integer> clusterM,
            Map<String, Map<String, Double>> allTestSampleMap, String[] testSampleNames) {
        double memberNum = (double) clusterM.size();
        Map<String, Double> newMeanMap = new TreeMap<String, Double>();
        for (int me : clusterM) {
            Map<String, Double> currentMemMap = allTestSampleMap.get(testSampleNames[me]);
            for (Map.Entry<String, Double> ne : currentMemMap.entrySet()) {
                if (newMeanMap.containsKey(ne.getKey()))
                    newMeanMap.put(ne.getKey(), newMeanMap.get(ne.getKey()) + ne.getValue());
                else
                    newMeanMap.put(ne.getKey(), ne.getValue());
            }
        }
        for (Map.Entry<String, Double> me : newMeanMap.entrySet())
            me.setValue(me.getValue() / memberNum);
        return newMeanMap;
    }

    /**
     * Finds the cluster center nearest to the given point.
     * @param distance distances from every point to every center
     * @param m the point (document index)
     * @return index j of the nearest center
     */
    private int findNearestMeans(double[][] distance, int m) {
        double minDist = Double.MAX_VALUE; // sentinel larger than any possible distance
        int j = 0;
        for (int i = 0; i < distance[m].length; i++) {
            if (distance[m][i] < minDist) {
                minDist = distance[m][i];
                j = i;
            }
        }
        return j;
    }

    /**
     * Distance between two points: 1 - similarity.
     * @param map1 vector map of point 1
     * @param map2 vector map of point 2
     */
    private double getDistance(Map<String, Double> map1, Map<String, Double> map2) {
        return 1 - computeSim(map1, map2);
    }

    /**
     * Similarity of two documents: either the cosine of the vector angle
     * (re-enable the commented-out code and the division) or the plain vector
     * inner product (as is - comparable quality and faster).
     * @param testWordTFMap <term, weight> vector of document 1
     * @param trainWordTFMap <term, weight> vector of document 2
     * @return similarity of the two vectors
     */
    private double computeSim(Map<String, Double> testWordTFMap, Map<String, Double> trainWordTFMap) {
        double mul = 0; //, testAbs = 0, trainAbs = 0;
        for (Map.Entry<String, Double> me : testWordTFMap.entrySet()) {
            if (trainWordTFMap.containsKey(me.getKey()))
                mul += me.getValue() * trainWordTFMap.get(me.getKey());
            //testAbs += me.getValue() * me.getValue();
        }
        //testAbs = Math.sqrt(testAbs);
        /*
        for (Map.Entry<String, Double> me : trainWordTFMap.entrySet())
            trainAbs += me.getValue() * me.getValue();
        trainAbs = Math.sqrt(trainAbs);
        */
        return mul; // / (testAbs * trainAbs);
    }

    /**
     * Chooses the initial centers for the K-means iteration, spread evenly
     * through the (sorted) sample set.
     * @param allTestSampleMap <file name, <term, TF-IDF value>>
     * @param k number of clusters
     * @return meansMap: the k initial center vectors
     */
    private Map<Integer, Map<String, Double>> getInitPoint(Map<String, Map<String, Double>> allTestSampleMap, int k) {
        int count = 0, i = 0;
        // holds the center vectors of the k clusters
        Map<Integer, Map<String, Double>> meansMap = new TreeMap<Integer, Map<String, Double>>();
        System.out.println("The files chosen as initial centers for this run are:");
        for (Map.Entry<String, Map<String, Double>> me : allTestSampleMap.entrySet()) {
            if (count == i * allTestSampleMap.size() / k) {
                meansMap.put(i, me.getValue());
                System.out.println(me.getKey());
                i++;
            }
            count++;
        }
        return meansMap;
    }

    /**
     * Writes the clustering result to a file.
     * @param kmeansClusterResult clustering result
     * @param kmeansClusterResultFile output file
     * @throws IOException
     */
    private void printClusterResult(Map<String, Integer> kmeansClusterResult, String kmeansClusterResultFile)
            throws IOException {
        FileWriter resultWriter = new FileWriter(kmeansClusterResultFile);
        for (Map.Entry<String, Integer> me : kmeansClusterResult.entrySet())
            resultWriter.append(me.getKey() + " " + me.getValue() + "\n");
        resultWriter.flush();
        resultWriter.close();
    }

    /**
     * Evaluation: computes the entropy and the confusion matrix from the
     * clustering result file.
     * @param kmeansClusterResultFile clustering result file
     * @param k number of clusters
     * @return entropy of the clustering result
     * @throws IOException
     */
    private double evaluateClusterResult(String kmeansClusterResultFile, int k) throws IOException {
        Map<String, String> rightCate = new TreeMap<String, String>();  // true category per file
        Map<String, String> resultCate = new TreeMap<String, String>(); // assigned cluster per file
        BufferedReader crBR = new BufferedReader(new FileReader(kmeansClusterResultFile));
        String line;
        while ((line = crBR.readLine()) != null) {
            String[] s = line.split(" ");
            resultCate.put(s[0], s[1]);
            // the true category is encoded in the file-name prefix, e.g. "rec.autos_103685"
            rightCate.put(s[0], s[0].split("_")[0]);
        }
        crBR.close();
        return computeEntropyAndConfuMatrix(rightCate, resultCate, k); // return the entropy
    }

    /**
     * Computes and prints the confusion matrix, and returns the entropy.
     * @param rightCate map of file name to true category
     * @param resultCate map of file name to assigned cluster
     * @param k number of clusters
     * @return clustering entropy
     */
    private double computeEntropyAndConfuMatrix(Map<String, String> rightCate, Map<String, String> resultCate, int k) {
        // k rows, 20 columns; [i][j] counts the files of category j in cluster i
        int[][] confusionMatrix = new int[k][20];

        // First map each category name to a column index
        SortedSet<String> cateNames = new TreeSet<String>();
        for (Map.Entry<String, String> me : rightCate.entrySet())
            cateNames.add(me.getValue());
        String[] cateNamesArray = cateNames.toArray(new String[0]);
        Map<String, Integer> cateNamesToIndex = new TreeMap<String, Integer>();
        for (int i = 0; i < cateNamesArray.length; i++)
            cateNamesToIndex.put(cateNamesArray[i], i);
        for (Map.Entry<String, String> me : rightCate.entrySet())
            confusionMatrix[Integer.parseInt(resultCate.get(me.getKey()))][cateNamesToIndex.get(me.getValue())]++;

        // Print the confusion matrix
        double[] clusterSum = new double[k];          // number of files per cluster
        double[] everyClusterEntropy = new double[k]; // entropy per cluster
        double clusterEntropy = 0;
        System.out.print("      ");
        for (int i = 0; i < 20; i++)
            System.out.printf("%-6d", i);
        System.out.println();
        for (int i = 0; i < k; i++) {
            System.out.printf("%-6d", i);
            for (int j = 0; j < 20; j++) {
                clusterSum[i] += confusionMatrix[i][j];
                System.out.printf("%-6d", confusionMatrix[i][j]);
            }
            System.out.println();
        }
        System.out.println();

        // Compute the entropy
        for (int i = 0; i < k; i++) {
            if (clusterSum[i] != 0) {
                for (int j = 0; j < 20; j++) {
                    double p = (double) confusionMatrix[i][j] / clusterSum[i];
                    if (p != 0)
                        everyClusterEntropy[i] += -p * Math.log(p);
                }
                clusterEntropy += clusterSum[i] / (double) rightCate.size() * everyClusterEntropy[i];
            }
        }
        return clusterEntropy;
    }

    public void KmeansClusterMain(String testSampleDir) throws IOException {
        // First compute the TF-IDF vector of every document, stored as
        // Map<String, Map<String, Double>>, i.e. Map<file name, Map<term, TF-IDF value>>
        ComputeWordsVector computV = new ComputeWordsVector();
        //int k[] = {10, 20, 30}; // three cluster counts to compare
        int k[] = {20};
        Map<String, Map<String, Double>> allTestSampleMap = computV.computeTFMultiIDF(testSampleDir);
        for (int i = 0; i < k.length; i++) {
            System.out.println("Clustering begins, k = " + k[i]);
            String KmeansClusterResultFile = "E:\\DataMiningSample\\KmeansClusterResult\\";
            Map<String, Integer> KmeansClusterResult = doProcess(allTestSampleMap, k[i]);
            KmeansClusterResultFile += k[i];
            printClusterResult(KmeansClusterResult, KmeansClusterResultFile);
            System.out.println("The Entropy for this Cluster is "
                    + evaluateClusterResult(KmeansClusterResultFile, k[i]));
        }
    }

    public static void main(String[] args) throws IOException {
        KmeansCluster test = new KmeansCluster();
        String KmeansClusterResultFile = "E:\\DataMiningSample\\KmeansClusterResult\\20";
        System.out.println("The Entropy for this Cluster is "
                + test.evaluateClusterResult(KmeansClusterResultFile, 20));
    }
}
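The evaluation above uses the standard clustering entropy. Writing p_{ij} for the fraction of cluster i's documents that truly belong to category j, and n_i for the size of cluster i out of N documents in total:

H_i = -\sum_{j} p_{ij} \log p_{ij}, \qquad H = \sum_{i} \frac{n_i}{N} H_i

A perfect clustering, in which every cluster is pure, has entropy 0, and lower is better. This is exactly the quantity computeEntropyAndConfuMatrix returns.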

4. Program Entry Point

package com.datamine.kmeans;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class ClusterMain {

    /**
     * Main entry point of the K-means clustering program.
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {

        // Data preprocessing was already implemented for the classification
        // algorithms and is omitted here
        ComputeWordsVector computeV = new ComputeWordsVector();
        KmeansCluster kmeansCluster = new KmeansCluster();
        String srcDir = "E:\\DataMiningSample\\processedSample\\";
        String desDir = "E:\\DataMiningSample\\clusterTestSample\\";

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String beginTime = sdf.format(new Date());
        System.out.println("Program start time: " + beginTime);

        String[] terms = computeV.createTestSamples(srcDir, desDir);
        kmeansCluster.KmeansClusterMain(desDir);

        String endTime = sdf.format(new Date());
        System.out.println("Program end time: " + endTime);
    }
}

5. Clustering Results

Console log of a full run. In the confusion matrix, row i is cluster i and column j is true category j, with the 20 categories indexed alphabetically (so column 0 is alt.atheism, column 19 is talk.politics.misc).

Program start time: 2016-03-14 17:02:38
special words map sizes:3832
the total number of test files is 18828
Clustering begins, k = 20
The files chosen as initial centers for this run are:
alt.atheism_49960
comp.graphics_38307
comp.os.ms-windows.misc_10112
comp.sys.ibm.pc.hardware_58990
comp.sys.mac.hardware_50449
comp.windows.x_66402
comp.windows.x_68299
misc.forsale_76828
rec.autos_103685
rec.motorcycles_105046
rec.sport.baseball_104941
rec.sport.hockey_54126
sci.crypt_15819
sci.electronics_54016
sci.med_59222
sci.space_61185
soc.religion.christian_20966
talk.politics.guns_54517
talk.politics.mideast_76331
talk.politics.misc_178699
Iteration No.0-------------------------
okCount = 512
Iteration No.1-------------------------
okCount = 10372
Iteration No.2-------------------------
okCount = 15295
Iteration No.3-------------------------
okCount = 17033
Iteration No.4-------------------------
okCount = 17643
Iteration No.5-------------------------
okCount = 18052
Iteration No.6-------------------------
okCount = 18282
Iteration No.7-------------------------
okCount = 18404
Iteration No.8-------------------------
okCount = 18500
Iteration No.9-------------------------
okCount = 18627
      0     1     2     3     4     5     6     7     8     9     10    11    12    13    14    15    16    17    18    19
0     482   0     3     3     1     1     0     5     2     1     0     0     2     27    11    53    4     6     15    176
1     4     601   69    8     14    127   7     5     5     8     0     14    31    16    34    2     2     2     1     5
2     1     64    661   96    18    257   26    9     3     0     0     13    25    13    6     2     3     2     6     2
3     0     56    78    575   213   15    119   15    6     2     1     4     131   2     4     2     6     0     2     1
4     1     25    13    151   563   11    50    3     3     1     2     14    125   4     8     1     0     3     0     0
5     2     28    78    25    37    348   13    2     0     0     2     5     38    5     6     2     1     1     2     8
6     20    80    24    21    23    166   38    45    45    26    10    37    87    34    27    22    15    8     35    12
7     4     20    6     24    45    6     629   28    20    14    0     3     87    10    4     1     8     0     13    0
8     0     2     1     10    8     4     25    781   40    1     1     0     70    5     10    2     8     4     2     3
9     4     2     11    0     1     1     11    34    831   1     0     1     7     7     0     1     1     1     8     0
10    10    7     6     2     4     1     7     7     4     633   4     5     11    18    9     5     13    8     10    3
11    1     0     1     9     4     1     20    1     3     286   961   0     17    8     4     2     2     0     5     3
12    3     14    0     6     1     2     2     0     1     1     0     858   51    1     1     2     16    8     69    4
13    3     15    4     7     7     17    5     12    8     5     2     5     46    13    793   6     5     2     30    5
14    2     4     0     1     0     2     4     6     3     4     4     2     14    746   3     1     2     3     55    11
15    30    43    29    39    15    18    12    13    7     3     4     13    195   38    36    5     6     18    5     11
16    195   1     0     2     0     1     1     0     4     1     4     1     4     16    6     846   3     6     16    274
17    8     2     0     2     4     2     1     5     7     0     0     10    30    12    5     28    363   9     289   23
18    19    1     0     0     2     0     0     6     0     1     1     3     1     3     2     9     8     843   48    18
19    10    8     1     1     1     0     2     13    2     6     3     3     9     12    18    5     444   16    164   69

The Entropy for this Cluster is 1.2444339205006887
Program end time: 2016-03-14 17:08:24



