K-means 算法
来源:互联网 发布:股市数据下载 编辑:程序博客网 时间:2024/06/02 01:39
欢迎关注我的博客:http://blog.csdn.net/hit2015spring
前期预备知识
在无监督的算法中,训练样本的标记信息是未知的,目标是通过对训练样本学习来揭示数据的内在性质和规律。聚类试图将数据集中的样本划分为若干个通常是不相交的子集,每个子集称为一个簇,就是一堆不知道标签的数据样本,这些样本中每一个都包含着一个 $n$ 维的特征向量 $x=(x_1;x_2;\dots;x_n)$。
就是描述一个事物它具有n个特征,这些特征可以反映出一个物体它属于哪个类别。于是聚类算法将这些样本D划分为
(当然这个男生女生的标签是我们自己加的,在k-means聚类的过程中算法是不知道这个标签的,它只是根据这些特征的联系(就是距离)把认为是同一类的样本聚集在一起)。
这里面引入了距离的定义:
对于两个样本 $x_i=(x_{i1};x_{i2};\dots;x_{in})$ 与 $x_j=(x_{j1};x_{j2};\dots;x_{jn})$,定义

$$\mathrm{dist}(x_i,x_j)=\left(\sum_{u=1}^{n}\left|x_{iu}-x_{ju}\right|^{p}\right)^{1/p}\tag{1}$$

表达式(1)叫做闵可夫斯基距离。
当 $p=2$ 时,它就是欧氏距离;
当 $p=1$ 时,它就是曼哈顿距离。
当然上述的属性度量是基于这些属性是有“序”的关系。就像:属性值为(1,2,3)1和3距离比较远,和2距离比较近。具体可以用具体的值度量的。当然还有无序的属性,就像:{红衣服,黑衣服,蓝衣服}这样的属性我们不能直接用属性的值进行计算,这里就用到了VDM距离进行计算。具体可以见西瓜书的描述p200.
k均值聚类
给定样本集 $D=\{x_1,x_2,\dots,x_m\}$,k 均值算法针对聚类所得的簇划分 $C=\{C_1,C_2,\dots,C_k\}$ 最小化平方误差

$$E=\sum_{i=1}^{k}\sum_{x\in C_i}\left\|x-\mu_i\right\|_2^2$$

这里面 $\mu_i=\dfrac{1}{|C_i|}\sum_{x\in C_i}x$ 是簇 $C_i$ 的均值向量。$E$ 刻画了簇内样本围绕簇均值向量的紧密程度,$E$ 越小簇内样本相似度越高。
可是要得到这个最小化的解其实是很不容易的,于是k均值用的是一个贪心算法进行近似求解的。
伪代码如下:
1、根据事先选择好的k值,随机在原始样本中选择初值,这些初值就当做是k个中心
2、对所有的点,分别计算它到这 k 个中心的距离
3、每个点都能得到k个距离,选取最近的那个距离,把这个点归到该类别。
4、这下得到了这k个簇里面都有一些点了吧,计算这些点的中心点,然后更新一下这些k个簇的中心。
5、是否满足你要求的迭代条件,如果没有满足条件,从第2步继续重复。
具体的一个例子
c++代码
// NOTE(review): the blog published three files (k-means.h, k-means.cpp,
// main.cpp) concatenated with the .cpp BEFORE the header it includes, and it
// relied on transitive includes: memset/memcpy were used without <cstring>,
// assert without <cassert>, printf without <cstdio>.  Reordered header-first
// and includes completed so the listing builds as one translation unit.

// ===================== k-means.h =====================
#pragma once

#include <fstream>
#include <cstring>  // memcpy in SetMean

// Plain k-means clustering over dense double vectors.  Samples come either
// from a binary file (<size:int><dim:int><data:double...>) or from a flat
// row-major array (N rows of m_dimNum doubles).
class KMeans
{
public:
    // How the k initial cluster centers are chosen.
    enum InitMode
    {
        InitRandom,   // one random sample from each of k equal index slices
        InitManual,   // caller supplies the means via SetMean()
        InitUniform,  // samples at evenly spaced indices
    };

    KMeans(int dimNum = 1, int clusterNum = 1);
    ~KMeans();

    void SetMean(int i, const double* u) { memcpy(m_means[i], u, sizeof(double) * m_dimNum); }
    void SetInitMode(int i) { m_initMode = i; }
    void SetMaxIterNum(int i) { m_maxIterNum = i; }
    void SetEndError(double f) { m_endError = f; }

    double* GetMean(int i) { return m_means[i]; }
    int GetInitMode() { return m_initMode; }
    int GetMaxIterNum() { return m_maxIterNum; }
    double GetEndError() { return m_endError; }

    /* SampleFile: <size><dim><data>...
       LabelFile:  <size><label>... */
    void Cluster(const char* sampleFileName, const char* labelFileName);
    void Init(std::ifstream& sampleFile);
    void Init(double* data, int N);
    void Cluster(double* data, int N, int* Label);
    friend std::ostream& operator<<(std::ostream& out, KMeans& kmeans);

private:
    int m_dimNum;      // feature dimension
    int m_clusterNum;  // k
    double** m_means;  // k cluster centers, each an array of m_dimNum doubles
    int m_initMode;
    int m_maxIterNum;  // stopping criterion: maximum number of iterations
    double m_endError; // stopping criterion: relative change of average cost

    // Writes the index of the nearest center into *label and returns the
    // distance to it.
    double GetLabel(const double* x, int* label);
    // Euclidean distance between two dimNum-dimensional vectors.
    double CalcDistance(const double* x, const double* u, int dimNum);
};

// ===================== k-means.cpp =====================
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <cassert>
#include <iostream>

using namespace std;

KMeans::KMeans(int dimNum, int clusterNum)
{
    m_dimNum = dimNum;
    m_clusterNum = clusterNum;

    m_means = new double*[m_clusterNum];
    for (int i = 0; i < m_clusterNum; i++)
    {
        m_means[i] = new double[m_dimNum];
        memset(m_means[i], 0, sizeof(double) * m_dimNum);
    }

    m_initMode = InitRandom;
    m_maxIterNum = 100;
    m_endError = 0.001;
}

KMeans::~KMeans()
{
    for (int i = 0; i < m_clusterNum; i++)
    {
        delete[] m_means[i];
    }
    delete[] m_means;
}

// File-based clustering: reads samples from sampleFileName, runs k-means,
// and writes <size><label>... to labelFileName.
void KMeans::Cluster(const char* sampleFileName, const char* labelFileName)
{
    // Check the sample file
    ifstream sampleFile(sampleFileName, ios_base::binary);
    assert(sampleFile);

    int size = 0;
    int dim = 0;
    sampleFile.read((char*)&size, sizeof(int));
    sampleFile.read((char*)&dim, sizeof(int));
    assert(size >= m_clusterNum);
    assert(dim == m_dimNum);

    // Initialize model
    Init(sampleFile);

    // Iteration state
    double* x = new double[m_dimNum]; // current sample
    int label = -1;                   // class index of the current sample
    int iterNum = 0;                  // was `double` in the original; it only counts iterations
    double lastCost = 0;
    double currCost = 0;
    int unchanged = 0;                // number of near-converged passes seen
    bool loop = true;
    int* counts = new int[m_clusterNum];
    double** next_means = new double*[m_clusterNum]; // re-estimated centers
    for (int i = 0; i < m_clusterNum; i++)
    {
        next_means[i] = new double[m_dimNum];
    }

    while (loop)
    {
        // clean accumulation buffers for this pass
        memset(counts, 0, sizeof(int) * m_clusterNum);
        for (int i = 0; i < m_clusterNum; i++)
        {
            memset(next_means[i], 0, sizeof(double) * m_dimNum);
        }

        lastCost = currCost;
        currCost = 0;

        // rewind past the <size><dim> header
        sampleFile.clear();
        sampleFile.seekg(sizeof(int) * 2, ios_base::beg);

        // Assignment step: each sample joins its nearest center; the cost is
        // the average sample-to-center distance.
        for (int i = 0; i < size; i++)
        {
            sampleFile.read((char*)x, sizeof(double) * m_dimNum);
            currCost += GetLabel(x, &label);

            counts[label]++;
            for (int d = 0; d < m_dimNum; d++)
            {
                next_means[label][d] += x[d];
            }
        }
        currCost /= size;

        // Update step: each non-empty cluster's center becomes the mean of
        // its members; an empty cluster keeps its previous center.
        for (int i = 0; i < m_clusterNum; i++)
        {
            if (counts[i] > 0)
            {
                for (int d = 0; d < m_dimNum; d++)
                {
                    next_means[i][d] /= counts[i];
                }
                memcpy(m_means[i], next_means[i], sizeof(double) * m_dimNum);
            }
        }

        // Terminal conditions: stop at m_maxIterNum iterations, or once the
        // average cost has changed by less than m_endError * lastCost on
        // three (cumulative) passes.
        iterNum++;
        if (fabs(lastCost - currCost) < m_endError * lastCost)
        {
            unchanged++;
        }
        if (iterNum >= m_maxIterNum || unchanged >= 3)
        {
            loop = false;
        }

        //DEBUG
        //cout << "Iter: " << iterNum << ", Average Cost: " << currCost << endl;
    }

    // Output the label file: re-read every sample and emit its final label.
    ofstream labelFile(labelFileName, ios_base::binary);
    assert(labelFile);

    labelFile.write((char*)&size, sizeof(int));
    sampleFile.clear();
    sampleFile.seekg(sizeof(int) * 2, ios_base::beg);
    for (int i = 0; i < size; i++)
    {
        sampleFile.read((char*)x, sizeof(double) * m_dimNum);
        GetLabel(x, &label);
        labelFile.write((char*)&label, sizeof(int));
    }

    sampleFile.close();
    labelFile.close();

    delete[] counts;
    delete[] x;
    for (int i = 0; i < m_clusterNum; i++)
    {
        delete[] next_means[i];
    }
    delete[] next_means;
}

// In-memory clustering.
//   data  - N * m_dimNum doubles, row-major (one sample per row)
//   N     - number of samples (feature vectors)
//   Label - out: N cluster indices in [0, m_clusterNum)
void KMeans::Cluster(double* data, int N, int* Label)
{
    int size = N;
    assert(size >= m_clusterNum);

    // Initialize model
    Init(data, N);

    // Iteration state (see the file-based overload for details)
    double* x = new double[m_dimNum]; // current sample
    int label = -1;
    int iterNum = 0;
    double lastCost = 0;
    double currCost = 0;
    int unchanged = 0;
    bool loop = true;
    int* counts = new int[m_clusterNum];
    double** next_means = new double*[m_clusterNum]; // re-estimated centers
    for (int i = 0; i < m_clusterNum; i++)
    {
        next_means[i] = new double[m_dimNum];
    }

    while (loop)
    {
        // clean accumulation buffers for this pass
        memset(counts, 0, sizeof(int) * m_clusterNum);
        for (int i = 0; i < m_clusterNum; i++)
        {
            memset(next_means[i], 0, sizeof(double) * m_dimNum);
        }

        lastCost = currCost;
        currCost = 0;

        // Assignment step
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < m_dimNum; j++)
                x[j] = data[i * m_dimNum + j];

            currCost += GetLabel(x, &label);

            counts[label]++;
            for (int d = 0; d < m_dimNum; d++)
            {
                next_means[label][d] += x[d];
            }
        }
        currCost /= size;

        // Update step: empty clusters keep their previous center.
        for (int i = 0; i < m_clusterNum; i++)
        {
            if (counts[i] > 0)
            {
                for (int d = 0; d < m_dimNum; d++)
                {
                    next_means[i][d] /= counts[i];
                }
                memcpy(m_means[i], next_means[i], sizeof(double) * m_dimNum);
            }
        }

        // Terminal conditions
        iterNum++;
        if (fabs(lastCost - currCost) < m_endError * lastCost)
        {
            unchanged++;
        }
        if (iterNum >= m_maxIterNum || unchanged >= 3)
        {
            loop = false;
        }

        //DEBUG
        //cout << "Iter: " << iterNum << ", Average Cost: " << currCost << endl;
    }

    // Emit final labels.
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < m_dimNum; j++)
            x[j] = data[i * m_dimNum + j];
        GetLabel(x, &label);
        Label[i] = label;
    }

    delete[] counts;
    delete[] x;
    for (int i = 0; i < m_clusterNum; i++)
    {
        delete[] next_means[i];
    }
    delete[] next_means;
}

// Choose initial centers from an in-memory sample array.
void KMeans::Init(double* data, int N)
{
    int size = N;

    if (m_initMode == InitRandom)
    {
        int inteval = size / m_clusterNum;
        double* sample = new double[m_dimNum];

        // Seed the random-number generator with current time
        srand((unsigned)time(NULL));

        for (int i = 0; i < m_clusterNum; i++)
        {
            // Pick one random sample inside slice i.  The original computed
            // (inteval - 1) * rand() / RAND_MAX in int arithmetic, which can
            // overflow for large slices; scale in double instead.
            int select = inteval * i + (int)((double)rand() / RAND_MAX * (inteval - 1));
            for (int j = 0; j < m_dimNum; j++)
                sample[j] = data[select * m_dimNum + j];
            memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
        }

        delete[] sample;
    }
    else if (m_initMode == InitUniform)
    {
        double* sample = new double[m_dimNum];

        for (int i = 0; i < m_clusterNum; i++)
        {
            int select = i * size / m_clusterNum;
            for (int j = 0; j < m_dimNum; j++)
                sample[j] = data[select * m_dimNum + j];
            memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
        }

        delete[] sample;
    }
    else if (m_initMode == InitManual)
    {
        // Do nothing: the caller set the means via SetMean().
    }
}

// Choose initial centers by seeking into the binary sample file.
void KMeans::Init(ifstream& sampleFile)
{
    int size = 0;
    sampleFile.seekg(0, ios_base::beg);
    sampleFile.read((char*)&size, sizeof(int));

    if (m_initMode == InitRandom)
    {
        int inteval = size / m_clusterNum;
        double* sample = new double[m_dimNum];

        // Seed the random-number generator with current time
        srand((unsigned)time(NULL));

        for (int i = 0; i < m_clusterNum; i++)
        {
            // Same overflow-safe slice sampling as the in-memory overload.
            int select = inteval * i + (int)((double)rand() / RAND_MAX * (inteval - 1));
            int offset = sizeof(int) * 2 + select * sizeof(double) * m_dimNum;

            sampleFile.seekg(offset, ios_base::beg);
            sampleFile.read((char*)sample, sizeof(double) * m_dimNum);
            memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
        }

        delete[] sample;
    }
    else if (m_initMode == InitUniform)
    {
        double* sample = new double[m_dimNum];

        for (int i = 0; i < m_clusterNum; i++)
        {
            int select = i * size / m_clusterNum;
            int offset = sizeof(int) * 2 + select * sizeof(double) * m_dimNum;

            sampleFile.seekg(offset, ios_base::beg);
            sampleFile.read((char*)sample, sizeof(double) * m_dimNum);
            memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
        }

        delete[] sample;
    }
    else if (m_initMode == InitManual)
    {
        // Do nothing: the caller set the means via SetMean().
    }
}

double KMeans::GetLabel(const double* sample, int* label)
{
    double dist = -1; // -1 sentinel: no center examined yet
    for (int i = 0; i < m_clusterNum; i++)
    {
        double temp = CalcDistance(sample, m_means[i], m_dimNum);
        if (temp < dist || dist == -1)
        {
            dist = temp;
            *label = i;
        }
    }
    return dist;
}

double KMeans::CalcDistance(const double* x, const double* u, int dimNum)
{
    double temp = 0;
    for (int d = 0; d < dimNum; d++)
    {
        temp += (x[d] - u[d]) * (x[d] - u[d]);
    }
    return sqrt(temp);
}

// Dump the model as simple XML-ish text (dimensions and the k mean vectors).
ostream& operator<<(ostream& out, KMeans& kmeans)
{
    out << "<KMeans>" << endl;
    out << "<DimNum> " << kmeans.m_dimNum << " </DimNum>" << endl;
    out << "<ClusterNum> " << kmeans.m_clusterNum << " </CluterNum>" << endl;

    out << "<Mean>" << endl;
    for (int i = 0; i < kmeans.m_clusterNum; i++)
    {
        for (int d = 0; d < kmeans.m_dimNum; d++)
        {
            out << kmeans.m_means[i][d] << " ";
        }
        out << endl;
    }
    out << "</Mean>" << endl;

    out << "</KMeans>" << endl;
    return out;
}

// ===================== main.cpp =====================
#include <cstdio>

int main()
{
    double data[] =
    {
         0.0,  0.2,  0.4,
         0.3,  0.2,  0.4,
         0.4,  0.2,  0.4,
         0.5,  0.2,  0.4,
         5.0,  5.2,  8.4,
         6.0,  5.2,  7.4,
         4.0,  5.2,  4.4,
        10.3, 10.4, 10.5,
        10.1, 10.6, 10.7,
        11.3, 10.2, 10.9,
    };

    const int size = 10;       //Number of samples
    const int dim = 3;         //Dimension of feature
    const int cluster_num = 4; //Cluster number

    KMeans* kmeans = new KMeans(dim, cluster_num);
    int* labels = new int[size];
    kmeans->SetInitMode(KMeans::InitUniform);
    kmeans->Cluster(data, size, labels);

    for (int i = 0; i < size; ++i)
    {
        printf("%f, %f, %f belongs to %d cluster\n",
               data[i * dim + 0], data[i * dim + 1], data[i * dim + 2], labels[i]);
    }

    delete[] labels;
    delete kmeans;
    return 0;
}
- K-means算法补充:K-means++
- k-means算法
- K-MEANS算法
- K-MEANS算法
- K-means算法
- k-means算法
- K-Means 算法
- K-Means 算法
- 深入浅出K-Means算法
- 聚类:K-means算法
- K-Means 算法
- k-means 算法
- 深入浅出K-Means算法
- K-Means 算法
- 深入浅出K-means算法
- 深入浅出K-Means算法
- k means算法入门
- K-means算法
- 比较好的Android开发帖子
- 有关ViewPager使用及解决ViewPager的item需要展示多个控件
- C#获取鼠标位置,模拟鼠标,模拟键盘(多方转载)
- memcached--统计命令
- scp命令
- K-means 算法
- 剑指offer--之字符串中先出现两次的字符
- 我是怎么绕过微信小程序审核机制的
- 【C/C++】文件流操作
- ps -ef 命令详解
- spring学习笔记(二)
- Python开发实战pdf
- Mysql Binlog三种格式介绍及分析
- 5.x的AssetBundle