Kmeans聚类c++实现

来源:互联网 发布:淘宝上隐形文胸哪种好 编辑:程序博客网 时间:2024/06/05 12:43

Kmeans聚类算法

Kmeans算法首先随机选取K个质心,然后计算每个样本到每个质心的距离,样本与哪个质心最近就属于哪一个簇;接着利用分好的样本重新计算质心,再重新计算距离形成新的簇。反复执行该过程,直到簇不再发生变化或者达到最大迭代次数。

K值的选取可以采用canopy方法:
(1)设样本集合为S,确定两个阈值t1和t2,且t1>t2。
(2)任取一个样本点p,作为一个Canopy,记为C,从S中移除p。
(3)计算S中所有点到p的距离dist
(4)若dist小于t1,则将相应点归到C,作为弱关联。
(5)若dist小于t2,则将相应点移出S,作为强关联。
(6)重复(2)~(5),直至S为空。

具体代码如下:

#include <math.h>
#include <time.h>

#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <utility>
#include <vector>

using namespace std;

// Number of clusters to build.
const int k = 3;

// Squared Euclidean distance between a sample and a centroid.
// Only the dimensions present in both vectors are compared, so a length
// mismatch cannot cause an out-of-range read.
static double squareddistance(const vector<double>& x, const vector<double>& central)
{
    const size_t dim = x.size() < central.size() ? x.size() : central.size();
    double sum = 0;
    for (size_t i = 0; i < dim; ++i)
    {
        const double diff = x[i] - central[i];
        sum += diff * diff;
    }
    return sum;
}

// Euclidean distance from one sample point to one centroid.
double computedistance(const vector<double>& x, const vector<double>& central)
{
    return std::sqrt(squareddistance(x, central));
}

// Index of the centroid closest to sample x.
// Ties go to the lowest index because the comparison is strict.
// Precondition: totalcentral is non-empty.
int belongcluster(const vector<double>& x, const vector<vector<double>>& totalcentral)
{
    int label = 0;
    double best = computedistance(x, totalcentral[0]);
    for (size_t i = 1; i < totalcentral.size(); ++i)
    {
        const double candidate = computedistance(x, totalcentral[i]);
        if (candidate < best)
        {
            best = candidate;
            label = static_cast<int>(i);
        }
    }
    return label;
}

// Total within-cluster sum of squared errors: for each of the k clusters, the
// squared distance of every member to that cluster's centroid, summed.
// This is the quantity the progress messages in Kmeans() announce.
// cluster must point to k clusters and totalcentral must hold k centroids.
double computevar(const vector<vector<double>> cluster[], const vector<vector<double>>& totalcentral)
{
    double var = 0;
    for (int i = 0; i < k; ++i)
    {
        for (const vector<double>& member : cluster[i])
        {
            var += squareddistance(member, totalcentral[i]);
        }
    }
    return var;
}

// Recomputes one cluster's centroid as the per-dimension mean of its members
// and stores it in central.
// An empty cluster leaves central untouched: there is nothing to average, and
// keeping the previous centroid lets the cluster pick up members again later.
// All rows of cluster are assumed to have the same length as the first row.
void computecentral(const vector<vector<double>>& cluster, vector<double>& central)
{
    if (cluster.empty())
    {
        return;
    }
    const size_t dim = cluster[0].size();
    central.assign(dim, 0.0);
    for (const vector<double>& member : cluster)
    {
        for (size_t i = 0; i < dim; ++i)
        {
            central[i] += member[i];
        }
    }
    for (size_t i = 0; i < dim; ++i)
    {
        central[i] /= cluster.size();
    }
}

// Prints the members of each of the k clusters, one sample per line.
void printcluster(const vector<vector<double>> cluster[])
{
    for (int i = 0; i < k; ++i)
    {
        cout << "第" << i + 1 << "个簇中的元素:" << endl;
        for (const vector<double>& member : cluster[i])
        {
            for (double value : member)
            {
                cout << value << " ";
            }
            cout << endl;
        }
        cout << endl;
    }
}

// Empties all k clusters and reassigns every sample to its nearest centroid.
static void assignsamples(const vector<vector<double>>& sample,
                          const vector<vector<double>>& totalcentral,
                          vector<vector<double>> cluster[])
{
    for (int i = 0; i < k; ++i)
    {
        cluster[i].clear();
    }
    for (const vector<double>& row : sample)
    {
        cluster[belongcluster(row, totalcentral)].push_back(row);
    }
}

// Runs k-means on sample (one row per sample, one column per feature) and
// prints the per-iteration error and the final clusters to stdout.
//
// tolerance: iteration stops once the error changes by less than this between
//            two consecutive passes (default 1.0).
// maxiter:   upper bound on the number of refinement passes, so the loop
//            always terminates (default 100).
void Kmeans(const vector<vector<double>>& sample, double tolerance = 1.0, int maxiter = 100)
{
    if (sample.empty())
    {
        // Nothing to cluster; seeding would otherwise compute rand() % 0.
        cerr << "Kmeans: no samples to cluster" << endl;
        return;
    }

    vector<vector<double>> cluster[k];
    vector<vector<double>> totalcentral(k);
    srand(static_cast<unsigned int>(time(NULL)));

    // Seed centroids with distinct random rows (partial Fisher-Yates shuffle)
    // so two clusters do not start on the very same sample. When there are
    // fewer samples than clusters the remaining seeds are drawn with
    // replacement.
    vector<size_t> order(sample.size());
    for (size_t i = 0; i < order.size(); ++i)
    {
        order[i] = i;
    }
    for (size_t c = 0; c < static_cast<size_t>(k); ++c)
    {
        if (c < order.size())
        {
            const size_t pick = c + static_cast<size_t>(rand()) % (order.size() - c);
            swap(order[c], order[pick]);
            totalcentral[c] = sample[order[c]];
        }
        else
        {
            totalcentral[c] = sample[static_cast<size_t>(rand()) % sample.size()];
        }
    }

    assignsamples(sample, totalcentral, cluster);
    double newvar = computevar(cluster, totalcentral);
    cout << "初始的的整体误差平方和为:" << newvar << endl;

    double oldvar = 0;
    int iter = 0;
    // std::fabs keeps the comparison in floating point; an unqualified abs()
    // may resolve to the int overload on some toolchains.
    while (std::fabs(newvar - oldvar) >= tolerance && iter < maxiter)
    {
        cout << "第" << iter + 1 << "次迭代开始:" << endl;
        for (int i = 0; i < k; ++i)
        {
            computecentral(cluster[i], totalcentral[i]);
        }
        assignsamples(sample, totalcentral, cluster);
        oldvar = newvar;
        newvar = computevar(cluster, totalcentral);
        cout << "此次迭代之后的整体误差平方和为:" << newvar << endl;
        ++iter;
    }

    cout << "The result is:\n";
    printcluster(cluster);
}

// Reads whitespace-separated samples with featuredim values per row from
// data.txt and clusters them. Returns 1 if the file cannot be opened.
int main()
{
    ifstream indata("data.txt");
    if (!indata)
    {
        cout << "打开文件错误!" << endl;
        return 1;
    }

    const int featuredim = 4;
    vector<vector<double>> data;
    vector<double> rowdata(featuredim);
    while (true)
    {
        // Test each extraction instead of eof(): eof() only turns true after
        // a read has already failed, which would append a bogus last row.
        int got = 0;
        while (got < featuredim && (indata >> rowdata[got]))
        {
            ++got;
        }
        if (got < featuredim)
        {
            break; // end of file or malformed input; an incomplete row is dropped
        }
        data.push_back(rowdata);
    }

    Kmeans(data);
    return 0;
}
0 1
原创粉丝点击