K-Means clustering: a C++ implementation

来源:互联网 发布:做条形码的软件 编辑:程序博客网 时间:2024/04/30 19:56

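 K-Means is a very common unsupervised learning algorithm. In outline: pick k initial centroids, assign every point to its nearest centroid, recompute each centroid as the mean of the points assigned to it, and repeat until no assignment changes.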


C++ implementation:

#include <stdlib.h>
#include <time.h>

#include <cmath>
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

/*
Open fileName for reading through `in`.
Prints an error to stderr and terminates the process with exit code 1 when
the file cannot be opened (the error policy used by every reader below).
*/
static void openOrExit(std::ifstream& in, const std::string& fileName)
{
    in.open(fileName);
    if (!in)
    {
        std::cerr << "Error: file could not be opened" << std::endl;
        exit(1);
    }
}

/*
Count the data lines of a file.
Only lines containing at least one non-whitespace character are counted, so a
trailing newline or stray blank lines do not create phantom rows (looping on
eof() used to report one line too many for files ending in '\n').
Returns 0 for an empty file. Exits the process if the file cannot be opened.
*/
int getFileLines(const std::string& fileName)
{
    std::ifstream inf;
    openOrExit(inf, fileName);

    int total = 0;
    std::string line;
    while (std::getline(inf, line))
    {
        if (line.find_first_not_of(" \t\r\n\v\f") != std::string::npos)
        {
            ++total;
        }
    }
    return total;
}

/*
Get the number of columns of a file, split by delimiter.
All lines are expected to have the same number of columns, so only the first
line is inspected. The result is (occurrences of delimiter) + 1, which is 1
for an empty first line.
Exits the process if the file cannot be opened.
*/
int getFileColumns(const std::string& fileName, const std::string& delimiter)
{
    std::ifstream inf;
    openOrExit(inf, fileName);

    std::string line;
    std::getline(inf, line);

    // Skip the whole delimiter after each match; an empty delimiter still
    // advances by one character so the scan always terminates.
    const std::string::size_type step = delimiter.empty() ? 1 : delimiter.size();

    int column = 1;
    // find() returns size_type; keeping it in an int would make the
    // comparison with npos depend on an implementation-defined narrowing.
    std::string::size_type pos = line.find(delimiter);
    while (pos != std::string::npos)
    {
        ++column;
        pos = line.find(delimiter, pos + step);
    }
    return column;
}

/*
Get the min and max value of an array of `size` elements.
When size <= 0 there is nothing to inspect and min/max are left untouched.
*/
template<class T>
void getMaxMin(T array[], int size, T& min, T& max)
{
    if (size <= 0)
    {
        return;
    }
    min = max = array[0];
    for (int i = 1; i < size; ++i)
    {
        if (array[i] > max)
        {
            max = array[i];
        }
        else if (array[i] < min)
        {
            min = array[i];
        }
    }
}

/*
Create a two dimensional m x n array and store it in resultSet.
Every element is value-initialized (zero for arithmetic types).
resultSet is taken by reference: with a by-value pointer the allocation was
invisible to the caller and leaked. Release it with delete2DAarray(resultSet, m).
*/
template<class T>
void new2DArray(T**& resultSet, int m, int n)
{
    resultSet = new T*[m];
    for (int i = 0; i < m; ++i)
    {
        resultSet[i] = new T[n]();
    }
}

/*
Create a two dimensional m x n double array, zero-initialized.
The caller owns the result and must release it with delete2DAarray(result, m).
*/
double** new2DArray(int m, int n)
{
    double** resultSet = nullptr;
    new2DArray(resultSet, m, n);
    return resultSet;
}

/*
Delete a two dimensional array of m rows created by new2DArray.
A null pointer is ignored.
*/
template<class T>
void delete2DAarray(T** twoDArray, int m)
{
    if (twoDArray == nullptr)
    {
        return;
    }
    for (int i = 0; i < m; ++i)
    {
        delete[] twoDArray[i];
    }
    delete[] twoDArray;
}

/*
Read fileLines x fileColumns whitespace-separated numbers from a file into a
newly allocated two dimensional array, echoing each row to stdout.
Exits the process if the file cannot be opened or runs out of readable
numbers, instead of silently returning rows that were never filled in.
The caller owns the result and must release it with
delete2DAarray(result, fileLines).
*/
double** getDataSet(const std::string& fileName, int fileLines, int fileColumns)
{
    std::ifstream inf;
    openOrExit(inf, fileName);

    double** resultSet = new2DArray(fileLines, fileColumns);
    for (int i = 0; i < fileLines; ++i)
    {
        for (int j = 0; j < fileColumns; ++j)
        {
            if (!(inf >> resultSet[i][j]))
            {
                std::cerr << "Error: could not read a number at row " << i
                          << ", column " << j << std::endl;
                exit(1);
            }
            std::cout << resultSet[i][j] << " ";
        }
        std::cout << std::endl;
    }
    return resultSet;
}

/*
Read the whole data set of a file into a two dimensional array.
The row count comes from getFileLines and the column count from
getFileColumns(fileName, delimiter). The delimiter is only used for counting
columns: the values themselves are read with operator>>, so they must be
separated by whitespace (tab, space or newline).
The caller owns the result; its row count equals getFileLines(fileName).
*/
double** getDataSet(const std::string& fileName, const std::string& delimiter)
{
    const int fileLines = getFileLines(fileName);
    const int fileColumns = getFileColumns(fileName, delimiter);
    return getDataSet(fileName, fileLines, fileColumns);
}

/*
Create k random cluster centers for an m x n data set; coordinate j of every
center lies in [min, max) of column j. The generated values are echoed to
stdout, one line per column.
Reseeds rand() from the current time on every call.
With m <= 0 there are no bounds to draw from and the centers stay at zero.
The caller owns the result and must release it with delete2DAarray(result, k).
*/
double** randCent(double** dataSet, int m, int n, int k)
{
    srand(static_cast<unsigned int>(time(nullptr)));

    double** centroids = new2DArray(k, n);
    if (m <= 0)
    {
        return centroids;
    }

    for (int j = 0; j < n; ++j)
    {
        // get max and min value of column j
        double max = dataSet[0][j];
        double min = dataSet[0][j];
        for (int i = 1; i < m; ++i)
        {
            if (dataSet[i][j] > max)
            {
                max = dataSet[i][j];
            }
            else if (dataSet[i][j] < min)
            {
                min = dataSet[i][j];
            }
        }
        const double range = max - min;

        // get k random coordinates for column j
        for (int i = 0; i < k; ++i)
        {
            // RAND_MAX + 1.0 is evaluated in double: the integer expression
            // RAND_MAX + 1 overflows where RAND_MAX == INT_MAX, which made
            // the fraction negative and pushed centers outside the range.
            centroids[i][j] = min + rand() / (RAND_MAX + 1.0) * range;
            std::cout << centroids[i][j] << " ";
        }
        std::cout << std::endl;
    }
    return centroids;
}

// calculate the euclidean distance between two vectors of `size` elements
double disteuclid(double *vecA, double *vecB, int size)
{
    double squaredSum = 0.0;
    for (int i = 0; i < size; ++i)
    {
        const double diff = vecA[i] - vecB[i];
        squaredSum += diff * diff;
    }
    return std::sqrt(squaredSum);
}

/*
Get the per-column mean of an m x n data set as a newly allocated array of n
values (release it with delete[]). With m <= 0 every mean is 0.
*/
double* getdataSetMean(double** dataSet, int m, int n)
{
    double* result = new double[n]();
    if (m <= 0)
    {
        return result;
    }
    for (int j = 0; j < n; ++j)
    {
        double sum = 0.0;
        for (int i = 0; i < m; ++i)
        {
            sum += dataSet[i][j];
        }
        result[j] = sum / m;
    }
    return result;
}

/*
Get the per-column mean of the rows that belong to one cluster, as a newly
allocated array of n values (release it with delete[]).
Parameter cent is the cluster index; row i belongs to the cluster stored in
clustersRecord[i][0]. When no row belongs to the cluster, every value is NaN
(the mean of nothing is undefined).
*/
double* getdataSetMean(double** dataSet, double** clustersRecord, int m, int n, int cent)
{
    // Membership does not depend on the column, so count it once.
    int count = 0;
    for (int i = 0; i < m; ++i)
    {
        if (static_cast<int>(clustersRecord[i][0]) == cent)
        {
            ++count;
        }
    }

    double* result = new double[n]();
    for (int j = 0; j < n; ++j)
    {
        if (count == 0)
        {
            result[j] = std::numeric_limits<double>::quiet_NaN();
            continue;
        }
        double sum = 0.0;
        for (int i = 0; i < m; ++i)
        {
            if (static_cast<int>(clustersRecord[i][0]) == cent)
            {
                sum += dataSet[i][j];
            }
        }
        result[j] = sum / count;
    }
    return result;
}

/*
Recalculate the k cluster centroids in place: each centroid becomes the mean
of the rows currently assigned to it (clustersRecord[i][0] is the cluster
index of row i).
A cluster without any row keeps its previous centroid; dividing by its zero
count used to turn it into NaN for the rest of the run. Rows whose recorded
index is outside [0, k) are ignored.
*/
void getNewCentroids(double** dataSet, double** centroids, double** clustersRecord, int m, int n, int k)
{
    if (k <= 0 || n <= 0)
    {
        return;
    }

    // Accumulate in scratch storage so empty clusters can be left untouched.
    // Vectors also release themselves; the old new[]-ed counter array leaked.
    std::vector<int> kcount(k, 0);
    std::vector<std::vector<double>> sums(k, std::vector<double>(n, 0.0));

    for (int i = 0; i < m; ++i)
    {
        const int kk = static_cast<int>(clustersRecord[i][0]);
        if (kk < 0 || kk >= k)
        {
            continue;
        }
        ++kcount[kk];
        for (int j = 0; j < n; ++j)
        {
            sums[kk][j] += dataSet[i][j];
        }
    }

    for (int c = 0; c < k; ++c)
    {
        if (kcount[c] == 0)
        {
            continue;
        }
        for (int j = 0; j < n; ++j)
        {
            centroids[c][j] = sums[c][j] / kcount[c];
        }
    }
}

/*
Run the k-means algorithm on an m x n data set with k clusters and print the
progress and the final assignment to stdout.
Starts from random centroids (randCent), then repeats "assign every row to its
nearest centroid, recompute the centroids" until no assignment changes.
Does nothing except report an error when m, n or k is not positive.
*/
void kMeans(double** dataSet, int m, int n, int k)
{
    if (m <= 0 || n <= 0 || k <= 0)
    {
        std::cerr << "Error: kMeans needs a non-empty data set and k > 0" << std::endl;
        return;
    }

    // clustersRecord is a two dimensional array with 2 columns,
    // first the cluster index, second the euclid distance
    double** clustersRecord = new2DArray(m, 2);
    for (int i = 0; i < m; ++i)
    {
        // -1 = "not assigned yet", so the first pass always counts as a
        // change instead of comparing against uninitialized memory.
        clustersRecord[i][0] = -1;
        clustersRecord[i][1] = 0.0;
    }

    // get k random centroids
    double** centroids = randCent(dataSet, m, n, k);

    bool clusterChanged = true;
    int count = 0; // number of passes until convergence
    while (clusterChanged)
    {
        clusterChanged = false;
        for (int i = 0; i < m; ++i)
        {
            // Seed the search with centroid 0 so minIndex is always a valid
            // cluster; a fixed starting distance could leave it at -1 for
            // far-away points and index out of bounds later.
            int minIndex = 0;
            double minDist = disteuclid(centroids[0], dataSet[i], n);
            for (int j = 1; j < k; ++j)
            {
                const double distJ = disteuclid(centroids[j], dataSet[i], n);
                if (distJ < minDist)
                {
                    minDist = distJ;
                    minIndex = j;
                }
            }
            if (static_cast<int>(clustersRecord[i][0]) != minIndex)
            {
                clusterChanged = true;
            }
            clustersRecord[i][0] = minIndex;
            clustersRecord[i][1] = minDist;
        }
        ++count;
        std::cout << "The change time is : " << count << std::endl;

        // recalculate the centroids as the mean of each cluster
        getNewCentroids(dataSet, centroids, clustersRecord, m, n, k);
    }

    // print the cluster records, the first column is the cluster the point belongs to,
    // the second column is the distance to that centroid
    for (int i = 0; i < m; ++i)
    {
        std::cout << "clustersRecord[" << i << "][0] : " << clustersRecord[i][0]
                  << " clustersRecord[" << i << "][1] : " << clustersRecord[i][1] << std::endl;
    }

    delete2DAarray(centroids, k);
    delete2DAarray(clustersRecord, m);
}

/*
Load the tab separated data set from test.txt and cluster it into 4 groups.
*/
int main()
{
    const std::string fileName = "test.txt";

    // load the data set
    const int fileLines = getFileLines(fileName);
    const int fileColumns = getFileColumns(fileName, "\t");
    double** dataSet = getDataSet(fileName, fileLines, fileColumns);

    // test kmeans
    std::cout << "kmeans test: " << std::endl;
    kMeans(dataSet, fileLines, fileColumns, 4);

    delete2DAarray(dataSet, fileLines);
    std::cout << "the end" << std::endl;
    return 0;
}

Below is the test.txt data set.

1.658985	4.285136
-3.453687	3.424321
4.838138	-1.151539
-5.379713	-3.362104
0.972564	2.924086
-3.567919	1.531611
0.450614	-3.302219
-3.487105	-1.724432
2.668759	1.594842
-3.156485	3.191137
3.165506	-3.999838
-2.786837	-3.099354
4.208187	2.984927
-2.123337	2.943366
0.704199	-0.479481
-0.392370	-3.963704
2.831667	1.574018
-0.790153	3.343144
2.943496	-3.357075
-3.195883	-2.283926
2.336445	2.875106
-1.786345	2.554248
2.190101	-1.906020
-3.403367	-2.778288
1.778124	3.880832
-1.688346	2.230267
2.592976	-2.054368
-4.007257	-3.207066
2.257734	3.387564
-2.679011	0.785119
0.939512	-4.023563
-3.674424	-2.261084
2.046259	2.735279
-3.189470	1.780269
4.372646	-0.822248
-2.579316	-3.497576
1.889034	5.190400
-0.798747	2.185588
2.836520	-2.658556
-3.837877	-3.253815
2.096701	3.886007
-2.709034	2.923887
3.367037	-3.184789
-2.121479	-4.232586
2.329546	3.179764
-3.284816	3.273099
3.091414	-3.815232
-3.762093	-2.432191
3.542056	2.778832
-1.736822	4.241041
2.127073	-2.983680
-4.323818	-3.938116
3.792121	5.135768
-4.786473	3.358547
2.624081	-3.260715
-4.009299	-2.978115
2.493525	1.963710
-2.513661	2.642162
1.864375	-3.176309
-3.171184	-3.572452
2.894220	2.489128
-2.562539	2.884438
3.491078	-3.947487
-2.565729	-2.012114
3.332948	3.983102
-1.616805	3.573188
2.280615	-2.559444
-2.651229	-3.103198
2.321395	3.154987
-1.685703	2.939697
3.031012	-3.620252
-4.599622	-2.185829
4.196223	1.126677
-2.133863	3.093686
4.668892	-2.562705
-2.793241	-2.149706
2.884105	3.043438
-2.967647	2.848696
4.479332	-1.764772
-4.905566	-2.911070


0 0
原创粉丝点击