聚类算法-DBSCAN-C++实现

来源:互联网 发布:不要网络的混乱大枪战 编辑:程序博客网 时间:2024/05/29 06:36

程序流程图:


DBSCAN核心功能函数,计算每个point的eps范围内的point数量pts;

对于所有pts >Minpts的point,记为Core point;

对于所有的corepoint,将其eps范围内的core point下标添加到vector<int>::corepts中;

对于所有的corepoint,采用深度优先的方式遍历core point的所有cluster,使得相互连接的core point具有相同的cluster编号;

计算所有pts < Minpts且在Core point范围内的,记为Borderpoint;

将所有Borderpoint加入到任意一个关联的core point;

剩余的point的为Noise point,文件写写入时忽略;

将point信息写入clustering文件,程序结束。

/*DBSCAN Algorithm15S103182Ethan*/#include <iostream>#include <sstream>#include <fstream>#include <vector>#include <ctime>#include <cstdlib>#include <limits>#include <cmath>#include <stack>using namespace std;class point{public:float x;float y;int cluster=0;int pointType=1;//1 noise 2 border 3 coreint pts=0;//points in MinPts vector<int> corepts;int visited = 0;point (){}point (float a,float b,int c){x = a;y = b;cluster = c;}};float stringToFloat(string i){stringstream sf;float score=0;sf<<i;sf>>score;return score;}vector<point> openFile(const char* dataset){fstream file;file.open(dataset,ios::in);if(!file)     {        cout <<"Open File Failed!" <<endl;        vector<point> a;        return a;    } vector<point> data;int i=1;while(!file.eof()){string temp;file>>temp;int split = temp.find(',',0);point p(stringToFloat(temp.substr(0,split)),stringToFloat(temp.substr(split+1,temp.length()-1)),i++);data.push_back(p);}file.close();cout<<"successful!"<<endl;return data;}float squareDistance(point a,point b){return sqrt((a.x-b.x)*(a.x-b.x)+(a.y-b.y)*(a.y-b.y));}void DBSCAN(vector<point> dataset,float Eps,int MinPts){int len = dataset.size();//calculate ptscout<<"calculate pts"<<endl;for(int i=0;i<len;i++){for(int j=i+1;j<len;j++){if(squareDistance(dataset[i],dataset[j])<Eps)dataset[i].pts++;dataset[j].pts++;}}//core point cout<<"core point "<<endl;vector<point> corePoint;for(int i=0;i<len;i++){if(dataset[i].pts>=MinPts) {dataset[i].pointType = 3;corePoint.push_back(dataset[i]);}}cout<<"joint core point"<<endl;//joint core pointfor(int i=0;i<corePoint.size();i++){for(int j=i+1;j<corePoint.size();j++){if(squareDistance(corePoint[i],corePoint[j])<Eps){corePoint[i].corepts.push_back(j);corePoint[j].corepts.push_back(i);}}}for(int i=0;i<corePoint.size();i++){stack<point*> ps;if(corePoint[i].visited == 1) continue;ps.push(&corePoint[i]);point *v;while(!ps.empty()){v = ps.top();v->visited = 1;ps.pop();for(int j=0;j<v->corepts.size();j++){if(corePoint[v->corepts[j]].visited==1) continue;corePoint[v->corepts[j]].cluster = corePoint[i].cluster;corePoint[v->corepts[j]].visited = 1;ps.push(&corePoint[v->corepts[j]]);}}}cout<<"border point,joint border point to core point"<<endl;//border point,joint border point to core pointfor(int i=0;i<len;i++){if(dataset[i].pointType==3) continue;for(int j=0;j<corePoint.size();j++){if(squareDistance(dataset[i],corePoint[j])<Eps) {dataset[i].pointType = 2;dataset[i].cluster = corePoint[j].cluster;break;}}}cout<<"output"<<endl;//outputfstream clustering;clustering.open("clustering.txt",ios::out);for(int i=0;i<len;i++){if(dataset[i].pointType == 2)clustering<<dataset[i].x<<","<<dataset[i].y<<","<<dataset[i].cluster<<"\n";}for(int i=0;i<corePoint.size();i++){clustering<<corePoint[i].x<<","<<corePoint[i].y<<","<<corePoint[i].cluster<<"\n";}clustering.close();}int main(int argc, char** argv) {vector<point> dataset = openFile("dataset3.txt");DBSCAN(dataset,1.5,2);return 0;}
数据文件格式:(x,y)


运行结果格式:(x,y,cluster)


图形化展现:

特殊形状:


桥梁:


变化密度:


总结:

DBSCAN算法能够很好处理不同形状与大小的数据,并且抗噪音数据。但对于变化的密度,DBSCAN算法具有局限性。在实现Core point连接时,遇到了一点小小的麻烦,很难将相互连接的core point的cluster编号统一,后来通过给每个core point增加一个数组用于记录相连core point的下标信息,并采用深度优先进行遍历的方式,不仅提高了计算速度,同时也保证了准确性。对于实验数据结果,如果不对其进行图形化展现,很难看出聚类的效果,采用WPF(C#)技术对数据点进行处理,对不同cluster编号的点,赋予不同的颜色,进行图形展现。



0 0