K-Means聚类算法
来源:互联网 发布:flashfxp mac版 编辑:程序博客网 时间:2024/06/01 10:20
1.C++实现
// K-Means text clustering.
// Pipeline: read one document per line from "agr_seg.txt", compute a TF-IDF
// weight vector for every document, then cluster the documents into K groups
// with K-Means using cosine distance (1 - cosine similarity).
#include <vector>
#include <algorithm>
#include <iostream>
#include <string>
#include <time.h>
#include <limits>
#include <map>
#include <cmath>
#include <fstream>
#include <cstdlib>
#include <cctype>
using namespace std;

typedef vector<string> StrVec;               // vector of strings
typedef vector<int> IntVec;                  // vector of ints
typedef vector<vector<int> > Int2DVec;       // 2-D vector of ints
typedef vector<vector<double> > Double2DVec; // 2-D vector of doubles
typedef vector<double> DoubleVec;            // vector of doubles

// Stop-word list. Note that it deliberately contains "" so that empty tokens
// produced by consecutive separators are filtered out as well.
string stopWordsList[] = {"的", "我们","要","自己","之","将","“","”",",","(",")","后","应","到","某","后", "个","是","位","新","一","两","在","中","或","有","更","好","" };
int stopWordsLen = sizeof(stopWordsList)/sizeof(stopWordsList[0]);

// tolower wrapper that is safe for char values that are negative when plain
// char is signed (the multi-byte Chinese text above produces such bytes;
// passing them to tolower directly is undefined behavior).
static int ToLowerChar(int c)
{
    return tolower(static_cast<unsigned char>(c));
}

// Answers "is this token a stop word?" against the global stop-word list.
class StopWordsHandler
{
public:
    StopWordsHandler(void)
    {
        for (int i = 0; i < stopWordsLen; ++i)
            stopWords.push_back(stopWordsList[i]);
    }
    ~StopWordsHandler(void) {}

    // Lower-cases the token in place, then looks it up in the stop list.
    bool IsStopWord(string& str)
    {
        transform(str.begin(), str.end(), str.begin(), ToLowerChar);
        return find(stopWords.begin(), stopWords.end(), str) != stopWords.end();
    }
private:
    StrVec stopWords;
};

// One cluster: the indices of its member data points plus its centroid.
class Cluster
{
public:
    IntVec CurrentMembership; // indices (into the data set) of this cluster's members
    DoubleVec Mean;           // centroid of this cluster

    Cluster(void) {}

    Cluster(int dataindex, DoubleVec& data)
    {
        CurrentMembership.push_back(dataindex);
        copy(data.begin(), data.end(), back_inserter(Mean));
    }

    // Recomputes the centroid as the per-dimension average of all members.
    // BUGFIX: the original accumulated on top of the previous mean without
    // zeroing it, and divided by the vector dimension inside the member loop.
    // The correct mean zeroes first, sums over all members, then divides each
    // component by the number of members.
    void UpdateMean(Double2DVec& coordinates)
    {
        if (CurrentMembership.empty())
            return; // keep the previous centroid for an empty cluster
        fill(Mean.begin(), Mean.end(), 0.0);
        for (size_t i = 0; i < CurrentMembership.size(); i++)
        {
            const DoubleVec& coord = coordinates[CurrentMembership[i]];
            for (size_t j = 0; j < coord.size() && j < Mean.size(); j++)
                Mean[j] += coord[j];
        }
        for (size_t k = 0; k < Mean.size(); k++)
            Mean[k] /= (double)CurrentMembership.size();
    }

    ~Cluster(void) {}
};

// Static helpers for cosine similarity between two term-weight vectors.
class TermVector
{
public:
    // cos(v1, v2) = <v1, v2> / (|v1| * |v2|); returns 0 when either vector has
    // zero length, so degenerate inputs yield the maximum distance of 1.
    static double ComputeCosineSimilarity(const DoubleVec& vector1, const DoubleVec& vector2)
    {
        if (vector1.size() != vector2.size())
            throw string("DIFER LENGTH");
        double denom = (VectorLength(vector1) * VectorLength(vector2));
        if (denom == 0)
            return 0;
        return InnerProduct(vector1, vector2) / denom;
    }

    static double InnerProduct(const DoubleVec& vector1, const DoubleVec& vector2)
    {
        if (vector1.size() != vector2.size())
            throw string("DIFFER LENGTH ARE NOT ALLOWED");
        double result = 0.0;
        for (size_t i = 0; i < vector1.size(); i++)
            result += vector1[i] * vector2[i];
        return result;
    }

    static double VectorLength(const DoubleVec& vector)
    {
        double sum = 0.0;
        for (size_t i = 0; i < vector.size(); i++)
            sum += vector[i] * vector[i];
        return sqrt(sum);
    }
};

// Classic K-Means over document vectors, using 1 - cosine similarity as the
// distance. _clusters owns its Cluster objects (released in the destructor).
class KMeans
{
public:
    vector<Cluster*> _clusters; // the K clusters
private:
    int _coordCount;            // number of data points
    Double2DVec _coordinates;   // the raw data (one row per point)
    int _k;                     // number of clusters
    // _clusterAssignments[j] == i  <=>  point j currently belongs to cluster i
    IntVec _clusterAssignments;
    // _nearestCluster[j]: index of the cluster whose centroid is closest to point j
    IntVec _nearestCluster;
    // _distanceCache[i][j]: distance from point i to the centroid of cluster j
    Double2DVec _distanceCache;

    // Seeds each cluster with a randomly chosen data point.
    // BUGFIX: the original could seed two clusters with the same point; when
    // there are at least K points the seeds are now forced to be distinct.
    void InitRandom()
    {
        srand(unsigned(time(NULL)));
        bool wantDistinct = (_k <= _coordCount);
        vector<bool> used(_coordCount, false);
        for (int i = 0; i < _k; i++)
        {
            int temp;
            do
            {
                temp = rand() % _coordCount;
            } while (wantDistinct && used[temp]);
            used[temp] = true;
            _clusterAssignments[temp] = i; // point temp seeds cluster i
            _clusters[i] = new Cluster(temp, _coordinates[temp]);
        }
    }

    // Cosine distance: identical vectors give 0, orthogonal ones give 1.
    static double getDistance(const DoubleVec& coord, const DoubleVec& center)
    {
        return 1 - TermVector::ComputeCosineSimilarity(coord, center);
    }

    // Index of the cluster with the smallest cached distance to point ndx.
    int NearestCluster(int ndx)
    {
        int nearest = -1;
        double min = numeric_limits<double>::max();
        for (int c = 0; c < _k; c++)
        {
            double d = _distanceCache[ndx][c];
            if (d < min)
            {
                min = d;
                nearest = c;
            }
        }
        return nearest;
    }

public:
    KMeans(Double2DVec& data, int K)
    {
        _coordinates.resize(data.size());
        for (size_t i = 0; i < data.size(); ++i)
            copy(data[i].begin(), data[i].end(), back_inserter(_coordinates[i]));
        _coordCount = (int)data.size();
        _k = K;
        _clusters.resize(K);
        _clusterAssignments.resize(_coordCount);
        _nearestCluster.resize(_coordCount);
        _distanceCache.resize(_coordCount);
        // BUGFIX: one column per cluster, not per point (the original
        // allocated _coordCount columns in every row).
        for (int i = 0; i < _coordCount; ++i)
            _distanceCache[i].resize(_k);
        InitRandom();
    }

    // Standard K-Means loop: update centroids, recompute all point-to-centroid
    // distances, reassign points; stops when no point changes cluster.
    void Start()
    {
        int iter = 0;
        while (true)
        {
            cout << "Iteration " << iter++ << "..." << endl;
            // 1. Recompute each cluster's centroid.
            for (int i = 0; i < _k; i++)
                _clusters[i]->UpdateMean(_coordinates);
            // 2. Distance from every point to every centroid.
            for (int i = 0; i < _coordCount; i++)
                for (int j = 0; j < _k; j++)
                    _distanceCache[i][j] = getDistance(_coordinates[i], _clusters[j]->Mean);
            // 3. Nearest cluster for every point.
            for (int i = 0; i < _coordCount; i++)
                _nearestCluster[i] = this->NearestCluster(i);
            // 4. Converged when every point already sits in its nearest cluster.
            int unchanged = 0;
            for (int i = 0; i < _coordCount; i++)
                if (_nearestCluster[i] == _clusterAssignments[i])
                    unchanged++;
            if (unchanged == _coordCount)
                break;
            // 5. Otherwise move points into their nearest clusters and loop.
            for (int j = 0; j < _k; j++)
                _clusters[j]->CurrentMembership.clear();
            for (int i = 0; i < _coordCount; i++)
            {
                _clusters[_nearestCluster[i]]->CurrentMembership.push_back(i);
                _clusterAssignments[i] = _nearestCluster[i];
            }
        }
    }

    ~KMeans(void)
    {
        for (vector<Cluster*>::iterator iter = _clusters.begin(); iter != _clusters.end(); ++iter)
            delete (*iter);
        _clusters.clear();
    }
};

// Tokenizer interface: splits a document string into words.
class ITokeniser
{
public:
    // BUGFIX: virtual destructor added — TFIDFMeasure deletes its tokenizer
    // through an ITokeniser*, which was undefined behavior without it.
    virtual ~ITokeniser() {}
    virtual void Partition(string input, StrVec& retWords) = 0;
};

// Computes TF-IDF weights for a collection of documents.
class TFIDFMeasure
{
private:
    StrVec _docs;                // one string per document
    int _numDocs;                // number of documents
    int _numTerms;               // number of distinct terms
    StrVec _terms;               // the distinct terms, in first-seen order
    Int2DVec _termFreq;          // _termFreq[t][d]: raw count of term t in doc d
    Double2DVec _termWeight;     // _termWeight[t][d]: TF-IDF weight of term t in doc d
    IntVec _maxTermFreq;         // per-document maximum raw term count
    IntVec _docFreq;             // number of documents containing each term
    ITokeniser* _tokenizer;      // owned tokenizer, deleted in the destructor
    map<string,int> _wordsIndex; // term -> index into _terms

public:
    TFIDFMeasure(const StrVec& documents, ITokeniser* tokeniser);
    ~TFIDFMeasure(void);
protected:
    void Init();                                         // build vocabulary, frequencies, weights
    void GenerateTerms(const StrVec& docs, StrVec& terms); // collect distinct terms
    void GenerateTermFrequency();                        // fill _termFreq / _docFreq / _maxTermFreq
    void GenerateTermWeight();                           // fill _termWeight
    void GetWordFrequency(string& input, map<string,int>& freq); // per-doc word counts
    int CountWords(string& word, const StrVec& words);   // occurrences of word in words
    int GetTermIndex(const string& term);                // term index, -1 if unknown
    double ComputeTermWeight(int term, int doc);         // tf * idf
    double GetTermFrequency(int term, int doc);          // normalized term frequency
    double GetInverseDocumentFrequency(int term);        // log(N / df)
public:
    inline int NumTerms() const { return this->_numTerms; }
    void GetTermVector(int doc, DoubleVec& vec); // TF-IDF vector of one document
};

TFIDFMeasure::~TFIDFMeasure(void)
{
    // Release the owned tokenizer and clear the cached data.
    if (this->_tokenizer != NULL)
    {
        delete _tokenizer;
        _tokenizer = NULL;
    }
    _docs.clear();
    _terms.clear();
    _wordsIndex.clear();
}

TFIDFMeasure::TFIDFMeasure(const StrVec& documents, ITokeniser* tokeniser)
{
    _docs = documents;
    _numDocs = (int)documents.size();
    _tokenizer = tokeniser;
    this->Init();
}

// Collects every distinct token of every document into `terms`, preserving
// first-seen order. A map is used for the membership test instead of the
// original linear find over `terms` (which was accidentally quadratic).
void TFIDFMeasure::GenerateTerms(const StrVec& docs, StrVec& terms)
{
    map<string,int> seen;
    for (size_t i = 0; i < docs.size(); i++)
    {
        StrVec words;
        _tokenizer->Partition(docs[i], words);
        for (size_t j = 0; j < words.size(); j++)
        {
            if (seen.find(words[j]) == seen.end())
            {
                seen[words[j]] = 1;
                terms.push_back(words[j]);
            }
        }
    }
}

void TFIDFMeasure::Init()
{
    this->GenerateTerms(_docs, _terms);      // build the vocabulary
    this->_numTerms = (int)_terms.size();
    // Allocate all per-term / per-document tables.
    _maxTermFreq.resize(_numDocs);
    _docFreq.resize(_numTerms);
    _termFreq.resize(_numTerms);
    _termWeight.resize(_numTerms);
    for (size_t i = 0; i < _terms.size(); i++)
    {
        _termWeight[i].resize(_numDocs);
        _termFreq[i].resize(_numDocs);
        _wordsIndex[_terms[i]] = (int)i;     // register the term's index
    }
    this->GenerateTermFrequency();
    this->GenerateTermWeight();
}

// Counts how often each token occurs in `input`.
// BUGFIX: the original called std::unique on an UNSORTED vector and ignored
// its return value, then re-counted with count_if over the mangled range —
// duplicates survived and moved-from elements were visited. Counting directly
// into the map is both correct and cheaper.
void TFIDFMeasure::GetWordFrequency(string& input, map<string,int>& freq)
{
    transform(input.begin(), input.end(), input.begin(), ToLowerChar);
    StrVec temp;
    this->_tokenizer->Partition(input, temp);
    for (size_t i = 0; i < temp.size(); ++i)
        ++freq[temp[i]];
}

void TFIDFMeasure::GetTermVector(int doc, DoubleVec& vec)
{
    vec.resize(this->_numTerms);
    for (int i = 0; i < this->_numTerms; i++)
        vec[i] = _termWeight[i][doc]; // weight of term i in document doc
}

// Functor: equality comparison against a fixed word.
class WordComp
{
public:
    WordComp(string& sWord) : word(sWord) {}
    bool operator()(const string& lhs) { return lhs.compare(word) == 0; }
private:
    string word;
};

int TFIDFMeasure::CountWords(string& word, const StrVec& words)
{
    return (int)count_if(words.begin(), words.end(), WordComp(word));
}

int TFIDFMeasure::GetTermIndex(const string& term)
{
    map<string,int>::iterator pos = _wordsIndex.find(term);
    if (pos != _wordsIndex.end())
        return pos->second;
    return -1;
}

// Fills _termFreq, _docFreq and _maxTermFreq from the raw documents.
void TFIDFMeasure::GenerateTermFrequency()
{
    for (int i = 0; i < _numDocs; i++)
    {
        string curDoc = _docs[i];
        map<string,int> freq;
        this->GetWordFrequency(curDoc, freq);
        // BUGFIX: start at 0 instead of INT_MIN so an empty document cannot
        // later cause a division by a negative/garbage maximum.
        _maxTermFreq[i] = 0;
        for (map<string,int>::iterator iter = freq.begin(); iter != freq.end(); ++iter)
        {
            const string& word = iter->first;
            int wordFreq = iter->second;
            int termIndex = GetTermIndex(word);
            if (termIndex == -1)
                continue; // token not in the vocabulary (defensive)
            _termFreq[termIndex][i] = wordFreq;
            _docFreq[termIndex]++;  // one more document contains this term
            if (wordFreq > _maxTermFreq[i])
                _maxTermFreq[i] = wordFreq;
        }
    }
}

void TFIDFMeasure::GenerateTermWeight()
{
    for (int i = 0; i < _numTerms; i++)
        for (int j = 0; j < _numDocs; j++)
            _termWeight[i][j] = ComputeTermWeight(i, j);
}

// Term frequency normalized by the document's maximum frequency; 0 for an
// empty document (guards the division).
double TFIDFMeasure::GetTermFrequency(int term, int doc)
{
    int freq = _termFreq[term][doc];
    int maxfreq = _maxTermFreq[doc];
    if (maxfreq <= 0)
        return 0.0;
    return (double)freq / (double)maxfreq;
}

double TFIDFMeasure::ComputeTermWeight(int term, int doc)
{
    double tf = GetTermFrequency(term, doc);
    double idf = GetInverseDocumentFrequency(term);
    return tf * idf;
}

// idf = log(N / df); returns 0 when the term appears in no document
// (guards the division — cannot normally happen for vocabulary terms).
double TFIDFMeasure::GetInverseDocumentFrequency(int term)
{
    int df = _docFreq[term];
    if (df <= 0)
        return 0.0;
    return log((double)_numDocs / (double)df);
}

// Whitespace tokenizer: lower-cases the input, splits on ' ', drops stop words.
class Tokeniser : public ITokeniser
{
public:
    Tokeniser(void);
    ~Tokeniser(void);
    void Partition(string input, StrVec& retWords);
};

Tokeniser::Tokeniser(void) {}
Tokeniser::~Tokeniser(void) {}

// BUGFIX: the original erased from `input` inside a do/while whose condition
// then read the invalidated iterator `pos`. This version scans with indices,
// which stay valid, and never mutates the container it is iterating.
void Tokeniser::Partition(string input, StrVec& retWords)
{
    transform(input.begin(), input.end(), input.begin(), ToLowerChar);
    StopWordsHandler stopHandler;
    string::size_type start = 0;
    while (start <= input.size())
    {
        string::size_type sep = input.find(' ', start);
        string token;
        if (sep == string::npos)
        {
            token = input.substr(start); // last token
            start = input.size() + 1;    // terminate after this iteration
        }
        else
        {
            token = input.substr(start, sep - start);
            start = sep + 1;
        }
        if (!stopHandler.IsStopWord(token))
            retWords.push_back(token);   // "" is in the stop list, so empties are dropped
    }
}

int main()
{
    // 1. Read the corpus: one (pre-segmented) document per line.
    StrVec strVec;
    ifstream inFile("agr_seg.txt");
    if (!inFile)
    {
        cerr << "cannot open agr_seg.txt" << endl;
        return 1;
    }
    string tmpStr;
    while (getline(inFile, tmpStr))
        strVec.push_back(tmpStr);
    if (strVec.empty())
    {
        cerr << "no documents to cluster" << endl;
        return 1;
    }

    // 2. Build the TF-IDF model (takes ownership of the tokenizer).
    TFIDFMeasure tf(strVec, new Tokeniser());
    int docCount = (int)strVec.size();
    int K = 3; // number of clusters
    if (K > docCount)
        K = docCount; // cannot have more clusters than documents

    // 3. One TF-IDF vector per document.
    // BUGFIX: the original wrapped this call in an inner loop over the
    // vocabulary size, refilling the same vector `dimension` times per doc.
    Double2DVec data(docCount);
    for (int i = 0; i < docCount; i++)
        tf.GetTermVector(i, data[i]);

    // 4. Run K-Means.
    KMeans kmeans(data, K);
    kmeans.Start();

    // 5. Print each cluster's member documents.
    vector<Cluster*>& clusters = kmeans._clusters;
    for (vector<Cluster*>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter)
    {
        cout << "-------------------------" << endl;
        IntVec& vec = (*iter)->CurrentMembership;
        for (IntVec::iterator iter2 = vec.begin(); iter2 != vec.end(); ++iter2)
            cout << strVec[*iter2] << endl;
    }
    system("pause");
    return 0;
}
1 1
- K-Mean聚类算法
- K-mean聚类算法
- K-均值聚类算法(K-mean)
- 聚类算法之K-mean算法
- 聚类问题-k-mean算法
- k-mean聚类算法实现
- 聚类方法之k-mean算法
- K-Mean聚类算法+C语言代码
- K-mean(多维度)聚类算法(matlab代码)
- k-mean算法实现
- K-mean clustering 算法
- Mean Shift 聚类算法
- 基本k-mean聚类的文本聚类算法原理和例子
- K-mean算法的优点缺点
- k-mean算法的java实现
- 几种计算机视觉中常用的聚类算法(K-means, Agglomerative clustering, Mean shift, Spectral clustering)
- K-mean聚类的一个代码的详细注释
- 机器学习(十)Mean Shift 聚类算法
- hdu 2604 Queuing 矩阵
- 设置模式-单例模式
- .net 访问Oracle的连接字符串 (果然BT)
- 初学HTML
- Regular Expression Matching(leetcode)
- K-Mean聚类算法
- 什么是熵(Entropy)?
- rails kaminari text modify
- openstreetmap地图数据转换为one能识别的wkt格式
- Codeforces Round #279 (Div. 2) B - Queue (简单链表)
- 基本字符串相关函数,基本宏,内存相关函数,类型转换函数实现合集
- PHP学习总结(一)
- 巴最强版本“枭龙”性能成谜 或无钱买歼-31【百度军事】
- OC中的block使用初探