Chinese Whispers 聚类算法

来源：互联网发布：晋中市教育网络平台编辑：程序博客网时间：2024/06/01 09:14

Chinese Whispers 聚类算法适用于事先不知道类别数量的场景。它的基本算法步骤是：

1，对于每一个节点vi，都赋予一个互不相同的初始类别class（vi）=i

2，随机选取一个节点vt，找到vt所有的邻接节点，对邻接节点所属的类进行打分：每个类的得分等于vt与属于该类的邻接节点之间的边的权值之和。例如节点1的邻接节点有2,3,4,5，分别属于a,b,c,b类别，边1-2,1-3,1-4,1-5的权值都为1，那么类a的得分就是1，类b的得分是2，类c的得分是1

3，将得分最高的类别赋值给vt

4，返回步骤2，重复执行，直到达到指定的迭代次数

下面上dlib的代码进行解析：

  inline unsigned long chinese_whispers (        const std::vector<ordered_sample_pair>& edges,        std::vector<unsigned long>& labels,        const unsigned long num_iterations,        dlib::rand& rnd    )    {        // make sure requires clause is not broken，传进来的边集需要排好序        DLIB_ASSERT(is_ordered_by_index(edges),                    "\t unsigned long chinese_whispers()"                    << "\n\t Invalid inputs were given to this function"        );        labels.clear();        if (edges.size() == 0)            return 0;        std::vector<std::pair<unsigned long, unsigned long> > neighbors;        find_neighbor_ranges(edges, neighbors);        // Initialize the labels, each node gets a different label.                labels.resize(neighbors.size());        for (unsigned long i = 0; i < labels.size(); ++i)            labels[i] = i;        for (unsigned long iter = 0; iter < neighbors.size()*num_iterations; ++iter)        {            // Pick a random node.随机挑选一个节点            const unsigned long idx = rnd.get_random_64bit_number()%neighbors.size();            // Count how many times each label happens amongst our neighbors.对节点的临接几点所属的类别进行统计打分            std::map<unsigned long, double> labels_to_counts;            const unsigned long end = neighbors[idx].second;            for (unsigned long i = neighbors[idx].first; i != end; ++i)            {                labels_to_counts[labels[edges[i].index2()]] += edges[i].distance();            }            // find the most common label.找到得分最高的类，并给该节点归类            std::map<unsigned long, double>::iterator i;            double best_score = -std::numeric_limits<double>::infinity();            unsigned long best_label = labels[idx];            for (i = labels_to_counts.begin(); i != labels_to_counts.end(); ++i)            {                if (i->second > best_score)                {                    best_score = i->second;                    best_label = i->first;                }            }            labels[idx] 
= best_label;        }        // Remap the labels into a contiguous range.  First we find the        // mapping.因为上述找到的类别可能不是连续的0,1,2,3...,需要对类别进行重新映射为连续的编号        std::map<unsigned long,unsigned long> label_remap;        for (unsigned long i = 0; i < labels.size(); ++i)        {            const unsigned long next_id = label_remap.size();            if (label_remap.count(labels[i]) == 0)                label_remap[labels[i]] = next_id;        }        // now apply the mapping to all the labels.给所有节点赋值类别        for (unsigned long i = 0; i < labels.size(); ++i)        {            labels[i] = label_remap[labels[i]];        }        return label_remap.size();    }