python机器学习-聚类KMeans

来源:互联网 发布:松下fpxh编程样列 编辑:程序博客网 时间:2024/05/17 21:39

基本原理

代码实现

先给出完整代码,再分别说明

# -*- coding: utf-8 -*-
"""KMeans clustering demo: load 2-D samples, cluster them, plot the result.

The code is written to run unchanged on Python 2.6+ and Python 3.

matplotlib and scikit-learn are imported inside the functions that need
them, so the pure data helpers (``load_dataset``, ``get_clusters``) stay
usable on machines where those packages are not installed.
"""
import os
import traceback

import numpy as np

DEFAULT_INPUT_PATH = "../data/4k2_far.txt"
DEFAULT_OUTPUT_PATH_K = "../output/test_for_4k2/inertia.txt"

# Colours used for successive clusters; reused cyclically when there are
# more clusters than colours.
CLUSTER_COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']


def draw_original(X, Y):
    """Scatter-plot the samples coloured by their ground-truth labels.

    @X: samples, indexed as X[:, 0] / X[:, 1] (2-D array, two features)
    @Y: ground-truth labels, passed to matplotlib as the colour argument
    """
    import matplotlib.pyplot as plt

    plt.subplot(1, 1, 1)
    plt.scatter(X[:, 0], X[:, 1], c=Y)
    plt.title("original clusters")
    plt.xlabel("Feature1")
    plt.ylabel("Feature2")
    plt.show()


def drawing_n_clusters(cluster_list, centroids):
    """Scatter-plot every cluster in its own colour plus the centroids.

    @cluster_list: list of clusters; each cluster is a list of samples
    @centroids: cluster centres, indexed as centroids[:, 0] / [:, 1]
    """
    import matplotlib.pyplot as plt

    n_clusters = len(cluster_list)
    plt.subplot(1, 1, 1)
    for idx, members in enumerate(cluster_list):
        cluster = np.array(members)  # one matrix per cluster
        if cluster.size == 0:
            # An empty cluster has no 2-D shape to index; nothing to draw.
            continue
        # Modulo keeps the colour lookup valid for more than 8 clusters.
        color = CLUSTER_COLORS[idx % len(CLUSTER_COLORS)]
        plt.scatter(cluster[:, 0], cluster[:, 1], c=color,
                    label="cluster" + str(idx))
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='+', color='y', s=200)
    plt.title(str(n_clusters) + " clusters")
    plt.xlabel("Feature1")
    plt.ylabel("Feature2")
    plt.legend(loc=2)
    plt.show()


def draw_error(k_list, error_list):
    """Plot the total squared error obtained for each value of K.

    @k_list: the K values that were tried
    @error_list: total squared error for each K, same order as k_list
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(8, 5), dpi=80)
    plt.subplot(1, 1, 1)
    plt.plot(k_list, error_list, marker='o', c='blue')
    plt.title("Total Error vs. # of Clusters")
    plt.xticks(k_list)
    plt.xlabel("k")
    plt.ylabel("total squared error")
    plt.show()


def load_dataset(input_path):
    """Load a tab-separated data set of ``label<TAB>x1<TAB>x2`` lines.

    Blank lines are skipped. A malformed line raises (ValueError or
    IndexError) instead of being swallowed.

    @input_path: path of the text file to read
    @return: (X, Y) where X is a list of [x1, x2] float pairs and Y is a
             list of single-element [label] int lists, in file order
    """
    X = []
    Y = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(input_path, "r") as infile:
        for line in infile:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            fields = line.rstrip('\r\n').split('\t')
            Y.append([int(fields[0])])
            X.append([float(fields[1]), float(fields[2])])
    print("[INFO]: load_dataset is finished!")
    return X, Y


def training(X, K):
    """Fit a KMeans model with K clusters on X.

    Initialisation is k-means++ with 10 restarts, pinned explicitly so the
    result does not depend on the scikit-learn version's defaults.

    @X: samples in matrix form
    @K: number of clusters
    @return: (label, loss, centroids) - per-sample cluster index, the
             model's inertia (sum of squared distances of samples to their
             closest centre) and the matrix of cluster centres
    """
    from sklearn.cluster import KMeans

    kmeans = KMeans(n_clusters=K, init="k-means++", n_init=10).fit(X)
    return kmeans.labels_, kmeans.inertia_, kmeans.cluster_centers_


def get_clusters(X, label, K):
    """Group the samples by their assigned cluster index.

    @X: samples
    @label: cluster index of each sample, same order as X
    @K: number of clusters
    @return: list of K lists; entry i holds the samples labelled i
    """
    cluster_list = [[] for _ in range(K)]
    for sample, cluster_id in zip(X, label):
        cluster_list[cluster_id].append(sample)
    return cluster_list


def find_K(input_path=DEFAULT_INPUT_PATH, output_path=DEFAULT_OUTPUT_PATH_K,
           max_k=20):
    """Train KMeans for K = 1..max_k, log each inertia and plot the curve.

    One ``K=<k>,<inertia>`` line per K is written to output_path. Any
    failure is reported as a traceback rather than raised.

    @input_path: data set to load (see load_dataset)
    @output_path: text file receiving the per-K inertia values
    @max_k: largest K to try; capped at the number of samples
    """
    try:
        _X, _Y = load_dataset(input_path)
        X = np.array(_X)
        # KMeans cannot build more clusters than there are samples.
        max_k = min(max_k, len(X))
        k_list = list(range(1, max_k + 1))

        out_dir = os.path.dirname(output_path)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        error_list = []
        with open(output_path, "w") as outfile:
            for K in k_list:
                _, loss, _ = training(X, K)
                error_list.append(loss)
                outfile.write("K=" + str(K) + "," + str(loss) + '\n')

        draw_error(k_list, error_list)
        print("[INFO]: find_K is finished!")
    except Exception:
        traceback.print_exc()


def main(input_path=DEFAULT_INPUT_PATH, K=2):
    """Cluster the data set into K groups and plot them with centroids.

    Any failure is reported as a traceback rather than raised.

    @input_path: data set to load (see load_dataset)
    @K: number of clusters
    """
    try:
        _X, _Y = load_dataset(input_path)
        X = np.array(_X)
        label, loss, centroids = training(X, K)
        cluster_list = get_clusters(X, label, K)
        drawing_n_clusters(cluster_list, centroids)
    except Exception:
        traceback.print_exc()


if __name__ == '__main__':
    # Call main() instead to plot the clusters for a fixed K.
    find_K()
1 0