Source Code Walkthrough: KMeans


(A beginner's personal understanding; many parts may be inaccurate. Corrections are welcome, and I am happy to learn from everyone.)
# encoding: utf-8
"""
@version: ??
@author: xq
@contact: xiaoq_xiaoq@163.com
@file: k_means.py
@time: 2017/10/18 15:56
"""
import warnings
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms, squared_norm, stable_cumsum
from sklearn.utils.sparsefuncs_fast import assign_rows_csr
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES
from sklearn.cluster import k_means
# Needed by predict/score below; in the sklearn version used here (2017, ~0.19)
# this private helper lives in sklearn.cluster.k_means_.
from sklearn.cluster.k_means_ import _labels_inertia


class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """K-Means clustering

    Read more in the :ref:`User Guide <k_means>`.

    Examples
    --------
    >>> from sklearn.cluster import KMeans
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    >>> kmeans.labels_
    array([0, 0, 0, 1, 1, 1], dtype=int32)
    >>> kmeans.predict([[0, 0], [4, 4]])
    array([0, 1], dtype=int32)
    >>> kmeans.cluster_centers_
    array([[ 1.,  2.],
           [ 4.,  2.]])
    """

    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True,
                 n_jobs=1, algorithm='auto'):
        self.n_clusters = n_clusters  # number of clusters
        self.init = init  # how the initial centers are chosen
        self.max_iter = max_iter  # maximum number of iterations of a single run
        self.tol = tol  # convergence tolerance on inertia (not an iteration count)
        self.precompute_distances = precompute_distances  # whether to precompute distances
        self.n_init = n_init  # number of runs with different center initializations
        self.verbose = verbose  # whether to print verbose output
        self.random_state = random_state  # generator used to initialize the centers
        self.copy_x = copy_x  # whether to copy the input data
        self.n_jobs = n_jobs  # number of processes to use
        self.algorithm = algorithm  # which variant of the k-means algorithm to use

    def _check_fit_data(self, X):
        """Validate that the input data X has at least n_clusters samples."""
        # The input is converted to an array of at least 2 dimensions; its dtype is
        # checked and the validated X is returned.
        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])
        print('X:', X)
        print('X.shape[0]:', X.shape[0])
        # If there are fewer samples than centroids K, raise:
        # ValueError: the number of samples should be >= n_clusters.
        if X.shape[0] < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
                X.shape[0], self.n_clusters))
        return X

    def _check_test_data(self, X):
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES)
        n_samples, n_features = X.shape
        expected_n_features = self.cluster_centers_.shape[1]
        if not n_features == expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))
        return X

    def fit(self, X, y=None):
        """Compute k-means clustering.

        @Parameters X: array or sparse matrix, shape = (n_samples, n_features)
        @Parameters y: ignored (kept for API compatibility)
        """
        # Turn self.random_state into an instance of np.random.RandomState.
        random_state = check_random_state(self.random_state)
        # Validate the data.
        X = self._check_fit_data(X)
        # Run the clustering; k_means returns:
        # 1. centroid    -> ndarray of cluster centers, shape (n_clusters, n_features)
        # 2. label       -> int ndarray, shape (n_samples,); label[i] is the index of
        #                   the cluster whose center is closest to sample i
        # 3. inertia     -> float, final value of the objective (sum of squared
        #                   distances of all training points to their nearest center)
        # 4. best_n_iter -> number of iterations of the best run, only returned when
        #                   return_n_iter=True
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters, init=self.init,
                n_init=self.n_init, max_iter=self.max_iter, verbose=self.verbose,
                precompute_distances=self.precompute_distances,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs, algorithm=self.algorithm,
                return_n_iter=True)
        return self

    def fit_predict(self, X, y=None):
        """
        @Parameters X: data to cluster
        @Returns labels: array, shape [n_samples,]; index of the cluster each sample belongs to
        """
        return self.fit(X).labels_

    def fit_transform(self, X, y=None):
        """Compute the clustering and transform X to cluster-distance space.

        Equivalent to fit(X).transform(X), but more efficient.
        @Parameters X: {array-like, sparse matrix}, shape = [n_samples, n_features], data to cluster
        @Parameters y: Ignored
        @Returns X_new: array, shape [n_samples, k]; X in the new space, where each
                        dimension is the distance of the point to one of the centers
        """
        # Validate the data.
        X = self._check_fit_data(X)
        # Return the array of distances from each sample to every center.
        return self.fit(X)._transform(X)

    def transform(self, X):
        """Transform X to cluster-distance space; returns the distance of X to each center.

        @Parameters X: {array-like, sparse matrix}, shape = [n_samples, n_features]
        @Returns X_new: array, shape [n_samples, k]
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        return self._transform(X)

    def _transform(self, X):
        """Return the distances from X to the centers."""
        return euclidean_distances(X, self.cluster_centers_)

    def predict(self, X):
        """Predict the closest cluster for each sample in X.

        @Parameters X: {array-like, sparse matrix}, shape = [n_samples, n_features]
        @Returns labels: array, shape [n_samples,]; index of the nearest center for each sample
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]

    def score(self, X, y=None):
        # (I did not understand this method at first; if you know what it does,
        #  please let me know in the comments, thanks :) )
        """Opposite of the value of X on the K-means objective.

        @Parameters X: {array-like, sparse matrix}, shape = [n_samples, n_features]
        @Parameters y: Ignored
        @Returns score: float
            Opposite of the value of X on the K-means objective.
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return -_labels_inertia(X, x_squared_norms, self.cluster_centers_)[1]

Test:

# Test data
stopList = [{'Id': '50001', 'lat': 28.571906, 'lng': 112.337788},
            {'Id': '50001', 'lat': 28.573678, 'lng': 112.381103},
            {'Id': '50001', 'lat': 28.571915, 'lng': 112.337533},
            {'Id': '50001', 'lat': 28.573978, 'lng': 112.35765},
            {'Id': '50001', 'lat': 28.572656, 'lng': 112.3366},
            {'Id': '50001', 'lat': 28.578011, 'lng': 112.330688},
            {'Id': '50001', 'lat': 28.572228, 'lng': 112.335841},
            {'Id': '50001', 'lat': 28.57849, 'lng': 112.3338},
            {'Id': '50001', 'lat': 28.57239, 'lng': 112.336491},
            {'Id': '50001', 'lat': 28.577943, 'lng': 112.330995},
            {'Id': '50001', 'lat': 28.571921, 'lng': 112.337783},
            {'Id': '50001', 'lat': 28.572401, 'lng': 112.3359},
            {'Id': '50001', 'lat': 28.569629, 'lng': 112.34005},
            {'Id': '50001', 'lat': 28.588048, 'lng': 112.337783},
            {'Id': '50001', 'lat': 28.572035, 'lng': 112.335683},
            {'Id': '50001', 'lat': 28.560938, 'lng': 112.378183},
            {'Id': '50001', 'lat': 28.544781, 'lng': 112.494936},
            {'Id': '50001', 'lat': 28.572296, 'lng': 112.336288},
            {'Id': '50001', 'lat': 28.571951, 'lng': 112.337806},
            {'Id': '50001', 'lat': 28.571551, 'lng': 112.32685}]
print('There are %d points in total' % len(stopList))
clustertest = clusterApi(stopList)  # instantiate the author's helper class (not shown here)
data = clustertest.initData()
# clustertest.k_meansUp()  # cluster and plot
model = KMeans(n_clusters=6)
testData = model.fit(data)
print('labels_', testData.labels_)  # index of the center each point belongs to
print('fit.transform(data)', testData.transform(data))  # distance of each point to every center
print('*' * 20)
print('predict', model.predict(data))  # nearest-center index for each point
print('score', model.score(data))  # score -0.000161647796631
# print('transform(data)', model.transform(data))  # same as fit.transform above
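clusterApi is the author's own helper class and its code is not shown in this post, so initData() cannot be reproduced exactly. Assuming it simply packs each stop's latitude/longitude pair into an (n_samples, 2) array, a hypothetical minimal stand-in would be:

import numpy as np

# Hypothetical replacement for clusterApi(stopList).initData(): stack the coordinates
# into an (n_samples, 2) float array that KMeans.fit can consume directly.
data = np.array([[p['lat'], p['lng']] for p in stopList], dtype=np.float64)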

Output:

There are 20 points in total
labels_ [1 0 1 4 1 5 1 5 1 5 1 1 1 3 1 0 2 1 1 5]
fit.transform(data) [[ 0.04211125  0.00097656  0.15947     0.01616499  0.01997781  0.00859708]
 [ 0.00647778  0.04406992  0.11744154  0.0456432   0.02345784  0.0505931 ]
 [ 0.04235963  0.          0.15972395  0.01613546  0.02023865  0.00831513]
 [ 0.0229648   0.02068146  0.14035517  0.02434561  0.          0.02718635]
 [ 0.04336641  0.00069053  0.16076989  0.01545624  0.02108109  0.00717624]
 [ 0.05010537  0.00878906  0.16757369  0.0122752   0.02724767  0.00119604]
 [ 0.04407533  0.00119604  0.16144471  0.01594221  0.02185843  0.00676582]
 [ 0.04718424  0.00724238  0.16462249  0.01033497  0.02426714  0.00371864]
 [ 0.04344331  0.          0.16083217  0.01570111  0.02121637  0.00717624]
 [ 0.04979511  0.00854143  0.16726181  0.01221679  0.02695737  0.00169146]
 [ 0.04210559  0.00069053  0.15947747  0.01612068  0.01996587  0.00851347]
 [ 0.04402662  0.00097656  0.1614137   0.0157466   0.02180382  0.00673049]
 [ 0.03966215  0.00378221  0.15686524  0.01855469  0.01812569  0.01173908]
 [ 0.04671196  0.01612068  0.16299966  0.00069053  0.02433581  0.01356684]
 [ 0.04421575  0.00119604  0.16156872  0.01615023  0.02205389  0.00683594]
 [ 0.00651448  0.04255617  0.11786507  0.04866183  0.02432601  0.05007681]
 [ 0.11747199  0.16018453  0.          0.16300259  0.14035687  0.16738578]
 [ 0.0436459   0.00097656  0.16101736  0.01583718  0.02142882  0.00707587]
 [ 0.04209426  0.          0.15946252  0.01609107  0.01991804  0.00851347]
 [ 0.05296445  0.01019561  0.17020507  0.01979798  0.03088934  0.0061376 ]]
********************
predict [1 0 1 4 1 5 1 5 1 5 1 1 1 3 1 0 2 1 1 5]
score -0.000161647796631  (I did not understand this result at first; see the note below)
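About the score value: KMeans.score(X) returns the opposite (negative) of the k-means objective evaluated on X, i.e. the negated sum of squared distances from each sample to its nearest cluster center; on the training data this is simply -inertia_. The value is tiny here because the distances are plain Euclidean distances in degrees of latitude/longitude, so their squares are very small. A quick check, reusing model and data from the test above:

import numpy as np

dist = model.transform(data)             # distance of every point to every fitted center
sq_sum = (dist.min(axis=1) ** 2).sum()   # sum of squared distances to the nearest center
print(np.isclose(model.score(data), -sq_sum))           # True
print(np.isclose(model.score(data), -model.inertia_))   # True
print((dist.argmin(axis=1) == model.labels_).all())     # True: the smallest column matches labels_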



