K-means算法实现

来源:互联网 发布:相册模板软件 编辑:程序博客网 时间:2024/05/21 11:05

算法学习记录

  • 开始学机器学习,٩(๑>◡<๑)۶

具体代码实现

import ast
import math

import numpy


class point_data_reader:
    """Load whitespace-separated numeric rows from a text file."""

    # Class-level default kept for backward compatibility; every instance
    # stores its own path in __init__.
    file_name = str()

    def __init__(self, file_name):
        """Remember which file to read.

        Args:
            file_name: Path of the text file holding one point per line.
        """
        # Stored on the instance (not the class) so that several readers
        # can coexist without overwriting each other's path.
        self.file_name = file_name

    def get_data_list(self, num_lost):
        """Parse the file into a list of rows.

        Each line is split on whitespace and every token is parsed as a
        number. Tokens that are not a plain int/float literal are replaced
        by ``num_lost``.

        Args:
            num_lost: Placeholder used for missing or unparsable values.

        Returns:
            A list with one list of numbers per line of the file (blank
            lines yield empty lists).
        """
        db = []
        # Read-only mode is enough; ``with`` closes the file even on error.
        with open(self.file_name, 'r') as file_:
            for line in file_:
                db.append([self._parse_number(token, num_lost)
                           for token in line.split()])
        return db

    @staticmethod
    def _parse_number(token, num_lost):
        """Return ``token`` as an int/float, or ``num_lost`` if it is not one."""
        # NOTE(security): this used to be ``eval(token)``, which executes
        # arbitrary code found in the data file. ``ast.literal_eval`` only
        # accepts literals, so arithmetic expressions such as ``1/2`` are now
        # treated as missing values instead of being evaluated.
        try:
            value = ast.literal_eval(token)
        except (ValueError, SyntaxError, TypeError):
            return num_lost
        if isinstance(value, (int, float)):
            return value
        return num_lost


class k_means:
    """Plain K-means clustering over a list of equally sized points."""

    def __init__(self, point_list):
        """Store the data set.

        Args:
            point_list: Non-empty list of points, each a sequence of
                numbers. The dimension is taken from the first point.

        Raises:
            IndexError: If ``point_list`` is empty.
        """
        # Instance attributes, so that two k_means objects do not share data.
        self.point_list = point_list
        self.dimension = len(point_list[0])

    def add_random_point(self):
        """Draw a random point inside the bounding box of the data.

        Returns:
            A list of ``self.dimension`` floats, each uniformly distributed
            between the minimum and maximum of that coordinate in the data.
        """
        point = []
        for axis in range(self.dimension):
            column = [row[axis] for row in self.point_list]
            low = min(column)
            high = max(column)
            point.append(numpy.random.rand() * (high - low) + low)
        return point

    def seprate_data(self, gather_point_list):
        """Assign every data point to its nearest centre.

        Args:
            gather_point_list: List of cluster centres.

        Returns:
            A list with ``len(gather_point_list)`` sub-lists; sub-list ``n``
            holds the points whose nearest centre (Euclidean distance) is
            ``gather_point_list[n]``. Ties go to the centre with the lowest
            index.
        """
        split_data = [[] for _ in gather_point_list]
        for data_reader in self.point_list:
            nearest_index = 0
            nearest_length = None
            # enumerate() gives the true slot even when two centres are equal
            # (list.index() would always return the first of them).
            for index, centre in enumerate(gather_point_list):
                now_length = self.euclidean_distance(data_reader, centre)
                if nearest_length is None or now_length < nearest_length:
                    nearest_length = now_length
                    nearest_index = index
            split_data[nearest_index].append(data_reader)
        return split_data

    def euclidean_distance(self, point, gather_point):
        """Return the Euclidean distance sqrt(sum_k (x_k - y_k)^2)."""
        squared = 0
        for a, b in zip(point, gather_point):
            squared += (a - b) ** 2
        return math.sqrt(squared)

    def get_centeral_point(self, seprate_data):
        """Compute the centroid of every non-empty cluster.

        The centroid is the per-coordinate mean, which is the point that
        minimises the sum of squared Euclidean distances to the cluster
        members.

        Args:
            seprate_data: List of clusters, each a list of points.

        Returns:
            A list of centroids (lists of floats). Empty clusters are
            skipped, so the result may be shorter than ``seprate_data``.
        """
        gather_point = []
        for cluster in seprate_data:
            if not cluster:
                continue
            centre = numpy.asarray(cluster, dtype=float).mean(axis=0)
            gather_point.append(centre.tolist())
        return gather_point

    def get_seprate_point(self, k=2, error=0.000001):
        """Run K-means until the centres stop moving.

        Starts from ``k`` random centres, then alternates between assigning
        points to their nearest centre and moving each centre to the mean of
        its points. A centre that attracts no point is dropped, so fewer than
        ``k`` clusters may be returned.

        Args:
            k: Number of initial random centres (at least 1).
            error: Positive tolerance; iteration stops once the summed
                absolute movement of all centre coordinates is below it.

        Returns:
            A tuple ``(gather_point, seprate_data)``: the final centres and
            the clusters of points they were computed from, index-aligned.

        Raises:
            ValueError: If ``k < 1`` or ``error <= 0``.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        if error <= 0:
            # A non-positive tolerance can never be undercut: endless loop.
            raise ValueError("error must be positive")

        gather_point = [self.add_random_point() for _ in range(k)]
        while True:
            last_gather_point = gather_point
            seprate_data = self.seprate_data(last_gather_point)
            gather_point = self.get_centeral_point(seprate_data)
            # A centre lost all its points and was removed: the two lists
            # are not comparable, so iterate again with the reduced set.
            if len(gather_point) != len(last_gather_point):
                continue
            # Sum of absolute movements; a signed sum would let opposite
            # shifts cancel out and stop too early.
            shift = numpy.abs(
                numpy.asarray(gather_point) - numpy.asarray(last_gather_point)
            ).sum()
            if shift < error:
                return gather_point, seprate_data


if __name__ == '__main__':
    db = point_data_reader('text.dat').get_data_list(num_lost=1.0)
    k = k_means(db)
    # randint's upper bound is exclusive: this draws k from 1..10, like the
    # deprecated numpy.random.random_integers(1, 10) did.
    cluster_count = int(numpy.random.randint(1, 11))
    gather_point, seprate_data = k.get_seprate_point(k=cluster_count,
                                                     error=0.001)
    print(gather_point)