机器学习与数据挖掘系列算法之——KNN 的 Python 实现

来源:互联网 发布:大数据 金融行业 编辑:程序博客网 时间:2024/05/16 05:06

K 近邻(KNN)算法原理较为简单,这里不多介绍,直接给出 Python 实现。如有疑问或发现错误,敬请指出探讨,谢谢。

Knn.py
"""Distance-weighted k-nearest-neighbour (KNN) classifier.

Idea: classify a query by its k closest training samples; the class with
the largest accumulated weight among those neighbours is the prediction.

Steps:
    1. Compute the distance between the query x and every training sample
       and keep the k nearest ones (topK).
    2. y = argmax_j sum_{i in topK} w_i * I(y_i == y_j)
       where I(.) is the indicator function (true -> 1, false -> 0) and
       w_i = 1 / (1 + dis(x, x_i)); the "+ 1" keeps the weight finite when
       the query coincides with a training sample.
"""
from algorithm.classification.common.PredictBase import PredictBase
from algorithm.classification.common import Utils

# author:chenhq
# create date:2017/12/8


class Knn(PredictBase):
    """Weighted k-nearest-neighbour classifier using Euclidean distance."""

    def __init__(self):
        self.sample = []    # training feature vectors
        self.labels = []    # class label of each training vector
        self.k = 5          # number of neighbours consulted per prediction

    def train_set(self, train_data, class_vec, k):
        """Store the training data; KNN is lazy, so nothing is fitted here.

        :param train_data: sequence of feature vectors
        :param class_vec: sequence of labels, parallel to ``train_data``
        :param k: number of nearest neighbours that vote in ``predict``
        """
        self.sample = train_data
        self.labels = class_vec
        self.k = k

    def predict(self, row):
        """Return the winning ``(label, weight)`` pair for one feature vector.

        Each of the k nearest training samples votes for its own label with
        weight ``1 / (1 + distance)``; the label with the largest summed
        weight wins. Ties go to the label met first in nearest-first order.

        :param row: feature vector to classify
        :return: tuple ``(label, accumulated_weight)``
        :raises IndexError: if no training samples have been stored
        """
        dis_list = [(self.dis(vec, row), label)
                    for vec, label in zip(self.sample, self.labels)]
        top_k = sorted(dis_list, key=lambda d: d[0])[:self.k]

        label_weights = {}
        for distance, label in top_k:
            label_weights[label] = (label_weights.get(label, 0)
                                    + 1.0 / (distance + 1))

        # Rank by accumulated weight (element 1), not by the label itself;
        # ranking by label would always return the largest label value.
        ranked = sorted(label_weights.items(), key=lambda ld: ld[1],
                        reverse=True)
        return ranked[0]

    def dis(self, vec1, vec2):
        """Return the Euclidean distance between two vectors.

        Components are paired with ``zip``, so extra trailing components of
        the longer vector are ignored.
        """
        s = 0.0
        for v1, v2 in zip(vec1, vec2):
            s += (v1 - v2) ** 2
        return s ** 0.5


if __name__ == '__main__':
    # load_data --> split (train & test)
    source_data, class_vec = Utils.load_classify_data_set()
    train_data, train_class_vec, eva_data, eva_class_vec = Utils.split_data(source_data, class_vec, 0.8)
    print(eva_data, eva_class_vec)

    # train & evaluate & show
    model = Knn()
    model.train_set(train_data=train_data, class_vec=train_class_vec, k=5)
    prediction, recall, f = model.evaluate(evaluate_set=eva_data, evaluate_label=eva_class_vec)
    print("prediction:\t%f\nrecall:\t%f\nf-measure:\t%f" % (prediction, recall, f))


相关的基类及辅助方法

PredictBase.py
class PredictBase(object):
    """Base class that gives binary classifiers a shared ``evaluate`` routine.

    Subclasses override ``predict``; ``evaluate`` reads element 0 of the
    value ``predict`` returns and treats it as the predicted label.
    """
    # __metaclass__ = ABCMeta  # would declare this class as abstract

    def evaluate(self, evaluate_set, evaluate_label):
        """Score the model on a labelled set, treating label 1 as positive.

        :param evaluate_set: sequence of feature vectors
        :param evaluate_label: sequence of true labels, parallel to the above
        :return: tuple ``(precision, recall, f-measure)``; ``(0, 0, 0)`` when
            there is no true positive at all
        """
        # Pair every true label with the label the model predicts for it.
        outcomes = [
            (actual, self.predict(features)[0])
            for features, actual in zip(evaluate_set, evaluate_label)
        ]

        # Confusion-matrix tallies:
        #   tp: positive predicted positive    fp: negative predicted positive
        #   fn: positive predicted negative    tn: negative predicted negative
        tally = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
        for actual, guessed in outcomes:
            if actual == 1:
                bucket = "tp" if guessed == 1 else "fn"
            else:
                bucket = "fp" if guessed == 1 else "tn"
            tally[bucket] += 1

        hits = tally["tp"]
        if hits == 0:
            # Without a true positive every ratio below is zero or undefined.
            return 0, 0, 0

        prediction = float(hits) / (hits + tally["fp"])
        recall = float(hits) / (hits + tally["fn"])
        f = 2 * prediction * recall / (prediction + recall)
        return prediction, recall, f

    def predict(self, row):
        """Placeholder to be overridden by subclasses; returns ``None`` here."""
        return None

Utils.py
def split_data(data, label, rate):
    """Split samples and labels into a leading training part and a trailing
    evaluation part.

    No shuffling is done: the first ``int(len(label) * rate)`` items form the
    training set and the remainder forms the evaluation set.

    :param data: sequence of feature vectors
    :param label: sequence of labels, parallel to ``data``
    :param rate: fraction of the samples that goes to the training set
    :return: ``(train_data, train_class_vec, eva_data, eva_class_vec)``
    """
    # Honour the caller-supplied ratio instead of a hard-coded 0.9.
    train_lens = int(len(label) * rate)
    train_data, train_class_vec = data[:train_lens], label[:train_lens]
    eva_data, eva_class_vec = data[train_lens:], label[train_lens:]
    return train_data, train_class_vec, eva_data, eva_class_vec


# Prepare the sample data
def load_classify_data_set():
    """Return a small built-in two-class data set.

    :return: ``(source_data, class_vec)`` - 17 feature vectors of 12 integers
        each, and the parallel list of 0/1 class labels
    """
    source_data = [
        [7, 8, 10, 8, 6, 10, 1, 2, 0, 2, 1, 1],
        [0, 1, 1, 2, 2, 1, 8, 9, 9, 9, 7, 8],
        [0, 0, 2, 2, 2, 1, 8, 9, 9, 9, 7, 8],
        [7, 8, 7, 8, 7, 8, 1, 2, 0, 2, 1, 1],
        [0, 0, 1, 3, 2, 1, 8, 9, 9, 9, 7, 8],
        [0, 2, 1, 2, 2, 1, 8, 9, 9, 9, 7, 8],
        [0, 0, 1, 2, 4, 1, 8, 9, 9, 9, 7, 8],
        [7, 9, 10, 8, 6, 9, 1, 2, 0, 2, 1, 1],
        [1, 0, 1, 2, 2, 1, 8, 9, 9, 9, 7, 8],
        [7, 7, 9, 8, 6, 8, 1, 2, 0, 2, 1, 1],
        [7, 9, 9, 8, 9, 0, 11, 2, 0, 2, 1, 1],
        [7, 8, 9, 8, 6, 9, 1, 2, 0, 2, 2, 1],
        [0, 0, 1, 4, 2, 1, 8, 9, 9, 9, 7, 8],
        [8, 8, 9, 8, 8, 5, 1, 2, 0, 2, 1, 0],
        [7, 8, 9, 8, 6, 5, 1, 2, 0, 2, 0, 1],
        [0, 0, 3, 2, 2, 1, 8, 9, 9, 9, 7, 8],
        [7, 8, 9, 8, 6, 9, 1, 2, 0, 2, 2, 1],
    ]
    class_vec = [1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1]
    return source_data, class_vec


原创粉丝点击