python3.4+KNN

来源:互联网 发布:unity3d内景 编辑:程序博客网 时间:2024/06/03 19:46
"""K-nearest-neighbours (KNN) classification of the ionosphere data set.

KNN idea: to classify a sample, find its K nearest neighbours among the
training samples and assign the class that occurs most often among them.

KNN drawbacks: a good value of K is hard to choose, it copes poorly with
imbalanced classes, and its results are not very interpretable.

This script uses a scikit-learn estimator with its default parameters:
fit() trains on the training samples and their classes, and predict()
predicts the classes of the test samples.
"""
import csv

import numpy as np


def load_samples(lines):
    """Parse comma-separated records into a feature matrix and a label vector.

    Every non-empty record holds the measurements followed by one final
    class field. For the ionosphere data that is 34 float readings plus a
    'g' (good) or 'b' (bad) marker. The number of records is taken from the
    input instead of being fixed in advance.

    Args:
        lines: Iterable of text lines, e.g. an open file object.

    Returns:
        Tuple ``(features, labels)``: a float array with one row per record
        and a bool array that is True where the class field equals 'g'.

    Raises:
        ValueError: If a measurement cannot be converted to float or the
            records do not all have the same number of fields.
    """
    features = []
    labels = []
    for row in csv.reader(lines):
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue
        features.append([float(datum) for datum in row[:-1]])
        labels.append(row[-1] == "g")
    return np.array(features, dtype="float"), np.array(labels, dtype="bool")


def main():
    """Train a default KNN classifier, report its accuracy and plot the result."""
    # Third-party imports stay local so that load_samples() can be imported
    # (and tested) without scikit-learn or matplotlib being installed.
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    # newline="" is what the csv module recommends for files it reads.
    with open("ionosphere.data", "r", newline="") as input_file:
        x_d, x_c = load_samples(input_file)

    # Split into training data/classes and test data/classes.
    xd_train, xd_test, xc_train, xc_test = train_test_split(
        x_d, x_c, random_state=14)

    # Fit the classifier on the training split and predict the test split.
    estimator = KNeighborsClassifier()
    estimator.fit(xd_train, xc_train)
    xc_predicted = estimator.predict(xd_test)
    accuracy = np.mean(xc_test == xc_predicted) * 100
    print("The accuracy is {0:.1f}%".format(accuracy))

    # Cross-validation reduces the influence of one particular split.
    scores = cross_val_score(estimator, x_d, x_c, scoring="accuracy")
    average_accuracy = np.mean(scores) * 100
    print("The average accuracy is {0:.1f}%".format(average_accuracy))

    # SimHei is selected so the Chinese axis labels render.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    # The y-axis reaches below zero; use the ASCII minus so tick labels do
    # not depend on the font providing the Unicode minus glyph.
    mpl.rcParams['axes.unicode_minus'] = False

    x = range(len(xc_test))
    plt.xlabel("点的编号")  # "sample index"
    plt.ylabel("点的类别")  # "sample class"
    # Each series is drawn once; drawing it once per sample gives the same
    # picture at far greater cost.
    # NOTE(review): both series share the same -0.1 offset, so matching
    # points overlap (blue over red) - presumably only mismatches are meant
    # to stand out; confirm.
    plt.plot(x, xc_test - 0.1, "ro")       # true classes of the test set
    plt.plot(x, xc_predicted - 0.1, "bo")  # predicted classes of the test set
    plt.show()


if __name__ == "__main__":
    main()

运行结果如下:
这里写图片描述

上面都是采用默认参数的KNN算法,下面将实现通过输入不同的K值,来检验准确率:

"""Accuracy of KNN on the ionosphere data set for different values of K.

Instead of the default parameters, the classifier is cross-validated once
for every n_neighbors value from 1 to 20 and the mean accuracy is plotted
against that value.
"""
import csv

import numpy as np


def load_samples(lines):
    """Parse comma-separated records into a feature matrix and a label vector.

    Every non-empty record holds the measurements followed by one final
    class field. The number of records is taken from the input instead of
    being fixed in advance.

    Args:
        lines: Iterable of text lines, e.g. an open file object.

    Returns:
        Tuple ``(features, labels)``: a float array with one row per record
        and a bool array that is True where the class field equals 'g'.

    Raises:
        ValueError: If a measurement cannot be converted to float or the
            records do not all have the same number of fields.
    """
    features = []
    labels = []
    for row in csv.reader(lines):
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue
        features.append([float(datum) for datum in row[:-1]])
        labels.append(row[-1] == "g")
    return np.array(features, dtype="float"), np.array(labels, dtype="bool")


def score_neighbor_counts(features, labels, neighbor_counts):
    """Cross-validate a KNN classifier once per candidate neighbour count.

    Args:
        features: Sample matrix passed to cross_val_score.
        labels: Class vector passed to cross_val_score.
        neighbor_counts: Iterable of n_neighbors values to try.

    Returns:
        List holding, for each neighbour count in order, the scores that
        cross_val_score returned with scoring="accuracy".
    """
    # Imported here so that load_samples() stays usable without scikit-learn.
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier

    all_scores = []
    for n_neighbors in neighbor_counts:
        estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
        all_scores.append(
            cross_val_score(estimator, features, labels, scoring="accuracy"))
    return all_scores


def main():
    """Plot the mean cross-validated accuracy for n_neighbors from 1 to 20."""
    import matplotlib as mpl
    import matplotlib.pyplot as plt

    # newline="" is what the csv module recommends for files it reads.
    with open("ionosphere.data", "r", newline="") as input_file:
        x, y = load_samples(input_file)

    parameter_values = list(range(1, 21))
    all_scores = score_neighbor_counts(x, y, parameter_values)
    avg_scores = [np.mean(scores) for scores in all_scores]

    # SimHei is selected so the Chinese axis labels render.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    # If negative tick labels ever show up as boxes with this font, also set
    # mpl.rcParams['axes.unicode_minus'] = False.

    plt.plot(parameter_values, avg_scores, '-o')
    plt.title(u"scikit-learn")
    plt.xlabel(u"n_neighbors参数")  # "n_neighbors parameter"
    plt.ylabel(u"accuracy精确度")  # accuracy axis label, kept as in the post
    plt.show()


if __name__ == "__main__":
    main()

运行结果如下:
这里写图片描述

0 0