Python scikit-learn分类 近邻算法KNN

来源:互联网 发布:12306数据库设计 编辑:程序博客网 时间:2024/06/05 07:53

《Python数据挖掘入门与实践》Robert Layton 人民邮电出版社,第2章

估计器Estimator:用于分类、聚类和回归分析
转换器Transformer:用于数据预处理和数据转换
流水线Pipeline:组合数据挖掘流程,便于再次使用

# Jupyter notebook magic: render matplotlib figures inside the notebook page
# instead of opening a separate window ("inline" embeds the plots in the notebook).
%matplotlib inline
import os   # file and directory handling
import csv  # CSV parsing

import numpy as np  # matrix / array operations

# Locate the dataset under the current user's home directory.
home_folder = os.path.expanduser("~")
print(home_folder)

data_folder = os.path.join(home_folder, "Data", "Ionosphere")
data_filename = os.path.join(data_folder, "ionosphere.data")
print(data_filename)

# The dataset size is known in advance: 351 rows with 34 feature columns each.
X = np.zeros((351, 34), dtype='float')
y = np.zeros((351,), dtype='bool')

with open(data_filename, 'r') as input_file:
    # enumerate() yields the row index alongside the parsed row, so each
    # record can be written straight into its slot in X and y.
    for i, row in enumerate(csv.reader(input_file)):
        # Every column except the last is a numeric feature.
        X[i] = [float(value) for value in row[:-1]]
        # The last column is the class label: 'g' (good) -> True, otherwise False.
        y[i] = row[-1] == 'g'
# train_test_split is provided by sklearn.model_selection; the older
# sklearn.cross_validation module is tried only as a fallback so the
# snippet still runs on legacy scikit-learn installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Randomly split the samples into a training set and a testing set.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)
print("There are {} samples in the training dataset".format(X_train.shape[0]))
print("There are {} samples in the testing dataset".format(X_test.shape[0]))
print("Each sample has {} features".format(X_train.shape[1]))
# KNeighborsClassifier lives in the sklearn.neighbors package.
# Using it takes three steps:
#   1) create the KNeighborsClassifier object,
#   2) call fit() on the training set to build the model,
#   3) call predict() to classify new samples.
from sklearn.neighbors import KNeighborsClassifier

# Default parameters are used here.
estimator = KNeighborsClassifier()
estimator.fit(X_train, y_train)

y_predicted = estimator.predict(X_test)

# Accuracy is the fraction of test samples whose prediction matches the
# true label, expressed as a percentage.
matches = y_test == y_predicted
accuracy = np.mean(matches) * 100
print("The accuracy is {0:.1f}%".format(accuracy))

接下来进行算法的交叉检验

# cross_val_score is provided by sklearn.model_selection; the older
# sklearn.cross_validation module is tried only as a fallback so the
# snippet still runs on legacy scikit-learn installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# One accuracy value is returned per cross-validation split; the number of
# splits is whatever the installed scikit-learn uses by default.
scores = cross_val_score(estimator, X, y, scoring='accuracy')
average_accuracy = np.mean(scores) * 100
print("The average accuracy is {0:.1f}%".format(average_accuracy))

观察不同近邻个数n_neighbors对准确率的影响

# Candidate neighbour counts: 1 through 20, inclusive.
parameter_values = list(range(1, 21))

all_scores = []  # per-split accuracies for each n_neighbors value
avg_scores = []  # mean accuracy for each n_neighbors value

for n_neighbors in parameter_values:
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = cross_val_score(estimator, X, y, scoring='accuracy')
    all_scores.append(scores)
    avg_scores.append(np.mean(scores))
# pyplot must be imported before anything refers to plt.
from matplotlib import pyplot as plt

# Show the documentation of plt.plot, including the format-string options.
# (Inside IPython/Jupyter, `plt.plot?` displays the same help.)
help(plt.plot)

# Create a figure and make it the current one; figsize is (width, height)
# in inches.
plt.figure(figsize=(32, 20))

# x axis: number of neighbours, y axis: mean accuracy.
# '-' draws a solid line, 'o' marks each data point with a circle.
plt.plot(parameter_values, avg_scores, '-o', linewidth=5, markersize=24)
# Optional fixed axis limits, left disabled as in the original:
# plt.axis([0, max(parameter_values), 0, 1.0])

# Plot the individual split scores for each parameter value: x is the
# parameter repeated once per score, y is that parameter's scores.
for parameter, scores in zip(parameter_values, all_scores):
    n_scores = len(scores)
    plt.plot([parameter] * n_scores, scores, '-o')

# Plot all scores at once: 'b' selects blue, 'x' selects the x marker.
plt.plot(parameter_values, all_scores, 'bx')
from collections import defaultdict

# Maps each n_neighbors value to the list of score arrays collected for it.
all_scores = defaultdict(list)
parameter_values = list(range(1, 21))  # 1 through 20, inclusive

for n_neighbors in parameter_values:
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    # NOTE(review): the original author already questioned whether 100
    # repetitions are needed; each repetition runs the same call with the
    # same arguments. Kept unchanged here - confirm before removing.
    for i in range(100):
        # cv=10 requests 10 folds.
        scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=10)
        all_scores[n_neighbors].append(scores)

# One plot call per parameter value: x repeats the parameter once per
# collected score array, y holds those score arrays.
for parameter in parameter_values:
    scores = all_scores[parameter]
    n_scores = len(scores)
    plt.plot([parameter] * n_scores, scores, '-o')

利用转换器Transformer对特征值进行预处理

# Build X_broken, a distorted copy of the feature matrix used to demonstrate
# preprocessing. np.array() copies its input, so X itself is left untouched.
# (The feature matrix is named X; a lowercase x is not defined in this script.)
X_broken = np.array(X)
# Divide every second feature column (0, 2, 4, ...) by 10.
X_broken[:, ::2] /= 10
# MinMaxScaler rescales each feature to the 0-1 range: the minimum becomes 0,
# the maximum becomes 1, and every other value falls in between.
from sklearn.preprocessing import MinMaxScaler

# Normalise the distorted features.
scaler = MinMaxScaler()
X_transformed = scaler.fit_transform(X_broken)

# Cross-validate a default k-nearest-neighbours classifier on the scaled data.
estimator = KNeighborsClassifier()
transformed_scores = cross_val_score(
    estimator, X_transformed, y, scoring='accuracy'
)

# Report the mean accuracy as a percentage.
mean_transformed_accuracy = np.mean(transformed_scores) * 100
print('The average accuracy for is {0:.1f}%'.format(mean_transformed_accuracy))

流水线

from sklearn.pipeline import Pipeline

# Chain the preprocessing and classification steps: 'scale' normalises the
# features with MinMaxScaler, 'predict' classifies with k-nearest neighbours.
scaling_pipeline = Pipeline([
    ('scale', MinMaxScaler()),
    ('predict', KNeighborsClassifier()),
])

# Cross-validate the whole pipeline on the distorted data.
scores = cross_val_score(scaling_pipeline, X_broken, y, scoring='accuracy')

# Report the pipeline's own scores (not transformed_scores, which belongs
# to the earlier manual-scaling experiment).
print('The pipeline scored an average accuracy for is {0:.1f}%'.format(np.mean(scores) * 100))
原创粉丝点击