    本系列是作者在学习《机器学习系统设计》([美] Willi Richert)过程中的思考与实践,全书通过Python从数据处理,到特征工程,再到模型选择,把机器学习解决问题的过程一一呈现。书中涉及的源代码和数据集已上传到我的资源:http://download.csdn.net/detail/solomon1558/8971649


1.   算法概述






2.   分析数据


    面积(A)、周长(P)、紧密度(C = 4πA/P^2)、谷粒的长度、谷粒的宽度、偏度系数、谷粒的槽长度。


# coding=utf-8
"""Scatter-plot every pair of the seven seed features, one subplot per pair.

Each subplot shows one feature against another, with the three seed
varieties drawn using different markers and colours.
"""
from itertools import combinations

from matplotlib import pyplot as plt

from load import load_dataset

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]

# Class labels as they are compared against the loaded label array.
label_names = [
    'Kama',
    'Rosa',
    'Canadian',
]

features, labels = load_dataset('seeds')
# Parenthesised so the statement is valid on both Python 2 and Python 3.
print(labels)

# All C(7, 2) = 21 unordered feature pairs.  combinations() yields them in
# the order (0, 1), (0, 2), ..., (0, 6), (1, 2), ..., (5, 6).
pairs = list(combinations(range(len(feature_names)), 2))

for i, (p0, p1) in enumerate(pairs):
    # 3 rows x 7 columns holds exactly the 21 pairs; subplot indices are 1-based.
    plt.subplot(3, 7, i + 1)
    for name, marker, colour in zip(label_names, ">ox", "rgb"):
        # Boolean mask selecting the rows that belong to this class.
        mask = labels == name
        plt.scatter(features[mask, p0], features[mask, p1],
                    marker=marker, c=colour)
    plt.xlabel(feature_names[p0])
    plt.ylabel(feature_names[p1])
    # Tick labels are suppressed to keep the small subplots readable.
    plt.xticks([])
    plt.yticks([])
plt.show()


     7种特征两两配对,一共有21种与顺序无关的组合(C(7,2) = 21)。其中可以观察到area-perimeter、area-length of kernel、area-width of kernel、perimeter-length of kernel等图像呈现正相关性;area-perimeter、area-length of kernel groove、length of kernel-length of kernel groove等图像的三类种子可区分性较好。

3.   数据归一化





# Subtract each feature's mean (taken along axis 0) from its values, in place.
features -= features.mean(axis=0)
# Divide each feature by its standard deviation (taken along axis 0), in place.
features /= features.std(axis=0)


            Ten fold cross-validated error was 86.2%.


            Ten fold cross-validated error after z-scoring was 82.4%.




# coding=utf-8
"""Plot 1-NN decision regions on two seed features, before and after z-scoring.

The two features used are index 0 ('area') and index 2 ('compactness').
One figure is saved for the raw features and one for the z-scored features.
"""
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy
# NOTE: seeds_knn executes its own top-level cross-validation when imported,
# so its two result lines are printed before anything in this script runs.
from seeds_knn import cross_validate

# Switch between a colour figure and a greyscale (print-friendly) figure.
COLOUR_FIGURE = False

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]


def _padded_limits(values, margin=0.1):
    """Return (low, high) axis limits that always enclose ``values``.

    The margin is a fraction of each extreme's magnitude and is applied
    outwards.  For positive data this equals ``min * 0.9`` / ``max * 1.1``;
    for negative extremes (e.g. z-scored data) it still moves away from the
    data instead of cutting into it.
    """
    low, high = values.min(), values.max()
    return low - margin * abs(low), high + margin * abs(high)


def train_plot(features, labels):
    """Fit a 1-NN model on features 0 and 2 and draw its decision regions.

    Parameters
    ----------
    features : numpy ndarray
        Two-dimensional array; columns 0 and 2 are used as x and y.
    labels : sequence of int
        Class index per row; the greyscale branch plots classes 0, 1 and 2.

    The function prints the model's accuracy on the data it was trained on
    and draws onto the current matplotlib figure; it returns nothing.
    """
    y0, y1 = _padded_limits(features[:, 2])
    x0, x1 = _padded_limits(features[:, 0])

    # 100 x 100 grid of query points covering the plotted area.
    X = np.linspace(x0, x1, 100)
    Y = np.linspace(y0, y1, 100)
    X, Y = np.meshgrid(X, Y)

    model = learn_model(1, features[:, (0, 2)], np.array(labels))
    # Accuracy on the training data itself, not a held-out estimate.
    train_accuracy = accuracy(features[:, (0, 2)], np.array(labels), model)
    # Formatting happens inside the call so this works on Python 2 and 3.
    print(u"模型准确率: %f" % train_accuracy)

    # Predicted class for every grid point, reshaped back onto the grid.
    grid_points = np.vstack([X.ravel(), Y.ravel()]).T
    C = apply_model(grid_points, model).reshape(X.shape)

    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
    else:
        cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
    plt.xlim(x0, x1)
    plt.ylim(y0, y1)
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[2])
    plt.pcolormesh(X, Y, C, cmap=cmap)

    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
        plt.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
    else:
        # One marker shape per class, drawn in white on the grey regions.
        for lab, ma in zip(range(3), "Do^"):
            mask = labels == lab
            plt.plot(features[mask, 0], features[mask, 2], ma, c=(1., 1., 1.))


features, labels = load_dataset('seeds')
# Map the string labels to integer class indices in sorted-name order.
names = sorted(set(labels))
name_to_index = dict((name, i) for i, name in enumerate(names))
labels = np.array([name_to_index[ell] for ell in labels])

train_plot(features, labels)
# cross_validate returns a mean accuracy; the printed text calls it "error".
cv_score = cross_validate(features[:, (0, 2)], labels)
print('Ten fold cross-validated error was {0:.1%}.\n'.format(cv_score))
plt.savefig('../1400_02_04.png')
plt.show()

# Subtract each feature's mean from its values.
features -= features.mean(0)
# Divide each feature by its standard deviation.
features /= features.std(0)

train_plot(features, labels)
cv_score = cross_validate(features[:, (0, 2)], labels)
print('Ten fold cross-validated error after z-scoring was {0:.1%}.'.format(cv_score))
plt.savefig('../1400_02_05.png')
plt.show()

4.   实施kNN算法



    (1)   计算已知类别数据集中的每个点与当前点的欧氏距离;

    (2)   按照距离递增次序排序;

    (3)   选取与当前点距离最小的k个点;

    (4)   确定前k个点所在类别的出现频率;

    (5)   返回前k个点出现频率最高的类别作为当前点的预测分类。


# coding=utf-8
"""Minimal k-nearest-neighbour classifier built on NumPy.

A "model" is simply the tuple ``(k, training_features, training_labels)``.
"""
from collections import Counter

import numpy as np


def _check_k(k):
    """Raise ValueError unless ``k`` is at least 1."""
    if k < 1:
        raise ValueError('k must be at least 1, got {0!r}'.format(k))


def learn_model(k, features, labels):
    """Build a kNN model by storing copies of the training data.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction (must be >= 1).
    features : array-like
        Training samples, one per row.
    labels : array-like
        Class label of each training sample.

    Returns
    -------
    tuple
        ``(k, features_copy, labels_copy)``; the copies are NumPy arrays, so
        later changes to the caller's data do not affect the model.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    _check_k(k)
    # np.array() copies its input by default.
    return k, np.array(features), np.array(labels)


def plurality(xs):
    """Return the most frequent element of ``xs``.

    Ties are resolved in favour of the element that appears first in ``xs``,
    so the result does not depend on dictionary ordering.

    Raises
    ------
    ValueError
        If ``xs`` is empty.
    """
    xs = list(xs)
    if not xs:
        raise ValueError('plurality() needs at least one element')
    counts = Counter(xs)  # label -> number of occurrences
    top = max(counts.values())
    return next(x for x in xs if counts[x] == top)


def apply_model(features, model):
    """Predict a label for every row of ``features``.

    Parameters
    ----------
    features : array-like
        Query samples, one per row.
    model : tuple
        ``(k, training_features, training_labels)`` as returned by
        ``learn_model``.

    Returns
    -------
    numpy ndarray
        One predicted label per query sample.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or the model has no training samples.
    """
    k, train_feats, labels = model
    _check_k(k)
    train_feats = np.asarray(train_feats)
    labels = np.asarray(labels)
    if len(train_feats) == 0:
        raise ValueError('model contains no training samples')

    # One flat row per training sample so distances can be taken along axis 1.
    train_rows = train_feats.reshape(len(train_feats), -1)

    results = []
    for f in np.asarray(features):
        # Euclidean distance from this query to every training sample at once.
        dists = np.linalg.norm(train_rows - np.ravel(f), axis=1)
        # Stable sort: equally distant samples keep their training order.
        nearest = np.argsort(dists, kind='mergesort')[:k]
        # Majority vote among the k closest training samples.
        results.append(plurality(labels[nearest]))
    return np.array(results)


def accuracy(features, labels, model):
    """Return the fraction of ``features`` rows whose prediction equals ``labels``."""
    preds = apply_model(features, model)
    return np.mean(preds == np.asarray(labels))

5.   分类预测和交叉验证



5.1 load.py

import os

import numpy as np


def load_dataset(dataset_name, data_dir='../data'):
    """Load a tab-separated data set from ``<data_dir>/<dataset_name>.tsv``.

    Every non-blank line holds the feature values followed by the class
    label in the last column.  Blank lines are skipped.

    Parameters
    ----------
    dataset_name : str
        File name without the ``.tsv`` extension.
    data_dir : str, optional
        Directory containing the file; defaults to ``'../data'``.

    Returns
    -------
    data : numpy ndarray
        Feature values as floats, one row per sample.
    labels : numpy ndarray
        Class label of each sample, as strings.

    Raises
    ------
    ValueError
        If a feature column cannot be converted to float.
    """
    path = os.path.join(data_dir, '{0}.tsv'.format(dataset_name))
    data = []
    labels = []
    with open(path) as ifile:
        for line in ifile:
            line = line.strip()
            if not line:
                # A blank line would otherwise yield an empty feature row
                # and an empty label.
                continue
            tokens = line.split('\t')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    return np.array(data), np.array(labels)

5.2 knn.py


5.3 seeds_knn.py

"""Cross-validate a nearest-neighbour classifier on the seeds data set.

Running (or importing) this module loads the data and prints the
cross-validated score for the raw features and for the z-scored features.
"""
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy

features, labels = load_dataset('seeds')


def cross_validate(features, labels, n_folds=10, k=1):
    """Return the mean held-out accuracy of a kNN model over ``n_folds`` folds.

    Fold ``i`` uses every ``n_folds``-th sample starting at index ``i`` as
    the test set and all remaining samples as the training set.

    Parameters
    ----------
    features : numpy ndarray
        Samples, one per row.
    labels : numpy ndarray
        Class label of each sample.
    n_folds : int, optional
        Number of folds; defaults to 10.
    k : int, optional
        Number of neighbours passed to ``learn_model``; defaults to 1.

    Returns
    -------
    float
        Average of ``accuracy`` over the folds, i.e. the fraction of
        correctly classified held-out samples (not an error rate).

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError('n_folds must be at least 1, got {0!r}'.format(n_folds))

    total = 0.0
    for fold in range(n_folds):
        # Start with every sample in the training set, then hold out each
        # n_folds-th sample beginning at this fold's offset.
        training = np.ones(len(features), bool)
        training[fold::n_folds] = False
        testing = ~training
        model = learn_model(k, features[training], labels[training])
        total += accuracy(features[testing], labels[testing], model)
    return total / float(n_folds)


# The printed text says "error", but the value is the mean accuracy
# returned by cross_validate.
mean_accuracy = cross_validate(features, labels)
print('Ten fold cross-validated error was {0:.1%}.'.format(mean_accuracy))

# Z-score the features in place: subtract the mean, divide by the standard
# deviation, both taken along axis 0.
features -= features.mean(0)
features /= features.std(0)
mean_accuracy = cross_validate(features, labels)
print('Ten fold cross-validated error after z-scoring was {0:.1%}.'.format(mean_accuracy))

5.4 测试结果:

            Ten fold cross-validated error was 89.5%.


            Ten fold cross-validated error after z-scoring was 94.3%.



