A Summary of Common Machine Learning Steps with sklearn


I had touched on sklearn here and there before, but never in depth. Following my recent study, this post walks through the steps that come up in almost every machine learning task.

1. Loading a dataset

scikit-learn ships with a few built-in datasets, the most famous being the Iris dataset. Columns 3 and 4 of the data matrix hold the petal length and petal width, and the class labels have already been encoded as integers: 0 = Iris-Setosa, 1 = Iris-Versicolor, 2 = Iris-Virginica.
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
iris.data  # inspect the raw feature matrix
X = iris.data[:, [2, 3]]  # keep only petal length and petal width
y = iris.target
print('Class labels:', np.unique(y))
2. Splitting the dataset. We usually split the data into a training set and a test set; here we keep 70% for training and 30% for testing.

if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
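If the class proportions matter, newer scikit-learn versions let train_test_split stratify the split. A small optional variant of the call above, just adding the stratify argument:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)
# stratify=y keeps the class ratios identical in both splits
print('Label counts in y_train:', np.bincount(y_train))
print('Label counts in y_test:', np.bincount(y_test))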
3. Standardizing the features

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)  # estimate mean and standard deviation on the training set only
sc.scale_        # the learned per-feature standard deviations
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)  # reuse the training-set parameters on the test set
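A quick sanity check, using only what we just computed: after the transform, each training feature should have mean roughly 0 and standard deviation roughly 1 (the test set only approximately so, since the parameters were estimated on the training set):

print(X_train_std.mean(axis=0))  # close to [0. 0.]
print(X_train_std.std(axis=0))   # close to [1. 1.]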
4. Classification with scikit-learn's Perceptron

from sklearn.linear_model import Perceptron

# ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)
ppn = Perceptron()
ppn.fit(X_train_std, y_train)
ppn.coef_       # learned weight vectors, one row per class
ppn.intercept_  # learned bias terms
y_pred = ppn.predict(X_test_std)
y_pred            # predicted labels
y_test            # true labels
y_pred == y_test  # element-wise comparison
print('Misclassified samples: %d' % (y_test != y_pred).sum())

from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
5. Training logistic regression with scikit-learn

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# stack training and test sets so the plotting helper can highlight the test samples
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()

# predict_proba expects a 2-D array from scikit-learn 0.17 onward
if Version(sklearn_version) < '0.17':
    lr.predict_proba(X_test_std[0, :])
else:
    lr.predict_proba(X_test_std[0, :].reshape(1, -1))
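Note that plot_decision_regions, used here and in the sections below, is not part of scikit-learn; it is a plotting helper (these examples appear to follow Sebastian Raschka's "Python Machine Learning", which defines one). If you do not have it, the following minimal sketch covers what the examples need:

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # marker and color setup for up to five classes
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # evaluate the classifier on a grid covering the feature space
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot the samples class by class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl)

    # circle the samples that belong to the test set
    if test_idx:
        X_test = X[test_idx, :]
        plt.scatter(X_test[:, 0], X_test[:, 1],
                    facecolors='none', edgecolors='black',
                    alpha=1.0, linewidths=1, marker='o',
                    s=55, label='test set')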
6. A word on overfitting and regularization

L1 and L2 regularization: L1 truncates, driving many weights exactly to zero and thus producing sparse weight matrices; L2 shrinks, scaling all weights toward small values without zeroing them out.
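To see the difference in practice, here is a minimal sketch comparing the two penalties on the standardized Iris features from above (the value C=0.1 is arbitrary, and penalty='l1' requires the liblinear solver):

from sklearn.linear_model import LogisticRegression

lr_l1 = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr_l1.fit(X_train_std, y_train)
print('L1 coefficients:\n', lr_l1.coef_)  # typically several exact zeros

lr_l2 = LogisticRegression(penalty='l2', C=0.1)
lr_l2.fit(X_train_std, y_train)
print('L2 coefficients:\n', lr_l2.coef_)  # small values, but mostly nonzero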


7. Maximum-margin classification and support vector machines

from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_linear.png', dpi=300)
plt.show()
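A side note for larger datasets: scikit-learn also provides SGDClassifier, which trains linear models by stochastic gradient descent and scales better than SVC. A sketch, assuming the standardized data from above:

from sklearn.linear_model import SGDClassifier

# loss='hinge' gives a linear SVM; loss='perceptron' a perceptron,
# and loss='log' (renamed 'log_loss' in recent versions) logistic regression
svm_sgd = SGDClassifier(loss='hinge')
svm_sgd.fit(X_train_std, y_train)
print('Accuracy: %.2f' % svm_sgd.score(X_test_std, y_test))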

8. Nonlinear separation with SVM kernel functions

import matplotlib.pyplot as plt
import numpy as np

np.random.seed(0)
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0,
                       X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)

plt.scatter(X_xor[y_xor == 1, 0],
            X_xor[y_xor == 1, 1],
            c='b', marker='x',
            label='1')
plt.scatter(X_xor[y_xor == -1, 0],
            X_xor[y_xor == -1, 1],
            c='r',
            marker='s',
            label='-1')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('./figures/xor.png', dpi=300)
plt.show()

9. Using the kernel trick to find a separating hyperplane in a high-dimensional space

svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0)
svm.fit(X_xor, y_xor)
plot_decision_regions(X_xor, y_xor, classifier=svm)
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_xor.png', dpi=300)
plt.show()
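The gamma parameter controls the width of the RBF kernel, K(x, x') = exp(-gamma * ||x - x'||^2): a small gamma gives each training sample a wide sphere of influence and a smooth boundary, while a large gamma makes the influence local and the boundary jagged. The next two runs on the Iris data show exactly this contrast.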

from sklearn.svm import SVC

svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300)
plt.show()

# the same model with gamma=100.0: the decision boundary hugs individual
# training samples, a classic sign of overfitting
svm = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_2.png', dpi=300)
plt.show()
10. Decision tree learning
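For a binary node with positive-class probability p, the three impurity measures compared below are: entropy H(p) = -p*log2(p) - (1-p)*log2(1-p), Gini impurity G(p) = p(1-p) + (1-p)p = 2p(1-p), and misclassification error E(p) = 1 - max(p, 1-p). The plot that follows simply evaluates these formulas over p in [0, 1].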

import matplotlib.pyplot as plt
import numpy as np

def gini(p):
    return p * (1 - p) + (1 - p) * (1 - (1 - p))

def entropy(p):
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

def error(p):
    return 1 - np.max([p, 1 - p])

x = np.arange(0.0, 1.0, 0.01)
ent = [entropy(p) if p != 0 else None for p in x]
sc_ent = [e * 0.5 if e else None for e in ent]
err = [error(i) for i in x]

fig = plt.figure()
ax = plt.subplot(111)
for i, lab, ls, c in zip([ent, sc_ent, gini(x), err],
                         ['Entropy', 'Entropy (scaled)',
                          'Gini Impurity', 'Misclassification Error'],
                         ['-', '-', '--', '-.'],
                         ['black', 'lightgray', 'red', 'green', 'cyan']):
    line = ax.plot(x, i, label=lab, linestyle=ls, lw=2, color=c)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15),
          ncol=3, fancybox=True, shadow=False)
ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--')
ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--')
plt.ylim([0, 1.1])
plt.xlabel('p(i=1)')
plt.ylabel('Impurity Index')
plt.tight_layout()
# plt.savefig('./figures/impurity.png', dpi=300, bbox_inches='tight')
plt.show()

from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
tree.fit(X_train, y_train)

X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined, y_combined,
                      classifier=tree, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/decision_tree_decision.png', dpi=300)
plt.show()

from sklearn.tree import export_graphviz
from IPython.display import Image  # only needed to display the image in a notebook

export_graphviz(tree,
                out_file='tree.dot',
                feature_names=['petal length', 'petal width'])
# render tree.dot with GraphViz on the command line, e.g.
#   dot -Tpng tree.dot -o tree.png
Image(filename='./images/03_18.png', width=600)


11. Random forests: combining many trees into a stronger ensemble classifier

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=10,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)
plot_decision_regions(X_combined, y_combined,
                      classifier=forest, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/random_forest.png', dpi=300)
plt.show()
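One convenience of the fitted forest: it exposes impurity-based feature importances. A small follow-up using only objects already defined above:

# how much each of the two petal features contributed, per the forest
for name, importance in zip(['petal length', 'petal width'],
                            forest.feature_importances_):
    print('%s: %.3f' % (name, importance))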

12. k-nearest neighbors, a simple classifier

from sklearn.neighbors import KNeighborsClassifier

# the minkowski metric with p=2 is plain Euclidean distance
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
                      classifier=knn, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/k_nearest_neighbors.png', dpi=300)
plt.show()
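kNN does no real training beyond storing the data, so n_neighbors is its main knob: a small k follows the training samples closely (risking overfitting), while a large k smooths the boundary. A quick sweep, with arbitrary k values, makes this concrete:

for k in [1, 5, 15]:
    knn_k = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')
    knn_k.fit(X_train_std, y_train)
    print('k=%2d  test accuracy: %.2f' % (k, knn_k.score(X_test_std, y_test)))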
