scikit-learn 常用分类算法的使用

来源:互联网 发布:网络用语bp是什么意思 编辑:程序博客网 时间:2024/06/07 19:34

scikit-learn机器学习的分类算法包括逻辑回归、朴素贝叶斯、KNN、支持向量机、决策树和随机森林等。这些模块的调用形式基本一致,训练用fit方法,预测用predict方法。用joblib.dump方法可以保存训练的模型,用joblib.load方法可以载入模型。需要注意的是,旧版示例中常见的 sklearn.cross_validation 模块已在 scikit-learn 0.20 中移除,交叉验证相关工具(如 KFold)现位于 sklearn.model_selection 模块,且构造参数由 n_folds 改为 n_splits。

测试程序。测试数据采用小麦种子数据集 (seeds)。

# -*- coding: utf-8 -*-
"""Compare common scikit-learn classifiers on the UCI wheat-seeds dataset.

Each classifier is evaluated with 3-fold cross-validation; the list of
per-fold accuracies is printed for each model.
"""
import numpy as np
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# KFold now lives in sklearn.model_selection (new constructor API, too).
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Column names of the seeds dataset (7 numeric features per sample).
feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficien',
    'length of kernel groove',
]

COLOUR_FIGURE = False


def load_csv_data(filename):
    """Read a tab-separated data file.

    Every column except the last is parsed as a float feature; the last
    column is kept as the (string) class label.

    Returns:
        (data, labels) — two numpy arrays of equal length.
    """
    data = []
    labels = []
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(filename) as datafile:
        for line in datafile:
            fields = line.strip().split('\t')
            data.append([float(field) for field in fields[:-1]])
            labels.append(fields[-1])
    return np.array(data), np.array(labels)


def accuracy(test_labels, pred_labels):
    """Return the fraction of predictions that match the true labels."""
    correct = np.sum(test_labels == pred_labels)
    return float(correct) / len(test_labels)


def _cross_validate(clf, features, labels, n_splits=3):
    """Shared K-fold driver for all test* functions below.

    Fits `clf` on each training split, predicts the held-out fold, and
    prints the list of per-fold accuracies.
    """
    # New-style KFold: configure the splitter, then call .split(X).
    kf = KFold(n_splits=n_splits, shuffle=True)
    scores = []
    for train, test in kf.split(features):
        pred = clf.fit(features[train], labels[train]).predict(features[test])
        scores.append(accuracy(labels[test], pred))
    print(scores)


# ---------------------------------------------------------------------------
# Logistic regression
# ---------------------------------------------------------------------------
def testLR(features, labels):
    _cross_validate(LogisticRegression(), features, labels)


# ---------------------------------------------------------------------------
# Naive Bayes
# ---------------------------------------------------------------------------
def testNaiveBayes(features, labels):
    _cross_validate(GaussianNB(), features, labels)


# ---------------------------------------------------------------------------
# K nearest neighbours
# ---------------------------------------------------------------------------
def testKNN(features, labels):
    _cross_validate(KNeighborsClassifier(n_neighbors=5), features, labels)


# ---------------------------------------------------------------------------
# Support vector machine
# ---------------------------------------------------------------------------
def testSVM(features, labels):
    _cross_validate(svm.SVC(), features, labels)


# ---------------------------------------------------------------------------
# Decision tree
# ---------------------------------------------------------------------------
def testDecisionTree(features, labels):
    _cross_validate(DecisionTreeClassifier(), features, labels)


# ---------------------------------------------------------------------------
# Random forest
# ---------------------------------------------------------------------------
def testRandomForest(features, labels):
    _cross_validate(RandomForestClassifier(), features, labels)


if __name__ == '__main__':
    features, labels = load_csv_data('data/seeds_dataset.txt')
    print(features)
    print('LogisticRegression: \r')
    testLR(features, labels)
    print('GaussianNB: \r')
    testNaiveBayes(features, labels)
    print('KNN: \r')
    testKNN(features, labels)
    print('SVM: \r')
    testSVM(features, labels)
    print('Decision Tree: \r')
    testDecisionTree(features, labels)
    print('Random Forest: \r')
    testRandomForest(features, labels)

在Spyder中调试运行,运行结果。

runfile('E:/MyProject/_python/ScikitLearn/demo_clf.py', wdir='E:/MyProject/_python/ScikitLearn')[[ 15.26    14.84     0.871  ...,   3.312    2.221    5.22  ] [ 14.88    14.57     0.8811 ...,   3.333    1.018    4.956 ] [ 14.29    14.09     0.905  ...,   3.337    2.699    4.825 ] ...,  [ 13.2     13.66     0.8883 ...,   3.232    8.315    5.056 ] [ 11.84    13.21     0.8521 ...,   2.836    3.598    5.044 ] [ 12.3     13.34     0.8684 ...,   2.974    5.637    5.063 ]]LogisticRegression: [0.9142857142857143, 0.9714285714285714, 0.8857142857142857]GaussianNB: [0.9428571428571428, 0.8714285714285714, 0.9]KNN: [0.9285714285714286, 0.8571428571428571, 0.8857142857142857]SVM: [0.9, 0.9285714285714286, 0.8571428571428571]Decision Tree: [0.8714285714285714, 0.9714285714285714, 0.9142857142857143]Random Forest: [0.8857142857142857, 0.9142857142857143, 0.8428571428571429]