使用scikit-learn处理分类的基础流程

来源:互联网 发布:福彩3d算法 编辑:程序博客网 时间:2024/05/24 06:50
# coding=utf-8
"""Basic classification workflow with scikit-learn, using an SVM as the example.

Processing steps:
1. Load the data. Some algorithms support incremental training and some do
   not; those that do generally expose a ``partial_fit`` method.
2. Pre-process the data (fill in missing values, etc.).
3. Decide whether a high-dimensional sparse matrix needs to be compressed.
4. Shuffle the data and split it into a training set and a test set.
5. Decide whether the data needs normalisation or standardisation.
6. Decide whether PCA is needed to reduce dimensionality.
7. Train the model.
8. Select a model based on accuracy and the confusion matrix.
9. Save the model so it can be reused directly next time. Some algorithms
   cannot produce a model and therefore cannot be saved.

Every ``print(...)`` call below takes exactly one argument, so the output is
the same on Python 2 and Python 3.
"""
from sklearn import datasets
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from sklearn import metrics
from sklearn.decomposition import PCA

# 1. Load the data: every column but the last is a feature, the last column
#    is the class label.
ALLData1 = np.loadtxt('D:\\xxx\\xxx.txt', delimiter=',')
data = ALLData1[:, :-1]
print(data.shape)
target = ALLData1[:, -1]
# print(data)
# print(target)

# 2. Is any data pre-processing needed?

# 3. Does a high-dimensional sparse matrix need compressing? Many sparse
#    storage formats exist, for example:
# from scipy.sparse import coo_matrix
# X = np.array([[3, 100, 0, 0, 0, 0, 0, 0, 0], [4, 50000, 0, 0, 0, 0, 0, 0, 0]])
# print(coo_matrix(X))

# 4. Shuffle the data and split off the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.4, random_state=0)

# Alternative: take the leading 60% of the samples as the training set
# (the split index must be an int to be usable in a slice).
# train_num = int(data.shape[0] * 0.6)
# X_train = data[0:train_num]
# print(X_train.shape)
# y_train = target[0:train_num]
# X_test = data[train_num:]
# y_test = target[train_num:]

# Alternative: read the test set from a separate file
# (np.string_ is spelled np.bytes_ on NumPy >= 2.0).
# X_train = ALLData1[:, :-1]
# # print(data.shape)
# y_train = ALLData1[:, -1]
# ALLData2 = np.loadtxt('D:\\machinetest\\37resulttest.txt', delimiter=',', dtype=np.string_)
# X_test = ALLData2[:, :-1]
# print(X_test.shape)
# y_test = ALLData2[:, -1]

# 5. Does the data need normalisation or standardisation?
# from sklearn import preprocessing
# scaler = preprocessing.StandardScaler().fit(X_train)
# pred_X_train = scaler.transform(X_train)
# pred_X_test = scaler.transform(X_test)

# 6. Try PCA to reduce dimensionality.
# pca = PCA(n_components=0.98)
# pca = PCA(n_components='mle')
# pca.fit(X_train)
# Fraction of the total variance explained by each component; the higher
# the value, the more information is retained.
# print(pca.explained_variance_ratio_)
# Variance of each component; the larger it is, the more important the
# principal component.
# print(pca.explained_variance_)
# pcaX_train = pca.transform(X_train)
# pcaX_test = pca.transform(X_test)

# 7. Train the model.
clf = SVC()
# clf.fit(X_train, y_train)

# 8. Compare the results of the different SVM kernels. Each entry pairs the
#    heading printed for a kernel with the value passed to SVC(kernel=...).
KERNELS = (
    ("Linear", 'linear'),       # linear kernel
    ("Gaussian", 'rbf'),        # Gaussian kernel, a.k.a. radial basis function
    ("Polynomial", 'poly'),     # polynomial kernel
    ("Sigmoid", 'sigmoid'),     # sigmoid kernel
)

for label, kernel in KERNELS:
    print(label)
    # The same estimator is re-configured and re-fitted on every pass, so
    # after the loop ``clf`` holds the model for the last kernel listed.
    clf.set_params(kernel=kernel).fit(X_train, y_train)
    print(clf.score(X_train, y_train))
    predicted = clf.predict(X_test)
    # Variant using the PCA-reduced features from step 6:
    # clf.set_params(kernel=kernel).fit(pcaX_train, y_train)
    # print(clf.score(pcaX_train, y_train))
    # predicted = clf.predict(pcaX_test)
    # Accuracy on the test set.
    print(np.mean(predicted == y_test))
    # Confusion matrix on the test set.
    print(metrics.confusion_matrix(y_test, predicted))

# 9. Save the model.
# with open('D:\\xxx\\data.pkl', 'wb') as output:
#     pickle.dump(clf, output)

# Load the model back and use it.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
# with open('D:\\xxx\\data.pkl', 'rb') as model_file:
#     clf2 = pickle.load(model_file)
# clf2.predict(X_test[0:1])