Python sklearn数据分析中常用方法
来源:互联网 编辑:程序博客网 时间:2024/05/21 17:59
一、数据处理
随机划分训练集和测试集:
# Randomly partition the data into a training and a test set.
from sklearn.model_selection import train_test_split

# Feature matrix only: drop the prediction target and the identifier column.
X_all = data_train.drop(['Survived', 'PassengerId'], axis=1)
# Prediction target only.
y_all = data_train['Survived']

# Fraction of samples held out for testing (an int would mean an absolute count).
num_test = 0.20

# random_state seeds the shuffling RNG: a fixed value (here 23) makes the
# split reproducible across runs; leaving it as None gives a different
# split every time.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, random_state=23)
from sklearn.model_selection import StratifiedShuffleSplit

# sss yields stratified random train/test index pairs:
# 10 independent splits, each holding out 10% of the rows as the test set.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

X = train[0::, 1::]  # feature columns (everything after column 0)
y = train[0::, 0]    # label column (column 0)

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
二、模型选择
# Machine-learning estimators used in the model-selection section below.
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
逻辑回归:
# Logistic regression: fit on the training data, predict on the test
# features, and report the TRAINING-set accuracy as a percentage
# rounded to two decimals (displayed by the bare expression at the end,
# notebook-style).
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log
# Inspect the learned feature coefficients: pair every train_df column
# except the first (the target) with its logistic-regression coefficient,
# then sort by coefficient value, largest first (displayed notebook-style).
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)
SVC支持向量机:
# Support vector classifier: fit, predict, and record the training-set
# accuracy as a rounded percentage.
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
# Linear SVC (commented out; uncomment to compare with the kernel SVC above)
# linear_svc = LinearSVC()
# linear_svc.fit(X_train, Y_train)
# Y_pred = linear_svc.predict(X_test)
# acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
# acc_linear_svc
K近邻学习KNN:
# k-nearest neighbours with k=3 (commented out)
# knn = KNeighborsClassifier(n_neighbors = 3)
# knn.fit(X_train, Y_train)
# Y_pred = knn.predict(X_test)
# acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
# acc_knn
朴素贝叶斯分类器:
# Gaussian naive Bayes (commented out)
# gaussian = GaussianNB()
# gaussian.fit(X_train, Y_train)
# Y_pred = gaussian.predict(X_test)
# acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
# acc_gaussian
感知机:
# Perceptron (commented out)
# perceptron = Perceptron()
# perceptron.fit(X_train, Y_train)
# Y_pred = perceptron.predict(X_test)
# acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
# acc_perceptron
随机梯度下降法:
# Linear classifier trained with stochastic gradient descent (commented out)
# sgd = SGDClassifier()
# sgd.fit(X_train, Y_train)
# Y_pred = sgd.predict(X_test)
# acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
# acc_sgd
决策树:
# Decision tree (commented out)
# decision_tree = DecisionTreeClassifier()
# decision_tree.fit(X_train, Y_train)
# Y_pred = decision_tree.predict(X_test)
# acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
# acc_decision_tree
随机森林:
# Random forest with 100 trees (commented out)
# random_forest = RandomForestClassifier(n_estimators=100)
# random_forest.fit(X_train, Y_train)
# Y_pred = random_forest.predict(X_test)
# random_forest.score(X_train, Y_train)
# acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
# acc_random_forest
# Random forest tuned by a grid search that compares parameter
# combinations on accuracy.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier.
clf = RandomForestClassifier()

# Parameter combinations to try.
# NOTE: max_features='auto' was deprecated in scikit-learn 1.1 and removed
# in 1.3; for classifiers it was just an alias of 'sqrt', so dropping it
# leaves the search space equivalent.
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]}

# Type of scoring used to compare parameter combinations.
acc_scorer = make_scorer(accuracy_score)

# Run the grid search.
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Keep the best combination of parameters found by the search...
clf = grid_obj.best_estimator_
# ...and fit that best estimator to the data.
clf.fit(X_train, y_train)
遍历模型方法:
# Compare several classifiers by their accuracy averaged over repeated
# stratified shuffle splits, then plot the result as a bar chart.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
]

log_cols = ["Classifier", "Accuracy"]

# sss yields stratified random train/test splits (10 splits, 10% test).
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

X = train[0::, 1::]  # feature columns
y = train[0::, 0]    # label column

# Accumulate each classifier's accuracy over every split.
acc_dict = {}
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        acc_dict[name] = acc_dict.get(name, 0.0) + acc

# Average over the number of splits. The original divided by a hard-coded
# 10.0; deriving the divisor from sss keeps it in sync with n_splits.
# DataFrame.append was removed in pandas 2.0, so rows are collected and
# concatenated with pd.concat instead.
n_splits = sss.get_n_splits()
entries = [pd.DataFrame([[name, total / n_splits]], columns=log_cols)
           for name, total in acc_dict.items()]
log = pd.concat(entries, ignore_index=True)

# Bar chart of the average accuracies.
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
叠加多层(2)模型教程
三、模型评估
使用k折交叉验证法:
# 10-fold cross-validation of clf with per-fold and mean accuracy printed.
# The sklearn.cross_validation module was removed in scikit-learn 0.20;
# the replacement KFold lives in sklearn.model_selection and its split()
# method takes the data itself, so the sample count (891) no longer needs
# to be hard-coded.
from sklearn.model_selection import KFold


def run_kfold(clf):
    """Evaluate clf on 10 folds of (X_all, y_all), printing each fold's
    accuracy and the mean accuracy across folds."""
    kf = KFold(n_splits=10)  # was: KFold(891, n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf.split(X_all):
        fold += 1
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))


run_kfold(clf)
四、其他
保存模型:
# Persisting a fitted model, two ways.

# Pickle (in-memory round trip).
# SECURITY: pickle.loads can execute arbitrary code — only unpickle data
# from a trusted source.
import pickle
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[0:1])

# joblib (to disk) — more efficient than pickle for estimators carrying
# large numpy arrays.
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23; import
# the standalone joblib package instead.
import joblib
joblib.dump(clf, 'filename.pkl')
clf = joblib.load('filename.pkl')
阅读全文
0 0
- Python sklearn数据分析中常用方法
- sklearn中常用数据预处理方法
- sklearn中常用的数据预处理方法
- sklearn中常用数据预处理方法
- sklearn中常用数据预处理方法
- sklearn中常用数据预处理方法
- Python Numpy数据分析中常用方法
- Python pandas数据分析中常用方法
- windows下 python数据分析包 sklearn 安装
- sklearn常用模块及类及方法----机器学习Python
- 机器学习数据分析之Python中Numpy的常用方法
- 转:『Sklearn』数据划分方法及python代码
- Python数据分析几个比较常用的方法
- Python数据分析几个比较常用的方法
- python中sklearn的朴素贝叶斯方法(sklearn.naive_bayes.GaussianNB)的简单使用
- 数据分析(4)-sklearn入门
- python数据挖掘包Sklearn
- python数据分析常用函数
- 一分钟简单了解Effective java
- 整理:深度学习 vs 机器学习 vs 模式识别
- CSS布局的两种方法
- 多渠道打包图片资源替换
- 记得ajax中要带上AntiForgeryToken防止CSRF攻击
- Python sklearn数据分析中常用方法
- 字符串相等的比较方法,字符串的基本数据类型和引用数据类型。
- MP3相关
- 后台转义操作
- Servlet中请求与响应的编码总结
- 【数据库基础】对SQL语言中视图的理解
- J
- 根据省市县获取code和邮编工具类
- Redis 事务和watch应用于秒杀商品应用