Python RandomForest 调参

来源：互联网 发布：keynote软件 编辑：程序博客网 时间：2024/06/05 11:41

先载入一堆类库

>>> import pandas as pd>>> import numpy as np>>> from sklearn.ensemble import RandomForestClassifier>>> from sklearn.model_selection import GridSearchCV>>> from sklearn import cross_validation, metrics>>> import matplotlib.pylab as plt

数据准备

>>> train = pd.read_csv("D:\Python\\-1d\\B_train.csv")>>> target = 'flag'>>> IDcol = 'no'>>> predictors_train = [x for x in train.columns if x not in [target,IDcol]]>>> test = pd.read_csv("D:\Python\\-1d\\B_test.csv")>>> predictors_test = [x for x in train.columns if x not in [target,IDcol]]>>> x = train[predictors_train]>>> y = train[target]

先用默认参数试一下

>>> clf = RandomForestClassifier(oob_score=True, random_state=10)>>> clf.fit(x,y)RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=None, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=1,            min_samples_split=2, min_weight_fraction_leaf=0.0,            n_estimators=10, n_jobs=1, oob_score=True, random_state=10,            verbose=0, warm_start=False)>>> y_pred = clf.predict_proba(x)[:,1]>>> metrics.roc_auc_score(y,y_pred)0.93976492743333018

结果居然还可以，是我的错觉么~（注意：这个 AUC 是在训练集上算出来的，模型预测的正是自己训练时见过的样本，所以分数会偏乐观，不能代表泛化效果。）

参数调整

首先调整n_estimators，测出最佳值为50

>>> param_test1 = {'n_estimators':range(10,71,10)}>>> gsearch1 = GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_test1, scoring='roc_auc',cv=5)>>> gsearch1.fit(x,y)GridSearchCV(cv=5, error_score='raise',       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=None, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=1,            min_samples_split=2, min_weight_fraction_leaf=0.0,            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,            verbose=0, warm_start=False),       fit_params={}, iid=True, n_jobs=1,       param_grid={'n_estimators': range(10, 71, 10)},       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,       scoring='roc_auc', verbose=0)>>> gsearch1.best_params_{'n_estimators': 50}>>> gsearch1.best_score_0.53282185445290342

再调整 max_depth 和 min_samples_split，最佳参数分别为 max_depth=1、min_samples_split=3

>>> param_test2 = {'max_depth':range(2,11,2), 'min_samples_split':range(2,20)}>>> gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=50), param_grid = param_test2, scoring='roc_auc',cv=5)>>> gsearch2.fit(x,y)GridSearchCV(cv=5, error_score='raise',       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=None, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=1,            min_samples_split=2, min_weight_fraction_leaf=0.0,            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,            verbose=0, warm_start=False),       fit_params={}, iid=True, n_jobs=1,       param_grid={'max_depth': range(2, 11, 2), 'min_samples_split': range(2, 20)},       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,       scoring='roc_auc', verbose=0)>>> gsearch2.best_params_{'max_depth': 2, 'min_samples_split': 10}>>> param_test2 = {'max_depth':range(1,5), 'min_samples_split':range(2,15)}>>> gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=50), param_grid = param_test2, scoring='roc_auc',cv=5)>>> gsearch2.fit(x,y)GridSearchCV(cv=5, error_score='raise',       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=None, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=1,            min_samples_split=2, min_weight_fraction_leaf=0.0,            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,            verbose=0, warm_start=False),       fit_params={}, iid=True, n_jobs=1,       param_grid={'max_depth': range(1, 5), 'min_samples_split': range(2, 15)},       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,       scoring='roc_auc', verbose=0)>>> gsearch2.best_params_{'max_depth': 1, 'min_samples_split': 3}>>> gsearch2.best_score_0.57823708595454526

min_samples_split 和 min_samples_leaf 调参，同样的搜索每跑一次就得到一组不同的最优值，调得我一脸懵逼（原因是 RandomForestClassifier 没有固定 random_state，每次训练出的森林都不一样；而且各组参数的得分本来就非常接近）

>>> param_test3 = {'min_samples_split':range(2,20,2), 'min_samples_leaf':range(10,60,10)}>>> gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=50,max_depth=1,min_samples_split=3), param_grid = param_test3, scoring='roc_auc',cv=5)>>> gsearch3.fit(x,y)GridSearchCV(cv=5, error_score='raise',       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=1, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=1,            min_samples_split=3, min_weight_fraction_leaf=0.0,            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,            verbose=0, warm_start=False),       fit_params={}, iid=True, n_jobs=1,       param_grid={'min_samples_split': range(2, 20, 2), 'min_samples_leaf': range(10, 60, 10)},       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,       scoring='roc_auc', verbose=0)>>> gsearch3.best_params_{'min_samples_leaf': 50, 'min_samples_split': 2}>>> param_test3 = {'min_samples_split':range(2,10), 'min_samples_leaf':range(40,60,2)}>>> gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=50,max_depth=1,min_samples_split=3), param_grid = param_test3, scoring='roc_auc',cv=5)>>> gsearch3.fit(x,y)GridSearchCV(cv=5, error_score='raise',       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=1, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=1,            min_samples_split=3, min_weight_fraction_leaf=0.0,            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,            verbose=0, warm_start=False),       fit_params={}, iid=True, n_jobs=1,       param_grid={'min_samples_split': range(2, 10), 'min_samples_leaf': range(40, 60, 2)},       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,       scoring='roc_auc', verbose=0)>>> gsearch3.best_params_{'min_samples_leaf': 
46, 'min_samples_split': 8}>>> gsearch3.best_score_0.58185700631854675>>> gsearch3.fit(x,y)GridSearchCV(cv=5, error_score='raise',       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=1, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=1,            min_samples_split=3, min_weight_fraction_leaf=0.0,            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,            verbose=0, warm_start=False),       fit_params={}, iid=True, n_jobs=1,       param_grid={'min_samples_split': range(2, 10), 'min_samples_leaf': range(40, 60, 2)},       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,       scoring='roc_auc', verbose=0)>>> gsearch3.best_params_{'min_samples_leaf': 42, 'min_samples_split': 5}>>> gsearch3.best_score_0.58215690359768479>>> gsearch3.fit(x,y)GridSearchCV(cv=5, error_score='raise',       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=1, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=1,            min_samples_split=3, min_weight_fraction_leaf=0.0,            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,            verbose=0, warm_start=False),       fit_params={}, iid=True, n_jobs=1,       param_grid={'min_samples_split': range(2, 10), 'min_samples_leaf': range(40, 60, 2)},       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,       scoring='roc_auc', verbose=0)>>> gsearch3.best_params_{'min_samples_leaf': 48, 'min_samples_split': 7}>>> gsearch3.best_score_0.58009791043514225

还是三个一起来吧。。。

>>> param_test4 = {'max_depth':range(1,5), 'min_samples_split':range(2,10),'min_samples_leaf':range(40,60,2)}>>> gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=50,max_depth=1,min_samples_split=5,min_samples_leaf=42), param_grid = param_test4, scoring='roc_auc',cv=5)>>> gsearch4.fit(x,y)GridSearchCV(cv=5, error_score='raise',       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=1, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=42,            min_samples_split=5, min_weight_fraction_leaf=0.0,            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,            verbose=0, warm_start=False),       fit_params={}, iid=True, n_jobs=1,       param_grid={'max_depth': range(1, 5), 'min_samples_split': range(2, 10), 'min_samples_leaf': range(40, 60, 2)},       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,       scoring='roc_auc', verbose=0)>>> gsearch4.best_params_{'max_depth': 1, 'min_samples_leaf': 54, 'min_samples_split': 4}>>> gsearch4.best_score_0.58214400889185414

最终的模型

>>> clf = RandomForestClassifier(n_estimators=50,max_depth=1,min_samples_split=4,min_samples_leaf=54,oob_score=True)>>> clf.fit(x,y)RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',            max_depth=1, max_features='auto', max_leaf_nodes=None,            min_impurity_split=1e-07, min_samples_leaf=54,            min_samples_split=4, min_weight_fraction_leaf=0.0,            n_estimators=50, n_jobs=1, oob_score=True, random_state=None,            verbose=0, warm_start=False)>>> clf.oob_score_0.92974999999999997

0 0