机器学习-GridSearchCV自动调参,RF特征选择

来源:互联网 发布:淘宝卖家不可以更新 编辑:程序博客网 时间:2024/05/29 07:19
  • 主要思想:先用GridSearchCV对随机森林(RF)进行自动调参,再根据调优后模型给出的特征重要性自动筛选特征
from time import strftime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

try:
    # `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
    # removed in 0.23; the standalone package is the supported replacement.
    import joblib
except ImportError:  # very old environments without standalone joblib
    from sklearn.externals import joblib


class RFFeatureSelector:
    """Tune a random forest with GridSearchCV and rank features by importance.

    NOTE(review): the original snippet showed only the two methods below and
    omitted the enclosing class. The class name and constructor here are a
    minimal reconstruction so that the ``self.x_train`` / ``self.y_train`` /
    ``self.x_test`` / ``self.y_test`` attributes the methods read actually
    exist -- rename or merge into the real class as appropriate.
    """

    def __init__(self, x_train, y_train, x_test, y_test):
        # x_train must expose ``.columns`` and support ``x_train[features]``
        # (both are used below), i.e. it is presumably a pandas DataFrame.
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def selectParam(self, clf, param, features=None):
        """Grid-search ``param`` for ``clf`` on the training data.

        Args:
            clf: the estimator to tune.
            param: parameter grid handed to ``GridSearchCV`` as ``param_grid``.
            features: optional collection of column labels. When empty or
                omitted, every column of ``self.x_train`` is used; otherwise
                only the listed columns are (e.g. tuning a second model on
                the features a random forest retained).

        Returns:
            dict: ``get_params()`` of the best estimator found, i.e. ALL of
            its parameters, not only the ones that were searched.
        """
        grid_search = GridSearchCV(
            estimator=clf,
            param_grid=param,
            n_jobs=4,
            scoring='roc_auc',
            cv=5,
        )

        # `features=None` replaces the original mutable default `[]`.
        if features is not None and len(features) > 0:
            train_x = np.asarray(self.x_train[features])
        else:
            train_x = np.asarray(self.x_train)
        train_y = np.asarray(self.y_train)

        # Fit on the training matrix and the training class labels.
        grid_search.fit(train_x, train_y)

        # Report which parameters the best classifier ended up using.
        return grid_search.best_estimator_.get_params()

    def top_features(self, threshold=0.005):
        """Rank features by the importance a tuned random forest assigns them.

        Steps:
            1. Grid-search ``max_depth`` and ``n_estimators`` for a random
               forest.
            2. Refit a forest with the best parameters on the training set.
            3. Read ``feature_importances_`` from that forest.

        Args:
            threshold: features whose importance is not strictly greater than
                this value are dropped. Defaults to 0.005, the value that was
                previously hard-coded.

        Returns:
            pandas.DataFrame: columns ``features`` and
            ``features_importance``, sorted by importance in descending
            order and filtered by ``threshold``.
        """
        clf_RF = RandomForestClassifier(random_state=10)

        # Tuning notes carried over from the original author:
        #   max_depth         - can be ignored for small data / few features;
        #                       otherwise values between 10 and 100 are common.
        #   min_samples_split - default 2; worth tuning on very large data,
        #                       e.g. [5, 10, 15, 20, 25].
        #   min_samples_leaf  - default 1; leaves with fewer samples than
        #                       this are pruned, e.g. [5, 10, 15, 20, 25].
        #   max_features      - keep the default with few features (< 50);
        #                       with many, try "log2", "sqrt" or "auto".
        #   Other candidates:   bootstrap, criterion, class_weight,
        #                       warm_start, oob_score, verbose.
        param_grid = [
            {
                "max_depth": list(range(10, 50, 3)),
                "n_estimators": list(range(10, 50, 3)),
            },
        ]

        best_parameters = self.selectParam(clf_RF, param_grid)
        # random_state is passed explicitly below; drop it from the dict so it
        # is not supplied twice (tolerate it being absent).
        best_parameters.pop('random_state', None)

        clf_RF_prior = RandomForestClassifier(random_state=10, **best_parameters)
        clf_RF_prior.fit(self.x_train, self.y_train)
        print("the best paramers of RF to choose features resulted in score : ", clf_RF_prior.score(self.x_test, self.y_test))

        # Build the table column-wise so the importances keep a float dtype
        # (stacking rows and transposing produced object-dtype columns).
        feature_imp = pd.DataFrame(
            {
                'features': list(self.x_train.columns),
                'features_importance': clf_RF_prior.feature_importances_,
            },
            columns=['features', 'features_importance'],
        )
        feature_imp = feature_imp.sort_values(['features_importance'], ascending=False)
        return feature_imp[feature_imp['features_importance'] > threshold]