Machine Learning: sklearn Parameter Explanations (GBDT + XGBoost)


Machine learning summary: sklearn parameter explanations

Datasets used in the experiments:

1. Classification data: load_iris (the Iris dataset)

from sklearn.datasets import load_iris
data = load_iris()
data.data[[10, 25, 50]]
data.target[[10, 25, 50]]
list(data.target_names)
list(data.feature_names)

2. Regression data: load_boston (the Boston housing dataset)

from sklearn.datasets import load_boston
boston = load_boston()
print(boston.data.shape)
boston.feature_names

The dataset is split into a training set and a validation set, for example with train_test_split as sketched below.
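A minimal split sketch (the 30% hold-out ratio and random_state below are only illustrative):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris()
## hold out 30% of the samples as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    data.data, data.target, test_size=0.3, random_state=0)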

GBDT

Parameter descriptions (see the scikit-learn reference).
GradientBoostingClassifier supports both binary and multi-class classification.

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(
    loss='deviance',          ## loss function; the default 'deviance' is the logistic loss for classification with probabilistic outputs
    n_estimators=100,         ## default 100; number of boosting stages (regression trees / weak learners)
    learning_rate=0.1,        ## default 0.1; learning rate / shrinkage in (0.0, 1.0]; each tree fits the previous stage's residual scaled by this step size
    max_depth=3,              ## default 3; depth of each regression tree; controls tree size (max_leaf_nodes can be used instead)
    subsample=1,              ## fraction of samples drawn to fit each tree; values < 1.0 reduce variance but increase bias
    min_samples_split=2,      ## minimum number of samples required to split an internal node; a float is interpreted as a fraction
    min_samples_leaf=1,       ## minimum number of samples required at a leaf node; a float is interpreted as a fraction
    max_features=None,        ## features considered at each split: None/'auto' = all, 'sqrt', 'log2', int = exact count, float = fraction
    max_leaf_nodes=None,      ## maximum number of leaf nodes; None = unlimited
    min_impurity_split=1e-7,  ## threshold below which a node is not split further
    verbose=0,                ## verbosity; values > 1 print progress and performance for every tree
    warm_start=False,         ## True: reuse the previous fit and add more estimators (incremental training); False (default): refit from scratch
    random_state=0            ## random seed for reproducibility
).fit(X_train, y_train)       ## for problems with many classes, random forests are often suggested instead

print(clf.score(X_test, y_test))    ## mean accuracy on the test set
print(clf.feature_importances_)     ## feature importances
print(clf.train_score_)             ## training score after each boosting iteration

y_pre = clf.predict(X_test)              ## predicted labels
y_pro = clf.predict_proba(X_test)[:, 1]  ## predicted probability of the positive class

from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pro)
print("auc : %.4g" % metrics.auc(fpr, tpr))                            ## AUC, computed from the ROC curve
print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, y_pro))  ## AUC, computed directly; the two ways are equivalent
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))       ## equivalent to clf.score(X_test, y_test)

sklearn.ensemble.GradientBoostingRegressor

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

est = GradientBoostingRegressor(
    loss='ls',                ## default 'ls' = least-squares regression; 'lad' = least absolute deviation; 'huber' combines the two
    n_estimators=100,         ## default 100; number of boosting stages (regression trees / weak learners)
    learning_rate=0.1,        ## default 0.1; learning rate / shrinkage; each tree fits the previous stage's residual scaled by this step size
    max_depth=3,              ## default 3; depth of each regression tree; controls tree size (max_leaf_nodes can be used instead)
    subsample=1,              ## fraction of samples used to fit each base learner; values < 1.0 reduce variance but increase bias
    min_samples_split=2,      ## minimum number of samples required to split an internal node; a float is interpreted as a fraction
    min_samples_leaf=1,       ## minimum number of samples required at a leaf node; a float is interpreted as a fraction
    max_features=None,        ## features considered at each split: None/'auto' = all, 'sqrt', 'log2', int = exact count, float = fraction
    max_leaf_nodes=None,      ## maximum number of leaf nodes; None = unlimited
    min_impurity_split=1e-7,  ## threshold below which a node is not split further
    verbose=0,                ## verbosity; values > 1 print progress and performance for every tree
    warm_start=False,         ## True: reuse the previous fit and add more trees; False (default): refit from scratch
    random_state=0            ## random seed for reproducibility
).fit(X_train, y_train)

mean_squared_error(y_test, est.predict(X_test))
import numpy as np
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

boston = datasets.load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)  ## shuffle the data
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)  ## use 90% of the data for training
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

## parameters can be passed as a dict
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)

mse = mean_squared_error(y_test, clf.predict(X_test))
r2 = r2_score(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)             ## mean squared error
print("r^2 on test data : %f" % r2)  ## R^2 goodness of fit = 1 - sum((y_true - y_pred)^2) / sum((y_true - mean)^2)

## plot the deviance curves
import matplotlib.pyplot as plt
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
## compute the test-set score after each boosting iteration
for i, y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

## plot the feature importances
feature_importance = clf.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)  ## indices that would sort the array in ascending order
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, boston.feature_names[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

Tuning hyperparameters with grid search

from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    estimator,                 ## the model to tune
    param_grid,                ## dict (or list of dicts) of parameter values to search
    scoring=None,              ## scoring method
    fit_params=None,           ## dict of parameters passed to fit
    n_jobs=1,                  ## number of parallel jobs; -1 uses all cores
    iid=True,                  ## assume samples are identically distributed across folds
    refit=True,                ## refit the best estimator on the whole dataset
    cv=None,                   ## number of cross-validation folds; None defaults to 3
    verbose=0,                 ## verbosity: the higher, the more messages
    pre_dispatch='2*n_jobs',   ## number of jobs dispatched during parallel execution
    error_score='raise',       ## score assigned if fitting raises an error
    return_train_score=True    ## if False, cv_results_ will not include training scores
)

clf.cv_results_                ## results table; look mainly at mean_test_score and std_test_score
clf.cv_results_.keys()
clf.cv_results_['mean_test_score']
clf.best_estimator_            ## best model
clf.best_score_                ## best score
clf.best_params_               ## best parameters
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)  ## test_size is the fraction held out for testing

## parameter grid
tuned_parameters = [{'n_estimators': range(20, 81, 10),
                     'max_depth': range(3, 14, 2),
                     'learning_rate': [0.1, 0.5, 1.0],
                     'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]
                     }]

## scoring methods: precision / recall (roc_auc is another option)
scores = ['precision', 'recall']
for score in scores:
    print("Tuning for %s" % score)
    clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)
    print(clf.best_params_)
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    ## predictions
    y_true, y_pred = y_test, clf.predict(X_test)
    ## y_true, y_pred = y_test, clf.predict_proba(X_test)
    print(classification_report(y_true, y_pred))
    ## print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))

XGBoost

Native XGBoost API

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import make_hastie_10_2
import xgboost as xgb

# time the run
import time
start_time = time.time()

X, y = make_hastie_10_2(random_state=0)
y = (y > 0).astype(int)  ## make_hastie_10_2 labels are ±1; map them to {0, 1} so the 0.5 threshold below is meaningful
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)  ## test_size is the fraction held out for testing

# build the DMatrix objects
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)

## parameters
params = {
    'booster': 'gbtree',
    'silent': 1,              # 1 suppresses runtime messages; 0 is usually the better choice
    # 'nthread': 7,           # number of CPU threads; defaults to the maximum available
    'eta': 0.007,             # learning rate
    'min_child_weight': 3,    # default 1; minimum sum of instance weights (second derivatives) in a child.
                              # For an unbalanced 0-1 classification task, if h is around 0.01,
                              # min_child_weight = 1 means a leaf needs roughly 100 samples.
                              # This parameter strongly affects the result; smaller values overfit more easily.
    'max_depth': 6,           # tree depth; larger values overfit more easily
    'gamma': 0.1,             # minimum loss reduction required for a further split; larger = more conservative (0.1 or 0.2 are typical)
    'subsample': 0.7,         # row subsampling of the training instances
    'colsample_bytree': 0.7,  # column subsampling when building each tree
    'lambda': 2,              # L2 regularization on the weights; larger values make the model less prone to overfitting
    # 'alpha': 0,             # L1 regularization
    # 'scale_pos_weight': 1,  # values > 0 help convergence when the classes are unbalanced
    # 'objective': 'multi:softmax',  # multi-class objective
    # 'num_class': 10,        # number of classes; used together with multi:softmax
    'seed': 1000,             # random seed
    # 'eval_metric': 'auc'
}
plst = list(params.items())
num_rounds = 100  # number of boosting rounds
watchlist = [(xgb_train, 'train'), (xgb_test, 'val')]

# train and optionally save the model
# early_stopping_rounds: when num_rounds is large, stop if the metric has not improved for this many rounds
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=100)
# model.save_model('./model/xgb.model')  # persist the trained model

print("best best_ntree_limit", model.best_ntree_limit)
y_pred = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
print('error=%f' % (sum(1 for i in range(len(y_pred)) if int(y_pred[i] > 0.5) != y_test[i]) / float(len(y_pred))))

# report the runtime
cost_time = time.time() - start_time
print("xgboost success!", '\n', "cost time:", cost_time, "(s)......")

XGBoost via the sklearn interface (recommended)
Official documentation
Parameter names that change under the sklearn wrapper (illustrated in the brief sketch below):
eta -> learning_rate
lambda -> reg_lambda
alpha -> reg_alpha
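As a rough illustration of the renaming (the values here are placeholders, not recommendations), the same configuration expressed with the native params dict and with the sklearn wrapper:

## native API parameter dict
native_params = {'eta': 0.1, 'lambda': 1.0, 'alpha': 0.0, 'max_depth': 4}

## equivalent sklearn-wrapper arguments
from xgboost.sklearn import XGBClassifier
clf = XGBClassifier(
    learning_rate=0.1,  # eta -> learning_rate
    reg_lambda=1.0,     # lambda -> reg_lambda
    reg_alpha=0.0,      # alpha -> reg_alpha
    max_depth=4         # unchanged
)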

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import make_hastie_10_2
from xgboost.sklearn import XGBClassifier

X, y = make_hastie_10_2(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)  ## test_size is the fraction held out for testing

clf = XGBClassifier(
    silent=0,               # 0 prints messages while training; 1 suppresses them (0 is usually the better choice)
    # nthread=4,            # number of CPU threads; defaults to the maximum available
    learning_rate=0.3,      # learning rate (eta)
    min_child_weight=1,     # default 1; minimum sum of instance weights (second derivatives) in a child.
                            # For an unbalanced 0-1 classification task, if h is around 0.01,
                            # min_child_weight = 1 means a leaf needs roughly 100 samples.
                            # This parameter strongly affects the result; smaller values overfit more easily.
    max_depth=6,            # tree depth; larger values overfit more easily
    gamma=0,                # minimum loss reduction required for a further split; larger = more conservative (0.1 or 0.2 are typical)
    subsample=1,            # row subsampling ratio of the training instances
    max_delta_step=0,       # maximum delta step allowed for each tree's weight estimation
    colsample_bytree=1,     # column subsampling when building each tree
    reg_lambda=1,           # L2 regularization on the weights; larger values make the model less prone to overfitting
    # reg_alpha=0,          # L1 regularization
    # scale_pos_weight=1,   # balances positive and negative weights; helps convergence when the classes are unbalanced
    # objective='multi:softmax',  # learning objective for multi-class problems
    # num_class=10,         # number of classes; used together with multi:softmax
    n_estimators=100,       # number of trees
    seed=1000               # random seed
    # eval_metric='auc'
)
clf.fit(X_train, y_train, eval_metric='auc')
y_true, y_pred = y_test, clf.predict(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))

# regression:
# m_regress = xgb.XGBRegressor(n_estimators=1000, seed=0)

Grid search

Tune one group of parameters at a time: fix the rest, optimize the group, then move on.
Step 1: choose a learning rate and common starting values for the tree-based parameters, adjusting them for class imbalance:
max_depth, min_child_weight, gamma, subsample, scale_pos_weight
max_depth = 3: starting values between 4 and 6 are also good choices.
min_child_weight: a fairly small value (e.g. 1) helps with highly imbalanced classification problems.
subsample, colsample_bytree = 0.8: the most common starting value.
scale_pos_weight = 1: use this when the classes are highly imbalanced.
Step 2: max_depth and min_child_weight have a large impact on the final result (see the sketch right after this list):
'max_depth': range(3, 10, 2),
'min_child_weight': range(1, 6, 2)
Coarsely search over a wide range first, then fine-tune over a narrower one.
Step 3: tune gamma:
'gamma': [i/10.0 for i in range(0, 5)]
Step 4: tune subsample and colsample_bytree:
'subsample': [i/100.0 for i in range(75, 90, 5)],
'colsample_bytree': [i/100.0 for i in range(75, 90, 5)]
Step 5: tune the regularization parameters:
'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
'reg_lambda'
Step 6: lower the learning rate (increasing n_estimators accordingly), e.g.
learning_rate = 0.01,
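A minimal sketch of step 2 (the fixed XGBClassifier values are illustrative assumptions; X_train and y_train come from the earlier examples):

from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier

## coarse grid over max_depth and min_child_weight, everything else held fixed
param_test = {'max_depth': range(3, 10, 2),
              'min_child_weight': range(1, 6, 2)}
gsearch = GridSearchCV(
    XGBClassifier(learning_rate=0.1, n_estimators=100, gamma=0,
                  subsample=0.8, colsample_bytree=0.8, seed=27),
    param_grid=param_test, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_, gsearch.best_score_)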

from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'n_estimators': [100, 200, 500],
                     'max_depth': [3, 5, 7],          ## range(3, 10, 2)
                     'learning_rate': [0.5, 1.0],
                     'subsample': [0.75, 0.8, 0.85, 0.9]
                     }]
## a smaller grid that tunes only n_estimators (overrides the grid above)
tuned_parameters = [{'n_estimators': [100, 200, 500, 1000]
                     }]

clf = GridSearchCV(XGBClassifier(silent=0,
                                 nthread=4,
                                 learning_rate=0.5,
                                 min_child_weight=1,
                                 max_depth=3,
                                 gamma=0,
                                 subsample=1,
                                 colsample_bytree=1,
                                 reg_lambda=1,
                                 seed=1000),
                   param_grid=tuned_parameters,
                   scoring='roc_auc', n_jobs=4, iid=False, cv=5)
clf.fit(X_train, y_train)
## clf.grid_scores_, clf.best_params_, clf.best_score_
print(clf.best_params_)
y_true, y_pred = y_test, clf.predict(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
y_proba = clf.predict_proba(X_test)[:, 1]
print("AUC Score (Train): %f" % metrics.roc_auc_score(y_true, y_proba))
from sklearn.model_selection import GridSearchCV

parameters = [{'learning_rate': [0.01, 0.1, 0.3],
               'n_estimators': [1000, 1200, 1500, 2000, 2500]}]
clf = GridSearchCV(XGBClassifier(max_depth=3,
                                 min_child_weight=1,
                                 gamma=0.5,
                                 subsample=0.6,
                                 colsample_bytree=0.6,
                                 objective='binary:logistic',  # logistic loss for binary classification
                                 scale_pos_weight=1,
                                 reg_alpha=0,
                                 reg_lambda=1,
                                 seed=27),
                   param_grid=parameters, scoring='roc_auc')
clf.fit(X_train, y_train)
print(clf.best_params_)
y_pre = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)[:, 1]
print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))

Plotting feature importance

import pandas as pd
import matplotlib.pylab as plt

## clf here is a fitted XGBClassifier; if clf is a GridSearchCV, use clf.best_estimator_ instead.
## In newer xgboost versions clf.booster() has been renamed to clf.get_booster().
feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()

Using GBDT to generate new features + blending/stacking/cascade forest

Generating new features in R

library(xgboost)
training <- iris
x1 = rep(0, 50)
x2 = rep(1, 50)
x3 = rep(2, 50)
x = c(x1, x2, x3)
d = training[, c(1:4)]
training = data.frame(d, x)
ind <- sample(2, nrow(training), replace = TRUE, prob = c(0.7, 0.3))  # split the data: 70% training, 30% test
traindata <- training[ind == 1, ]   # training set
testdata <- training[ind == 2, ]    # test set
traindatax = as.matrix(traindata[, c(1:4)])
traindatay = as.matrix(traindata[, 5])
testdatax = as.matrix(testdata[, c(1:4)])
testdatay = as.matrix(testdata[, 5])

## multi-class classification; labels start from 0 by default
bst <- xgboost(data = traindatax, label = traindatay, max.depth = 3, eta = 0.1,
               nround = 1000, objective = "multi:softmax", num_class = 3)
pred <- predict(bst, testdatax)

## leaf indices as new features
new_feature_train <- predict(bst, traindatax, predleaf = T)
new_feature_test <- predict(bst, testdatax, predleaf = T)
t_train = cbind(traindatax, new_feature_train, traindatay)
t_test = cbind(testdatax, new_feature_test, testdatay)

Generating GBDT features in Python

clf.apply(X_train)  ## returns the leaf index each sample ends up in for every tree; one-hot encode these indices to obtain the new features (a fuller sketch follows below)
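A minimal end-to-end sketch of this idea (the classic GBDT + logistic-regression stacking recipe); the dataset, hyperparameters, and variable names below are illustrative assumptions:

from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

X, y = make_hastie_10_2(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

## fit the GBDT that will generate the new features
gbdt = GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=0)
gbdt.fit(X_train, y_train)

## apply() gives the leaf index of every sample in every tree;
## flatten to 2-D so each tree contributes one categorical column
leaves_train = gbdt.apply(X_train).reshape(X_train.shape[0], -1)
leaves_test = gbdt.apply(X_test).reshape(X_test.shape[0], -1)

## one-hot encode the leaf indices and feed them to a logistic regression
enc = OneHotEncoder(handle_unknown='ignore')
lr = LogisticRegression()
lr.fit(enc.fit_transform(leaves_train), y_train)

y_pro = lr.predict_proba(enc.transform(leaves_test))[:, 1]
print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))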