Notes on and Translation of scikit-learn 1.11.4 Gradient Tree Boosting

1.11.4. Gradient Tree Boosting

Gradient Tree Boosting, or Gradient Boosted Regression Trees (GBRT), generalizes boosting to arbitrary differentiable loss functions. It is an accurate, effective off-the-shelf procedure for both classification and regression, used in fields such as web search ranking and ecology.
Advantages: 1. natural handling of mixed-type (heterogeneous) features; 2. strong predictive power; 3. robustness to outliers.
Disadvantage: scalability, because the sequential nature of boosting makes it hard to parallelize.
1.11.4.1. Classification

GradientBoostingClassifier supports both binary and multi-class classification.

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)

import numpy as np

np.unique(y)

>array([-1.,  1.])
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)

clf.score(X_test, y_test) 

>0.91300000000000003
Parameter notes: n_estimators controls the number of weak learners; max_depth limits the depth of each tree, while max_leaf_nodes limits the number of leaf nodes (these two both control tree size, use one of them); learning_rate controls overfitting via shrinkage of each tree's contribution.
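For comparison, here is a small sketch of my own (not from the original post) that continues from the code above: tree size is limited with max_leaf_nodes instead of max_depth, and the learning rate is lowered while the number of estimators is raised. The resulting score is not stated in the post and will differ slightly from the one above.

# tree size via max_leaf_nodes (2 leaves = a stump, same size as max_depth=1),
# smaller learning_rate compensated by more estimators
clf2 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1,
                                  max_leaf_nodes=2, random_state=0).fit(X_train, y_train)
clf2.score(X_test, y_test)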
1.11.4.2. Regression

GradientBoostingRegressor supports a number of different loss functions for regression, selected with the loss parameter; the default is 'ls' (least squares).

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0, loss='ls').fit(X_train, y_train)
mean_squared_error(y_test, est.predict(X_test)) 
>5.0091548599603213

The figure below applies gradient boosting regression to the Boston house-price data. By plotting the training-set and test-set error against the number of boosting iterations (set by n_estimators), we can pick the optimal number of iterations; feature_importances_ then ranks the features by importance.

#from IPython.display import Image
#Image(filename='./image/1.11.14.2.png', width=400)  # the code below generates this figure, so the image itself is not uploaded here

import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle  # shuffles the arrays along their first dimension (usually rows)
from sklearn.metrics import mean_squared_error

# load the data
boston = datasets.load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)  # index marking the first 90% of the rows
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# fit the regression model
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)  # ** unpacks the dict into keyword arguments


clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

>MSE: 6.6282

# plot training-set vs. test-set deviance
%matplotlib inline
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)  # equivalent to np.zeros(500)


for i, y_pred in enumerate(clf.staged_predict(X_test)):  # staged_predict yields the prediction after each boosting stage
    test_score[i] = clf.loss_(y_test, y_pred)  # clf.loss_ evaluates the model's loss function


plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)  # first panel of a 1x2 grid
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',  # training-set deviance at each stage
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')


# plot feature importance
feature_importance = clf.feature_importances_  # importance value of each feature
feature_importance = 100.0 * (feature_importance / feature_importance.max())  # rescale so the most important feature is 100 and the rest are relative to it
sorted_idx = np.argsort(feature_importance)  # indices that sort the importances in ascending order
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)  # second panel of the 1x2 grid
plt.barh(pos, feature_importance[sorted_idx], align='center')  # bar lengths taken in sorted order
plt.yticks(pos, boston.feature_names[sorted_idx])  # locs, labels = yticks()
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()


Gradient Boosting Out-of-Bag estimates
Term: out of bag (the improvement in loss based on the examples not included in the bootstrap sample).
OOB estimates are a useful heuristic for estimating the optimal number of boosting iterations. They are almost equivalent to cross-validation estimates but can be computed on-the-fly, without repeatedly refitting the model. OOB estimates are only available for stochastic gradient boosting (i.e. subsample < 1.0); they are derived from the improvement in loss on the samples left out of each bootstrap subsample. The OOB estimator is a pessimistic estimator of the true test loss, but it remains a fairly good approximation for a small number of trees.

The figure below shows that the cumulative negative OOB improvement grows pessimistically as the number of boosting iterations increases. It tracks the test loss for roughly the first hundred iterations and then starts to diverge.
The figure also shows that 3-fold cross-validation gives a good estimate of the test loss, but is computationally much more expensive.

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn.cross_validation import KFold  # since scikit-learn 0.18: from sklearn.model_selection import KFold
from sklearn.cross_validation import train_test_split  # since 0.18: from sklearn.model_selection import train_test_split

# Generate data (adapted from G. Ridgeway's gbm example)
n_samples = 1000
random_state = np.random.RandomState(13)
x1 = random_state.uniform(size=n_samples)  # uniform distribution on [0, 1)
x2 = random_state.uniform(size=n_samples)
x3 = random_state.randint(0, 4, size=n_samples)


p = 1 / (1.0 + np.exp(-(np.sin(3 * x1) - 4 * x2 + x3)))
y = random_state.binomial(1, p, size=n_samples)  # one Bernoulli trial with success probability p for each of the 1000 samples


X = np.c_[x1, x2, x3]


X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=9)

# Fit classifier with out-of-bag estimates
params = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5,  # subsample: fraction of samples drawn to fit each tree
          'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}  # min_samples_leaf: minimum number of samples required at a leaf node
clf = ensemble.GradientBoostingClassifier(**params)


clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print("Accuracy: {:.4f}".format(acc))  # accuracy on the held-out test set

>Accuracy: 0.6780


n_estimators = params['n_estimators']
x = np.arange(n_estimators) + 1




def heldout_score(clf, X_test, y_test):
    """compute deviance scores on ``X_test`` and ``y_test``. """
    score = np.zeros((n_estimators,), dtype=np.float64)
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        score[i] = clf.loss_(y_test, y_pred)
    return score  # array of deviance values, one per boosting stage




def cv_estimate(n_folds=3):
    cv = KFold(n=X_train.shape[0], n_folds=n_folds)
    cv_clf = ensemble.GradientBoostingClassifier(**params)
    val_scores = np.zeros((n_estimators,), dtype=np.float64)
    for train, test in cv:  # iterate over the n_folds train/validation splits
        cv_clf.fit(X_train[train], y_train[train])  # fit a GBRT classifier on each training fold
        val_scores += heldout_score(cv_clf, X_train[test], y_train[test])  # accumulate the held-out deviance of each fold
    val_scores /= n_folds
    return val_scores  # average deviance per iteration across the folds


# Estimate best n_estimators using cross-validation
cv_score = cv_estimate(3)  # averaged cross-validation deviance at each iteration


# Compute best n_estimators for test data
test_score = heldout_score(clf, X_test, y_test)  # test-set deviance at each iteration

# negative cumulative sum of OOB improvements; clf.oob_improvement_ holds the
# improvement in OOB loss at each iteration relative to the previous iteration
cumsum = -np.cumsum(clf.oob_improvement_)


# min loss according to OOB
oob_best_iter = x[np.argmin(cumsum)]


# min loss according to test (normalize such that first loss is 0)
test_score -= test_score[0]  # shift so the loss at the first iteration is 0
test_best_iter = x[np.argmin(test_score)]


# min loss according to cv (normalize such that first loss is 0)
cv_score -= cv_score[0]
cv_best_iter = x[np.argmin(cv_score)]


# color brew for the three curves
oob_color = list(map(lambda x: x / 256.0, (190, 174, 212)))
test_color = list(map(lambda x: x / 256.0, (127, 201, 127)))
cv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))

# plot curves and vertical lines for best iterations
plt.plot(x, cumsum, label='OOB loss', color=oob_color)
plt.plot(x, test_score, label='Test loss', color=test_color)
plt.plot(x, cv_score, label='CV loss', color=cv_color)
plt.axvline(x=oob_best_iter, color=oob_color)  # vertical line at the best OOB iteration
plt.axvline(x=test_best_iter, color=test_color)
xticks = plt.xticks()  # plt.xticks() returns the current tick positions and labels
xticks_pos = np.array(xticks[0].tolist() +
                      [oob_best_iter, cv_best_iter, test_best_iter])  # add tick positions for the best iterations
xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) +  # add the matching tick labels
                        ['OOB', 'CV', 'Test'])
ind = np.argsort(xticks_pos)  # indices that sort the tick positions in ascending order
xticks_pos = xticks_pos[ind]  # tick positions in ascending order
xticks_label = xticks_label[ind]  # labels in the same order
plt.xticks(xticks_pos, xticks_label)  # apply the new ticks and labels

plt.legend(loc='upper right')  # legend position
plt.ylabel('normalized loss')
plt.xlabel('number of iterations')
plt.show()


1.11.4.3. Fitting additional weak-learners

Both GradientBoostingRegressor and GradientBoostingClassifier support warm_start=True, which lets you add more estimators to a model that has already been fitted. The way I picture this parameter: it is like adding more milk powder and water to milk that has already been poured.

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, 
                                random_state=0, loss='ls').fit(X_train, y_train)
mean_squared_error(y_test, est.predict(X_test))   

>5.0091548599603213

_ = est.set_params(n_estimators=200, warm_start=True)  # set warm_start and new nr of trees
_ = est.fit(X_train, y_train) # fit additional 100 trees to est
mean_squared_error(y_test, est.predict(X_test))

>3.8402347411053559
1.11.4.4. Controlling the tree size
max_depth=h: a binary tree of depth h has up to 2**h leaf nodes and 2**h - 1 split nodes.
max_leaf_nodes=k: the maximum number of leaf nodes; nodes are split best-first by impurity improvement (information gain and the like) until k leaves are reached.
A tree with max_leaf_nodes=k has k - 1 split nodes and thus can model interactions of up to order max_leaf_nodes - 1: since the whole tree contains only k - 1 splits, at most k - 1 distinct features can jointly influence any single prediction. A small sketch follows below.
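As a quick illustration (my own sketch, reusing the make_hastie_10_2 data from the classification example), the same size budget can be expressed either way: max_depth=3 allows up to 2**3 = 8 leaves per tree, while max_leaf_nodes=8 grows best-first trees with at most 8 leaves.

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
# depth-limited trees: at most 2**3 = 8 leaves and 2**3 - 1 = 7 splits each
clf_depth = GradientBoostingClassifier(n_estimators=100, max_depth=3,
                                       random_state=0).fit(X, y)
# best-first trees with at most k = 8 leaves, i.e. up to k - 1 = 7 split nodes each
clf_leaf = GradientBoostingClassifier(n_estimators=100, max_leaf_nodes=8,
                                      random_state=0).fit(X, y)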

1.11.4.5. Mathematical formulation (I do not fully understand the math; a rough explanation follows)
GBRT builds an additive model F(x) in a forward stagewise fashion. The h_m(x) in the formula for F(x) are the basis functions of the weak learners in boosting (decision trees), and each h_m(x) is chosen to minimize the loss function given the current model.
Gradient boosting tackles this minimization numerically by steepest descent: the descent direction is the negative gradient of the loss function evaluated at the current model F_{m-1},
and gamma_m is the step length of the m-th stage.
Regression and classification with gradient boosting differ only in the concrete loss function used.
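To make the stagewise idea concrete, here is a minimal from-scratch sketch of my own, for least-squares loss only, where the negative gradient is simply the residual y - F; it is illustrative and not how scikit-learn implements GBRT internally.

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_friedman1

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
learning_rate, n_stages = 0.1, 100
F = np.full(y.shape, y.mean())            # F_0: start from the constant mean model
trees = []
for m in range(n_stages):
    residuals = y - F                     # negative gradient of 0.5 * (y - F)**2
    h = DecisionTreeRegressor(max_depth=1, random_state=0).fit(X, residuals)
    trees.append(h)
    F = F + learning_rate * h.predict(X)  # F_m = F_{m-1} + v * h_m(x)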

1.11.4.5.1. Loss functions (I have not studied the details of each)
Regression: 1. ls: least squares; 2. lad: least absolute deviation; 3. huber: combines ls and lad, with alpha controlling the sensitivity to outliers; 4. quantile: quantile loss, which can be used to build prediction intervals (see the sketch below).
Classification: 1. binomial deviance; 2. multinomial deviance; 3. exponential loss.
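To illustrate the quantile loss, here is a hedged sketch of my own (reusing the make_friedman1 data from the regression example): two models fitted at the 5th and 95th percentiles give a rough 90% prediction interval.

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

# upper and lower bounds of an approximate 90% prediction interval
upper = GradientBoostingRegressor(loss='quantile', alpha=0.95, n_estimators=100,
                                  learning_rate=0.1, max_depth=1,
                                  random_state=0).fit(X_train, y_train)
lower = GradientBoostingRegressor(loss='quantile', alpha=0.05, n_estimators=100,
                                  learning_rate=0.1, max_depth=1,
                                  random_state=0).fit(X_train, y_train)
y_upper, y_lower = upper.predict(X_test), lower.predict(X_test)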

1.11.4.6. Regularization
1.11.4.6.1. Shrinkage
The contribution of each weak learner is scaled by a factor v, called the learning rate; the learning_rate parameter controls this step-length shrinkage and thereby the model's tendency to overfit. Small values such as
learning_rate <= 0.1
are usually preferred, at the cost of needing more estimators.
1.11.4.6.2. Subsampling
subsample < 1.0 draws only a fraction of the training samples for each tree (stochastic gradient boosting); max_features limits the number of features considered at each split, which can also reduce the running time. A sketch follows below.
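A minimal sketch (assumptions mine, reusing make_friedman1) combining the two forms of randomization: subsample draws a fraction of the rows for each tree, which is the stochastic gradient boosting that also enables the OOB estimates above, and max_features limits the columns considered at each split.

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                subsample=0.5,     # 50% of the rows per tree
                                max_features=0.5,  # 50% of the features per split
                                random_state=0).fit(X, y)
# oob_improvement_ is only available because subsample < 1.0
oob = est.oob_improvement_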

1.11.4.7. Interpretability
1.11.4.7.1. Feature importance
The guiding principle is that the more often a feature is used in the split points of a decision tree, the more important it is; for the ensemble, each feature's importance is averaged over all trees.

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
X, y = make_hastie_10_2(random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X, y)
clf.feature_importances_  

>array([ 0.11, 0.1 , 0.11, 0.1 , 0.09, 0.11, 0.09, 0.1 , 0.1 , 0.09])

from __future__ import print_function
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt


from mpl_toolkits.mplot3d import Axes3D


from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
from sklearn.datasets.california_housing import fetch_california_housing




def main():
    cal_housing = fetch_california_housing()


    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names


    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")


    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    print()


    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle


    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print()
    fig = plt.figure()


    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)


    plt.show()

# Needed on Windows because plot_partial_dependence uses multiprocessing
#if __name__ == '__main__':
main()

>
Automatically created module for IPython interactive environment
________________________________________________________________________________
Training GBRT...
done.
________________________________________________________________________________
Convenience plot with ``partial_dependence_plots``
________________________________________________________________________________
Custom 3d plot via ``partial_dependence``
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble.partial_dependence import plot_partial_dependence

X, y = make_hastie_10_2(random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X, y)
features = [0, 1, (0, 1)]
fig, axs = plot_partial_dependence(clf, X, features)
1.11.5. VotingClassifier

Majority / hard voting (voting='hard') or weighted averaging of the predicted probabilities / soft voting (voting='soft'):

from sklearn.ensemble import VotingClassifier
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
                        voting='soft', weights=[2, 1, 2])
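The snippet above assumes clf1, clf2 and clf3 are already defined; a self-contained sketch (dataset and estimator settings are my own choice, not from the post) could look like this:

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, [0, 2]], iris.target

clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)  # soft voting needs predict_proba
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
                        voting='soft', weights=[2, 1, 2]).fit(X, y)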