Notes on and Translation of scikit-learn 1.11.4 Gradient Tree Boosting

1.11.4. Gradient Tree Boosting

Gradient Tree Boosting, or Gradient Boosted Regression Trees (GBRT), generalizes boosting to arbitrary differentiable loss functions. It is an accurate, effective off-the-shelf procedure for both classification and regression, used in fields such as web search ranking and ecology.
Advantages: 1. natural handling of mixed-type (heterogeneous) features; 2. strong predictive power; 3. robustness to outliers.
Disadvantage: scalability, because the sequential nature of boosting makes it hard to parallelize.
1.11.4.1. Classification

GradientBoostingClassifier supports both binary and multi-class classification.

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)

import numpy as np

np.unique(y)

>array([-1.,  1.])
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)

clf.score(X_test, y_test) 

>0.91300000000000003
Parameter notes: n_estimators controls the number of weak learners; max_depth limits the depth of each tree, while max_leaf_nodes limits the number of leaf nodes (these two both control tree size, use one of them); learning_rate controls overfitting via shrinkage of each tree's contribution.
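For comparison, here is a small sketch of my own (not from the original post) that continues from the code above: tree size is limited with max_leaf_nodes instead of max_depth, and the learning rate is lowered while the number of estimators is raised. The resulting score is not stated in the post and will differ slightly from the one above.

# tree size via max_leaf_nodes (2 leaves = a stump, same size as max_depth=1),
# smaller learning_rate compensated by more estimators
clf2 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1,
                                  max_leaf_nodes=2, random_state=0).fit(X_train, y_train)
clf2.score(X_test, y_test)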
1.11.4.2. Regression

GradientBoostingRegressor supports a number of different loss functions for regression, selected with the loss parameter; the default is 'ls' (least squares).

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0, loss='ls').fit(X_train, y_train)
mean_squared_error(y_test, est.predict(X_test)) 
>5.0091548599603213

The figure below applies gradient boosting regression to the Boston house-price data. By plotting the training-set and test-set error against the number of boosting iterations (set by n_estimators), we can pick the optimal number of iterations; feature_importances_ then ranks the features by importance.

#from IPython.display import Image
#Image(filename='./image/1.11.14.2.png', width=400)  # the code below generates this figure, so the image itself is not uploaded here

import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle  # shuffles the arrays along their first dimension (usually rows)
from sklearn.metrics import mean_squared_error

# load the data
boston = datasets.load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)  # index marking the first 90% of the rows
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# fit the regression model
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)  # ** unpacks the dict into keyword arguments


clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

>MSE: 6.6282

# plot training-set vs. test-set deviance
%matplotlib inline
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)  # equivalent to np.zeros(500)


for i, y_pred in enumerate(clf.staged_predict(X_test)):  # staged_predict yields the prediction after each boosting stage
    test_score[i] = clf.loss_(y_test, y_pred)  # clf.loss_ evaluates the model's loss function


plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)  # first panel of a 1x2 grid
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',  # training-set deviance at each stage
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')


# plot feature importance
feature_importance = clf.feature_importances_  # importance value of each feature
feature_importance = 100.0 * (feature_importance / feature_importance.max())  # rescale so the most important feature is 100 and the rest are relative to it
sorted_idx = np.argsort(feature_importance)  # indices that sort the importances in ascending order
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)  # second panel of the 1x2 grid
plt.barh(pos, feature_importance[sorted_idx], align='center')  # bar lengths taken in sorted order
plt.yticks(pos, boston.feature_names[sorted_idx])  # locs, labels = yticks()
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()


Gradient Boosting Out-of-Bag estimates
Term: out of bag (the improvement in loss based on the examples not included in the bootstrap sample).
OOB estimates are a useful heuristic for estimating the optimal number of boosting iterations. They are almost equivalent to cross-validation estimates but can be computed on-the-fly, without repeatedly refitting the model. OOB estimates are only available for stochastic gradient boosting (i.e. subsample < 1.0); they are derived from the improvement in loss on the samples left out of each bootstrap subsample. The OOB estimator is a pessimistic estimator of the true test loss, but it remains a fairly good approximation for a small number of trees.

The figure below shows that the cumulative negative OOB improvement grows pessimistically as the number of boosting iterations increases. It tracks the test loss for roughly the first hundred iterations and then starts to diverge.
The figure also shows that 3-fold cross-validation gives a good estimate of the test loss, but is computationally much more expensive.

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn.cross_validation import KFold  # since scikit-learn 0.18: from sklearn.model_selection import KFold
from sklearn.cross_validation import train_test_split  # since 0.18: from sklearn.model_selection import train_test_split

# Generate data (adapted from G. Ridgeway's gbm example)
n_samples = 1000
random_state = np.random.RandomState(13)
x1 = random_state.uniform(size=n_samples)  # uniform distribution on [0, 1)
x2 = random_state.uniform(size=n_samples)
x3 = random_state.randint(0, 4, size=n_samples)


p = 1 / (1.0 + np.exp(-(np.sin(3 * x1) - 4 * x2 + x3)))
y = random_state.binomial(1, p, size=n_samples)  # one Bernoulli trial with success probability p for each of the 1000 samples


X = np.c_[x1, x2, x3]


X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=9)

# Fit classifier with out-of-bag estimates
params = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5,  # subsample: fraction of samples drawn to fit each tree
          'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}  # min_samples_leaf: minimum number of samples required at a leaf node
clf = ensemble.GradientBoostingClassifier(**params)


clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print("Accuracy: {:.4f}".format(acc))  # accuracy on the held-out test set

>Accuracy: 0.6780


n_estimators = params['n_estimators']
x = np.arange(n_estimators) + 1




def heldout_score(clf, X_test, y_test):
    """compute deviance scores on ``X_test`` and ``y_test``. """
    score = np.zeros((n_estimators,), dtype=np.float64)
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        score[i] = clf.loss_(y_test, y_pred)
    return score  # array of deviance values, one per boosting stage




def cv_estimate(n_folds=3):
    cv = KFold(n=X_train.shape[0], n_folds=n_folds)
    cv_clf = ensemble.GradientBoostingClassifier(**params)
    val_scores = np.zeros((n_estimators,), dtype=np.float64)
    for train, test in cv:  # iterate over the n_folds train/validation splits
        cv_clf.fit(X_train[train], y_train[train])  # fit a GBRT classifier on each training fold
        val_scores += heldout_score(cv_clf, X_train[test], y_train[test])  # accumulate the held-out deviance of each fold
    val_scores /= n_folds
    return val_scores  # average deviance per iteration across the folds


# Estimate best n_estimators using cross-validation
cv_score = cv_estimate(3)  # averaged cross-validation deviance at each iteration


# Compute best n_estimators for test data
test_score = heldout_score(clf, X_test, y_test)  # test-set deviance at each iteration

# negative cumulative sum of OOB improvements; clf.oob_improvement_ holds the
# improvement in OOB loss at each iteration relative to the previous iteration
cumsum = -np.cumsum(clf.oob_improvement_)


# min loss according to OOB
oob_best_iter = x[np.argmin(cumsum)]


# min loss according to test (normalize such that first loss is 0)
test_score -= test_score[0]  # shift so the loss at the first iteration is 0
test_best_iter = x[np.argmin(test_score)]


# min loss according to cv (normalize such that first loss is 0)
cv_score -= cv_score[0]
cv_best_iter = x[np.argmin(cv_score)]


# color brew for the three curves
oob_color = list(map(lambda x: x / 256.0, (190, 174, 212)))
test_color = list(map(lambda x: x / 256.0, (127, 201, 127)))
cv_color = list(map(lambda x: x / 256.0, (253, 192, 134)))

# plot curves and vertical lines for best iterations
plt.plot(x, cumsum, label='OOB loss', color=oob_color)
plt.plot(x, test_score, label='Test loss', color=test_color)
plt.plot(x, cv_score, label='CV loss', color=cv_color)
plt.axvline(x=oob_best_iter, color=oob_color)  # vertical line at the best OOB iteration
plt.axvline(x=test_best_iter, color=test_color)
xticks = plt.xticks()  # plt.xticks() returns the current tick positions and labels
xticks_pos = np.array(xticks[0].tolist() +
                      [oob_best_iter, cv_best_iter, test_best_iter])  # add tick positions for the best iterations
xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) +  # add the matching tick labels
                        ['OOB', 'CV', 'Test'])
ind = np.argsort(xticks_pos)  # indices that sort the tick positions in ascending order
xticks_pos = xticks_pos[ind]  # tick positions in ascending order
xticks_label = xticks_label[ind]  # labels in the same order
plt.xticks(xticks_pos, xticks_label)  # apply the new ticks and labels

plt.legend(loc='upper right')  # legend position
plt.ylabel('normalized loss')
plt.xlabel('number of iterations')
plt.show()


1.11.4.3. Fitting additional weak-learners

Both GradientBoostingRegressor and GradientBoostingClassifier support warm_start=True, which lets you add more estimators to a model that has already been fitted. The way I picture this parameter: it is like adding more milk powder and water to milk that has already been poured.

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, 
                                random_state=0, loss='ls').fit(X_train, y_train)
mean_squared_error(y_test, est.predict(X_test))   

>5.0091548599603213

_ = est.set_params(n_estimators=200, warm_start=True)  # set warm_start and new nr of trees
_ = est.fit(X_train, y_train) # fit additional 100 trees to est
mean_squared_error(y_test, est.predict(X_test))

>3.8402347411053559
1.11.4.4. Controlling the tree size
max_depth=h: a binary tree of depth h has up to 2**h leaf nodes and 2**h - 1 split nodes.
max_leaf_nodes=k: the maximum number of leaf nodes; nodes are split best-first by impurity improvement (information gain and the like) until k leaves are reached.
A tree with max_leaf_nodes=k has k - 1 split nodes and thus can model interactions of up to order max_leaf_nodes - 1: since the whole tree contains only k - 1 splits, at most k - 1 distinct features can jointly influence any single prediction. A small sketch follows below.
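As a quick illustration (my own sketch, reusing the make_hastie_10_2 data from the classification example), the same size budget can be expressed either way: max_depth=3 allows up to 2**3 = 8 leaves per tree, while max_leaf_nodes=8 grows best-first trees with at most 8 leaves.

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
# depth-limited trees: at most 2**3 = 8 leaves and 2**3 - 1 = 7 splits each
clf_depth = GradientBoostingClassifier(n_estimators=100, max_depth=3,
                                       random_state=0).fit(X, y)
# best-first trees with at most k = 8 leaves, i.e. up to k - 1 = 7 split nodes each
clf_leaf = GradientBoostingClassifier(n_estimators=100, max_leaf_nodes=8,
                                      random_state=0).fit(X, y)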

1.11.4.5. Mathematical formulation (I do not fully understand the math; a rough explanation follows)
GBRT builds an additive model F(x) in a forward stagewise fashion. The h_m(x) in the formula for F(x) are the basis functions of the weak learners in boosting (decision trees), and each h_m(x) is chosen to minimize the loss function given the current model.
Gradient boosting tackles this minimization numerically by steepest descent: the descent direction is the negative gradient of the loss function evaluated at the current model F_{m-1},
and gamma_m is the step length of the m-th stage.
Regression and classification with gradient boosting differ only in the concrete loss function used.
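To make the stagewise idea concrete, here is a minimal from-scratch sketch of my own, for least-squares loss only, where the negative gradient is simply the residual y - F; it is illustrative and not how scikit-learn implements GBRT internally.

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_friedman1

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
learning_rate, n_stages = 0.1, 100
F = np.full(y.shape, y.mean())            # F_0: start from the constant mean model
trees = []
for m in range(n_stages):
    residuals = y - F                     # negative gradient of 0.5 * (y - F)**2
    h = DecisionTreeRegressor(max_depth=1, random_state=0).fit(X, residuals)
    trees.append(h)
    F = F + learning_rate * h.predict(X)  # F_m = F_{m-1} + v * h_m(x)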

1.11.4.5.1. Loss functions (I have not studied the details of each)
Regression: 1. ls: least squares; 2. lad: least absolute deviation; 3. huber: combines ls and lad, with alpha controlling the sensitivity to outliers; 4. quantile: quantile loss, which can be used to build prediction intervals (see the sketch below).
Classification: 1. binomial deviance; 2. multinomial deviance; 3. exponential loss.
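To illustrate the quantile loss, here is a hedged sketch of my own (reusing the make_friedman1 data from the regression example): two models fitted at the 5th and 95th percentiles give a rough 90% prediction interval.

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

# upper and lower bounds of an approximate 90% prediction interval
upper = GradientBoostingRegressor(loss='quantile', alpha=0.95, n_estimators=100,
                                  learning_rate=0.1, max_depth=1,
                                  random_state=0).fit(X_train, y_train)
lower = GradientBoostingRegressor(loss='quantile', alpha=0.05, n_estimators=100,
                                  learning_rate=0.1, max_depth=1,
                                  random_state=0).fit(X_train, y_train)
y_upper, y_lower = upper.predict(X_test), lower.predict(X_test)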

1.11.4.6. Regularization
1.11.4.6.1. Shrinkage
The contribution of each weak learner is scaled by a factor v, called the learning rate; the learning_rate parameter controls this step-length shrinkage and thereby the model's tendency to overfit. Small values such as
learning_rate <= 0.1
are usually preferred, at the cost of needing more estimators.
1.11.4.6.2. Subsampling
subsample < 1.0 draws only a fraction of the training samples for each tree (stochastic gradient boosting); max_features limits the number of features considered at each split, which can also reduce the running time. A sketch follows below.
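A minimal sketch (assumptions mine, reusing make_friedman1) combining the two forms of randomization: subsample draws a fraction of the rows for each tree, which is the stochastic gradient boosting that also enables the OOB estimates above, and max_features limits the columns considered at each split.

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                subsample=0.5,     # 50% of the rows per tree
                                max_features=0.5,  # 50% of the features per split
                                random_state=0).fit(X, y)
# oob_improvement_ is only available because subsample < 1.0
oob = est.oob_improvement_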

1.11.4.7. Interpretability
1.11.4.7.1. Feature importance
The guiding principle is that the more often a feature is used in the split points of a decision tree, the more important it is; for the ensemble, each feature's importance is averaged over all trees.

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
X, y = make_hastie_10_2(random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X, y)
clf.feature_importances_  

>array([ 0.11, 0.1 , 0.11, 0.1 , 0.09, 0.11, 0.09, 0.1 , 0.1 , 0.09])

from __future__ import print_function
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt


from mpl_toolkits.mplot3d import Axes3D


from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
from sklearn.datasets.california_housing import fetch_california_housing




def main():
    cal_housing = fetch_california_housing()


    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names


    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")


    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    print()


    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle


    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print()
    fig = plt.figure()


    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)


    plt.show()

# Needed on Windows because plot_partial_dependence uses multiprocessing
#if __name__ == '__main__':
main()

>
Automatically created module for IPython interactive environment
________________________________________________________________________________
Training GBRT...
done.
________________________________________________________________________________
Convenience plot with ``partial_dependence_plots``
________________________________________________________________________________
Custom 3d plot via ``partial_dependence``
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble.partial_dependence import plot_partial_dependence

X, y = make_hastie_10_2(random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X, y)
features = [0, 1, (0, 1)]
fig, axs = plot_partial_dependence(clf, X, features)
1.11.5. VotingClassifier

Majority / hard voting (voting='hard') or weighted averaging of the predicted probabilities / soft voting (voting='soft'):

from sklearn.ensemble import VotingClassifier
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
                        voting='soft', weights=[2, 1, 2])
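The snippet above assumes clf1, clf2 and clf3 are already defined; a self-contained sketch (dataset and estimator settings are my own choice, not from the post) could look like this:

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, [0, 2]], iris.target

clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)  # soft voting needs predict_proba
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
                        voting='soft', weights=[2, 1, 2]).fit(X, y)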