树回归:CART算法构建回归树和模型树(代码笔记)

来源:互联网 发布:jmeter ip欺骗 linux 编辑:程序博客网 时间:2024/05/05 04:37

分类回归树(Classification And Regression Trees,CART)是一种构造树的监督学习方法。

和ID3决策树作比较:

1. ID3每次直接用最佳特征分割数据,即如果当前特征有4个可能值,那么数据将被分成4份,处理的是标称型数据,不能直接处理连续型数据。CART则利用二元切分来处理连续型变量,每次会找一个最佳特征的阈值,把数据集分成两部分,也就是左子树和右子树。

2. CART使用方差计算来代替香农熵。但目的都是找最佳切分特征。

import numpy as np

'''
CART (Classification And Regression Trees) built with binary splits on
continuous-valued features.

A regression tree stores a constant (the mean of the targets) in each leaf;
a model tree stores the coefficients of a linear model in each leaf.  Both
share the same tree-growing machinery (createTree / chooseBestSplit) and
differ only in the leaf-generation and error functions passed in.

Data convention: an np.mat whose last column is the target value and whose
remaining columns are features.
'''


def loadData(filename):
    """Load a tab-separated numeric file into a list of float rows.

    Each line becomes one list of floats.

    BUG FIX: in Python 3, map() returns a lazy iterator; the original code
    appended the map object itself, so every "row" was an un-iterated
    iterator and all downstream numpy operations broke.  The iterator is
    now materialized with list().  The file is also opened with `with` so
    the handle is always closed.
    """
    dataM = []
    with open(filename) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            dataM.append(list(map(float, curLine)))  # one row of floats
    return dataM


# ----------------- Regression tree: each leaf holds a single constant -----------------

def regLeaf(data):
    """Leaf generator when no further split is made: mean of the targets."""
    return np.mean(data[:, -1])


def regErr(data):
    """Node error: total squared deviation = variance * number of samples."""
    return np.var(data[:, -1]) * np.shape(data)[0]


def binSplitData(data, feature, value):
    """Binary split of `data` on `feature` at threshold `value`.

    Returns (mat0, mat1): rows with feature > value (left subtree) and
    rows with feature <= value (right subtree).
    """
    mat0 = data[np.nonzero(data[:, feature] > value)[0], :]   # left subtree
    mat1 = data[np.nonzero(data[:, feature] <= value)[0], :]  # right subtree
    return mat0, mat1


def chooseBestSplit(data, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the best (feature index, threshold) pair for splitting `data`.

    Returns (None, leaf_value) when the node should not be split:
    all targets identical, the best split reduces the error by less than
    tolS, or either child would hold fewer than tolN samples.
    """
    tolS = ops[0]  # minimum error reduction required to accept a split
    tolN = ops[1]  # minimum number of samples allowed in each child
    # All target values identical -> pure node, make a leaf.
    if len(set(data[:, -1].T.tolist()[0])) == 1:
        return None, leafType(data)
    m, n = np.shape(data)
    S = errType(data)  # error of the unsplit node
    # BUG FIX: the original used bare `inf`, a NameError at runtime.
    bestS = np.inf
    bestIdx = 0
    bestVal = 0
    for featIdx in range(n - 1):
        # BUG FIX: iterating set(data[:, featIdx]) over an np.mat column
        # yields 1x1 matrices (unhashable); flatten to plain scalars first.
        for splitVal in set(data[:, featIdx].T.tolist()[0]):
            mat0, mat1 = binSplitData(data, featIdx, splitVal)
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
                continue  # a child would be too small; skip this candidate
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIdx = featIdx
                bestVal = splitVal
                bestS = newS
    # BUG FIX: the original compared S - newS, i.e. against the *last*
    # candidate evaluated (and newS is unbound when every candidate was
    # skipped).  The intended comparison is against the best split found.
    if (S - bestS) < tolS:
        return None, leafType(data)  # improvement too small -> leaf
    mat0, mat1 = binSplitData(data, bestIdx, bestVal)
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(data)  # resulting children too small -> leaf
    return bestIdx, bestVal


# ----------------- Model tree: each leaf holds a linear model -----------------

def linearNode(data):
    """Fit an ordinary least-squares linear model to `data`.

    Column 0 of the design matrix is the constant 1 (intercept term).
    Returns (w, x, y): the coefficient column vector, the design matrix,
    and the target column.  Raises NameError if x.T * x is singular.
    """
    m, n = np.shape(data)
    x = np.mat(np.ones((m, n)))
    x[:, 1:n] = data[:, 0:n - 1]  # features go in columns 1..n-1
    y = data[:, -1]
    xTx = x.T * x
    # BUG FIX: the original called bare `linalg.det` (NameError).
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse')
    w = xTx.I * (x.T * y)  # normal-equation solution
    return w, x, y


def modelLeaf(data):
    """Leaf generator for model trees: the fitted regression coefficients."""
    w, x, y = linearNode(data)
    return w


def modelErr(data):
    """Node error for model trees: sum of squared residuals of the fit."""
    w, x, y = linearNode(data)
    yHat = x * w
    return np.sum(np.power(y - yHat, 2))


def createTree(data, leafType=modelLeaf, errType=modelErr, ops=(1, 4)):
    """Recursively grow a CART tree over `data`.

    Pass leafType=regLeaf, errType=regErr for a regression tree;
    the defaults build a model tree.

    NOTE: the original source defined createTree twice with identical
    bodies (regression defaults first, model-tree defaults second); the
    second definition shadowed the first, so the effective defaults were
    the model-tree ones — preserved here in a single definition.

    Returns either a leaf value (when chooseBestSplit declines to split)
    or a dict: {'spInd', 'spVal', 'left', 'right'}.
    """
    feat, val = chooseBestSplit(data, leafType, errType, ops)
    if feat is None:          # chooseBestSplit decided not to split further
        return val            # val is the leaf produced by leafType()
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lfData, rtData = binSplitData(data, feat, val)
    retTree['left'] = createTree(lfData, leafType, errType, ops)
    retTree['right'] = createTree(rtData, leafType, errType, ops)
    return retTree


# ----------------- Prediction with regression trees and model trees -----------------

def regTreeEval(treeNode, xdata):
    """Evaluate a regression-tree leaf: it is simply a constant."""
    return float(treeNode)


def modelTreeEval(treeNode, xdata):
    """Evaluate a model-tree leaf: apply its linear model to xdata.

    A leading 1 is prepended to xdata to match the intercept column used
    when the model was fitted in linearNode().
    """
    n = np.shape(xdata)[1]
    x = np.mat(np.ones((1, n + 1)))
    x[:, 1:n + 1] = xdata
    return float(x * treeNode)


def isTree(obj):
    """True if obj is an internal tree node (a dict), False for a leaf."""
    return (type(obj).__name__ == 'dict')


def treePredict(tree, xTest, modelEval=regTreeEval):
    """Predict the target for one sample `xTest` by walking the tree.

    modelEval selects the leaf type: regTreeEval for regression trees,
    modelTreeEval for model trees.
    """
    if not isTree(tree):
        return modelEval(tree, xTest)
    # Feature value above the threshold -> left subtree, else right.
    branch = tree['left'] if xTest[tree['spInd']] > tree['spVal'] else tree['right']
    if isTree(branch):
        return treePredict(branch, xTest, modelEval)
    return modelEval(branch, xTest)



原创粉丝点击