python3 使用决策树进行分类

来源：互联网发布：通达信日线数据编辑：程序博客网时间：2024/06/07 08:22

我们使用ID3算法，通过计算构建出决策树。接下来，让我们看看如何用代码实现。

第一步：创建函数majorityCnt统计classList中出现次数最多的元素(类标签)，创建函数createTree用来递归构建决策树。编写代码如下：

# -*- coding: UTF-8 -*-
from math import log
import operator


def calcShannonEnt(dataSet):
    """Compute the empirical (Shannon) entropy of the class labels in a data set.

    Parameters:
        dataSet - list of rows; the last element of each row is its class label
    Returns:
        shannonEnt - entropy in bits (0.0 when dataSet is empty)
    """
    numEntries = len(dataSet)
    labelCounts = {}                      # class label -> number of rows
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries  # relative frequency of this label
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def createDataSet():
    """Create the sample loan data set used by the demo.

    Parameters:
        none
    Returns:
        dataSet - list of rows, four integer-coded features followed by 'yes'/'no'
        labels - feature names, one per feature column
    """
    dataSet = [
        [0, 0, 0, 0, 'no'],
        [0, 0, 0, 1, 'no'],
        [0, 1, 0, 1, 'yes'],
        [0, 1, 1, 0, 'yes'],
        [0, 0, 0, 0, 'no'],
        [1, 0, 0, 0, 'no'],
        [1, 0, 0, 1, 'no'],
        [1, 1, 1, 1, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [2, 0, 1, 2, 'yes'],
        [2, 0, 1, 1, 'yes'],
        [2, 1, 0, 1, 'yes'],
        [2, 1, 0, 2, 'yes'],
        [2, 0, 0, 0, 'no'],
    ]
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature `axis` equals `value`, dropping that column.

    Parameters:
        dataSet - data set to split
        axis - index of the feature column to split on
        value - feature value a row must have to be kept
    Returns:
        retDataSet - new list of new rows with column `axis` removed
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]


def chooseBestFeatureToSplit(dataSet):
    """Pick the feature with the largest information gain.

    Parameters:
        dataSet - data set (last column is the class label)
    Returns:
        bestFeature - index of the feature with the largest information gain,
                      or -1 when no feature has a strictly positive gain
    """
    numFeatures = len(dataSet[0]) - 1     # last column is the label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set([example[i] for example in dataSet])
        newEntropy = 0.0                  # conditional entropy after splitting on i
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    Parameters:
        classList - list of class labels
    Returns:
        the most frequent label; among ties, the one seen first
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # sorted() is stable, also with reverse=True, so ties keep first-seen order.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels, featLabels):
    """Recursively build an ID3 decision tree.

    Parameters:
        dataSet - training rows (last column is the class label)
        labels - feature names matching the feature columns of dataSet;
                 this list is not modified
        featLabels - output list; the label of every feature chosen for a
                     split is appended to it, in the order the splits are made
    Returns:
        myTree - nested dict {featureLabel: {featureValue: subtree-or-class}},
                 or a bare class label when no split is made
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):   # all rows share one class
        return classList[0]
    if len(dataSet[0]) == 1:                              # no feature columns left
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # No feature reduces entropy; indexing with -1 would split on the
        # class-label column, so fall back to the majority class instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    featLabels.append(bestFeatLabel)
    myTree = {bestFeatLabel: {}}
    # Build a fresh label list for the children rather than deleting from the
    # shared one: an in-place delete made while building one branch would
    # leave sibling branches (and the caller) with misaligned labels.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = set([example[bestFeat] for example in dataSet])
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, featLabels)
    return myTree


if __name__ == '__main__':
    dataSet, labels = createDataSet()
    featLabels = []
    myTree = createTree(dataSet, labels, featLabels)
    print(myTree)

递归创建决策树时，递归有两个终止条件：第一个停止条件是所有的类标签完全相同，则直接返回该类标签；第二个停止条件是使用完了所有特征，仍然不能将数据划分为仅包含唯一类别的分组，即决策树构建失败，特征不够用。此时说明数据维度不够，由于第二个停止条件无法简单地返回唯一的类标签，这里挑选出现数量最多的类别作为返回值。

依靠训练数据构造了决策树之后，我们可以将它用于实际数据的分类。在执行数据分类时，需要决策树以及用于构造树的标签向量。然后，程序比较测试数据与决策树上的数值，递归执行该过程直到进入叶子结点；最后将测试数据定义为叶子结点所属的类型。在构建决策树的代码中可以看到，有个featLabels参数。它是用来干什么的？它就是用来记录各个分类结点的，在用决策树做预测的时候，我们按顺序输入需要的分类结点的属性值即可。举个例子，比如我用上述已经训练好的决策树做分类，那么我只需要提供这个人是否有房子、是否有工作这两个信息即可，无需提供冗余的信息。代码如下：

# -*- coding: UTF-8 -*-
from math import log
import operator


def calcShannonEnt(dataSet):
    """Compute the empirical (Shannon) entropy of the class labels in a data set.

    Parameters:
        dataSet - list of rows; the last element of each row is its class label
    Returns:
        shannonEnt - entropy in bits (0.0 when dataSet is empty)
    """
    numEntries = len(dataSet)
    labelCounts = {}                      # class label -> number of rows
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries  # relative frequency of this label
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def createDataSet():
    """Create the sample loan data set used by the demo.

    Parameters:
        none
    Returns:
        dataSet - list of rows, four integer-coded features followed by 'yes'/'no'
        labels - feature names, one per feature column
    """
    dataSet = [
        [0, 0, 0, 0, 'no'],
        [0, 0, 0, 1, 'no'],
        [0, 1, 0, 1, 'yes'],
        [0, 1, 1, 0, 'yes'],
        [0, 0, 0, 0, 'no'],
        [1, 0, 0, 0, 'no'],
        [1, 0, 0, 1, 'no'],
        [1, 1, 1, 1, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [2, 0, 1, 2, 'yes'],
        [2, 0, 1, 1, 'yes'],
        [2, 1, 0, 1, 'yes'],
        [2, 1, 0, 2, 'yes'],
        [2, 0, 0, 0, 'no'],
    ]
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature `axis` equals `value`, dropping that column.

    Parameters:
        dataSet - data set to split
        axis - index of the feature column to split on
        value - feature value a row must have to be kept
    Returns:
        retDataSet - new list of new rows with column `axis` removed
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]


def chooseBestFeatureToSplit(dataSet):
    """Pick the feature with the largest information gain.

    Parameters:
        dataSet - data set (last column is the class label)
    Returns:
        bestFeature - index of the feature with the largest information gain,
                      or -1 when no feature has a strictly positive gain
    """
    numFeatures = len(dataSet[0]) - 1     # last column is the label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set([example[i] for example in dataSet])
        newEntropy = 0.0                  # conditional entropy after splitting on i
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    Parameters:
        classList - list of class labels
    Returns:
        the most frequent label; among ties, the one seen first
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # sorted() is stable, also with reverse=True, so ties keep first-seen order.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels, featLabels):
    """Recursively build an ID3 decision tree.

    Parameters:
        dataSet - training rows (last column is the class label)
        labels - feature names matching the feature columns of dataSet;
                 this list is not modified
        featLabels - output list; the label of every feature chosen for a
                     split is appended to it, in the order the splits are made
    Returns:
        myTree - nested dict {featureLabel: {featureValue: subtree-or-class}},
                 or a bare class label when no split is made
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):   # all rows share one class
        return classList[0]
    if len(dataSet[0]) == 1:                              # no feature columns left
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # No feature reduces entropy; indexing with -1 would split on the
        # class-label column, so fall back to the majority class instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    featLabels.append(bestFeatLabel)
    myTree = {bestFeatLabel: {}}
    # Build a fresh label list for the children rather than deleting from the
    # shared one: an in-place delete made while building one branch would
    # leave sibling branches (and the caller) with misaligned labels.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = set([example[bestFeat] for example in dataSet])
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, featLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Classify one sample by walking a decision tree built by createTree.

    Parameters:
        inputTree - decision tree (nested dict) to walk
        featLabels - labels of the features chosen while building the tree;
                     a label's position in this list is the position of that
                     feature's value in testVec
        testVec - feature values of the sample, ordered like featLabels
    Returns:
        classLabel - class stored in the leaf that the sample reaches
    Raises:
        ValueError - if the sample's value for a node's feature matches no
                     branch of that node (or the node's label is not in
                     featLabels)
    """
    firstStr = next(iter(inputTree))          # feature label tested at this node
    secondDict = inputTree[firstStr]          # feature value -> subtree or class
    featIndex = featLabels.index(firstStr)
    for key, subtree in secondDict.items():
        if testVec[featIndex] == key:
            if isinstance(subtree, dict):     # internal node: keep descending
                return classify(subtree, featLabels, testVec)
            return subtree                    # leaf: the class label
    # Previously this fell through to an unbound local variable.
    raise ValueError(
        'no branch for value %r of feature %r' % (testVec[featIndex], firstStr))


if __name__ == '__main__':
    dataSet, labels = createDataSet()
    featLabels = []
    myTree = createTree(dataSet, labels, featLabels)
    testVec = [0, 1]                          # sample to classify
    result = classify(myTree, featLabels, testVec)
    if result == 'yes':
        print('放贷')
    if result == 'no':
        print('不放贷')

阅读全文

0 0