[Python in Action] 3: Decision Trees (Code Walkthrough)


from math import log
import operator

def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing','flippers']
    #change to discrete values
    return dataSet, labels

# Compute the Shannon entropy of a given data set
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the occurrences of each unique class label
        currentLabel = featVec[-1]  # the class label is the last column of each row
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries  # probability of each class
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
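
The entropy computed here is H = -Σ p(k)·log2 p(k), summed over the class labels. As a quick sanity check (assuming, as in the transcripts below, that this module is saved as trees.py):

>>> import trees
>>> myDat,labels=trees.createDataSet()
>>> trees.calcShannonEnt(myDat)
0.9709505944546686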
    
# Split the data set on a given feature: return the rows whose feature at
# position `axis` equals `value`, with that feature column removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
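
For example, splitting the sample data set on feature 0 keeps only the rows whose first value matches and strips that column:

>>> import trees
>>> myDat,labels=trees.createDataSet()
>>> trees.splitDataSet(myDat,0,1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]
>>> trees.splitDataSet(myDat,0,0)
[[1, 'no'], [1, 'no']]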


# Listing 3-3: choosing the best way to split the data set

def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1      #the last column is used for the labels
    baseEntropy = calcShannonEnt(dataSet)  #entropy of the unsplit data set, kept as the baseline
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):           #iterate over all the features (columns)
        featList = [example[i] for example in dataSet]  #list of every row's value for this feature [http://python.jobbole.com/80823/]
        uniqueVals = set(featList)         #build a set from the list to drop duplicate values
        newEntropy = 0.0
        for value in uniqueVals:           #for each unique value of the current feature, split the data set once
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy     #calculate the info gain, i.e. the reduction in entropy
        if (infoGain > bestInfoGain):       #compare this to the best gain so far
            bestInfoGain = infoGain         #if better than the current best, set to best
            bestFeature = i
    return bestFeature                      #returns an integer


Verification (the lines below come from a temporary print of infoGain = baseEntropy - newEntropy added inside the loop):

>>> import trees
>>> myDat,labels=trees.createDataSet()
>>> trees.chooseBestFeatureToSplit(myDat)
0.419973=0.970951-0.550978
0.170951=0.970951-0.800000
0
>>> exit()


3.1.3 Recursively Building the Decision Tree

The two building blocks covered above, measuring the entropy of a data set and splitting a data set, can be viewed as sub-modules for constructing a decision tree. The recursion terminates when either the program has exhausted all attributes available for splitting, or all instances under a branch belong to the same class. (Other decision tree algorithms such as C4.5 and CART are not considered here.)

For now we simply count the number of columns before the algorithm runs to check whether all attributes have been used. If all attributes have been processed but the class labels are still not unique, we use majority voting to decide the classification of that leaf node.

# Majority-vote code: takes a list of class names and returns the name that occurs most often

import operator

def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount: classCount[vote] = 0
        classCount[vote] += 1   # count votes for each class name
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
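
A quick check with a hypothetical list of class names (the labels below are made up for illustration):

>>> import trees
>>> trees.majorityCnt(['yes','no','no','yes','no'])
'no'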


# Listing 3-4: tree-building function

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  #all class labels of the data set (last column)
    if classList.count(classList[0]) == len(classList):  #count() returns how often an element occurs in the list
        return classList[0]  #stop splitting when all of the remaining classes are equal
    if len(dataSet[0]) == 1:  #stop splitting when there are no more features in dataSet (only the class label column remains)
        return majorityCnt(classList)  #fall back to majority voting
    bestFeat = chooseBestFeatureToSplit(dataSet)  #pick the next attribute to split on; returns its column index i
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}  #nested dictionaries represent the tree; leaves look like {feature: {value: class}}
    del(labels[bestFeat])  #remove the attribute already used for splitting (labels holds all attribute names, i.e. dataSet minus the last column)
    featValues = [example[bestFeat] for example in dataSet]  #all values appearing under this attribute
    uniqueVals = set(featValues)  #turn the list into a set to drop duplicate values
    for value in uniqueVals:
        subLabels = labels[:]  #copy all of labels, so recursive calls don't mess up the existing labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)  #recurse
    return myTree
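
Running the builder on the sample data reproduces the first tree stored by retrieveTree(0) below (dictionary key order may differ across Python versions):

>>> import trees
>>> myDat,labels=trees.createDataSet()
>>> trees.createTree(myDat,labels)
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}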


3.2 Plotting the Tree with Matplotlib Annotations

treePlotter.py

import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8")  #sawtooth-shaped box
leafNode = dict(boxstyle="round4", fc="0.8")  #box with four rounded corners
arrow_args = dict(arrowstyle="<-")  #arrow style


def plotNode(nodeTxt, centerPt, parentPt, nodeType):  #drawing helper: plots a tree node
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',  #draw an annotation with an arrow
             xytext=centerPt, textcoords='axes fraction',
             va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def createPlot():
    fig = plt.figure(1, facecolor='white')  #create a new figure window
    fig.clf()  #clear the drawing area
    createPlot.ax1 = plt.subplot(111, frameon=False)  #ticks left on for demo purposes
    plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
    plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)  #plotNode(node text, arrow tip, arrow tail, node box style)
    plt.show()

Test:

>>> import treePlotter
>>> treePlotter.createPlot()

3.2.2 Constructing the Annotation Tree

# Listing 3-6: getting the number of leaf nodes and the number of tree levels

def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]  #the first key of the myTree dictionary (the root feature)
    secondDict = myTree[firstStr]  #starting from the first key, walk all child nodes of the tree
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  #test whether the child node is a dictionary
            numLeafs += getNumLeafs(secondDict[key])  #if so, recurse; otherwise it is a leaf, so add one
        else:   numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  #test whether the nodes are dictionaries; if not, they are leaf nodes
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:   thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth  #once a leaf is reached, return from the recursive call
    return maxDepth

# The following function returns pre-stored tree data, sparing us the trouble of rebuilding the tree from data every time we test the code

def retrieveTree(i):
    listOfTrees = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                   {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
                   ]
    return listOfTrees[i]

Verification:

>>> import treePlotter
>>> treePlotter.retrieveTree(1)
{'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
>>> myTree=treePlotter.retrieveTree(0)
>>> treePlotter.getNumLeafs(myTree)
3
>>> treePlotter.getTreeDepth(myTree)
2