Decision Trees

1. Decision Tree Principles

Entropy measures the uncertainty of the information in a data set. A decision tree is built by repeatedly choosing the feature whose split yields the largest reduction in entropy (the information gain), partitioning the data on that feature, and recursing into each branch. Recursion stops when all samples in a node belong to the same class, or when no features remain to split on, in which case the node is labeled with the majority class.
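In symbols (standard ID3 notation, added here for clarity; D is the data set, p_k the fraction of samples in class k, and D_v the subset of D where feature A takes value v):

H(D) = -\sum_{k} p_k \log_2 p_k

\mathrm{Gain}(D, A) = H(D) - \sum_{v} \frac{|D_v|}{|D|} H(D_v)

At each split, the feature A with the largest Gain(D, A) is chosen.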

2. Python Implementation

from math import log
import operator

# Compute the Shannon entropy of a data set (the class label is the last column)
def calEntropy(dataSet):
    n = len(dataSet)
    labels = {}
    for featVec in dataSet:
        label = featVec[-1]
        if label not in labels:
            labels[label] = 0
        labels[label] += 1
    entropy = 0.0
    for key in labels:
        prob = float(labels[key]) / n
        entropy -= prob * log(prob, 2)
    return entropy

def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    # change to discrete values
    return dataSet, labels

# Return the rows whose feature `axis` equals `value`, with that column removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Choose the feature with the highest information gain
def chooseBestFeat(dataSet):
    numFeat = len(dataSet[0]) - 1
    baseEntropy = calEntropy(dataSet)
    bestInfoGain = 0.0
    bestFeat = -1
    for i in range(numFeat):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calEntropy(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeat = i
    return bestFeat

# Majority vote: return the most common class label
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClass = sorted(classCount.items(),
                         key=operator.itemgetter(1), reverse=True)
    return sortedClass[0][0]

# Recursively build the decision tree as nested dicts
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]            # all samples share one class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFeat = chooseBestFeat(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Classify a test vector by walking the tree
def classify(inputTree, featLabels, test):
    firstStr = list(inputTree.keys())[0]  # Python 3: dict keys are not indexable
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None
    for key in secondDict:
        if test[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, test)
            else:
                classLabel = secondDict[key]
    return classLabel
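A minimal usage sketch (not part of the original post, assuming the functions above are in the same file): build the tree from the toy data set and classify one sample. Since createTree deletes entries from the label list it receives, a copy is passed in and the original list is kept for classify.

dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels[:])  # pass a copy: createTree mutates its label list
print(myTree)
# -> {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(classify(myTree, labels, [1, 0]))  # surfaces but has no flippers -> 'no'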

