Machine Learning in Action

来源:互联网 发布:淘宝空单号多少钱一个 编辑:程序博客网 时间:2024/05/18 02:51

Classificaton

Chapter 3 Decision Trees

实现环境:

  • Python3.5
  • math
  • Operator

代码块

实现的python代码:
trees.py

from math import logimport operator# claculate entropydef calcShannonEnt(dataSet):    numEntries = len(dataSet)    labelCounts = {}    for featVec in dataSet:        currentLabel = featVec[-1]        if currentLabel not in labelCounts.keys():            labelCounts[currentLabel] = 0        labelCounts[currentLabel] += 1    shannonEnt = 0.0    for key in labelCounts:        prob = float(labelCounts[key])/numEntries        shannonEnt -= prob*log(prob,2)    return shannonEntdef createDataSet():    dataSet = [[1,1,'yes'],               [1,1,'yes'],               [1,0,'no'],               [0,1,'no'],               [0,1,'no']]    labels = ['no surfacing','flippers']    return dataSet, labelsdef splitDataSet(dataSet, axis, value):    retDataSet = []    for featVec in dataSet:        if featVec[axis] == value:            reducedFeatVec = featVec[:axis]            reducedFeatVec.extend(featVec[axis+1:])            retDataSet.append(reducedFeatVec)    return retDataSetdef chooseBestFeatureToSplit(dataSet):    numFeatures = len(dataSet[0]) - 1    baseEntropy = calcShannonEnt(dataSet)    bestInfoGain = 0.0    bestFeature = -1    for i in range(numFeatures):        featList = [example[i] for example in dataSet]        uniqueVals = set(featList)        newEntropy = 0.0        for value in uniqueVals:            subDataSet = splitDataSet(dataSet,i,value)            prob = len(subDataSet)/float(len(dataSet))            newEntropy += prob*calcShannonEnt(subDataSet)        infoGain = baseEntropy - newEntropy        if infoGain > bestInfoGain:            bestInfoGain = infoGain            bestFeature = i    return bestFeaturedef majorityCnt(classList):    classCount = {}    for vote in classList:        if vote not in classCount.keys(): classCount[vote]=0        classCount[vote] += 1    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)    return sortedClassCount[0][0]def createTree(dataSet,labels):    classList = [example[-1] for example in dataSet]    if classList.count(classList[0]) == len(classList):        return classList[0]    if len(dataSet[0]) == 1:        return majorityCnt(classList)    bestFeat = chooseBestFeatureToSplit(dataSet)    bestFeatLabel = labels[bestFeat]    myTree = {bestFeatLabel:{}}    del(labels[bestFeat])    featValues = [example[bestFeat] for example in dataSet]    uniqueVals = set(featValues)    for value in uniqueVals:        subLabels = labels[:]        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet,bestFeat,value),subLabels)    return myTree
原创粉丝点击