Decision Trees
1. Decision Tree Principles
Entropy measures the uncertainty of information. A decision tree works by, at each node, choosing the feature whose split produces the largest reduction in entropy (the largest information gain). Splitting stops when a subset is pure (all samples belong to one class) or when no features remain, in which case the majority class is taken.
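Concretely, for a data set D in which class k occurs with proportion p_k, the entropy is

H(D) = -Σ_k p_k · log2(p_k)

and the information gain of splitting on a feature A with values v is

Gain(D, A) = H(D) - Σ_v (|D_v| / |D|) · H(D_v)

where D_v is the subset of D for which A = v. At each node the tree picks the feature with the largest Gain(D, A); this is the ID3 criterion implemented below.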
2. Python Implementation
import operator
from math import log

# Compute the Shannon entropy of a data set (class label in the last column)
def calEntropy(dataSet):
    n = len(dataSet)
    labels = {}
    for featVec in dataSet:
        label = featVec[-1]
        if label not in labels:
            labels[label] = 0
        labels[label] += 1
    entropy = 0.0
    for key in labels:
        prob = float(labels[key]) / n
        entropy -= prob * log(prob, 2)
    return entropy

def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # discrete feature names
    return dataSet, labels

# Return the rows where feature `axis` equals `value`, with that column removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Choose the feature with the largest information gain
def chooseBestFeat(dataSet):
    numFeat = len(dataSet[0]) - 1
    baseEntropy = calEntropy(dataSet)
    bestInfoGain = 0.0
    bestFeat = -1
    for i in range(numFeat):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calEntropy(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeat = i
    return bestFeat

# Majority vote among class labels
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClass = sorted(classCount.items(),
                         key=operator.itemgetter(1), reverse=True)
    return sortedClass[0][0]

# Build the decision tree recursively as nested dicts
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]            # all samples share one class: leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFeat = chooseBestFeat(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]          # copy so recursion doesn't clobber siblings
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Classify a test vector by walking the tree
def classify(inputTree, featLabels, test):
    firstStr = list(inputTree.keys())[0]   # Python 3: keys() is not indexable
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None                      # stays None if no branch matches
    for key in secondDict:
        if test[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, test)
            else:
                classLabel = secondDict[key]
    return classLabel
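A quick usage sketch against the toy data set above (expected outputs are shown as comments; note that createTree deletes entries from the labels list it is given, so pass it a copy):

dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels[:])   # copy: createTree mutates its labels argument
print(myTree)   # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(classify(myTree, labels, [1, 0]))   # 'no'
print(classify(myTree, labels, [1, 1]))   # 'yes'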