决策树——实践
来源:互联网 发布:樱井知香痉挛场景 编辑:程序博客网 时间:2024/05/29 13:22
# -*- coding: utf-8 -*-
"""ID3 decision tree: build, classify, and pickle-persist.

(Code from "Machine Learning in Action", ch. 3; cleaned up for Python 3.)
Created on Tue Sep 08 10:10:15 2015
@author: Administrator
"""
import operator  # itemgetter for the majority-vote sort
import pickle
from math import log


def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the class labels (last column) of dataSet."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # get() replaces the original "seed with 0, then always increment" dance
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / float(numEntries)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def createDataSet():
    """Return a toy (dataSet, labels) pair for exercising the tree builder."""
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at index `axis` equals `value`,
    with that feature column removed."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # list + list concatenation instead of slice/extend pair
            retDataSet.append(featVec[:axis] + featVec[axis + 1:])
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain,
    or -1 when no split improves on the base entropy."""
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = {example[i] for example in dataSet}
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most common label in classList (majority vote for leaves
    whose class labels are not unique)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # BUG FIX: dict.iteritems() does not exist in Python 3 -> items()
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    BUG FIX: the original did `del(labels[bestFeat])`, destroying the
    caller's label list and breaking the usual follow-up call
    `classify(tree, labels, ...)`. We now build a reduced copy instead;
    the returned tree is identical.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]            # all samples share one class -> leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left -> majority-vote leaf
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    for value in {example[bestFeat] for example in dataSet}:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Walk the tree to classify testVec; featLabels maps feature names
    to indices in testVec.  Returns None when testVec's feature value has
    no branch in the tree (the original raised NameError in that case)."""
    # BUG FIX: dict.keys() is not indexable in Python 3 -> next(iter(...))
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None  # BUG FIX: was unbound when no branch matched
    for key in secondDict:
        if testVec[featIndex] == key:
            # BUG FIX: original tested type(...)._name_ (single-underscore
            # typo for __name__) which raises AttributeError -> isinstance
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


def storeTree(inputTree, filename):
    """Serialize the tree to `filename` with pickle."""
    # BUG FIX: pickle requires binary mode in Python 3; `with` guarantees
    # the handle closes even if dump() raises.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a pickled tree from `filename`.

    NOTE(review): pickle.load on untrusted files can execute arbitrary
    code — only load trees this program stored itself.
    """
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
0 0
- 决策树——实践
- Python机器学习算法实践——决策树(ID3)
- 决策树系列算法总结——决策树
- 决策树——ID3构建决策树
- 决策树01——决策树的原理
- 决策树02——决策树的构建
- 基于R的数据挖掘方法与实践(3)——决策树分析
- Python-决策树ID3实践
- 决策树(实践)
- 决策树实践学习
- 决策树——理论
- MachineLearning——决策树
- 决策树——中文版
- 人工智能算法—决策树
- 人工智能算法—决策树
- 人工智能算法—决策树
- 人工智能算法—决策树
- Machine Learning—决策树
- java学习笔记参考
- [编程题]记负均正
- 使用docker搭建debian软件源镜像服务器
- Ajax初涉
- Selenium+Python 测试对象(控件)的登录操作
- 决策树——实践
- Struts2注解开发
- android模拟器启动出错
- MAC 设置环境变量path的几种方法
- 理解Python中的装饰器
- Android:自动完成文本框(AutoCompleteTextView、MultiAutoCompleteTextView )
- 六款值得推荐的android(安卓)开源框架简介
- 深入详解SQL中的Null
- 【虫师--系列13】性能测试知多少---性能测试计划