决策树
来源:互联网 发布:我的世界贴吧手机版js 编辑:程序博客网 时间:2024/05/23 14:24
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""ID3 decision tree (Machine Learning in Action, ch. 3).

Builds a decision tree as nested dicts from a data set whose last column
is the class label, classifies new vectors with it, and persists it via
pickle.
"""
from math import log
import operator
import pickle


# Create the toy data set
def createDataSet():
    """Return the tiny fish data set and its feature-name labels.

    Returns:
        (dataSet, labels): dataSet is a list of rows [feat0, feat1, class];
        labels names the two feature columns.
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


# Compute the Shannon entropy of the given data set
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of dataSet's class column (last column)."""
    numEntries = len(dataSet)  # total number of instances
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # class label is the last column
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # empirical probability
        shannonEnt -= prob * log(prob, 2)            # log base 2
    return shannonEnt


# Partition the data set on one feature value
def splitDataSet(dataSet, axis, value):
    """Return the rows where column `axis` equals `value`, with that column removed.

    Args:
        dataSet: list of rows.
        axis: feature column index to split on.
        value: feature value to select.
    """
    retDataSet = []  # new list; input rows are never mutated
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]          # copy columns before axis
            reducedFeatVec.extend(featVec[axis+1:])  # append columns after axis
            retDataSet.append(reducedFeatVec)
    return retDataSet


# Try every feature and pick the split with the highest information gain
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split maximizes information gain."""
    numFeatures = len(dataSet[0]) - 1        # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)    # entropy before any split
    bestInfoGain = 0.0
    bestFeature = -1  # FIX: was misspelled `baseFeature`, leaving bestFeature unbound
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)           # distinct values of feature i
        newEntropy = 0.0
        for value in uniqueVals:
            # weighted entropy of each partition, summed
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # reduction in disorder
        if infoGain >= bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


# Majority vote over class labels (used when no features remain)
def majorityCnt(classList):
    """Return the most frequent class label in classList.

    FIX: this helper was called by createTree but missing from the source.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Recursively build the tree
def createTree(dataSet, labels):
    """Build a decision tree as nested dicts {featureLabel: {value: subtree}}.

    Args:
        dataSet: list of rows, class label in the last column.
        labels: feature names for the remaining columns (not mutated).
    """
    classList = [example[-1] for example in dataSet]
    # Stop when all remaining instances share one class.
    # FIX: original was classList.count(classList[0] == len(classList)),
    # which counted a boolean and never triggered, producing spurious depth.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop when only the class column is left: majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    remainingLabels = labels[:]   # copy so the caller's list is untouched
    del remainingLabels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        # fresh copy per branch so recursive calls cannot interfere
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), remainingLabels[:])
    return myTree


# Classify a feature vector with a built tree
def classify(inputTree, featLabels, testVec):
    """Walk the tree and return the class label for testVec.

    Returns None if testVec's value has no branch in the tree.
    """
    # FIX: dict.keys()[0] is Python 2 only; views are not indexable in Python 3
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # feature name -> column index
    classLabel = None  # FIX: was unbound when no branch matched testVec
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]  # reached a leaf
    return classLabel


# Persist the tree so classification need not re-learn each time
def storeTree(inputTree, filename):
    """Pickle inputTree to filename."""
    # FIX: pickle requires binary mode; `with` guarantees the file is closed
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load and return a pickled tree from filename."""
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
测试函数 calcShannonEnt
>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> trees.calcShannonEnt(myDat)
0.9709505944546686
测试函数 splitDataSet
>>> splitDataSet(myDat, 0, 1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]
>>> splitDataSet(myDat, 0, 0)
[[1, 'no'], [1, 'no']]
测试函数 chooseBestFeatureToSplit
>>> chooseBestFeatureToSplit(myDat)
0
说明第0个特征是最好的用于划分数据集的特征
测试函数createTree
>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> myTree = trees.createTree(myDat, labels)
>>> myTree
{'no surfacing': {0: {'flippers': {1: 'no'}}, 1: {'flippers': {0: 'no', 1: 'yes'}}}}
测试函数classify
>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> myTree = createTree(myDat, labels)
>>> classify(myTree, labels, [1, 0])
'no'
>>> classify(myTree, labels, [1, 1])
'yes'
测试决策树的存取
>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> myTree = createTree(myDat, labels)
>>> storeTree(myTree, 'classfier.txt')
>>> grabTree('classfier.txt')
{'no surfacing': {0: {'flippers': {1: 'no'}}, 1: {'flippers': {0: 'no', 1: 'yes'}}}}
0 0
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- 决策树
- SQL查询之联结查询和子查询
- objc category的秘密
- java hdu2014青年歌手大奖赛_评委会打分
- 关于string的一段代码
- C++子父类成员之间相互访问的权限分析
- 决策树
- 关于获取imageview上的图片,保存到相册并通知相册自动更新
- git、Gradle、NDK、依赖及补充
- TCP连接的状态详解以及故障排查
- HashMap实现原理分析
- JDK源码解析之List和Set接口
- 文章标题
- dom4j 创建xml
- 深度解读谷歌开源的最精确自然语言解析器SyntaxNet