[Data Mining] Decision Trees: the ID3 Algorithm


In a decision tree, the key question is how to measure how ordered (or mixed) a dataset is. The guiding rule for splitting a dataset is to make disordered data more ordered, and the change in information before and after a split is called the information gain. Shannon entropy is the expected value of the information: for a dataset whose class label takes the value x_i with probability p(x_i), the entropy is H = -Σ p(x_i) * log2 p(x_i). The larger the entropy, the more mixed the data. Gini impurity is another common measure of how disordered a set is.
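To make the numbers concrete, here is a minimal sketch of an information-gain calculation done by hand. The 10-sample split and the throwaway entropy(counts) helper are made up purely for illustration and are not part of the article's code:

import math

def entropy(counts):
    """Shannon entropy of a class distribution given as raw counts."""
    total = float(sum(counts))
    return -sum((c / total) * math.log(c / total, 2) for c in counts if c)

# 10 samples, 5 positive / 5 negative: maximum disorder for two classes
before = entropy([5, 5])                      # 1.0
# suppose splitting on some feature produces subsets of 6 and 4 samples
after = 0.6 * entropy([4, 2]) + 0.4 * entropy([1, 3])
gain = before - after                         # information gain of this split
print(before, after, gain)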

import math

def calc_entropy(dataset):
    """Compute the Shannon entropy of a dataset (class label in the last column)."""
    lines = len(dataset)
    labels = {}
    for curvect in dataset:
        curlabel = curvect[-1]  # class label
        labels.setdefault(curlabel, 0)
        labels[curlabel] += 1
    shannonEntropy = 0.0
    for key in labels:
        prob = float(labels[key]) / lines
        shannonEntropy -= prob * math.log(prob, 2)
    return shannonEntropy
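A quick usage sketch, assuming a small made-up dataset with two binary features and the class label in the last column (the data itself is only for illustration):

toy_data = [[1, 1, 'yes'],
            [1, 1, 'yes'],
            [1, 0, 'no'],
            [0, 1, 'no'],
            [0, 1, 'no']]

print(calc_entropy(toy_data))  # about 0.971 for a 2-vs-3 class split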

# Split the dataset: select the rows of dataset where feature cIndex equals cValue,
# and return those rows with the cIndex column removed.
def splitDataset(dataset, cIndex, cValue):
    rset = []
    for featureset in dataset:
        if featureset[cIndex] == cValue:
            tmpset = featureset[:cIndex]
            tmpset.extend(featureset[cIndex+1:])
            rset.append(tmpset)
    return rset

# Core idea of ID3: iterate over every feature and pick the one whose split
# yields the largest information gain.
def choose_best_feature(dataset):
    features = len(dataset[0]) - 1
    baseEntropy = calc_entropy(dataset)
    bestFeature = -1
    bestGain = 0.0
    for i in range(features):
        # collect the distinct values of feature i
        featurelist = [example[i] for example in dataset]
        uniqfeatureval = set(featurelist)
        newEntropy = 0.0
        # weighted entropy after splitting on feature i, then the information gain
        for val in uniqfeatureval:
            subdataset = splitDataset(dataset, i, val)
            prob = len(subdataset) / float(len(dataset))
            newEntropy += prob * calc_entropy(subdataset)
        infoGain = baseEntropy - newEntropy
        # keep the feature with the largest gain seen so far
        if infoGain > bestGain:
            bestGain = infoGain
            bestFeature = i
    return bestFeature
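Continuing with the same hypothetical toy_data from above, splitting and feature selection behave like this:

# keep only the rows with feature 0 == 1, dropping the feature-0 column
print(splitDataset(toy_data, 0, 1))   # [[1, 'yes'], [1, 'yes'], [0, 'no']]

# index of the feature with the largest information gain on toy_data
print(choose_best_feature(toy_data))  # 0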

import operator

def majorityCnt(classList):
    """Return the class label that occurs most often in classList."""
    classCount = {}
    for vote in classList:
        classCount.setdefault(vote, 0)
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
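A tiny usage check (the votes are made up):

print(majorityCnt(['yes', 'no', 'yes']))  # 'yes'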

# Build the decision tree recursively. Recursion stops when:
# 1) all samples in the current split share the same class, or
# 2) no features are left, in which case the majority class is returned.
def createTree(dataset, labels):
    classList = [example[-1] for example in dataset]
    # all samples belong to the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # only the class column remains: fall back to a majority vote
    if len(dataset[0]) == 1:
        return majorityCnt(classList)
    bestFeature = choose_best_feature(dataset)
    bestLabel = labels[bestFeature]
    myTree = {bestLabel: {}}
    del labels[bestFeature]
    # recursively build a subtree for every value of the chosen feature
    featureVals = [example[bestFeature] for example in dataset]
    uniqFeatureVals = set(featureVals)
    for value in uniqFeatureVals:
        subLabels = labels[:]  # copy so recursion does not modify the caller's list
        subdataset = splitDataset(dataset, bestFeature, value)
        myTree[bestLabel][value] = createTree(subdataset, subLabels)
    return myTree
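An end-to-end sketch on the hypothetical toy_data from above; the feature names are invented for illustration, and a copy of the label list is passed because createTree deletes entries from it:

feature_names = ['no surfacing', 'flippers']   # made-up names for the two features
tree = createTree(toy_data, feature_names[:])  # pass a copy of the label list
print(tree)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}  (key order may vary)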