决策树 ID3 算法的 Python 实现

来源:互联网 发布:如何成为一名淘宝客 编辑:程序博客网 时间:2024/05/16 02:51

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""ID3 decision tree construction on a small in-memory data set.

Every row of a data set is a list whose last element is the class label and
whose preceding elements are discrete feature values.  A tree is represented
with nested dicts: ``{feature_name: {feature_value: subtree_or_class}}``.
"""
from collections import Counter
from math import log


def createDataSET():
    """Return the sample data set and the names of its feature columns.

    :return: ``(dataSet, labels)`` where ``dataSet`` is a list of rows
        ``[feature_0, feature_1, class_label]`` and ``labels`` names the
        feature columns in order.
    """
    dataSet = [[1, 1, "yes"],
               [1, 1, "yes"],
               [1, 0, "no"],
               [0, 1, "no"],
               [0, 1, "no"]]
    labels = ["no surfacing", "flippers"]
    return dataSet, labels


def getBeastFeature(dataSet):
    """Return the index of the feature with the largest information gain.

    The gain of feature ``i`` is the entropy of the class labels minus the
    size-weighted entropy of the class labels in each subset obtained by
    splitting on ``i``.

    :param dataSet: non-empty list of rows; the last column is the class label
    :return: index of the best feature, or ``-1`` when no feature has a
        strictly positive gain (including when there are no feature columns)
    :raises IndexError: if ``dataSet`` is empty
    """
    numFeature = len(dataSet[0]) - 1
    # Features are identified by their column index.
    featherList = list(range(numFeature))
    baseFeather = -1
    basegain = 0.0
    print(" --->feathreList" + str(featherList))
    # The entropy of the class labels is the same for every candidate
    # feature, so compute it once.
    empirical = getEntropy(dataSet, -1)
    # float() keeps the subset weights fractional under Python 2 as well.
    total = float(len(dataSet))
    for i in featherList:
        # Conditional entropy of the class labels given feature i.
        conEntropy = 0.0
        uniqueValus = set(data[i] for data in dataSet)
        print("uniqueValues---->" + str(uniqueValus))
        for value in uniqueValus:
            print("i--->" + str(i) + "---->" + str(value))
            splitData = splitDataSet(dataSet, i, value)
            # Column i has been removed from splitData, so the class label
            # must be addressed from the end of the row, not by index i.
            conEntropy += (len(splitData) / total) * getEntropy(splitData, -1)
        nowgain = empirical - conEntropy
        if nowgain > basegain:
            basegain = nowgain
            baseFeather = i
    return baseFeather


def getEntropy(dataSet, i=-1):
    """Return the Shannon entropy (base 2) of column ``i`` of ``dataSet``.

    :param dataSet: list of rows
    :param i: index of the column whose value distribution is measured;
        the default ``-1`` selects the class-label column
    :return: entropy in bits; ``0.0`` for an empty data set
    """
    dataLen = len(dataSet)
    entropy = 0.0
    # Count each distinct value once; summing per row would add the same
    # term once per occurrence and inflate the result.
    for num_value in Counter(data[i] for data in dataSet).values():
        propotibity = float(num_value) / dataLen
        entropy -= propotibity * log(propotibity, 2)
    return entropy


def splitDataSet(dataSet, i, values):
    """Return the rows whose column ``i`` equals ``values``, without column ``i``.

    :param dataSet: list of rows
    :param i: index of the feature column to split on
    :param values: the feature value a row must have to be kept
    :return: new list of new rows; the input rows are not modified
    """
    splitData = [data[:i] + data[i + 1:] for data in dataSet if data[i] == values]
    print("splitData---->" + str(splitData))
    return splitData


def getMark(dataSet):
    """Return the most frequent class label (last column) in ``dataSet``.

    :param dataSet: non-empty list of rows
    :return: the majority class label
    :raises ValueError: if ``dataSet`` is empty
    """
    if not dataSet:
        raise ValueError("dataSet must not be empty")
    return Counter(data[-1] for data in dataSet).most_common(1)[0][0]


# Tree layout: a dict is an internal node keyed by feature name, whose value
# maps each feature value to a child; a non-dict value is a leaf class.
Tree = {}


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree.

    :param dataSet: non-empty list of rows; the last column is the class label
    :param labels: names of the feature columns, in column order; this list
        is not modified
    :return: a class label when the node is a leaf, otherwise
        ``{feature_name: {feature_value: subtree}}``
    :raises IndexError: if ``dataSet`` is empty
    """
    classList = [example[-1] for example in dataSet]
    # All rows share one class: stop splitting.
    if classList.count(classList[0]) == len(dataSet):
        return classList[0]
    # Only the class column is left: every feature has been used, so label
    # the node with the majority class.
    if len(dataSet[0]) == 1:
        return getMark(dataSet)
    baseFeature = getBeastFeature(dataSet)
    # No feature separates the classes any further; indexing labels with -1
    # would silently pick the last feature, so fall back to the majority.
    if baseFeature < 0:
        return getMark(dataSet)
    baseLabel = labels[baseFeature]
    print("baselabel" + str(baseLabel))
    mytree = {baseLabel: {}}
    # Drop the chosen feature name on a copy so the caller's list survives.
    remaining = labels[:baseFeature] + labels[baseFeature + 1:]
    print("baseFeature--->" + str(baseFeature))
    uniqueValus = set(data[baseFeature] for data in dataSet)
    for values in uniqueValus:
        sublables = remaining[:]
        splitData = splitDataSet(dataSet, baseFeature, values)
        mytree[baseLabel][values] = createTree(splitData, sublables)
        print("sublabels---->>" + str(sublables))
    return mytree


dataSet, labels = createDataSET()
mytree = createTree(dataSet, labels)
print(mytree)

原创粉丝点击