机器学习之自己实现决策树

来源：互联网发布：linux 删除路由编辑：程序博客网时间：2024/06/05 15:15

import pandas
import numpy as np

# Column headers are supplied through ``names``; pandas then treats the first
# line of the file as data rather than as a header row.
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "high_income",
]

# Raw string: the path previously mixed "\\" with the bare "\m" and "\i",
# which are invalid escape sequences. The resulting path is unchanged.
income = pandas.read_csv(r"D:\test\machineLearning\income.csv", names=columns)
print(income.head(2))

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13
1   50   Self-emp-not-inc   83311   Bachelors             13

        marital_status        occupation    relationship    race    sex  \
0        Never-married      Adm-clerical   Not-in-family   White   Male
1   Married-civ-spouse   Exec-managerial         Husband   White   Male

   capital_gain  capital_loss  hours_per_week  native_country high_income
0          2174             0              40   United-States       <=50K
1             0             0              13   United-States       <=50K

# List the distinct values that occur in the "workclass" column.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(income["workclass"].unique())

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov' ' ?' ' Self-emp-inc']

# Convert the "workclass" strings into a categorical so that every distinct
# value gets an integer code. The Categorical constructor is used because
# Categorical.from_array has been removed from pandas.
col = pandas.Categorical(income["workclass"])
print(col)
# Show the integer code assigned to each row.
print(col.codes)

[State-gov, Self-emp-not-inc, Private, Private, Private, ..., Private, Private, Private, Self-emp-not-inc, Private]Length: 629Categories (7, object): [?, Federal-gov, Local-gov, Private, Self-emp-inc, Self-emp-not-inc, State-gov][6 5 3 3 3 3 3 5 3 3 3 6 3 3 3 3 5 3 3 5 3 3 1 3 3 2 3 0 3 3 2 3 3 1 6 3 3 3 3 5 3 5 3 3 3 1 3 3 6 3 3 3 3 1 4 3 3 3 3 3 3 0 3 3 3 3 3 3 4 0 3 3 5 3 3 3 3 0 3 2 3 3 3 3 3 3 2 3 3 1 3 3 3 3 2 2 5 3 3 1 3 3 5 3 3 4 0 3 2 3 3 3 5 3 3 3 4 2 3 3 3 3 3 6 3 3 3 3 0 3 3 3 5 3 3 1 5 3 3 3 4 3 3 3 3 3 3 3 3 0 2 3 3 3 0 3 3 5 3 3 0 3 4 3 5 3 1 6 3 2 3 6 3 3 6 3 3 3 3 3 2 3 3 3 1 3 5 0 3 6 3 3 2 3 1 3 3 1 3 5 3 0 3 2 6 3 3 3 4 3 2 3 3 3 4 3 3 3 3 3 3 0 3 3 2 3 0 5 3 3 3 3 3 3 3 6 6 3 2 3 3 5 6 0 3 3 3 3 3 3 3 3 2 2 3 5 3 3 3 3 3 3 3 3 3 4 0 3 3 3 3 3 5 2 3 3 3 3 3 3 3 2 3 3 3 1 3 3 3 3 3 3 2 3 3 3 4 0 3 3 3 3 3 3 3 3 3 3 3 4 3 3 0 5 3 3 5 3 3 3 5 3 3 3 5 5 0 3 5 3 3 3 3 2 3 3 3 3 3 3 3 3 6 3 6 5 0 0 3 3 3 3 3 2 0 1 3 3 3 5 3 3 3 3 3 3 5 3 3 3 1 3 6 3 3 2 3 3 5 3 3 3 3 3 2 3 3 3 3 3 3 3 5 6 3 3 3 0 3 4 3 3 6 3 3 3 3 5 0 3 2 3 3 3 6 3 3 3 2 6 3 3 6 3 3 3 1 3 4 2 0 0 5 3 3 3 2 2 3 2 3 3 1 3 3 3 3 3 3 0 3 4 3 3 5 3 3 3 3 0 3 2 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 5 6 3 0 3 0 2 2 3 3 1 3 3 3 3 3 3 3 0 2 2 3 3 3 3 3 3 2 3 3 0 3 3 3 0 3 0 0 3 4 3 2 2 3 3 3 3 5 5 3 3 3 3 3 2 3 3 4 0 3 3 3 3 3 3 3 2 3 3 5 5 3 3 1 3 3 3 3 3 3 3 3 3 5 3 4 3 5 3 3 3 3 3 3 3 0 5 2 5 0 3 3 3 5 3 3 3 3 4 0 0 3 3 3 3 0 5 3 3 3 3 3 2 5 3 3 3 2 3 3 3 3 2 3 3 3 5 4 3 3 3 3 3 3 3 3 5 3]

# Replace the "workclass" strings with the integer codes computed for them.
income["workclass"] = col.codes

# Encode the remaining text columns the same way. The Categorical constructor
# is used because Categorical.from_array has been removed from pandas.
for name in ["education", "marital_status", "occupation", "relationship",
             "race", "sex", "native_country", "high_income"]:
    col = pandas.Categorical(income[name])
    income[name] = col.codes

# Integer code that the categorical encoding gives to " Private".
# In the 7-category listing printed for this sample the order is
# [?, Federal-gov, Local-gov, Private, Self-emp-inc, ...], so " Private" is
# code 3. The snippet used to compare against 4, which is " Self-emp-inc"
# here, so "private_income" did not hold the private-sector rows (the sample
# output shown after this snippet was produced with the old value 4).
# NOTE(review): the code depends on which categories occur in the data;
# re-check it whenever the CSV changes.
PRIVATE_WORKCLASS_CODE = 3

private_income = income[income["workclass"] == PRIVATE_WORKCLASS_CODE]
public_income = income[income["workclass"] != PRIVATE_WORKCLASS_CODE]
print(private_income.head(2))
print(public_income.head(2))

    age  workclass  fnlwgt  education  education_num  marital_status  \
54   47          4  109832         11              9               0
68   49          4  191681         15             10               2

    occupation  relationship  race  sex  capital_gain  capital_loss  \
54           4             1     4    1             0             0
68           4             0     4    1             0             0

    hours_per_week  native_country  high_income
54              60              26            0
68              50              26            1
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          6   77516          9             13               4
1   50          5   83311          9             13               2

   occupation  relationship  race  sex  capital_gain  capital_loss  \
0           1             1     4    1          2174             0
1           4             0     4    1             0             0

   hours_per_week  native_country  high_income
0              40              26            0
1              13              26            0

import math
import numpy as np


def calc_entropy(column):
    """Return the Shannon entropy (base 2) of a sequence of class labels.

    Args:
        column: Sequence of non-negative integer labels (list, NumPy array
            or pandas Series).

    Returns:
        The entropy in bits as a float; 0.0 for an empty sequence or when
        only one label occurs.

    Raises:
        ValueError: Propagated from ``np.bincount`` if a label is negative.
    """
    # An empty plain list cannot be passed to np.bincount (it is inferred as
    # a float array), and there is nothing to measure anyway.
    if len(column) == 0:
        return 0.0

    # counts[i] is the number of times label i occurs.
    counts = np.bincount(column)
    # Dividing by the total turns the counts into probabilities.
    probabilities = counts / float(len(column))

    # Labels that never occur have probability 0; log(0) is undefined, so
    # they are skipped (they contribute nothing to the entropy).
    total = sum(p * math.log(p, 2) for p in probabilities if p > 0)

    # total is <= 0; subtracting from 0.0 negates it without producing -0.0.
    return 0.0 - float(total)


calc_entropy([1, 1, 0, 0, 1])

0.97095059445466858

# Information gain obtained by splitting the rows on the median age.
high_entropy = calc_entropy(income["high_income"])

# Rows at or below the median go left, the rest go right.
median_age = income["age"].median()
left_age = income[income["age"] <= median_age]
right_age = income[income["age"] > median_age]

# float() forces true division. With the Python 2 print statements used in
# this post, int / int truncates, so both weights became 0 and the "gain"
# collapsed to the unsplit entropy. That is presumably why the output
# published for this snippet (0.756...) differs from the 0.0501... that
# calc_information_gain reports for the same split.
total_rows = float(len(income))
info_gain = high_entropy - (
    left_age.shape[0] / total_rows * calc_entropy(left_age["high_income"])
    + right_age.shape[0] / total_rows * calc_entropy(right_age["high_income"])
)
print(info_gain)

0.756141116271

# np.bincount counts how often each non-negative integer occurs, starting at
# 0: here 0 appears twice, 1 three times, 2 never, 3 twice, and so on.
print(np.bincount([1, 1, 0, 0, 1, 3, 3, 5]))

[2 3 0 2 0 1]

def calc_information_gain(data, split_name, target_name):
    """Return the information gain of a median split on one column.

    The rows of ``data`` are divided into those whose ``split_name`` value is
    at or below the column median and those above it; the gain is the entropy
    of ``target_name`` minus the size-weighted entropy of the two halves.

    Args:
        data: DataFrame containing both columns.
        split_name: Name of the numeric column to split on.
        target_name: Name of the integer-coded label column.

    Returns:
        The information gain; 0.0 when ``data`` has no rows.
    """
    # Work on the ``data`` argument. The previous version read the global
    # ``income`` frame throughout, so the parameter had no effect.
    total_rows = float(len(data))
    if total_rows == 0:
        return 0.0

    original_entropy = calc_entropy(data[target_name])

    median = data[split_name].median()
    left = data[data[split_name] <= median]
    right = data[data[split_name] > median]

    # Weight each half's entropy by its share of the rows.
    to_subtract = 0
    for subset in (left, right):
        weight = subset.shape[0] / total_rows
        to_subtract += weight * calc_entropy(subset[target_name])
    return original_entropy - to_subtract


gain = calc_information_gain(income, "age", "high_income")
print(gain)

columns = ["age", "workclass", "education_num", "marital_status",
           "occupation", "relationship", "race", "sex", "hours_per_week",
           "native_country"]
information_gains = []
for sub in columns:
    gain = calc_information_gain(income, sub, "high_income")
    information_gains.append(gain)

# Pick the largest of all the information gains.
index = information_gains.index(max(information_gains))
print(information_gains[index])

0.0501271848501
0.125533153029

def find_best_column(data, target_name, columns):
    """Return the largest information gain among the candidate columns.

    Note that, despite the name, the value returned is the gain itself, not
    the name of the column that achieves it; this is kept for compatibility
    with existing callers.

    Args:
        data: DataFrame containing the target and candidate columns.
        target_name: Name of the integer-coded label column.
        columns: Names of the columns to evaluate as split candidates.

    Returns:
        The maximum of ``calc_information_gain`` over ``columns``.

    Raises:
        ValueError: If ``columns`` is empty.
    """
    # Use a list local to this call. Appending to a module-level list let
    # gains from earlier calls (possibly on other data) leak into the max.
    gains = [
        calc_information_gain(data, name, target_name) for name in columns
    ]
    return max(gains)


find_best_column(income, "high_income", columns)

0.12553315302923063

0 0