决策树分类器+C代码

来源:互联网 发布:php fopen a 编辑:程序博客网 时间:2024/05/18 13:06

关于决策树的理解和计算过程,http://www.tuicool.com/articles/3EZJBz  这篇文章上有很详细的介绍。

就我完成决策树代码的一些步骤进行介绍:(ID3算法)

1. 获取样本,计算样本中各属性的信息增益,选择信息增益最大的属性作为当前分支的根节点;

2. 将被选中的样本属性删除(我这里是将该属性列的值设置为99,即认为删除);

3. 对被选中的属性列中的子属性进行分类:如:在本例代码中,最先被选中的属性为outlook,将outlook中的子属性:sunny,overcast,rain分为三类s1,s2,s3;

4. 对s1,s2,s3分别进行递归,即将分别以s1,s2,s3为样本执行步骤1;

5. 递归结束的标志为:当前样本全为反例,即全为NO,输出NO并return;或 当前样本全为正例,即全为yes,输出yes并return;当前分支结束遍历。


递归算法本身很容易编写,难点在于如何记录它的遍历路径以及每一步所选的子属性。本例代码在路径记录和结果输出上我感觉还不够完善,也未必完全正确,希望大家多提意见,指出并纠正代码的不足。


本代码用例如下:


根据上述表格,对应得到以下数据:

0 0 0 0 0
0 0 0 1 0
1 0 0 0 1
2 1 0 0 1
2 2 1 0 1
2 2 1 1 0
1 2 1 1 1
0 1 0 0 0
0 2 1 0 1
2 1 1 0 1
0 1 1 1 1

1 1 1 1 1
1 0 1 0 1
2 1 0 1 0


C++代码如下(使用了STL的vector,需用C++编译器编译):

#include "stdio.h"
#include "stdlib.h"
#include "math.h"
#include "string.h"
#include "vector"
using namespace std;


// Sentinel value: written into an attribute column to mark it as removed, and
// into path slots to mark them as unused.
#define INF 99
#define dimNum 5  // number of integers per sample row (attribute columns followed by the label column)
typedef vector<int> intVector;
vector<intVector> getFileInf(char *File);   // load the samples from a file
void ID3(vector<intVector> sample);   // entry point of the ID3 decision-tree procedure
intVector getYESorNOnum(vector<intVector> sample);   // count the "go" (yes) and "not go" (no) samples
vector<double> getEntropy(vector<intVector> sample);  // compute the entropy value of every attribute column
int getGainLaber(double HD, vector<double> Entropy, int num);  // index of the column with the largest gain
void Iter(vector<intVector> sample, int laber);  // recursive splitting step
void output();  // print the recorded paths
void save();  // store the current path


// One recorded entry of a decision path.
struct pathInf
{
int att;    // attribute column index recorded for this entry; INF while the slot is unused
int num;    // branch number that was being followed when the entry was recorded
int ID;     // sequence number giving the order in which entries were recorded
int r;      // classification result; save() writes it into the last path slot
};


// Path currently being recorded: filled in by getGainLaber(), flushed and reset by save().
pathInf path[dimNum];
// Next sequence number to hand out to a path entry; save() resets it to 1.
int ID = 1;
// Label of the leaf just reached (0 or 1); Iter() sets it right before calling save().
int result;
typedef vector<pathInf> pathVector;
// Every completed path, in the order save() stored them; read by output().
vector<pathVector> savePath;


// Program entry point: resets the path slots, prints the table header, loads
// the samples from "样本.txt", builds the tree with ID3() and prints the
// recorded paths with output().
//
// Returns 0 on normal completion. getFileInf() and ID3() may terminate the
// process themselves through exit().
int main()
{
    vector<intVector> sample;

    // A writable array instead of `char *` pointing at a string literal: the
    // literal-to-`char *` conversion is not valid C++, while getFileInf()
    // still expects a non-const `char *`.
    char File[] = "样本.txt";

    // Mark every path slot as unused.
    for (int i = 0; i < dimNum - 1; i++)
    {
        path[i].att = INF;
        path[i].num = INF;
        path[i].ID = INF;
    }

    printf("天气  温度  湿度  风力   Y/N  (注:99表示未考虑。)\n");
    sample = getFileInf(File);
    ID3(sample);

    output();

    return 0;
}




// Entry point of the ID3 procedure.
//
// sample: the full training set, one row per sample, label in the last column.
//
// If the set contains only one class there is nothing to split: a message is
// printed and the process exits. Otherwise the entropy of the whole set is
// computed, the column with the largest gain is chosen as the root, and the
// recursive splitting starts with Iter().
void ID3(vector<intVector> sample)
{
    const intVector counts = getYESorNOnum(sample);   // [0] = yes count, [1] = no count

    if (counts.at(0) == 0)
    {
        printf("样本全为不去!!!\n");
        exit(0);
    }

    if (counts.at(1) == 0)
    {
        printf("样本全为去!!!\n");
        exit(0);
    }

    // Entropy of the whole set in bits (log2 expressed through log10).
    const double pYes = (double)counts.at(0) / sample.size();
    const double pNo = (double)counts.at(1) / sample.size();
    const double rootEntropy = -pYes * log10(pYes) / log10(2) - pNo * log10(pNo) / log10(2);

    const vector<double> columnEntropy = getEntropy(sample);

    // Column with the largest gain becomes the root of the tree.
    const int rootColumn = getGainLaber(rootEntropy, columnEntropy, 0);

    Iter(sample, rootColumn);
}




// Entropy in bits of a two-class subset of `total` samples.
// A subset containing a single class has entropy 0; handling that case
// explicitly avoids evaluating 0 * log(0), which yields NaN.
static double iterBinaryEntropy(int yesCount, int noCount, size_t total)
{
    if (yesCount == 0 || noCount == 0)
        return 0.0;

    const double pYes = (double)yesCount / total;
    const double pNo = (double)noCount / total;
    return -pYes * log10(pYes) / log10(2) - pNo * log10(pNo) / log10(2);
}

// Recursive splitting step.
//
// sample: the samples that reached this node.
// laber:  index of the attribute column to split on.
//
// When the subset contains a single class the recursion ends: `result` is set
// to that class (0 = no, 1 = yes) and save() stores the current path.
// Otherwise the rows are grouped by the value (0, 1 or 2) they hold in column
// `laber`, that column is overwritten with INF to mark it as removed, and each
// non-empty group is processed recursively in the order 0, 1, 2.
void Iter(vector<intVector> sample, int laber)
{
    const intVector yesORno = getYESorNOnum(sample);   // [0] = yes count, [1] = no count

    if (yesORno[0] == 0)
    {
        result = 0;
        save();
        return;
    }

    if (yesORno[1] == 0)
    {
        result = 1;
        save();
        return;
    }

    // Partition the rows by the value of the split column. Rows holding any
    // other value (for example INF in an already removed column) belong to no
    // branch.
    const int branchCount = 3;
    vector<intVector> branches[branchCount];
    for (size_t i = 0; i < sample.size(); i++)
    {
        const int value = sample[i][laber];
        if (value < 0 || value >= branchCount)
            continue;

        intVector row = sample[i];
        row[laber] = INF;   // the attribute is consumed on this path
        branches[value].push_back(row);
    }

    for (int value = 0; value < branchCount; value++)
    {
        const vector<intVector> &subset = branches[value];
        if (subset.empty())
            continue;

        const intVector counts = getYESorNOnum(subset);
        const double subsetEntropy = iterBinaryEntropy(counts.at(0), counts.at(1), subset.size());
        const vector<double> columnEntropy = getEntropy(subset);

        // The branch value is passed along so that getGainLaber() can record
        // it on the current path.
        const int nextColumn = getGainLaber(subsetEntropy, columnEntropy, value);

        Iter(subset, nextColumn);
    }
}


//获取增益值
int getGainLaber(double HD, vector<double> Entropy, int num)
{
int i, l=0;
int laber;
double max;
vector<double> Gain;
for(i=0; i<Entropy.size(); i++)
Gain.push_back(HD-Entropy[i]);

max = Gain[0];
for(i=1; i<Gain.size(); i++)
if(max<Gain[i])
{
max = Gain[i];
l=i;
}



for(i=0; i<Entropy.size(); i++)
if(Entropy[i]>10 && path[i].att==INF)
{
path[i].att = i;
path[i].ID = ID;
path[i].num = num;
ID++;
}


return l;
}




// Counts the positive ("go") and negative ("not go") samples.
//
// sample: one row per sample; the label is the last element of each row.
//
// Returns a two-element vector: [0] = number of rows whose label is 1,
// [1] = number of rows with any other label. Rows without any element carry
// no label and are not counted. An empty sample set yields {0, 0}.
//
// The label is read from each row's own last element instead of from the
// width of the first row, so rows of differing length are never indexed out
// of bounds.
std::vector<int> getYESorNOnum(std::vector<std::vector<int> > sample)
{
    int yesNum = 0;
    int noNum = 0;

    for (size_t i = 0; i < sample.size(); i++)
    {
        const std::vector<int> &row = sample[i];
        if (row.empty())
            continue;

        if (row.back() == 1)
            yesNum++;
        else
            noNum++;
    }

    std::vector<int> dst;
    dst.push_back(yesNum);
    dst.push_back(noNum);
    return dst;
}




// Computes one entropy value per attribute column.
//
// sample: one row per sample; the last column is the label (1 = yes, 0 = no),
//         all columns before it are attributes holding the values 0, 1 or 2.
//         The width of the first row defines the column layout.
//
// Returns a vector with one entry per attribute column:
//   - 99 when the first row holds 99 in that column (the column was removed);
//   - otherwise the sum, over the attribute values 0, 1 and 2, of
//     (share of samples having that value) * (label entropy in bits among
//     those samples). A group containing a single class has entropy 0.
// An empty sample set, or a first row without elements, yields an empty
// vector instead of reading out of bounds.
std::vector<double> getEntropy(std::vector<std::vector<int> > sample)
{
    std::vector<double> Entropy;
    if (sample.empty() || sample[0].empty())
        return Entropy;

    const size_t labelCol = sample[0].size() - 1;
    const int valueCount = 3;   // attribute values handled: 0, 1, 2

    for (size_t col = 0; col < labelCol; col++)
    {
        // 99 is the marker the rest of the program writes into removed columns.
        if (sample[0][col] == 99)
        {
            Entropy.push_back(99);
            continue;
        }

        int total[valueCount] = {0, 0, 0};
        int yes[valueCount] = {0, 0, 0};
        int no[valueCount] = {0, 0, 0};

        for (size_t r = 0; r < sample.size(); r++)
        {
            if (sample[r].size() <= labelCol)
                continue;   // row too short to hold this layout

            const int value = sample[r][col];
            if (value < 0 || value >= valueCount)
                continue;

            total[value]++;
            if (sample[r][labelCol] == 1)
                yes[value]++;
            else if (sample[r][labelCol] == 0)
                no[value]++;
        }

        double weighted = 0.0;
        for (int value = 0; value < valueCount; value++)
        {
            double h = 0.0;
            if (yes[value] != 0 && no[value] != 0)
            {
                const double pYes = (double)yes[value] / total[value];
                const double pNo = (double)no[value] / total[value];
                h = -pYes * log10(pYes) / log10(2) - pNo * log10(pNo) / log10(2);
            }
            weighted += ((double)total[value] / sample.size()) * h;
        }

        Entropy.push_back(weighted);
    }

    return Entropy;
}




// Loads the samples from a text file.
//
// File: path of a file containing whitespace-separated integers.
//
// Returns one row per dimNum consecutive integers, in file order. Integers
// left over after the last complete row are discarded.
//
// If the file cannot be opened, a message is printed and the process exits
// with a failure status.
vector<intVector> getFileInf(char *File)
{
    vector<intVector> samples;
    intVector row;
    int value;

    FILE *fp = fopen(File, "r");
    if (fp == NULL)
    {
        printf("Open file error!\n");
        exit(EXIT_FAILURE);
    }

    // Continue only while a conversion succeeds. Comparing against EOF alone
    // would spin forever on a non-numeric token, because fscanf then returns 0
    // without consuming any input.
    while (fscanf(fp, "%d", &value) == 1)
    {
        row.push_back(value);
        if (row.size() == (size_t)dimNum)
        {
            samples.push_back(row);
            row.clear();
        }
    }

    fclose(fp);
    return samples;
}




//保存路径
void save()
{
pathVector temp;
pathInf swp;
int i, j, l;
int min;
ID = 1;


for(i=0; i<dimNum-1; i++)
{
l=i;
min = path[i].ID;
for(j=i+1; j<dimNum-1; j++)
if(min>path[j].ID)
{
min = path[j].ID;
l = j;
}
swp = path[i];
path[i] = path[l];
path[l] = swp;
}


path[dimNum-2].r = result;


for(i=0; i<dimNum-1; i++)
temp.push_back(path[i]);


savePath.push_back(temp);
temp.clear();


for(i=0; i<dimNum-1; i++)
{
path[i].att = INF;
path[i].num = INF;
path[i].ID = INF;
}

}




//结果输出
void output()
{
int i, j;
int root;
int maxPro;
int Pro[dimNum] = {0};  //优先级


intVector temp(dimNum, INF);
vector<intVector> saveResult(savePath.size(), temp);


for(i=0; i<dimNum; i++)
for(j=0; j<dimNum; j++)
saveResult[i][j] = INF;

//计算优先级的大小
for(i=0; i<savePath.size(); i++)
{
for(j=0; j<dimNum-1; j++)
{
if(savePath[i][j].att==INF)
continue;
Pro[savePath[i][j].att] += abs(j-dimNum);  //累加值越大,说明该节点越深
}
}




//训练结果
for(i=0; i<savePath.size(); i++)
{
for(j=0; j<dimNum-1; j++)
{
if(savePath[i][j].att == INF)
break;


if(j==0)
saveResult[i][savePath[i][j].att] = savePath[i][j].num;


if(j>0)
{ //因为程序是递归的,根据节点深度确定程序的遍历顺序和当前节点所处的节点位置,判断当前子树是否遍历完成
if(Pro[savePath[i][j].att] > Pro[savePath[i][j-1].att])
saveResult[i][savePath[i][j].att] = saveResult[i-1][savePath[i][j].att];
else
saveResult[i][savePath[i][j].att] = savePath[i][j].num;
}
}
saveResult[i][dimNum-1] = savePath[i][dimNum-2].r;
}




//结果输出
for(i=0; i<saveResult.size(); i++)
{
for(j=0; j<dimNum; j++)
printf("%4d  ", saveResult[i][j]);
printf("\n");
}
}

运行结果如下:


0 0
原创粉丝点击