基于R的C4.5决策树的建立

来源：互联网发布：办公室网络组建方案编辑：程序博客网时间：2024/06/06 19:53

下面的代码是一个简单的C4.5决策树创建过程。该决策树不包含剪枝过程，并且所处理的属性必须是标称型属性（即离散的、非连续的属性值）。
# Shannon entropy (base 2) of the class column of a data set.
#
# data:        data frame holding the observations.
# class.index: column index of the class attribute inside `data`.
#
# Returns a single number, -sum(p * log2(p)) over the class proportions p.
# Classes with zero observations (e.g. unused factor levels) contribute 0,
# following the convention 0 * log2(0) = 0. A data set without any rows has
# entropy 0.
entropy <- function(data, class.index) {
  class.counts <- table(as.factor(data[, class.index]))
  n.total <- sum(class.counts)

  # Without this guard, a factor column with zero rows yields 0/0 = NaN
  # proportions, which previously broke the zero-frequency test below.
  if (n.total == 0) {
    return(0)
  }

  class.prob <- as.vector(class.counts) / n.total

  # Drop empty classes instead of patching log2(0) = -Inf afterwards.
  class.prob <- class.prob[class.prob > 0]

  -sum(class.prob * log2(class.prob))
}

# Weighted class entropy of the partition of `data` where the attribute in
# column `attr.index` equals `x`.
#
# x:           one value of the attribute.
# data:        data frame holding the observations.
# attr.index:  column index of the attribute being split on.
# class.index: column index of the class attribute.
#
# Returns (rows in partition / rows in data) * entropy(partition).
subGain <- function(x, data, attr.index, class.index) {
  matching.rows <- which(data[, attr.index] == x)
  partition <- data[matching.rows, ]

  weight <- nrow(partition) / nrow(data)
  weight * entropy(partition, class.index)
}

# Information gain obtained by splitting `data` on a single attribute.
#
# data:        data frame holding the observations.
# attr.index:  column index of the attribute being evaluated.
# class.index: column index of the class attribute.
#
# Returns the class entropy of `data` minus the sum of the weighted class
# entropies (see subGain) of the partitions induced by each distinct value
# of the attribute.
gain <- function(data, attr.index, class.index) {
  distinct.values <- unique(data[, attr.index])

  partition.entropies <- vapply(
    distinct.values,
    subGain,
    numeric(1),
    data = data,
    attr.index = attr.index,
    class.index = class.index,
    USE.NAMES = FALSE
  )

  total.entropy <- entropy(data, class.index)
  total.entropy - sum(partition.entropies)
}

# Split information of an attribute: the entropy of the attribute's own
# value distribution, -sum(p * log2(p)) over the proportion p of rows that
# take each distinct value.
#
# data:        data frame holding the observations.
# attr.index:  column index of the attribute being evaluated.
# class.index: accepted for a uniform signature; not used in the computation.
splitInfo <- function(data, attr.index, class.index) {
  attr.column <- data[, attr.index]
  n.rows <- nrow(data)

  weighted.logs <- vapply(
    unique(attr.column),
    function(level) {
      share <- sum(attr.column == level, na.rm = TRUE) / n.rows
      share * log2(share)
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  -sum(weighted.logs)
}

# Gain ratio of one attribute: information gain divided by split information.
#
# x:           column index of the attribute to evaluate (kept as the first
#              argument so the function can be mapped over attribute indices).
# data:        data frame holding the observations.
# class.index: column index of the class attribute.
#
# Returns a single number. When the split information is 0 the attribute has
# only one distinct value, so splitting on it separates nothing and its
# information gain is 0 as well; the ratio is then defined as 0. Returning 1
# here (the largest value a gain ratio can take) would make such a useless
# attribute win the best-split selection.
gainRation <- function(x, data, class.index) {
  attr.index <- x
  split.info <- splitInfo(data, attr.index, class.index)

  # isTRUE() keeps a NaN/NA split information from aborting the `if`.
  if (isTRUE(split.info == 0)) {
    return(0)
  }

  gain(data, attr.index, class.index) / split.info
}

# Integer level code of the most frequent class in `class.values`.
# Ties are resolved in favour of the first level in table order.
.majorityClassCode <- function(class.values) {
  as.numeric(which.max(table(class.values)))
}

# Build an unsplit "TreeNode" with the given class label code and depth.
.newTreeNode <- function(label, level) {
  node <- list(split.vec = NA, split.index = NA, nodes.list = list(),
               split.num = 0, node.label = label,
               node.index = NA, node.val = NA, level = level, node.num = 0)
  class(node) <- "TreeNode"
  node
}

# Recursively create a tree node (and its subtree) for `data`.
#
# data:         data frame with the observations reaching this node.
# tree.max.num: remaining depth budget; when it reaches 0 the children of
#               this node are created as leaves.
# tree.level:   depth of this node (the root is level 0).
# class.index:  column index of the class attribute inside `data`.
# node.index:   name of the attribute the parent split on (NA for the root).
# node.val:     value of that attribute leading to this node (NA for the root).
#
# Returns a list of class "TreeNode" with the fields
#   split.vec   values of the attribute this node splits on (NA for a leaf)
#   split.index name of the attribute this node splits on (NA for a leaf)
#   nodes.list  child nodes, one per entry of split.vec
#   split.num   number of children
#   node.label  level code of the majority class among the node's rows
#   node.index, node.val  see the arguments above
#   level       depth of the node
#   node.num    number of rows in the node (left at 0 for the root)
#
# Stops with an error when the selected attribute is not character, integer
# or factor (continuous attributes are not supported).
createNode <- function(data, tree.max.num, tree.level, class.index,
                       node.index = NA, node.val = NA) {
  # Label codes are positions in the class level set. Fixing the level set
  # once, on the data the root sees, keeps the codes comparable across all
  # nodes: row subsets of a factor column retain the full set of levels,
  # whereas tabulating a character/integer subset would renumber them.
  if (!is.factor(data[, class.index])) {
    data[, class.index] <- as.factor(data[, class.index])
  }

  attr.vec <- seq_len(ncol(data))[-class.index]
  gain.ration.vec <- unlist(lapply(attr.vec, gainRation,
                                   data = data, class.index = class.index))

  # Attribute with the highest gain ratio; the first one wins on ties.
  attr.best.index <- attr.vec[which.max(gain.ration.vec)]
  attr.best.col <- data[, attr.best.index]

  # The tree only supports label (non-continuous) attributes.
  if (!is.character(attr.best.col) && !is.integer(attr.best.col) &&
      !is.factor(attr.best.col)) {
    stop("The values of attributes are characters, integer or factor")
  }

  attr.best.val.vec <- unique(attr.best.col)
  # Row positions belonging to each distinct value of the best attribute.
  result.list <- lapply(attr.best.val.vec,
                        function(x) which(x == attr.best.col))

  node <- .newTreeNode(.majorityClassCode(data[, class.index]), tree.level)

  # Only non-root nodes know which attribute/value led to them.
  if (!is.na(node.val)) {
    node$node.val <- node.val
    node$node.index <- node.index
    node$node.num <- nrow(data)
  }

  # A pure node becomes a leaf.
  if (length(unique(data[, class.index])) == 1) {
    return(node)
  }

  node$split.vec <- as.character(attr.best.val.vec)
  node$split.num <- length(attr.best.val.vec)
  node$split.index <- names(data)[attr.best.index]

  if (length(attr.vec) == 1 || tree.max.num == 0) {
    # No attribute left after this split, or depth budget exhausted: the
    # children are leaves labelled with the majority class of their rows.
    # The class column is read from `data` with the unshifted class.index.
    for (i in seq_along(attr.best.val.vec)) {
      rows <- result.list[[i]]
      node.new <- .newTreeNode(.majorityClassCode(data[rows, class.index]),
                               tree.level + 1)
      node.new$node.index <- node$split.index
      node.new$node.val <- node$split.vec[i]
      node.new$node.num <- length(rows)
      node$nodes.list[[i]] <- node.new
    }
  } else {
    # Remove the used attribute; drop = FALSE keeps a data frame even when
    # few columns remain.
    data.sub <- data[, -attr.best.index, drop = FALSE]

    # Removing a column to the left of the class column shifts its index.
    class.index.sub <- if (attr.best.index < class.index) {
      class.index - 1
    } else {
      class.index
    }

    for (i in seq_along(attr.best.val.vec)) {
      node$nodes.list[[i]] <- createNode(
        data.sub[result.list[[i]], , drop = FALSE],
        tree.max.num - 1, tree.level + 1, class.index.sub,
        node$split.index, node$split.vec[i]
      )
    }
  }

  node
}

# Build a decision tree for `data`.
#
# data:         data frame holding the training observations.
# tree.max.num: depth budget handed to createNode; must be greater than 0.
# class.index:  column index of the class attribute inside `data`.
#
# Returns a list of class "Tree" whose single element `root` is the root
# node produced by createNode at level 0. Stops with an error when
# tree.max.num is not positive.
createTree <- function(data, tree.max.num, class.index) {
  if (tree.max.num <= 0) {
    stop("The height of tree must be more than 0")
  }

  structure(
    list(root = createNode(data, tree.max.num, 0, class.index)),
    class = "Tree"
  )
}

#predict using tree
# Unimplemented placeholder: the body is empty, so every call returns NULL
# and neither argument is used.
# tree.ins: presumably a tree built by createTree -- TODO confirm
# x.ins:    presumably the observation(s) to classify -- TODO confirm
# NOTE(review): the name reads as an S3 `predict` method for class "tree",
# while createTree assigns class "Tree"; confirm whether dispatch is intended.

predict.tree <- function(tree.ins, x.ins){

}

# Print a "TreeNode" and, recursively, its subtree.
#
# x:   a "TreeNode" list; the fields level, split.index, node.index,
#      node.val, node.label, node.num and nodes.list are read.
# ...: further arguments, passed on when printing child nodes.
#
# Each node is indented by three tabs per level. A leaf (split.index is NA)
# prints "<attribute> = <value> :  <label> ( <rows> )"; an inner node prints
# "<attribute> = <value>" followed by its children. Returns `x` invisibly,
# as print methods conventionally do.
print.TreeNode <- function(x, ...) {
  cat(rep("\t\t\t", x$level), sep = "")

  if (is.na(x$split.index)) {
    # The label is stored in `node.label`; there is no `label` field, so
    # reading `x$label` printed nothing.
    cat(x$node.index, "=", x$node.val, ": ", x$node.label,
        "(", x$node.num, ")\n")
  } else {
    cat(x$node.index, "=", x$node.val, "\n")
    for (child in x$nodes.list) {
      print(child, ...)
    }
  }

  invisible(x)
}

# Print a "Tree" by printing every child of its root node.
#
# x:   a "Tree" list whose `root` element is the root node.
# ...: further arguments, passed on when printing the child nodes.
#
# When the root is missing, is not a node list, or has no children, the
# message "The root is empty" is printed and nothing else. Returns `x`
# invisibly, as print methods conventionally do.
print.Tree <- function(x, ...) {
  node.root <- x$root

  # A node is a list, so is.na() on it is a multi-element vector and cannot
  # be used with `||`; test the structure explicitly. Returning early keeps
  # the loop below from indexing into an empty child list.
  if (is.null(node.root) || !is.list(node.root) ||
      length(node.root$nodes.list) == 0) {
    cat("The root is empty\n")
    return(invisible(x))
  }

  for (child in node.root$nodes.list) {
    print(child, ...)
  }

  invisible(x)
}

注：叶子节点的split.index=NA，root节点的node.index=NA和node.val=NA。

0 0