gbdt源码阅读
来源:互联网 发布:linux nbtscan 编辑:程序博客网 时间:2024/06/06 07:37
RegressionTree.
main函数:训练样本,最大深度,叶子节点最小样本数,特征采样,数据采样。加载训练样本,利用样本构建regressiontree,对训练样本(这里有些问题,实际上应该对样本进行划分)进行预测。
RegressionTree:
fit:选取采样特征,递归构建树,选取切分节点(特征)。
split_node: 选取切分点,如果深度达到最大深度或者切分之后左右树没有样本或者左右孩子样本数量和小于叶子节点最小样本数,返回预测值(预测值是均值)。左右子树递归进行split_node
search_best_split: 选取切分点,选择随机采样特征,按照特征均值划分每个特征,计算被划分后的方差,选择最小方差的特征作为划分点。得到划分的特征id,划分值,划分后的左右节点样本。
predict:eval_instance,对一个样本利用tree进行预测(此处不是训练)。
eval_instance:如果tree无法继续划分,就返回当前节点预测值。如果样本特征值大于划分值,就在右子树递归计算,否则在左子树递归计算。
import numpy as npimport randomclass RegressionTree(): """ A regression tree model for training. """ def __init__(self, max_depth=16, min_leaf_sample_cnt=1, feat_sample_ratio=0.8, data_sample_ratio=0.8): self.max_depth = max_depth self.min_leaf_sample_cnt = min_leaf_sample_cnt self.feat_sample_ratio = feat_sample_ratio self.data_sample_ratio = data_sample_ratio self.feature_cnt = 0 self.feature_id_list = [] self.feature_sampled_cnt = 0 def fit(self, data_set): """ train the regression tree model by given data_set. :param: data_set :return: regression tree """ self.feature_cnt = len(data_set[0]) - 1 self.feature_id_list = range(0, self.feature_cnt) self.feature_sampled_cnt = int(self.feature_cnt * self.feat_sample_ratio) #print self.feature_cnt #print self.feature_id_list #print self.feature_sampled_cnt tree = dict() # split data iters self.split_node(data_set, tree, 0) self.tree = tree return tree def split_node(self, data_set, tree, current_depth): tree['fid'], tree['split_val'], tree['left_data_set'], tree['right_data_set'] = self.search_best_split(data_set) """ print "split feat id:", tree['fid'] print "split feat val:", tree['split_val'] print "left_data_set, size:", tree['left_data_set'][:,0].size print "right_data_set, size:", tree['right_data_set'][:,0].size """ if current_depth == self.max_depth or \ tree['left_data_set'] is None or \ tree['right_data_set'] is None or \ (tree['left_data_set'][:,0].size + tree['right_data_set'][:,0].size) <= self.min_leaf_sample_cnt: tree['predict'] = data_set[:, -1].mean() return None tree['left_node'] = dict() self.split_node(tree['left_data_set'], tree['left_node'], current_depth + 1) tree['right_node'] = dict() self.split_node(tree['right_data_set'], tree['right_node'], current_depth + 1) return None def search_best_split(self, data_set): feature_mean = data_set[:,:-1].mean(0) feature_var = data_set[:,:-1].var(0) #feature_medium = data_set[:,:-1].medium() #print "feature_mean", feature_mean #print "feature_var", 
feature_var sampled_feat_ids = random.sample(self.feature_id_list, self.feature_sampled_cnt) #print "sampled_feat_ids", sampled_feat_ids min_metric = np.inf fid = None split_val = None mask_leaf = [] mask_right = [] for id in sampled_feat_ids: if feature_var[id] == 0: continue m_l = data_set[:, id] <= feature_mean[id] m_r = data_set[:, id] > feature_mean[id] metric = data_set[:, -1][m_l].var() + data_set[:, -1][m_r].var() """ print "m_l size", sum(m_l) print "m_r size", sum(m_r) print "metric", metric """ if min_metric > metric: min_metric = metric fid = id split_val = feature_mean[id] mask_leaf = m_l mask_right = m_r return [ fid, split_val, data_set[mask_leaf,:], data_set[mask_right,:] ] def predict(self, instance): return self.eval_instance(self.tree, instance) def eval_instance(self, cur_node, instance): if not cur_node.has_key("left_node") or not cur_node.has_key("right_node"): return cur_node["predict"] if instance[cur_node['fid']] > cur_node["split_val"]: return self.eval_instance(cur_node["right_node"], instance) else: return self.eval_instance(cur_node["left_node"], instance)def load_data(data_file): data_set = [[float(j) for j in i.rstrip().split(',')] for i in open(data_file).readlines()] return np.array(data_set)def test(data_file_path, model): data_set = load_data(data_file_path) """ print data_set print data_set.shape print np.bincount([ int(i) for i in data_set[:,-1]]) """ correct = float(0) incorrect = float(0) sq_sum_error = float(0) for data in data_set: pv = model.predict(data) if abs(pv - data[-1]) < 0.5: correct += 1 else: incorrect += 1 # rmse sq_sum_error += np.power((pv - data[-1]), 2) rmse = np.sqrt( sq_sum_error / len(data_set[0]) ) accuracy = correct / (incorrect + correct) print "test accuracy is: %s "%(accuracy) print "test rmse is : %s"%(rmse)def main(data_file_path, max_depth, min_leaf_sample_cnt, feat_sample_ratio, data_sample_ratio): data_set = load_data(data_file_path) """ print data_set print data_set.shape print np.bincount([ 
int(i) for i in data_set[:,-1]]) """ model = RegressionTree(max_depth=max_depth, min_leaf_sample_cnt=min_leaf_sample_cnt, feat_sample_ratio=feat_sample_ratio, data_sample_ratio=data_sample_ratio) model.fit(data_set) print "train done!" test(data_file_path, model) print "test done!"if __name__ == "__main__": main('Iris_150_4.txt', 9, 1, 0.9, 0.9)
GBDT:
main:最大深度,叶子节点最小样本数,特征采样,数据采样,shrink,树个数。
load_data:加载训练数据
GBDT-fit:利用数据生成模型trees,每次对残差利用regressiontree建树,新预测结果+=shrink*残差模型预测结果。初始的预测结果是均值。每次建树之后求一次loss,loss用的square loss。
test:利用模型进行预测。
# coding: utf-8
"""Gradient-boosted regression trees with square loss.

Each round fits a RegressionTree to the current residuals; the accumulated
prediction is h_0 + shrink_ratio * sum_i tree_i(x), where h_0 is the mean
training label.
"""
import copy
import json
import os
import sys

import numpy as np

from regression_tree import *


class GBDT():
    """Gradient boosting over RegressionTree base learners."""

    def __init__(self, max_depth=16, min_leaf_sample_cnt=1,
                 feat_sample_ratio=0.8, data_sample_ratio=0.8,
                 shrink_ratio=1, tree_num=10,):
        # The first four parameters are forwarded to each RegressionTree;
        # shrink_ratio is the learning rate, tree_num the boosting rounds.
        self.max_depth = max_depth
        self.min_leaf_sample_cnt = min_leaf_sample_cnt
        self.feat_sample_ratio = feat_sample_ratio
        self.data_sample_ratio = data_sample_ratio
        self.shrink_ratio = shrink_ratio
        self.tree_num = tree_num
        # model[0] is the constant prediction h_0; model[1:] are trees.
        self.model = []
        print(self.max_depth)
        print(self.min_leaf_sample_cnt)
        print(self.feat_sample_ratio)
        print(self.data_sample_ratio)
        print(self.shrink_ratio)
        print(self.tree_num)

    def fit(self, data_set):
        """Fit tree_num boosting rounds on data_set (last column = label).

        The label column of data_set is temporarily overwritten with
        residuals during training and restored before returning.
        """
        data_cnt = len(data_set[:, 0])
        print("data set has %s instance" % (data_cnt))
        # Round 0: constant model = mean label.
        h_0 = data_set[:, -1].mean()
        y_pred_accumulate_buff = [h_0 for _ in range(data_cnt)]
        # Keep the raw labels; data_set[:, -1] is overwritten below.
        y_raw_buff = copy.deepcopy(data_set[:, -1])
        self.model.append(h_0)
        for i in range(self.tree_num):
            loss = 0
            if i != 0:
                # Fold the previous round's tree into the accumulated
                # prediction and report the square loss so far.
                for j in range(data_cnt):
                    s = self.shrink_ratio * self.model[i].predict(data_set[j, :])
                    y_pred_accumulate_buff[j] += s
                    loss += ((y_raw_buff[j] - y_pred_accumulate_buff[j])
                             * (y_raw_buff[j] - y_pred_accumulate_buff[j]))
                print("tid: %d loss: %s " % (i, loss / (2 * data_cnt)))
            # Replace labels with residuals and fit the next tree on them.
            for j in range(data_cnt):
                data_set[j, -1] = y_raw_buff[j] - y_pred_accumulate_buff[j]
            residual_data_set = data_set
            new_tree = RegressionTree(self.max_depth,
                                      self.min_leaf_sample_cnt,
                                      self.feat_sample_ratio,
                                      self.data_sample_ratio)
            new_tree.fit(residual_data_set)
            self.model.append(new_tree)
        print("tree num %s" % (len(self.model)))
        # Recover the raw data_set labels.
        for j in range(data_cnt):
            data_set[j, -1] = y_raw_buff[j]

    def predict(self, instance):
        """Return h_0 + shrink_ratio * sum of tree predictions."""
        score = None
        for i, m in enumerate(self.model):
            if i == 0:
                score = m
            else:
                score += self.shrink_ratio * m.predict(instance)
        return score


def load_data(data_file):
    """Load a CSV file of floats into a 2-D numpy array."""
    # `with` closes the file handle (the original leaked it).
    with open(data_file) as f:
        data_set = [[float(j) for j in line.rstrip().split(',')] for line in f]
    return np.array(data_set)


def test(data_file_path, model):
    """Report accuracy (|error| < 0.5) and RMSE of model on the file."""
    data_set = load_data(data_file_path)
    correct = 0.0
    incorrect = 0.0
    sq_sum_error = 0.0
    # BUG FIX: evaluate every instance; the original only looked at
    # data_set[0:2], a leftover debugging slice.
    for data in data_set:
        pv = model.predict(data)
        if abs(pv - data[-1]) < 0.5:
            correct += 1
        else:
            incorrect += 1
        sq_sum_error += np.power(pv - data[-1], 2)
    # BUG FIX: divide by the number of instances, not len(data_set[0])
    # (the number of columns).
    rmse = np.sqrt(sq_sum_error / len(data_set))
    accuracy = correct / (incorrect + correct)
    print("test accuracy is: %s " % (accuracy))
    print("test rmse is : %s" % (rmse))


def main(data_file_path, max_depth, min_leaf_sample_cnt, feat_sample_ratio,
         data_sample_ratio, shrink_ratio, tree_num):
    """Train a GBDT on the file and evaluate on the same file."""
    data_set = load_data(data_file_path)
    model = GBDT(max_depth=max_depth,
                 min_leaf_sample_cnt=min_leaf_sample_cnt,
                 feat_sample_ratio=feat_sample_ratio,
                 data_sample_ratio=data_sample_ratio,
                 shrink_ratio=shrink_ratio,
                 tree_num=tree_num)
    model.fit(data_set)
    print("train done!")
    test(data_file_path, model)
    print("test done!")


if __name__ == "__main__":
    # main('Jain_373_2.txt', 5, 3, 0.6, 0.6, 0.25, 100)
    main('Iris_150_4.txt', max_depth=2, min_leaf_sample_cnt=2,
         feat_sample_ratio=0.8, data_sample_ratio=0.8,
         shrink_ratio=0.25, tree_num=1000)
GBRank
重新构造损失函数,强化约束条件。
根据pair之间的关系,利用regressiontree每次重训负样例。
# coding: utf-8
"""GBRank: pairwise ranking via gradient-boosted regression trees.

For every pair within a query where the better-labelled result does not
lead by at least `margin`, two corrective training instances are created;
each round fits a RegressionTree to those instances, and scores are the
running average of the scaled tree outputs.
"""
import copy
import json
import os
import random
import sys

import numpy as np

from regression_tree import *


class GBRank():
    """Pairwise ranker built from RegressionTree base learners."""

    def __init__(self, max_depth=5, min_leaf_sample_cnt=1,
                 feat_sample_ratio=1.0, data_sample_ratio=1.0,
                 learning_ratio=0.1, tree_num=10, margin=0.1,):
        # learning_ratio scales each tree's contribution; margin is the
        # minimum score gap required between a better and a worse result.
        self.max_depth = max_depth
        self.min_leaf_sample_cnt = min_leaf_sample_cnt
        self.feat_sample_ratio = feat_sample_ratio
        self.data_sample_ratio = data_sample_ratio
        self.learning_ratio = learning_ratio
        self.tree_num = tree_num
        self.margin = margin
        # model[0] is the constant h_0; model[1:] are trees.
        self.model = []
        self.data_cnt = 0
        print(self.max_depth)
        print(self.min_leaf_sample_cnt)
        print(self.feat_sample_ratio)
        print(self.data_sample_ratio)
        print(self.learning_ratio)
        print(self.tree_num)
        print(self.margin)

    def get_negative_data_set(self, model_id, model, qu_set, h_accumulate):
        """Collect corrective instances for one query's result set.

        First folds `model`'s predictions into the running-average score
        h_accumulate (skipped for round 0, whose model is the constant
        h_0), then emits two corrective instances for every pair where
        the better-labelled result is not ahead by at least `margin`.

        :param model_id: boosting round index (0-based).
        :param model: previous round's model (float h_0 for round 0,
            otherwise a RegressionTree).
        :param qu_set: 2-D array of this query's instances, label last.
        :param h_accumulate: per-instance running-average score; mutated.
        :return: list of corrective instances (plain Python lists).
        """
        neg_data_set = []
        qu_cnt = len(qu_set[:, 0])
        if model_id != 0:
            for u in range(qu_cnt):
                g = self.learning_ratio * model.predict(qu_set[u, :])
                # Running average over the model_id + 1 rounds so far.
                h_accumulate[u] = (model_id * h_accumulate[u] + g) / (model_id + 1)
        for x in range(qu_cnt):
            for y in range(x + 1, qu_cnt):
                if (h_accumulate[x] < h_accumulate[y] + self.margin) and \
                        (qu_set[x, -1] > qu_set[y, -1]):
                    # x should outrank y by at least margin but does not:
                    # push x's target up and y's target down.
                    neg_instance_x = copy.deepcopy(qu_set[x, :])
                    neg_instance_x[-1] = h_accumulate[y] + self.margin
                    neg_instance_y = copy.deepcopy(qu_set[y, :])
                    neg_instance_y[-1] = h_accumulate[x] - self.margin
                    neg_data_set.append(neg_instance_x.tolist())
                    neg_data_set.append(neg_instance_y.tolist())
                # NOTE(review): the symmetric branch (y labelled better
                # than x) was commented out in the original and is kept
                # disabled here:
                # elif (h_accumulate[x] + self.margin > h_accumulate[y]) and \
                #         (qu_set[x, -1] < qu_set[y, -1]):
                #     ... mirror-image corrective instances ...
        return neg_data_set

    def fit(self, data_set):
        """Train on a dict mapping query id -> 2-D array (label last)."""
        h_accumulate = dict()
        h_0 = 0.0
        for q in data_set.keys():
            cur_qu_cnt = len(data_set[q][:, 0])
            if cur_qu_cnt == 0:
                continue
            h_accumulate[q] = [0.0 for _ in range(cur_qu_cnt)]
        self.model.append(h_0)
        for i in range(self.tree_num):
            print("tree id ", i)
            neg_data_set = []
            for q in data_set.keys():
                new_set = self.get_negative_data_set(i, self.model[i],
                                                     data_set[q], h_accumulate[q])
                neg_data_set.extend(new_set)
            neg_data_set = np.array(neg_data_set)
            print(len(neg_data_set[:, 0]))
            new_tree = RegressionTree(self.max_depth,
                                      self.min_leaf_sample_cnt,
                                      self.feat_sample_ratio,
                                      self.data_sample_ratio)
            new_tree.fit(neg_data_set)
            self.model.append(new_tree)
        print("tree num %s" % (len(self.model)))

    def predict(self, instance):
        """Running average of h_0 and the scaled tree scores, matching
        the averaging used during training."""
        score = None
        for i, m in enumerate(self.model):
            if i == 0:
                score = m
            else:
                score = (i * score + self.learning_ratio * m.predict(instance)) / (i + 1)
        return score


def test(data_file_path, model):
    """Report accuracy (|error| < 0.5) and RMSE of model on the file.

    NOTE(review): unused (commented out in main) and inconsistent with
    this module's dict-returning load_data -- it iterates data_set as a
    flat instance array; confirm the intended loader before enabling.
    """
    data_set = load_data(data_file_path)
    correct = 0.0
    incorrect = 0.0
    sq_sum_error = 0.0
    for data in data_set:
        pv = model.predict(data)
        if abs(pv - data[-1]) < 0.5:
            correct += 1
        else:
            incorrect += 1
        sq_sum_error += np.power(pv - data[-1], 2)
    # BUG FIX: divide by the number of instances, not len(data_set[0]).
    rmse = np.sqrt(sq_sum_error / len(data_set))
    accuracy = correct / (incorrect + correct)
    print("test accuracy is: %s " % (accuracy))
    print("test rmse is : %s" % (rmse))


def load_data(data_file):
    """Parse the tab-separated ranking file into a per-query dict.

    NOTE(review): the column layout is taken verbatim from the original --
    features at items[4:90] plus items[91], label items[0], query id
    items[3]; confirm against the data specification.

    :return: dict query id -> 2-D numpy array, label in the last column.
    """
    data_set = {}
    cnt = -1
    with open(data_file) as df:
        for line in df:
            cnt += 1
            if cnt == 0:
                # Skip the header line.
                continue
            items = line.rstrip('\n').rsplit('\t')
            instance = items[4:90]
            instance.append(items[91])
            instance.append(items[0])
            instance = [float(i) for i in instance]
            ins = copy.deepcopy(instance)
            # Group instances by query id (items[3]).
            data_set.setdefault(items[3], []).append(ins)
    for k in data_set.keys():
        data_set[k] = np.array(data_set[k])
    print("query", len(data_set.keys()), "qu", cnt)
    return data_set


def main(data_file_path, max_depth, min_leaf_sample_cnt, feat_sample_ratio,
         data_sample_ratio, learning_ratio, tree_num, margin):
    """Train a GBRank model on the given ranking file."""
    data_set = load_data(data_file_path)
    model = GBRank(max_depth=max_depth,
                   min_leaf_sample_cnt=min_leaf_sample_cnt,
                   feat_sample_ratio=feat_sample_ratio,
                   data_sample_ratio=data_sample_ratio,
                   learning_ratio=learning_ratio,
                   tree_num=tree_num,
                   margin=margin,)
    model.fit(data_set)
    print("train done!")
    # test(data_file_path, model)
    # print("test done!")


if __name__ == "__main__":
    # main('Jain_373_2.txt', 5, 3, 0.6, 0.6, 0.25, 100)
    main('dx_sample_data_1w.txt', max_depth=10, min_leaf_sample_cnt=50,
         feat_sample_ratio=0.6, data_sample_ratio=0.6,
         learning_ratio=1.0, tree_num=300, margin=0.5)
- gbdt源码阅读
- sklearn.GBDT 源码阅读(细节掌握)
- GBDT源码剖析
- gbdt-源码分析
- gbrt(gbdt)源码分享
- GBDT源码分析和注释
- sklearn的GBDT源码笔记
- GBDT
- GBDT
- GBDT
- GBDT
- GBDT
- GBDT
- GBDT
- GBDT
- GBDT
- GBDT
- gbdt
- javascript(js)中的 substring和substr方法
- Ubuntu下内存不足的解决办法
- CSS3新特性学习
- tensorflow16《TensorFlow实战Google深度学习框架》笔记-08-01 RNN前向传播 code
- FZU2150(Fire Game)(枚举+BFS)
- gbdt源码阅读
- StackOverflowError的一种原因及解决办法
- C语言--宏将一个数字的奇数位和偶数位交换
- 2017华中区邀请赛暨武汉大学校赛网络赛小结 + WOJ 642 Lost in WHU
- #7 C. Line (扩展欧几里得)
- 前端-CSS基础知识(一)
- 基本排序算法
- python2.7下utf字符错误带有'u'前缀
- SpringMVC之GET请求参数中文乱码