XGBoost 脚本学习
来源:互联网 发布:mac炉石传说无法运行 编辑:程序博客网 时间:2024/05/17 07:04
https://www.kaggle.com/happycube/bosch-production-line-performance/scirpus-extreme-bayes-faron-36312/code
说明:这个脚本是训练XGBoost的脚本,值得去学习,特别是XGBoost的一些参数设置
"""Bosch production-line failure prediction with XGBoost.

Kaggle kernel (Scirpus / Faron): builds a small feature set from the huge
Bosch CSVs by chunked reading, adds leave-one-out categorical encodings and
date-based "magic" features, then trains XGBoost with a custom MCC metric.
"""
import gc

import numpy as np
import pandas as pd
import xgboost as xgb
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# StratifiedKFold now lives in sklearn.model_selection.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from operator import itemgetter


# per raddar, all date features except for stations 24+25 are identical
def get_date_features():
    """Return the useful date-column names: 'Id', everything from stations
    24/25, and one representative column per other station (duplicates of a
    station's timestamp are dropped)."""
    directory = '../input/'
    trainfile = 'train_date.csv'
    # Read a single row just to obtain the header.
    for i, chunk in enumerate(pd.read_csv(directory + trainfile,
                                          chunksize=1,
                                          low_memory=False)):
        features = list(chunk.columns)
        break
    seen = np.zeros(52)  # one flag per station id (0..51)
    rv = []
    for f in features:
        # Keep 'Id' and all S24/S25 columns unconditionally.
        if f == 'Id' or 'S24' in f or 'S25' in f:
            rv.append(f)
            continue
        # Column names look like 'L3_S30_D3496' -> station number after 'S'.
        station = int(f.split('_')[1][1:])
        # print(station)
        if seen[station]:
            continue
        seen[station] = 1
        rv.append(f)
    return rv


usefuldatefeatures = get_date_features()


def get_mindate():
    """Return a DataFrame with 'Id' and 'mindate' (earliest timestamp across
    the useful date features) for train AND test rows, read in chunks to
    bound memory."""
    directory = '../input/'
    trainfile = 'train_date.csv'
    testfile = 'test_date.csv'
    features = None
    subset = None
    for i, chunk in enumerate(pd.read_csv(directory + trainfile,
                                          usecols=usefuldatefeatures,
                                          chunksize=50000,
                                          low_memory=False)):
        print(i)
        if features is None:
            features = list(chunk.columns)
            features.remove('Id')
        df_mindate_chunk = chunk[['Id']].copy()
        # Row-wise minimum over all date columns = first time seen on the line.
        df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values
        if subset is None:
            subset = df_mindate_chunk.copy()
        else:
            subset = pd.concat([subset, df_mindate_chunk])
        del chunk
        gc.collect()
    for i, chunk in enumerate(pd.read_csv(directory + testfile,
                                          usecols=usefuldatefeatures,
                                          chunksize=50000,
                                          low_memory=False)):
        print(i)
        df_mindate_chunk = chunk[['Id']].copy()
        df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values
        subset = pd.concat([subset, df_mindate_chunk])
        del chunk
        gc.collect()
    return subset


df_mindate = get_mindate()

# Faron's "magic" features: after sorting by (mindate, Id), the Id difference
# to the previous/next record is highly predictive of failures.
df_mindate.sort_values(by=['mindate', 'Id'], inplace=True)
df_mindate['mindate_id_diff'] = df_mindate.Id.diff()
midr = np.full_like(df_mindate.mindate_id_diff.values, np.nan)
midr[0:-1] = -df_mindate.mindate_id_diff.values[1:]
df_mindate['mindate_id_diff_reverse'] = midr


def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from confusion-matrix counts.

    Returns 0 when any marginal is empty (denominator would be 0).
    """
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0
    else:
        return sup / np.sqrt(inf)


def eval_mcc(y_true, y_prob, show=False):
    """Find the probability threshold that maximizes MCC.

    Sorts by predicted probability and sweeps the threshold in one O(n) pass,
    updating the confusion counts incrementally.

    Returns best_mcc, or (best_proba, best_mcc, y_pred) when show is True.
    """
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump              # number of negative
    # Start with threshold below every prediction: everything predicted 1.
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    mccs = np.zeros(n)
    for i in range(n):
        # Move sample i below the threshold (now predicted 0).
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        new_mcc = mcc(tp, tn, fp, fn)
        mccs[i] = new_mcc
        if new_mcc >= best_mcc:
            best_mcc = new_mcc
            best_id = i
    if show:
        best_proba = y_prob[idx[best_id]]
        y_pred = (y_prob > best_proba).astype(int)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc


def mcc_eval(y_prob, dtrain):
    """Custom XGBoost eval metric: best achievable MCC over all thresholds."""
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', best_mcc


def create_feature_map(features):
    """Write an XGBoost feature-map file ('xgb.fmap') for get_fscore()."""
    # Context manager guarantees the handle is closed even on error.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def get_importance(gbm, features):
    """Return (feature, fscore) pairs sorted by descending importance."""
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance


def LeaveOneOut(data1, data2, columnName, useLOO=False):
    """Mean-target encoding of columnName: group means learned on data1,
    mapped onto data2.

    With useLOO=True, singleton groups are dropped and each row's own outcome
    is removed from its group mean (leave-one-out correction). Missing
    categories fall back to the overall mean.
    """
    grpOutcomes = data1.groupby(columnName)['Response'].mean().reset_index()
    grpCount = data1.groupby(columnName)['Response'].count().reset_index()
    grpOutcomes['cnt'] = grpCount.Response
    if(useLOO):
        grpOutcomes = grpOutcomes[grpOutcomes.cnt > 1]
    grpOutcomes.drop('cnt', inplace=True, axis=1)
    outcomes = data2['Response'].values
    x = pd.merge(data2[[columnName, 'Response']], grpOutcomes,
                 suffixes=('x_', ''),
                 how='left',
                 on=columnName,
                 left_index=True)['Response']
    if(useLOO):
        # NOTE(review): uses the total row count, not the per-group count —
        # kept as in the original kernel.
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
        # x = x + np.random.normal(0, .01, x.shape[0])
    return x.fillna(x.mean())


def GrabData():
    """Load the hand-picked categorical/date/numeric columns for train and
    test (chunked to bound memory), merge in the mindate features, and apply
    mean-target encoding fit on every other train row.

    Returns (blindtraindata, testdata).
    """
    directory = '../input/'
    trainfiles = ['train_categorical.csv',
                  'train_date.csv',
                  'train_numeric.csv']
    testfiles = ['test_categorical.csv',
                 'test_date.csv',
                 'test_numeric.csv']
    # One column list per file: [categorical, date, numeric(+Response)].
    cols = [['Id',
             'L1_S24_F1559', 'L3_S32_F3851',
             'L1_S24_F1827', 'L1_S24_F1582',
             'L3_S32_F3854', 'L1_S24_F1510',
             'L1_S24_F1525'],
            ['Id',
             'L3_S30_D3496', 'L3_S30_D3506',
             'L3_S30_D3501', 'L3_S30_D3516',
             'L3_S30_D3511'],
            ['Id',
             'L1_S24_F1846', 'L3_S32_F3850',
             'L1_S24_F1695', 'L1_S24_F1632',
             'L3_S33_F3855', 'L1_S24_F1604',
             'L3_S29_F3407', 'L3_S33_F3865',
             'L3_S38_F3952', 'L1_S24_F1723',
             'Response']]
    traindata = None
    testdata = None
    for i, f in enumerate(trainfiles):
        print(f)
        subset = None
        # Inner chunk index deliberately NOT named 'i': the original shadowed
        # the file index, which only worked because cols[i] is evaluated
        # before the loop starts.
        for j, chunk in enumerate(pd.read_csv(directory + f,
                                              usecols=cols[i],
                                              chunksize=50000,
                                              low_memory=False)):
            print(j)
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if traindata is None:
            traindata = subset.copy()
        else:
            traindata = pd.merge(traindata, subset.copy(), on="Id")
        del subset
        gc.collect()
    del cols[2][-1]  # Test doesn't have response!
    for i, f in enumerate(testfiles):
        print(f)
        subset = None
        for j, chunk in enumerate(pd.read_csv(directory + f,
                                              usecols=cols[i],
                                              chunksize=50000,
                                              low_memory=False)):
            print(j)
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if testdata is None:
            testdata = subset.copy()
        else:
            testdata = pd.merge(testdata, subset.copy(), on="Id")
        del subset
        gc.collect()
    traindata = traindata.merge(df_mindate, on='Id')
    testdata = testdata.merge(df_mindate, on='Id')
    testdata['Response'] = 0  # Add Dummy Value
    # Fit the encodings on even rows, apply them to odd rows + test, so the
    # training data never sees its own target leakage.
    visibletraindata = traindata[::2]
    blindtraindata = traindata[1::2]
    print(blindtraindata.columns)
    for i in range(2):
        for col in cols[i][1:]:
            print(col)
            blindtraindata.loc[:, col] = LeaveOneOut(
                visibletraindata, blindtraindata, col, False).values
            testdata.loc[:, col] = LeaveOneOut(
                visibletraindata, testdata, col, False).values
    del visibletraindata
    gc.collect()
    testdata.drop('Response', inplace=True, axis=1)
    return blindtraindata, testdata


def Train():
    """Train XGBoost with the custom MCC metric and write three submission
    CSVs (raw train predictions, raw test predictions, thresholded test)."""
    train, test = GrabData()
    print('Train:', train.shape)
    print('Test', test.shape)
    features = list(train.columns)
    features.remove('Response')
    features.remove('Id')
    print(features)
    num_rounds = 50
    params = {}
    params['objective'] = "binary:logistic"
    params['eta'] = 0.021
    params['max_depth'] = 7
    params['colsample_bytree'] = 0.82
    params['min_child_weight'] = 3
    # base_score near the positive-class rate speeds early convergence.
    params['base_score'] = 0.005
    params['silent'] = True
    print('Fitting')
    trainpredictions = None
    testpredictions = None
    dvisibletrain = \
        xgb.DMatrix(train[features],
                    train.Response,
                    silent=True)
    dtest = \
        xgb.DMatrix(test[features],
                    silent=True)
    folds = 1
    for i in range(folds):
        print('Fold:', i)
        params['seed'] = i
        # NOTE(review): early stopping watches the training set itself, so it
        # only stops when train MCC plateaus — kept as in the original.
        watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')]
        clf = xgb.train(params, dvisibletrain,
                        num_boost_round=num_rounds,
                        evals=watchlist,
                        early_stopping_rounds=20,
                        feval=mcc_eval,
                        maximize=True
                        )
        limit = clf.best_iteration+1
        # limit = clf.best_ntree_limit
        predictions = \
            clf.predict(dvisibletrain, ntree_limit=limit)
        best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                                predictions,
                                                True)
        print('tree limit:', limit)
        print('mcc:', best_mcc)
        print(matthews_corrcoef(train.Response, y_pred))
        # Accumulate predictions across folds; averaged below.
        if(trainpredictions is None):
            trainpredictions = predictions
        else:
            trainpredictions += predictions
        predictions = clf.predict(dtest, ntree_limit=limit)
        if(testpredictions is None):
            testpredictions = predictions
        else:
            testpredictions += predictions
        imp = get_importance(clf, features)
        print('Importance array: ', imp)
    best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                            trainpredictions/folds,
                                            True)
    print(matthews_corrcoef(train.Response, y_pred))
    submission = pd.DataFrame({"Id": train.Id,
                               "Prediction": trainpredictions/folds,
                               "Response": train.Response})
    submission[['Id',
                'Prediction',
                'Response']].to_csv('rawtrainxgbsubmission'+str(folds)+'.csv',
                                    index=False)
    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": testpredictions/folds})
    submission[['Id',
                'Response']].to_csv('rawxgbsubmission'+str(folds)+'.csv',
                                    index=False)
    # Hard threshold chosen by hand for the final 0/1 submission.
    y_pred = (testpredictions/folds > .08).astype(int)
    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": y_pred})
    submission[['Id',
                'Response']].to_csv('xgbsubmission'+str(folds)+'.csv',
                                    index=False)


if __name__ == "__main__":
    print('Started')
    Train()
    print('Finished')
值得学习的几个点:
1、自定义评价函数
2、使用early_stop来防止过拟合
3、train,test数据的生成方式简洁
4、对于同一个模型,跑n-fold取平均。
0 0
- XGBoost 脚本学习
- XGBoost cross_validation脚本学习
- xgboost学习
- XGBoost学习
- XGBoost学习
- xgboost学习
- 训练XGBoost的一些脚本,
- XGBoost学习日记1
- XGboost文献学习笔记
- 机器学习:XGBoost
- xgboost和lightgbm学习
- XGBoost学习资料汇总
- 【机器学习】Xgboost原理
- XGBoost学习笔记
- 机器学习-xgboost
- 机器学习----xgboost学习笔记
- 机器学习----XGBOOST参数说明
- 学习笔记:XGBoost原理解析
- hibernate无法自动建表问题
- java之断言assert
- 位向量实现的bitset集合
- JSP 动作元素 标签
- LeetCode 142. Linked List Cycle II
- XGBoost 脚本学习
- C++为什么remove以后需要erase
- jsonp原理
- POJ1068
- iOS中收起键盘的方法和时机
- matlab 交易建模
- 用node实现http服务器 三
- 关于a different object with the same identifier value was already associated with the session解决方案
- 存储MODE(文件的操作模式)