Titanic Xgboost版代码分析

来源:互联网 发布:cms主题是什么意思 编辑:程序博客网 时间:2024/06/04 22:37

代码来源
关于 XGBoost 的调参可参考这篇文章
或官网
流程如下:
1. 读取测试集,训练集
2. 根据相关性手动选取特征
3. 按所选特征合并训练集与测试集
4. 填充缺失值
5. 处理非数字型数据
6. 训练算法
7. 存储数据

# Imports
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder


class DataFrameImputer(TransformerMixin):
    """Fill missing values in a DataFrame, one fill value per column.

    Text columns are filled with their most frequent value; every other
    column is filled with its median. ``fit_transform`` is inherited from
    ``TransformerMixin`` and simply chains ``fit`` and ``transform``.
    """

    def fit(self, X, y=None):
        """Compute the per-column fill values from ``X`` and store them.

        The result is kept in ``self.fill`` as a Series indexed by the
        column names of ``X``. ``y`` is accepted for API compatibility and
        is not used. Returns ``self``.
        """
        fill_values = []
        for c in X:
            column = X[c]
            # Object dtype is the original criterion; the extra check also
            # covers pandas' dedicated string dtype so that median() is
            # never attempted on text data.
            is_text = (
                column.dtype == np.dtype('O')
                or pd.api.types.is_string_dtype(column)
            )
            if is_text:
                # value_counts() sorts by frequency, so index[0] is the
                # most common value in the column.
                fill_values.append(column.value_counts().index[0])
            else:
                fill_values.append(column.median())
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NaNs replaced column by column."""
        return X.fillna(self.fill)


# Load the data.
train_df = pd.read_csv('train.csv', header=0)
test_df = pd.read_csv('test.csv', header=0)

# Hand-picked features.
feature_columns_to_use = ['Pclass', 'Sex', 'Age', 'Fare', 'Parch']
# Columns of the CSV that hold non-numeric values.
nonnumeric_columns = ['Sex']

# Stack the selected features of the training and test sets so both are
# imputed and encoded together (their distributions differ slightly).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The original row labels are kept; rows are split again by position below.
big_X = pd.concat(
    [train_df[feature_columns_to_use], test_df[feature_columns_to_use]]
)

# fit_transform = fit + transform: compute the fill values, then apply them.
big_X_imputed = DataFrameImputer().fit_transform(big_X)

# XGBoost cannot consume string data here, so encode each non-numeric
# column as integers.
le = LabelEncoder()
for feature in nonnumeric_columns:
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])

# Split back into training and test matrices by row position.
# .to_numpy() replaces DataFrame.as_matrix(), removed in pandas 1.0.
n_train = train_df.shape[0]
train_X = big_X_imputed.iloc[:n_train].to_numpy()
test_X = big_X_imputed.iloc[n_train:].to_numpy()
train_y = train_df['Survived']

# Train.
gbm = xgb.XGBClassifier(
    max_depth=3, n_estimators=300, learning_rate=0.05
).fit(train_X, train_y)

# Predict.
predictions = gbm.predict(test_X)

# Save the submission as CSV.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': predictions,
})
submission.to_csv("submission.csv", index=False)
1 0