Kaggle: Titanic Problem (2) - Implementation Code


Titanic Prediction

1. LogisticRegression

Import the data and required modules (the visualization and exploratory analysis are omitted here).

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

data_train = pd.read_csv("Titanic/train.csv")  # read the file into a DataFrame

Data preprocessing

# Step 1: handle missing values.
# Use scikit-learn's RandomForestRegressor to model and fill in the missing Age values.
from sklearn.ensemble import RandomForestRegressor

def set_missing_ages(df):
    # Take the available numeric features and feed them to a Random Forest regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # Split passengers into those with known and unknown ages
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    y = known_age[:, 0]   # y is the target: age
    X = known_age[:, 1:]  # X holds the feature values
    # Fit the RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # Fill the original missing entries with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, rfr

def set_Cabin_type(df):
    df.loc[(df.Cabin.notnull()), 'Cabin'] = "Yes"
    df.loc[(df.Cabin.isnull()), 'Cabin'] = "No"
    return df

data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)

# Step 2: one-hot encode (factorize) the categorical features
# with pandas.get_dummies
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')
df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

# Step 3: scale the continuous numeric features.
# When feature scales differ wildly, convergence takes a massive hit, and may not
# happen at all, so we standardize the wide-ranging features (zero mean, unit variance).
import sklearn.preprocessing as preprocessing
age_scale_param = preprocessing.StandardScaler().fit(df[['Age']])
df['Age_scaled'] = age_scale_param.transform(df[['Age']]).ravel()
fare_scale_param = preprocessing.StandardScaler().fit(df[['Fare']])
df['Fare_scaled'] = fare_scale_param.transform(df[['Fare']]).ravel()

Logistic regression modeling

# Logistic regression modeling
from sklearn import linear_model

# Use a regex to pull out the columns we want
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values
y = train_np[:, 0]   # y is the Survived outcome
X = train_np[:, 1:]  # X holds the feature values
# Fit a LogisticRegression model (the liblinear solver supports the L1 penalty)
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(X, y)
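As a quick sanity check (not in the original post), we can look at the model's accuracy on its own training data. This is not a validation score, just a smoke test that the fit succeeded:

# Mean accuracy on the data the model was fit on; optimistic by construction
print(clf.score(X, y))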

Preprocess the test data

data_test = pd.read_csv("Titanic/test.csv")
data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0
# Apply to test_data exactly the same feature transforms we applied to train_data.
# First, fill in the missing ages with the same fitted RandomForestRegressor.
tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
# Predict age from the feature columns X and fill it in
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges

data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')
df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
# Reuse the scaling parameters fitted on the training set; do not refit on test data
df_test['Age_scaled'] = age_scale_param.transform(df_test[['Age']]).ravel()
df_test['Fare_scaled'] = fare_scale_param.transform(df_test[['Fare']]).ravel()

Export the baseline predictions

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("Titanic/logistic_regression_predictions.csv", index=False)
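Before submitting, a small assumed sanity check is worth a line: the test matrix must carry exactly the training feature columns, in the same order, or predict will see features shuffled or missing:

# Assumed check: guards against dummy columns present in train but absent in test
assert list(test.columns) == list(train_df.columns)[1:], "train/test feature mismatch"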

Check the fit and judge which state the current model is in (underfitting or overfitting)

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Use sklearn's learning_curve to get training_score and cv_score,
# then draw the learning curve with matplotlib
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    Plot the learning curve of a model on the given data.

    Parameters
    ----------
    estimator : the classifier you are using
    title : title of the plot
    X : input features, numpy array
    y : input target vector
    ylim : tuple (ymin, ymax) fixing the lowest and highest points of the y axis
    cv : number of cross-validation folds; one fold serves as the CV set and the
         remaining n-1 folds as the training set (None uses sklearn's default)
    n_jobs : number of parallel jobs (default 1)
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel('The number of training samples')
        plt.ylabel('score')
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label='training score')
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label='CV score')
        plt.legend(loc="best")
        plt.show()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

plot_learning_curve(clf, 'learning curves', X, y)

[Figure: learning curve of the logistic regression model (training score vs. cross-validation score)]

On real data, the learning curve we get is nowhere near as smooth as the theoretical one, but you can roughly see that the training and cross-validation score curves follow the expected trends.

From the current curves, our model does not appear to be overfitting (overfitting typically shows a high training score, a much lower cross-validation score, and a large gap between them). So we can do some more feature engineering and add newly derived or combined features to the model.
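plot_learning_curve also returns two numbers summarizing the right-hand end of the curves, which we can use to put a rough figure on that gap (a quick check, not part of the original post):

# Re-run without plotting, just to capture the two summary numbers.
# diff is the size of the train/CV gap; a large diff is the classic overfitting signature.
midpoint, diff = plot_learning_curve(clf, 'learning curves', X, y, plot=False)
print(midpoint, diff)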

Look at the degree of correlation

pd.DataFrame({"columns": list(train_df.columns)[1:], "coef": list(clf.coef_.T)})

The sign of each coefficient above tells you the direction of its relationship with the outcome: a positive coefficient pushes the prediction toward survival, a negative one pushes it away.
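To scan the table more easily, here is a small assumed helper (not in the original post) that orders the features by the absolute size of their weights:

coef_df = pd.DataFrame({"columns": list(train_df.columns)[1:], "coef": clf.coef_.ravel()})
# Sort by absolute weight so the most influential features come first
coef_df.reindex(coef_df["coef"].abs().sort_values(ascending=False).index)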

Let's first look at the features whose weights have large absolute values in our model:

• Sex: being female greatly raises the predicted probability of survival, while being male pulls it down substantially.

• Pclass: being a 1st-class passenger raises the survival probability, while a passenger class of 3 pulls it down sharply.

• Having a Cabin value pulls the survival probability up considerably (there seems to be a hint here: from the earlier Survived distribution plot by Cabin presence, some passengers with a Cabin record still died, so we probably have not mined this attribute deeply enough).

• Age has a negative coefficient, meaning that in our model the younger you are, the higher your priority for rescue (we should go back to the raw data to check whether this is reasonable).

• Embarking at port S pulls the survival probability down considerably, while the other two ports have essentially no effect (this is actually quite odd, since the earlier statistics did not show a particularly low survival rate for port S, so it may be worth trying to drop the embarkation-port feature).

• Fare has a small positive coefficient (which does not mean the feature is useless; we may just not have refined it enough. For example, perhaps we should discretize it, or split it out per passenger class?)

How do we know which of these optimizations are promising?

Run cross-validation

from sklearn.model_selection import cross_val_score, train_test_split

# A quick look at the cross-validation scores
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
all_data = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
X = all_data.values[:, 1:]
y = all_data.values[:, 0]
print(cross_val_score(clf, X, y, cv=5))

# Hold out 30% of the training data as a cross-validation set
split_train, split_cv = train_test_split(df, test_size=0.3, random_state=0)
train_df = split_train.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# Build the model on the 70% split
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(train_df.values[:, 1:], train_df.values[:, 0])

# Predict on the held-out cross-validation split
cv_df = split_cv.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(cv_df.values[:, 1:])

# Pull the misclassified passengers out of the raw training file
origin_data_train = pd.read_csv("Titanic/train.csv")
bad_cases = origin_data_train.loc[origin_data_train['PassengerId'].isin(
    split_cv[predictions != cv_df.values[:, 0]]['PassengerId'].values)]
bad_cases

Compare the bad cases: look carefully at the samples we predicted wrong. Which features are off? Where was our processing not fine-grained enough?
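A quick, assumed way to eyeball the misclassified passengers' raw attributes:

# Show the raw columns of the misclassified passengers for manual inspection
bad_cases[['PassengerId', 'Name', 'Sex', 'Age', 'Pclass', 'SibSp', 'Parch',
           'Fare', 'Cabin', 'Embarked']]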

Here, in no particular order, are some optimizations we could try:

• Instead of the current regression fit for Age, fill missing ages with the mean age for the title found in the name ('Mr', 'Mrs', 'Miss', and so on).

• Don't treat Age as a continuous value; discretize it into bins of some step size, turning it into a categorical feature.

• Refine Cabin further: for records that have one, split it into the leading letter part (my guess is it encodes location or deck) and the trailing number part (probably the room number; interestingly, if you look closely at the raw data, survival seems somewhat more likely when this number is large).

• Pclass and Sex are both so important that we should try building a combined feature from them; that is another kind of refinement.

• Add a Child field: 1 when Age <= 12, 0 otherwise (look at the data: small children really did get high priority).

• If the name contains 'Mrs' and Parch > 1, we guess she may be a mother, who should also have a higher probability of survival; so add a Mother field, 1 in that case and 0 otherwise.

• Consider dropping the embarkation port first (Q and C carry no weight anyway, and S is a bit odd).

• Combine SibSp (siblings/spouses), Parch, and the passenger themselves into a Family_size field (large families might have affected the outcome).

• Name is an attribute we have not touched yet. We can do some simple processing: for instance, map males whose names contain certain words ('Capt', 'Don', 'Major', 'Sir') onto a single Title, and likewise for females.

Keep digging and you can probably think of more angles to refine; I'll stop the list here. We can now use the train_df and cv_df at hand to test whether these feature-engineering tricks actually help.

Optimized code after the analysis

data_train = pd.read_csv("Titanic/train.csv")
# Combined Sex x Pclass feature
data_train['Sex_Pclass'] = data_train.Sex + "_" + data_train.Pclass.map(str)

from sklearn.ensemble import RandomForestRegressor

### Fill in the missing Age values with a RandomForestRegressor
def set_missing_ages(df):
    # Take the available numeric features and feed them to a Random Forest regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # Split passengers into those with known and unknown ages
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # y is the target: age
    y = known_age[:, 0]
    # X holds the feature values
    X = known_age[:, 1:]
    # Fit the RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # Fill the original missing entries with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, rfr

def set_Cabin_type(df):
    df.loc[(df.Cabin.notnull()), 'Cabin'] = "Yes"
    df.loc[(df.Cabin.isnull()), 'Cabin'] = "No"
    return df

data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)

dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')
dummies_Sex_Pclass = pd.get_dummies(data_train['Sex_Pclass'], prefix='Sex_Pclass')

df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex,
                dummies_Pclass, dummies_Sex_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Sex_Pclass'], axis=1, inplace=True)

import sklearn.preprocessing as preprocessing
age_scale_param = preprocessing.StandardScaler().fit(df[['Age']])
df['Age_scaled'] = age_scale_param.transform(df[['Age']]).ravel()
fare_scale_param = preprocessing.StandardScaler().fit(df[['Fare']])
df['Fare_scaled'] = fare_scale_param.transform(df[['Fare']]).ravel()

from sklearn import linear_model
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*')
train_np = train_df.values
# y is the Survived outcome
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]
# Fit a LogisticRegression model
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(X, y)

data_test = pd.read_csv("Titanic/test.csv")
data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0
data_test['Sex_Pclass'] = data_test.Sex + "_" + data_test.Pclass.map(str)
# Apply to test_data exactly the same feature transforms as on train_data.
# First, fill in the missing ages with the same fitted RandomForestRegressor.
tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
# Predict age from the feature columns X and fill it in
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges

data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')
dummies_Sex_Pclass = pd.get_dummies(data_test['Sex_Pclass'], prefix='Sex_Pclass')

df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex,
                     dummies_Pclass, dummies_Sex_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Sex_Pclass'], axis=1, inplace=True)
# Reuse the scaling parameters fitted on the training set
df_test['Age_scaled'] = age_scale_param.transform(df_test[['Age']]).ravel()
df_test['Fare_scaled'] = fare_scale_param.transform(df_test[['Fare']]).ravel()

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions2.csv", index=False)
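One wrinkle: the Bagging block below filters for Mother, Child, Family, and Title columns, but the code above never creates them (the regex simply matches nothing). Here is a minimal, assumed sketch of how those features could be added, following the ideas listed earlier; the helper name add_engineered_features and the exact rules (Age <= 12 for Child, the 'Mrs' plus Parch > 1 guess for Mother) are illustrative, not from the original post:

def add_engineered_features(frame, raw):
    # frame: the dummied/scaled DataFrame; raw: the original one that still has Name/Sex
    frame['Child'] = (frame['Age'] <= 12).astype(int)       # idea: children got priority
    frame['Mother'] = ((raw['Sex'] == 'female')
                       & raw['Name'].str.contains('Mrs')
                       & (raw['Parch'] > 1)).astype(int)    # idea: 'Mrs' with Parch > 1
    frame['Family_size'] = raw['SibSp'] + raw['Parch'] + 1  # siblings/spouses + parents/children + self
    # Collapse rare titles so train and test produce the same dummy columns
    title = raw['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title = title.where(title.isin(['Mr', 'Mrs', 'Miss', 'Master']), 'Rare')
    return pd.concat([frame, pd.get_dummies(title, prefix='Title')], axis=1)

df = add_engineered_features(df, data_train)
df_test = add_engineered_features(df_test, data_test)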

Optimizing the model with Bagging

from sklearn.ensemble import BaggingRegressor

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.values
# y is the Survived outcome
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]

# Fit a BaggingRegressor wrapped around the logistic regression:
# 10 copies of the model, each trained on a random 80% sample of the rows
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0,
                               bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
# Note: the regressor averages the 10 models' 0/1 predictions, and astype(np.int32)
# truncates that average, so only a unanimous vote of 1 survives the cast
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("Titanic/logistic_regression_predictions2.csv", index=False)
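Because LogisticRegression is a classifier, BaggingClassifier is arguably the more natural wrapper: it votes over class labels instead of averaging them, so the casting quirk above never arises. A sketch of that variant (an alternative, not what this post submitted):

from sklearn.ensemble import BaggingClassifier

bagging_clf = BaggingClassifier(
    linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear'),
    n_estimators=10, max_samples=0.8, max_features=1.0,
    bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)
predictions = bagging_clf.predict(test)  # already 0/1 class labels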

Submission result:

[Figure: Kaggle submission score]