Titanic: Machine Learning from Disaster
来源:互联网 发布:java中的foreach循环 编辑:程序博客网 时间:2024/05/16 12:58
Titanic: Machine Learning from Disaster是Kaggle发起的一场机器学习入门级比赛。数据量很少,适合快速验证想法,让我们用机器学习来一场泰坦尼克之旅。
- 数据预处理和可视化函数
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd


class data_preprocessing():
    """A grab-bag of DataFrame preprocessing helpers: scaling, transforms,
    outlier removal, missing-value filling and categorical encoding.

    All methods take the DataFrame as an argument and return it (some also
    mutate it in place).
    """

    def __init__(self):
        pass

    def data_norm(self, df, features):
        """Min-max scale each listed column into [0, 1]."""
        for col in features:
            lo = np.min(df[col])
            hi = np.max(df[col])
            df[col] = (df[col] - lo) / (hi - lo)
        return df

    def data_convert(self, df, features, style='log'):
        """Transform columns toward normality; only 'log' is implemented."""
        if style == 'log':
            for col in features:
                df[col] = np.log(df[col])
        return df

    def remove_abnormal_value(self, df, features, alpha):
        """Drop rows lying more than `alpha` standard deviations from the
        column mean (applied sequentially per column)."""
        for col in features:
            center = np.mean(df[col])
            spread = np.std(df[col])
            df = df[df[col] <= center + alpha * spread]
            df = df[df[col] >= center - alpha * spread]
        return df

    def data_fill(self, df, features, style='mean'):
        """Fill missing values with the column mean, the mode, or the
        literal string '0' depending on `style`."""
        if style == 'mean':
            for col in features:
                df[col] = df[col].fillna(df[col][df[col].notnull()].mean())
        if style == 'mode':
            for col in features:
                df[col] = df[col].fillna(df[col][df[col].notnull()].mode()[0])
        if style == '0':
            for col in features:
                df[col] = df[col].fillna('0')
        return df

    def data_one_hot(self, df, features):
        """Replace each listed column by its one-hot dummy columns
        (prefixed with the original column name)."""
        for col in features:
            onehot = pd.get_dummies(df[col], prefix=col)
            df = df.join(onehot)
            df = df.drop([col], axis=1)
        return df

    def data_replace(self, df, features):
        """Map each column's distinct values onto 0..k-1 (set order, so the
        assignment is arbitrary)."""
        for col in features:
            distinct = list(set(df[col]))
            df[col] = df[col].replace(distinct, range(len(distinct)))
        return df

    def data_replace_random(self, df, features):
        """Like data_replace, but the value-to-code assignment is shuffled."""
        for col in features:
            distinct = list(set(df[col]))
            np.random.shuffle(distinct)
            df[col] = df[col].replace(distinct, range(len(distinct)))
        return df

    def data_replace_sort_by_mean(self, train_df, df, features, label):
        """Encode categories as integer ranks ordered by descending mean
        `label` value computed on `train_df`; categories unseen in the
        training data are ranked as if their mean were 0."""
        for col in features:
            distinct = list(set(df[col]))
            seen = list(set(train_df[col]))
            means = []
            for v in distinct:
                if v in seen:
                    means.append(np.mean(train_df[train_df[col] == v][label]))
                else:
                    means.append(0)
            ordered = [distinct[i] for i in np.argsort(-np.array(means))]
            df[col] = df[col].replace(ordered, range(len(ordered)))
        return df
# -*- coding: utf-8 -*-
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt


class data_visualization:
    """Quick EDA plotting helpers built on seaborn/matplotlib.

    Every method draws one or more figures and then blocks on plt.show().
    """

    def __init__(self):
        pass

    def plt_mean(self, df, features):
        """Bar plot of mean(features[1]) grouped by features[0]."""
        average = df[features].groupby([features[0]], as_index=False).mean()
        sns.barplot(x=features[0], y=features[1], data=average)
        plt.show()

    def plt_count(self, df, features):
        """One count plot per listed column."""
        for i in range(len(features)):
            plt.figure(i)
            sns.countplot(x=features[i], data=df)
        plt.show()

    def plt_density(self, df, features):
        """One KDE + rug distribution plot per listed column.

        BUG FIX: the parameter was declared as `feature` while the body
        read `features`, so every call raised NameError.
        """
        for i in range(len(features)):
            plt.figure(i)
            sns.distplot(df[features[i]], kde=True, rug=True)
        plt.show()

    def plt_density_compare(self, df1, df2, features):
        """Overlay the distribution of each listed column from two
        DataFrames (e.g. train vs. test) for a quick drift check."""
        for i in range(len(features)):
            plt.figure(i)
            sns.distplot(df1[features[i]], kde=True, rug=True)
            sns.distplot(df2[features[i]], kde=True, rug=True)
        plt.show()

    def plt_factor_count(self, df, x, hue, col):
        """Count plot of x split by hue, one panel per value of col."""
        sns.factorplot(x=x, hue=hue, col=col, data=df, kind='count')
        plt.show()

    def plt_factor_ratio(self, df, x, y, hue):
        """Point plot of y against x, one line per hue value."""
        sns.factorplot(x=x, y=y, hue=hue, data=df)
        plt.show()

    def plt_crosstab(self, df, features):
        """Print the contingency table of the two listed columns.

        Uses print() so the module also parses under Python 3 — the
        original Python-2 print statement is a SyntaxError there.
        """
        print(pd.crosstab(df[features[0]], df[features[1]]))
- 导入库和数据(注意路径修改为自己电脑下的路径)
# -*- coding: utf-8 -*-import numpy as npimport pandas as pdfrom math import logimport randomimport matplotlib.pyplot as pltimport seaborn as snsimport graphvizsns.set_style('whitegrid')import syssys.path.append('/Users/zjx/python/competition_function')from sklearn import treefrom sklearn.ensemble import RandomForestClassifierfrom sklearn.metrics import accuracy_score, f1_scorefrom sklearn.model_selection import StratifiedKFoldfrom sklearn.tree import DecisionTreeRegressorfrom sklearn.tree import DecisionTreeClassifierfrom sklearn.neighbors import KNeighborsClassifierfrom data_preprocessing import data_preprocessingfrom data_visualization import data_visualizationpath = '/Users/zjx/python/taitan/data/'train_df = pd.read_csv(path+'new_train.csv')test_df = pd.read_csv(path+'new_test.csv')dp = data_preprocessing()dv = data_visualization()label = 'Survived'
- 查看数据
# Show schema/null-count summaries of both datasets.
# BUG FIX: DataFrame.info() prints its report to stdout and returns None,
# so the original `print train_df.info(), test_df.info()` appended a
# spurious "None None" line (and used Python-2-only print syntax).
train_df.info()
test_df.info()
train_df :
test_df :
可以看到训练集大小为891行,测试集为418行,其中 Age、Cabin数据缺失较多,Fare、Embarked部分缺失。为了防止引入较多噪声,我们不对Age、Cabin进行填充,对于Fare、Embarked分别采用决策树回归填充、众数填充。
- Fare、Embarked填充
# Impute missing Fare values with a small decision-tree regressor fitted on
# the rows where Fare is known, then fill missing Embarked with the mode.
# NOTE(review): `df` is not defined in this chunk — presumably the
# concatenation of train_df and test_df built elsewhere; confirm.
X_fare = df[['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']]
y_fare = df['Fare']
#X_fare = dp.data_one_hot(X_fare,[])
# Encode the categorical predictors as survival-rate-ordered ranks so the
# tree can split on them numerically.
X_fare = dp.data_replace_sort_by_mean(train_df, X_fare, ['Sex', 'Embarked'], label)
fare_defined = df.Fare.notnull()
dtr = DecisionTreeRegressor(max_leaf_nodes=5)  # tiny tree to limit overfitting
dtr.fit(X_fare[fare_defined], y_fare[fare_defined])
# Predict Fare only for the rows where it is missing.
df.loc[~fare_defined, 'Fare'] = dtr.predict(X_fare[~fare_defined])
df = dp.data_fill(df, ['Embarked'], style='mode')
画出Fare填充决策回归树
# Render the fitted Fare-imputation tree to the file "dtr_of_fare".
# NOTE(review): class_names is only meaningful for classifiers; for this
# regressor it appears to be ignored — harmless, but could be dropped.
dot_data = tree.export_graphviz(dtr, out_file=None, feature_names=X_fare.columns,
                                class_names=['Fare'], filled=True, rounded=True,
                                special_characters=True, leaves_parallel=True)
graph = graphviz.Source(dot_data)
graph.render("dtr_of_fare")
- 特征探索
性别上的差异
# Mean survival rate by gender.
# BUG FIX: the column name was misspelled 'Saex', which raises a KeyError
# inside plt_mean's groupby; the Titanic column is 'Sex'.
dv.plt_mean(train_df, ['Sex', label])
船舱位置的差异
# Mean survival rate by passenger class (1st/2nd/3rd).
dv.plt_mean(train_df,['Pclass',label])
Pclass和Sex的交叉
# Survival counts per Pclass, one panel per Sex (Pclass x Sex cross).
dv.plt_factor_count(train_df,'Pclass',label,'Sex')
可以看到1层和2层的女性几乎都获救了,3层的男性几乎都没有获救,相信这是一组最为重要的特征,从这里也可以发现预测的难点是1层的男性和3层的女性。
女性既然更高概率获救,儿童应该也有特殊性(由于3层的获救率普遍不高只考虑1、2层)
def get_is_child(age):
    """Return 1 when `age` is a known value below 14, else 0.

    NaN ages map to 0 because float('nan') < 14 is False.
    """
    age = float(age)
    if age < 14:
        return 1
    else:
        return 0


# BUG FIX: the original used df[['Age']].apply(get_is_child, axis=1), which
# hands each row to get_is_child as a one-element Series; float(Series) is
# deprecated and removed in modern pandas. Applying on the column directly
# yields identical results.
df['Is_child'] = df['Age'].apply(get_is_child)
# Do not flag 3rd-class children (their survival rate is low regardless).
df.loc[df['Pclass'] > 2, 'Is_child'] = 0
考虑团体上的优劣(团结就是力量)
# Group-size features: family size plus how many passengers share the same
# Cabin / Ticket / Fare value, then a combined "travels alone" flag.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1


def get_freq(df, feature):
    """Add a 'Freq_<feature>' column: how many rows share each value of
    `feature` (a proxy for group size)."""
    freq_col = 'Freq_' + feature
    freq = df[feature].value_counts().to_frame()
    freq.columns = [freq_col]
    df[freq_col] = df.merge(freq, how='left', left_on=feature, right_index=True)[freq_col]
    return df


df = get_freq(df, 'Cabin')
df = get_freq(df, 'Ticket')
df = get_freq(df, 'Fare')


def get_single(df):
    """Row-wise: 1 if the passenger appears to travel alone by every group
    signal (family size, shared ticket/cabin/fare), else 0.

    NOTE(review): `max_group` is not defined anywhere in this chunk — the
    Freq_Fare branch will raise NameError unless it is defined elsewhere;
    confirm its intended value.
    """
    if df['FamilySize'] > 1:
        return 0
    elif df['Freq_Ticket'] > 1:
        return 0
    elif df['Freq_Cabin'] > 1:
        return 0
    elif 1 < df['Freq_Fare'] < max_group:
        return 0
    else:
        return 1


df['Single'] = df.apply(get_single, axis=1)
df['Single'] = df['Single'].astype('int')
对超大团体进行特殊化(大团体可能出现相顾无暇的情况)
def get_freq_ticket(df):
    """Flag very large ticket groups: code 1 for groups of size 7 or 11,
    code 2 for size 8, 0 for everything else."""
    group_size = df['Freq_Ticket']
    if group_size in (7, 11):
        return 1
    if group_size == 8:
        return 2
    return 0


df['Big_Ticket'] = df.apply(get_freq_ticket, axis=1)
测试集中的人所在团体如果有获救的给以奖励(特殊化)
# Per-ticket "reward" flag: 1 when a ticket group contains at least one
# test-set passenger (Survived is NaN), has more than one member, and has
# at least one known survivor — i.e. test passengers whose group survived.
def get_reward(survived):
    """`survived`: the Survived series of one ticket group (NaN = test row)."""
    group_nan = survived.isnull().sum()  # members coming from the test set
    Single = survived.shape[0]           # total group size
    group_sum = np.sum(survived)         # known survivors in the group
    if (group_nan > 0 and Single > 1 and group_sum > 0):
        return 1
    else:
        return 0


rewards = df[['Ticket','Survived']].groupby('Ticket')['Survived'].apply(get_reward).to_frame()
rewards.columns = ['Reward']
df = df.merge(rewards, left_on='Ticket', right_index=True, how='left')
由于决定使用决策树模型,所以高维稀疏的独热编码容易引入较多噪声(相关性过大),只将性别进行独热编码,并去掉无用特征
# Encode Sex as a survival-rate-ordered integer (tree-friendly; avoids the
# high-dimensional sparsity of one-hot for tree models), then drop the raw
# and intermediate columns no longer needed after feature engineering.
df = dp.data_replace_sort_by_mean(train_df, df, ['Sex'], label)
df = df.drop(['Name','Ticket','Freq_Ticket','Freq_Cabin','Freq_Fare','Cabin','Age','FamilySize','Parch','SibSp','Fare','Embarked'], axis=1)
- k折交叉验证和随机森林预测
# 1) Fit one interpretable decision tree and render it (feature-insight aid).
# 2) Estimate generalisation with repeated stratified k-fold CV of a random
#    forest. 3) Train on all data and write the submission CSV.
dtc = DecisionTreeClassifier(min_samples_leaf=10)
dtc.fit(train_df.drop(['Survived','PassengerId'],axis=1), train_df['Survived'])
# NOTE(review): class_names='Survived' passes a plain string where
# export_graphviz expects a list of class names; it may be indexed
# character-by-character ('S', 'u', ...) — probably a list like
# ['Died', 'Survived'] was intended; confirm.
dot_data = tree.export_graphviz(dtc, out_file=None,feature_names=train_df.drop(['Survived','PassengerId'],axis=1).columns, class_names='Survived', filled=True, rounded=True, special_characters=True,leaves_parallel=True)
graph = graphviz.Source(dot_data)
graph.render("my_dtc_of_survived")

# Matrices for sklearn: features without the target and the row id.
train_X = train_df.drop(['Survived','PassengerId'],axis=1).values
train_y = train_df['Survived'].values
test_X = test_df.drop(['Survived','PassengerId'],axis=1).values

num_folds = 7
num_repeats = 5
# NOTE(review): shuffle=True without random_state makes each repeat use a
# different split — which is what the repeat loop relies on — but also makes
# the reported scores non-reproducible between runs.
skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
rf = RandomForestClassifier(random_state=0)
acc_scores = []
f1_scores = []
# Repeated stratified k-fold: num_repeats * num_folds fits in total.
for i in range(num_repeats):
    for train_idx, test_idx in skf.split(train_X, train_y):
        train_X_cv = train_X[train_idx]
        test_X_cv = train_X[test_idx]
        train_y_cv = train_y[train_idx]
        test_y_cv = train_y[test_idx]
        rf.fit(train_X_cv, train_y_cv)
        y_pred_cv = rf.predict(test_X_cv)
        acc_scores.append(accuracy_score(test_y_cv, y_pred_cv))
        f1_scores.append(f1_score(test_y_cv, y_pred_cv))
acc_scores_mean = np.mean(acc_scores)
acc_scores_std = np.std(acc_scores)
f1_scores_mean = np.mean(f1_scores)
f1_scores_std = np.std(f1_scores)
print('CV summary for %s repeats on %s splits:'%(num_repeats, skf.n_splits))
print('accuracy score: %s +/- %s'%(acc_scores_mean, acc_scores_std))
print('f1 score: %s +/- %s'%(f1_scores_mean, f1_scores_std))
# Final model trained on the full training set; write the submission file.
rf = RandomForestClassifier(n_estimators=10,random_state=0)
rf.fit(train_X, train_y)
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': rf.predict(test_X).astype(int)
})
# Sanity check: overall predicted survival rate (Python-2 print statement,
# consistent with the rest of this script).
print np.mean(submission['Survived'])
submission.to_csv(path+'submission.csv', index=False)
生存决策分类树
- 最终成绩和总结
通过整个流程可以发现,越是在危难时刻,绅士精神越能体现的如此惊人,而金钱也决定着你的生存概率,Money is winner!
机器学习感悟:决策树模型不适合使用高维稀疏独热编码,可以将无序特征用概率来转化,另外生成单特征的决策树对选取特征有很大帮助!决策树的可解释性也对于特征工程有很大帮助!
阅读全文
0 0
- Titanic: Machine Learning from Disaster
- Titanic: Machine Learning from Disaster
- Titanic: Machine Learning from Disaster
- Titanic: Machine Learning from Disaster
- Titanic : Machine Learning from Disaster
- Kaggle Titanic: Machine Learning from Disaster
- Kaggle | Titanic: Machine Learning from Disaster
- Kaggle之Titanic: Machine Learning from Disaster
- kaggle: Titanic: Machine Learning from Disaster
- kaggle competition 之 Titanic: Machine Learning from Disaster
- Titanic: Machine Learning from Disaster(Kaggle 数据挖掘竞赛)
- 【Kaggle练习赛】之Titanic: Machine Learning from Disaster
- Kaggle Titanic: Machine Learning from Disaster 一种思路
- Titanic: Machine Learning from Disaster——Linear regression
- Titanic: Machine Learning from Disaster——Logistic regression
- Titanic: Machine Learning from Disaster——Improving submission
- Titanic: Machine Learning from Disaster——总结
- Kaggle比赛经验总结之Titanic: Machine Learning from Disaster
- Redis数据结构-链表
- ZigBee CC2530 Z-Stack 27 终端设备低功耗模式与电池寿命1-理论
- 将“hello world” 输出为“world hello”
- POJ
- C语言实现矩阵的转置
- Titanic: Machine Learning from Disaster
- mysql 存储过程(临时表、循环、游标综合运用)
- 07_
- WindowsError的错误代码详解
- UVALive 7272 Promotions【拓扑排序】【bitset】
- Eclipse 修改背景图片(不是修改背景颜色)
- cartogarpher slam 3
- python实现最小二乘法(转)
- Codeforces-868C