titanic prediction
来源:互联网 发布:美国租车软件 编辑:程序博客网 时间:2024/03/28 18:57
# Imports# pandasimport pandas as pdfrom pandas import Series,DataFrame# numpy, matplotlib, seabornimport numpy as npimport matplotlib.pyplot as pltimport seaborn as snssns.set_style('whitegrid')%matplotlib inline# machine learningfrom sklearn.linear_model import LogisticRegressionfrom sklearn.svm import SVC, LinearSVCfrom sklearn.ensemble import RandomForestClassifierfrom sklearn.neighbors import KNeighborsClassifierfrom sklearn.naive_bayes import GaussianNB
# Load the training and test CSV files as DataFrames, reading the Age
# column as float64 in both.
csv_dtypes = {"Age": np.float64}
titanic_df = pd.read_csv("train.csv", dtype=csv_dtypes)
test_df = pd.read_csv("test.csv", dtype=csv_dtypes)

# Preview the first rows of the training data.
titanic_df.head()
# Remove columns that are not used as features. PassengerId is kept in
# test_df because the submission file needs it.
train_unused = ['PassengerId', 'Name', 'Ticket']
test_unused = ['Name', 'Ticket']
titanic_df = titanic_df.drop(train_unused, axis=1)
test_df = test_df.drop(test_unused, axis=1)
# Embarked
# Missing values are filled in the training set only, using "S" (noted by
# the author as the most frequent value).
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")

# Survival rate per port of embarkation.
sns.factorplot('Embarked', 'Survived', data=titanic_df, size=4, aspect=3)

fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))
sns.countplot(x='Embarked', data=titanic_df, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=titanic_df, order=[1, 0], ax=axis2)

# Mean of Survived for each value of Embarked.
embark_perc = (
    titanic_df[["Embarked", "Survived"]]
    .groupby(['Embarked'], as_index=False)
    .mean()
)
sns.barplot(x='Embarked', y='Survived', data=embark_perc, order=['S', 'C', 'Q'], ax=axis3)

# One-hot encode Embarked and keep only the "C" and "Q" indicators; "S" is
# dropped and acts as the reference category. The alternative considered by
# the author was to drop Embarked entirely.
embark_dummies_titanic = pd.get_dummies(titanic_df['Embarked']).drop(['S'], axis=1)
embark_dummies_test = pd.get_dummies(test_df['Embarked']).drop(['S'], axis=1)

# Attach the indicator columns and remove the original text column.
titanic_df = titanic_df.join(embark_dummies_titanic).drop(['Embarked'], axis=1)
test_df = test_df.join(embark_dummies_test).drop(['Embarked'], axis=1)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))
# Print a column/dtype/non-null summary of both frames, test set first.
for frame in (test_df, titanic_df):
    frame.info()
# Fare
# Fill the missing "Fare" in test_df with the median fare of the test set.
# The result is assigned back to the column: calling fillna(inplace=True)
# on a selected column acts on an intermediate object and is not
# guaranteed to update test_df itself.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Convert from float to int (fractional part is truncated).
titanic_df['Fare'] = titanic_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)

# Fares split by outcome: passengers who did not survive / who survived.
fare_not_survived = titanic_df['Fare'][titanic_df['Survived'] == 0]
fare_survived = titanic_df['Fare'][titanic_df['Survived'] == 1]
# Display the fares of passengers who did not survive (notebook cell output).
fare_not_survived
# Captured output:
# 0 7 4 8 5 8 6 51 7 21 12 8 13 31 14 7 16 29 18 18 20 26 24 21 26 7 27 263 29 7 30 27 33 10 34 82 35 52 37 8 38 18 40 9 41 21 42 7 45 8 46 15 48 21 49 17 50 39 51 7 … 844 8 845 7 846 69 847 7 848 33 850 31 851 7 852 15 854 26 859 7 860 14 861 11 863 69 864 13 867 50 868 9 870 7 872 5 873 9 876 9 877 7 878 7 881 7 882 10 883 10 884 7 885 29 886 13 888 23 890 7 Name: Fare, dtype: int64

# Display the fares of passengers who survived (notebook cell output).
fare_survived
# Captured output:
# 1 71 2 7 3 53 8 11 9 30 10 16 11 26 15 16 17 13 19 7 21 13 22 8 23 35 25 31 28 7 31 146 32 7 36 7 39 11 43 41 44 7 47 7 52 76 53 26 55 35 56 10 58 27 61 80 65 15 66 10 … 809 53 820 93 821 8 823 12 827 37 828 7 829 80 830 14 831 18 835 83 838 56 839 29 842 31 849 89 853 39 855 9 856 164 857 26 858 19 862 25 865 13 866 13 869 11 871 52 874 24 875 7 879 83 880 26 887 30 889 30 Name: Fare, dtype: int64

# Mean and standard deviation of the fare for each group; row 0 holds the
# not-survived value and row 1 the survived value.
average_fare=DataFrame([fare_not_survived.mean(),fare_survived.mean()])
std_fare=DataFrame([fare_not_survived.std(),fare_survived.std()])

# Histogram of training-set fares; the x axis is capped at 100.
titanic_df['Fare'].plot(kind='hist',figsize=(15,3),bins=100,xlim=(titanic_df['Fare'].min(),100))
<matplotlib.axes._subplots.AxesSubplot at 0x7f554fe37b70>
# Display the two summary frames (notebook cell output).
average_fare
std_fare

# Name the index of both frames 'survived', then draw the mean fare per
# group as bars with the standard deviation as error bars.
index_label = ['survived']
average_fare.index.names = index_label
std_fare.index.names = index_label
average_fare.plot(yerr=std_fare, kind='bar', legend=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f554fa62208>
# Age
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('Original Age value_titanic')
axis2.set_title('New Age value_titanic')

# Mean, standard deviation and number of missing ages in the training set.
average_age_titanic = titanic_df['Age'].mean()
std_age_titanic = titanic_df['Age'].std()
number_of_nan_titanic = titanic_df['Age'].isnull().sum()

# Same statistics for the test set.
average_age_test = test_df['Age'].mean()
std_age_test = test_df['Age'].std()
number_of_nan_test = test_df['Age'].isnull().sum()

# One random integer age per missing value, drawn between
# (mean - std) and (mean + std) of the corresponding data set.
rand_1 = np.random.randint(
    average_age_titanic - std_age_titanic,
    average_age_titanic + std_age_titanic,
    size=number_of_nan_titanic,
)
rand_2 = np.random.randint(
    average_age_test - std_age_test,
    average_age_test + std_age_test,
    size=number_of_nan_test,
)

# Plot the original ages (missing values excluded) before imputing.
titanic_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# Fill the missing ages with the generated values. A single .loc call
# writes into the frame itself; the chained form df['Age'][mask] = ...
# assigns through an intermediate object and raises SettingWithCopyWarning.
titanic_df.loc[titanic_df['Age'].isnull(), 'Age'] = rand_1
test_df.loc[test_df['Age'].isnull(), 'Age'] = rand_2

# Convert from float to int now that no NaN remains.
titanic_df['Age'] = titanic_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)

# Plot the ages after imputation.
titanic_df['Age'].hist(bins=70, ax=axis2)
/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:24: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrameSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrameSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy<matplotlib.axes._subplots.AxesSubplot at 0x7f554f9ad518>
# Age (continued)
# Density of Age drawn separately for each value of Survived, to compare
# where the two groups peak.
oldest_age = titanic_df['Age'].max()
facet = sns.FacetGrid(titanic_df, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, oldest_age))
facet.add_legend()
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))<seaborn.axisgrid.FacetGrid at 0x7f554fac10b8>
# Mean of Survived for each distinct age, shown as one bar per age.
age_and_outcome = titanic_df[["Age", "Survived"]]
average_survived_by_age = age_and_outcome.groupby(['Age'], as_index=False).mean()

fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
sns.barplot(x='Age', y='Survived', data=average_survived_by_age)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))<matplotlib.axes._subplots.AxesSubplot at 0x7f554f941a90>
# Cabin
# The author drops this column from both frames because it has many NaN
# values and is therefore not expected to help the prediction.
for frame in (titanic_df, test_df):
    frame.drop('Cabin', axis=1, inplace=True)

titanic_df.head()
# Family
# Replace SibSp and Parch with a single 0/1 column telling whether the
# passenger had any family member on board, to see whether travelling
# with family changes the chance of survival.
# The indicator is computed in one expression: the previous chained form
# df['Family'].loc[mask] = 1 assigned through an intermediate object
# (SettingWithCopyWarning), and its "== 0 -> 0" step changed nothing.
titanic_df['Family'] = ((titanic_df['SibSp'] + titanic_df['Parch']) > 0).astype(int)
test_df['Family'] = ((test_df['SibSp'] + test_df['Parch']) > 0).astype(int)

# Drop SibSp & Parch now that Family replaces them.
titanic_df = titanic_df.drop(['SibSp', 'Parch'], axis=1)
test_df = test_df.drop(['SibSp', 'Parch'], axis=1)

# Plot: passenger count per Family value (left) and mean survival per
# Family value (right).
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))

sns.countplot(x='Family', data=titanic_df, order=[1, 0], ax=axis1)

average_survival_by_Family = (
    titanic_df[['Family', 'Survived']]
    .groupby(['Family'], as_index=False)
    .mean()
)
sns.barplot(x='Family', y='Survived', data=average_survival_by_Family, order=[1, 0], ax=axis2)

# Tick labels follow order=[1, 0]: first "with family", then "alone".
axis1.set_xticklabels(['with family', 'alone'], rotation=0)
/usr/lib/python3/dist-packages/pandas/core/indexing.py:117: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrameSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self._setitem_with_indexer(indexer, value)/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))[<matplotlib.text.Text at 0x7f554f65a8d0>, <matplotlib.text.Text at 0x7f554f6519b0>]
# Sex
# Children (age < ~16) on board seem to have a high chance of survival,
# so passengers are classified as male, female, or child.
def get_person(passenger):
    """Return 'child' when the passenger's age is below 16, else the sex.

    `passenger` unpacks into an (age, sex) pair.
    """
    age, sex = passenger
    return 'child' if age < 16 else sex


age_sex_columns = ['Age', 'Sex']
titanic_df['Person'] = titanic_df[age_sex_columns].apply(get_person, axis=1)
test_df['Person'] = test_df[age_sex_columns].apply(get_person, axis=1)

# The Sex column is no longer needed now that Person exists.
for frame in (titanic_df, test_df):
    frame.drop(['Sex'], axis=1, inplace=True)
# One-hot encode Person for both frames. The three indicator columns are
# renamed positionally to Child/Female/Male, then Male is dropped so that
# only Child and Female are joined onto the frames.
person_labels = ['Child', 'Female', 'Male']

person_dummy = pd.get_dummies(titanic_df['Person'])
person_dummy.columns = person_labels
person_dummy = person_dummy.drop(['Male'], axis=1)

person_dummy_test = pd.get_dummies(test_df['Person'])
person_dummy_test.columns = person_labels
person_dummy_test = person_dummy_test.drop(['Male'], axis=1)

titanic_df = titanic_df.join(person_dummy)
test_df = test_df.join(person_dummy_test)
# Passenger count per Person category (left) and mean survival per
# category (right).
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
sns.countplot(x='Person', data=titanic_df, ax=axis1)

person_and_outcome = titanic_df[['Person', 'Survived']]
person_perc = person_and_outcome.groupby(['Person'], as_index=False).mean()
sns.barplot(
    x='Person',
    y='Survived',
    data=person_perc,
    ax=axis2,
    order=['male', 'female', 'child'],
)
# Captured output of the previous cell:
# /usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))

# Drop Person from both frames; its indicator columns were joined earlier.
for frame in (titanic_df, test_df):
    frame.drop(['Person'], axis=1, inplace=True)
# Pclass
# Mean survival for each passenger class, drawn in the order 1, 2, 3.
class_order = [1, 2, 3]
sns.factorplot('Pclass', 'Survived', order=class_order, data=titanic_df, size=5)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))<seaborn.axisgrid.FacetGrid at 0x7f554f7d8c18>
# One-hot encode Pclass for both frames. The three indicator columns are
# renamed positionally to Class1/Class2/Class3, then Class3 is dropped so
# it acts as the reference category.
pclass_dummies_titanic = pd.get_dummies(titanic_df['Pclass'])
pclass_dummies_titanic.columns = ['Class1', 'Class2', 'Class3']
pclass_dummies_titanic.drop(['Class3'], axis=1, inplace=True)

pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
pclass_dummies_test.columns = ['Class1', 'Class2', 'Class3']
pclass_dummies_test.drop(['Class3'], axis=1, inplace=True)

titanic_df.drop(['Pclass'], axis=1, inplace=True)
test_df.drop(['Pclass'], axis=1, inplace=True)

# DataFrame.join returns a new frame, so the result must be assigned back.
# Without the assignment the Class1/Class2 columns were discarded and all
# passenger-class information was lost from the features.
titanic_df = titanic_df.join(pclass_dummies_titanic)
test_df = test_df.join(pclass_dummies_test)

# Display the resulting test frame (notebook cell output).
test_df
418 rows × 10 columns
# Build the model inputs: the target is the Survived column, the training
# features are every other column of titanic_df, and the test features are
# test_df without PassengerId.
y_train = titanic_df['Survived']
x_train = titanic_df.drop(['Survived'], axis=1)
x_test = test_df.drop('PassengerId', axis=1)
# Logistic regression
# fit() returns the estimator itself, so construction and fitting are chained.
logre = LogisticRegression().fit(x_train, y_train)
y_pred = logre.predict(x_test)

# Accuracy on the training data (not a held-out estimate).
logre.score(x_train, y_train)
0.77104377104377109
# Support Vector Machines
# fit() returns the estimator itself, so construction and fitting are chained.
svc = SVC().fit(x_train, y_train)
Y_pred = svc.predict(x_test)

# Accuracy on the training data (not a held-out estimate).
svc.score(x_train, y_train)
0.88327721661054992
# Random Forest
# 100 trees; fit() returns the estimator itself, so the calls are chained.
random_forest = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
y_pred = random_forest.predict(x_test)

# Accuracy on the training data (not a held-out estimate).
random_forest.score(x_train, y_train)
0.96520763187429859
# Write the submission file: one row per test passenger, holding the
# passenger id and the current contents of y_pred.
submission_columns = {
    "PassengerId": test_df["PassengerId"],
    "Survived": y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('titanic.csv', index=False)
1 0
- titanic prediction
- Titanic
- POJ2354-Titanic
- 【kaggle】Titanic
- Kaggle: Titanic
- kaggle:titanic
- titanic+tensorflow
- kaggle-Titanic
- Game Prediction
- Game Prediction
- Game Prediction
- Caffe Prediction
- branch prediction
- poj1323Game Prediction
- Edge Prediction
- Game Prediction
- hdu5923 Prediction
- Game Prediction
- 使用Gitblit 搭建Windows Git服务器
- 简述Android的四种加载模式
- Keil中的环境变量和使用
- 多线程编程学习总结
- Java多线程讨论 (r)
- titanic prediction
- POJ 2449 Remmarguts' Date(A*+第k短的路)
- 基于Python查看SVD压缩图片的效果
- 51nod oj 1678 lyk与gcd 【容斥定理+打表】
- Codevs 1535 封锁阳光大学
- startActivityForResult
- 堆排序的模板
- UUID 和 UDID的区别
- 关于多线程编程您不知道的 5 件事 (r)