titanic prediction

来源:互联网 发布:美国租车软件 编辑:程序博客网 时间:2024/03/28 18:57
# Imports# pandasimport pandas as pdfrom pandas import Series,DataFrame# numpy, matplotlib, seabornimport numpy as npimport matplotlib.pyplot as pltimport seaborn as snssns.set_style('whitegrid')%matplotlib inline# machine learningfrom sklearn.linear_model import LogisticRegressionfrom sklearn.svm import SVC, LinearSVCfrom sklearn.ensemble import RandomForestClassifierfrom sklearn.neighbors import KNeighborsClassifierfrom sklearn.naive_bayes import GaussianNB
# Load the training and test CSV files as DataFrames, reading Age as float64.
titanic_df = pd.read_csv("train.csv", dtype={"Age": np.float64})
test_df = pd.read_csv("test.csv", dtype={"Age": np.float64})

# Preview the first rows of the training data.
titanic_df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th… female 38 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S
# Drop columns that are not used as features.
# PassengerId is kept in test_df; only titanic_df loses it here.
titanic_df = titanic_df.drop(labels=['PassengerId', 'Name', 'Ticket'], axis=1)
test_df = test_df.drop(labels=['Name', 'Ticket'], axis=1)
# Embarked

# Only titanic_df has missing Embarked values (two of them); fill them with
# the most frequent value, "S".
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")

# Survival rate per port of embarkation.
sns.factorplot('Embarked', 'Survived', data=titanic_df, size=4, aspect=3)

fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))

# Passenger counts per port, and survived / not-survived counts split by port.
sns.countplot(x='Embarked', data=titanic_df, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=titanic_df, order=[1, 0], ax=axis2)

# Mean of Survived for each Embarked value.
embark_perc = titanic_df[["Embarked", "Survived"]].groupby(['Embarked'], as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc, order=['S', 'C', 'Q'], ax=axis3)

# Two options for Embarked:
#   * keep it as dummy variables, dropping "S" and leaving "C" and "Q"
#     (they seem to have a good survival rate), or
#   * drop the column entirely, since logically it does not look useful.
# The first option is used here.
embark_dummies_titanic = pd.get_dummies(titanic_df['Embarked']).drop(['S'], axis=1)
embark_dummies_test = pd.get_dummies(test_df['Embarked']).drop(['S'], axis=1)

# Attach the dummy columns and remove the original categorical column.
titanic_df = titanic_df.join(embark_dummies_titanic).drop(['Embarked'], axis=1)
test_df = test_df.join(embark_dummies_test).drop(['Embarked'], axis=1)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.  warnings.warn(self.msg_depr % (key, alt_key))

这里写图片描述

这里写图片描述

# Print a summary (columns, dtypes, non-null counts) of both DataFrames.
test_df.info()
titanic_df.info()
# Fare

# Fill the missing "Fare" in test_df with the median fare. The result is
# assigned back to the column: calling fillna(inplace=True) on a selected
# column may operate on a temporary copy and leave test_df unchanged.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Convert from float to int.
titanic_df['Fare'] = titanic_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)

# Fares of the passengers who did not survive and of those who survived.
fare_not_survived = titanic_df['Fare'][titanic_df['Survived'] == 0]
fare_survived = titanic_df['Fare'][titanic_df['Survived'] == 1]
# Display the fares of passengers who did not survive.
fare_not_survived
0 7 4 8 5 8 6 51 7 21 12 8 13 31 14 7 16 29 18 18 20 26 24 21 26 7 27 263 29 7 30 27 33 10 34 82 35 52 37 8 38 18 40 9 41 21 42 7 45 8 46 15 48 21 49 17 50 39 51 7 … 844 8 845 7 846 69 847 7 848 33 850 31 851 7 852 15 854 26 859 7 860 14 861 11 863 69 864 13 867 50 868 9 870 7 872 5 873 9 876 9 877 7 878 7 881 7 882 10 883 10 884 7 885 29 886 13 888 23 890 7 Name: Fare, dtype: int64
# Display the fares of passengers who survived.
fare_survived
1 71 2 7 3 53 8 11 9 30 10 16 11 26 15 16 17 13 19 7 21 13 22 8 23 35 25 31 28 7 31 146 32 7 36 7 39 11 43 41 44 7 47 7 52 76 53 26 55 35 56 10 58 27 61 80 65 15 66 10 … 809 53 820 93 821 8 823 12 827 37 828 7 829 80 830 14 831 18 835 83 838 56 839 29 842 31 849 89 853 39 855 9 856 164 857 26 858 19 862 25 865 13 866 13 869 11 871 52 874 24 875 7 879 83 880 26 887 30 889 30 Name: Fare, dtype: int64
# Mean and standard deviation of the fare: row 0 is the passengers who did
# not survive, row 1 is the passengers who survived.
average_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])

# Histogram of all training fares, with the x axis capped at 100.
titanic_df['Fare'].plot(
    kind='hist',
    figsize=(15, 3),
    bins=100,
    xlim=(titanic_df['Fare'].min(), 100),
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f554fe37b70>

这里写图片描述

# Display the mean fare per group (row 0: not survived, row 1: survived).
average_fare
0 0 21.690346 1 47.991228
# Display the fare standard deviation per group (row 0: not survived, row 1: survived).
std_fare
0 0 31.392191 1 66.608344
# Give both frames the same index name so the error bars line up with the bars.
std_fare.index.names = ['survived']
average_fare.index.names = ['survived']

# Bar chart of the mean fare per group, with the standard deviation as error bars.
average_fare.plot(yerr=std_fare, kind='bar', legend=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f554fa62208>

这里写图片描述

# Age

fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('Original Age value_titanic')
axis2.set_title('New Age value_titanic')

# Mean, standard deviation and number of NaN ages in titanic_df.
average_age_titanic = titanic_df['Age'].mean()
std_age_titanic = titanic_df['Age'].std()
number_of_nan_titanic = titanic_df['Age'].isnull().sum()

# Mean, standard deviation and number of NaN ages in test_df.
average_age_test = test_df['Age'].mean()
std_age_test = test_df['Age'].std()
number_of_nan_test = test_df['Age'].isnull().sum()

# One random integer age per missing value, drawn between (mean - std) and
# (mean + std) of the corresponding data set.
rand_1 = np.random.randint(
    average_age_titanic - std_age_titanic,
    average_age_titanic + std_age_titanic,
    size=number_of_nan_titanic,
)
rand_2 = np.random.randint(
    average_age_test - std_age_test,
    average_age_test + std_age_test,
    size=number_of_nan_test,
)

# Plot the original ages (NaN dropped) before they are filled in.
titanic_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# Fill the NaN ages with the generated random ages. A single .loc call
# writes into the DataFrame itself; the previous chained form
# df['Age'][mask] = ... raised SettingWithCopyWarning and may assign to a
# temporary copy instead.
titanic_df.loc[titanic_df['Age'].isnull(), 'Age'] = rand_1
test_df.loc[test_df['Age'].isnull(), 'Age'] = rand_2

# Convert from float to int.
titanic_df['Age'] = titanic_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)

# Plot the ages after filling.
titanic_df['Age'].hist(bins=70, ax=axis2)
/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:24: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrameSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrameSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy<matplotlib.axes._subplots.AxesSubplot at 0x7f554f9ad518>

这里写图片描述

# Age, continued: density of ages for survived / not-survived passengers.
age_upper_limit = titanic_df['Age'].max()
facet = sns.FacetGrid(titanic_df, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, age_upper_limit))
facet.add_legend()
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.  warnings.warn(self.msg_depr % (key, alt_key))<seaborn.axisgrid.FacetGrid at 0x7f554fac10b8>

这里写图片描述

# Mean of Survived for each distinct age, shown as one bar per age.
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
age_and_outcome = titanic_df[["Age", "Survived"]]
average_survived_by_age = age_and_outcome.groupby(['Age'], as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_survived_by_age)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.  warnings.warn(self.msg_depr % (key, alt_key))<matplotlib.axes._subplots.AxesSubplot at 0x7f554f941a90>

这里写图片描述

# Cabin
# The column has a lot of NaN values, so it is dropped instead of being
# used as a feature.
titanic_df = titanic_df.drop('Cabin', axis=1)
test_df = test_df.drop('Cabin', axis=1)

titanic_df.head()
Survived Pclass Sex Age SibSp Parch Fare C Q 0 0 3 male 22 1 0 7 0 0 1 1 1 female 38 1 0 71 1 0 2 1 3 female 26 0 0 7 0 0 3 1 1 female 35 1 0 53 0 0 4 0 3 male 35 0 0 8 0 0
# Family
# Instead of the two columns SibSp and Parch, use a single column that says
# whether the passenger had any family member on board (1) or not (0), to
# see whether having family on board changes the chance of survival.
titanic_df['Family'] = titanic_df['SibSp'] + titanic_df['Parch']
test_df['Family'] = test_df['SibSp'] + test_df['Parch']

# Collapse every positive count to 1. A single .loc call writes into the
# DataFrame itself; the previous chained form df['Family'].loc[mask] = 1
# raised SettingWithCopyWarning and may assign to a temporary copy.
# Rows equal to 0 already hold the wanted value, so they need no assignment.
titanic_df.loc[titanic_df['Family'] > 0, 'Family'] = 1
test_df.loc[test_df['Family'] > 0, 'Family'] = 1

# Drop SibSp and Parch now that Family replaces them.
titanic_df = titanic_df.drop(['SibSp', 'Parch'], axis=1)
test_df = test_df.drop(['SibSp', 'Parch'], axis=1)

# Plots
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))

# Number of passengers with / without family.
sns.countplot(x='Family', data=titanic_df, order=[1, 0], ax=axis1)

# Mean of Survived for passengers with / without family.
average_survival_by_Family = titanic_df[['Family', 'Survived']].groupby(['Family'], as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=average_survival_by_Family, order=[1, 0], ax=axis2)

# Tick labels follow order=[1, 0]: first "with family", then "alone".
axis1.set_xticklabels(['with family', 'alone'], rotation=0)
/usr/lib/python3/dist-packages/pandas/core/indexing.py:117: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrameSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy  self._setitem_with_indexer(indexer, value)/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.  warnings.warn(self.msg_depr % (key, alt_key))[<matplotlib.text.Text at 0x7f554f65a8d0>, <matplotlib.text.Text at 0x7f554f6519b0>]

这里写图片描述

# Sex
# Children (age < ~16) on board seem to have a high chance of survival,
# so passengers are classified as male, female or child.
def get_person(passenger):
    """Return 'child' when the passenger's age is below 16, else the sex.

    ``passenger`` unpacks into ``(age, sex)``.
    """
    age, sex = passenger
    return 'child' if age < 16 else sex


titanic_df['Person'] = titanic_df[['Age', 'Sex']].apply(get_person, axis=1)
test_df['Person'] = test_df[['Age', 'Sex']].apply(get_person, axis=1)

# The Sex column is no longer needed now that Person exists.
titanic_df = titanic_df.drop(['Sex'], axis=1)
test_df = test_df.drop(['Sex'], axis=1)
# One-hot encode Person for the training data, rename the three columns and
# drop the 'Male' column.
person_dummy = pd.get_dummies(titanic_df['Person'])
person_dummy.columns = ['Child', 'Female', 'Male']
person_dummy = person_dummy.drop(['Male'], axis=1)

# Same encoding for the test data.
person_dummy_test = pd.get_dummies(test_df['Person'])
person_dummy_test.columns = ['Child', 'Female', 'Male']
person_dummy_test = person_dummy_test.drop(['Male'], axis=1)

# Attach the Child / Female indicator columns.
titanic_df = titanic_df.join(person_dummy)
test_df = test_df.join(person_dummy_test)
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))

# Number of passengers per Person category.
sns.countplot(x='Person', data=titanic_df, ax=axis1)

# Mean of Survived per Person category.
person_perc = titanic_df[['Person', 'Survived']].groupby(['Person'], as_index=False).mean()
sns.barplot(
    x='Person',
    y='Survived',
    data=person_perc,
    ax=axis2,
    order=['male', 'female', 'child'],
)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))
# Drop the Person column from both DataFrames.
titanic_df = titanic_df.drop(['Person'], axis=1)
test_df = test_df.drop(['Person'], axis=1)
# Pclass
# Survival rate for each passenger class.
sns.factorplot(x='Pclass', y='Survived', order=[1, 2, 3], data=titanic_df, size=5)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.  warnings.warn(self.msg_depr % (key, alt_key))<seaborn.axisgrid.FacetGrid at 0x7f554f7d8c18>

这里写图片描述

# One-hot encode Pclass for the training data, rename the three columns and
# drop the 'Class3' column.
pclass_dummies_titanic = pd.get_dummies(titanic_df['Pclass'])
pclass_dummies_titanic.columns = ['Class1', 'Class2', 'Class3']
pclass_dummies_titanic.drop(['Class3'], axis=1, inplace=True)

# Same encoding for the test data.
pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
pclass_dummies_test.columns = ['Class1', 'Class2', 'Class3']
pclass_dummies_test.drop(['Class3'], axis=1, inplace=True)

# Remove the original Pclass column.
titanic_df.drop(['Pclass'], axis=1, inplace=True)
test_df.drop(['Pclass'], axis=1, inplace=True)

# DataFrame.join returns a new DataFrame, so the result must be assigned
# back. Without the assignment the Class1 / Class2 columns were only
# displayed and never became part of titanic_df / test_df.
titanic_df = titanic_df.join(pclass_dummies_titanic)
test_df = test_df.join(pclass_dummies_test)

# Display the resulting test frame.
test_df
PassengerId Age Fare C Q Family Child Female Class1 Class2 0 892 34 7 0 1 0 0 0 0 0 1 893 47 7 0 0 1 0 1 0 0 2 894 62 9 0 1 0 0 0 0 1 3 895 27 8 0 0 0 0 0 0 0 4 896 22 12 0 0 1 0 1 0 0 5 897 14 9 0 0 0 1 0 0 0 6 898 30 7 0 1 0 0 1 0 0 7 899 26 29 0 0 1 0 0 0 1 8 900 18 7 1 0 0 0 1 0 0 9 901 21 24 0 0 1 0 0 0 0 10 902 30 7 0 0 0 0 0 0 0 11 903 46 26 0 0 0 0 0 1 0 12 904 23 82 0 0 1 0 1 1 0 13 905 63 26 0 0 1 0 0 0 1 14 906 47 61 0 0 1 0 1 1 0 15 907 24 27 1 0 1 0 1 0 1 16 908 35 12 0 1 0 0 0 0 1 17 909 21 7 1 0 0 0 0 0 0 18 910 27 7 0 0 1 0 1 0 0 19 911 45 7 1 0 0 0 1 0 0 20 912 55 59 1 0 1 0 0 1 0 21 913 9 3 0 0 1 1 0 0 0 22 914 21 31 0 0 0 0 1 1 0 23 915 21 61 1 0 1 0 0 1 0 24 916 48 262 1 0 1 0 1 1 0 25 917 50 14 0 0 1 0 0 0 0 26 918 22 61 1 0 1 0 1 1 0 27 919 22 7 1 0 0 0 0 0 0 28 920 41 30 0 0 0 0 0 1 0 29 921 39 21 1 0 1 0 0 0 0 … … … … … … … … … … … 388 1280 21 7 0 1 0 0 0 0 0 389 1281 6 21 0 0 1 1 0 0 0 390 1282 23 93 0 0 0 0 0 1 0 391 1283 51 39 0 0 1 0 1 1 0 392 1284 13 20 0 0 1 1 0 0 0 393 1285 47 10 0 0 0 0 0 0 1 394 1286 29 22 0 0 1 0 0 0 0 395 1287 18 60 0 0 1 0 1 1 0 396 1288 24 7 0 1 0 0 0 0 0 397 1289 48 79 1 0 1 0 1 1 0 398 1290 22 7 0 0 0 0 0 0 0 399 1291 31 7 0 1 0 0 0 0 0 400 1292 30 164 0 0 0 0 1 1 0 401 1293 38 21 0 0 1 0 0 0 1 402 1294 22 59 1 0 1 0 1 1 0 403 1295 17 47 0 0 0 0 0 1 0 404 1296 43 27 1 0 1 0 0 1 0 405 1297 20 13 1 0 0 0 0 0 1 406 1298 23 10 0 0 1 0 0 0 1 407 1299 50 211 1 0 1 0 0 1 0 408 1300 39 7 0 1 0 0 1 0 0 409 1301 3 13 0 0 1 1 0 0 0 410 1302 38 7 0 1 0 0 1 0 0 411 1303 37 90 0 1 1 0 1 1 0 412 1304 28 7 0 0 0 0 1 0 0 413 1305 25 8 0 0 0 0 0 0 0 414 1306 39 108 1 0 0 0 1 1 0 415 1307 38 7 0 0 0 0 0 0 0 416 1308 39 8 0 0 0 0 0 0 0 417 1309 27 22 1 0 1 0 0 0 0

418 rows × 10 columns

# Define the training and testing sets.
# Training target and features: Survived versus every other column.
y_train = titanic_df['Survived']
x_train = titanic_df.drop(['Survived'], axis=1)

# Test features: every column of test_df except PassengerId.
x_test = test_df.drop('PassengerId', axis=1)
# Logistic regression
logre = LogisticRegression()
logre.fit(x_train, y_train)

# Predictions for the test set.
y_pred = logre.predict(x_test)

# Score of the fitted model on the training data.
logre.score(x_train, y_train)
0.77104377104377109
# Support vector machine
svc = SVC()
svc.fit(x_train, y_train)

# NOTE: the predictions are stored in Y_pred (capital Y), a different name
# from the y_pred used by the other models.
Y_pred = svc.predict(x_test)

# Score of the fitted model on the training data.
svc.score(x_train, y_train)
0.88327721661054992
# Random forest with 100 trees
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)

# Predictions for the test set.
y_pred = random_forest.predict(x_test)

# Score of the fitted model on the training data.
random_forest.score(x_train, y_train)
0.96520763187429859
# Build the submission table from the test passenger ids and the current
# y_pred, then write it to titanic.csv without the index column.
submission_columns = {
    "PassengerId": test_df["PassengerId"],
    "Survived": y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('titanic.csv', index=False)
1 0
原创粉丝点击