import pandas as pdimport matplotlib.pyplot as pltimport numpy as np%matplotlib inlinetrain_data = pd.read_csv('train.csv')
count_survivors = pd.value_counts(train_data['Survived'])count_survivors.plot(kind='bar')plt.xlabel('Is_survived')plt.ylabel('Number of People')plt.title('Survivor histogram')


from sklearn.preprocessing import StandardScaler

# Encode sex as a binary feature.
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})

# Impute missing ages with the mean of the *observed* ages. Series.mean()
# skips NaN; the previous version replaced NaN with 0 before averaging,
# which dragged the imputed value below the true mean.
age_avg = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_avg)

# Standardise the numeric features to zero mean / unit variance. Each column
# gets its own scaler; ravel() turns the (n, 1) result back into a 1-D column.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    scaled = StandardScaler().fit_transform(
        train_data[column].values.astype(float).reshape(-1, 1))
    train_data[column] = scaled.ravel()

# Encode the embarkation port as an integer code and fill missing ports with
# the most frequent one. The cast to int keeps the dtype the same whether or
# not any value was missing.
embarked = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
train_data['Embarked'] = embarked.fillna(embarked.mode().iloc[0]).astype(int)

# Drop the columns that are not used as model features.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

# Split the frame into features (X) and target (Y). The boolean column masks
# keep both results as DataFrames, so Y stays a single-column frame.
# .loc replaces the deprecated (and later removed) .ix indexer.
is_target = train_data.columns == 'Survived'
X = train_data.loc[:, ~is_target]
Y = train_data.loc[:, is_target]

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import KFold


def getBestC(X, Y):
    """Select the inverse regularisation strength C that maximises mean recall.

    For every candidate C an L1-penalised logistic regression is trained and
    scored with 5-fold cross-validation (contiguous, unshuffled folds). The
    per-fold recall and the mean recall are printed as the search progresses.

    Args:
        X: pandas DataFrame of features, one row per sample.
        Y: pandas DataFrame or Series holding the binary target for the
            same rows as ``X``.

    Returns:
        The candidate C with the highest mean recall. Ties are resolved in
        favour of the smallest C, because candidates are tried in ascending
        order and the first maximum wins.
    """
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # Materialise the folds once so every C is scored on identical splits.
    folds = list(KFold(n_splits=5).split(X))

    # One row per candidate. The previous index, range(len(c_param_range), 2),
    # was an empty range and only worked because pandas silently re-indexed
    # the frame on the first column assignment.
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    for row, c_param in enumerate(c_param_range):
        print('******** c_param = %.2f ********' % c_param)

        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(folds, start=1):
            # liblinear supports the L1 penalty; naming it explicitly keeps
            # the model working on releases where the default solver differs.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # ravel() hands the estimator a 1-D target vector.
            lr.fit(X.iloc[train_idx].values, Y.iloc[train_idx].values.ravel())
            Y_hat = lr.predict(X.iloc[test_idx].values)

            recall_acc = recall_score(Y.iloc[test_idx].values.ravel(), Y_hat)
            recall_accs.append(recall_acc)
            print('Iteration %d: recall score = %f' % (iteration, recall_acc))

        mean_recall = np.mean(recall_accs)
        results_table.loc[row, 'Mean recall score'] = mean_recall
        print('\nMean recall score %f\n' % mean_recall)

    # The score column has object dtype, so cast it before taking the argmax.
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']
    print('--------------------------------\nbest_c = %.2f' % best_c)
    return best_c

# Run the cross-validated search over C; the per-fold recall scores are
# printed and the best-scoring C is kept for the final models.
best_c = getBestC(X, Y)
******** c_param = 0.01 ********
Iteration 1: recall score = 0.000000
Iteration 2: recall score = 0.000000
Iteration 3: recall score = 0.000000
Iteration 4: recall score = 0.000000
Iteration 5: recall score = 0.000000

Mean recall score 0.000000

******** c_param = 0.10 ********
Iteration 1: recall score = 0.694915
Iteration 2: recall score = 0.683544
Iteration 3: recall score = 0.681159
Iteration 4: recall score = 0.583333
Iteration 5: recall score = 0.698413

Mean recall score 0.668273

******** c_param = 1.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.710145
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.746032

Mean recall score 0.701604

******** c_param = 10.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905

Mean recall score 0.710576

******** c_param = 100.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905

Mean recall score 0.710576

--------------------------------
best_c = 10.00
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Args:
        cm: 2-D array of counts, indexed as ``cm[true_label, predicted_label]``
            to match the axis labels set below.
        classes: Sequence of class labels used as tick labels on both axes.
        title: Title of the plot.
        cmap: Matplotlib colormap used for the cells.
    """
    # Imported here so the function does not depend on a module-level import
    # that only happens later in the file.
    import itertools

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half the maximum count get white text, the rest black, so
    # the number stays readable against the cell colour.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
import itertools

# Refit on the full training set with the selected C. liblinear is named
# explicitly because it supports the L1 penalty; ravel() gives fit() a 1-D
# target vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat = lr.predict(X.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(Y, Y_hat)

# Recall = true positives / (false negatives + true positives), read from the
# positive-class row of the confusion matrix.
false_negatives = cnf_matrix[1, 0]
true_positives = cnf_matrix[1, 1]
print("Recall value in training dataset: %f"
      % (1.0 * true_positives / (false_negatives + true_positives)))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Recall value in training dataset: 0.710526


# Refit on the full training set and inspect how the decision threshold on
# the predicted survival probability trades off against recall.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]

# One confusion matrix per threshold, laid out on a 3x3 grid.
plt.figure(figsize=(10, 10))
for panel, threshold in enumerate(thresholds, start=1):
    # Column 1 of predict_proba is the probability of the positive class.
    Y_hat = (Y_hat_proba[:, 1] > threshold).astype(int)
    plt.subplot(3, 3, panel)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(Y, Y_hat)
    recall = 1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    print("Recall value in training dataset: %f, with threshold = %.1f"
          % (recall, threshold))

    # Plot non-normalized confusion matrix. The title uses '>' because the
    # comparison above is strict.
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold > %s' % threshold)
Recall value in training dataset: 0.938596, with threshold = 0.1
Recall value in training dataset: 0.850877, with threshold = 0.2
Recall value in training dataset: 0.824561, with threshold = 0.3
Recall value in training dataset: 0.757310, with threshold = 0.4
Recall value in training dataset: 0.710526, with threshold = 0.5
Recall value in training dataset: 0.646199, with threshold = 0.6
Recall value in training dataset: 0.532164, with threshold = 0.7
Recall value in training dataset: 0.371345, with threshold = 0.8
Recall value in training dataset: 0.204678, with threshold = 0.9



# Load the test set; test.csv is read from the current working directory.
test_data = pd.read_csv('test.csv')

# Encode sex as a binary feature.
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

# Impute missing ages and fares with the mean of the *observed* values.
# Series.mean() skips NaN; the previous version replaced NaN with 0 before
# averaging, which biased the imputed value downward.
age_avg = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(age_avg)
fare_avg = test_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(fare_avg)

# Standardise the numeric features to zero mean / unit variance.
# NOTE(review): the scalers are fit on the test data itself, so test features
# are standardised with different statistics than the training features the
# model was fit on. Consider reusing scalers fitted on the training set.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    scaled = StandardScaler().fit_transform(
        test_data[column].values.astype(float).reshape(-1, 1))
    test_data[column] = scaled.ravel()

# Encode the embarkation port as an integer code and fill missing ports with
# the most frequent one. The cast to int keeps the dtype the same whether or
# not any value was missing.
embarked = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
test_data['Embarked'] = embarked.fillna(embarked.mode().iloc[0]).astype(int)

# PassengerId is kept here because it is needed for the results file.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
   PassengerId  Pclass  Sex        Age      SibSp      Parch       Fare  Embarked
0          892       3    1   0.428099  -0.499470  -0.400248  -0.498403         3
1          893       3    0   1.399492   0.616992  -0.400248  -0.513271         1
2          894       2    1   2.565163  -0.499470  -0.400248  -0.465085         3
3          895       3    1  -0.154736  -0.499470  -0.400248  -0.483463         1
4          896       3    0  -0.543293   0.616992   0.619896  -0.418468         1
   Survived  Pclass  Sex        Age      SibSp      Parch       Fare  Embarked
0         0       3    1  -0.494245   0.432793  -0.473674  -0.502445       1.0
1         1       1    0   0.717307   0.432793  -0.473674   0.786845       2.0
2         1       3    0  -0.191357  -0.474545  -0.473674  -0.488854       1.0
3         1       1    0   0.490141   0.432793  -0.473674   0.420730       1.0
4         0       3    1   0.490141  -0.474545  -0.473674  -0.486337       1.0
# Feature matrix for prediction: every test column except the passenger
# identifier, which is not a model input.
X_test = test_data.drop(columns=['PassengerId'])
   Pclass  Sex        Age      SibSp      Parch       Fare  Embarked
0       3    1   0.428099  -0.499470  -0.400248  -0.498403         3
1       3    0   1.399492   0.616992  -0.400248  -0.513271         1
2       2    1   2.565163  -0.499470  -0.400248  -0.465085         3
3       3    1  -0.154736  -0.499470  -0.400248  -0.483463         1
4       3    0  -0.543293   0.616992   0.619896  -0.418468         1
# Fit the final model on the full training set. liblinear is named explicitly
# because it supports the L1 penalty; ravel() gives fit() a 1-D target vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X_test.values)

# Label a passenger as survived only when the predicted probability of the
# positive class (column 1) is strictly above 0.6.
Y_hat = [1 if y > 0.6 else 0 for y in Y_hat_proba[:, 1]]

results = pd.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])

# index=False keeps the auto-generated row index out of the file, so the CSV
# contains exactly the PassengerId and Survived columns.
# NOTE(review): assumes downstream consumers expect only these two columns.
results.to_csv('results.csv', index=False)