逻辑回归实战 — Kaggle_Titanic 2

来源：互联网  发布：vb if 不等于  编辑：程序博客网  时间：2024/06/06 04:04

数据来源：https://www.kaggle.com/c/titanic

Training

import pandas as pdimport matplotlib.pyplot as pltimport numpy as np%matplotlib inlinetrain_data = pd.read_csv('train.csv')

# Bar chart of how often each value of the 'Survived' column occurs in the
# training data. Series.value_counts() is used instead of the deprecated
# top-level pd.value_counts() helper; the resulting counts are the same.
count_survivors = train_data['Survived'].value_counts()
count_survivors.plot(kind='bar')
plt.xlabel('Is_survived')
plt.ylabel('Number of People')
plt.title('Survivor histogram')

survivor_hist

from sklearn.preprocessing import StandardScaler

# Encode the categorical 'Sex' column as an integer feature.
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})

# Impute missing ages with the mean of the *observed* ages. Series.mean()
# skips NaN. The previous version replaced NaN with 0 before averaging, which
# divided by the full row count and pulled the imputed value towards zero.
age_avg = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_avg)

# Standardise each numeric feature independently (zero mean, unit variance).
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    # Cast to float first so StandardScaler does not emit a
    # DataConversionWarning for the integer columns, and flatten the (n, 1)
    # result back to one dimension before storing it as a column.
    column_values = train_data[column].values.astype('float64').reshape(-1, 1)
    train_data[column] = StandardScaler().fit_transform(column_values).ravel()

# Encode the port of embarkation as an integer code; values missing from the
# mapping (including NaN) become NaN and are then filled with the most
# frequent code.
train_data['Embarked'] = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Drop the columns that are not used as model features.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

c:\python27\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler. warnings.warn(msg, DataConversionWarning)

# Split the prepared frame into the feature matrix X (every column except the
# label) and the label Y. The deprecated .ix indexer is replaced with plain
# column selection. Y stays a one-column DataFrame, as before, so code that
# uses Y.values / Y.iloc keeps working.
X = train_data.drop(columns=['Survived'])
Y = train_data[['Survived']]

c:\python27\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, confusion_matrix


def getBestC(X, Y):
    """Select the regularisation parameter C with the best mean recall.

    For each candidate C an L1-penalised logistic regression is evaluated
    with 5-fold cross-validation (contiguous, unshuffled folds). The recall
    of every fold and the mean per candidate are printed as the search runs.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    Y : pandas.DataFrame or pandas.Series
        Labels aligned row-for-row with ``X``. A one-column frame is
        accepted; it is flattened before fitting.

    Returns
    -------
    float
        The candidate C with the highest mean recall. On ties the first
        (smallest) candidate wins.
    """
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # sklearn.cross_validation was removed in scikit-learn 0.20. The
    # model_selection KFold produces the same unshuffled folds. They are
    # materialised once so that every candidate C sees identical splits.
    folds = list(KFold(n_splits=5).split(X))

    mean_recalls = []
    for c_param in c_param_range:
        print('******** c_param = %.2f ********' % c_param)

        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(folds, start=1):
            # The L1 penalty needs a solver that supports it; newer
            # scikit-learn defaults to lbfgs, which does not.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # ravel() hands the estimator a 1-d label vector even when Y is
            # a one-column frame, avoiding the column-vector warning.
            lr.fit(X.iloc[train_idx].values, np.ravel(Y.iloc[train_idx].values))
            Y_hat = lr.predict(X.iloc[test_idx].values)
            recall_acc = recall_score(np.ravel(Y.iloc[test_idx].values), Y_hat)
            recall_accs.append(recall_acc)
            print('Iteration %d: recall score = %f' % (iteration, recall_acc))

        mean_recalls.append(np.mean(recall_accs))
        print('\nMean recall score %f\n' % mean_recalls[-1])

    # Build the summary table in one go. The previous version created it
    # with index=range(len(c_param_range), 2), which is an empty range, and
    # then filled it through the deprecated .ix indexer.
    results_table = pd.DataFrame(
        {'C_parameter': c_param_range, 'Mean recall score': mean_recalls},
        columns=['C_parameter', 'Mean recall score'],
    )
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']
    print('--------------------------------\nbest_c = %.2f' % best_c)
    return best_c

c:\python27\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

# Run the cross-validated search over C on the training features and labels;
# the per-fold recall scores are printed as the search runs.
best_c = getBestC(X, Y)

******** c_param = 0.01 ********
Iteration 1: recall score = 0.000000
Iteration 2: recall score = 0.000000
Iteration 3: recall score = 0.000000
Iteration 4: recall score = 0.000000
Iteration 5: recall score = 0.000000

Mean recall score 0.000000

******** c_param = 0.10 ********
Iteration 1: recall score = 0.694915
Iteration 2: recall score = 0.683544
Iteration 3: recall score = 0.681159
Iteration 4: recall score = 0.583333
Iteration 5: recall score = 0.698413

Mean recall score 0.668273

******** c_param = 1.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.710145
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.746032

Mean recall score 0.701604

******** c_param = 10.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905

Mean recall score 0.710576

******** c_param = 100.00 ********
Iteration 1: recall score = 0.745763
Iteration 2: recall score = 0.708861
Iteration 3: recall score = 0.739130
Iteration 4: recall score = 0.597222
Iteration 5: recall score = 0.761905

Mean recall score 0.710576

--------------------------------
best_c = 10.00

c:\python27\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
c:\python27\lib\site-packages\ipykernel_launcher.py:25: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    The figure is not created or shown here; the caller is expected to set
    up the figure/subplot beforehand and call ``plt.show()`` if needed.

    Parameters
    ----------
    cm : 2-d array
        Confusion matrix; it is indexed as ``cm[i, j]`` and must provide
        ``.shape`` and ``.max()``. Rows are labelled 'True label' and
        columns 'Predicted label'.
    classes : sequence
        Tick labels, used for both axes.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    # Imported locally so the function does not depend on another cell
    # having imported itertools before it is called.
    import itertools

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get white text and the rest
    # black, so the numbers stay readable against the cell colour.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

import itertools  # used by plot_confusion_matrix

# Refit on the whole training set with the selected C. The solver is pinned
# because the L1 penalty is not supported by the default solver of newer
# scikit-learn releases, and the labels are flattened so the estimator gets a
# 1-d vector instead of a column vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, np.ravel(Y.values))
Y_hat = lr.predict(X.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(Y, Y_hat)

# Recall taken from the label-1 row of the matrix:
# cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]).
# The single-argument print(...) form runs on both Python 2 and Python 3.
print("Recall value in training dataset: %f"
      % (1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

Recall value in training dataset: 0.710526

confusion_matrix

# Fit once on the whole training set, then examine how the decision threshold
# applied to the predicted probability of class 1 changes the recall.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, np.ravel(Y.values))
Y_hat_proba = lr.predict_proba(X.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

plt.figure(figsize=(10, 10))

# One subplot per threshold in a 3x3 grid; subplot positions are 1-based.
for position, threshold in enumerate(thresholds, start=1):
    # Predict class 1 only when its probability strictly exceeds the
    # threshold. Casting to int keeps the predictions the same type as the
    # 0/1 labels they are compared against.
    Y_hat = (Y_hat_proba[:, 1] > threshold).astype(int)

    plt.subplot(3, 3, position)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(Y, Y_hat)
    print("Recall value in training dataset: %f, with threshold = %.1f"
          % ((1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])), threshold))

    # Plot non-normalized confusion matrix. The title now says '>' to match
    # the strict comparison above (it previously claimed '>=').
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold > %s' % threshold)

Recall value in training dataset: 0.938596, with threshold = 0.1
Recall value in training dataset: 0.850877, with threshold = 0.2
Recall value in training dataset: 0.824561, with threshold = 0.3
Recall value in training dataset: 0.757310, with threshold = 0.4
Recall value in training dataset: 0.710526, with threshold = 0.5
Recall value in training dataset: 0.646199, with threshold = 0.6
Recall value in training dataset: 0.532164, with threshold = 0.7
Recall value in training dataset: 0.371345, with threshold = 0.8
Recall value in training dataset: 0.204678, with threshold = 0.9

thresholds_cnf

Testing

# Load the test split from the working directory.
test_data = pd.read_csv('test.csv')

# Apply the same feature encoding to the test split as to the training split.
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

# Impute missing values with the mean of the *observed* values. Series.mean()
# skips NaN. The previous version replaced NaN with 0 before averaging, which
# biased the imputed value towards zero.
age_avg = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(age_avg)
fare_avg = test_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(fare_avg)

# NOTE(review): the imputation means and the scalers below are fitted on the
# test split itself rather than reusing the statistics of the training split,
# so the two splits are not standardised with the same parameters. Consider
# keeping the scalers fitted on the training data and calling transform() here.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    # Cast to float first so StandardScaler does not emit a
    # DataConversionWarning for the integer columns, and flatten the (n, 1)
    # result back to one dimension before storing it as a column.
    column_values = test_data[column].values.astype('float64').reshape(-1, 1)
    test_data[column] = StandardScaler().fit_transform(column_values).ravel()

# Encode the port of embarkation as an integer code; values missing from the
# mapping (including NaN) become NaN and are then filled with the most
# frequent code.
test_data['Embarked'] = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode()[0])

# PassengerId is kept at this stage; it is written to the results file later.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])

# Show the first rows of the preprocessed test frame.
test_data.head()

   PassengerId  Pclass  Sex       Age     SibSp     Parch      Fare  Embarked
0          892       3    1  0.428099 -0.499470 -0.400248 -0.498403         3
1          893       3    0  1.399492  0.616992 -0.400248 -0.513271         1
2          894       2    1  2.565163 -0.499470 -0.400248 -0.465085         3
3          895       3    1 -0.154736 -0.499470 -0.400248 -0.483463         1
4          896       3    0 -0.543293  0.616992  0.619896 -0.418468         1

# Show the first rows of the preprocessed training frame for comparison with
# the test frame.
train_data.head()

   Survived  Pclass  Sex       Age     SibSp     Parch      Fare  Embarked
0         0       3    1 -0.494245  0.432793 -0.473674 -0.502445       1.0
1         1       1    0  0.717307  0.432793 -0.473674  0.786845       2.0
2         1       3    0 -0.191357 -0.474545 -0.473674 -0.488854       1.0
3         1       1    0  0.490141  0.432793 -0.473674  0.420730       1.0
4         0       3    1  0.490141 -0.474545 -0.473674 -0.486337       1.0

# Feature matrix for prediction: the test frame without the PassengerId
# column. test_data itself is left unchanged and still holds the ids.
X_test = test_data.drop(['PassengerId'], axis=1)

# Show the first rows of the test feature matrix.
X_test.head()

   Pclass  Sex       Age     SibSp     Parch      Fare  Embarked
0       3    1  0.428099 -0.499470 -0.400248 -0.498403         3
1       3    0  1.399492  0.616992 -0.400248 -0.513271         1
2       2    1  2.565163 -0.499470 -0.400248 -0.465085         3
3       3    1 -0.154736 -0.499470 -0.400248 -0.483463         1
4       3    0 -0.543293  0.616992  0.619896 -0.418468         1

# Train the final model on the whole training set. The solver is pinned
# because the L1 penalty is not supported by the default solver of newer
# scikit-learn releases, and the labels are flattened to a 1-d vector.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, np.ravel(Y.values))

# Predict class 1 only when its probability is strictly above the 0.6 cut-off.
Y_hat_proba = lr.predict_proba(X_test.values)
Y_hat = [1 if y > 0.6 else 0 for y in Y_hat_proba[:, 1]]

# Two-column result table: the passenger id followed by the prediction.
results = pd.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'].values)

# index=False keeps the row index out of the file. Without it to_csv writes
# an extra unnamed first column, which does not match the expected
# 'PassengerId,Survived' layout of a submission file.
results.to_csv('results.csv', index=False)

阅读全文

0 0