逻辑回归实战 — Kaggle_Titanic 2
来源:互联网 发布:vb if 不等于 编辑:程序博客网 时间:2024/06/06 04:04
数据来源:https://www.kaggle.com/c/titanic
Training
import pandas as pdimport matplotlib.pyplot as pltimport numpy as np%matplotlib inlinetrain_data = pd.read_csv('train.csv')
# Bar chart of how many passengers fall into each value of 'Survived'.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which newer pandas releases deprecate; the resulting counts are the same.
count_survivors = train_data['Survived'].value_counts()
count_survivors.plot(kind='bar')
plt.xlabel('Is_survived')
plt.ylabel('Number of People')
plt.title('Survivor histogram')
from sklearn.preprocessing import StandardScaler

# Encode the categorical columns as numbers.
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})
train_data['Embarked'] = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})

# Impute missing ages with the mean of the *known* ages.  Series.mean() skips
# NaN; the earlier version replaced NaN with 0 before averaging, which counted
# every missing age as a newborn and dragged the imputed value down.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

# Impute missing embarkation ports with the most frequent known port.
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Standardise the numeric features (zero mean, unit variance).  Casting to
# float64 first avoids StandardScaler's int64 -> float64 DataConversionWarning.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    values = train_data[column].values.astype('float64').reshape(-1, 1)
    train_data[column] = StandardScaler().fit_transform(values).ravel()

# Drop the columns that are not used as model features.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
# Output of the preprocessing cell (Python 2.7 run):
# c:\python27\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler. warnings.warn(msg, DataConversionWarning)

# Split the frame into the feature matrix X (every column except 'Survived')
# and the label frame Y (only 'Survived', kept as a one-column DataFrame).
# .loc replaces .ix, which pandas deprecated and later removed.
X = train_data.loc[:, train_data.columns != 'Survived']
Y = train_data.loc[:, train_data.columns == 'Survived']
# Output of the previous cell (Python 2.7 run):
# c:\python27\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: .ix is deprecated.
# Please use .loc for label based indexing or .iloc for positional indexing
# See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
# “”“Entry point for launching an IPython kernel.

from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# model_selection is its replacement.
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, confusion_matrix


def getBestC(X, Y):
    """Return the inverse regularisation strength C with the best mean recall.

    For each candidate C an L1-penalised logistic regression is evaluated
    with 5-fold cross-validation (contiguous folds, no shuffling) and the
    recall of every fold is printed.

    X -- DataFrame of features, one row per sample.
    Y -- labels aligned with X (one-column DataFrame or Series).

    Returns the candidate C whose mean recall over the folds is highest; on a
    tie the candidate listed first wins.
    """
    c_param_range = [0.01, 0.1, 1, 10, 100]
    folds = KFold(n_splits=5)
    # Flatten the labels: sklearn expects y with shape (n_samples,).
    labels = np.ravel(Y.values)

    # One row per candidate C.  (The index used to be range(len(...), 2),
    # which is an empty range.)
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    for i, c_param in enumerate(c_param_range):
        print('******** c_param = %.2f ********' % c_param)
        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(folds.split(X), start=1):
            # liblinear is named explicitly because it supports the L1
            # penalty; newer scikit-learn defaults to a solver that does not.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(X.iloc[train_idx].values, labels[train_idx])
            Y_hat = lr.predict(X.iloc[test_idx].values)
            recall_acc = recall_score(labels[test_idx], Y_hat)
            recall_accs.append(recall_acc)
            print('Iteration %d: recall score = %f' % (iteration, recall_acc))
        mean_recall = np.mean(recall_accs)
        results_table.loc[i, 'Mean recall score'] = mean_recall
        print('\nMean recall score %f\n' % mean_recall)

    # The score column holds Python objects, so cast before taking idxmax.
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']
    print('--------------------------------\nbest_c = %.2f' % best_c)
    return best_c
# Output of the previous cell (Python 2.7 run):
# c:\python27\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18
# in favor of the model_selection module into which all the refactored classes and functions are moved.
# Also note that the interface of the new CV iterators are different from that of this module.
# This module will be removed in 0.20. “This module will be removed in 0.20.”, DeprecationWarning)

# Choose the regularisation strength by cross-validated recall on the training set.
best_c = getBestC(X, Y)
******** c_param = 0.01 ********Iteration 1: recall score = 0.000000Iteration 2: recall score = 0.000000Iteration 3: recall score = 0.000000Iteration 4: recall score = 0.000000Iteration 5: recall score = 0.000000Mean recall score 0.000000******** c_param = 0.10 ********Iteration 1: recall score = 0.694915Iteration 2: recall score = 0.683544Iteration 3: recall score = 0.681159Iteration 4: recall score = 0.583333Iteration 5: recall score = 0.698413Mean recall score 0.668273******** c_param = 1.00 ********Iteration 1: recall score = 0.745763Iteration 2: recall score = 0.708861Iteration 3: recall score = 0.710145Iteration 4: recall score = 0.597222Iteration 5: recall score = 0.746032Mean recall score 0.701604******** c_param = 10.00 ********Iteration 1: recall score = 0.745763Iteration 2: recall score = 0.708861Iteration 3: recall score = 0.739130Iteration 4: recall score = 0.597222Iteration 5: recall score = 0.761905Mean recall score 0.710576******** c_param = 100.00 ********Iteration 1: recall score = 0.745763Iteration 2: recall score = 0.708861Iteration 3: recall score = 0.739130Iteration 4: recall score = 0.597222Iteration 5: recall score = 0.761905Mean recall score 0.710576--------------------------------best_c = 10.00c:\python27\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)c:\python27\lib\site-packages\ipykernel_launcher.py:25: DeprecationWarning: .ix is deprecated. Please use.loc for label based indexing or.iloc for positional indexingSee the documentation here:http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw the confusion matrix `cm` as an annotated heat map on the current axes.

    cm      -- 2-D array of counts; rows are drawn along the 'True label'
               axis and columns along the 'Predicted label' axis.
    classes -- tick labels, one per row/column of `cm`.
    title   -- title placed above the plot.
    cmap    -- matplotlib colormap used for the cells.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having imported itertools before it is called.
    import itertools

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Write each count into its cell; use white text on the darker half of
    # the colour scale so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
import itertools

# Refit on the whole training set with the selected C.  liblinear is named
# explicitly because it supports the L1 penalty, and the labels are flattened
# to the 1-D shape sklearn expects.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat = lr.predict(X.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(Y, Y_hat)
# Recall = true positives / (false negatives + true positives).
print("Recall value in training dataset: %f"
      % (1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])))

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Recall value in training dataset: 0.710526
# Refit on the whole training set and keep the predicted probabilities so the
# decision threshold can be varied.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]

# One confusion matrix per threshold, laid out on a 3x3 grid.
plt.figure(figsize=(10, 10))
for position, threshold in enumerate(thresholds, start=1):
    # Predict class 1 when its probability is strictly above the threshold.
    Y_hat = Y_hat_proba[:, 1] > threshold
    plt.subplot(3, 3, position)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(Y, Y_hat)
    recall = 1.0 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    print("Recall value in training dataset: %f, with threshold = %.1f" % (recall, threshold))

    # Plot non-normalized confusion matrix.  The title uses '>' to match the
    # strict comparison above (it previously read '>=').
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold > %s' % threshold)
Recall value in training dataset: 0.938596, with threshold = 0.1Recall value in training dataset: 0.850877, with threshold = 0.2Recall value in training dataset: 0.824561, with threshold = 0.3Recall value in training dataset: 0.757310, with threshold = 0.4Recall value in training dataset: 0.710526, with threshold = 0.5Recall value in training dataset: 0.646199, with threshold = 0.6Recall value in training dataset: 0.532164, with threshold = 0.7Recall value in training dataset: 0.371345, with threshold = 0.8Recall value in training dataset: 0.204678, with threshold = 0.9
Testing
test_data = pd.read_csv('test.csv')

# Encode the categorical columns with the same mappings used for training.
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})
test_data['Embarked'] = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})

# Impute missing Age / Fare values with the mean of the known values.
# Series.mean() skips NaN; the earlier version replaced NaN with 0 before
# averaging, which biased the imputed value downward.
for column in ['Age', 'Fare']:
    test_data[column] = test_data[column].fillna(test_data[column].mean())

# Impute missing embarkation ports with the most frequent known port.
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode()[0])

# NOTE(review): each scaler is re-fitted on the test set, so the test features
# are standardised with test-set statistics rather than the statistics used
# when the model was trained.  Reusing scalers fitted on the training data
# would keep both sets on the same scale -- confirm and change if desired.
for column in ['Age', 'SibSp', 'Parch', 'Fare']:
    values = test_data[column].values.astype('float64').reshape(-1, 1)
    test_data[column] = StandardScaler().fit_transform(values).ravel()

# PassengerId is kept here because the submission file needs it.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
# Preview both frames to check that their feature columns line up.
test_data.head()
train_data.head()

# Feature matrix for prediction: everything except the passenger identifier.
X_test = test_data.drop(columns=['PassengerId'])
X_test.head()
# Train on the full training set and predict survival for the test passengers.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X.values, Y.values.ravel())
Y_hat_proba = lr.predict_proba(X_test.values)

# Predict "survived" only when the class-1 probability exceeds 0.6.
Y_hat = (Y_hat_proba[:, 1] > 0.6).astype(int)

results = pd.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])

# index=False: without it the DataFrame index is written as an extra unnamed
# first column, so the file would not be just PassengerId,Survived.
results.to_csv('results.csv', index=False)
阅读全文
0 0
- 逻辑回归实战 — Kaggle_Titanic 2
- 逻辑回归实战 — Kaggle_Titanic
- 机器学习实战—逻辑回归
- 机器学习实战-逻辑回归
- 线性回归与逻辑回归实战
- 机器学习实战(四)——logisticRegression逻辑回归
- PYTHON机器学习实战——逻辑回归
- 机器学习理论与实战:逻辑回归
- Python机器学习实战之逻辑回归
- 机器学习实战【4】(逻辑回归)
- python之实战----逻辑回归战iris
- MXnet代码实战之多类逻辑回归
- 机器学习实战(5)逻辑回归
- 逻辑回归(2)
- 深度学习2线性回归,逻辑回归
- 逻辑回归 — Logistic Regression
- 机器学习理论与实战(四)逻辑回归
- 机器学习理论与实战(四)逻辑回归
- docker 安装tensorflow
- 逻辑的计算进路--从莱布尼茨到图灵的逻辑发展
- ResourceBundle 的一种使用方法
- HTTP协议之URL
- IT的道德与伦理
- 逻辑回归实战 — Kaggle_Titanic 2
- banner的使用。
- 【读书笔记】iOS-iOS定位
- linux学习笔记
- python基础(二)基础语法
- Android Wifi模块
- jquery正则表达式写法
- 使用微信做系统监控告警的脚本
- 科学中的直觉和反直觉