练手项目:loan_prediction问题

来源:互联网 发布:java 线程同步问题 编辑:程序博客网 时间:2024/06/06 03:16
# -*- coding: utf-8 -*-import numpy as npimport pandas as pdimport matplotlib.pyplot as plt%matplotlib inlinedf = pd.read_csv('D:/my_project/Loan_Prediction/LoanPredictionProblem_train.csv')df.head()
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; } Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status 0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y 1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N 2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y 3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y 4 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y
# 快速数据探索
df.describe() # get summary of numerical variables
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; } ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History count 614.000000 614.000000 592.000000 600.00000 564.000000 mean 5403.459283 1621.245798 146.412162 342.00000 0.842199 std 6109.041673 2926.248369 85.587325 65.12041 0.364878 min 150.000000 0.000000 9.000000 12.00000 0.000000 25% 2877.500000 0.000000 100.000000 360.00000 1.000000 50% 3812.500000 1188.500000 128.000000 360.00000 1.000000 75% 5795.000000 2297.250000 168.000000 360.00000 1.000000 max 81000.000000 41667.000000 700.000000 480.00000 1.000000
# 对于非数值变量(e.g. Property_Area, Credit_History etc.), 观察频率分布是否合理。df['Property_Area'].value_counts()
Semiurban 233 Urban 202 Rural 179 Name: Property_Area, dtype: int64# 分布分析
# 研究变量的分布#1.ApplicantIncomedf['ApplicantIncome'].hist(bins=50)plt.show()
![png](output_5_0.png)
# boxplotdf.boxplot(column='ApplicantIncome')plt.show()
![png](output_6_0.png)
# 观察不同教育程度间的申请者收入情况df.boxplot(column='ApplicantIncome', by='Education')plt.show()
D:\Anaconda2\lib\site-packages\numpy\core\fromnumeric.py:57: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(…) instead return getattr(obj, method)(*args, **kwds)![png](output_7_1.png)
#2.LoanAmountdf['LoanAmount'].hist(bins=50)plt.show()
![png](output_8_0.png)
df.boxplot(column='LoanAmount')plt.show()
![png](output_9_0.png)# 分类变量分析
temp1 = df['Credit_History'].value_counts(ascending=True)temp2 = df.pivot_table(values='Loan_Status', index=['Credit_History'], aggfunc=lambda x: x.map({'Y': 1, 'N': 0}).mean())print 'Frequency Table for Credit History:' print temp1print '\nProbility of getting loan for each Credit History class:' print temp2
Frequency Table for Credit History: 0.0 89 1.0 475 Name: Credit_History, dtype: int64 Probility of getting loan for each Credit History class: Loan_Status Credit_History 0.0 0.078652 1.0 0.795789
import matplotlib.pyplot as pltfig, (ax1, ax2) = plt.subplots(nrows=1,ncols=2,figsize=(10,5))temp1.plot(kind='bar', ax=ax1)ax1.set(title='Probability of getting loan by credit history', xlabel='Credit_History', ylabel='Probability of getting loan')temp2.plot(kind='bar', ax=ax2)ax2.set(title='Probability of getting loan by credit history',xlabel='Credit_History', ylabel='')plt.show()
![png](output_12_0.png)
temp3 = pd.crosstab(df['Credit_History'], df['Loan_Status'])temp3.plot(kind='bar',stacked=True,color=['red','blue'], grid=False)
#check 缺失值df.isnull().sum()
Loan_ID 0 Gender 13 Married 3 Dependents 15 Education 0 Self_Employed 32 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 22 Loan_Amount_Term 14 Credit_History 50 Property_Area 0 Loan_Status 0 dtype: int64
# fill missing valuesfrom scipy.stats import modedf['Gender'].fillna(mode(df['Gender']).mode[0], inplace=True)df['Married'].fillna(mode(df['Married']).mode[0], inplace=True)df['Self_Employed'].fillna(mode(df['Self_Employed']).mode[0], inplace=True)df['Credit_History'].fillna(mode(df['Credit_History']).mode[0], inplace=True)df.isnull().sum()
D:\Anaconda2\lib\site-packages\scipy\stats\stats.py:253: RuntimeWarning: The input array could not be properly checked for nan values. nan values will be ignored. “values. nan values will be ignored.”, RuntimeWarning) Loan_ID 0 Gender 0 Married 0 Dependents 15 Education 0 Self_Employed 0 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 22 Loan_Amount_Term 14 Credit_History 0 Property_Area 0 Loan_Status 0 dtype: int64
# 根据透视表给LoanAmount填充值impute_grps = df.pivot_table(values=['LoanAmount'],index=['Gender','Married','Self_Employed'],aggfunc=np.mean)print impute_grps
LoanAmount Gender Married Self_Employed Female No No 114.691176 Yes 125.800000 Yes No 134.222222 Yes 282.250000 Male No No 129.936937 Yes 180.588235 Yes No 153.882736 Yes 169.395833
for i, row in df.loc[df['LoanAmount'].isnull(),:].iterrows():    ind = tuple([row['Gender'],row['Married'],row['Self_Employed']])    df.loc[i,'LoanAmount'] = impute_grps.loc[ind].values[0]# 检查填充缺失值是否成功df.isnull().sum()
Loan_ID 0 Gender 0 Married 0 Dependents 15 Education 0 Self_Employed 0 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 0 Loan_Amount_Term 14 Credit_History 0 Property_Area 0 Loan_Status 0 dtype: int64
# 对LoanAmount and ApplicantIncome取对数df['LoanAmount'] = np.log(df['LoanAmount'])df['TotalIncome'] = df['ApplicantIncome'] + df['ApplicantIncome']df['TotalIncome_log'] = np.log(df['TotalIncome'])
# 将非数值型变量转换为数值型from sklearn.preprocessing import LabelEncodermod_var = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']le = LabelEncoder()for i in mod_var:    df[i] = le.fit_transform(df[i])df.dtypesprint df.head()
Loan_ID Gender Married Dependents Education Self_Employed \ 0 LP001002 1 0 15 0 0 1 LP001003 1 1 16 0 0 2 LP001005 1 1 15 0 1 3 LP001006 1 1 15 1 0 4 LP001008 1 0 15 0 0 ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term \ 0 5849 0.0 4.867049 360.0 1 4583 1508.0 4.852030 360.0 2 3000 0.0 4.189655 360.0 3 2583 2358.0 4.787492 360.0 4 6000 0.0 4.948760 360.0 Credit_History Property_Area Loan_Status TotalIncome TotalIncome_log 0 1.0 2 1 11698 9.367173 1 1.0 0 0 9166 9.123256 2 1.0 2 1 6000 8.699515 3 1.0 2 1 5166 8.549854 4 1.0 2 1 12000 9.392662 # 构建预测模型
from sklearn import metricsfrom sklearn.model_selection import train_test_splitX = df[['Gender','Married','Dependents','Education','Self_Employed','LoanAmount','Credit_History','Property_Area','TotalIncome_log']]y = df['Loan_Status']train_X, test_X, train_y, test_y = train_test_split(X, y)train_X.shape,train_y.shape,test_X.shape, test_y.shape
((460, 9), (460,), (154, 9), (154,))
#Logistic Regressionfrom sklearn.linear_model import LogisticRegressionlogis = LogisticRegression()logis.fit(train_X, train_y)predicted = logis.predict(test_X)expected = test_ylogis_score_train = logis.score(train_X, train_y)print('Training score:', logis_score_train)logis_score_test = logis.score(test_X, test_y)print('Testing score:', logis_score_test)
(‘Training score:’, 0.80652173913043479) (‘Testing score:’, 0.81818181818181823)
df = df[['Gender','Married','Dependents','Education','Self_Employed','LoanAmount','Credit_History','Property_Area','TotalIncome_log']]coeff_df = pd.DataFrame(df.columns.delete(0))coeff_df.columns = ['Features']coeff_df['Correlation'] = pd.Series(logis.coef_[0])coeff_df.sort_values(by='Correlation', ascending=False)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; } Features Correlation 6 Property_Area 3.079558 1 Dependents 0.553209 4 LoanAmount 0.016641 2 Education -0.022159 7 TotalIncome_log -0.025307 0 Married -0.089583 3 Self_Employed -0.345603 5 Credit_History -0.462348
#Decision Treefrom sklearn.tree import DecisionTreeClassifierdt = DecisionTreeClassifier()dt.fit(train_X, train_y)dt_score_train = dt.score(train_X, train_y)print('Training score: ', dt_score_train)dt_score_test = dt.score(test_X, test_y)print('Testing score: ', dt_score_test)
(‘Training score: ‘, 1.0) (‘Testing score: ‘, 0.69480519480519476)
#Random Forestfrom sklearn.ensemble import RandomForestClassifierrft = RandomForestClassifier()rft.fit(train_X, train_y)rft_score_train = rft.score(train_X, train_y)print('Training score: ', rft_score_train)rft_score_test = rft.score(test_X, test_y)print('Testing score: ', rft_score_test)
(‘Training score: ‘, 0.98478260869565215) (‘Testing score: ‘, 0.75324675324675328)
#Model comparisionmodels = pd.DataFrame({    'model':['Logistic Regression', 'Decision Tree', 'Random Forest'],    'training_score':[logis_score_train,dt_score_train,rft_score_train],    'testing_score':[logis_score_test,dt_score_test,rft_score_test]})models.sort_values(by='testing_score', ascending=True)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; } model testing_score training_score 1 Decision Tree 0.694805 1.000000 2 Random Forest 0.753247 0.984783 0 Logistic Regression 0.818182 0.806522
原创粉丝点击