特征工程
来源:互联网 发布:windows路径长度 编辑:程序博客网 时间:2024/05/08 02:19
In [1]:
import pandas as pdimport numpy as np%matplotlib inline
In [2]:
# Load the training and test sets from their CSV files.
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
In [3]:
# Show the (rows, columns) shape of the train and test frames.
train.shape, test.shape
Out[3]:
In [4]:
# Show the dtype pandas inferred for each column of the training frame.
train.dtypes
Out[4]:
In [5]:
# Preview the first five rows of the training frame.
train.head(5)
Out[5]:
In [6]:
# Tag every row with its origin, then stack both sets into one frame so the
# transformations below are applied to train and test alike. The 'source'
# column is what allows the two sets to be separated again afterwards.
train['source'] = 'train'
test['source'] = 'test'
data = pd.concat([train, test], ignore_index=True)
data.shape
Out[6]:
In [7]:
# Count the missing values in each column (vectorized; no per-column Python loop).
data.isnull().sum()
Out[7]:
In [8]:
# Categorical columns whose value distributions are inspected.
var = ['Gender', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form',
       'Device_Type', 'Var2', 'Source']
for v in var:
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print('\n%s这一列数据的不同取值和出现的次数\n' % v)
    print(data[v].value_counts())
In [9]:
# Number of distinct City values; dropna=False keeps NaN counted as one value,
# matching what len(Series.unique()) reports.
data['City'].nunique(dropna=False)
Out[9]:
In [10]:
# Remove the City column from the combined frame in place.
data.drop('City',axis=1,inplace=True)
In [11]:
# Preview the first rows of the DOB column to see its raw format.
data['DOB'].head()
Out[11]:
In [12]:
# Create an Age column: the last two characters of DOB are parsed as an integer
# and subtracted from 115.
# NOTE(review): presumably DOB ends in a two-digit birth year in the 1900s and
# 115 stands for the year 2015 — confirm against the data.
# NOTE(review): a missing DOB would make x[-2:] fail; confirm DOB has no NaN.
data['Age'] = data['DOB'].apply(lambda x: 115 - int(x[-2:]))
data['Age'].head()
Out[12]:
In [13]:
# Drop the original DOB column.
data.drop('DOB', axis=1, inplace=True)
In [14]:
# Box plot of EMI_Loan_Submitted; return_type='axes' makes boxplot return the
# matplotlib axes object.
data.boxplot(column=['EMI_Loan_Submitted'],return_type='axes')
Out[14]:
In [15]:
# Many values appear to be missing, so add an indicator column instead:
# 1 where EMI_Loan_Submitted is missing, 0 otherwise.
data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].isnull().astype(int)
data[['EMI_Loan_Submitted', 'EMI_Loan_Submitted_Missing']].head(10)
Out[15]:
In [16]:
# The original column is no longer needed.
data.drop('EMI_Loan_Submitted', axis=1, inplace=True)
In [17]:
# Number of distinct non-missing Employer_Name values (NaN excluded, as
# value_counts() excludes it by default).
data['Employer_Name'].nunique()
Out[17]:
In [18]:
# Drop the Employer_Name column.
data.drop('Employer_Name', axis=1, inplace=True)
In [19]:
# Box plot of Existing_EMI; return_type='axes' makes boxplot return the
# matplotlib axes object.
data.boxplot(column='Existing_EMI',return_type='axes')
Out[19]:
In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for Existing_EMI.
data['Existing_EMI'].describe()
Out[21]:
In [22]:
# Few values are missing, so fill them with 0.
# NOTE(review): the original comment said "replace with the mean", but the code
# has always filled with 0 — confirm 0 is the intended fill value.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column, which is not guaranteed to update `data`.
data['Existing_EMI'] = data['Existing_EMI'].fillna(0)
In [23]:
# Box plot of Interest_Rate; return_type='axes' makes boxplot return the
# matplotlib axes object.
data.boxplot(column=['Interest_Rate'],return_type='axes')
Out[23]:
In [24]:
# Too many values are missing, so add an indicator column instead:
# 1 where Interest_Rate is missing, 0 otherwise.
data['Interest_Rate_Missing'] = data['Interest_Rate'].isnull().astype(int)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(data[['Interest_Rate', 'Interest_Rate_Missing']].head(10))
In [25]:
# Remove the Interest_Rate column from the combined frame in place.
data.drop('Interest_Rate',axis=1,inplace=True)
In [26]:
# Not needed — drop Lead_Creation_Date.
data.drop('Lead_Creation_Date', axis=1, inplace=True)
data.head()
Out[26]:
In [27]:
# Fill the (few) missing values with each column's median.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column, which is not guaranteed to update `data`.
for col in ['Loan_Amount_Applied', 'Loan_Tenure_Applied']:
    data[col] = data[col].fillna(data[col].median())
In [28]:
# Preview the first five rows of the combined frame.
data.head()
Out[28]:
In [29]:
# Too many values are missing, so record only whether each value is missing:
# 1 where the source column is NaN, 0 otherwise.
data['Loan_Amount_Submitted_Missing'] = data['Loan_Amount_Submitted'].isnull().astype(int)
data['Loan_Tenure_Submitted_Missing'] = data['Loan_Tenure_Submitted'].isnull().astype(int)
In [45]:
# Preview the first five rows of the combined frame.
data.head()
Out[45]:
In [30]:
# The original columns are no longer useful.
data.drop(['Loan_Amount_Submitted', 'Loan_Tenure_Submitted'], axis=1, inplace=True)
In [31]:
# No clear use for this column yet, so drop it.
data.drop('LoggedIn', axis=1, inplace=True)
In [32]:
# May involve many different banks, so drop this column as well.
data.drop('Salary_Account', axis=1, inplace=True)
In [33]:
# Same treatment as before: record only whether the value is missing
# (1 where Processing_Fee is NaN, 0 otherwise).
data['Processing_Fee_Missing'] = data['Processing_Fee'].isnull().astype(int)
# The old column is no longer needed.
data.drop('Processing_Fee', axis=1, inplace=True)
In [34]:
# Keep 'S122' and 'S133' as they are and collapse every other Source value
# into a single 'others' bucket.
data['Source'] = data['Source'].apply(lambda x: 'others' if x not in ['S122', 'S133'] else x)
data['Source'].value_counts()
Out[34]:
In [35]:
# Preview the first five rows of the combined frame.
data.head()
Out[35]:
In [36]:
# Summary statistics for the columns of the combined frame.
data.describe()
Out[36]:
In [37]:
# Re-count the missing values in each column (vectorized; no per-column Python loop).
data.isnull().sum()
Out[37]:
In [38]:
# Show the dtype of each column of the combined frame.
data.dtypes
Out[38]:
In [39]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column's values with integer codes. The same
# encoder object is refit for every column.
le = LabelEncoder()
var_to_encode = ['Device_Type', 'Filled_Form', 'Gender', 'Var1', 'Var2',
                 'Mobile_Verified', 'Source']
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])
In [40]:
# Preview the first five rows of the combined frame.
data.head()
Out[40]:
In [41]:
# Show the dtype of each column of the combined frame.
data.dtypes
Out[41]:
In [42]:
# One-hot encode the columns listed in var_to_encode, then list the
# resulting column names.
data = pd.get_dummies(data, columns=var_to_encode)
data.columns
Out[42]:
In [43]:
# Split the combined frame back into train and test using the 'source' tag.
# .copy() gives each set its own data, so modifying train/test later does not
# act on a slice of `data` (and does not raise SettingWithCopyWarning).
train = data.loc[data['source'] == 'train'].copy()
test = data.loc[data['source'] == 'test'].copy()
In [44]:
# Remove the helper 'source' tag from both sets; test additionally loses the
# 'Disbursed' column. Reassigning (rather than inplace=True) avoids mutating a
# frame that may be a slice of `data`.
train = train.drop('source', axis=1)
test = test.drop(['source', 'Disbursed'], axis=1)
In [45]:
# Write the processed sets to CSV without the row index.
train.to_csv('train_modified.csv', index=False)
test.to_csv('test_modified.csv', index=False)
In [ ]:
0 0
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 特征工程
- 第十六周项目1-(6)验证算法选择排序之堆排序
- JDK1.7 sun.net.ftp.FtpClient
- 奇怪的Java题:1000 == 1000为false,而100 == 100会为true?
- iOS 加载Bundle文件
- iOS系统库头文件中NS_AVAILABLE相关
- 特征工程
- 为何Java中子类重写方法的访问权限不能低于父类中权限
- 基于nginx容器的动态流量管理方案
- 第十四周项目4—Floyd算法验证
- 几张GIF看ConstraintLayout的特新
- caffe中 solver.prototxt文件
- Spring@Autowired注解与自动装配
- 第16周项目1 验证算法(7)归并排序
- LR-Linux-top详解