Python信贷数据处理与初步分析(ZIP解压)

来源:互联网 发布:python 微信接口 编辑:程序博客网 时间:2024/06/05 03:48
#!/usr/bin/python
# coding=utf-8
'''
Credit data processing and preliminary analysis.

Extracts a zipped loan CSV, prints a quick overview of the data and writes
three summary tables (loan amount per month, loan amount per state, mean
interest rate per grade/term) as CSV files.

@author: lenovo
@software: 3.6 PyCharm
@file: 8W信贷数据处理.py
@time: 20170531
@edition: 1.0
'''
# Imports
from __future__ import division, print_function

import os
import zipfile

import pandas as pd

# Folder that holds the data set (the data ships as a zip archive).
# Raw string so the Windows backslashes are never read as escape sequences.
dataset_path = r'C:\Users\lenovo\Desktop...\dataset'
# Archive file name (mind the extension).
zip_file_name = 'loan.zip'
# CSV expected inside the data set folder once the archive is extracted.
csv_file_name = './loan.csv'
# Folder that receives the summary CSV files.
# NOTE(review): the '...' segment is a placeholder left by the author --
# point this at a real directory before running.
output_path = 'C:/Users/lenovo/Desktop/.../output'


def _extract_if_missing(zip_file_path, csv_file_path, target_dir):
    '''Extract the archive into target_dir unless the CSV already exists.'''
    if os.path.exists(csv_file_path):
        return
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(target_dir)


def _loan_amount_by_month(used_data):
    '''Return a DataFrame (issue_month, loan_amnt) of loan totals per month.'''
    # Group on the monthly period directly and sum only the amount column;
    # summing every column would also try to "add up" the string columns.
    issue_month = used_data['issue_d2'].dt.to_period('M').rename('issue_month')
    return used_data['loan_amnt'].groupby(issue_month).sum().reset_index()


def _loan_amount_by_state(used_data):
    '''Return a DataFrame (addr_state, loan_amnt) of loan totals per state.'''
    return used_data.groupby('addr_state')['loan_amnt'].sum().reset_index()


def _int_rate_by_grade_term(used_data):
    '''Return a DataFrame (grade, term, int_rate) of mean interest rates.'''
    return used_data.groupby(['grade', 'term'])['int_rate'].mean().reset_index()


def run_main(dataset_dir=None, output_dir=None):
    '''
    Main entry point: load the loan data, print previews, write summaries.

    Parameters:
        dataset_dir: folder containing the zip archive (and, after
            extraction, the CSV). Defaults to the module-level dataset_path.
        output_dir: folder for the result CSV files; created when missing.
            Defaults to the module-level output_path.

    Side effects:
        Writes load_amout_by_month.csv, load_amout_by_state.csv and
        intrate_by_grade_term.csv into output_dir, and deletes the CSV in
        dataset_dir afterwards (even when processing fails) to free space.

    Raises:
        Whatever zipfile / pandas / os raise for a missing archive, missing
        columns or an unwritable output folder.
    '''
    if dataset_dir is None:
        dataset_dir = dataset_path
    if output_dir is None:
        output_dir = output_path

    zip_file_path = os.path.join(dataset_dir, zip_file_name)
    csv_file_path = os.path.join(dataset_dir, csv_file_name)

    # Extract the archive when the CSV is not there yet.
    _extract_if_missing(zip_file_path, csv_file_path, dataset_dir)

    try:
        # Load the data.
        raw_data = pd.read_csv(csv_file_path, engine='python')

        # Inspect the data set.
        print('\n数据预览:', raw_data.head())
        print(' \n 数据描述: ')
        print(raw_data.describe())
        print('\n数据集基本信息: ')
        # info() prints by itself and returns None, so it is not wrapped
        # in print().
        raw_data.info()

        # Keep only the columns used below. copy() makes the selection
        # independent of raw_data so adding a column is well defined.
        used_cols = ['loan_amnt', 'term', 'int_rate', 'grade', 'issue_d',
                     'addr_state']
        used_data = raw_data[used_cols].copy()
        print('\n数据预览', used_data.head())

        # Q: loan amount per month.
        print('\n时间序列转换ING')
        # Convert the textual issue date into datetime values.
        used_data['issue_d2'] = pd.to_datetime(used_data['issue_d'])
        print('\n数据预览')
        print(used_data.head())
        print('\n数据基本信息')
        used_data.info()

        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        load_amout_group_by_month_df = _loan_amount_by_month(used_data)
        print('\n按月统计借贷总额预览:', load_amout_group_by_month_df.head())
        load_amout_group_by_month_df.to_csv(
            os.path.join(output_dir, 'load_amout_by_month.csv'), index=False)

        # Q: loan amount per state.
        load_amout_group_by_state_df = _loan_amount_by_state(used_data)
        print('\n按州统计预览', load_amout_group_by_state_df.head())
        load_amout_group_by_state_df.to_csv(
            os.path.join(output_dir, 'load_amout_by_state.csv'), index=False)

        # Q: relation between grade, term and interest rate
        # (mean int_rate for every grade/term pair).
        data_group_by_grade_term_df = _int_rate_by_grade_term(used_data)
        print('\n借贷评级、期限和利率关系预览:',
              data_group_by_grade_term_df.head())
        data_group_by_grade_term_df.to_csv(
            os.path.join(output_dir, 'intrate_by_grade_term.csv'), index=False)
    finally:
        # Remove the CSV to free disk space, whether or not processing
        # succeeded.
        if os.path.exists(csv_file_path):
            os.remove(csv_file_path)


if __name__ == '__main__':
    run_main()

原创粉丝点击