项目——数据预处理2+xgboost+knn

来源:互联网 发布:软件可靠性方法 编辑:程序博客网 时间:2024/06/03 11:18

20170410

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Meter fault-type prediction: preprocessing + one-hot encoding + XGBoost.

Loads fault records from a local CSV, one-hot encodes every categorical
feature into one wide sparse matrix, then trains and evaluates a
multi-class XGBoost classifier.  A KNN baseline is sketched (commented)
at the bottom.
"""
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fclusterdata
from scipy.sparse import coo_matrix, bmat, csc_matrix
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

sns.set_style('whitegrid')

PLOT = False   # draw exploratory plots?
N = 1          # number of results to output
TRAIN = True   # retrain the model (False: load the pickled one)
CV = False     # run xgb.cv to pick n_estimators first

data = pd.read_csv(r'E:\7 Python\data\t2.csv')
print(data.head())
print(data.info())
print(data.columns)

data.columns = ['ID', 'FAULT_TYPE_3', 'work_months', 'save_months',
                'FAULT_MONTH', 'INST_MONTH', 'PS_MONTH', 'SYNC_ORG_NO',
                'SPEC_CODE', 'COMM_MODE', 'ARRIVE_BATCH_NO', 'MANUFACTURER',
                'TYPE_CODE', 'EXCHG_TYPE_CODE', 'LC_FLAG', 'TL_SHARE_FLAG',
                'MP_CAP', 'TRADE_CODE', 'ELEC_TYPE_CODE', 'RUN_CAP',
                'CUST_STATUS_CODE', 'TRANSFER_CODE', 'TMP_FLAG']

# Fault-type distribution.
print(data['FAULT_TYPE_3'].value_counts())
fig, axis0 = plt.subplots(1, 1)
sns.countplot(x='FAULT_TYPE_3', data=data, ax=axis0)

print(data['SYNC_ORG_NO'].describe())


def plot_fun(name_fea, name_fault, figsize=None, fontsize=None):
    """Exploratory plots of feature *name_fea* against fault column *name_fault*.

    Draws a raw count plot, a cumulative-percentage curve of the levels,
    two crossed count plots (feature hued by fault and vice versa), and a
    heatmap of per-fault level ratios.
    """
    plt.figure(figsize=figsize)
    fig, axis1 = plt.subplots(1, 1)
    sns.countplot(x=name_fea, data=data, ax=axis1)

    # How much of the data the most frequent levels cover, cumulatively.
    plt.figure(figsize=figsize)
    fig, axis2 = plt.subplots(1, 1)
    counts = data[name_fea].value_counts()
    cum_share = counts.cumsum() / counts.sum()
    axis2.plot(np.arange(cum_share.shape[0]) + 1, cum_share.values * 100)
    axis2.set_title('precent of %s' % name_fea)

    plt.figure(figsize=figsize)
    fig, axis3 = plt.subplots(1, 1)
    sns.countplot(x=name_fea, hue=name_fault, data=data, ax=axis3)
    plt.legend(loc=2)

    plt.figure(figsize=figsize)
    fig, axis4 = plt.subplots(1, 1)
    sns.countplot(x=name_fault, hue=name_fea, data=data, ax=axis4)
    plt.legend(loc=2, fontsize=fontsize)

    # Share of each fault type within every feature level, drawn as a
    # heatmap (clustering disabled on both axes).
    fault_num1 = data.groupby([name_fault, name_fea])[data.columns[0]].count().unstack()
    ratio = fault_num1 / fault_num1.sum()
    sns.clustermap(ratio,
                   cmap=plt.get_cmap('RdBu'),
                   vmax=1,
                   vmin=-1,
                   linewidth=0,
                   figsize=(10, 10),
                   row_cluster=False,
                   col_cluster=False)
    plt.title('fault ratio')


def cluster_encoding(name):
    """Replace the levels of column *name* by hierarchical-cluster labels, in place.

    Levels with similar per-fault-type frequency profiles get the same
    label, shrinking the category cardinality before one-hot encoding.
    """
    global data
    fault_num = data.groupby(['FAULT_TYPE_3', name])[data.columns[0]].count().unstack()
    level_ratio = fault_num / fault_num.sum()
    # One row per category level -> flat cluster labels (threshold 1).
    clusters = fclusterdata(np.array(level_ratio.T), 1)
    print(clusters.shape)
    mapping = {label: idx for label, idx in zip(level_ratio.columns, clusters)}
    data[name] = data[name].map(mapping)


def onehot_pre(name):
    """Label-encode column *name* in place; return the original class list."""
    global data
    le = preprocessing.LabelEncoder()
    le.fit(data[name])
    cat_name = list(le.classes_)
    data[name] = le.transform(data[name])
    return cat_name


def sparse_dummies(name):
    """One-hot encode column *name* and return it as a sparse COO matrix.

    Sparse output keeps the 20-feature horizontal stack below from
    materialising a huge dense frame per feature.
    """
    return coo_matrix(pd.get_dummies(data[name], prefix=name))


if PLOT:
    plot_fun('SYNC_ORG_NO', 'FAULT_TYPE_3')

# Quick looks at the high-cardinality categoricals.
print(data['SPEC_CODE'].value_counts())
print(len(data['MANUFACTURER'].value_counts()))
print(data['COMM_MODE'].value_counts())

# Months become ordered categoricals so their dummies keep calendar order.
data['FAULT_MONTH'] = pd.Categorical(data['FAULT_MONTH'], ordered=True)
data['INST_MONTH'] = pd.Categorical(data['INST_MONTH'], ordered=True)
if PLOT:
    m1 = (data.groupby(['FAULT_MONTH', 'FAULT_TYPE_3']).size().unstack()
          .reindex(index=np.arange(data['FAULT_MONTH'].min(),
                                   data['FAULT_MONTH'].max() + 1)).fillna(0))
    m1.plot(kind='bar', figsize=(12, 12), subplots=True)
    plot_fun('FAULT_MONTH', 'FAULT_TYPE_3', fontsize=1)

    m2 = (data.groupby(['INST_MONTH', 'FAULT_TYPE_3']).size().unstack()
          .reindex(index=np.arange(data['INST_MONTH'].min(),
                                   data['INST_MONTH'].max() + 1)).fillna(0))
    m2.plot(kind='bar', figsize=(12, 12), subplots=True)
    plot_fun('INST_MONTH', 'FAULT_TYPE_3', fontsize=1)

# Storage time / working time in months.
# BUGFIX: these branches referenced the non-existent columns `month` and
# `FAULT_TYPE` (renamed to FAULT_TYPE_3 above) and crashed when PLOT=True.
if PLOT:
    for dur_col in ('save_months', 'work_months'):
        dur_range = np.arange(data[dur_col].min(), data[dur_col].max() + 1)
        data.groupby([dur_col]).size().plot(kind='bar', figsize=(12, 6))
        (data.groupby([dur_col, 'FAULT_TYPE_3']).size().unstack()
         .reindex(index=dur_range).fillna(0)
         .plot(kind='bar', figsize=(12, 12), subplots=True))
        (data.groupby([dur_col, 'SYNC_ORG_NO']).size().unstack()
         .reindex(index=dur_range).fillna(0)
         .plot(kind='bar', figsize=(12, 12), subplots=True))

# ARRIVE_BATCH_NO: collapse rare batches (<300 rows) into one "0" level
# so the one-hot block stays manageable.
print(len(data['ARRIVE_BATCH_NO'].value_counts()))
arr_counts = data['ARRIVE_BATCH_NO'].value_counts()
rare_batches = arr_counts.index[arr_counts.values < 300]
data['ARRIVE_BATCH_NO'] = data['ARRIVE_BATCH_NO'].replace(rare_batches.values, 0)
print(len(data['ARRIVE_BATCH_NO'].value_counts()))
if PLOT:
    # BUGFIX: was plot_fun(..., 'FAULT_TYPE') -- column renamed above.
    plot_fun('ARRIVE_BATCH_NO', 'FAULT_TYPE_3', figsize=(20, 6), fontsize=1)
print(data['ARRIVE_BATCH_NO'].value_counts())

# One-hot encode every categorical feature, stack the sparse blocks
# horizontally, and swap the raw columns for the encoded ones.
CATEGORICAL_COLS = ['SYNC_ORG_NO', 'SPEC_CODE', 'COMM_MODE', 'work_months',
                    'save_months', 'INST_MONTH', 'FAULT_MONTH',
                    'ARRIVE_BATCH_NO', 'MANUFACTURER', 'TYPE_CODE',
                    'EXCHG_TYPE_CODE', 'LC_FLAG', 'TL_SHARE_FLAG', 'MP_CAP',
                    'TRADE_CODE', 'ELEC_TYPE_CODE', 'RUN_CAP',
                    'CUST_STATUS_CODE', 'TRANSFER_CODE', 'TMP_FLAG']
dummy_blocks = [sparse_dummies(col) for col in CATEGORICAL_COLS]
data = data.join(pd.DataFrame(bmat([dummy_blocks]).toarray()))
data.drop(['PS_MONTH', 'ID'] + CATEGORICAL_COLS, axis=1, inplace=True)
del dummy_blocks  # free the sparse blocks before training
print(data)

# ---------------- model training ----------------
data_X = csc_matrix(data.drop(['FAULT_TYPE_3'], axis=1))
le = preprocessing.LabelEncoder()
data_y = le.fit_transform(data['FAULT_TYPE_3'])

train_X, test_X, train_y, test_y = train_test_split(
    data_X, data_y, test_size=0.33, random_state=27)

clf = xgb.XGBClassifier(
    learning_rate=0.2,
    n_estimators=720,
    max_depth=9,
    colsample_bytree=0.8,
    subsample=0.9,
    objective='multi:softprob',
    min_child_weight=1,
    gamma=2,
    seed=27)

if CV:
    param = clf.get_xgb_params()
    param['num_class'] = 11
    # Cross-validate on the training split only so the test rows stay unseen.
    dtrain = xgb.DMatrix(train_X, train_y)
    cvresult = xgb.cv(param, dtrain, num_boost_round=2000, nfold=3,
                      stratified=True, metrics='merror',
                      early_stopping_rounds=10, verbose_eval=True)
    clf.set_params(n_estimators=cvresult.shape[0])  # n_estimators = cv rounds

if TRAIN:
    # BUGFIX: the original fit on the FULL dataset (data_X/data_y) and then
    # evaluated on a subset of it, so the report below overstated accuracy.
    # Fit on the training split only.
    clf.fit(train_X, train_y, eval_metric='merror')
else:
    with open("zhejiang_4_all.pkl", "rb") as fh:
        clf = pickle.load(fh)

ypred_xgb = le.inverse_transform(clf.predict(test_X))
test_y_lbl = le.inverse_transform(test_y)
print(classification_report(test_y_lbl, ypred_xgb))
print(confusion_matrix(test_y_lbl, ypred_xgb))
xgb.plot_importance(clf.booster())
plt.show()

with open("zhejiang_4_all_jiaoliu.pkl", "wb") as fh:
    pickle.dump(clf, fh)

# ---------------- KNN baseline (disabled) ----------------
# from sklearn import neighbors
# from sklearn.model_selection import GridSearchCV
# clf = neighbors.KNeighborsClassifier(25, weights='distance')
# clf.fit(train_X, train_y)
# ypred_knn = le.inverse_transform(clf.predict(test_X))
# print(classification_report(le.inverse_transform(test_y), ypred_knn))
# print(confusion_matrix(le.inverse_transform(test_y), ypred_knn))
# with open("zhejiang_4_KNN.pkl", "wb") as fh:
#     pickle.dump(clf, fh)