项目——数据预处理2+xgboost+knn
来源:互联网 发布:软件可靠性方法 编辑:程序博客网 时间:2024/06/03 11:18
20170410
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Fault-type prediction pipeline.

Loads meter fault records from CSV, one-hot encodes the categorical
attributes into sparse matrices (dense dummy frames for the
high-cardinality codes would be too large), then trains and evaluates an
XGBoost multi-class classifier.  A KNN variant existed in an earlier
revision and has been removed as dead code.
"""
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fclusterdata
from scipy.sparse import coo_matrix, bmat
from sklearn import preprocessing

PLOT = False  # whether to draw the exploratory plots
N = 1         # number of results to output

data = pd.read_csv(r'E:\7 Python\data\t2.csv')
print(data.head())
print(data.info())
print(data.columns)
data.columns = ['ID', 'FAULT_TYPE_3', 'work_months', 'save_months',
                'FAULT_MONTH', 'INST_MONTH', 'PS_MONTH', 'SYNC_ORG_NO',
                'SPEC_CODE', 'COMM_MODE', 'ARRIVE_BATCH_NO', 'MANUFACTURER',
                'TYPE_CODE', 'EXCHG_TYPE_CODE', 'LC_FLAG', 'TL_SHARE_FLAG',
                'MP_CAP', 'TRADE_CODE', 'ELEC_TYPE_CODE', 'RUN_CAP',
                'CUST_STATUS_CODE', 'TRANSFER_CODE', 'TMP_FLAG']

# Fault-type distribution plot
print(data['FAULT_TYPE_3'].value_counts())
fig, axis0 = plt.subplots(1, 1)
sns.countplot(x='FAULT_TYPE_3', data=data, ax=axis0)
# plt.show()

# SYNC_ORG_NO
print(data['SYNC_ORG_NO'].describe())


def plot_fun(name_fea, name_fault, figsize=None, fontsize=None):
    """Exploratory plots of feature *name_fea* against label *name_fault*.

    Draws: a count plot of the feature, its cumulative coverage curve,
    count plots of feature-by-label and label-by-feature, and a clustermap
    of per-label fault ratios (clustering disabled, used only as a heatmap).
    """
    plt.figure(figsize=figsize)
    fig, axis1 = plt.subplots(1, 1)
    sns.countplot(x=name_fea, data=data, ax=axis1)

    # Cumulative percentage of rows covered by the k most frequent values.
    plt.figure(figsize=figsize)
    fig, axis2 = plt.subplots(1, 1)
    counts = data[name_fea].value_counts()
    coverage = counts.cumsum() / counts.sum()
    axis2.plot(np.arange(coverage.shape[0]) + 1, coverage.values * 100)
    axis2.set_title('percent of %s' % name_fea)

    plt.figure(figsize=figsize)
    fig, axis3 = plt.subplots(1, 1)
    sns.countplot(x=name_fea, hue=name_fault, data=data, ax=axis3)
    plt.legend(loc=2)

    plt.figure(figsize=figsize)
    fig, axis4 = plt.subplots(1, 1)
    sns.countplot(x=name_fault, hue=name_fea, data=data, ax=axis4)
    plt.legend(loc=2, fontsize=fontsize)

    # Ratio of each feature value within each fault class, shown as a heatmap.
    fault_num1 = data.groupby([name_fault, name_fea])[data.columns[0]].count().unstack()
    ratio = fault_num1 / fault_num1.sum()
    g1 = sns.clustermap(ratio, cmap=plt.get_cmap('RdBu'), vmax=1, vmin=-1,
                        linewidth=0, figsize=(10, 10),
                        row_cluster=False, col_cluster=False)
    plt.title('fault ratio')


def cluster_encoding(name):
    """Re-encode column *name* in place: hierarchically cluster its values by
    their fault-ratio profiles and map each value to its cluster label.

    NOTE(review): currently unused by the pipeline below; kept for the
    high-cardinality columns it was written for.
    """
    global data
    fault_num = data.groupby(['FAULT_TYPE_3', name])[data.columns[0]].count().unstack()
    MAN_ratio = fault_num / fault_num.sum()
    clusters = fclusterdata(np.array(MAN_ratio.T), 1)
    print(clusters.shape)
    clusters_mapping = {label: idx for label, idx in zip(MAN_ratio.columns, clusters)}
    data[name] = data[name].map(clusters_mapping)


def onehot_pre(name):
    """Label-encode column *name* in place and return the original category
    names (in encoded order).

    NOTE(review): currently unused by the pipeline below.
    """
    global data
    le = preprocessing.LabelEncoder()
    le.fit(data[name])
    cat_name = list(le.classes_)
    data[name] = le.transform(data[name])
    return cat_name


# ---------------------------------------------------------------------------
# Per-column preparation
# ---------------------------------------------------------------------------

# SYNC_ORG_NO
if PLOT:
    plot_fun('SYNC_ORG_NO', 'FAULT_TYPE_3')

# SPEC_CODE
data['SPEC_CODE'].describe()
print(data['SPEC_CODE'].value_counts())

# MANUFACTURER
data['MANUFACTURER'].value_counts()
print(len(data['MANUFACTURER'].value_counts()))

# COMM_MODE
print(data['COMM_MODE'].value_counts())

# FAULT_MONTH: ordered categorical so dummy columns come out in month order.
data['FAULT_MONTH'] = pd.Categorical(data['FAULT_MONTH'], ordered=True)
if PLOT:
    m1 = (data.groupby(['FAULT_MONTH', 'FAULT_TYPE_3']).size().unstack()
              .reindex(index=np.arange(data.FAULT_MONTH.min(),
                                       data.FAULT_MONTH.max() + 1))
              .fillna(0))
    m1.plot(kind='bar', figsize=(12, 12), subplots=True)
    plot_fun('FAULT_MONTH', 'FAULT_TYPE_3', fontsize=1)

# INST_MONTH (installation month)
data['INST_MONTH'] = pd.Categorical(data['INST_MONTH'], ordered=True)
if PLOT:
    # BUG FIX: the reindex range previously used FAULT_MONTH's min/max here
    # (copy-paste from the block above) instead of INST_MONTH's.
    m1 = (data.groupby(['INST_MONTH', 'FAULT_TYPE_3']).size().unstack()
              .reindex(index=np.arange(data.INST_MONTH.min(),
                                       data.INST_MONTH.max() + 1))
              .fillna(0))
    m1.plot(kind='bar', figsize=(12, 12), subplots=True)
    plot_fun('INST_MONTH', 'FAULT_TYPE_3', fontsize=1)

# save_months (time in storage) and work_months (time in service)
for _col in ('save_months', 'work_months'):
    if PLOT:
        c1 = data.groupby([_col]).size()
        c1.plot(kind='bar', figsize=(12, 6))
        # BUG FIX: these plots previously referenced the non-existent
        # data.month column (and, for save_months, a misspelled
        # 'FAULT_TYPE' label) and raised AttributeError when PLOT=True.
        _rng = np.arange(data[_col].min(), data[_col].max() + 1)
        c2 = (data.groupby([_col, 'FAULT_TYPE_3']).size().unstack()
                  .reindex(index=_rng).fillna(0))
        c2.plot(kind='bar', figsize=(12, 12), subplots=True)
        c3 = (data.groupby([_col, 'SYNC_ORG_NO']).size().unstack()
                  .reindex(index=_rng).fillna(0))
        c3.plot(kind='bar', figsize=(12, 12), subplots=True)

# ARRIVE_BATCH_NO: bucket rare batches (<300 occurrences) into a single 0 code
# so the dummy expansion stays tractable.
print(len(data['ARRIVE_BATCH_NO'].value_counts()))
_arr_counts = data['ARRIVE_BATCH_NO'].value_counts()
_arr_rare = _arr_counts.index[_arr_counts.values < 300]
data['ARRIVE_BATCH_NO'] = data['ARRIVE_BATCH_NO'].replace(_arr_rare.values, 0)
print(len(data['ARRIVE_BATCH_NO'].value_counts()))
if PLOT:
    # BUG FIX: label column was misspelled 'FAULT_TYPE'.
    plot_fun('ARRIVE_BATCH_NO', 'FAULT_TYPE_3', figsize=(20, 6), fontsize=1)
print(data['ARRIVE_BATCH_NO'].value_counts())

# ---------------------------------------------------------------------------
# One-hot encode every categorical column into sparse matrices and assemble
# the feature matrix.  One list drives the encoding, the join and the drop,
# replacing twenty hand-written coo_matrix(pd.get_dummies(...)) stanzas that
# had to be kept in sync by hand.
# ---------------------------------------------------------------------------

DUMMY_COLS = ['SYNC_ORG_NO', 'SPEC_CODE', 'COMM_MODE', 'work_months',
              'save_months', 'INST_MONTH', 'FAULT_MONTH', 'ARRIVE_BATCH_NO',
              'MANUFACTURER', 'TYPE_CODE', 'EXCHG_TYPE_CODE', 'LC_FLAG',
              'TL_SHARE_FLAG', 'MP_CAP', 'TRADE_CODE', 'ELEC_TYPE_CODE',
              'RUN_CAP', 'CUST_STATUS_CODE', 'TRANSFER_CODE', 'TMP_FLAG']


def _sparse_dummies(col):
    """One-hot encode *col* and return it as a sparse COO matrix."""
    return coo_matrix(pd.get_dummies(data[col], prefix=col))


_dummies = [_sparse_dummies(c) for c in DUMMY_COLS]
# bmat([[...]]) horizontally concatenates the per-column sparse blocks.
data = data.join(pd.DataFrame(bmat([_dummies]).toarray()))
data.drop(['PS_MONTH', 'ID'] + DUMMY_COLS, axis=1, inplace=True)
del _dummies
print(data)

# ---------------------------------------------------------------------------
# Fault prediction with XGBoost
# ---------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import pickle
from scipy.sparse import csc_matrix

data_X = data.drop(['FAULT_TYPE_3'], axis=1)
data_y = data['FAULT_TYPE_3']
data_X = csc_matrix(data_X)

# Encode the label column as integers 0..num_class-1.
le = preprocessing.LabelEncoder()
data_y = le.fit_transform(data_y)

train, test, train_y, test_y = train_test_split(data_X, data_y,
                                                test_size=0.33,
                                                random_state=27)

import xgboost as xgb

TRAIN = True  # train a fresh model; if False, load the pickled one
CV = False    # tune n_estimators via xgb.cv first

clf = xgb.XGBClassifier(learning_rate=0.2,
                        n_estimators=720,
                        max_depth=9,
                        colsample_bytree=0.8,
                        subsample=0.9,
                        objective='multi:softprob',
                        min_child_weight=1,
                        gamma=2,
                        seed=27)

if CV:
    param = clf.get_xgb_params()
    param['num_class'] = 11
    # BUG FIX: cross-validate on the training fold only (previously the full
    # data set, which leaked the held-out test rows into model selection).
    dtrain = xgb.DMatrix(train, train_y)
    cvresult = xgb.cv(param, dtrain, num_boost_round=2000, nfold=3,
                      stratified=True, metrics='merror',
                      early_stopping_rounds=10, verbose_eval=True)
    clf.set_params(n_estimators=cvresult.shape[0])  # n_estimators = cv rounds

if TRAIN:
    # BUG FIX: previously fit on the full data_X/data_y, then "evaluated" on
    # test — a subset of the training data — making the report meaningless.
    clf.fit(train, train_y, eval_metric='merror')
else:
    with open("zhejiang_4_all.pkl", "rb") as f:
        clf = pickle.load(f)

ypred_xgb = le.inverse_transform(clf.predict(test))
test_y_xgb = le.inverse_transform(test_y)

# Model report
print(classification_report(test_y_xgb, ypred_xgb))
print(confusion_matrix(test_y_xgb, ypred_xgb))
# NOTE(review): clf.booster() is the old sklearn-wrapper accessor; newer
# xgboost versions renamed it to clf.get_booster() — confirm against the
# installed version.
xgb.plot_importance(clf.booster())
plt.show()

with open("zhejiang_4_all_jiaoliu.pkl", "wb") as f:
    pickle.dump(clf, f)
阅读全文
1 0
- 项目——数据预处理2+xgboost+knn
- 项目——数据预处理1
- 项目1:logit,GBM,knn,xgboost准确率测试
- 项目——预处理3
- 项目——预处理4
- SPSS——数据预处理
- 数据竞赛利器 —— xgboost 学习清单
- 分类算法之一——数据预处理
- 机器学习——数据预处理
- 统计分析——数据的预处理
- 数据预处理——One-hot编码
- 数据预处理——One-hot编码
- perl—数据预处理(1)
- 数据预处理(2)
- 数据预处理2
- 数据挖掘中的几个算法——knn
- 机器学习/数据挖掘——kNN分类器
- 数据挖掘算法入门1——knn
- C语言程序设计(33)
- 基于深层神经网络的命名实体识别技术
- docker 常用命令
- 计算器
- Java中String类的方法及说明
- 项目——数据预处理2+xgboost+knn
- <简单>RecyclerView仿新闻头条的频道管理
- 1.Android系统的四大组件
- 正则表达式其实并不难(上)
- 初学Android中用id来进行xml中的控件获取
- ubuntu 下部署 javaWeb 项目配置 jdk环境变量和安装tomcat7
- 通过mark和reset方法重复利用InputStream
- java中Blob数据存入数据库
- java引用类型---阅读笔记