集成算法-xgboost/bagging/voting

来源:互联网 发布:中国农村淘宝代购店 编辑:程序博客网 时间:2024/06/03 17:27
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Ensemble-learning experiment on smart-meter fault data: xgboost / voting.

Pipeline (the first two stages were executed once and are kept below as
commented-out history, as in the original script):

  1. raw-data preprocessing   -> zhejiang-new0620-reprocess.csv
  2. modelling preprocessing  -> zhejiang-bagging-data.csv
  3. (active) one-hot encoding of categoricals + z-score standardisation
  4. (active) xgboost fit + classification report; decision-tree / knn /
     bagging variants kept as commented alternatives; final hard-voting
     ensemble of decision tree + knn + xgboost.
"""
import pandas as pd
import numpy as np
from numpy import NaN
from dateutil.parser import parse

# ---------------------------------------------------------------------------
# Stage 1 -- raw-data preprocessing (commented out: already executed once)
# ---------------------------------------------------------------------------
# dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d')
# data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-new0620.csv',
#                    dtype={'"EQUIP_ID"': object, 'FAULT_TYPE': object,
#                           'INST_DATE': object, 'DETECT_DATE': object,
#                           'FAULT_DATE': object, 'SYNC_ORG_NO': object,
#                           'ORG_NO': object, 'ORG_NAME': object,
#                           'SORT_CODE': object, 'SPEC_CODE': object,
#                           'COMM_MODE': object, 'ARRIVE_BATCH_NO': object,
#                           'MANUFACTURER': object},
#                    date_parser=dateparse)
# data.drop('ORG_NAME', axis=1, inplace=True)
# data = data.drop_duplicates()            # remove duplicate rows
# data = data.dropna(how='all')            # fix: the original discarded the result
# # 1. date handling: parse the three date columns, extract months, derive
# #    service time (install -> fault) and storage time (detect -> install)
# data['FAULT_DATE1'] = pd.to_datetime(data['FAULT_DATE'].str.strip().str.split(' ').str[0])
# data['INST_DATE1'] = pd.to_datetime(data['INST_DATE'].str.strip().str.split(' ').str[0])
# data['DETECT_DATE1'] = pd.to_datetime(data['DETECT_DATE'].str.strip().str.split(' ').str[0], errors='coerce')
# data['FAULT_MONTH'] = [x.month for x in data['FAULT_DATE1']]
# data['INST_MONTH'] = [x.month for x in data['INST_DATE1']]
# data['DETECT_MONTH'] = [x.month for x in data['DETECT_DATE1']]
# data['work_days'] = data['FAULT_DATE1'] - data['INST_DATE1']
# data['work_months'] = [x.days / 30 if not pd.isnull(x) else np.nan for x in data['work_days']]
# data['save_days'] = data['INST_DATE1'] - data['DETECT_DATE1']
# data['save_months'] = [x.days / 30 if not pd.isnull(x) else np.nan for x in data['save_days']]
# # 2. exploratory analysis / filtering (seaborn countplots elided): keep only
# #    fault class '04' except sub-code '0412' and the bare '04' code, and keep
# #    only smart meters (SORT_CODE == '10'); ORG_NO truncated to 5 characters.
# data['FAULT_TYPE'] = data['FAULT_TYPE'].str.strip()
# data['FAULT_TYPE_2'] = [x[0:2] for x in data['FAULT_TYPE'].values.astype('str')]
# data['FAULT_TYPE_4'] = [x[0:4] for x in data['FAULT_TYPE'].values.astype('str')]
# data = data[data['FAULT_TYPE_2'] == '04']
# data = data[(data['FAULT_TYPE_4'] != '0412') & (data['FAULT_TYPE'] != '04')]
# data['SORT_CODE'] = data['SORT_CODE'].str.strip()
# data = data[data['SORT_CODE'] == '10']
# data['ORG_NO1'] = [x[:5] for x in data['ORG_NO'].values.astype('str')]
# # 3. drop helper columns; 4. save
# data.drop(['FAULT_DATE', 'SYNC_ORG_NO', 'INST_DATE', 'DETECT_DATE',
#            'work_days', 'save_days', 'FAULT_TYPE_2'], axis=1, inplace=True)
# data.to_csv(r'E:\7 Python\data\20170616\zhejiang-new0620-reprocess.csv', index=False)

# ---------------------------------------------------------------------------
# Stage 2 -- modelling preprocessing (commented out: already executed once).
# Drops unused attributes and repairs missing / negative duration columns.
# ---------------------------------------------------------------------------
# data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-new0620-reprocess.csv')
# data = data.drop_duplicates()
# data.drop(['"EQUIP_ID"', 'FAULT_TYPE', 'ORG_NO', 'SORT_CODE', 'COMM_MODE',
#            'ARRIVE_BATCH_NO', 'MANUFACTURER', 'FAULT_DATE1',
#            'INST_DATE1', 'DETECT_DATE1', 'INST_MONTH',
#            'DETECT_MONTH'], axis=1, inplace=True)
# # For each duration column: negative values -> NaN, then fill missing values
# # with random integers drawn from [mean - std, mean + std).
# for col in ('work_months', 'save_months'):
#     data.loc[data[col] < 0, col] = NaN
#     n_missing = data[col].isnull().sum()
#     col_mean, col_std = data[col].mean(), data[col].std()
#     # fix: the original drew save_months from (std - mean, mean + std) --
#     # the lower bound's operands were reversed
#     data.loc[data[col].isnull(), col] = np.random.randint(
#         col_mean - col_std, col_mean + col_std, size=n_missing)
# data.to_csv(r'E:\7 Python\data\20170616\zhejiang-bagging-data.csv', index=False)

# ---------------------------------------------------------------------------
# Stage 3 -- load modelling data, encode categoricals, standardise numerics
# ---------------------------------------------------------------------------
data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-bagging-data.csv')
print(data.info())

# Target: the 4-character fault sub-code, handled as a string label.
data['FAULT_TYPE_4'] = data['FAULT_TYPE_4'].astype(str)
data_y = data['FAULT_TYPE_4']

# Categorical features: force to string so pd.get_dummies one-hot encodes
# them.  (The original converted SPEC_CODE twice; once is enough.)
for cat_col in ('SPEC_CODE', 'FAULT_MONTH', 'ORG_NO1'):
    data[cat_col] = data[cat_col].astype(str)
    print(data[cat_col].value_counts())
    print(pd.get_dummies(data[cat_col], prefix=cat_col).head())

# Fix: the original built data_X from the raw frame *before* the one-hot
# encoding and standardisation below, so the models were fed unencoded object
# columns -- and get_dummies(data) also leaked the target into the features.
# Encode the feature columns only, with the target excluded.
features = data.drop(['FAULT_TYPE_4'], axis=1)
data_dummy = pd.get_dummies(features)
print(data_dummy.head())
print(data_dummy.isnull().sum())

# Numerical features (work_months / save_months): z-score standardisation.
numeric_cols = features.columns[features.dtypes != 'object']
print(numeric_cols)
numeric_col_means = data_dummy.loc[:, numeric_cols].mean()
numeric_col_std = data_dummy.loc[:, numeric_cols].std()
data_dummy.loc[:, numeric_cols] = (
    (data_dummy.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std)
data_X = data_dummy

# ---------------------------------------------------------------------------
# Stage 4 -- modelling
# ---------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Train / test split, fixed seed for reproducibility.
train, test, train_y, test_y = train_test_split(
    data_X, data_y, test_size=0.33, random_state=27)

# xgboost
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(learning_rate=0.2,
                            n_estimators=720,
                            max_depth=9,
                            colsample_bytree=0.8,
                            subsample=0.9,
                            objective='multi:softprob',
                            min_child_weight=1,
                            gamma=2,
                            seed=27)
# NOTE: the original built a params dict via get_xgb_params() and set
# num_class = 11 in it, but never passed it anywhere -- the sklearn wrapper
# infers the number of classes from train_y, so the dead code is dropped.
xgb_clf.fit(train, train_y, eval_metric='merror')
xgb_pred = xgb_clf.predict(test)
print(classification_report(test_y, xgb_pred))
print(confusion_matrix(test_y, xgb_pred))

# Alternative single models kept for reference (commented out in original):
#   DecisionTreeClassifier()
#   KNeighborsClassifier(n_neighbors=25)
#   BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10,
#                     max_samples=1.0, max_features=1.0, bootstrap=True)
# each fitted on (train, train_y) and reported with classification_report /
# confusion_matrix against test_y.

# Hard-voting ensemble of decision tree + knn + xgboost.
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

dt_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier(n_neighbors=25)
xgb_member = xgb.XGBClassifier(learning_rate=0.2, n_estimators=720,
                               max_depth=9, colsample_bytree=0.8,
                               subsample=0.9, objective='multi:softprob',
                               min_child_weight=1, gamma=2, seed=27)
voting_clf = VotingClassifier(estimators=[('dt_clf', dt_clf),
                                          ('knn_clf', knn_clf),
                                          ('xgb_clf', xgb_member)])
voting_clf.fit(train, train_y)
voting_pred = voting_clf.predict(test)
print(voting_pred)
print(classification_report(test_y, voting_pred))
print(confusion_matrix(test_y, voting_pred))