机器学习项目实战之用户流失预警

来源：互联网 | 发布：图像加水印算法 | 编辑：程序博客网 | 时间：2024/05/22 13:36

from __future__ import divisionimport pandas as pdimport numpy as npchurn_df = pd.read_csv("D:\\test\\machineLearning\\churn.csv")col_names = churn_df.columns.tolist()print "Column_names:"print col_namesto_show = col_names[:6]+col_names[-6:]print "\nSample_data:"churn_df[to_show].head(3)

Column_names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample_data:

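|   | State | Account Length | Area Code | Phone    | Int'l Plan | VMail Plan | Night Charge | Intl Mins | Intl Calls | Intl Charge | CustServ Calls | Churn? |
|---|-------|----------------|-----------|----------|------------|------------|--------------|-----------|------------|-------------|----------------|--------|
| 0 | KS    | 128            | 415       | 382-4657 | no         | yes        | 11.01        | 10.0      | 3          | 2.70        | 1              | False. |
| 1 | OH    | 107            | 415       | 371-7191 | no         | yes        | 11.45        | 13.7      | 3          | 3.70        | 1              | False. |
| 2 | NJ    | 137            | 415       | 358-1921 | no         | no         | 7.32         | 12.2      | 5          | 3.29        | 0              | False. |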

from sklearn.preprocessing import StandardScaler

# Turn the string label into numbers so it can be analysed.
# "Churn" means customer loss: 1 where the label is 'True.', otherwise 0.
churn_result = churn_df["Churn?"]
y = np.where(churn_result == 'True.', 1, 0)

# Drop the columns that are not used as features (and the label itself).
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# Convert the 'yes'/'no' columns to booleans.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

features = churn_feat_space.columns
# The original script spelled this name "feaures"; keep it bound in case other
# code still refers to the misspelled name.
feaures = features

# DataFrame.as_matrix() and the np.float alias no longer exist in current
# pandas / NumPy releases. `.values` and the builtin `float` give the same
# float64 array and also work on the old versions.
X = churn_feat_space.values.astype(float)

# Key point: the features live on very different numeric ranges (e.g. 1-2 versus
# 30,000-40,000). Such gaps distort the analysis (and plots), so every feature
# is rescaled onto a common scale before modelling.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Each print takes a single pre-formatted argument, so the output is identical
# on Python 2 and Python 3.
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels: %s" % (np.unique(y),))
print(X[0])
print(len(y[y == 0]))

Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
[ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315 1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436 0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202]
2850

# Cross-validation helper: X is the feature data, y the labels, clf_class the
# classifier you choose, and kwargs the parameters passed to that classifier.
def run_cv(X, y, clf_class, **kwargs):
    """Return a cross-validated prediction for every sample.

    The data is split into 5 shuffled folds. For each fold a fresh
    ``clf_class(**kwargs)`` is fitted on the other four folds and used to
    predict the held-out one.

    Args:
        X: Feature array, indexable by an array of row indices.
        y: Label array aligned with ``X``; must provide ``.copy()``.
        clf_class: Classifier class exposing ``fit`` and ``predict``.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        A copy of ``y`` in which every entry has been overwritten by the
        prediction made while that sample was in the held-out fold.
    """
    # sklearn.cross_validation (and the KFold(n, n_folds=...) signature) no
    # longer exists; sklearn.model_selection is its replacement. The import is
    # local so this function neither depends on nor rebinds a module-level
    # ``KFold`` name.
    from sklearn.model_selection import KFold

    # Construct the k-fold splitter (no fixed seed, so folds differ per call).
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()

    # Iterate through folds.
    for train_index, test_index in kf.split(X):
        # Initialize a new classifier for this fold with the keyword arguments.
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_pred[test_index] = clf.predict(X[test_index])
    return y_pred

from sklearn.svm import SVC  # support vector machine
from sklearn.ensemble import RandomForestClassifier as RF  # random forest
from sklearn.neighbors import KNeighborsClassifier as KNN  # k-nearest neighbours


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    # NumPy interprets True and False as 1. and 0., so the mean of the
    # element-wise comparison is the share of matching entries.
    return np.mean(y_true == y_pred)


# Try several classifiers and compare their cross-validated accuracy.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
for title, clf_class in (
    ("Support vector machines:", SVC),
    ("Random forest:", RF),
    ("K-nearest-neighbors:", KNN),
):
    print(title)
    print("%.3f" % accuracy(y, run_cv(X, y, clf_class)))

Support vector machines:
0.913
Random forest:
0.942
K-nearest-neighbors:
0.897

# The accuracy figures above are not very meaningful on their own. What matters
# to the business is the false negatives (FN): cases where the model predicted
# that a customer would NOT churn, but the customer actually did.
# Cross-validation helper: X is the feature data, y the labels, clf_class the
# classifier you choose, and kwargs the parameters passed to that classifier.
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return cross-validated class probabilities for every sample.

    The data is split into 5 shuffled folds. For each fold a fresh
    ``clf_class(**kwargs)`` is fitted on the other four folds and its
    ``predict_proba`` output is stored for the held-out samples.

    Args:
        X: Feature array, indexable by an array of row indices.
        y: Label array aligned with ``X``.
        clf_class: Classifier class exposing ``fit`` and ``predict_proba``.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        Array of shape ``(len(y), 2)`` holding the ``predict_proba`` rows.
    """
    # sklearn.cross_validation (and the KFold(n, n_folds=...) signature) no
    # longer exists; sklearn.model_selection is its replacement. The import is
    # local so this function neither depends on nor rebinds a module-level
    # ``KFold`` name.
    from sklearn.model_selection import KFold

    # Construct the k-fold splitter (no fixed seed, so folds differ per call).
    kf = KFold(n_splits=5, shuffle=True)
    # Two columns are allocated, i.e. a two-class target is assumed.
    y_prob = np.zeros((len(y), 2))

    # Iterate through folds.
    for train_index, test_index in kf.split(X):
        # Initialize a new classifier for this fold with the keyword arguments.
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_prob[test_index] = clf.predict_proba(X[test_index])
    return y_prob

import warnings

# NOTE: this silences every warning raised from here on.
warnings.filterwarnings('ignore')

pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# Column 1 is taken as the churn probability.
# NOTE(review): assumes predict_proba orders its columns as labels [0, 1].
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# How many samples received each distinct predicted probability.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
counts = pd.Series(pred_churn).value_counts()

# Observed churn rate among the samples that share each predicted probability.
# Built once, on the same index as `counts`, instead of re-wrapping a dict in a
# Series on every loop iteration; sharing the index keeps the rows aligned and
# in the same order when the two are concatenated.
true_prob = pd.Series(
    [np.mean(is_churn[pred_churn == prob]) for prob in counts.index],
    index=counts.index,
)

counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ["pred_prob", "count", "true_prob"]
# Bare expression: rendered as the cell result in a notebook/REPL.
counts
# Use the resulting table for early warning: for a predicted probability of,
# say, 30% or 40%, it shows the churn rate actually observed, so the user can
# pick the threshold at which to raise an alert.

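|    | pred_prob | count | true_prob |
|----|-----------|-------|-----------|
| 0  | 0.0       | 1779  | 0.029230  |
| 1  | 0.1       | 696   | 0.020115  |
| 2  | 0.2       | 265   | 0.060377  |
| 3  | 0.3       | 126   | 0.142857  |
| 4  | 0.8       | 91    | 0.978022  |
| 5  | 0.9       | 75    | 0.960000  |
| 6  | 0.4       | 73    | 0.438356  |
| 7  | 0.7       | 65    | 0.953846  |
| 8  | 0.5       | 57    | 0.561404  |
| 9  | 1.0       | 56    | 0.982143  |
| 10 | 0.6       | 50    | 0.820000  |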

0 0