用机器学习对CTR预估建模(一)

来源:互联网 发布:达芬奇视频编辑软件 编辑:程序博客网 时间:2024/06/16 11:06

题目网址:https://www.kaggle.com/c/avazu-ctr-prediction

数据集介绍:

train - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks
are subsampled according to different strategies.
Train.csv 解压后有5.6G,样本个数非常大,一般200m的csv数据(20~30维)用pandas读取成数据帧(dataframe)格式,大概会占用内存1G左右,所以这么大的数据集单机内存一般吃不消。

test - Test set. 1 day of ads for testing your model predictions.
Test.csv解压后有673m,不是很大。

sampleSubmission.csv - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark.

对特征进行筛选和down sampling来降低数据集

# -*- coding: utf-8 -*-
"""
Created on Wed Feb 01 12:51:31 2017

@author: JR.Lu

Down-sample the Avazu CTR training data to a class-balanced subset and keep
only a hand-picked set of feature columns, writing train_small.csv /
test_small.csv for the modeling step.
"""
import pandas as pd
import numpy as np

# Features chosen by inspecting the per-value CTR of each column, e.g.:
#   train_df.groupby(train_df['device_model'])['click'].mean()
# NOTE: the original list repeated 'banner_pos' twice; duplicate removed.
FEATURE_COLUMNS = ['device_type', 'C1', 'C15', 'C16',
                   'banner_pos', 'site_category']


def downsample_balanced(df, label_col='click', random_state=0):
    """Return a class-balanced subset of *df*.

    Keeps every positive row and draws an equally sized *random* sample of
    the negative rows.  (The original code kept the first len(positives)
    negatives, which biases the subset because train.csv is ordered
    chronologically.)

    df           -- input DataFrame containing *label_col*
    label_col    -- binary 0/1 label column name
    random_state -- seed for the negative-class sample, for reproducibility
    """
    positives = df[df[label_col] == 1]
    negatives = df[df[label_col] == 0]
    negatives_sampled = negatives.sample(n=len(positives),
                                         random_state=random_state)
    return pd.concat([positives, negatives_sampled])


def main():
    # nrows caps memory use: the full train.csv is ~5.6 GB uncompressed and
    # will not fit a single machine's RAM as a DataFrame.
    train_df = pd.read_csv('train.csv', nrows=10000000)
    test_df = pd.read_csv('test.csv')

    data_downsampled = downsample_balanced(train_df)

    data_small = data_downsampled[['click'] + FEATURE_COLUMNS]
    test_small = test_df[['id'] + FEATURE_COLUMNS]

    # Shuffle so positive/negative rows are interleaved rather than stacked.
    sampler = np.random.permutation(len(data_small))
    data_small = data_small.take(sampler)

    data_small.to_csv('train_small.csv')
    test_small.to_csv('test_small.csv')


if __name__ == '__main__':
    main()

其次是用简单的特征来测试模型,用网格搜索的方式来进行参数优选

# -*- coding: utf-8 -*-
"""
Created on Wed Feb 01 20:36:46 2017

@author: JR.Lu

Train LR / GBDT / RF baselines on the down-sampled CTR data, plot learning
curves and print evaluation metrics.

Ported to Python 3 and to the modern ``sklearn.model_selection`` API (the
original imported the long-removed ``sklearn.cross_validation`` and
``sklearn.learning_curve`` modules and used Python-2 ``print`` statements).
sklearn/matplotlib are imported lazily inside the functions that need them,
so the numpy-only pieces of this module stay importable without them.
"""
import numpy as np
import pandas as pd


def logloss(act, pred):
    """Competition evaluation metric: binary logarithmic loss.

    act  -- iterable of true 0/1 labels
    pred -- iterable of predicted click *probabilities*.  Feeding hard 0/1
            class labels (as the original script did) makes the loss blow
            up to ~15, which matches the degenerate numbers in the write-up.
    """
    act = np.asarray(act, dtype=float)
    # Clip away from exact 0/1 so the logarithms stay finite.
    epsilon = 1e-15
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return -ll / len(act)


def print_metrics(true_values, predicted_values, predicted_proba=None):
    """Print logloss, accuracy, AUC, confusion matrix and a class report.

    true_values      -- true 0/1 labels
    predicted_values -- hard 0/1 class predictions (accuracy / confusion)
    predicted_proba  -- optional click probabilities; when given they feed
                        logloss and AUC, which is what both metrics expect.
                        Defaults to *predicted_values* for backward
                        compatibility with the original call sites.
    """
    from sklearn import metrics  # lazy import, see module docstring
    if predicted_proba is None:
        predicted_proba = predicted_values
    print("logloss: ", logloss(true_values, predicted_proba))
    print("Accuracy: ", metrics.accuracy_score(true_values, predicted_values))
    print("AUC: ", metrics.roc_auc_score(true_values, predicted_proba))
    # Original had a stray unary '+' before the matrix; removed.
    print("Confusion Matrix: ", metrics.confusion_matrix(true_values,
                                                         predicted_values))
    print(metrics.classification_report(true_values, predicted_values))


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean train/CV score bands versus training-set size.

    Returns the ``matplotlib.pyplot`` module so callers can show/save.
    """
    import matplotlib.pyplot as plt  # lazy import, see module docstring
    from sklearn.model_selection import learning_curve
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # Shaded one-standard-deviation bands around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt


def main():
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  RandomForestClassifier)

    # Down-sampled data produced by the previous script.
    train_df = pd.read_csv('train_small.csv', nrows=100000)
    test_df = pd.read_csv('test_small.csv')

    # NOTE: the original list repeated 'banner_pos'; duplicate removed.
    feature_columns = ['device_type', 'C1', 'C15', 'C16',
                       'banner_pos', 'site_category']
    train_x = train_df[feature_columns]
    test_x = test_df[feature_columns]

    # One-hot encode train+test together so both share one column space.
    x = pd.concat([train_x, test_x])
    temp = x
    for col in feature_columns:
        temp = pd.concat([temp, pd.get_dummies(x[col])], axis=1)
    x_dummies = temp.drop(feature_columns, axis=1)

    X_train = x_dummies[0:len(train_x)]
    Y_train = train_df['click']
    x_train, x_test, y_train, y_test = train_test_split(
        X_train, Y_train, test_size=0.33)

    # Hyper-parameters below were chosen earlier with GridSearchCV
    # (LR: penalty=l1, C=1; GBDT: n_estimators=200, lr=0.1, depth=3;
    #  RF: n_estimators=200, depth=4).
    lr_model = LogisticRegression(penalty='l1', solver='liblinear', C=1)
    gbdt_model = GradientBoostingClassifier(n_estimators=200,
                                            learning_rate=0.1, max_depth=3)
    rf_model = RandomForestClassifier(n_estimators=200, max_depth=4)

    plot_learning_curve(
        lr_model,
        title='LRlearning{penalty=l1,solver=liblinear,cv=3}',
        cv=10, X=x_train, y=y_train)
    plot_learning_curve(
        gbdt_model,
        title='GDBTlearning{n_estimators: 200, learning_rate: 0.1, max_depth: 3}',
        cv=2, X=x_train, y=y_train)
    plot_learning_curve(
        rf_model,
        title='RFlearning{n_estimators: 200,  max_depth: 4}',
        cv=2, X=x_train, y=y_train)

    for name, model in [('LRmodel', lr_model),
                        ('GBDTmodel', gbdt_model),
                        ('RFmodel', rf_model)]:
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        # Probabilities for logloss/AUC — the original passed hard labels,
        # which is why its logloss came out around 15.
        probabilities = model.predict_proba(x_test)[:, 1]
        print(name + " 性能如下:-------")
        print_metrics(y_test, predictions, predicted_proba=probabilities)


if __name__ == '__main__':
    main()

结果大概如下:

LRmodel 性能如下:-------logloss:  14.8549419892Accuracy:  0.569909090909AUC:  0.570339428461Confusion Matrix:  [[11141  5293] [ 8900  7666]]             precision    recall  f1-score   support          0       0.56      0.68      0.61     16434          1       0.59      0.46      0.52     16566avg / total       0.57      0.57      0.56     33000
GBDTmodel 性能如下:-------logloss:  14.7952832304Accuracy:  0.571636363636AUC:  0.572068547036Confusion Matrix:  [[11177  5257] [ 8879  7687]]             precision    recall  f1-score   support          0       0.56      0.68      0.61     16434          1       0.59      0.46      0.52     16566avg / total       0.58      0.57      0.57     33000
RFmodel 性能如下:-------logloss:  15.4713065032Accuracy:  0.552060606061AUC:  0.553565705536Confusion Matrix:  [[15281  1153] [13629  2937]]             precision    recall  f1-score   support          0       0.53      0.93      0.67     16434          1       0.72      0.18      0.28     16566avg / total       0.62      0.55      0.48     33000

插个图看看结果:
这里写图片描述

这里写图片描述

这里写图片描述

0 0