SAS CE - CE5 Model Lift

来源:互联网 发布:中万网络 域名 编辑:程序博客网 时间:2024/05/23 18:59
**********************************************************************************************************************;*** 5.Final Model build and validation on test sample ***;**********************************************************************************************************************;** Master macro for binary dependent variable **;%macro Model_Fin_Logistic(insdnT, insdnV, regvlist, sle=0.05, sls=.05,metric_sdnout=out.CE5_Model_Metric_All,out_txt=&path_output./CE5_Varlist_Model.txt, outds=out.CE5_scored);%** Build model **;ods listing close;ods output  ParameterEstimates=Parm1;ods output  ModelBuildingSummary= out.ce5_SelectionSummary;proc logistic data=&insdnT outmodel=est desc namelen=32;%if &weight ne %then %do; weight &weight; %end;model &dep_var = &regvlist /selection=&method stb lackfit rsq details%if &method = stepwise or &method = forward %then %do; slentry=&sle %end; %if &method = stepwise or &method = backward %then %do; slstay=&sls %end; ;output out=out1 P=pscore;run;ods listing;%** Extract variables **;proc sql noprint; select max(step) into: max1 from Parm1;quit;data parm1; set parm1 (where=(step=&max1)); run;proc sql noprint;select variable into: varlisttmp separated by ' ' from parm1 where variable ^= 'Intercept';select count(*) into: nvars from parm1 where variable ^= 'Intercept';quit;%put &varlisttmp;%* Score validation dataset;proc logistic inmodel=est;   score data=&insdnV out=out2;run;data &outds; set out1 (drop=_level_) out2 (drop=f_&dep_var I_&dep_var p_0 rename=(P_1=pscore)); run;%ctl_Stats_Log(&insdnT, &varlisttmp, &nvars);data &metric_sdnout;length file $4.;set stats;file = 'Dev';run;%ctl_Stats_Log(&insdnV, &varlisttmp, &nvars);data &metric_sdnout;set &metric_sdnout stats (in=a);if a then file = 'Val';run;%* Write out final variable list;data _NULL_;set parm1 (where=(variable ^= 'Intercept')) end=eof;FILE  "&out_txt"  LRECL=256;if _N_=1 then do;PUT ' ';PUT "%"@;PUT "LET Varlist_Final=";end;PUT variable @;if eof then PUT ';';run;%** Clean up;data out.ce5_parameterestimates; set parm1 (drop=step); run;proc datasets library=work nolist;delete parm1 stats est out1 out2;run;quit;%mend;** Master macro for continuous dependent variable **;%macro Model_Fin_Reg(insdnT, insdnV, regvlist, sle=0.05, sls=.05,metric_sdnout=out.CE5_Model_Metric_All,out_txt=&path_output./CE5_Varlist_Model.txt, outds=out.CE5_scored);%** Build model **;ods listing close;ods graphics on;ods output SelParmEst=Parm1;ods output SelectionSummary=out.ce5_SelectionSummary;proc reg data=&insdnT outest=est plots(maxpoints=none);%if &weight ne %then %do; weight &weight; %end;model &dep_var = &regvlist /selection=&method stb details%if &method = stepwise or &method = forward %then %do; slentry=&sle %end; %if &method = stepwise or &method = backward %then %do; slstay=&sls %end; ;output out=out1 P=pscore;run;ods listing;ods graphics off;quit;%** Extract variables **;proc sql noprint; select max(step) into: max1 from Parm1;quit;data parm1; set parm1 (where=(step=&max1)); run;proc sql noprint;select variable into: varlisttmp separated by ' ' from parm1 where variable ^= 'Intercept';select count(*) into: nvars from parm1 where variable ^= 'Intercept';quit;%put &varlisttmp;%* Score validation dataset;proc score data=&insdnV score=est out=out2 type=parms;var &varlisttmp;run;data &outds; set out1 out2 (rename=(model1=pscore)); run;%ctl_Stats_Reg(&insdnT, &varlisttmp, &nvars);data &metric_sdnout;length file $4.;set stats;file = 'Dev';run;%ctl_Stats_Reg(&insdnV, &varlisttmp, &nvars);data &metric_sdnout;set &metric_sdnout stats (in=a);if a then file = 'Val';run;%* Write out final variable list;data _NULL_;set parm1 (where=(variable ^= 'Intercept')) end=eof;FILE  "&out_txt"  LRECL=256;if _N_=1 then do;PUT ' ';PUT "%"@;PUT "LET Varlist_Final=";end;PUT variable @;if eof then PUT ';';run;%** Clean up;data out.ce5_parameterestimates; set parm1 (drop=step); run;proc datasets library=work nolist;delete parm1 stats est out1 out2;run;quit;%mend;** Build gains table to check performance **;%macro gains(inputfile = , score = , varlist= , title_key = );%* Create deciles based on predicted value;%if &weight = %then %do;proc rank data=&inputfile out=tmp1 descending groups=10;var &score;ranks decile;run;%end;%else %do;proc sql noprint;select sum(&weight) into: nobs from &inputfile;quit;proc sort data=&inputfile out=tmp1; by descending &score; run;data tmp1; set tmp1;retain cum;cum + &weight;decile = (floor(cum*10/(&nobs+1)));run;%end;%* Summarize file by decile;proc summary data=tmp1;var &dep_var &score &varlist ;class decile;%if &weight ne %then %do; weight &weight; %end;%if &weight ne %then %do; output out=tmp2 (drop= _freq_) sumwgt=count %end;%else %do; output out=tmp2 (rename= _freq_=count) %end;mean= min(&score)=minscore max(&score)=maxscore %if &binary_dv=Y %then %do; sum(&dep_var)=numresp %end;;run;data out.ce5_profiles_&title_key;set tmp2 (drop=minscore maxscore %if &binary_dv=Y %then %do; numresp %end;);decile = decile + 1;run;%* Model Performance;%if  &binary_dv = Y %then %do;proc sql;select (numresp/count) into: avg from tmp2 where _type_=0;select numresp into: totresp from tmp2 where _type_=0;quit;data out.ce5_gainstable_&title_key (drop=_type_ cumresp cumcount); set tmp2 (drop=&varlist where=(_type_=1));decile = decile + 1;avg_rate = numresp / count;lift = (avg_rate / &avg)*100;retain cumresp cumcount 0;cumresp + numresp;cumcount + count;cum_index = ((cumresp/cumcount) / &avg) * 100;resp_pct = numresp / &totresp;cum_resp_pct = cumresp / &totresp;run;%end;%else %do;proc sql;select &dep_var into: avg from tmp2 where _type_=0;quit;data out.ce5_gainstable_&title_key (drop=_type_); set tmp2 (drop=&varlist where=(_type_=1));decile = decile + 1;lift = (&dep_var / &avg)*100;run;%end;%* Clean up;proc datasets library=work nolist;delete tmp1 tmp2;run;quit;%mend;** Profiling Control **;%macro Fin_Profiling (inds,varlist);%** Initialize profiling dataset **;data out.CE5_profile;set _NULL_;length variable $32.;length label $256.;length category $256.;format count comma8.;format percent percent8.2;%if %upcase(&binary_dv) = Y %then %do;format Average_DV percent8.2;%end;%else %do;format Average_DV 12.2;%end;format index 8.0;length star $8.;run;%** Get overall average and count **;proc sql noprint;select count(*) into : nobs from &inds;select mean(&dep_var) into : overall_avg from &inds;quit;%** Get variables and unique counts **;proc sql;create table cnts asselect    %let i=1;%let v=%scan(&varlist, &i,' ');%do %while (&v^=);count(distinct(&v)) as &v,%let i=%eval(&i+1);%let v=%scan(&varlist, &i,' ');%end;"dummy" as dummyfrom &inds;quit;proc transpose data=cnts (drop=dummy) out=cnts (rename=col1=uniq) name=variable; run;%** Create macro variables to control processing **;data _null_; set cnts end=eof;if eof then call symputx("VARCNT",_N_);call symputx("IV"|| trim(left(put(_N_,4.)))  ,variable);call symputx("uniq"|| trim(left(put(_N_,4.)))  ,uniq);run;%** Loop through for each variable **;%do _I_ = 1 %to &varcnt;%if &&uniq&_I_ <= &fin_num_category %then %do;%fprof1(&inds,&&iv&_I_);%end;%else %do;%fprof2(&inds,&&iv&_I_);%end;%fprof3(&&iv&_I_);%end;proc datasets nolist;delete cnts;run;%mend;** Profiling macro 1 **;%macro fprof1(insdn,var);proc summary data=&insdn nway missing;var &dep_var;class &var;output out=prof (drop=_type_ rename=_freq_=xcount) mean=xmean;run;data prof;set prof;length xcategory $256.;xcategory = trim(left(put(&var,best8.)));run;%mend;** Profiling macro 2 **;%macro fprof2(insdn, var);%if %upcase(&fin_equal_dist) = Y %then %do;proc means data=&insdn p1 p99 NOPRINT;      var &var;      output out=tmp p1=var_p1 p99=var_p99  / noinherit;    run;data _NULL_;set tmp;range = (var_p99 - var_p1)/&fin_num_category;call symputx('cut_lo',var_p1);call symputx('cut_hi',var_p99);call symputx('range',range);run;data tmp;set &insdn (keep=&dep_var &var);if &var < &cut_lo then bin = 1;else if &var >= &cut_hi then bin = input("&fin_num_category",best10.);else do;do k = 1 to &fin_num_category;if &var>= &cut_lo+(k-1)*&range and &var< &cut_lo+k*&range thenbin=k;end;end;run;%end;%else %do;proc rank data=&insdn (keep=&dep_var &var) out=tmp ties=High group=&fin_num_category;var &var;ranks bin;run;%end;proc summary data=tmp nway missing;var &var &dep_var;class bin;output out=prof (drop=_type_ rename=_freq_=xcount) min(&var)=lo max(&var)=hi mean(&dep_var)=xmean;run;data prof;set prof end=eof;length xcategory $256.;if _N_ = 1 then xcategory = "Low to " || trim(left(put(hi,best8.)));else if eof then xcategory = trim(left(put(lo,best8.))) || " to High";else xcategory = trim(left(put(lo,best8.))) || " to " || trim(left(put(hi,best8.)));run;proc datasets nolist;delete tmp;run;%mend;** Profiling macro 3 **;%macro fprof3(var);proc sql;create table prof2 asselect a.variable, c.label, b.xcategory, b.xcount, b.xmeanfrom cnts a, prof b, out.ce2_vars cwhere a.variable = "&var"  and c.variable = "&var";quit;data prof3 (drop=xcount xmean xcategory);set prof2 end=eof;length category $256.;length star $8.;if _N_ = 1 then do;category = "Overall";Average_DV = &overall_avg;Count = &nobs;Percent = 1;index = 100;output;end;category = xcategory;count = xcount;percent = xcount / &nobs;Average_DV = xmean;index = (Average_DV / &overall_avg)*100;if index >= 110 then star = '* (+)';else if index > 100 then star = '  (+)';else if index <= 90 then star = '* (-)';else if index <= 100 then star = '  (-)';else star = '  (0)';output;run;data out.CE5_profile;set out.CE5_profile prof3;run;proc datasets nolist;delete prof prof2 prof3;run;%mend;** Master macro for Final Model build and validation **;%macro CE_Model_Lift(insdn, varlist);data mod val;set &insdn (keep=&Dep_var mod_val_test &varlist &weight);if mod_val_test=3 then output val;else output mod;run;%if %upcase(&Binary_dv) = Y %then %do;%Model_Fin_Logistic(mod, val, &varlist,sle=&fin_alpha,sls=&fin_alpha,metric_sdnout=out.CE5_Model_Metric_All,out_txt=&path_output./CE5_Varlist_Model.txt,outds=out.CE5_scored);%end;%else %if %upcase(&Binary_dv) ^= Y %then %do;%Model_Fin_Reg(mod, val, &varlist,sle=&fin_alpha,sls=&fin_alpha,metric_sdnout=out.CE5_Model_Metric_All,out_txt=&path_output./CE5_Varlist_Model.txt,outds=out.CE5_scored);%end;%* Final variable list;proc sql noprint;select variable into: tmplst separated by ' ' from out.ce5_parameterestimates where variable ^= 'Intercept';quit;%* Performance;%gains(inputfile = out.CE5_scored (where=(mod_val_test ^= 3)), score = pscore, varlist=&tmplst, title_key = train);%gains(inputfile = out.CE5_scored (where=(mod_val_test = 3)), score = pscore, varlist=&tmplst, title_key = test);%* Profile Variables;%Fin_Profiling(mod,&tmplst);%* Report;ods listing close;ods Tagsets.ExcelxP body="&Path_output.CE5_Model_report.xls" style=sasweb;%* Selection Summary;ods tagsets.excelxp options(sheet_name="Selection");proc print data=out.CE5_SelectionSummary noobs; run;%* Parameters;ods tagsets.excelxp options(sheet_name="Parameters");proc print data=out.CE5_ParameterEstimates noobs; run;%* Statistics;%if %upcase(&Binary_dv) = Y %then %do;ods tagsets.excelxp options(sheet_name="Statistics");proc print data=out.CE5_Model_Metric_All noobs;var file Variable Label;var Estimate  / style={tagattr='format:0.000000'};var StdEst StdErr WaldChiSq ProbChiSq VIF RelImp  / style={tagattr='format:0.0000'};var AIC SC LogL2  / style={tagattr='format:0.00'};var Rsquare SomersD Gamma TauA c Concord Discon LackFit / style={tagattr='format:0.0000'};var Lift_Index / style={tagattr='format:0.00'};var infv ks  / style={tagattr='format:0.0000'};run;%end;%else %do;ods tagsets.excelxp options(sheet_name="Statistics");proc print data=out.CE5_Model_Metric_All noobs;var file Variable Label;var Estimate  / style={tagattr='format:0.000000'};var StdEst StdErr tValue Probt VIF RelImp  / style={tagattr='format:0.0000'};var AIC  SBC JP  / style={tagattr='format:0.00'};var Rsquare AdjRsq RMSE CoeffVar PC / style={tagattr='format:0.0000'};var Lift_Index / style={tagattr='format:0.00'};var gini infv ks  / style={tagattr='format:0.0000'};run;%end;%* Performance;ods tagsets.excelxp options(sheet_interval="table");ods tagsets.excelxp options(sheet_name="Performance" sheet_interval="none" embedded_titles="yes" convert_percentages="yes");proc print data=out.CE5_GainsTable_Train noobs;title 'Train';var decile / style={tagattr='format:0'};var Count  / style={tagattr='format:#,##0'};var &dep_var pscore / style={tagattr='format:0.0000'};var minscore maxscore / style={tagattr='format:0.0000'};%if %upcase(&Binary_dv) = Y %then %do;var avg_rate / style={tagattr='format:0.00%'};var lift cum_index / style={tagattr='format:0'};var resp_pct cum_resp_pct / style={tagattr='format:0.00%'};%end;%else %do;var lift / style={tagattr='format:0'};%end;run;proc print data=out.CE5_GainsTable_Test noobs;title 'Test';var decile / style={tagattr='format:0'};var Count  / style={tagattr='format:#,##0.'};var &dep_var pscore / style={tagattr='format:0.0000'};var minscore maxscore / style={tagattr='format:0.0000'};%if %upcase(&Binary_dv) = Y %then %do;var avg_rate / style={tagattr='format:0.00%'};var lift cum_index / style={tagattr='format:0'};var resp_pct cum_resp_pct / style={tagattr='format:0.00%'};%end;%else %do;var lift / style={tagattr='format:0'};%end;run;title ' ';%* Variable Validation;ods tagsets.excelxp options(sheet_interval="table");ods tagsets.excelxp options(sheet_name="Variable Validation" sheet_interval="none" embedded_titles="yes");proc print data=out.CE5_Profiles_Train noobs;title 'Train';var decile _type_  / style={tagattr='format:0'};var Count  / style={tagattr='format:#,##0'};var &dep_var pscore &tmplst / style={tagattr='format:0.0000'};run;proc print data=out.CE5_Profiles_Test noobs;title 'Test';var decile _type_  / style={tagattr='format:0'};var Count  / style={tagattr='format:#,##0'};var &dep_var pscore &tmplst / style={tagattr='format:0.0000'};run;title ' ';%* Correlations;proc corr data = out.ce5_scored outp=out.CE5_corr noprint; var &dep_var &tmplst; run;data out.CE5_corr (drop=_type_) ; set out.CE5_corr (rename=(_name_=variable) where=(_type_ = 'CORR')); run;ods tagsets.excelxp options(sheet_interval="table");ods tagsets.excelxp options(sheet_name="Correlations");proc print data=out.CE5_corr noobs;var variable;var &dep_var &tmplst / style={tagattr='format:0.0000'};run;%* Variable Profiles;proc sql noprint;select max(length(variable)) into: l1 from out.CE5_profile;select max(length(label)) into: l2 from out.CE5_profile;select max(length(category)) into: l3 from out.CE5_profile;quit;ods tagsets.excelxp options(sheet_name="Profiles" embedded_titles="yes" convert_percentages="yes"absolute_column_width="&l1,&l2,&l3,8,8,9,8,6" Frozen_Headers='Yes' Frozen_RowHeaders='1');proc print data=out.CE5_profile noobs;var variable label category;var count / style={tagattr='format:#,##0'};var percent Average_DV index star;run;ods Tagsets.ExcelxP close;ods listing;%* Write out scoring code;data _Null_;file "&path_output./CE5_Scoring_equation.txt"; set out.ce5_parameterestimates (keep=variable estimate) end=eof;if variable = 'Intercept' then do;put " " '0d'x;put "*****  Model  *****;" '0d'x;put " " '0d'x;%if &binary_dv = Y %then %do;put "Logit = 1 * ("  estimate +(-1) ") +" '0d'x;%end;%else %do;put "Score = 1 * ("  estimate +(-1) ") +" '0d'x;%end;end;else do;if eof then put variable "* ("  estimate +(-1) ") ;" '0d'x;else put variable "* ("  estimate +(-1) ") +" '0d'x;end;if eof then do;put " " '0d'x;%if &binary_dv = Y %then %do;put "Score=1/(1+exp(-Logit));" '0d'x;%end;end;run;proc datasets library=work nolist;delete mod val;run;quit;%mend CE_Model_Lift;**********************************************************************************************************************;

0 0
原创粉丝点击