SAS CE - CE4 Model Val

来源:互联网 发布:中万网络 域名 编辑:程序博客网 时间:2024/06/05 08:24
**********************************************************************************************************************;
*** 4.Model selection and tuning ***;
**********************************************************************************************************************;

/*
  Global macro variables read by the macros in this file (they are referenced
  but never assigned here, so they must be set by the calling program):
    dep_var, weight, excludelist, startlist, includelist, Binary_dv, sel_alpha,
    criteria, threshold, MinImp, SQL_join, graph_plot, path_output.
  The libref OUT must also be assigned; result tables are written there.
*/

/*
  Model_Val_Logistic - master macro for a binary dependent variable.

  Runs a forward and a backward PROC LOGISTIC selection on insdnT, takes the
  union of the variables left in the last step of each selection, and writes
  model statistics for insdnT (file='Dev') and insdnV (file='Val') to
  metric_sdnout.

  Parameters:
    insdnT         data set used to run the selections (statistics tagged Dev)
    insdnV         second data set the selected variables are re-fitted on (tagged Val)
    regvlist       candidate independent variables (blank separated)
    sle, sls       SLENTRY / SLSTAY significance levels
    metric_sdnout  output data set for the statistics

  Leaves WORK.PARM1 / WORK.PARM2 (final-step parameter estimates) behind for
  the caller. The global lists startlist and includelist are rewritten with
  the excluded variables removed.
*/
%macro Model_Val_Logistic(insdnT, insdnV, regvlist, sle=0.05, sls=.05,metric_sdnout=out.CE4_metric_sdnout);

  /* --- Set up independent variable lists --- */
  %if &excludelist = %then %let nexcludes = 0;
  %else %let nexcludes = %sysfunc(countw(&excludelist));  /* get number of variables in exclude list */
  %if &nexcludes ^= 0 %then %do;
    %let regvlist=%StringMinus(&regvlist, &excludelist);  /* Remove excluded variables from variable list */
  %end;

  %if &startlist = %then %let nstarts = 0;
  %else %let nstarts = %sysfunc(countw(&startlist));  /* get number of variables in start list */
  %if &nstarts ^= 0 %then %do;
    /* Make sure there are no excluded variables in the start list */
    %if &nexcludes ^= 0 %then %do;
      %let startlist=%StringMinus(&startlist, &excludelist);
    %end;
    /* re-get number of variables in start list; the list may now be empty */
    %if %length(&startlist) = 0 %then %let nstarts = 0;
    %else %let nstarts = %sysfunc(countw(&startlist));
    %if &nstarts ^= 0 %then %do;
      %let regvlist=&startlist %StringMinus(&regvlist, &startlist);
    %end;
  %end;

  %if &includelist = %then %let nincludes = 0;
  %else %let nincludes = %sysfunc(countw(&includelist));  /* get number of variables in include list */
  %if &nincludes ^= 0 %then %do;
    /* Make sure there are no excluded variables in the include list */
    %if &nexcludes ^= 0 %then %do;
      %let includelist=%StringMinus(&includelist, &excludelist);
    %end;
    /* re-get number of variables in include list; the list may now be empty */
    %if %length(&includelist) = 0 %then %let nincludes = 0;
    %else %let nincludes = %sysfunc(countw(&includelist));
    %if &nincludes ^= 0 %then %do;
      %let regvlist=&includelist %StringMinus(&regvlist, &includelist);
    %end;
  %end;

  ods listing close;

  /* --- Build forward model --- */
  ods output  ParameterEstimates=Parm1;
  proc logistic data=&insdnT desc namelen=32;
    model &dep_var = &regvlist /selection=forward lackfit rsq details slentry=&sle
      %if &nincludes ne 0 %then %do; include=&nincludes %end;
      %if &nstarts ne 0 %then %do; start=&nstarts %end;
    ;
  run;

  /* --- Build backward model --- */
  ods output  ParameterEstimates=Parm2;
  proc logistic data=&insdnT desc namelen=32;
    model &dep_var = &regvlist /selection=backward lackfit rsq details slstay=&sls
      %if &nincludes ne 0 %then %do; include=&nincludes %end;
      %if &nstarts ne 0 %then %do; start=&nstarts %end;
    ;
  run;
  ods listing;

  /* --- Extract variables: keep only the last selection step of each run --- */
  proc sql noprint;
    select max(step) into: max1 from Parm1;
    select max(step) into: max2 from Parm2;
  quit;
  data parm1; set parm1 (where=(step=&max1)); run;
  data parm2; set parm2 (where=(step=&max2)); run;

  proc sql noprint;
    create table varlisttmp as
      select variable from parm1
      union
      select variable from parm2;
    select variable into: varlisttmp separated by ' ' from varlisttmp where variable ^= 'Intercept';
    select count(*) into: nvars from varlisttmp where variable ^= 'Intercept';
  quit;
  %put &varlisttmp;

  /* --- Statistics on the first data set --- */
  %ctl_Stats_Log(&insdnT, &varlisttmp, &nvars);
  data &metric_sdnout;
    length file $4.;
    set stats;
    file = 'Dev';
  run;

  /* --- Statistics on the second data set, appended below the Dev rows --- */
  %ctl_Stats_Log(&insdnV, &varlisttmp, &nvars);
  data &metric_sdnout;
    set &metric_sdnout stats (in=a);
    if a then file = 'Val';
  run;

  /* --- Clean up --- */
  proc datasets library=work nolist;
    delete varlisttmp globalfit full vars tmp1 tmp2 parms vif stats;
  run;
  quit;
%mend;

/*
  Model_Val_Reg - master macro for a continuous dependent variable.

  Same flow as Model_Val_Logistic but the selections are run with PROC REG
  and the statistics come from ctl_Stats_Reg. Parameters and outputs are the
  same as for Model_Val_Logistic.
*/
%macro Model_Val_Reg(insdnT, insdnV, regvlist, sle=0.05, sls=.05,metric_sdnout=out.CE4_metric_sdnout);

  /* --- Set up independent variable lists --- */
  %if &excludelist = %then %let nexcludes = 0;
  %else %let nexcludes = %sysfunc(countw(&excludelist));  /* get number of variables in exclude list */
  %if &nexcludes ^= 0 %then %do;
    %let regvlist=%StringMinus(&regvlist, &excludelist);  /* Remove excluded variables from variable list */
  %end;

  %if &startlist = %then %let nstarts = 0;
  %else %let nstarts = %sysfunc(countw(&startlist));  /* get number of variables in start list */
  %if &nstarts ^= 0 %then %do;
    /* Make sure there are no excluded variables in the start list */
    %if &nexcludes ^= 0 %then %do;
      %let startlist=%StringMinus(&startlist, &excludelist);
    %end;
    /* re-get number of variables in start list; the list may now be empty */
    %if %length(&startlist) = 0 %then %let nstarts = 0;
    %else %let nstarts = %sysfunc(countw(&startlist));
    %if &nstarts ^= 0 %then %do;
      %let regvlist=&startlist %StringMinus(&regvlist, &startlist);
    %end;
  %end;

  %if &includelist = %then %let nincludes = 0;
  %else %let nincludes = %sysfunc(countw(&includelist));  /* get number of variables in include list */
  %if &nincludes ^= 0 %then %do;
    /* Make sure there are no excluded variables in the include list */
    %if &nexcludes ^= 0 %then %do;
      %let includelist=%StringMinus(&includelist, &excludelist);
    %end;
    /* re-get number of variables in include list; the list may now be empty */
    %if %length(&includelist) = 0 %then %let nincludes = 0;
    %else %let nincludes = %sysfunc(countw(&includelist));
    %if &nincludes ^= 0 %then %do;
      %let regvlist=&includelist %StringMinus(&regvlist, &includelist);
    %end;
  %end;

  /* --- Build forward model --- */
  ods listing close;
  ods graphics on;
  ods output SelParmEst=Parm1;
  proc reg data=&insdnT plots(maxpoints=none);
    model &dep_var = &regvlist /selection=forward details slentry=&sle
      %if &nincludes ne 0 %then %do; include=&nincludes %end;
      %if &nstarts ne 0 %then %do; start=&nstarts %end;
    ;
  run;
  ods listing;
  ods graphics off;
  quit;

  /* --- Build backward model --- */
  ods listing close;
  ods graphics on;
  ods output  SelParmEst=Parm2;
  proc reg data=&insdnT plots(maxpoints=none);
    model &dep_var = &regvlist /selection=backward details slstay=&sls
      %if &nincludes ne 0 %then %do; include=&nincludes %end;
      %if &nstarts ne 0 %then %do; start=&nstarts %end;
    ;
  run;
  ods listing;
  ods graphics off;
  quit;

  /* --- Extract variables: keep only the last selection step of each run --- */
  proc sql noprint;
    select max(step) into: max1 from Parm1;
    select max(step) into: max2 from Parm2;
  quit;
  data parm1; set parm1 (where=(step=&max1)); run;
  data parm2; set parm2 (where=(step=&max2)); run;

  proc sql noprint;
    create table varlisttmp as
      select variable from parm1
      union
      select variable from parm2;
    select variable into: varlisttmp separated by ' ' from varlisttmp where variable ^= 'Intercept';
    select count(*) into: nvars from varlisttmp where variable ^= 'Intercept';
  quit;
  %put &varlisttmp;

  /* --- Get dev model statistics --- */
  %ctl_Stats_Reg(&insdnT, &varlisttmp, &nvars );
  data &metric_sdnout;
    length file $4.;
    set stats;
    file = 'Dev';
  run;

  /* --- Get val model statistics --- */
  %ctl_Stats_Reg(&insdnV, &varlisttmp, &nvars);
  data &metric_sdnout;
    set &metric_sdnout stats (in=a);
    if a then file = 'Val';
  run;

  /* --- Clean up --- */
  proc datasets library=work nolist;
    delete varlisttmp globalfit full vars tmp1 tmp2 vif stats;
  run;
  quit;
%mend;

/*
  StringMinus - function-style macro that returns string1 with every word
  that also occurs in string2 removed (case-insensitive whole-word match,
  blank-separated). An empty string2 returns the words of string1 unchanged
  instead of raising a %SYSFUNC argument error.
*/
%macro StringMinus(string1, string2);
  %local count word StringOut;
  %let count=1;
  %let word=%scan(&string1,&count,%str( ));
  %do %while(&word ne);
    %let count=%eval(&count+1);
    %if %length(&string2) = 0 %then %do;
      /* nothing to subtract: keep every word */
      %let stringout=&stringout &word;
    %end;
    %else %if %sysfunc(indexw(%upcase(&string2), %upcase(&word)))=0 %then %do;
      %let stringout=&stringout &word;
    %end;
    %let word=%scan(&string1,&count,%str(  ));
  %end;
  &StringOut
%mend;

/*
  Remember_nobs_ngoods_nbads - stores record counts of insdn in the global
  macro variables vnobs, vnbads and vngoods. Rows are counted when the global
  weight is blank, otherwise the weight variable is summed.

  Parameters:
    insdn        input data set
    bads, goods  dep_var values counted as bad / good (used in an IN list)
*/
%macro Remember_nobs_ngoods_nbads(insdn,bads=1,goods=0);
  %global vnobs;
  %global vngoods;
  %global vnbads;
  %if &weight= %then %do;
    proc sql noprint;
      select count(*) into: vnobs from &insdn;
      select count(*) into: vnbads from &insdn where &dep_var in (&bads);
      select count(*) into: vngoods from &insdn where &dep_var in (&goods);
    quit;
  %end;
  %else %do;
    proc sql noprint;
      select sum(&weight) into: vnobs from &insdn;
      select sum(&weight) into: vnbads from &insdn where &dep_var in (&bads);
      select sum(&weight) into: vngoods from &insdn where &dep_var in (&goods);
    quit;
  %end;
%mend;

/*
  ctl_Stats_Log - control program for binary model statistics.

  Fits the full model, the model with the NOINT option, and one model per
  variable with that variable left out. Produces WORK.STATS with the full
  model metrics (sord=1) and, for every other fit, the metric differences
  against the full model (sord=2 for the Intercept row, sord=3 for variables),
  joined to the full model parameter estimates and VIF.

  Parameters:
    insdn       data set to fit on
    varlisttmp  blank-separated list of model variables
    nvars       number of variables in varlisttmp
*/
%macro ctl_Stats_Log(insdn, varlisttmp, nvars);
  %local I var tmplist;

  /* --- Get counts for this file --- */
  %Remember_nobs_ngoods_nbads(&insdn);

  /* --- Get full model statistics --- */
  %Stats_Log(&insdn, &varlisttmp, Full Model, int=N, vif=Y);
  data full; set globalfit; run;

  /* --- Get model statistics without intercept --- */
  %Stats_Log(&insdn, &varlisttmp, Intercept, int=Y, vif=N);
  data vars; set globalfit; run;

  /* --- Get model statistics without specific variables --- */
  %do I = 1 %to &nvars;
    %let var = %scan(&varlisttmp,&I);
    %let tmplist=%StringMinus(&varlisttmp, &var);  /* Remove single variable */
    %Stats_Log(&insdn, &tmplist, &var, int=N, vif=N);
    data vars; set vars globalfit; run;
  %end;

  /* --- Create macro variables for full model metrics --- */
  data _null_;
    set full;
    call symputx('AIC',AIC);
    call symputx('SC',SC);
    call symputx('LogL2',LogL2);
    call symputx('Rsquare',Rsquare);
    call symputx('SomersD',SomersD);
    call symputx('Gamma',Gamma);
    call symputx('TauA',TauA);
    call symputx('c',c);
    call symputx('Concord',Concord);
    call symputx('Discon',Discon);
    call symputx('LackFit',LackFit);
    call symputx('Lift',Lift_index);
    call symputx('infv',infv);
    call symputx('ks',ks);
  run;

  /* Denominator for relative importance: sum of absolute standardized estimates */
  proc sql noprint;
    select sum(abs(standardizedest)) into: cum from parms where variable ^= 'Intercept';
  quit;

  /* --- Subtract full model metrics from sub-model metrics --- */
  /* This shows the impact of each variable */
  data tmp1;
    set vars;
    AIC = AIC - &AIC;
    SC = SC - &SC;
    LogL2 = LogL2 - &LogL2;
    Rsquare = Rsquare - &Rsquare;
    SomersD = SomersD - &SomersD;
    Gamma = Gamma - &Gamma;
    TauA = TauA - &TauA;
    c = c - &c;
    Concord = Concord - &Concord;
    Discon = Discon - &Discon;
    LackFit = LackFit - &LackFit;
    lift_Index = Lift_Index - &Lift;
    infv = infv - &infv;
    ks = ks - &ks;
  run;

  /* --- Combine datasets --- */
  data tmp2;
    set full (in=a) tmp1;
    if a then sord = 1;
    else if variable = 'Intercept' then sord = 2;
    else sord = 3;
  run;

  proc sql;
    create table stats as
      select a.Variable, b.Label, b.Estimate, b.StandardizedEst as StdEst, b.StdErr, b.WaldChiSq, b.ProbChiSq,
             c.VarianceInflation as VIF, abs(b.StandardizedEst)/&cum as RelImp,
             a.AIC, a.SC, a.LogL2, a.Rsquare, a.SomersD, a.Gamma, a.TauA, a.c, a.Concord, a.Discon, a.LackFit,
             a.Lift_Index, a.infv, a.ks, a.sord
      from tmp2 a
      left join parms b on a.variable = b.variable
      left join vif c on a.variable = c.variable
      order by a.sord, RelImp desc;
  quit;

  /* --- Clean up --- */
  proc datasets library=work nolist;
    delete globalfit full vars tmp1 tmp2 parms vif;
  run;
  quit;
%mend;

/*
  Stats_Log - fits one PROC LOGISTIC model and collects its statistics in the
  one-row data set WORK.GLOBALFIT (fit criteria, R-square, association
  measures, lack-of-fit p-value, lift index, information value, K-S).

  Parameters:
    insdn        data set to fit on
    varlist      model variables
    var          label written to the VARIABLE column of GLOBALFIT
    int          Y adds the NOINT option to the model (and reads the
                 WithCovariates fit column); N fits with an intercept
    vif          Y also saves the parameter estimates (WORK.PARMS) and runs a
                 weighted PROC REG to obtain variance inflation (WORK.VIF)
    bads, goods  dep_var values treated as bad / good

  Relies on the global macro variables vnobs, vnbads and vngoods set by
  Remember_nobs_ngoods_nbads.
*/
%macro Stats_Log(insdn, varlist, var, int=N, vif=N ,bads=1,goods=0);

  /* --- Extract standard model statistics --- */
  ods listing close;
  ods output  FitStatistics= FitStatistics;
  ods output  RSquare= RSquare;
  ods output  Association= Association;
  ods output  LackFitChiSq= LackFitChiSq;
  %if &vif=Y %then %do;
    ods output  ParameterEstimates=Parms;
  %end;
  proc logistic data=&insdn desc namelen=32;
    %if &weight ne %then %do; weight &weight; %end;
    model &dep_var = &varlist / stb rsq lackfit parmlabel
      %if &int=Y %then %do; noint %end;
    ;
    output out=scoretmp p=pred;
  run;
  ods listing;

  /* One row: each ODS table is filtered to the single statistic wanted and merged side by side */
  data globalfit (drop=Criterion label1 label2);
    length variable $32.;
    variable = "&var";
    merge
      %if &int=N %then %do;
        FitStatistics (where=(Criterion='AIC') rename=(InterceptAndCovariates=AIC) keep=Criterion InterceptAndCovariates)
        FitStatistics (where=(Criterion='SC') rename=(InterceptAndCovariates=SC) keep=Criterion InterceptAndCovariates)
        FitStatistics (where=(Criterion='-2 Log L') rename=(InterceptAndCovariates=LogL2) keep=Criterion InterceptAndCovariates)
      %end;
      %else %do;
        FitStatistics (where=(Criterion='AIC') rename=(WithCovariates=AIC) keep=Criterion WithCovariates)
        FitStatistics (where=(Criterion='SC') rename=(WithCovariates=SC) keep=Criterion WithCovariates)
        FitStatistics (where=(Criterion='-2 Log L') rename=(WithCovariates=LogL2) keep=Criterion WithCovariates)
      %end;
      RSquare (rename=(nValue1=Rsquare) keep=nValue1)
      Association (where=(label2="Somers' D") rename=(nvalue2=SomersD) keep=label2 nvalue2)
      Association (where=(label2='Gamma') rename=(nvalue2=Gamma) keep=label2 nvalue2)
      Association (where=(label2='Tau-a') rename=(nvalue2=TauA) keep=label2 nvalue2)
      Association (where=(label2='c') rename=(nvalue2=c) keep=label2 nvalue2)
      Association (where=(label1='Percent Concordant') rename=(nvalue1=Concord) keep=label1 nvalue1)
      Association (where=(label1='Percent Discordant') rename=(nvalue1=Discon) keep=label1 nvalue1)
      LackFitChiSq (rename=(ProbChiSq=LackFit) keep=ProbChiSq);
    label AIC=" ";
    label SC=" ";
    label LogL2=" ";
    label LackFit=" ";
  run;

  /* --- Assign groups --- */
  data tmp0;
    set scoretmp (keep=&dep_var pred &weight);
    length group $4.;
    bad = (&dep_var = &bads);
    good = (&dep_var = &goods);
    if &dep_var = &bads then group = 'bad';
    else if &dep_var = &goods then group = 'good';
    /* rounded weight, used as the FREQ variable of PROC NPAR1WAY below */
    %if &weight ^=  %then %do; xxx=round(&weight); %end;
  run;

  /* --- Create deciles based on predicted value --- */
  %if &weight = %then %do;
    proc rank data=tmp0 out=tmp1 groups=10;
      var pred;
      ranks rank;
    run;
  %end;
  %else %do;
    proc sort data=tmp0 out=tmp1; by pred; run;
    data tmp1;
      set tmp1;
      retain cum;
      cum + &weight;
      rank = (floor(cum*10/(&vnobs+1)));
    run;
  %end;

  /* --- Summarize file by rank --- */
  proc summary data=tmp1;
    var &dep_var pred good bad;
    class rank;
    %if &weight ne %then %do; weight &weight; %end;
    %if &weight ne %then %do; output out=tmp2 (drop= _freq_) sumwgt=count %end;
    %else %do; output out=tmp2 (rename= _freq_=count) %end;
      mean(&dep_var)=mean_dv mean(pred)=mean_pred sum(bad)=badcnt sum(good)=goodcnt;
  run;

  /* Overall mean of the dependent variable (the _type_=0 row) */
  data _null_;
    set tmp2 (where=(_type_=0));
    call symputx('Norm',mean_dv);
  run;

  /* --- Perform Kolmogorov-Smirnov test --- */
  proc npar1way data=tmp1 edf noprint;
    class &dep_var;
    var pred;
    %if &weight ne %then %do; freq xxx; %end;
    output edf out=kolsmir;
  run;
  proc sql noprint;
    select round(_d_,.000001) into: kstmp from kolsmir;
  quit;

  /* --- Create custom statistics (only the last rank row is output) --- */
  data tmp3 (keep=Lift_Index infv ks);
    set tmp2 (where=(_type_=1)) end=eof;
    Lift_Index = mean_dv / &norm * 100;
    retain infv cumbad cumgood;
    badratio=badcnt*(1.0/&vnbads);
    goodratio=goodcnt*(1.0/&vngoods);
    if _n_=1 then do;
      infv=0.0;
      cumgood=0.0;
      cumbad=0.0;
    end;
    /* Information value */
    if badratio+goodratio>=0.000000001 then do;
      if min(badratio,goodratio)<=0.000000001 then do;
        infv=infv+count*max(badratio,goodratio)/&vnobs;
      end;
      else do;
        infv=infv+(badratio-goodratio)*log(badratio/goodratio);
      end;
    end;
    cumbad = cumbad + badratio;
    cumgood = cumgood + goodratio;
    /* K-S value */
    ks=&kstmp;
    if eof;
  run;
  data globalfit; merge globalfit tmp3; run;

  /* --- Variance inflation factor --- */
  %if &vif=Y %then %do;
    data scoretmp;
      set scoretmp;
      weight=pred*(1-pred);
    run;
    ods listing close;
    ods graphics on;
    ods output ParameterEstimates=VIF;
    proc reg data=scoretmp plots=none;
      weight weight;
      model &dep_var = &varlist / vif;
    run;
    ods graphics off;
    ods listing;
  %end;

  /* --- Clean up --- */
  proc datasets library=work nolist;
    delete FitStatistics RSquare Association LackFitChiSq scoretmp tmp0 tmp1 tmp2 tmp3 kolsmir;
  run;
  quit;
%mend;

/*
  ctl_Stats_Reg - control program for continuous model statistics.

  Same structure as ctl_Stats_Log, built on Stats_Reg. Produces WORK.STATS.
  Every fit uses the insdn parameter of this macro (the original code passed
  the caller's insdnV here, so the Dev statistics were computed on the
  validation data).

  Parameters:
    insdn       data set to fit on
    varlisttmp  blank-separated list of model variables
    nvars       number of variables in varlisttmp
*/
%macro ctl_Stats_Reg(insdn, varlisttmp, nvars);
  %local I var tmplist;
  %global vnobs;

  %if &weight= %then %do;
    proc sql noprint;
      select count(*) into: vnobs from &insdn;
    quit;
  %end;
  %else %do;
    proc sql noprint;
      select sum(&weight) into: vnobs from &insdn;
    quit;
  %end;

  /* --- Get full model statistics --- */
  %Stats_Reg(&insdn, &varlisttmp, Full Model, int=N, vif=Y);
  data full; set globalfit; run;

  /* --- Get model statistics without intercept --- */
  %Stats_Reg(&insdn, &varlisttmp, Intercept, int=Y, vif=N);
  data vars; set globalfit; run;

  /* --- Get model statistics without specific variables --- */
  %do I = 1 %to &nvars;
    %let var = %scan(&varlisttmp,&I);
    %let tmplist=%StringMinus(&varlisttmp, &var);  /* Remove single variable */
    %Stats_Reg(&insdn, &tmplist, &var, int=N, vif=N);
    data vars; set vars globalfit; run;
  %end;

  /* --- Create macro variables for full model metrics --- */
  data _null_;
    set full;
    call symputx('Rsquare',Rsquare);
    call symputx('AdjRsq',AdjRsq);
    call symputx('RMSE',RMSE);
    call symputx('CoeffVar',CoeffVar);
    call symputx('AIC',AIC);
    call symputx('SBC',SBC);
    call symputx('PC',PC);
    call symputx('JP',JP);
    call symputx('Lift_Index',Lift_Index);
    call symputx('gini',gini);
    call symputx('infv',infv);
    call symputx('ks',ks);
  run;

  /* Denominator for relative importance: sum of absolute standardized estimates */
  proc sql noprint;
    select sum(abs(standardizedest)) into: cum from vif where variable ^= 'Intercept';
  quit;

  /* --- Subtract full model metrics from sub-model metrics --- */
  /* This shows the impact of each variable */
  data tmp1;
    set vars;
    Rsquare = Rsquare - &Rsquare;
    AdjRsq = AdjRsq - &AdjRsq;
    RMSE = RMSE - &RMSE;
    CoeffVar = CoeffVar - &CoeffVar;
    AIC = AIC - &AIC;
    SBC = SBC - &SBC;
    PC = PC - &PC;
    JP = JP - &JP;
    Lift_Index = Lift_Index - &Lift_Index;
    gini = gini - &gini;
    infv = infv - &infv;
    ks = ks - &ks;
  run;

  /* --- Combine datasets --- */
  data tmp2;
    set full (in=a) tmp1;
    if a then sord = 1;
    else if variable = 'Intercept' then sord = 2;
    else sord = 3;
  run;

  proc sql;
    create table stats as
      select a.Variable, b.Label, b.Estimate, b.StandardizedEst as StdEst, b.StdErr, b.tValue, b.Probt,
             b.VarianceInflation as VIF, abs(b.StandardizedEst)/&cum as RelImp,
             a.Rsquare, a.AdjRsq, a.RMSE, a.CoeffVar, a.AIC, a.SBC, a.PC, a.JP,
             a.Lift_Index, a.gini, a.infv, a.ks, a.sord
      from tmp2 a
      left join vif b on a.variable = b.variable
      order by a.sord, RelImp desc;
  quit;

  /* --- Clean up --- */
  proc datasets library=work nolist;
    delete globalfit full vars tmp1 tmp2 vif;
  run;
  quit;
%mend;

/*
  Stats_Reg - fits one PROC REG model and collects its statistics in the
  one-row data set WORK.GLOBALFIT (R-square, adjusted R-square, root MSE,
  coefficient of variation, AIC, SBC, PC, JP, lift index, gini, information
  value, K-S).

  Parameters:
    insdn    data set to fit on
    varlist  model variables
    var      label written to the VARIABLE column of GLOBALFIT
    int      Y adds the NOINT option to the model
    vif      Y adds the VIF option and saves the parameter estimates as WORK.VIF

  Relies on the global macro variable vnobs when a weight is in use.
*/
%macro Stats_Reg(insdn, varlist, var, int=N, vif=N);

  /* --- Extract standard model statistics --- */
  ods listing close;
  ods graphics on;
  ods output  FitStatistics=FitStatistics;
  %if &vif=Y %then %do;
    options label;
    ods output ParameterEstimates=VIF;
  %end;
  proc reg data=&insdn outest=outest plots=none;
    %if &weight ne %then %do; weight &weight; %end;
    model &dep_var = &varlist / stb aic sbc jp pc
      %if &vif=Y %then %do; vif %end;
      %if &int=Y %then %do; noint %end;
    ;
    output out=scoretmp p=pred;
  run;
  ods graphics off;
  ods listing;
  quit;

  data globalfit (drop=label1 label2);
    length variable $32.;
    variable = "&var";
    merge
      FitStatistics (where=(label2="R-Square") rename=(nvalue2=Rsquare) keep=label2 nvalue2)
      FitStatistics (where=(label2="Adj R-Sq") rename=(nvalue2=AdjRsq) keep=label2 nvalue2)
      FitStatistics (where=(label1="Root MSE") rename=(nvalue1=RMSE) keep=label1 nvalue1)
      FitStatistics (where=(label1="Coeff Var") rename=(nvalue1=CoeffVar) keep=label1 nvalue1)
      outest (keep=_AIC_ _SBC_ _PC_ _JP_ rename=(_AIC_=AIC _SBC_=SBC _PC_=PC _JP_=JP));
  run;

  /* --- Create deciles based on predicted value --- */
  %if &weight = %then %do;
    proc rank data=scoretmp (keep=&dep_var pred &weight) out=tmp1 groups=10;
      var pred;
      ranks rank;
    run;
  %end;
  %else %do;
    proc sort data=scoretmp (keep=&dep_var pred &weight) out=tmp1; by pred; run;
    data tmp1;
      set tmp1;
      retain cum;
      cum + &weight;
      rank = (floor(cum*10/(&vnobs+1)));
    run;
  %end;

  /* --- Summarize file by rank --- */
  proc summary data=tmp1;
    var &dep_var pred;
    class rank;
    %if &weight ne %then %do; weight &weight; %end;
    %if &weight ne %then %do; output out=tmp2 (drop= _freq_) sumwgt=count %end;
    %else %do; output out=tmp2 (rename= _freq_=count) %end;
      mean(&dep_var)=mean_dv mean(pred)=mean_pred;
  run;

  /* --- Create custom statistics --- */
  proc sql noprint;
    select mean_dv into: norm   from tmp2 (where=(_type_=0));
    select count into: ntotal from tmp2 (where=(_type_=0));
    select sum(count*mean_dv) into: totalresp from tmp2 (where=(_type_=1));
    select sum(count*(count*mean_dv/&totalresp)) into: totalbad from tmp2 (where=(_type_=1));
  quit;

  /* Only the last rank row is output, carrying the accumulated gini / infv / ks */
  data tmp3 (keep=lift_index gini infv ks);
    set tmp2 (where=(_type_=1)) end=eof;
    Lift_Index = mean_dv / &norm * 100;
    retain cumresp cumtotal;
    if _n_ = 1 then do;
      cumresp = 0;
      cumtotal = 0;
    end;
    cumresp = cumresp + mean_dv * count;
    cumpct_resp = cumresp / &totalresp;
    cumtotal = cumtotal + count;
    cumpct_freq = cumtotal / &ntotal;
    cumavg = cumresp / cumtotal;
    cumindex = cumavg / &norm * 100;
    badrate = count * mean_dv / &totalresp;
    goodrate = (1 - badrate) * count / (&ntotal - &totalbad);
    retain gini infv ks cumbad cumgood 0;
    prev_dv = lag(cumpct_resp);
    prev_pct = lag(cumpct_freq);
    if _n_ ne 1 then gini = gini+2*((cumpct_freq+prev_pct)/2-(cumpct_resp+prev_dv)/2)*(cumpct_freq-prev_pct);
    infv = infv+(badrate-goodrate)*log(badrate/goodrate);
    cumbad = cumbad+badrate;
    cumgood = cumgood+goodrate;
    ks = max(ks,abs(cumbad-cumgood));
    if eof;
  run;
  data globalfit; merge globalfit tmp3; run;

  /* --- Clean up --- */
  proc datasets library=work nolist;
    delete FitStatistics outest scoretmp tmp1 tmp2 tmp3;
  run;
  quit;
%mend;

/*
  Vars_tune - variable tuning and selection based on the model metric output.

  Selects the variables (sord = 3, file = 'Val') of dt1 and dt2 whose metric
  named by the global criteria is <= the threshold and whose RelImp is >= the
  global MinImp, combines the two selections with the set operator held in the
  global SQL_join, writes a LET statement for Varlist_Final to out_txt and
  flags the selected variables with final='Y' in OUT.CE4_VARIABLES.

  For criteria in the neg list the threshold is applied with its sign
  reversed. The reversal is done on a local copy so that the global threshold
  is not modified (the original flipped the global on every call), and the
  criterion is matched as a whole word, case-insensitively.

  Parameters:
    dt1, dt2  metric data sets produced by Model_Val_Logistic / Model_Val_Reg
    out_txt   path of the text file receiving the LET statement
*/
%macro Vars_tune(dt1=, dt2=, out_txt=&path_output./CE4_Varlist_Final.txt);
  %local neg thr;
  %let neg=Rsquare SomersD Gamma TauA c Concord infv tvr ks diverg AdjRsq Lift_Index gini;
  %let thr=&threshold;
  %if &thr ^= 0 and %sysfunc(indexw(%upcase(&neg), %upcase(&criteria))) > 0 %then %do;
    %let thr = %sysevalf(-1*&thr);
  %end;

  proc sql;
    create table tempfinal as
      select variable from &dt1
      where sord = 3 and &criteria <= &thr  and RelImp >= &MinImp  and file = 'Val'
      &SQL_join
      select variable from &dt2
      where sord = 3 and &criteria <= &thr  and RelImp >= &MinImp  and file = 'Val';
  quit;

  /* Write the selected variables as a macro LET statement */
  data _NULL_;
    set tempfinal end=eof;
    FILE  "&out_txt"  LRECL=256;
    if _N_=1 then do;
      PUT ' ';
      PUT "%"@;
      PUT "LET Varlist_Final=";
    end;
    PUT variable @;
    if eof then PUT ';';
  run;

  proc sort data=tempfinal; by variable;
  data out.ce4_variables;
    merge out.ce4_variables tempfinal (in=a);
    by variable;
    if a then final = 'Y';
  run;

  proc datasets library=work nolist;
    delete tempfinal;
  quit;
  run;
%mend;

/*
  Grafing - creates the graph report CE4_Graphs.pdf in the path_output folder.

  For each variable in varlist: when it has at most cats distinct values the
  mean of dep_var and the record count are reported per value; otherwise the
  variable is cut into cats equal-width bins between its 1st and 99th
  percentiles and reported per bin.

  Parameters:
    inds     input data set
    varlist  variables to chart
    cats     maximum number of distinct values treated as categories, and the
             number of bins used otherwise
*/
%macro Grafing (inds,varlist,cats);

  /* --- Get variables and unique counts --- */
  proc sql;
    create table cnts as
      select
        %let i=1;
        %let v=%scan(&varlist, &i,' ');
        %do %while (&v^=);
          count(distinct(&v)) as &v,
          %let i=%eval(&i+1);
          %let v=%scan(&varlist, &i,' ');
        %end;
        "dummy" as dummy
      from &inds;
  quit;
  proc transpose data=cnts (drop=dummy) out=cnts (rename=col1=uniq) name=variable; run;

  /* --- Create macro variables to control processing --- */
  data _null_;
    set cnts end=eof;
    if eof then call symputx("VARCNT",_N_);
    call symputx("IV"|| trim(left(put(_N_,4.)))  ,variable);
    call symputx("uniq"|| trim(left(put(_N_,4.)))  ,uniq);
  run;

  goptions reset=all device=pdf display gunit=pct border ftext= htitle=8 htext=3;
  ods listing close;
  ods pdf file="&path_output/CE4_Graphs.pdf" style=sasweb startpage=no;

  /* --- Loop through for each variable --- */
  %do _I_ = 1 %to &varcnt;
    %if &_I_ ^= 1 %then %do; ods pdf startpage=now; %end;

    %if &&uniq&_I_ <= &cats %then %do;
      /* Few distinct values: report per value */
      proc summary data=&inds nway missing;
        var &dep_var;
        class &&iv&_I_;
        output out=graf (drop=_type_ rename=_freq_=count) mean=mean;
      run;
      proc print data=graf noobs;
        title "Variable: &&iv&_I_";
        var &&iv&_I_ count mean;
        format count comma8.;
        format mean 8.4;
      run;
      proc sgplot data=graf;
        vbar &&iv&_I_ / response=mean nostatlabel /*nooutline*/ fillattrs=(color="lightblue") /*transparency=.5*/;
        vline &&iv&_I_ / response=count nostatlabel y2axis lineattrs=(color="darkblue" thickness=2);
        label count="# of Customers";
        label mean="Mean &dep_var";
        label &&iv&_I_="&&iv&_I_";
        keylegend / location = outside
                    position = top
                    noborder
                    title = "&&iv&_I_";
        format count comma8.;
      run;
    %end;
    %else %do;
      /* Many distinct values: bin between the 1st and 99th percentiles */
      proc means data=&inds p1 p99 NOPRINT;
        var &&iv&_I_;
        output out=tmp p1=var_p1 p99=var_p99  / noinherit;
      run;
      data _NULL_;
        set tmp;
        range = (var_p99 - var_p1)/&cats;
        call symputx('cut_lo',var_p1);
        call symputx('cut_hi',var_p99);
        call symputx('range',range);
      run;
      data tmp;
        set &inds (keep=&dep_var &&iv&_I_);
        if &&iv&_I_ < &cut_lo then bin = 1;
        else if &&iv&_I_ >= &cut_hi then bin = input("&cats",best10.);
        else do;
          do k = 1 to &cats;
            if &&iv&_I_ >= &cut_lo+(k-1)*&range and &&iv&_I_< &cut_lo+k*&range then
              bin=k;
          end;
        end;
      run;
      proc summary data=tmp nway missing;
        var &&iv&_I_ &dep_var;
        class bin;
        output out=graf (drop=_type_ rename=_freq_=count) min(&&iv&_I_)=lo max(&&iv&_I_)=hi mean(&dep_var)=mean;
      run;
      proc print data=graf noobs;
        title "Variable: &&iv&_I_";
        var bin count lo hi mean;
        format count comma8.;
        format lo hi mean 8.4;
      run;
      proc sgplot data=graf;
        vbar lo / response=mean nostatlabel fillattrs=(color="lightblue");
        vline lo / response=count nostatlabel y2axis lineattrs=(color="darkblue" thickness=2);
        label count="# of Customers";
        label mean="Mean &dep_var";
        label lo="&&iv&_I_";
        keylegend / location = outside
                    position = top
                    noborder
                    title = "&&iv&_I_";
        format count comma8.;
      run;
      proc datasets nolist;
        delete tmp;
      run;
    %end;
  %end;

  ods pdf close;
  ods listing;
  proc datasets nolist;
    delete cnts graf;
  run;
%mend;

/*
  CE4_Flag_Selected - helper for CE_Model_Val. Reduces WORK.PARM1 (forward)
  and WORK.PARM2 (backward) to their non-intercept variable names and sets the
  columns named by fwd / bwd to 'Y' in OUT.CE4_VARIABLES for the variables
  present in each.
*/
%macro CE4_Flag_Selected(fwd, bwd);
  proc sort data=parm1 (keep=variable where=(variable ^= 'Intercept')); by variable; run;
  proc sort data=parm2 (keep=variable where=(variable ^= 'Intercept')); by variable; run;
  data out.ce4_variables;
    merge out.ce4_variables parm1 (in=a) parm2 (in=b);
    by variable;
    if a then &fwd = 'Y';
    if b then &bwd = 'Y';
  run;
%mend;

/*
  CE4_Print_Metrics - helper for CE_Model_Val. Prints one metric data set to
  its own sheet of the currently open ExcelXP destination. The column set
  depends on the global Binary_dv (logistic vs. regression metrics).

  Parameters:
    ds     metric data set to print
    sheet  sheet name
    ttl    title text
*/
%macro CE4_Print_Metrics(ds, sheet, ttl);
  ods tagsets.excelxp options(sheet_name="&sheet" embedded_titles="yes");
  proc print data=&ds noobs;
    title "&ttl";
    var file Variable Label;
    var Estimate  / style={tagattr='format:0.000000'};
    %if %upcase(&Binary_dv) = Y %then %do;
      var StdEst StdErr WaldChiSq ProbChiSq VIF RelImp  / style={tagattr='format:0.0000'};
      var AIC SC LogL2  / style={tagattr='format:0.00'};
      var Rsquare SomersD Gamma TauA c Concord Discon LackFit / style={tagattr='format:0.0000'};
      var Lift_Index / style={tagattr='format:0.00'};
      var infv ks  / style={tagattr='format:0.0000'};
    %end;
    %else %do;
      var StdEst StdErr tValue Probt VIF RelImp  / style={tagattr='format:0.0000'};
      var AIC SBC JP  / style={tagattr='format:0.00'};
      var Rsquare AdjRsq RMSE CoeffVar PC / style={tagattr='format:0.0000'};
      var Lift_Index / style={tagattr='format:0.00'};
      var gini infv ks  / style={tagattr='format:0.0000'};
    %end;
  run;
  title ;
%mend;

/*
  CE_Model_Val - master macro for model selection and tuning.

  Splits insdn into MOD (mod_val_test=1) and VAL (mod_val_test=2), runs the
  selection macro twice (MOD as the fitting set, then VAL as the fitting
  set), records which variables each selection kept in OUT.CE4_VARIABLES,
  calls Vars_tune, writes the Excel report CE4_Model_report.xls and, when the
  global graph_plot is Y, the graph report.

  Parameters:
    insdn    input data set; must contain mod_val_test
    varlist  candidate independent variables (blank separated)
*/
%macro CE_Model_Val(insdn, varlist);
  data mod val;
    set &insdn (where=(mod_val_test ^= 3));
    keep &Dep_var mod_val_test &varlist &weight;
    if mod_val_test=1 then output mod;
    else if mod_val_test=2 then output val;
  run;

  /* --- Start variable table: one row per candidate variable --- */
  data out.ce4_variables;
    length variable $32;
    %Let I = 1;
    %Let var = %scan(&varlist,&I);
    %Do %while(&var ne );
      variable="&var";
      output;
      %Let I = %eval(&I + 1);
      %Let var = %scan(&varlist,&I);
    %end;
  run;
  proc sort data=out.ce4_variables; by variable; run;

  /* --- Selection with MOD as the fitting set --- */
  %if %upcase(&Binary_dv) = Y %then %do;
    %Model_Val_Logistic(mod, val, &varlist,sle=&sel_alpha,sls=&sel_alpha,metric_sdnout=out.CE4_Model_Metric_Mod);
  %end;
  %else %do;
    %Model_Val_Reg(mod, val, &varlist,sle=&sel_alpha,sls=&sel_alpha,metric_sdnout=out.CE4_Model_Metric_Mod);
  %end;
  %CE4_Flag_Selected(Tforward, Tbackward);

  /* --- Selection with VAL as the fitting set --- */
  %if %upcase(&Binary_dv) = Y %then %do;
    %Model_Val_Logistic(val, mod, &varlist,sle=&sel_alpha,sls=&sel_alpha,metric_sdnout=out.CE4_Model_Metric_Val);
  %end;
  %else %do;
    %Model_Val_Reg(val, mod, &varlist,sle=&sel_alpha,sls=&sel_alpha,metric_sdnout=out.CE4_Model_Metric_Val);
  %end;
  %CE4_Flag_Selected(Vforward, Vbackward);

  /* --- Model selection and tuning --- */
  %Vars_tune(dt1=out.CE4_Model_Metric_Mod,dt2=out.CE4_Model_Metric_Val,out_txt=&path_output./CE4_Varlist_Final.txt);

  /* --- Report --- */
  /* The path separator after path_output was missing in the original, unlike every other output path */
  ods listing close;
  ods Tagsets.ExcelxP body="&path_output./CE4_Model_report.xls" style=sasweb;
  ods tagsets.excelxp options(sheet_name="Variables");
  proc print data=out.ce4_variables noobs; run;
  %CE4_Print_Metrics(out.CE4_Model_Metric_Mod, Model, Diagnostic statistics: Model Portion);
  %CE4_Print_Metrics(out.CE4_Model_Metric_Val, Validation, Diagnostic statistics: Validation Portion);
  ods Tagsets.ExcelxP close;
  ods listing;
  title ;

  /* --- Do charting? --- */
  %if %upcase(&graph_plot)=Y %then %do;
    %inc "&path_output/CE4_Varlist_Final.txt";
    data gdat; set &insdn (where=(mod_val_test ^= 3)); run;
    %Grafing(gdat,&Varlist_Final,20);
    proc datasets library=work nolist;
      delete gdat;
    run;
    quit;
  %end;

  proc datasets library=work nolist;
    delete mod val parm1 parm2;
  run;
  quit;
%mend CE_Model_Val;

0 0
原创粉丝点击