Bagging算法在SAS中的实现

来源:互联网 发布:阿里云智能家居 编辑:程序博客网 时间:2024/05/21 19:41
原文地址:Bagging算法在SAS中的实现 作者:文穗
%macro bagging(data = , y = , numx = , catx = , ntrees = 10, seed = 20110613);
***********************************************************;
* THIS SAS MACRO IS AN ATTEMPT TO IMPLEMENT BAGGING       *;
* PROPOSED BY LEO BREIMAN (1996)                          *;
* ======================================================= *;
* PARAMETERS:                                             *;
*  DATA   : INPUT SAS DATA TABLE                          *;
*  Y      : RESPONSE VARIABLE WITH 0/1 VALUE              *;
*  NUMX   : A LIST OF NUMERIC ATTRIBUTES                  *;
*  CATX   : A LIST OF CATEGORICAL ATTRIBUTES              *;
*  NTREES : # OF TREES TO DO THE BAGGING                  *;
*  SEED   : MASTER SEED USED TO DERIVE ONE SEED PER TREE  *;
*           (DEFAULT 20110613)                            *;
* ======================================================= *;
* OUTPUTS:                                                *;
*  1. A SAS CATALOG FILE NAMED "TREEFILES" IN THE WORKING *;
*     DIRECTORY CONTAINING ALL SCORING FILES IN BAGGING   *;
*  2. A LST FILE SHOWING ks STATISTICS OF THE BAGGING     *;
*     CLASSIFIER AND EACH TREE CLASSIFIER                 *;
* ======================================================= *;
* CONTACT:                                                *;
*  WENSUI.LIU@53.COM, LOSS FORECASTING & RISK MODELING    *;
***********************************************************;

options mprint mlogic nocenter nodate nonumber;

*** assign a library to the working folder ***;
libname _path '';

*** derive one seed per tree from the master seed and store them ***;
*** in macro variables random1 ... randomN                       ***;
data _null_;
  do i = 1 to &ntrees;
    random = put(ranuni(&seed) * (10 ** 8), 8.);
    name   = compress("random"||put(i, 3.), ' ');
    call symput(name, random);
  end;
run;

*** clean up catalog files in the library ***;
proc datasets library = _path nolist;
  delete TreeFiles tmp / memtype = catalog;
run;
quit;

/* NOTE(review): nobs counts only rows whose response is 0 or 1, while  */
/* _tmp1 below keeps every row of the input table. Confirm the input    */
/* has no other response values, or the bootstrap source and the sample */
/* size will disagree.                                                  */
proc sql noprint;
  select count(*) into :nobs from &data where &y in (1, 0);
quit;

*** keep only the modeling columns and add a row key used later ***;
*** to aggregate the predictions of all trees per observation   ***;
data _tmp1 (keep = &y &numx &catx _id_);
  set &data;
  _id_ + 1;
run;

%do i = 1 %to &ntrees;

  %put &&random&i;

  *** generate bootstrap samples for bagging ***;
  *** sampling with replacement, the hit count is carried as a frequency ***;
  proc surveyselect data = _tmp1 method = urs n = &nobs seed = &&random&i
    out = sample&i(rename = (NumberHits = _hits)) noprint;
  run;

  *** generate data mining datasets for sas e-miner ***;
  proc dmdb data = sample&i out = db_sample&i dmdbcat = cl_sample&i;
    class &y &catx;
    var &numx;
    target &y;
    freq _hits;
  run;

  *** create a sas temporary catalog to contain sas output ***;
  filename out_tree catalog "_path.tmp.out_tree.source";

  *** create decision tree mimicking CART ***;
  proc split data = db_sample&i dmdbcat = cl_sample&i
    criterion    = gini
    assess       = impurity
    maxbranch    = 2
    splitsize    = 100
    subtree      = assessment
    exhaustive   = 0
    nsurrs       = 0;
    code file    = out_tree;
    input &numx   / level = interval;
    input &catx   / level = nominal;
    target &y     / level = binary;
    freq _hits;
  run;

  *** create a permanent sas catalog to contain all tree outputs ***;
  filename in_tree catalog "_path.TreeFiles.tree&i..source";

  *** copy the generated scoring code, skipping its first 3 lines ***;
  data _null_;
    infile out_tree;
    input;
    file in_tree;
    if _n_ > 3 then put _infile_;
  run;

  *** score the original data by each tree output file ***;
  data _score&i (keep = p_&y.1 p_&y.0 &y _id_);
    set _tmp1;
    %include in_tree;
  run;

  *** calculate KS stat ***;
  *** listing output is redirected so that only the ODS table is kept ***;
  proc printto new print = lst_out;
  run;

  ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
  proc npar1way wilcoxon edf data = _score&i;
    class &y.;
    var p_&y.1;
  run;

  proc printto;
  run;

  *** the first tree creates the accumulators, later trees append to them ***;
  %if &i = 1 %then %do;
    data _tmp2;
      set _score&i;
    run;

    data _ks;
      set _kstmp (keep = nvalue2);
      tree_id = &i;
      seed    = &&random&i;
      ks      = round(nvalue2 * 100, 0.0001);
    run;
  %end;
  %else %do;
    data _tmp2;
      set _tmp2 _score&i;
    run;

    data _ks;
      set _ks _kstmp(in = a keep = nvalue2);
      if a then do;
        tree_id = &i;
        seed    = &&random&i;
        ks      = round(nvalue2 * 100, 0.0001);
      end;
    run;
  %end;

%end;

*** aggregate predictions from all trees in the bag ***;
proc summary data = _tmp2 nway;
  class _id_;
  output out = _tmp3(drop = _type_ rename = (_freq_ = freq))
  mean(p_&y.1) =
  mean(p_&y.0) =
  mean(&y) = ;
run;

*** calculate bagging KS stat ***;
proc printto new print = lst_out;
run;

ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
proc npar1way wilcoxon edf data = _tmp3;
  class &y;
  var p_&y.1;
run;

proc printto;
run;

*** the bagged classifier is recorded as tree_id 0 with the master seed ***;
data _ks;
  set _ks _kstmp (in = a keep = nvalue2);
  if a then do;
    tree_id = 0;
    seed    = &seed;
    ks      = round(nvalue2 * 100, 0.0001);
  end;
run;

proc sort data = _ks;
  by tree_id;
run;

proc sql noprint;
  select max(ks) into :max_ks from _ks where tree_id > 0;
  select min(ks) into :min_ks from _ks where tree_id > 0;
  select ks into :bag_ks from _ks where tree_id = 0;
quit;

*** summarize the performance of bagging classifier and each tree in the bag ***;
title "MAX KS = &max_ks, MIN KS = &min_ks, BAGGING KS = &bag_ks";
proc print data = _ks noobs;
  var tree_id seed ks;
run;
title;

*** remove the temporary catalog, the TreeFiles catalog is kept ***;
proc datasets library = _path nolist;
  delete tmp / memtype = catalog;
run;
quit;

%mend bagging;

/* Example run. NOTE(review): the published path had lost its separators */
/* (D:SAS_CODEbagging) and is restored here on a best guess; confirm.    */
%let x1 = tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt
          tot_rev_line rev_util bureau_score ltv tot_income;

%let x2 = purpose;

libname data 'D:\SAS_CODE\bagging';

%bagging(data = data.accepts, y = bad, numx = &x1, catx = &x2, ntrees = 10);

0 0