* example data;
* reference: https://www.mwsug.org/proceedings/2017/AA/MWSUG-2017-AA02.pdf;
/* Build the simulated data set TEST: one row per ID, 10,000 rows in total. */
data test;
    do ID = 1 to 10000;

        /* Error term: logit transform of a uniform draw. */
        cumLogit = ranuni(2);
        e = log(cumLogit / (1 - cumLogit));

        /* Continuous predictors (normal draws), with their role in xbeta below. */
        X1 = rannor(9);    /* enters xbeta squared */
        X2 = rannor(9);    /* enters xbeta as log(X2 + 8) */
        X3 = rannor(9);    /* enters xbeta with a small coefficient (.01) */
        X4 = rannor(9);    /* does not enter xbeta */
        X5 = rannor(9);    /* enters xbeta only through the product X7 */
        X6 = rannor(9);    /* enters xbeta only through the product X7 */
        X7 = X5 * X6;      /* interaction term, enters xbeta */

        /* Binary predictors: 0/1 indicators from uniform draws; all enter xbeta. */
        B1 = (ranuni(1) < .4);
        B2 = (ranuni(1) < .6);
        B3 = (ranuni(1) < .5);
        B4 = (ranuni(1) < .5);

        /* Count variables built from the indicators; only indirectly in xbeta. */
        C12 = B1 + B2;
        C34 = B3 + B4;

        /* Linear predictor plus error, the implied probability, and the 0/1 response. */
        xbeta = X1**2 + log(X2+8) + .01*X3 + 2*X7 + 0.1*B1 + B2 + B3 + B4 + e;
        P_1 = exp(xbeta) / (1 + exp(xbeta));
        Y = (P_1 > 0.95);

        /* Partition role variable: 1 for the first 5,000 IDs, 0 for the rest. */
        PART = (ID <= 5000);

        output;
    end;
run;
/* List the variables of TEST; VARNUM is requested on the PROC statement. */
proc contents data=test varnum;
run;
* Question 1: how to obtain the parameter estimates at every step, rather than only at the selected (last) step, from a single run of proc hpgenselect;
* The goal is to then calculate the test AUC per step as the primary metric, rather than AIC, BIC, ..., on data not used to train the models;
/* LASSO selection for the binary response Y, with LASSOSTEPS=20 and LASSORHO=0.8.
   NOTE: no PARTITION statement is given, so the PART variable in TEST is not used here. */
proc hpgenselect data=test lassosteps=20 lassorho=0.8;
    class C12 C34;
    model Y(descending) = X1 X2 X3 X4 X5 X6 X7 C12 C34 / distribution=binary;
    selection method=lasso details=all;
run;
* Question 2;
** why are the parameter estimates, AIC, AICC, and BIC not the same between
*** Model 1 Step 2 Lambda 0.64 and;
*** Model 2 Step 1 Lambda 0.64;
** why is the Chosen Regularization Parameter in Model 2 not 0.64*0.190154;
** Model 1;
/* Same LASSO specification, limited to LASSOSTEPS=2 with LASSORHO=0.8. */
proc hpgenselect data=test lassosteps=2 lassorho=0.8;
    class C12 C34;
    model Y(descending) = X1 X2 X3 X4 X5 X6 X7 C12 C34 / distribution=binary;
    selection method=lasso details=all;
run;
** Model 2;
/* Same LASSO specification, limited to LASSOSTEPS=1 with LASSORHO=0.64. */
proc hpgenselect data=test lassosteps=1 lassorho=0.64;
    class C12 C34;
    model Y(descending) = X1 X2 X3 X4 X5 X6 X7 C12 C34 / distribution=binary;
    selection method=lasso details=all;
run;