Here is the code to generate data. The problem arises during multinomial logistic regression data generation process. Code snippet below is to generate the the categorical outcome from multinomial logistics regression model. This is to be used for treatment status.
** generate treatment status;
call RANDGEN(t, "TABLE", p);
However, simulating 1000 samples most often result in some categories with no values which result in error during dummy code transformation. I was trying to find a way to skip these samples which stall the program, and begin next cycle. If there are 950 samples out 1000 it is fine. But for now, sometimes the error happen to be with the 10th sample and cannot go beyond that.
%let S=1211;
%let NumSamples = 100;
%let N=1200;
%let beta11 = log(3);
%let beta12 = log(4);
%let beta13 = 0;
%let beta21 = log(9/10);
%let beta22 = log(10/9);
%let beta23 = 0;
%let alpha0 = 0;
%let alpha1 = 0.2;
%let alpha2 = 0.4;
%let alphaX = 0.2;
%let alphaX3 = 0.1;
** simulate data;
proc iml;
** assign variable names and allocate space for the data and parameters;
varNamesData={SampleID x x3 t t1 t2 y};
TempSimData = J(&N, NCOL(varNamesData));
x = J(&N, 1);
t = J(&N, 1);
p = J(&N, 3);
t1 = J(&N, 1, 0);
t2 = J(&N, 1, 0);
t3 = J(&N, 1, 0);
y = J(&N,1);
epsilon = J(&N, 1);
TempSimData = J(&N, NCOL(varNamesData));
create SimData from TempSimData[c=varNamesData];
** simulation loop;
do SampleID = 1 to &NumSamples;
call RANDSEED(0);
call RANDGEN(x, "NORMAL", 1, 1);
** calculate the qubic term;
x3 = x##3;
beta01 = -(&beta11 * 1 + &beta21 * 4);
beta02 = -(&beta12 * 1 + &beta22 * 4);
** define linear equations;
eta13 = beta01 + &beta11 * x + &beta21 * x3; *T=1 vs T=3;
eta23 = beta02 + &beta12 * x + &beta22 * x3; *T=2 vs T=3;
*eta33 = 0 + 0 * x + 0*x3;; *T=3 vs T=3;
** find actual probabilities for subjects to be in each treatment level;
pi1 = exp(eta13) / (exp(eta13) + exp(eta23) + 1);
pi2 = exp(eta23) / (exp(eta13) + exp(eta23) + 1);
pi3 = 1 / (exp(eta13) + exp(eta23) + 1);
** fill the probability matrix from pi1, pi2, and pi3;
p[,1] = pi1;
p[,2] = pi2;
p[,3] = pi3;
** generate treatment status;
call RANDGEN(t, "TABLE", p);
idx1 = LOC(t=1);
idx2 = LOC(t=2);
idx3 = LOC(t=3);
* create dummy variables for treatment levels;
if NCOL(idx1)>0 then t1[idx1]=1; else print "No observations in level 1";
if NCOL(idx2)>0 then t2[idx2]=1; else print "No observations in level 2";
if NCOL(idx3)>0 then t3[idx3]=1; else print "No observations in level 3";
** generate residuals;
call RANDGEN(epsilon, "NORMAL", 0, .5);
** generate y;
y = &alpha0 + &alpha1*t1 + &alpha2*t2 + &alphaX*x + &alphaX3*x3 + epsilon;
** create a temporary simulated data for each simulation loop;
TempSimData[,1] = SampleID;
TempSimData[,2] = x;
TempSimData[,3] = x3;
TempSimData[,4] = t;
TempSimData[,5] = t1;
TempSimData[,6] = t2;
TempSimData[,7] = y;
setout SimData;
append from TempSimData;
end;
close SimData;
quit;
... View more