Hi @sidello,
Let's assume your input dataset (created without the DSD option of the INFILE statement) was named HAVE. My understanding is that you want to split each pair so as to assign the two participants to two groups (say group 0 and group 1) which are as balanced as possible regarding the frequency distributions of categorical variable CAT. My suggested solution to this problem would look something like this:
/* Step 1: collapse the data to one observation per pair.
   The view presents HAVE ordered by pair and participant, which
   allows BY-group processing on PAIR in the DATA step that reads it. */
proc sql;
  create view _tmp as
    select *
      from have
      order by pair, part;
quit;
/* Build PAIRS: one observation per pair, holding
     _p0/_c0 - participant id and category of the pair's first member,
     _p1/_c1 - participant id and category of the pair's second member,
     _r      - a uniform random number, used later as a shuffle key.
   A pair that does not consist of exactly two observations is skipped
   with a WARNING in the log: a single-member pair would leave _p1/_c1
   missing (an invalid array subscript downstream), and a larger pair
   would silently lose all but its first and last member. */
data pairs(drop=part cat _k);
  call streaminit(27182818);   /* fixed seed => reproducible shuffle */
  _k = 0;                      /* members read for the current pair */
  /* DOW loop: one DATA step iteration consumes one whole BY group */
  do until(last.pair);
    set _tmp;
    by pair;
    _k = _k + 1;
    if first.pair then do;
      _p0 = part;
      _c0 = cat;
    end;
    else do;
      _p1 = part;
      _c1 = cat;
    end;
  end;
  /* Drawn for every pair so the random stream does not depend on
     whether a pair is later rejected */
  _r = rand('uniform');
  if _k = 2 then output;
  else put "WARNING: Skipping " pair= "- it has " _k
           "observation(s), but exactly 2 are required.";
run;
/* The helper view is no longer needed - remove it */
proc sql;
  drop view _tmp;
quit;
/* Shuffle PAIRS by ordering its observations on the random key _r
   (without OUT=, PROC SORT replaces the input data set) */
proc sort data=pairs;
  by _r;
run;
/* Split every pair across two groups (0 and 1) so that, within each
   category, the two groups end up with balanced numbers of participants */
%let nc=3; /* number of categories (1, 2, ..., &nc) */
data want(drop=_:);
  call streaminit(3141592);
  /* _cnt[c, g]: participants of category c placed in group g so far */
  array _cnt[&nc,0:1] _temporary_ (%eval(2*&nc)*0);
  set pairs;

  /* Score of option 0: first member -> group 0, second member -> group 1 */
  _cost0 = (_cnt[_c0,0] + 1 - _cnt[_c0,1])**2
         + (_cnt[_c1,1] + 1 - _cnt[_c1,0])**2;
  /* Score of option 1: first member -> group 1, second member -> group 0 */
  _cost1 = (_cnt[_c0,1] + 1 - _cnt[_c0,0])**2
         + (_cnt[_c1,0] + 1 - _cnt[_c1,1])**2;

  /* First member: take the option with the smaller score; on a tie, flip a coin */
  part = _p0;
  cat  = _c0;
  if _cost0 = _cost1 then group = rand('bern',0.5);
  else if _cost0 < _cost1 then group = 0;
  else group = 1;
  _cnt[cat,group] + 1;
  output;

  /* Second member: always the opposite group */
  part  = _p1;
  cat   = _c1;
  group = 1 - group;
  _cnt[cat,group] + 1;
  output;
run;
/* Order the result by participant, then by pair */
proc sort data=want;
  by part pair;
run;
/* Cross-tabulate group by category to verify that the groups are balanced */
proc freq data=want;
  tables group * cat;
run;
... View more