@JKHess Below my attempt to meet your requirements based on my understanding. I believe the code does what it should but I strongly suggest you take some time to create some really good sample data for testing where you know exactly how the desired result needs to look like.
I've chosen for performance reasons a data step/hash table approach. Your data volumes are serious so depending on the memory you've got things might or might not work.
I'm sharing two coding options (variations). Option 1 requires a bit less data prep but in turn might require up to 7GB of memory, option 2 should already work with 4GB of memory.
You can check your memory settings via below (results are in bytes, zero means normally as much as available).
proc options group=memory;
run;
Option 1:
/*************** create sample data ************************/
/* file 1 */
data dec;
input ID Date :mmddyy10. Decile;
format Date mmddyy10.;
datalines;
1 1/1/2017 1
22 1/1/2017 1
41 1/1/2017 1
56 1/1/2017 2
79 1/1/2017 2
85 1/1/2017 2
100 1/2/2017 1
118 1/2/2017 1
125 1/2/2017 2
167 1/2/2017 2
178 1/2/2017 3
;
run;
/* file 2 - not really a bridge because relationship bridge:no_dec is many:many */
data bridge;
input Date :mmddyy10. Decile Zipcode $5.;
format Date mmddyy10.;
datalines;
1/1/2017 1 88123
1/1/2017 1 03867
1/1/2017 1 04001
1/1/2017 2 03304
1/1/2017 2 98765
1/1/2017 2 96224
1/1/2017 2 00001
1/2/2017 1 98801
1/2/2017 2 88123
1/2/2017 2 12345
1/2/2017 2 83356
1/2/2017 2 98765
1/2/2017 3 03304
1/2/2017 3 04945
;
run;
/* file 3 */
data no_dec;
input ID Zipcode $5.;
datalines;
2 88123
21 88123
22 88123
23 88123
24 88123
3 12345
4 03304
5 03867
6 04945
7 04001
8 98765
9 98801
10 96224
11 00001
12 83356
13 83356
;
run;
/************ draw control *************************************/
/* assign a random value to each entry in no_dec (file 3) and output as table no_dec_ranno sorted by zipcode and random value */
data _null_;
dcl hash h1(ordered:'y', multidata:'y');
h1.defineKey('Zipcode','ran_no');
h1.defineData('id','zipcode','ran_no');
h1.defineDone();
call streaminit(10);
do until(_last);
set no_dec end=_last;
ran_no=rand('uniform');
_rc=h1.add();
end;
_rc=h1.output(dataset:'no_dec_ranno');
stop;
run;
data control(keep=id_dec id zipcode date Decile select_cnt)
control_insufficient_data(keep=id_dec id zipcode date Decile select_cnt)
;
length id_dec 8;
if _n_=1 then
do;
call streaminit(10);
/* load no_dec_ranno into hash replacing the random values by a counter */
/* - the order is still random but the counter instead of a random value will allow to address specific items later on */
/* - memory consumption of this hash is around 88bytes * number of items plus some overhead. For 34.6M rows close to 3GB */
dcl hash h_nodec(ordered:'y');
h_nodec.defineKey('Zipcode','seq_no');
h_nodec.defineData('id','zipcode','seq_no');
h_nodec.defineDone();
do until(_last);
set no_dec_ranno(drop=ran_no) end=_last;
by Zipcode;
if first.zipcode then seq_no=1;
else seq_no+1;
_rc=h_nodec.add();
end;
/* load the bridge data into a hash */
dcl hash h_brdg(dataset:'bridge', multidata:'y', ordered:'y');
h_brdg.defineKey('date','decile');
h_brdg.defineData('zipcode');
h_brdg.defineDone();
/* hash to store per zipcode the last seq_no (counter) used to populate the table with control data */
dcl hash h_last_ranno();
h_last_ranno.defineKey('zipcode');
h_last_ranno.defineData('seq_no');
h_last_ranno.defineDone();
/* hash to store for the current row of dec (file 1) all matching rows from the bridge table */
dcl hash h_zipcollect();
h_zipcollect.defineKey('iter');
h_zipcollect.defineData('zipcode');
h_zipcollect.defineDone();
end;
call missing(of _all_);
set dec(rename=(id=id_dec));
/*** draw two controls for case ***/
select_cnt=0;
/** 1. select zipcode from bridge for lookup of rows in no_dec (file 3) */
/* load all zipcodes from the bridge into hash h_zipcollect that match with the current row from dec (file 1) */
_rc=h_zipcollect.clear();
do iter=1 by 1 while(h_brdg.do_over() = 0);
_rc=h_zipcollect.add();
end;
do i=1 to 99 until(select_cnt=2); /* if sufficient data to draw control from, loop will only iterate twice */
/* random selection of one of the matching zipcodes from hash h_zipcollect */
iter=rand('integer',1,h_zipcollect.num_items);
if h_zipcollect.find()=0 then
do;
/** 2. draw control **/
/* for the current zipcode derive lowest value for seq_no for the row that hasn't been drawn already */
if h_last_ranno.find() ne 0 then
do;
seq_no=1;
_rc=h_last_ranno.add();
end;
/* draw control record */
if h_nodec.find()=0 then
do;
/* count how many control records selected for the current record from dec */
select_cnt=sum(select_cnt,1);
output control;
/* remove selected record from hash as we won't select it again */
_rc=h_nodec.remove();
/* increase seq_no by 1 for this zipcode as prep of selection of another row for the table with controls */
seq_no=sum(seq_no,1);
_rc=h_last_ranno.replace();
end;
else
do;
/* in case all rows for a zipcode have already been drawn don't use this zipcode any further */
_rc=h_zipcollect.remove();
end;
end;
end;
if select_cnt<2 then output control_insufficient_data;
run;
/* title 'control'; */
/* proc print data=control; */
/* run; */
/* title 'Decedent with insufficient matching data to create control'; */
/* proc sql; */
/* select * */
/* from control_insufficient_data; */
/* quit; */
/* title; */
Option 2:
/*************** create sample data ************************/
/* file 1 */
data dec;
input ID Date :mmddyy10. Decile;
format Date mmddyy10.;
datalines;
1 1/1/2017 1
22 1/1/2017 1
41 1/1/2017 1
56 1/1/2017 2
79 1/1/2017 2
85 1/1/2017 2
100 1/2/2017 1
118 1/2/2017 1
125 1/2/2017 2
167 1/2/2017 2
178 1/2/2017 3
;
run;
/* file 2 - not really a bridge because relationship bridge:no_dec is many:many */
data bridge;
input Date :mmddyy10. Decile Zipcode $5.;
format Date mmddyy10.;
datalines;
1/1/2017 1 88123
1/1/2017 1 03867
1/1/2017 1 04001
1/1/2017 2 03304
1/1/2017 2 98765
1/1/2017 2 96224
1/1/2017 2 00001
1/2/2017 1 98801
1/2/2017 2 88123
1/2/2017 2 12345
1/2/2017 2 83356
1/2/2017 2 98765
1/2/2017 3 03304
1/2/2017 3 04945
;
run;
/* file 3 */
data no_dec;
input ID Zipcode $5.;
datalines;
2 88123
21 88123
22 88123
23 88123
24 88123
3 12345
4 03304
5 03867
6 04945
7 04001
8 98765
9 98801
10 96224
11 00001
12 83356
13 83356
;
run;
/************ date prep *************************************/
/* sort dec by date and decile to allow for by group processing */
proc sort data=dec presorted;
by date decile;
run;
/* sort bridge by date and decile to allow for by group processing */
/* - required because bridge table likely to big for loading in total into hash table */
proc sort data=bridge presorted;
by date decile;
run;
proc sql;
create index idx_dt_dec
on bridge (date, decile);
quit;
/* assign a random value to each entry in no_dec (file 3) and output as table no_dec_ranno sorted by zipcode and random value */
data _null_;
dcl hash h1(ordered:'y', multidata:'y');
h1.defineKey('Zipcode','ran_no');
h1.defineData('id','zipcode','ran_no');
h1.defineDone();
call streaminit(10);
do until(_last);
set no_dec end=_last;
ran_no=rand('uniform');
_rc=h1.add();
end;
_rc=h1.output(dataset:'no_dec_ranno');
stop;
run;
/************ draw control *************************************/
data control(keep=id_dec id zipcode date Decile select_cnt)
control_insufficient_data(keep=id_dec id zipcode date Decile select_cnt)
;
length id_dec 8;
if _n_=1 then
do;
call streaminit(10);
/* load no_dec_ranno into hash replacing the random values by a counter */
/* - the order is still random but the counter instead of a random value will allow to address specific items later on */
/* - memory consumption of this hash is around 88bytes * number of items plus some overhead. For 34.6M rows close to 3GB */
dcl hash h_nodec(ordered:'y');
h_nodec.defineKey('Zipcode','seq_no');
h_nodec.defineData('id','zipcode','seq_no');
h_nodec.defineDone();
do until(_last);
set no_dec_ranno(drop=ran_no) end=_last;
by Zipcode;
if first.zipcode then seq_no=1;
else seq_no+1;
_rc=h_nodec.add();
end;
/* create hash for bridge data */
dcl hash h_brdg(dataset:'bridge(obs=0)', multidata:'y', ordered:'y');
h_brdg.defineKey('date','decile');
h_brdg.defineData('zipcode');
h_brdg.defineDone();
/* hash to store per zipcode the last seq_no (counter) used to populate the table with control data */
dcl hash h_last_ranno();
h_last_ranno.defineKey('zipcode');
h_last_ranno.defineData('seq_no');
h_last_ranno.defineDone();
/* hash to store for the current row of dec (file 1) all matching rows from the bridge table */
dcl hash h_zipcollect();
h_zipcollect.defineKey('iter');
h_zipcollect.defineData('zipcode');
h_zipcollect.defineDone();
end;
call missing(of _all_);
set dec(rename=(id=id_dec));
by date decile;
if first.decile then
do;
/* load bridge data for current date into hash h_brdg */
_rc=h_brdg.delete();
h_brdg = _new_ hash(dataset:catx(' ','bridge(where=(date=',date,'and decile=',decile,'))'), multidata:'y', ordered:'y');
h_brdg.defineKey('date','decile');
h_brdg.defineData('zipcode');
h_brdg.defineDone();
/* "copy" h_brdg to h_zipcollect that got a sequence key (iter) which allows to directly address items */
_rc=h_zipcollect.clear();
do iter=1 by 1 while(h_brdg.do_over() = 0);
_rc=h_zipcollect.add();
end;
end;
/*** draw two controls per row in dec (file 1) ***/
select_cnt=0;
/** 1. select zipcode from bridge for lookup of rows in no_dec (file 3) */
do i=1 to 99 until(select_cnt=2); /* if sufficient data to draw control from, loop will only iterate twice */
/* random selection of one of the zipcodes in hash h_zipcollect */
iter=rand('integer',1,h_zipcollect.num_items);
if h_zipcollect.find()=0 then
do;
/** 2. draw control **/
/* for the current zipcode derive lowest value for seq_no for the row that hasn't been drawn already */
if h_last_ranno.find() ne 0 then
do;
seq_no=1;
_rc=h_last_ranno.add();
end;
/* draw control record */
if h_nodec.find()=0 then
do;
/* count how many control records selected for the current record from dec */
select_cnt=sum(select_cnt,1);
output control;
/* remove selected record from hash as we won't select it again */
_rc=h_nodec.remove();
/* increase seq_no by 1 for this zipcode as prep of selection of another row for the table with controls */
seq_no=sum(seq_no,1);
_rc=h_last_ranno.replace();
end;
else
do;
/* in case all rows for a zipcode have already been drawn don't use this zipcode any further */
_rc=h_zipcollect.remove();
end;
end;
end;
if select_cnt<2 then output control_insufficient_data;
run;
/* title 'control'; */
/* proc print data=control; */
/* run; */
/* title 'Decedent with insufficient matching data to create control'; */
/* proc sql; */
/* select * */
/* from control_insufficient_data; */
/* quit; */
/* title; */
... View more