@JKHess I did initially an edit but then reverted back.
The private message you sent:
Hi Patrick, I just tried to respond to your most recent response to my post but it looks like it's been closed. I separated the files by year, re-ran the deciles, etc, then ran the code you provided. It ran fine with the smaller samples (10k records), but when I used the entire file, i got an error "Array subscript out of range at line 417 column 7", which is referring to this line in the code: if a_itemcum[midpt-1] >= ran_val then hbound=midpt-1; Any idea what might be causing this error? thank you..
The error was due to a wrong assignment of the upper array boundary. It's corrected in below code. Instead of hbound=a_itemcum_n_elements; it was hbound=a_itemcum[a_itemcum_n_elements];
The binary search algorithm itself is based on the excellent paper Array Lookup Techniques by @hashman
/*************** create sample data ************************/
/* file 1 */
data dec;
input ID Date :mmddyy10. Decile;
format Date mmddyy10.;
datalines;
1 1/1/2017 1
22 1/1/2017 1
41 1/1/2017 1
56 1/1/2017 2
79 1/1/2017 2
85 1/1/2017 2
100 1/2/2017 1
118 1/2/2017 1
125 1/2/2017 2
167 1/2/2017 2
178 1/2/2017 3
;
run;
/* file 2 - not really a bridge because relationship bridge:no_dec is many:many */
data bridge;
input Date :mmddyy10. Decile Zipcode $5.;
format Date mmddyy10.;
datalines;
1/1/2017 1 88123
1/1/2017 1 03867
1/1/2017 1 04001
1/1/2017 2 03304
1/1/2017 2 98765
1/1/2017 2 96224
1/1/2017 2 00001
1/2/2017 1 98801
1/2/2017 2 88123
1/2/2017 2 12345
1/2/2017 2 83356
1/2/2017 2 98765
1/2/2017 3 03304
1/2/2017 3 04945
;
run;
/* file 3 */
data no_dec;
input ID Zipcode $5.;
datalines;
2 88123
21 88123
22 88123
23 88123
24 88123
3 12345
4 03304
5 03867
6 04945
7 04001
8 98765
9 98801
10 96224
11 00001
12 83356
13 83356
;
run;
/************ data prep *************************************/
/* assign a random value to each entry in no_dec (file 3) and output as table no_dec_ranno sorted by zipcode and random value */
data _null_;
dcl hash h1(ordered:'y', multidata:'y');
h1.defineKey('Zipcode','ran_no');
h1.defineData('id','zipcode','ran_no');
h1.defineDone();
call streaminit(10);
do until(_last);
set no_dec end=_last;
ran_no=rand('uniform');
_rc=h1.add();
end;
_rc=h1.output(dataset:'no_dec_ranno');
stop;
run;
/************ draw control *************************************/
data control(keep=id_dec id zipcode date Decile select_cnt)
control_insufficient_data(keep=id_dec id zipcode date Decile select_cnt)
;
length id_dec 8;
if _n_=1 then
do;
call streaminit(10);
/* define hash to collect number of rows (items) per zipcode */
/* - used for weighted random selection of zipcode from which to draw control from */
n_items=0;
dcl hash h_nodec_nperzip();
h_nodec_nperzip.defineKey('zipcode');
h_nodec_nperzip.defineData('n_items');
h_nodec_nperzip.defineDone();
/* load no_dec_ranno into hash replacing the random values by a sequence number (by zipcode) */
/* - the order is still random but the sequence number instead of a random value will allow to address specific items later on */
/* - memory consumption of this hash is around 88bytes * number of items plus some overhead. For 34.6M rows close to 3GB */
dcl hash h_nodec(ordered:'y');
h_nodec.defineKey('Zipcode','seq_no');
h_nodec.defineData('id','zipcode','seq_no');
h_nodec.defineDone();
do until(_last);
set no_dec_ranno(drop=ran_no) end=_last;
by Zipcode;
/* populate hash h_nodec */
if first.zipcode then seq_no=1;
else seq_no+1;
_rc=h_nodec.add();
/* populate hash h_nodec_nperzip */
n_items=sum(n_items,1);
if last.zipcode then
do;
_rc=h_nodec_nperzip.add();
n_items=0;
end;
end;
/* load the bridge data into a hash */
dcl hash h_brdg(dataset:'bridge', multidata:'y', ordered:'y');
h_brdg.defineKey('date','decile');
h_brdg.defineData('zipcode');
h_brdg.defineDone();
/* hash to store per zipcode the last sequence number used to populate the table with control data */
/* - to ensure a record gets only drawn once */
dcl hash h_last_ranno();
h_last_ranno.defineKey('zipcode');
h_last_ranno.defineData('seq_no');
h_last_ranno.defineDone();
/* arrays to store zipcode and cumulative sum of number of items under a zipcode */
array a_zipcode{50000} $5 _temporary_;
array a_itemcum{0:50000} 8 _temporary_;
a_itemcum[0]=0;
end;
call missing(of _all_);
set dec(rename=(id=id_dec));
/*** draw two controls for case ***/
/** 1. select zipcode from bridge for lookup of rows in no_dec (file 3) */
/* load all zipcodes from the bridge into hash h_zipcollect that match with the current row from dec (file 1) */
_rc=h_brdg.reset_dup();
do _i=1 by 1 while(h_brdg.do_over() = 0);
_rc=h_nodec_nperzip.find()=0;
a_zipcode[_i]=zipcode;
a_itemcum[_i]=sum(a_itemcum[_i-1],n_items);
end;
a_itemcum_n_elements=sum(_i,-1);
select_cnt=0;
do _i=1 to 99 until(select_cnt=2); /* if sufficient data to draw control from, loop will only iterate twice */
/** random selection of one of the matching zipcodes from array a_zipcode, weighted by number of items per zipcode **/
/* create random integer in the range of 1 to n zipcodes to choose from */
ran_val=rand('integer',1,a_itemcum[a_itemcum_n_elements]);
/* binary search through array a_itemcum to find the element that stores the higher boundary */
/* - when found use the index of this element to derive the zipcode from which to draw control record */
lbound=1;
hbound=a_itemcum_n_elements;
do while(lbound <= hbound);
midpt=floor(sum(lbound,hbound)/2);
if a_itemcum[midpt-1] >= ran_val then hbound=midpt-1;
else
if a_itemcum[midpt] < ran_val then lbound=midpt+1;
else
/* if a_itemcum[midpt-1] < ran_val <= a_itemcum[midpt] then */
do;
zipcode=a_zipcode[midpt];
leave;
end;
end;
/** 2. draw control from population under selected zip code **/
/* for the chosen zipcode derive the row with the lowest seq_no that hasn't been drawn previously */
if h_last_ranno.find() ne 0 then
do;
seq_no=1;
_rc=h_last_ranno.add();
end;
/* draw control record */
if h_nodec.find()=0 then
do;
/* count how many control records selected for the current record from dec */
select_cnt=sum(select_cnt,1);
output control;
/* remove selected record from hash as we won't select it again */
_rc=h_nodec.remove();
/* increase seq_no by 1 for this zipcode as prep of selection of another row for the table with controls */
seq_no=sum(seq_no,1);
_rc=h_last_ranno.replace();
end;
end;
if select_cnt<2 then output control_insufficient_data;
run;
/* title 'control'; */
/* proc print data=control; */
/* run; */
/* title 'Decedent with insufficient matching data to create control'; */
/* proc sql; */
/* select * */
/* from control_insufficient_data; */
/* quit; */
/* title; */
Going forward I suggest you create a new question once you've accepted a response as solution. Just mention and link the previous discussion in the new follow-up question. Not only will this help to not "overload" discussions, it will also increase the likelihood for more people looking into your new question.
In regards of the logic used to draw the control just some more thoughts for your consideration if relevant at all. I would assume that compared to your control population (file 1) your deceased population (file 3) has a higher average age and percentage of members living under an urban postcode. This not the least because of better availability of medical facilities. You could consider to also add age-group information to your file1 and file3 to further segment from which population to draw the control from. And for rural/urban: If the distributions between file1 and file3 significantly differ then you might also want to add this info to your data to further subset the control population to draw from.
....and then there is of course the change of zipcodes. I would imagine that a change to work with actual date ranges that doesn't impact too much on performance could be hard, a change to yearly snapshots of data would be rather simple.
@JKHess Update: I further tested the binary search logic (makes my brain hurt!). I believe I've got it right now.
... View more