A requirement of maintaining the anonymizing ID across steps will require storing the active hash in a secure permanent table between uses. The OUTPUT method saves a hash. The hash data can be reloaded by using the dataset: argument tag in the hash constructor. Adding rows to a master data set is often best done with Proc APPEND
Example:
Mask in this discussion means anonymized.
* Separate permanent libraries for different data set roles;
libname SECURE 'C:\Perm\LimitAccessFolder\HealthData'; * unmasked data library;
libname WIDER 'C:\Perm\WiderAccessFolder\HealthData'; * masked data library, for analytics and reporting;
* Macro for simulating unmasked data;
%macro simulate_data_feed(data,seed=0,N=25000,M=90000,Z=1);
data &DATA;
call streaminit(&seed);
do _n_ = 1 to &N;
patid = rand('integer',&Z,&M);
output;
end;
run;
%mend;
* Macro for applying anonymized identifiers. Loaded priors, adds new as need and saves when done.
%macro seq_mask(data,var=patid,maskvar=patnum,maskdata=SECURE.maskhash,out=masked_data);
%local error memname libname;
%let error = 0;
%let maskdata = %upcase(&maskdata);
%let libname = %scan(&maskdata,2,.);
%let memname = %scan(&maskdata,2,.);
%if %length(&memname) = 0 %then %do;
%put ERROR: &maskdata disallowed, WORK/USER library presumed insecure;
%ABORT CANCEL;
%end;
%if &libname = WORK or &libname = USER %then %do;
%put ERROR: &maskdata disallowed, WORK/USER library presumed insecure;
%ABORT CANCEL;
%end;
data &out;
set &data end=LASTROW;
declare hash ALTIDS;
if _n_ = 1 then do;
if exist("&maskdata", 'DATA')
then ALTIDS = _new_ hash (dataset:"&maskdata");
else ALTIDS = _new_ Hash ();
ALTIDS.defineKey("&var");
ALTIDS.defineData("&var");
ALTIDS.defineData("&maskvar");
ALTIDS.defineDone();
end;
if ALTIDS.find() ne 0 then do;
if ALTIDS.NUM_ITEMS = 90000 then do;
put 'ERROR: No more slots for masking as 10000:99999';
call symput('error', '1');
stop;
end;
_new_count + 1;
&maskvar = 10000 + ALTIDS.NUM_ITEMS; * random alt id is a maintained sequence number;
ALTIDS.add();
end;
label &maskvar = "&var masked.";
if LASTROW then do;
put 'NOTE: ' _new_count 'new mask ids';
ALTIDS.output(dataset:"&maskdata");
end;
drop _:;
run;
%if &error %then %ABORT CANCEL;
%mend;
* Use macros to mask the ids in some source data and append to a master;
options mprint;
* reset master new state;
* proc delete data=WIDER.health_data;
* proc delete data=SECURE.maskhash;
* run;
%simulate_data_feed(SECURE.day1, seed=20200101)
%simulate_data_feed(SECURE.day2, seed=20200102, N=100000)
%simulate_data_feed(SECURE.day3, seed=20200103)
%simulate_data_feed(SECURE.day4, seed=20200104)
%simulate_data_feed(SECURE.day5, seed=20200105)
%simulate_data_feed(SECURE.day6, seed=20200106)
%simulate_data_feed(SECURE.day7, seed=20200107, N=50000, M=250000, Z=100000)
%seq_mask(SECURE.day1, out=WORK.masked1)
proc append base=WIDER.health_data force data=WORK.masked1; run;
%seq_mask(SECURE.day2, out=WORK.masked_today)
proc append base=WIDER.health_data force data=WORK.masked_today; run;
%seq_mask(SECURE.day3, out=WORK.stage1)
proc append base=WIDER.health_data data=WORK.stage1; run;
%seq_mask(SECURE.day4, out=WORK.feed_special)
proc append base=WIDER.health_data data=WORK.feed_special; run;
%seq_mask(SECURE.day5, out=WORK.week1_day5)
proc append base=WIDER.health_data data=WORK.week1_day5; run;
%seq_mask(SECURE.day6, out=WORK.week1_day6)
proc append base=WIDER.health_data data=WORK.week1_day6; run;
%seq_mask(SECURE.day7, out=WORK.week1_day7)
proc append base=WIDER.health_data data=WORK.week1_day7; run;
NOTE: Be careful, if the partial data (masked data sets that are to be appended to a master masked data set) does not have a unique key, you will be able to accidentally append the same data more than once; not good for reporting and analysis. If the data has a natural key (such as visit_guid, or claim_number+claim_line) be sure to index uniquely the master data set. A unique index will prevent accidental re-appends.
A production level system would also log macro use, maintain raw data, have throughput metrics, have a framework for undoing/rollback, etc...
... View more