Re: Pseudonymization of sensitive data

ErikLund_Jensen · Posted 01-09-2024 03:19 PM

This is easily solved. If the ID's are like your example with an underscore as delimiter and a suffix after that, the following code will work. The ID' variable in the subtable must be character to contain underscore_letter, and my axample is made on the assumption that ID's in the main table are character as well. The Idea is to make hashing in the subtables on the part before the underscore.

Note the trimming (with cats) of the argument to in all MD5 function calls, this is to avoid including an unknown number of blanks in generation of the hash value. Note also that the generation of a pseudonym name is omitted in the sub table. This is because the pseudonym name in the main table contains a counter with one added for each new ID, and the same count cannot be generated in the sub tables, if not all ID's are present.

* Test data;
data maintable;
  ID = '12345'; Name = 'Billy Nash'; Desease = 11; output;
  ID = '12345'; Name = 'Billy Nash'; Desease = 17; output;
  ID = '23456'; Name = 'jeff Smith'; Desease = 22; output;
  ID = '34567'; Name = 'John Doe'; Desease = 33; output;
run;

data subtable;
  ID = '12345_r'; Subtable_var = 11; output;
  ID = '12345_r'; Subtable_var = 17; output;
  ID = '23456_r'; Subtable_var = 22; output;
run;

* Create Pseudonym Maintable - Change ID to PseudoID;
* ID is converted to MD5 hash value,
* Name is converted to a defaultname + patient counter, this gives some extra coding 
    thanks to several records prt ID;
proc sort data=maintable;
  by ID Desease;
run;

data anontable (drop=ID Name pnr);
  length PseudoID $36 PseudoName $40;
  set maintable;
  by ID;
  retain pnr 0;
  if first.ID then pnr = pnr + 1;
  PseudoID = put(md5(cats(ID)),$hex32.);
  PseudoName = catx(' ', 'Patient', put(pnr,8.), 'Pseudoname');
run;


* Create Pseudonym Subtable - Change ID to PseudoID;
data anonsubtable (drop=ID MainID);
  length MainID $20 PseudoID $36;
  set subtable;
  by ID;
  MainID = scan(ID,1,'_');
  PseudoID = put(md5(cats(MainID)),$hex32.);
run;

* Restore identities on anonymized maintable;
proc sql;
  create table restore_main as
    select distinct
      maintable.ID,
      maintable.Name,
      anontable.Desease
    from anontable
    left join maintable
    on put(md5(cats(maintable.ID)),$hex32.) = anontable.PseudoID
  ;
quit;

* Restore identities on anonymized subtable;
proc sql;
  create table restore_sub as
    select distinct
      maintable.ID,
      maintable.Name,
      anonsubtable.Subtable_var
    from anonsubtable
    left join maintable
    on put(md5(cats(maintable.ID)),$hex32.) = anonsubtable.PseudoID
  ;
quit;

Patrick · Posted 01-09-2024 08:56 PM

@Anita_n You could just remove the extraneous information prior to creating the digest value as per below sample code.

md5 creates a 128 bit (16 byte) digest value that you can express as a 32 character hex string, sha256 creates a 256 bit digest value that then needs a 64 character hex string.

data demo;
  input id $20.;

  length id_md5_hex $32. id_sha256_hex $64.;
  id_md5_hex   =hashing('md5',strip(scan(ID,1,'_')));
  id_sha256_hex=hashing('sha256',strip(scan(ID,1,'_')));

  length id_md5_hex_2 $34. id_sha256_hex_2 $68.;
  id_md5_hex_2   =catx('_',hashing('md5',strip(scan(ID,1,'_'))),scan(ID,2,'_'));
  id_sha256_hex_2=catx('_',hashing('sha256',strip(scan(ID,1,'_'))),scan(ID,2,'_'));

  datalines;
12345
12345_c
;

proc print data=demo;
  var id_md5_hex: id_sha256_hex:;
run;

Please note that if someone figures out how you created the digest values then it's with today's compute power no more that hard to reverse the process. Below sample code to illustrate this.

data digest_lookup(compress=yes);
  length id_md5_hex $32.; 
  do i=1 to 1000000;
    id=put(i,20. -l);
    id_md5_hex =hashing('md5',strip(scan(ID,1,'_')));
    output;
  end;
  drop i;
run;

data unmasked;
  set digest_lookup;
  if _n_=1 then
    do;
      dcl hash h1(dataset:'demo');
      h1.defineKey('id_md5_hex');
      h1.defineData('id');
      h1.defineDone();
    end;
  if h1.find()=0 then output;
run;
proc print data=unmasked;
run;

Because of the above I would at least also "scramble" the hex string. Some simple algorithm as below would already make it much much harder to just guess what you've done and use a simple generated lookup table.

proc format;
  invalue hex2dec
    '0'=0
    '1'=1
    '2'=2
    '3'=3
    '4'=4
    '5'=5
    '6'=6
    '7'=7
    '8'=8
    '9'=9
    'A'=10
    'B'=11
    'C'=12
    'D'=13
    'E'=14
    'F'=15
    ;
data demo;
  input id $20.;

  length id_md5_hex id_md5_hex_scrambled $32.;
  id_md5_hex   =hashing('md5',strip(scan(ID,1,'_')));

  l=input(substr(id_md5_hex,12,1),hex2dec.)+6;
  id_md5_hex_scrambled=cats(substr(id_md5_hex,l+1),substr(id_md5_hex,1,l));
  drop l;
  datalines;
12345
12345_c
;

proc print data=demo;
run;

s_lassen · Posted 01-10-2024 04:52 AM

Assuming that the ID variable in the main table is numeric, and the ones in the subsidiary tables are character, you could try something like this for the subsidiary tables:

data subsidiary1_anon(drop=id local_id) anonymization_id;
  set subsidiary1(rename=(id=local_id));
  id=input(scan(local_id,1,'_'),best32.);
  id_type=scan(local_id,2,'_'); /* I suppose we want to keep this */
  modify anonymization_id key=id/unique nobs=n_id;
  if _iorc_ then do; /* assuming that the ID was not found */
    n_id+1;
    anon_id=n_id;
    output  anonymization_id;
    _error_=0;
    end;
  output patient1_anon;
run;