BookmarkSubscribeRSS Feed
ErikLund_Jensen
Rhodochrosite | Level 12

Hi @Anita_n 

 

This is easily solved. If the ID's are like your example with an underscore as delimiter and a suffix after that, the following code will work. The ID' variable in the subtable must be character to contain underscore_letter, and my axample is made on the assumption that ID's in the main table are character as well. The Idea is to make hashing in the subtables on the part before the underscore.

 

Note the trimming (with cats) of the argument to in all MD5 function calls, this is to avoid including an unknown number of blanks in generation of the hash value. Note also that the generation of a pseudonym name is omitted in the sub table. This is because the pseudonym name in the main table contains a counter with one added for each new ID, and the same count cannot be generated in the sub tables, if not all ID's are present.

 

* Test data;
data maintable;
  ID = '12345'; Name = 'Billy Nash'; Desease = 11; output;
  ID = '12345'; Name = 'Billy Nash'; Desease = 17; output;
  ID = '23456'; Name = 'jeff Smith'; Desease = 22; output;
  ID = '34567'; Name = 'John Doe'; Desease = 33; output;
run;

data subtable;
  ID = '12345_r'; Subtable_var = 11; output;
  ID = '12345_r'; Subtable_var = 17; output;
  ID = '23456_r'; Subtable_var = 22; output;
run;

* Create Pseudonym Maintable - Change ID to PseudoID;
* ID is converted to MD5 hash value,
* Name is converted to a defaultname + patient counter, this gives some extra coding 
    thanks to several records prt ID;
proc sort data=maintable;
  by ID Desease;
run;

data anontable (drop=ID Name pnr);
  length PseudoID $36 PseudoName $40;
  set maintable;
  by ID;
  retain pnr 0;
  if first.ID then pnr = pnr + 1;
  PseudoID = put(md5(cats(ID)),$hex32.);
  PseudoName = catx(' ', 'Patient', put(pnr,8.), 'Pseudoname');
run;


* Create Pseudonym Subtable - Change ID to PseudoID;
data anonsubtable (drop=ID MainID);
  length MainID $20 PseudoID $36;
  set subtable;
  by ID;
  MainID = scan(ID,1,'_');
  PseudoID = put(md5(cats(MainID)),$hex32.);
run;

* Restore identities on anonymized maintable;
proc sql;
  create table restore_main as
    select distinct
      maintable.ID,
      maintable.Name,
      anontable.Desease
    from anontable
    left join maintable
    on put(md5(cats(maintable.ID)),$hex32.) = anontable.PseudoID
  ;
quit;

* Restore identities on anonymized subtable;
proc sql;
  create table restore_sub as
    select distinct
      maintable.ID,
      maintable.Name,
      anonsubtable.Subtable_var
    from anonsubtable
    left join maintable
    on put(md5(cats(maintable.ID)),$hex32.) = anonsubtable.PseudoID
  ;
quit;

 

Patrick
Opal | Level 21

@Anita_n You could just remove the extraneous information prior to creating the digest value as per below sample code.

md5 creates a 128 bit (16 byte) digest value that you can express as a 32 character hex string, sha256 creates a 256 bit digest value that then needs a 64 character hex string.

data demo;
  input id $20.;

  length id_md5_hex $32. id_sha256_hex $64.;
  id_md5_hex   =hashing('md5',strip(scan(ID,1,'_')));
  id_sha256_hex=hashing('sha256',strip(scan(ID,1,'_')));

  length id_md5_hex_2 $34. id_sha256_hex_2 $68.;
  id_md5_hex_2   =catx('_',hashing('md5',strip(scan(ID,1,'_'))),scan(ID,2,'_'));
  id_sha256_hex_2=catx('_',hashing('sha256',strip(scan(ID,1,'_'))),scan(ID,2,'_'));

  datalines;
12345
12345_c
;

proc print data=demo;
  var id_md5_hex: id_sha256_hex:;
run;

Patrick_0-1704851925326.png

 

Please note that if someone figures out how you created the digest values then it's with today's compute power no more that hard to reverse the process. Below sample code to illustrate this.

data digest_lookup(compress=yes);
  length id_md5_hex $32.; 
  do i=1 to 1000000;
    id=put(i,20. -l);
    id_md5_hex =hashing('md5',strip(scan(ID,1,'_')));
    output;
  end;
  drop i;
run;

data unmasked;
  set digest_lookup;
  if _n_=1 then
    do;
      dcl hash h1(dataset:'demo');
      h1.defineKey('id_md5_hex');
      h1.defineData('id');
      h1.defineDone();
    end;
  if h1.find()=0 then output;
run;
proc print data=unmasked;
run;

Patrick_1-1704851733817.png

 

Because of the above I would at least also "scramble" the hex string. Some simple algorithm as below would already make it much much harder to just guess what you've done and use a simple generated lookup table.

proc format;
  invalue hex2dec
    '0'=0
    '1'=1
    '2'=2
    '3'=3
    '4'=4
    '5'=5
    '6'=6
    '7'=7
    '8'=8
    '9'=9
    'A'=10
    'B'=11
    'C'=12
    'D'=13
    'E'=14
    'F'=15
    ;
data demo;
  input id $20.;

  length id_md5_hex id_md5_hex_scrambled $32.;
  id_md5_hex   =hashing('md5',strip(scan(ID,1,'_')));

  l=input(substr(id_md5_hex,12,1),hex2dec.)+6;
  id_md5_hex_scrambled=cats(substr(id_md5_hex,l+1),substr(id_md5_hex,1,l));
  drop l;
  datalines;
12345
12345_c
;

proc print data=demo;
run;

Patrick_0-1704853882857.png

 



 

s_lassen
Meteorite | Level 14

Assuming that the ID variable in the main table is numeric, and the ones in the subsidiary tables are character, you could try something like this for the subsidiary tables:

data subsidiary1_anon(drop=id local_id) anonymization_id;
  set subsidiary1(rename=(id=local_id));
  id=input(scan(local_id,1,'_'),best32.);
  id_type=scan(local_id,2,'_'); /* I suppose we want to keep this */
  modify anonymization_id key=id/unique nobs=n_id;
  if _iorc_ then do; /* assuming that the ID was not found */
    n_id+1;
    anon_id=n_id;
    output  anonymization_id;
    _error_=0;
    end;
  output patient1_anon;
run;
Anita_n
Pyrite | Level 9

@s_lassen @Patrick @ErikLund_Jensen  Thanks for those examples. Combining your ideas, I think am almost getting what I want

sbxkoenk
SAS Super FREQ

I guess problem of @Anita_n is solved by now , but here's an interesting blog on the topic that was published yesterday :

 

Using the BXOR Function to Encode and Decode Text
By Ron Cody on SAS Users January 10, 2024
https://blogs.sas.com/content/sgf/2024/01/10/using-the-bxor-function-to-encode-and-decode-text/

 

Koen

SAS Innovate 2025: Save the Date

 SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!

Save the date!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

SAS Training: Just a Click Away

 Ready to level-up your skills? Choose your own adventure.

Browse our catalog!

Discussion stats
  • 19 replies
  • 3418 views
  • 4 likes
  • 10 in conversation