Solved: Re: Flagging partial duplicates across multiple criteria

walkerped · Posted 10-31-2023 06:02 PM

I need to flag duplicates in a dataset based on the following criteria: 1. cases with the same first name, last name and zipcode 2. cases with the same first name, last name and email address. Cases should be flagged by assigning a set of duplicates a newUserId with the same id as the most recent enrollment date in the set.

For example see following table:

userid	firstName	lastName	zip	emailAddress	enrollmentDate
1	John	Smith	88787	jsmith@gmail.com	2011-01-01
2	John	Smith	99888	jsmith@gmail.com	2013-01-01
3	John	Smith	99888	johnsmith@gmail.com	2014-01-01
4	John	Smith	11111	smith12@aol.com	2016-01-01

The first two rows should be flagged as duplicates, because they share firstName, lastName and zip. The 2nd and 3rd rows are also duplicates, because they share firstName, lastName and emailAddress. This means that all of the first three rows are a single set of duplicates. These three rows should each get a newUserId that should match the most recent enrollmentDate in the set, so 3.

So, the output I want is:

userid	firstName	lastName	zip	emailAddress	enrollmentDate	newUserId
1	John	Smith	88787	jsmith@gmail.com	2011-01-01	3
2	John	Smith	99888	jsmith@gmail.com	2013-01-01	3
3	John	Smith	99888	johnsmith@gmail.com	2014-01-01	3
4	John	Smith	11111	smith12@aol.com	2016-01-01	4

I know how to use first / last processing to flag duplicates on these two criteria separately, but can't figure out how to flag an entire 'duplicate cluster' with a single value.

Ksharp · Posted 11-02-2023 08:59 PM

/*
Wouldn't this question be same with this 
https://communities.sas.com/t5/SAS-Programming/How-to-create-an-ID-from-different-variables-and-OR-condition/m-p/900795#M355993
?
*/


data test;
input userid
firstName $
lastName $
zip $
emailAddress :$40.
enrollmentDate :yymmdd12.;
format enrollmentDate yymmdd10.;
cards;
1
John
Smith
88787
jsmith@gmail.com
2011-01-01
2
John
Smith
99888
jsmith@gmail.com
2013-01-01
3
John
Smith
99888
johnsmith@gmail.com
2014-01-01
4
John
Smith
11111
smith12@aol.com
2016-01-01
;


data have;
set test;
length from to $ 100;
from=catx('|',firstName,lastName,zip); 
to=catx('|',firstName,lastName,emailAddress); ;
keep from to;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;



data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

data want2;
 if _n_=1 then do;
   if 0 then set want;
   declare hash h(dataset:'want');
   h.definekey('node');
   h.definedata('household');
   h.definedone();
 end;
set test;
call missing(household);

node=catx('|',firstName,lastName,zip); 
rc=h.find();

node=catx('|',firstName,lastName,emailAddress); ;
rc=h.find();
drop rc node;
run;

proc sort data=want2;
by household enrollmentDate;
run;
data final_want;
do until(last.household);
 set want2;
 by household;
end;
newUserId=userid;
do until(last.household);
 set want2;
 by household;
 output;
end;
run;

View solution in original post

walkerped · Posted 11-01-2023 12:07 PM

I made an error in the description, my apologies. The first two rows should be flagged as duplicates because they share first name, last name and email address. The 2nd and 3rd should be flagged as duplicates because they share firstName, lastName and zip.

maguiremq · Posted 11-01-2023 12:08 PM

No, you were clear – I misread it!

JosvanderVelden · Posted 11-02-2023 06:53 AM

If I understand you correctly then what you name 'duplicate cluster' are all the duplicate observations where firstname and lastname are equal. And you want to assign all of those a newuserid equal the highest userid in the cluster. True?

s_lassen · Posted 11-02-2023 06:06 AM

Here is a possible solution - it is not so easy, as you basically can have two userid's with both zip and email address different actually being the same user (as userid 1 and 3).

First, find the two sets of links from one userid to the next:

proc sort data=have;
  by firstName lastName zip descending enrollmentDate;
run;

data link1;
  do until(last.zip);
    set have;
    by firstName lastName zip;
    if first.zip then link1=userid;
    output;
    end;
run;

proc sort data=have;
  by firstName lastName emailAddress descending enrollmentDate;
run;

data link2;
  do until(last.emailAddress);
    set have;
    by firstName lastName emailAddress;
    if first.emailAddress then link2=userid;
    output;
    end;
run;

Next, integrate the two link tables:

proc sort data=link1;
  by userid;
run;

proc sort data=link2;
  by userid;
run;

data link;
  merge link1 link2;
  by userid;
  if link1 ne userid then
    link=link1;
  else if link2 ne userid then
    link=link2;
  drop link1 link2;
run;

Finally, index the LINK table and use it to find the final link for each userid:

proc sql;
  create index userid on link(userid);
quit;

data want;
  set link;
  do while(link ne .);
    newUserid=link;
    set link(keep=link userid rename=(link=newlink userid=link)) key=link/unique;
    link=newlink;
    end;
  if newUserid=. then
    newUserid=userid;
  drop link newlink;
run;

JosvanderVelden · Posted 11-02-2023 07:53 AM

Here is another method that you may want to investigate:

data have;
   input userid firstName $ lastName $ zip emailAddress:$50. enrollmentDate:yymmdd10.;
datalines;
1 First Last 9988787 flast@provider.domain 2011-01-01
2 First Last 9999888 flast@provider.domain 2013-01-01
3 First Last 9999888 firstlast@provider.domain 2014-01-01
4 First Last 9911111 last999@provider2.domain 2016-01-01
;
run;

data _null_;
  set have end=eod;
  if _n_=1 then do;
    declare hash criteria1 ();
      criteria1.definekey('firstname', 'lastname', 'zip');
      criteria1.definedata('firstname', 'lastname', 'zip','_point');
      criteria1.definedone();
    declare hash duplicates1 (dataset:'have (obs=0)',ordered:'a',multidata:'y');
      duplicates1.definekey('firstname', 'lastname', 'zip');
      duplicates1.definedata(all:'y');
      duplicates1.definedone();
    declare hash criteria2 ();
      criteria2.definekey('firstname', 'lastname', 'emailAddress');
      criteria2.definedata('firstname', 'lastname', 'emailAddress','_point');
      criteria2.definedone();
    declare hash duplicates2 (dataset:'have (obs=0)',ordered:'a',multidata:'y');
      duplicates2.definekey('firstname', 'lastname', 'emailAddress');
      duplicates2.definedata(all:'y');
      duplicates2.definedone();
  end;

  if criteria1.find()^=0 then 
     criteria1.add(key:firstname, key:lastname, key:zip,
                   data:firstname, data:lastname, data:zip,data:_n_); 
  else do;                
    duplicates1.add();     
    if _point^=. then do; 
      set have point=_point;
      duplicates1.add();
      criteria1.replace(key:firstname, key:lastname, key:zip,
                        data:firstname, data:lastname, data:zip, data:.);
    end;
  end;

  if criteria2.find()^=0 then 
     criteria2.add(key:firstname, key:lastname, key:emailAddress,
                   data:firstname, data:lastname, data:emailAddress, data:_n_); 
  else do;                
    duplicates2.add();     
    if _point^=. then do; 
      set have point=_point;
      duplicates2.add();
      criteria2.replace(key:firstname, key:lastname, key:emailAddress,
                        data:firstname, data:lastname, data:emailAddress, data:.);
    end;
  end;

  if eod then do;
    rc=duplicates1.output(dataset:'duplicates1');
    rc=duplicates2.output(dataset:'duplicates2');
  end;
run;

data duplicates;
  set duplicates1 duplicates2;  
run;   

proc sort data=duplicates nodup;
  by firstname lastname descending userid;
run;

data duplicates;
  retain newuserid;
  set duplicates;
  if first.lastname then newuserid = userid;
  by firstname lastname descending userid;
run;

data temp;
  set have duplicates;
run;

proc sort data=temp; by userid descending newuserid;
run;

proc sort data=temp out=want nodupkey; by userid firstname lastname;
run;

data want;
  set want;
  if newuserid eq . then newuserid = userid;
run;

Ksharp · Posted 11-02-2023 08:59 PM

/*
Wouldn't this question be same with this 
https://communities.sas.com/t5/SAS-Programming/How-to-create-an-ID-from-different-variables-and-OR-condition/m-p/900795#M355993
?
*/


data test;
input userid
firstName $
lastName $
zip $
emailAddress :$40.
enrollmentDate :yymmdd12.;
format enrollmentDate yymmdd10.;
cards;
1
John
Smith
88787
jsmith@gmail.com
2011-01-01
2
John
Smith
99888
jsmith@gmail.com
2013-01-01
3
John
Smith
99888
johnsmith@gmail.com
2014-01-01
4
John
Smith
11111
smith12@aol.com
2016-01-01
;


data have;
set test;
length from to $ 100;
from=catx('|',firstName,lastName,zip); 
to=catx('|',firstName,lastName,emailAddress); ;
keep from to;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;



data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

data want2;
 if _n_=1 then do;
   if 0 then set want;
   declare hash h(dataset:'want');
   h.definekey('node');
   h.definedata('household');
   h.definedone();
 end;
set test;
call missing(household);

node=catx('|',firstName,lastName,zip); 
rc=h.find();

node=catx('|',firstName,lastName,emailAddress); ;
rc=h.find();
drop rc node;
run;

proc sort data=want2;
by household enrollmentDate;
run;
data final_want;
do until(last.household);
 set want2;
 by household;
end;
newUserId=userid;
do until(last.household);
 set want2;
 by household;
 output;
end;
run;

Patrick · Posted 11-03-2023 12:19 AM

You could create hash values over the variables of interest. Rows with the same identical sets of variable values will then also get the same hash value.

data have;
  input userid firstName $ lastName $ zip emailAddress:$50. enrollmentDate:yymmdd10.;
  format enronllmentDate date9.;
  datalines;
1 First Last 9988787 flast@provider.domain 2011-01-01
2 First Last 9999888 flast@provider.domain 2013-01-01
3 First Last 9999888 firstlast@provider.domain 2014-01-01
4 First Last 9911111 last999@provider2.domain 2016-01-01
;

data want;
  set have;
  length case_1 case_2 $32.;
  case_1=hashing('md5',catx('|',firstName,lastName,zip));
  case_2=hashing('md5',catx('|',firstName,lastName,emailAddress));
run;

/* demo: dups for case 1 */
proc sql;
  select *, count(*) as n_dups
  from want
  group by case_1
  having count(*)>1
  ;
quit;

Classroom Training Available!