Here is another method that you may want to investigate:
data have;
input userid firstName $ lastName $ zip emailAddress:$50. enrollmentDate:yymmdd10.;
datalines;
1 First Last 9988787 flast@provider.domain 2011-01-01
2 First Last 9999888 flast@provider.domain 2013-01-01
3 First Last 9999888 firstlast@provider.domain 2014-01-01
4 First Last 9911111 last999@provider2.domain 2016-01-01
;
run;
data _null_;
set have end=eod;
if _n_=1 then do;
declare hash criteria1 ();
criteria1.definekey('firstname', 'lastname', 'zip');
criteria1.definedata('firstname', 'lastname', 'zip','_point');
criteria1.definedone();
declare hash duplicates1 (dataset:'have (obs=0)',ordered:'a',multidata:'y');
duplicates1.definekey('firstname', 'lastname', 'zip');
duplicates1.definedata(all:'y');
duplicates1.definedone();
declare hash criteria2 ();
criteria2.definekey('firstname', 'lastname', 'emailAddress');
criteria2.definedata('firstname', 'lastname', 'emailAddress','_point');
criteria2.definedone();
declare hash duplicates2 (dataset:'have (obs=0)',ordered:'a',multidata:'y');
duplicates2.definekey('firstname', 'lastname', 'emailAddress');
duplicates2.definedata(all:'y');
duplicates2.definedone();
end;
if criteria1.find()^=0 then
criteria1.add(key:firstname, key:lastname, key:zip,
data:firstname, data:lastname, data:zip,data:_n_);
else do;
duplicates1.add();
if _point^=. then do;
set have point=_point;
duplicates1.add();
criteria1.replace(key:firstname, key:lastname, key:zip,
data:firstname, data:lastname, data:zip, data:.);
end;
end;
if criteria2.find()^=0 then
criteria2.add(key:firstname, key:lastname, key:emailAddress,
data:firstname, data:lastname, data:emailAddress, data:_n_);
else do;
duplicates2.add();
if _point^=. then do;
set have point=_point;
duplicates2.add();
criteria2.replace(key:firstname, key:lastname, key:emailAddress,
data:firstname, data:lastname, data:emailAddress, data:.);
end;
end;
if eod then do;
rc=duplicates1.output(dataset:'duplicates1');
rc=duplicates2.output(dataset:'duplicates2');
end;
run;
data duplicates;
set duplicates1 duplicates2;
run;
proc sort data=duplicates nodup;
by firstname lastname descending userid;
run;
data duplicates;
retain newuserid;
set duplicates;
if first.lastname then newuserid = userid;
by firstname lastname descending userid;
run;
data temp;
set have duplicates;
run;
proc sort data=temp; by userid descending newuserid;
run;
proc sort data=temp out=want nodupkey; by userid firstname lastname;
run;
data want;
set want;
if newuserid eq . then newuserid = userid;
run;
... View more