<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Flagging partial duplicates across multiple criteria in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901219#M356163</link>
    <description>&lt;P&gt;Here is another method that you may want to investigate:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data have;
   input userid firstName $ lastName $ zip emailAddress:$50. enrollmentDate:yymmdd10.;
datalines;
1 First Last 9988787 flast@provider.domain 2011-01-01
2 First Last 9999888 flast@provider.domain 2013-01-01
3 First Last 9999888 firstlast@provider.domain 2014-01-01
4 First Last 9911111 last999@provider2.domain 2016-01-01
;
run;

data _null_;
  set have end=eod;
  if _n_=1 then do;
    declare hash criteria1 ();
      criteria1.definekey('firstname', 'lastname', 'zip');
      criteria1.definedata('firstname', 'lastname', 'zip','_point');
      criteria1.definedone();
    declare hash duplicates1 (dataset:'have (obs=0)',ordered:'a',multidata:'y');
      duplicates1.definekey('firstname', 'lastname', 'zip');
      duplicates1.definedata(all:'y');
      duplicates1.definedone();
    declare hash criteria2 ();
      criteria2.definekey('firstname', 'lastname', 'emailAddress');
      criteria2.definedata('firstname', 'lastname', 'emailAddress','_point');
      criteria2.definedone();
    declare hash duplicates2 (dataset:'have (obs=0)',ordered:'a',multidata:'y');
      duplicates2.definekey('firstname', 'lastname', 'emailAddress');
      duplicates2.definedata(all:'y');
      duplicates2.definedone();
  end;

  if criteria1.find()^=0 then 
     criteria1.add(key:firstname, key:lastname, key:zip,
                   data:firstname, data:lastname, data:zip,data:_n_); 
  else do;                
    duplicates1.add();     
    if _point^=. then do; 
      set have point=_point;
      duplicates1.add();
      criteria1.replace(key:firstname, key:lastname, key:zip,
                        data:firstname, data:lastname, data:zip, data:.);
    end;
  end;

  if criteria2.find()^=0 then 
     criteria2.add(key:firstname, key:lastname, key:emailAddress,
                   data:firstname, data:lastname, data:emailAddress, data:_n_); 
  else do;                
    duplicates2.add();     
    if _point^=. then do; 
      set have point=_point;
      duplicates2.add();
      criteria2.replace(key:firstname, key:lastname, key:emailAddress,
                        data:firstname, data:lastname, data:emailAddress, data:.);
    end;
  end;

  if eod then do;
    rc=duplicates1.output(dataset:'duplicates1');
    rc=duplicates2.output(dataset:'duplicates2');
  end;
run;

data duplicates;
  set duplicates1 duplicates2;  
run;   

proc sort data=duplicates nodup;
  by firstname lastname descending userid;
run;

data duplicates;
  retain newuserid;
  set duplicates;
  if first.lastname then newuserid = userid;
  by firstname lastname descending userid;
run;

data temp;
  set have duplicates;
run;

proc sort data=temp; by userid descending newuserid;
run;

proc sort data=temp out=want nodupkey; by userid firstname lastname;
run;

data want;
  set want;
  if newuserid eq . then newuserid = userid;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
    <pubDate>Thu, 02 Nov 2023 11:53:59 GMT</pubDate>
    <dc:creator>JosvanderVelden</dc:creator>
    <dc:date>2023-11-02T11:53:59Z</dc:date>
    <item>
      <title>Flagging partial duplicates across multiple criteria</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/900978#M356078</link>
      <description>&lt;P&gt;I need to flag duplicates in a dataset based on the following criteria: 1. cases with the same first name, last name and zipcode 2. cases with the same first name, last name and email address.&amp;nbsp; Cases should be flagged by assigning a set of duplicates a newUserId with the same id as the most recent enrollment date in the set.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For example see following table:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;TABLE border="1" cellspacing="0" cellpadding="0"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;userid&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;firstName&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;lastName&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;zip&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;emailAddress&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;enrollmentDate&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;1&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;John&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;Smith&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;88787&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;jsmith@gmail.com&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;2011-01-01&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;2&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;John&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;Smith&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;99888&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;jsmith@gmail.com&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;2013-01-01&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;3&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;John&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;Smith&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;99888&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;johnsmith@gmail.com&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;2014-01-01&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;4&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;John&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;Smith&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;11111&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;smith12@aol.com&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;2016-01-01&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;/DIV&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The first two rows should be flagged as duplicates, because they share firstName, lastName and zip.&amp;nbsp; The 2nd and 3rd rows are also duplicates, because they share firstName, lastName and emailAddress.&amp;nbsp; This means that all of the first three rows are a single set of duplicates. These three rows should each get a newUserId that should match the most recent enrollmentDate in the set, so 3.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;So, the output I want is:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;TABLE border="1" cellspacing="0" cellpadding="0"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;userid&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;firstName&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;lastName&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;zip&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;emailAddress&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;enrollmentDate&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;newUserId&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;1&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;John&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;Smith&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;88787&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;jsmith@gmail.com&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;2011-01-01&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;3&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;2&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;John&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;Smith&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;99888&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;jsmith@gmail.com&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;2013-01-01&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;3&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;3&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;John&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;Smith&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;99888&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;johnsmith@gmail.com&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;2014-01-01&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;3&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;&lt;P&gt;4&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;John&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;Smith&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;11111&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;smith12@aol.com&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;2016-01-01&lt;/P&gt;&lt;/TD&gt;&lt;TD&gt;&lt;P&gt;4&lt;/P&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I know how to use first / last processing to flag duplicates on these two criteria separately, but can't figure out how to flag an entire 'duplicate cluster' with a single value.&lt;/P&gt;&lt;/DIV&gt;</description>
      <pubDate>Tue, 31 Oct 2023 22:02:35 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/900978#M356078</guid>
      <dc:creator>walkerped</dc:creator>
      <dc:date>2023-10-31T22:02:35Z</dc:date>
    </item>
    <item>
      <title>Re: Flagging partial duplicates across multiple criteria</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901095#M356114</link>
      <description>&lt;P&gt;I made an error in the description, my apologies.&amp;nbsp; The first two rows should be flagged as duplicates because they share&amp;nbsp;&lt;SPAN&gt;first name, last name and email address. The 2nd and 3rd should be flagged as duplicates because they share&amp;nbsp;&amp;nbsp;firstName, lastName and zip.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 01 Nov 2023 16:07:45 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901095#M356114</guid>
      <dc:creator>walkerped</dc:creator>
      <dc:date>2023-11-01T16:07:45Z</dc:date>
    </item>
    <item>
      <title>Re: Flagging partial duplicates across multiple criteria</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901096#M356115</link>
      <description>No, you were clear – I misread it!&lt;BR /&gt;</description>
      <pubDate>Wed, 01 Nov 2023 16:08:54 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901096#M356115</guid>
      <dc:creator>maguiremq</dc:creator>
      <dc:date>2023-11-01T16:08:54Z</dc:date>
    </item>
    <item>
      <title>Re: Flagging partial duplicates across multiple criteria</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901207#M356155</link>
      <description>&lt;P&gt;Here is a possible solution - it is not so easy, as you basically can have two userid's with both zip and email address different actually being the same user (as userid 1 and 3).&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;First, find the two sets of links from one userid to the next:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sort data=have;
  by firstName lastName zip descending enrollmentDate;
run;

data link1;
  do until(last.zip);
    set have;
    by firstName lastName zip;
    if first.zip then link1=userid;
    output;
    end;
run;

proc sort data=have;
  by firstName lastName emailAddress descending enrollmentDate;
run;

data link2;
  do until(last.emailAddress);
    set have;
    by firstName lastName emailAddress;
    if first.emailAddress then link2=userid;
    output;
    end;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;Next, integrate the two link tables:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sort data=link1;
  by userid;
run;

proc sort data=link2;
  by userid;
run;

data link;
  merge link1 link2;
  by userid;
  if link1 ne userid then
    link=link1;
  else if link2 ne userid then
    link=link2;
  drop link1 link2;
run;
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;Finally, index the LINK table and use it to find the final link for each userid:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql;
  create index userid on link(userid);
quit;

data want;
  set link;
  do while(link ne .);
    newUserid=link;
    set link(keep=link userid rename=(link=newlink userid=link)) key=link/unique;
    link=newlink;
    end;
  if newUserid=. then
    newUserid=userid;
  drop link newlink;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 02 Nov 2023 10:06:10 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901207#M356155</guid>
      <dc:creator>s_lassen</dc:creator>
      <dc:date>2023-11-02T10:06:10Z</dc:date>
    </item>
    <item>
      <title>Re: Flagging partial duplicates across multiple criteria</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901213#M356158</link>
      <description>If I understand you correctly then what you name 'duplicate cluster' are all the duplicate observations where firstname and lastname are equal. And you want to assign all of those a newuserid equal the highest userid in the cluster. True?</description>
      <pubDate>Thu, 02 Nov 2023 10:53:50 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901213#M356158</guid>
      <dc:creator>JosvanderVelden</dc:creator>
      <dc:date>2023-11-02T10:53:50Z</dc:date>
    </item>
    <item>
      <title>Re: Flagging partial duplicates across multiple criteria</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901219#M356163</link>
      <description>&lt;P&gt;Here is another method that you may want to investigate:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data have;
   input userid firstName $ lastName $ zip emailAddress:$50. enrollmentDate:yymmdd10.;
datalines;
1 First Last 9988787 flast@provider.domain 2011-01-01
2 First Last 9999888 flast@provider.domain 2013-01-01
3 First Last 9999888 firstlast@provider.domain 2014-01-01
4 First Last 9911111 last999@provider2.domain 2016-01-01
;
run;

data _null_;
  set have end=eod;
  if _n_=1 then do;
    declare hash criteria1 ();
      criteria1.definekey('firstname', 'lastname', 'zip');
      criteria1.definedata('firstname', 'lastname', 'zip','_point');
      criteria1.definedone();
    declare hash duplicates1 (dataset:'have (obs=0)',ordered:'a',multidata:'y');
      duplicates1.definekey('firstname', 'lastname', 'zip');
      duplicates1.definedata(all:'y');
      duplicates1.definedone();
    declare hash criteria2 ();
      criteria2.definekey('firstname', 'lastname', 'emailAddress');
      criteria2.definedata('firstname', 'lastname', 'emailAddress','_point');
      criteria2.definedone();
    declare hash duplicates2 (dataset:'have (obs=0)',ordered:'a',multidata:'y');
      duplicates2.definekey('firstname', 'lastname', 'emailAddress');
      duplicates2.definedata(all:'y');
      duplicates2.definedone();
  end;

  if criteria1.find()^=0 then 
     criteria1.add(key:firstname, key:lastname, key:zip,
                   data:firstname, data:lastname, data:zip,data:_n_); 
  else do;                
    duplicates1.add();     
    if _point^=. then do; 
      set have point=_point;
      duplicates1.add();
      criteria1.replace(key:firstname, key:lastname, key:zip,
                        data:firstname, data:lastname, data:zip, data:.);
    end;
  end;

  if criteria2.find()^=0 then 
     criteria2.add(key:firstname, key:lastname, key:emailAddress,
                   data:firstname, data:lastname, data:emailAddress, data:_n_); 
  else do;                
    duplicates2.add();     
    if _point^=. then do; 
      set have point=_point;
      duplicates2.add();
      criteria2.replace(key:firstname, key:lastname, key:emailAddress,
                        data:firstname, data:lastname, data:emailAddress, data:.);
    end;
  end;

  if eod then do;
    rc=duplicates1.output(dataset:'duplicates1');
    rc=duplicates2.output(dataset:'duplicates2');
  end;
run;

data duplicates;
  set duplicates1 duplicates2;  
run;   

proc sort data=duplicates nodup;
  by firstname lastname descending userid;
run;

data duplicates;
  retain newuserid;
  set duplicates;
  if first.lastname then newuserid = userid;
  by firstname lastname descending userid;
run;

data temp;
  set have duplicates;
run;

proc sort data=temp; by userid descending newuserid;
run;

proc sort data=temp out=want nodupkey; by userid firstname lastname;
run;

data want;
  set want;
  if newuserid eq . then newuserid = userid;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 02 Nov 2023 11:53:59 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901219#M356163</guid>
      <dc:creator>JosvanderVelden</dc:creator>
      <dc:date>2023-11-02T11:53:59Z</dc:date>
    </item>
    <item>
      <title>Re: Flagging partial duplicates across multiple criteria</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901320#M356206</link>
      <description>&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*
Wouldn't this question be same with this 
https://communities.sas.com/t5/SAS-Programming/How-to-create-an-ID-from-different-variables-and-OR-condition/m-p/900795#M355993
?
*/


data test;
input userid
firstName $
lastName $
zip $
emailAddress :$40.
enrollmentDate :yymmdd12.;
format enrollmentDate yymmdd10.;
cards;
1
John
Smith
88787
jsmith@gmail.com
2011-01-01
2
John
Smith
99888
jsmith@gmail.com
2013-01-01
3
John
Smith
99888
johnsmith@gmail.com
2014-01-01
4
John
Smith
11111
smith12@aol.com
2016-01-01
;


data have;
set test;
length from to $ 100;
from=catx('|',firstName,lastName,zip); 
to=catx('|',firstName,lastName,emailAddress); ;
keep from to;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;



data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

data want2;
 if _n_=1 then do;
   if 0 then set want;
   declare hash h(dataset:'want');
   h.definekey('node');
   h.definedata('household');
   h.definedone();
 end;
set test;
call missing(household);

node=catx('|',firstName,lastName,zip); 
rc=h.find();

node=catx('|',firstName,lastName,emailAddress); ;
rc=h.find();
drop rc node;
run;

proc sort data=want2;
by household enrollmentDate;
run;
data final_want;
do until(last.household);
 set want2;
 by household;
end;
newUserId=userid;
do until(last.household);
 set want2;
 by household;
 output;
end;
run;
&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Fri, 03 Nov 2023 00:59:03 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901320#M356206</guid>
      <dc:creator>Ksharp</dc:creator>
      <dc:date>2023-11-03T00:59:03Z</dc:date>
    </item>
    <item>
      <title>Re: Flagging partial duplicates across multiple criteria</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901337#M356218</link>
      <description>&lt;P&gt;You could create hash values over the variables of interest. Rows with the same identical sets of variable values will then also get the same hash value.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data have;
  input userid firstName $ lastName $ zip emailAddress:$50. enrollmentDate:yymmdd10.;
  format enronllmentDate date9.;
  datalines;
1 First Last 9988787 flast@provider.domain 2011-01-01
2 First Last 9999888 flast@provider.domain 2013-01-01
3 First Last 9999888 firstlast@provider.domain 2014-01-01
4 First Last 9911111 last999@provider2.domain 2016-01-01
;

data want;
  set have;
  length case_1 case_2 $32.;
  case_1=hashing('md5',catx('|',firstName,lastName,zip));
  case_2=hashing('md5',catx('|',firstName,lastName,emailAddress));
run;

/* demo: dups for case 1 */
proc sql;
  select *, count(*) as n_dups
  from want
  group by case_1
  having count(*)&amp;gt;1
  ;
quit;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Fri, 03 Nov 2023 04:19:06 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Flagging-partial-duplicates-across-multiple-criteria/m-p/901337#M356218</guid>
      <dc:creator>Patrick</dc:creator>
      <dc:date>2023-11-03T04:19:06Z</dc:date>
    </item>
  </channel>
</rss>

