<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Finding and removing duplicates in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616315#M180404</link>
    <description>&lt;P&gt;Hello Everyone,&lt;/P&gt;&lt;P&gt;I have the data below (an excel dataset is also attached) and I would like to check for duplicates and then remove the duplicates. For example, in the data set below ID 400 has a filedate of May_2019 and two different certify dates. I would like to identify ID's that have duplicate certifydates and the file date in the same month and then only keep the most recent certify date. So in the data below the certify date for ID 400 would be 10/10/2019. The dataset I am working with has millions of obs. Thank you in advance.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;ID &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; filedate &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; certifydate&lt;/P&gt;&lt;P&gt;100&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Jan_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 2/2/2019&lt;/P&gt;&lt;P&gt;200&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Mar_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 5/10/2019&lt;/P&gt;&lt;P&gt;300&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Apr_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 6/7/2019&lt;/P&gt;&lt;P&gt;400&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; May_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 8/10/2019&lt;/P&gt;&lt;P&gt;400&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 
May_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 10/10/2019&lt;/P&gt;&lt;P&gt;500&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Jun_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 11/12/2019&lt;/P&gt;</description>
    <pubDate>Thu, 09 Jan 2020 20:26:53 GMT</pubDate>
    <dc:creator>luvscandy27</dc:creator>
    <dc:date>2020-01-09T20:26:53Z</dc:date>
    <item>
      <title>Finding and removing duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616315#M180404</link>
      <description>&lt;P&gt;Hello Everyone,&lt;/P&gt;&lt;P&gt;I have the data below (an excel dataset is also attached) and I would like to check for duplicates and then remove the duplicates. For example, in the data set below ID 400 has a filedate of May_2019 and two different certify dates. I would like to identify ID's that have duplicate certifydates and the file date in the same month and then only keep the most recent certify date. So in the data below the certify date for ID 400 would be 10/10/2019. The dataset I am working with has millions of obs. Thank you in advance.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;ID &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; filedate &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; certifydate&lt;/P&gt;&lt;P&gt;100&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Jan_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 2/2/2019&lt;/P&gt;&lt;P&gt;200&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Mar_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 5/10/2019&lt;/P&gt;&lt;P&gt;300&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Apr_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 6/7/2019&lt;/P&gt;&lt;P&gt;400&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; May_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 8/10/2019&lt;/P&gt;&lt;P&gt;400&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 
May_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 10/10/2019&lt;/P&gt;&lt;P&gt;500&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Jun_2019&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 11/12/2019&lt;/P&gt;</description>
      <pubDate>Thu, 09 Jan 2020 20:26:53 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616315#M180404</guid>
      <dc:creator>luvscandy27</dc:creator>
      <dc:date>2020-01-09T20:26:53Z</dc:date>
    </item>
    <item>
      <title>Re: Finding and removing duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616317#M180405</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/255656"&gt;@luvscandy27&lt;/a&gt;&amp;nbsp; Is it okay to assume your dataset is sorted&lt;STRONG&gt; by ID Filedate Certifydate&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;If yes&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data want;
 do until(last.filedate);
  set have;
  by id filedate notsorted;
 end;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;You can basically ignore the NOTSORTED. I just used it as the data is already grouped by ID Filedate.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 09 Jan 2020 20:35:19 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616317#M180405</guid>
      <dc:creator>novinosrin</dc:creator>
      <dc:date>2020-01-09T20:35:19Z</dc:date>
    </item>
    <item>
      <title>Re: Finding and removing duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616319#M180406</link>
      <description>&lt;P&gt;And if you do not expect ties in certifydate within a group of filedate, SQL will do&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;
proc sql;
create table want as
select *
from have
group by id,filedate
having certifydate=max(certifydate);
quit;
&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 09 Jan 2020 20:36:43 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616319#M180406</guid>
      <dc:creator>novinosrin</dc:creator>
      <dc:date>2020-01-09T20:36:43Z</dc:date>
    </item>
    <item>
      <title>Re: Finding and removing duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616320#M180407</link>
      <description>Yes, it's ok to assume that.&lt;BR /&gt;</description>
      <pubDate>Thu, 09 Jan 2020 20:37:05 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616320#M180407</guid>
      <dc:creator>luvscandy27</dc:creator>
      <dc:date>2020-01-09T20:37:05Z</dc:date>
    </item>
    <item>
      <title>Re: Finding and removing duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616321#M180408</link>
      <description>&lt;P&gt;Or simpler&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data want;
 set have;
 by id filedate;
 if last.filedate;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 09 Jan 2020 20:38:27 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616321#M180408</guid>
      <dc:creator>novinosrin</dc:creator>
      <dc:date>2020-01-09T20:38:27Z</dc:date>
    </item>
    <item>
      <title>Re: Finding and removing duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616328#M180412</link>
      <description>&lt;P&gt;Why not use hash? &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data _null_;

if _n_ = 1 then do;
    if 0 then set work.HAVE;
    dcl hash uniq(dataset:"work.HAVE", ordered:'A',duplicate:'r');
    uniq.definekey('ID','filedate');
    uniq.definedata('ID','filedate','certifydate');
    uniq.definedone();
end;

uniq.output(dataset:"work.WANT");

stop;

run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Depending on the size of your dataset and your environment and how many dups you have, this may run much faster than other methods or not at all (only if your dataset is really, really big and your environment doesn't have much memory). The dataset does not have to be sorted for this to work.&lt;/P&gt;</description>
      <pubDate>Thu, 09 Jan 2020 21:30:27 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616328#M180412</guid>
      <dc:creator>JeffMaggio</dc:creator>
      <dc:date>2020-01-09T21:30:27Z</dc:date>
    </item>
    <item>
      <title>Re: Finding and removing duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616331#M180413</link>
      <description>Thanks everyone! Just one more question, if I wanted to just check the&lt;BR /&gt;dataset for duplicates how could I do that?&lt;BR /&gt;</description>
      <pubDate>Thu, 09 Jan 2020 21:48:05 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616331#M180413</guid>
      <dc:creator>luvscandy27</dc:creator>
      <dc:date>2020-01-09T21:48:05Z</dc:date>
    </item>
    <item>
      <title>Re: Finding and removing duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616333#M180414</link>
      <description>&lt;P&gt;You could output those Filedates having more than one distinct certify date as a dataset with dupes&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;
proc sql;
create table dupes as
select *
from have
group by id,filedate
having count(distinct certifydate)&amp;gt;1;
quit;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;And assuming, if there are more than one certifydate within a filedate and those certifydates will be distinct, the following datastep would output 2 datasets i.e. one with keeping your most recent certifydate and the other with the duplicates&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data want dup;
 set have;
 by id filedate ;
 if not (first.filedate and last.filedate) then output dup;
 if last.filedate then output want;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 09 Jan 2020 22:09:23 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Finding-and-removing-duplicates/m-p/616333#M180414</guid>
      <dc:creator>novinosrin</dc:creator>
      <dc:date>2020-01-09T22:09:23Z</dc:date>
    </item>
  </channel>
</rss>

