<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: how to dedup data in PROC SQL in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/250124#M47140</link>
    <description>Thank you so much for helping me!!&lt;BR /&gt;</description>
    <pubDate>Mon, 15 Feb 2016 16:45:17 GMT</pubDate>
    <dc:creator>Ying</dc:creator>
    <dc:date>2016-02-15T16:45:17Z</dc:date>
    <item>
      <title>how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249823#M47043</link>
      <description>&lt;P&gt;Hello,&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Could you please help me to use PROC SQL create a below data TEST_NODUP? &amp;nbsp;Thank you!&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;*--- TEST data;
data TEST;
  input id type cd $2. date sub_id type2 cnt;
  informat date yymmddn10.;
datalines;
574 1 6Z 1/10/2008 689 2 1
574 1 6Z 2/11/2008 616 2 1
574 1 7E 12/19/2008 8117 2 1
574 1 7E 12/19/2008 8917 2 1
574 1 7E 12/19/2008 8927 2 1
574 1 7E 12/19/2008 8422 2 1
574 1 7E 12/19/2008 8864 2 1
;
run;
proc sort data = TEST;
  by id type cd date sub_id type2;
run;

*--- final table;
data TEST_NODUP;
  set TEST;
  by id type cd date sub_id type2;
  if last.date;
  format yymmddn10.; 
run;

TEST_NODUP
574 1 6Z 1/10/2008 689 2 1
574 1 6Z 2/11/2008 616 2 1
574 1 7E 12/19/2008 8927 2 1 &lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 12 Feb 2016 21:18:48 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249823#M47043</guid>
      <dc:creator>Ying</dc:creator>
      <dc:date>2016-02-12T21:18:48Z</dc:date>
    </item>
    <item>
      <title>Re: how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249829#M47047</link>
      <description>&lt;P&gt;Proc sql;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp; create table test_nodup as&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp; select distinct *&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp; from test;&lt;/P&gt;
&lt;P&gt;quit;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Should generally work.&lt;/P&gt;</description>
      <pubDate>Fri, 12 Feb 2016 21:58:10 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249829#M47047</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2016-02-12T21:58:10Z</dc:date>
    </item>
    <item>
      <title>Re: how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249835#M47051</link>
      <description>Thank you!  This would not work for my case.</description>
      <pubDate>Fri, 12 Feb 2016 23:21:36 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249835#M47051</guid>
      <dc:creator>Ying</dc:creator>
      <dc:date>2016-02-12T23:21:36Z</dc:date>
    </item>
    <item>
      <title>Re: how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249836#M47052</link>
      <description>&lt;P&gt;While your subject says dedup, that's not what your program does.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;It is actually selecting the most recent record for an ID TYPE CD combination and where there are multiple records for the same date then choosing the highest SUB_ID and TYPE2.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;In SQL you could do this as a GROUP BY sub-query, but I think the DATA step way is easier.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql;
  create table test_latest as
  select a.*
  from test as a
  inner join
  (select id
             ,type
             ,cd
             ,max(date) as date_max
             ,max(sub_id) as sub_id_max
             ,max(type2) as type2_max
   from test
   group by id
           ,type
           ,cd
   )  as b
   on a.id = b.id
   and a.type = b.type
   and a.cd = b.cd
   and a.date = b.date_max
   and a.sub_id = b.sub_id_max
   and a.type2 = b.type2_max
  ;
quit;  
  &lt;/CODE&gt;&amp;nbsp;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 12 Feb 2016 23:25:37 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249836#M47052</guid>
      <dc:creator>SASKiwi</dc:creator>
      <dc:date>2016-02-12T23:25:37Z</dc:date>
    </item>
    <item>
      <title>Re: how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249837#M47053</link>
      <description>&lt;P&gt;Hello&amp;nbsp;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/18034"&gt;@Ying﻿&lt;/a&gt;,&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I think, there is no &lt;EM&gt;general&lt;/EM&gt; equivalent to BY-group processing with &lt;FONT face="courier new,courier"&gt;last.&lt;EM&gt;variable&lt;/EM&gt;&lt;/FONT&gt; in PROC SQL (leaving &lt;FONT face="courier new,courier"&gt;monotonic()&lt;/FONT&gt; aside).&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;However, if in your real data (like in dataset TEST) variable SUB_ID&amp;nbsp;does not have duplicate values within a ID-TYPE-CD-DATE BY group, you could use the following technique:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql;
create table test_nodup as
select *
from test
group by id, type, cd, date
having sub_id=max(sub_id)
order by id, type, cd, date;
quit;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;(The ORDER BY clause should actually be redundant. The SELECT statement could be modified to specify a format for variable DATE, if needed.)&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;If there &lt;EM&gt;are&lt;/EM&gt; duplicates of SUB_ID within some BY groups, but TYPE2 and CNT can serve as "tie-breakers" (i.e., the combination SUB_ID, TYPE2, CNT is unique within each &lt;SPAN&gt;ID-TYPE-CD-DATE&lt;/SPAN&gt;&amp;nbsp;BY group), one could think about defining an expression combining values of&amp;nbsp;&lt;SPAN&gt;SUB_ID, TYPE2 and CNT to replace SUB_ID in the HAVING clause. (The details would depend on the possible values of those three variables.)&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;By the way, my SAS 9.4 does not like your INFORMAT and FORMAT statements (for good&amp;nbsp;reasons).&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 12 Feb 2016 23:24:40 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249837#M47053</guid>
      <dc:creator>FreelanceReinh</dc:creator>
      <dc:date>2016-02-12T23:24:40Z</dc:date>
    </item>
    <item>
      <title>Re: how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249839#M47054</link>
      <description>&lt;P&gt;Thank you so much! &amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;The TEST was created by&amp;nbsp;&lt;SPAN&gt;ID-TYPE-CD-DATE (see below), so the way you describe should work. &amp;nbsp;I was given the code with PROC SQL and data steps and asked to replace the data steps with PROC SQL. &amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;
&lt;PRE&gt;proc sql;
    create table TEST as
        select id, type, cd, date, sub_id, type2, count(*) as cnt
        from    indsn
        group by 1,2,3,4,5,6          
        order by 1,2,3,4,5,6   ;
quit;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Another question, how to just simply rewrite the PROC SORT NODUPKEY data=... to PROC SQL to remove the dup? &amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;proc sql;
    create table TEST as
        select id, type, cd, date, sub_id, type2, status, dollar, amt
        from    indsn a1,
                   indsn2 a2
        where a1.id = a2.id and
                   a1.type = a2.type and
                   a1.cd = a2.;
quit;

proc sort data=TEST NODUPKEY; by id type cd; run;&lt;/PRE&gt;</description>
      <pubDate>Fri, 12 Feb 2016 23:50:10 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249839#M47054</guid>
      <dc:creator>Ying</dc:creator>
      <dc:date>2016-02-12T23:50:10Z</dc:date>
    </item>
    <item>
      <title>Re: how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249840#M47055</link>
      <description>&lt;P&gt;Your first PROC SQL step shows that the combination of SUB_ID and TYPE2 is unique within the&amp;nbsp;&lt;SPAN&gt;ID-TYPE-CD-DATE BY groups. So, CNT could be ignored in the "combined" expression I mentioned.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;The PROC SORT step retains the&amp;nbsp;first observation for each ID-TYPE-CD BY group. In PROC SQL, however, we would need to &lt;EM&gt;define&lt;/EM&gt; which observation should be regarded as the "first" in a BY group. If TEST was sorted by DATE, SUB_ID etc. within each BY group, we could use a similar GROUP BY/HAVING approach as for your original question. Otherwise, we might have to resort to the undocumented MONOTONIC function, but I would prefer&amp;nbsp;to avoid that:&lt;/SPAN&gt;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql undo_policy=none;
create table test as
select * from test
group by id, type, cd
having monotonic()=min(monotonic());
quit;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&lt;SPAN&gt;(I've learned about the option &lt;FONT face="courier new,courier"&gt;undo_policy=none&lt;/FONT&gt; from &lt;A href="https://communities.sas.com/t5/General-SAS-Programming/WARNING-This-CREATE-TABLE-statement-recursively-references-the/td-p/171614" target="_blank"&gt;this older thread&lt;/A&gt;, but I would recommend to avoid reading and overwriting TEST with the same PROC SQL step.)&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;What is assumed about the values of DATE, SUB_ID, etc. after&amp;nbsp;the PROC SORT step?&lt;/P&gt;</description>
      <pubDate>Sat, 13 Feb 2016 00:45:13 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/249840#M47055</guid>
      <dc:creator>FreelanceReinh</dc:creator>
      <dc:date>2016-02-13T00:45:13Z</dc:date>
    </item>
    <item>
      <title>Re: how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/250009#M47105</link>
      <description>&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data TEST;
  input id type cd $ date : mmddyy10. sub_id type2 cnt;
  format date yymmdd10.;
datalines;
574 1 6Z 1/10/2008 689 2 1
574 1 6Z 2/11/2008 616 2 1
574 1 7E 12/19/2008 8117 2 1
574 1 7E 12/19/2008 8917 2 1
574 1 7E 12/19/2008 8927 2 1
574 1 7E 12/19/2008 8422 2 1
574 1 7E 12/19/2008 8864 2 1
;
run;
proc sql;
select distinct *
 from (select distinct * from TEST
        group by id ,type ,cd ,date
         having sub_id=max(sub_id) )
     group by  id ,type ,cd ,date
      having type2=max(type2);
quit;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Mon, 15 Feb 2016 02:47:37 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/250009#M47105</guid>
      <dc:creator>Ksharp</dc:creator>
      <dc:date>2016-02-15T02:47:37Z</dc:date>
    </item>
    <item>
      <title>Re: how to dedup data in PROC SQL</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/250124#M47140</link>
      <description>Thank you so much for helping me!!&lt;BR /&gt;</description>
      <pubDate>Mon, 15 Feb 2016 16:45:17 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-dedup-data-in-PROC-SQL/m-p/250124#M47140</guid>
      <dc:creator>Ying</dc:creator>
      <dc:date>2016-02-15T16:45:17Z</dc:date>
    </item>
  </channel>
</rss>

