<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: PROC SQL SELF JOIN remove duplicates in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885718#M350020</link>
    <description>&lt;P&gt;Try this:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;
data abuse;
	infile cards dlm=" 	";
	input case o_ssn:$11. v_ssn:$11. date:$10.;
datalines;
1 000-00-0001 000-00-0002 01/01/2000
2 000-00-0002 000-00-0001 01/01/2000
3 000-00-0003 000-00-0004 01/01/2000
4 000-00-0005 000-00-0006 01/01/2000
5 000-00-0007 000-00-0008 01/01/2000
6 000-00-0008 000-00-0007 01/01/2000
;
run;


proc sql;
create table no_reverse_match as 
select 
a.case, a.o_ssn, a.v_ssn

from 
abuse a inner join abuse b
on a.o_ssn = b.v_ssn and a.v_ssn = b.o_ssn and a.o_ssn &amp;lt; a.v_ssn 

where  
a.date=b.date;
quit;

&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 20 Jul 2023 22:00:19 GMT</pubDate>
    <dc:creator>HB</dc:creator>
    <dc:date>2023-07-20T22:00:19Z</dc:date>
    <item>
      <title>PROC SQL SELF JOIN remove duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885685#M350008</link>
      <description>&lt;P&gt;Please see Example 1 and Result Output Page 1 in this link: &lt;A href="https://support.sas.com/resources/papers/proceedings/proceedings/sugi29/065-29.pdf" target="_blank"&gt;https://support.sas.com/resources/papers/proceedings/proceedings/sugi29/065-29.pdf&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The following SQL procedure provides Result Output on page 1:&lt;/P&gt;&lt;P&gt;proc sql;&lt;BR /&gt;select a.*&lt;BR /&gt;from abuse a, abuse b&lt;BR /&gt;where a.o_ssn=b.v_ssn and&lt;BR /&gt;a.v_ssn=b.o_ssn and&lt;BR /&gt;a.date=b.date;&lt;BR /&gt;quit;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I would like to keep only cases 1 and 5 instead of cases 1,2,5,6. What code would I need to write to do that?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thanks for the help!&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jul 2023 19:57:49 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885685#M350008</guid>
      <dc:creator>r-lee-484</dc:creator>
      <dc:date>2023-07-20T19:57:49Z</dc:date>
    </item>
    <item>
      <title>Re: PROC SQL SELF JOIN remove duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885688#M350010</link>
      <description>&lt;P&gt;The order of output from SQL is not guaranteed. You should provide some actual variable&amp;nbsp; values indicating which observations you want.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;It is better to provide some rule(s) related to the content of a data set. I can write code that will keep just those observations, but it isn't extensible to any other project or data set.&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jul 2023 20:04:57 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885688#M350010</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2023-07-20T20:04:57Z</dc:date>
    </item>
    <item>
      <title>Re: PROC SQL SELF JOIN remove duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885692#M350011</link>
      <description>&lt;P&gt;I have a dataset with about 5,000,000 observations. I won't be able to individually select which observations to keep. Is there a way I can code for a rule to just keep 1 set of matches?&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jul 2023 20:13:11 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885692#M350011</guid>
      <dc:creator>r-lee-484</dc:creator>
      <dc:date>2023-07-20T20:13:11Z</dc:date>
    </item>
    <item>
      <title>Re: PROC SQL SELF JOIN remove duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885697#M350013</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/446141"&gt;@r-lee-484&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;I have a dataset with about 5,000,000 observations. I won't be able to individually select which observations to keep. Is there a way I can code for a rule to just keep 1 set of matches?&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;I repeat: what are the rules that determine which observations you want to keep?&lt;/P&gt;
&lt;P&gt;1 set of matches to what?&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Perhaps it is time to provide an example of data similar to what you currently have and what you want for the output given that starting data.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;There are many ways to get "1 set" of somethings. How a "set" is defined is critical in any of the processes.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jul 2023 20:17:13 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885697#M350013</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2023-07-20T20:17:13Z</dc:date>
    </item>
    <item>
      <title>Re: PROC SQL SELF JOIN remove duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885718#M350020</link>
      <description>&lt;P&gt;Try this:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;
data abuse;
	infile cards dlm=" 	";
	input case o_ssn:$11. v_ssn:$11. date:$10.;
datalines;
1 000-00-0001 000-00-0002 01/01/2000
2 000-00-0002 000-00-0001 01/01/2000
3 000-00-0003 000-00-0004 01/01/2000
4 000-00-0005 000-00-0006 01/01/2000
5 000-00-0007 000-00-0008 01/01/2000
6 000-00-0008 000-00-0007 01/01/2000
;
run;


proc sql;
create table no_reverse_match as 
select 
a.case, a.o_ssn, a.v_ssn

from 
abuse a inner join abuse b
on a.o_ssn = b.v_ssn and a.v_ssn = b.o_ssn and a.o_ssn &amp;lt; a.v_ssn 

where  
a.date=b.date;
quit;

&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jul 2023 22:00:19 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/885718#M350020</guid>
      <dc:creator>HB</dc:creator>
      <dc:date>2023-07-20T22:00:19Z</dc:date>
    </item>
    <item>
      <title>Re: PROC SQL SELF JOIN remove duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/886253#M350216</link>
      <description>&lt;P&gt;Thank you very much, that code works to remove reverse matches and to retain 1 set of matches.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;On another note, what code should I write to filter out any paired matches? From the dataset "abuse" I only&amp;nbsp;want to keep cases 3 and 4, and I don't want to keep case 1,2,5,6?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thanks for your help!&lt;/P&gt;</description>
      <pubDate>Tue, 25 Jul 2023 15:00:50 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/886253#M350216</guid>
      <dc:creator>r-lee-484</dc:creator>
      <dc:date>2023-07-25T15:00:50Z</dc:date>
    </item>
    <item>
      <title>Re: PROC SQL SELF JOIN remove duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/886328#M350241</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/446141"&gt;@r-lee-484&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;Thank you very much, that code works to remove reverse matches and to retain 1 set of matches.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;On another note, what code should I write to filter out any paired matches? From the dataset "abuse" I only&amp;nbsp;want to keep cases 3 and 4, and I don't want to keep case 1,2,5,6?&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;In other words, you want everything BUT the paired matches, correct?&amp;nbsp; Then you could just use the EXCEPT set operator:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;
proc sql;
create table unmatched_cases  as 
  select c.case, c.o_ssn, c.v_ssn  
  from abuse as c
  except

  select  a.case, a.o_ssn, a.v_ssn
  from 
    abuse a inner join abuse b
    on a.o_ssn = b.v_ssn 
       and a.v_ssn = b.o_ssn 
  where  
    a.date=b.date;
quit;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;There is also straightforward DATA step code for this task:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data un_matched_cases;
  set abuse;
  if _n_=1 then do;
    declare hash h (dataset:'abuse (keep=date o_ssn v_ssn)');
      h.definekey('date','o_ssn','v_ssn');
      h.definedone();
  end;
  if h.check(key:date,key:v_ssn,key:o_ssn)^=0;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;Note that the order of keys for the hash object h is DATE,O_SSN,V_SSN.&amp;nbsp; &amp;nbsp;That means the check method - &lt;EM&gt;&lt;STRONG&gt;h.check(...) &lt;/STRONG&gt;&lt;/EM&gt;&amp;nbsp;- for a "reverse match" must present the arguments ordered as DATE,V_SSN,O_SSN.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 26 Jul 2023 01:39:13 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/886328#M350241</guid>
      <dc:creator>mkeintz</dc:creator>
      <dc:date>2023-07-26T01:39:13Z</dc:date>
    </item>
    <item>
      <title>Re: PROC SQL SELF JOIN remove duplicates</title>
      <link>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/886417#M350263</link>
      <description>&lt;P&gt;This works, thank you!&lt;/P&gt;</description>
      <pubDate>Wed, 26 Jul 2023 13:54:15 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/PROC-SQL-SELF-JOIN-remove-duplicates/m-p/886417#M350263</guid>
      <dc:creator>r-lee-484</dc:creator>
      <dc:date>2023-07-26T13:54:15Z</dc:date>
    </item>
  </channel>
</rss>

