<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: proc sql left join condition using where or on in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581692#M165354</link>
    <description>&lt;P&gt;Welcome to the SAS Community &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;What result did you get yourself? Did you get the same result from the two queries?&lt;/P&gt;</description>
    <pubDate>Fri, 16 Aug 2019 11:43:52 GMT</pubDate>
    <dc:creator>PeterClemmensen</dc:creator>
    <dc:date>2019-08-16T11:43:52Z</dc:date>
    <item>
      <title>proc sql left join condition using where or on</title>
      <link>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581682#M165346</link>
      <description>&lt;P&gt;Hello, I wondered if people had any experience with where they place a condition when doing a left join performance/time wise. I have the code:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql ;
    create table step2 as
    select distinct a.*,
        b.income,
        ...
    from step1 as a

    left join (select year, income, ..., from income_data) as b
    on a.id=b.id and a.year=b.year and b.id^='' ;
quit ;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;I could have used a where expression instead, are they equivalent, or is there some performance differences?:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql ;
    create table step2 as
    select distinct a.*,
        b.income,
        ...
    from step1 as a

    left join (select year, income, ..., from income_data where id^='') as b
    on a.id=b.id and a.year=b.year ;
quit ;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Fri, 16 Aug 2019 11:09:55 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581682#M165346</guid>
      <dc:creator>Gexern</dc:creator>
      <dc:date>2019-08-16T11:09:55Z</dc:date>
    </item>
    <item>
      <title>Re: proc sql left join condition using where or on</title>
      <link>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581689#M165351</link>
      <description>&lt;P&gt;First, compare the results of both queries to see if they are logically equivalent.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/285430"&gt;@Gexern&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;Hello, I wondered if people had any experience with where they place a condition when doing a left join performance/time wise. I have the code:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql ;
    create table step2 as
    select distinct a.*,
        b.income,
        ...
    from step1 as a

    left join (select year, income, ..., from income_data) as b
    on a.id=b.id and a.year=b.year and b.id^='' ;
quit ;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;I could have used a where expression instead, are they equivalent, or is there some performance differences?:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql ;
    create table step2 as
    select distinct a.*,
        b.income,
        ...
    from step1 as a

    left join (select year, income, ..., from income_data where id^='') as b
    on a.id=b.id and a.year=b.year ;
quit ;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 16 Aug 2019 11:42:51 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581689#M165351</guid>
      <dc:creator>Kurt_Bremser</dc:creator>
      <dc:date>2019-08-16T11:42:51Z</dc:date>
    </item>
    <item>
      <title>Re: proc sql left join condition using where or on</title>
      <link>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581692#M165354</link>
      <description>&lt;P&gt;Welcome to the SAS Community &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;What result did you get yourself? Did you get the same result from the two queries?&lt;/P&gt;</description>
      <pubDate>Fri, 16 Aug 2019 11:43:52 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581692#M165354</guid>
      <dc:creator>PeterClemmensen</dc:creator>
      <dc:date>2019-08-16T11:43:52Z</dc:date>
    </item>
    <item>
      <title>Re: proc sql left join condition using where or on</title>
      <link>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581693#M165355</link>
      <description>&lt;P&gt;I haven't, no. The code runs for 4 hours so its not that easy to compare. I can batch two programs that run at the same time and compare. I would &lt;U&gt;expect&lt;/U&gt; to get the same result from both queries though.&lt;/P&gt;</description>
      <pubDate>Fri, 16 Aug 2019 11:49:36 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581693#M165355</guid>
      <dc:creator>Gexern</dc:creator>
      <dc:date>2019-08-16T11:49:36Z</dc:date>
    </item>
    <item>
      <title>Re: proc sql left join condition using where or on</title>
      <link>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581730#M165368</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/285430"&gt;@Gexern&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;I haven't, no. The code runs for 4 hours so its not that easy to compare. I can batch two programs that run at the same time and compare. I would &lt;U&gt;expect&lt;/U&gt; to get the same result from both queries though.&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;How many records are involved?&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I would create subsets of the data of only a few thousand records for each and test the code for performance and equivalence.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Do you have missing values of a.id? I don't see much sense for the b.id^=' '&amp;nbsp;in this if you don't.&lt;/P&gt;
&lt;PRE&gt;on a.id=b.id and a.year=b.year and b.id^='' &lt;/PRE&gt;
&lt;P&gt;And if you do, I strongly wonder why an ID type variable would be missing. Since I tend to think of ID as used to identify things then missing values are extremely problematic. If A.ID does not have any missing values then the b.id^=' '&amp;nbsp;is not needed as join on will only get matches.&lt;/P&gt;</description>
      <pubDate>Fri, 16 Aug 2019 14:25:55 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/581730#M165368</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2019-08-16T14:25:55Z</dc:date>
    </item>
    <item>
      <title>Re: proc sql left join condition using where or on</title>
      <link>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/582341#M165622</link>
      <description>&lt;P&gt;Make up some fake data and run your codes against it.&lt;/P&gt;
&lt;P&gt;I did this:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data step1;
input id $ year;
datalines;
A 2018
A 2019
B 2018
B 2019
C 2018
C 2019
;

data income_data;
input id $ year income;
datalines;
A 2018 1000
A 2018 1500
B 2019 3000
C 2018 2000
;
run;

proc sql ;
create table step2_a as
select distinct
  a.*,
  b.income
from step1 as a
left join (
  select
    id,
    year,
    income
  from income_data
) as b
on a.id = b.id and a.year = b.year and b.id ^= ''
;
quit;

proc sql ;
create table step2_b as
select distinct
  a.*,
  b.income
from step1 as a
left join (
  select
    id,
    year,
    income
  from income_data where id ^= ''
) as b
on a.id = b.id and a.year = b.year
;
quit;

proc compare data=step2_a compare=step2_b;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;and found no difference.&lt;/P&gt;
&lt;P&gt;So play around with the faked source data to include edge cases (do you have missing id and/or year values in one of the datasets, or do you have multiple instances for any id/year pair in one or both datasets) that might turn up in&amp;nbsp;&lt;EM&gt;your&lt;/EM&gt; source data.&lt;/P&gt;
&lt;P&gt;In particular I noticed your use of distinct. Some people use this out of habit and do not realize that it forces SQL to sort the result table by&amp;nbsp;&lt;EM&gt;all&lt;/EM&gt; columns to determine possible duplicates. This can be very time-consuming, so you should check your data (see Maxim 3) if it is necessary at all.&lt;/P&gt;
&lt;P&gt;From this, I created a simplified version of your code; first, I removed the check for id ^= '':&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql;
create table step2_c as
select
  a.*,
  b.income
from step1 as a
left join (
  select
    id,
    year,
    income
  from income_data
) as b
on a.id = b.id and a.year = b.year
;
quit;

proc compare data=step2_a compare=step2_c;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;and found no difference to the previous results.&lt;/P&gt;
&lt;P&gt;Next, I simplified this even further:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql;
create table step2_d as
select
  a.*,
  b.income
from step1 as a
left join income_data as b
on a.id = b.id and a.year = b.year
;
quit;

proc compare data=step2_a compare=step2_d;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;and got the same results once again.&lt;/P&gt;
&lt;P&gt;If you do not have a many-to-many relationship with regards to id and year, you can use a data step:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sort data=step1;
by id year;
run;

proc sort data=income_data;
by id year;
run;

data step2_e;
merge
  step1 (in=_in_step1)
  income_data
;
by id year;
if _in_step1;
run;

proc compare data=step2_a compare=step2_e;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;which once again yields identical results with my fake data. Out of experience, I dare to say that this approach will beat all SQL methods performance-wise. Especially if you can manage to have the sorts done further up when the data sets are created in the first place.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 20 Aug 2019 08:23:19 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/proc-sql-left-join-condition-using-where-or-on/m-p/582341#M165622</guid>
      <dc:creator>Kurt_Bremser</dc:creator>
      <dc:date>2019-08-20T08:23:19Z</dc:date>
    </item>
  </channel>
</rss>

