<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: divergence between proc sql join and hash in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860615#M339970</link>
    <description>&lt;P&gt;Thanks Tom. Looking closer, it was indeed an inner join. Seeing the original data, however, I still found something that doesn't add up, because the presence of additional fields creates double lines in the sql that disappear in the hash. Here is an example, I think it can be solved by introducing multidata="Y" also on the second hash:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data  OPE;
  input cod_contratto :$15. NUM_REVISIONE TS_FINE_VALIDITA :datetime.;
  format TS_FINE_VALIDITA datetime19.;
datalines;
100004790 35 01Jan1999:00:00:00
;
data AA;
input cod_contratto :$15. NUM_REVISIONE extra ;
datalines;
100004790 35 1
100004790 35 2
;
 
proc sql;
create table W1140ABS  as
  select distinct
     aa.cod_contratto,
	 AA.NUM_REVISIONE,
	 AA.extra,
     OPE.TS_FINE_VALIDITA
  from AA
  inner join  OPE
    ON AA.COD_CONTRATTO = OPE.COD_CONTRATTO
    and OPE.NUM_REVISIONE = AA.NUM_REVISIONE;
quit;

 
data W1140ABS_hash;
  if 0 then set OPE AA;
  if _n_=1 then do;
    declare hash h_merge(dataset:"OPE",multidata:'Y');
    rc = h_merge.DefineKey("COD_CONTRATTO", "NUM_REVISIONE");
    rc = h_merge.DefineData( "TS_FINE_VALIDITA");
    rc = h_merge.DefineDone();
    declare hash h_left();
    rc = h_left.DefineKey("COD_CONTRATTO", "NUM_REVISIONE");
    rc = h_left.DefineData("RC");
    rc = h_left.DefineDone();
  end;
  set AA  end=eof;
  if 0=h_left.add() then do;
    rc = h_merge.find(); 
    do while(not rc);
      output;
      rc = h_merge.find_next();
    end;
  end;
  drop rc ;
run; &lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Fri, 24 Feb 2023 08:15:15 GMT</pubDate>
    <dc:creator>mariopellegrini</dc:creator>
    <dc:date>2023-02-24T08:15:15Z</dc:date>
    <item>
      <title>divergence between proc sql join and hash</title>
      <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860398#M339899</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Good morning everyone. I encountered an error in a conversion from a sql proc (left join) to a left join with hash table: I report the example showing that the TS_END_VALIDITA field at the end differs between the 2 join versions&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data  OPE;
input cod_contratto:$15. NUM_REVISIONE TS_FINE_VALIDITA:datetime18.;
format TS_FINE_VALIDITA datetime18.;
datalines;
100004790 35 01Jan1999:00:00:00
100004790 35 14Sep2022:00:00:00
;
data  AA;
input cod_contratto:$15. NUM_REVISIONE;
datalines;
100004790 35
100004790 35
;
 
proc sql;
create table W1140ABS  as
select distinct OPE.COD_CONTRATTO 
               ,OPE.NUM_REVISIONE as PRG_REV_CONTR
               ,OPE.TS_FINE_VALIDITA format=datetime18. 
from AA
left join  OPE
ON AA.COD_CONTRATTO = OPE.COD_CONTRATTO
and OPE.NUM_REVISIONE = AA.NUM_REVISIONE;
quit;

data W1140ABS_hash;
if 0 then set OPE AA;
	 declare hash h_merge(dataset:"OPE");
	 rc = h_merge.DefineKey("COD_CONTRATTO", "NUM_REVISIONE");
	 rc = h_merge.DefineData("TS_FINE_VALIDITA");
	 rc = h_merge.DefineDone();
drop rc;
do while (not eof);
   set AA  end=eof;
   rc = h_merge.find();  
   output;
end;
run; 
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;Mario&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 23 Feb 2023 10:31:27 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860398#M339899</guid>
      <dc:creator>mariopellegrini</dc:creator>
      <dc:date>2023-02-23T10:31:27Z</dc:date>
    </item>
    <item>
      <title>Re: divergence between proc sql join and hash</title>
      <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860416#M339902</link>
      <description>&lt;P&gt;The hash object allows only unique keys; While the dataset is read into the object, observations with duplicate keys are discarded.&lt;/P&gt;
&lt;P&gt;This means that only the first obs from OPE is present in the object, and only its value makes it into the resulting dataset; the SQL first creates a cartesian join with 4 observations, two of which are discarded when DISTINCT is applied.&lt;/P&gt;</description>
      <pubDate>Thu, 23 Feb 2023 11:41:59 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860416#M339902</guid>
      <dc:creator>Kurt_Bremser</dc:creator>
      <dc:date>2023-02-23T11:41:59Z</dc:date>
    </item>
    <item>
      <title>Re: divergence between proc sql join and hash</title>
      <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860432#M339908</link>
      <description>&lt;P&gt;multidata option: "Y" in hash object declaration why doesn't it resolve? are there no other solutions?&lt;/P&gt;</description>
      <pubDate>Thu, 23 Feb 2023 13:51:46 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860432#M339908</guid>
      <dc:creator>mariopellegrini</dc:creator>
      <dc:date>2023-02-23T13:51:46Z</dc:date>
    </item>
    <item>
      <title>Re: divergence between proc sql join and hash</title>
      <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860439#M339911</link>
      <description>&lt;P&gt;You still have to&amp;nbsp;&lt;EM&gt;iterate&lt;/EM&gt; through the hash for all matches, and then apply an equivalent of the DISTINCT (e.g. use PROC SORT with NODUPKEY, or use a second hash to store all combinations you already wrote to the output).&lt;/P&gt;</description>
      <pubDate>Thu, 23 Feb 2023 14:15:13 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860439#M339911</guid>
      <dc:creator>Kurt_Bremser</dc:creator>
      <dc:date>2023-02-23T14:15:13Z</dc:date>
    </item>
    <item>
      <title>Re: divergence between proc sql join and hash</title>
      <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860459#M339913</link>
      <description>&lt;P&gt;MULTIDATA='Y' will help.&amp;nbsp; But then you have to do more work.&amp;nbsp; To implement the DISTINCT feature you had in the SQL query then make a second HASH to keep track of which observations from the non-hashed dataset have already occured.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Also let's not rename one of the variables in the SQL so we can use PROC COMPARE to test the results.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;While we are at it there is no need to attach the format again to the other variable. (And to get DATETIME to properly display the year you need a width of at least 19, some type of bug that SAS is not planning to fix).&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;And your SQL query makes no sense as a LEFT join since you are only selecting variables from the RIGHT table.&amp;nbsp; So converted it to an INNER join.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data  OPE;
  input cod_contratto :$15. NUM_REVISIONE TS_FINE_VALIDITA :datetime18.;
  format TS_FINE_VALIDITA datetime19.;
datalines;
100004790 35 01Jan1999:00:00:00
100004790 35 14Sep2022:00:00:00
x 12
;
data  AA;
input cod_contratto :$15. NUM_REVISIONE;
datalines;
100004790 35
100004790 35
y 24
;
 
proc sql;
create table W1140ABS  as
  select distinct
     OPE.COD_CONTRATTO 
    ,OPE.NUM_REVISIONE /* as PRG_REV_CONTR */
    ,OPE.TS_FINE_VALIDITA /* format=datetime18. */
  from AA
  inner join  OPE
    ON AA.COD_CONTRATTO = OPE.COD_CONTRATTO
    and OPE.NUM_REVISIONE = AA.NUM_REVISIONE
;
quit;

data W1140ABS_hash;
  if 0 then set OPE AA;
  if _n_=1 then do;
    declare hash h_merge(dataset:"OPE",multidata:'Y');
    rc = h_merge.DefineKey("COD_CONTRATTO", "NUM_REVISIONE");
    rc = h_merge.DefineData("TS_FINE_VALIDITA");
    rc = h_merge.DefineDone();
    declare hash h_left();
    rc = h_left.DefineKey("COD_CONTRATTO", "NUM_REVISIONE");
    rc = h_left.DefineData("RC");
    rc = h_left.DefineDone();
  end;
  set AA  end=eof;
  if 0=h_left.add() then do;
    rc = h_merge.find(); 
    do while(not rc);
      output;
      rc = h_merge.find_next();
    end;
  end;
  drop rc ;
run; 

proc compare data=W1140ABS compare=W1140ABS_hash listall;
run;

&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 23 Feb 2023 14:58:00 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860459#M339913</guid>
      <dc:creator>Tom</dc:creator>
      <dc:date>2023-02-23T14:58:00Z</dc:date>
    </item>
    <item>
      <title>Re: divergence between proc sql join and hash</title>
      <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860464#M339914</link>
      <description>&lt;P&gt;Thank you very much ... a bit complex as a code! actually I would need a left join because in the original query I also have other variables and I have to get the exact rows of the left query. Can you modify the hash you wrote to bring back the left join? Thank you!&lt;/P&gt;</description>
      <pubDate>Thu, 23 Feb 2023 15:17:31 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860464#M339914</guid>
      <dc:creator>mariopellegrini</dc:creator>
      <dc:date>2023-02-23T15:17:31Z</dc:date>
    </item>
    <item>
      <title>Re: divergence between proc sql join and hash</title>
      <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860487#M339925</link>
      <description>&lt;P&gt;Sure.&amp;nbsp; You need to add another variable to the LEFT dataset so you can see what you are joining.&lt;/P&gt;
&lt;P&gt;So here is a cleaned up example:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data  OPE;
  input cod_contratto :$15. NUM_REVISIONE TS_FINE_VALIDITA :datetime.;
  format TS_FINE_VALIDITA datetime19.;
datalines;
100004790 35 01Jan1999:00:00:00
100004790 35 14Sep2022:00:00:00
x 12
;
data  AA;
input cod_contratto :$15. NUM_REVISIONE extra ;
datalines;
100004790 35 1
100004790 35 2
y 24 1
;
 
proc sql;
create table W1140ABS  as
  select distinct
     aa.*
    ,OPE.TS_FINE_VALIDITA /* format=datetime19. */
  from AA
  left join  OPE
    ON AA.COD_CONTRATTO = OPE.COD_CONTRATTO
    and OPE.NUM_REVISIONE = AA.NUM_REVISIONE
;
quit;

data W1140ABS_hash;
  if 0 then set OPE AA;
  if _n_=1 then do;
    declare hash h_merge(dataset:"OPE",multidata:'Y');
    rc = h_merge.DefineKey("COD_CONTRATTO", "NUM_REVISIONE");
    rc = h_merge.DefineData("TS_FINE_VALIDITA");
    rc = h_merge.DefineDone();
    declare hash h_left();
    rc = h_left.DefineKey("COD_CONTRATTO", "NUM_REVISIONE", "extra");
    rc = h_left.DefineData("RC");
    rc = h_left.DefineDone();
  end;
  set AA  end=eof;
  call missing(of TS_FINE_VALIDITA);
  if 0=h_left.add() then do;
    rc = h_merge.find(); 
    do until(rc);
      output;
      rc = h_merge.find_next();
    end;
    put;
  end;
  drop rc ;
run; 

proc compare data=W1140ABS compare=W1140ABS_hash listall;
run;

proc print data=W1140ABS;
run;

&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 23 Feb 2023 16:16:44 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860487#M339925</guid>
      <dc:creator>Tom</dc:creator>
      <dc:date>2023-02-23T16:16:44Z</dc:date>
    </item>
    <item>
      <title>Re: divergence between proc sql join and hash</title>
      <link>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860615#M339970</link>
      <description>&lt;P&gt;Thanks Tom. Looking closer, it was indeed an inner join. Seeing the original data, however, I still found something that doesn't add up, because the presence of additional fields creates double lines in the sql that disappear in the hash. Here is an example, I think it can be solved by introducing multidata="Y" also on the second hash:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data  OPE;
  input cod_contratto :$15. NUM_REVISIONE TS_FINE_VALIDITA :datetime.;
  format TS_FINE_VALIDITA datetime19.;
datalines;
100004790 35 01Jan1999:00:00:00
;
data AA;
input cod_contratto :$15. NUM_REVISIONE extra ;
datalines;
100004790 35 1
100004790 35 2
;
 
proc sql;
create table W1140ABS  as
  select distinct
     aa.cod_contratto,
	 AA.NUM_REVISIONE,
	 AA.extra,
     OPE.TS_FINE_VALIDITA
  from AA
  inner join  OPE
    ON AA.COD_CONTRATTO = OPE.COD_CONTRATTO
    and OPE.NUM_REVISIONE = AA.NUM_REVISIONE;
quit;

 
data W1140ABS_hash;
  if 0 then set OPE AA;
  if _n_=1 then do;
    declare hash h_merge(dataset:"OPE",multidata:'Y');
    rc = h_merge.DefineKey("COD_CONTRATTO", "NUM_REVISIONE");
    rc = h_merge.DefineData( "TS_FINE_VALIDITA");
    rc = h_merge.DefineDone();
    declare hash h_left();
    rc = h_left.DefineKey("COD_CONTRATTO", "NUM_REVISIONE");
    rc = h_left.DefineData("RC");
    rc = h_left.DefineDone();
  end;
  set AA  end=eof;
  if 0=h_left.add() then do;
    rc = h_merge.find(); 
    do while(not rc);
      output;
      rc = h_merge.find_next();
    end;
  end;
  drop rc ;
run; &lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 24 Feb 2023 08:15:15 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/divergence-between-proc-sql-join-and-hash/m-p/860615#M339970</guid>
      <dc:creator>mariopellegrini</dc:creator>
      <dc:date>2023-02-24T08:15:15Z</dc:date>
    </item>
  </channel>
</rss>

