<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Left join in HASH with missing values to be replaced with latest available value in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672592#M202151</link>
    <description>&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/31461"&gt;@mkeintz&lt;/a&gt;, It will have monthly information for many years.</description>
    <pubDate>Mon, 27 Jul 2020 15:53:37 GMT</pubDate>
    <dc:creator>Myurathan</dc:creator>
    <dc:date>2020-07-27T15:53:37Z</dc:date>
    <item>
      <title>Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672558#M202139</link>
      <description>&lt;P&gt;Dear Experts,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I will have to join the following two tables.&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Table one (1 billion rows)&lt;/STRONG&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;Data one;
input snapshotdate sourcekey sourcesystemid value1;
datalines;
20200101 112 5 788
20200102 112 5 789
20200103 112 5 800
20200201 112 5 786
20200202 112 5 777
20200203 112 5 834
20200301 112 5 789
20200302 112 5 771
20200303 112 5 832
20200101 222 6 788
20200102 222 6 789
20200103 222 6 800
20200201 222 6 786
20200202 222 6 777
20200203 222 6 834
20200301 222 6 789
20200302 222 6 771
20200303 222 6 832
;
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;STRONG&gt;Table Two (32 million rows)&lt;/STRONG&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;Data two;
input period sourcekey sourcesystemid npl;
datalines;
202001 112 5 999
202002 112 5 988
202001 222 6 555
202002 222 6 556
;
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;I would like to get the joined table as below;&lt;/P&gt;&lt;TABLE border="1"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;snapshotdate&lt;/TD&gt;&lt;TD&gt;sourcekey&lt;/TD&gt;&lt;TD&gt;sourcesystemid&lt;/TD&gt;&lt;TD&gt;value1&lt;/TD&gt;&lt;TD&gt;NPL&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200101&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;788&lt;/TD&gt;&lt;TD&gt;999&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200102&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;789&lt;/TD&gt;&lt;TD&gt;999&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200103&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;800&lt;/TD&gt;&lt;TD&gt;999&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200201&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;786&lt;/TD&gt;&lt;TD&gt;988&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200202&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;777&lt;/TD&gt;&lt;TD&gt;988&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200203&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;834&lt;/TD&gt;&lt;TD&gt;988&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200301&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;789&lt;/TD&gt;&lt;TD&gt;&lt;STRONG&gt;988&lt;/STRONG&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200302&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;771&lt;/TD&gt;&lt;TD&gt;&lt;STRONG&gt;988&lt;/STRONG&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200303&lt;/TD&gt;&lt;TD&gt;112&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;832&lt;/TD&gt;&lt;TD&gt;&lt;STRONG&gt;988&lt;/STRONG&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200101&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;788&lt;/TD&gt;&lt;TD&gt;555&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200102&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;789&lt;/TD&gt;&lt;TD&gt;555&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200103&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;800&lt;/TD&gt;&lt;TD&gt;555&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200201&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;786&lt;/TD&gt;&lt;TD&gt;556&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200202&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;777&lt;/TD&gt;&lt;TD&gt;556&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200203&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;834&lt;/TD&gt;&lt;TD&gt;556&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200301&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;789&lt;/TD&gt;&lt;TD&gt;&lt;STRONG&gt;556&lt;/STRONG&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200302&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;771&lt;/TD&gt;&lt;TD&gt;&lt;STRONG&gt;556&lt;/STRONG&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;20200303&lt;/TD&gt;&lt;TD&gt;222&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;832&lt;/TD&gt;&lt;TD&gt;&lt;STRONG&gt;556&lt;/STRONG&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;When there is a missing (year and month), it has to be filled with the latest available value (bolded),&amp;nbsp;&lt;/P&gt;&lt;P&gt;The code I have currently: (not working to replace the missing values)&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;Proc SQL;
Create Table want as
Select 
a.*,
b.npl
from one as a
left join two as b
on a.sourcekey =b.sourcekey and a.sourcesystemid = b.sourcesystemid and input(substr(put(a.snapshotdate,8.),1,6),6.) = b.period
order by a.sourcekey,a.snapshotdate
;
Quit;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Since it involves large tables, I prefer this to be coded using HASH. I would like to use table two&amp;nbsp; as HASH object.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thanks in advance.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Myu&lt;/P&gt;</description>
      <pubDate>Mon, 27 Jul 2020 14:51:15 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672558#M202139</guid>
      <dc:creator>Myurathan</dc:creator>
      <dc:date>2020-07-27T14:51:15Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672584#M202148</link>
      <description>Are those tables sorted by sourcekey and snapshotdate/period ?&lt;BR /&gt;&lt;BR /&gt;Bart</description>
      <pubDate>Mon, 27 Jul 2020 15:45:39 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672584#M202148</guid>
      <dc:creator>yabwon</dc:creator>
      <dc:date>2020-07-27T15:45:39Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672587#M202149</link>
      <description>&lt;P&gt;Do the series in table 1 ALWAYS come in three's&amp;nbsp; (2001,2002,2003)?&lt;/P&gt;</description>
      <pubDate>Mon, 27 Jul 2020 15:50:16 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672587#M202149</guid>
      <dc:creator>mkeintz</dc:creator>
      <dc:date>2020-07-27T15:50:16Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672590#M202150</link>
      <description>Yes, both tables are sorted by sourcekey, sourcesystemid, and snapshotperiod/period</description>
      <pubDate>Mon, 27 Jul 2020 15:52:31 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672590#M202150</guid>
      <dc:creator>Myurathan</dc:creator>
      <dc:date>2020-07-27T15:52:31Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672592#M202151</link>
      <description>&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/31461"&gt;@mkeintz&lt;/a&gt;, It will have monthly information for many years.</description>
      <pubDate>Mon, 27 Jul 2020 15:53:37 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672592#M202151</guid>
      <dc:creator>Myurathan</dc:creator>
      <dc:date>2020-07-27T15:53:37Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672595#M202152</link>
      <description>&lt;P&gt;If they are sorted I would consider using MERGE statement.&lt;/P&gt;
&lt;P&gt;Bart&lt;/P&gt;</description>
      <pubDate>Mon, 27 Jul 2020 16:03:26 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672595#M202152</guid>
      <dc:creator>yabwon</dc:creator>
      <dc:date>2020-07-27T16:03:26Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672598#M202153</link>
      <description>&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/35763"&gt;@yabwon&lt;/a&gt;, I am not much used to merge statement. Could you please help me with the code. Thanks in advance.</description>
      <pubDate>Mon, 27 Jul 2020 16:10:11 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672598#M202153</guid>
      <dc:creator>Myurathan</dc:creator>
      <dc:date>2020-07-27T16:10:11Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672603#M202154</link>
      <description>&lt;P&gt;Editted note: just saw the info that data are sorted "by sourcekey, sourcesystemid, and snapshotperiod/period".&amp;nbsp; So the program is modified.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;If the data are sorted by sourcekey/snapshotdate (table1) and sourcekey/period (table 2), then you don't need hash.&amp;nbsp; You can use a "last-observation-carried-forward" technique in which you (1) assign a dummy date to the period in table2 (i.e.&amp;nbsp; the first date of the month specified by period, and (2) use a SET/BY combination, complemented by a conditional SET statement.&amp;nbsp; No hash needed:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;Data one;
input snapshotdate yymmdd8. sourcekey sourcesystemid value1;
format snapshotdate date9. ;
datalines;
20200101 112 5 788
20200102 112 5 789
20200103 112 5 800
20200201 112 5 786
20200202 112 5 777
20200203 112 5 834
20200301 112 5 789
20200302 112 5 771
20200303 112 5 832
20200101 222 6 788
20200102 222 6 789
20200103 222 6 800
20200201 222 6 786
20200202 222 6 777
20200203 222 6 834
20200301 222 6 789
20200302 222 6 771
20200303 222 6 832
;
run;

data two;
input period sourcekey sourcesystemid npl;
datalines;
202001 112 5 999
202002 112 5 988
202001 222 6 555
202002 222 6 556
;
run;

data need / view=need;
  set two;
  month_begin_date = mdy(mod(period,100),1,floor(period/100));
  format month_begin_date date9. ;
run;

data want (drop=month_begin_date);
  set need (keep=sourcekey sourcesystem month_begin_date rename=(month_begin_date=snapshotdate) in=in2)
      one (in=in1) ;
  by sourcesystem sourcekey snapshotdate ;
  if in2 then set need; /*Get the other variables, automatically retained */
  if in1;
  output;
  if last.sourcekey then call missing(of _all_);
run;

&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;Notes:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;OL&gt;
&lt;LI&gt;DATA NEED is a data set VIEW, not a data set FILE.&amp;nbsp; So it never gets written to disk.&amp;nbsp; Instead it is activated (sort of like instantiating a hash object) only when it is referred to in a later step.&amp;nbsp; So basically it's piping the data from table TWO to&amp;nbsp; the data want step.&lt;/LI&gt;
&lt;LI&gt;The first set/by reads only the relevant sort variable from need.&amp;nbsp; And only when the incoming observation gets a need (i.e. table two) record, does it "set" the other varibles in need.&amp;nbsp; The advantage here is that those other variables (sourcesystemid npl), &lt;EM&gt;&lt;STRONG&gt;since they are only read by this conditional set&lt;/STRONG&gt;&lt;/EM&gt;, are automatically retained until the next time this conditional set is executed - so missing periods in table 2 just inherit the most recent table 2 values.&lt;BR /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; I.e this is "last observation carried forward".&lt;/LI&gt;
&lt;LI&gt;The explicit OUTPUT statement followed by the "if last.sourcekey then call missig(of _all_)" is a way to prevent &lt;STRIKE&gt;sourcesystemid and&lt;/STRIKE&gt; npl from being carried forward from one sourcekey to the next.&amp;nbsp; Now if your table 2 ALWAYS has a monthly record matching or preceding the first corresponding sourcekey in table 1, you wouldn't need those two statements.&lt;/LI&gt;
&lt;/OL&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 27 Jul 2020 19:07:45 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672603#M202154</guid>
      <dc:creator>mkeintz</dc:creator>
      <dc:date>2020-07-27T19:07:45Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672604#M202155</link>
      <description>&lt;P&gt;try this:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;Data one;
input snapshotdate sourcekey sourcesystemid value1;
datalines;
20200101 112 5 788
20200102 112 5 789
20200103 112 5 800
20200201 112 5 786
20200202 112 5 777
20200203 112 5 834
20200301 112 5 789
20200302 112 5 771
20200303 112 5 832
20200101 222 6 788
20200102 222 6 789
20200103 222 6 800
20200201 222 6 786
20200202 222 6 777
20200203 222 6 834
20200301 222 6 789
20200302 222 6 771
20200303 222 6 832
;
run;

Data two;
input period sourcekey sourcesystemid npl;
datalines;
202001 112 5 999
202002 112 5 988
202001 222 6 555
202002 222 6 556
;
run;

data oneView / view = oneView;
  set one;
  period = floor(snapshotdate/100);
run;

data want;
  merge oneView two;
  by sourcesystemid sourcekey period;
  _lag_npl_ = lag(npl);
  if missing(npl) then npl = _lag_npl_;
  drop _lag_npl_;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Bart&lt;/P&gt;</description>
      <pubDate>Mon, 27 Jul 2020 16:19:55 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672604#M202155</guid>
      <dc:creator>yabwon</dc:creator>
      <dc:date>2020-07-27T16:19:55Z</dc:date>
    </item>
    <item>
      <title>Re: Left join in HASH with missing values to be replaced with latest available value</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672663#M202174</link>
      <description>&lt;P&gt;A hash approach&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data want(drop=period rc);
   if _N_ = 1 then do;
      dcl hash h(dataset : "two");
      h.definekey("sourcekey", "sourcesystemid", "period");
      h.definedata("npl");
      h.definedone();
   end;

   set one;
   if 0 then set two;

   rc = h.find(key : sourcekey, key : sourcesystemid, key : int(snapshotdate/100));
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Mon, 27 Jul 2020 18:42:06 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Left-join-in-HASH-with-missing-values-to-be-replaced-with-latest/m-p/672663#M202174</guid>
      <dc:creator>PeterClemmensen</dc:creator>
      <dc:date>2020-07-27T18:42:06Z</dc:date>
    </item>
  </channel>
</rss>

