
I'm trying to convert a SQL step to a hash-object lookup, but each of the two solutions I've found has a problem, summarized in this example: for key 1 the variable TS is not carried into the result correctly, and for key 2 the duplicate rows are lost. I need a hash approach that solves both problems at the same time:

 

data T_A_PD_PTFDN_OPERAZIONI;
input k TS value2;
datalines;
1 1 300
1 2 300
2 111 300
;
data T_A_PD_PTFDN_BENE;
input k TS value1;
datalines;
1 1 300
1 1 300
2 111 300
2 111 400
;
 

proc sql noprint;
   create table work.W1140ABS_SQL as
   select distinct
          AA.k
         ,OPE.TS
         ,AA.value1
         ,OPE.value2
   from  WORK.T_A_PD_PTFDN_OPERAZIONI as OPE
        ,WORK.T_A_PD_PTFDN_BENE as AA
   where AA.k = OPE.k
   ;
quit;
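For reference, on the sample data above this SQL step should produce four distinct rows, which is the result the hash approach needs to reproduce (derived by hand from the join, so please verify against your own run):

/* Expected work.W1140ABS_SQL:
     k   TS  value1  value2
     1    1    300     300
     1    2    300     300
     2  111    300     300
     2  111    400     300
*/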

/* wrong for k=1,
   correct for k=2 */
data W1140ABS_hash_1;
  if 0 then set WORK.T_A_PD_PTFDN_OPERAZIONI
                WORK.T_A_PD_PTFDN_BENE;
  declare hash h_merge(multidata:'y',dataset:"WORK.T_A_PD_PTFDN_OPERAZIONI");
  rc = h_merge.DefineKey("k");
  rc = h_merge.DefineData(all:'yes');
  rc = h_merge.DefineDone();
  drop rc;
  rc = h_merge.find();
  do while (not eof);
    set WORK.T_A_PD_PTFDN_BENE end=eof;
    if h_merge.find() = 0;
    output;
    rc = h_merge.find_next();
  end;
run;

/* correct for k=1,
   wrong for k=2 */
data W1140ABS_hash_2;
  if 0 then set WORK.T_A_PD_PTFDN_OPERAZIONI
                WORK.T_A_PD_PTFDN_BENE;
  if _n_=1 then do;
    declare hash h_merge(dataset:"T_A_PD_PTFDN_OPERAZIONI",multidata:'Y');
    rc = h_merge.DefineKey("k");
    rc = h_merge.DefineData(all:'yes');
    rc = h_merge.DefineDone();
    declare hash h_left();
    rc = h_left.DefineKey("k");
    rc = h_left.DefineData("rc");
    rc = h_left.DefineDone();
  end;
  set WORK.T_A_PD_PTFDN_BENE end=eof;
  if 0=h_left.add() then do;
    rc = h_merge.find();
    do while(not rc);
      output;
      rc = h_merge.find_next();
    end;
  end;
  drop rc;
run;

 

1 ACCEPTED SOLUTION (Tom's reply, below)

4 REPLIES
Tom (Super User)

To mimic the DISTINCT keyword in your SQL SELECT you need either a second HASH object to hold the output, or a PROC SORT NODUPKEY step afterwards.

 

First, let's recreate your datasets with easier-to-type names.

data OPE;
  input k TS value2;
datalines;
1 1 300
1 2 300
2 111 300
;
data AA;
  input k TS value1;
datalines;
1 1 300
1 1 300
2 111 300
2 111 400
;

Now let's use one HASH object to hold the OPE data and a second to hold the result set. Then we just read the other dataset, check for a match, and add each matching combination to the result hash when it is not already there.

data _null_ ;
  if 0 then set ope(keep=k ts value2) aa(keep=value1);   /* define the host variables */
  if _n_=1 then do;
    /* OPE rows, keyed on k, duplicate keys kept */
    declare hash h_ope(multidata:'y',dataset:"ope");
    rc = h_ope.DefineKey("k");
    rc = h_ope.DefineData("value2","TS");
    rc = h_ope.DefineDone();

    /* distinct result rows */
    declare hash h_result(ordered:'YES');
    rc = h_result.DefineKey("k", "TS", "value1", "value2");
    rc = h_result.DefineData("k", "TS", "value1", "value2");
    rc = h_result.DefineDone();
  end;

  set aa end=eof;
  /* for every OPE row with this k, add the combination if not already present */
  if 0=h_ope.find() then do until(h_ope.find_next());
    if h_result.find() then h_result.add();
  end;

  if eof then h_result.output(dataset:"want_hash");   /* write the distinct result */
run;

If the result set is too big to stuff into a hash then just write out ALL of the matches and use PROC SORT with NODUPKEY to remove the extra observations.

data want_hash(keep=k TS value1 value2) ;
  if 0 then set ope(keep=k ts value2) aa(keep=value1);
  if _n_=1 then do;
    declare hash h_ope(multidata:'y',dataset:"ope");
    rc = h_ope.DefineKey("k");
    rc = h_ope.DefineData("value2","TS");
    rc = h_ope.DefineDone();
  end;

  set aa;
  if 0=h_ope.find() then do until(h_ope.find_next());
    output;
  end;
run;

proc sort data=want_hash nodupkey;
  by k TS value1 value2;
run;

It might be better to use AA as the dataset that is in the HASH object since then there is only one data value to store.

data want_hash(keep=k ts value1 value2) ;
  if 0 then set ope(keep=k ts value2) aa(keep=value1);
  if _n_=1 then do;
    declare hash h_aa(multidata:'y',dataset:"aa");
    rc = h_aa.DefineKey("k");
    rc = h_aa.DefineData("value1");
    rc = h_aa.DefineDone();
  end;

  set ope ;
  if 0=h_aa.find() then do until(h_aa.find_next());
    output;
  end;
run;

proc sort data=want_hash nodupkey;
  by k TS value1 value2;
run;

 

mkeintz (PROC Star)

@Tom wrote:

If the result set is too big to stuff into a hash then just write out ALL of the matches and use PROC SORT with NODUPKEY to remove the extra observations.

But if the dataset AA is sorted by the match key K, as it is in the sample data, you could reduce memory usage and avoid the PROC SORT NODUPKEY follow-up step by outputting each item from h_result at the end of each group of identical k values, and then clearing the h_result object in preparation for the next k-group. The needed changes are shown in the code below:

 

data want_h;
  if 0 then set ope(keep=k ts value2) aa(keep=value1);
  if _n_=1 then do;
    declare hash h_ope(multidata:'y',dataset:"ope");
    rc = h_ope.DefineKey("k");
    rc = h_ope.DefineData("value2","TS");
    rc = h_ope.DefineDone();

    declare hash h_result(ordered:'YES');
    rc = h_result.DefineKey("k", "TS", "value1", "value2");
    rc = h_result.DefineData("k", "TS", "value1", "value2");
    rc = h_result.DefineDone();
    declare hiter i ('h_result');
  end;

  set aa ;
  by k;

  if 0=h_ope.find() then do until(h_ope.find_next());
    if h_result.find() then h_result.add();
  end;

  if last.k;
  do while (i.next()=0);
    output;
  end;
  h_result.clear();
run;

And if the h_ope object also strains available memory, and OPE is likewise sorted by k, then one could do a similar group-by-group filling and clearing of the h_ope object.
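For illustration, here is one minimal sketch of that group-by-group idea. It assumes both OPE and AA are sorted by k (the output dataset name want_groupwise and the in= flag names are made up for this example), and it interleaves the two sorted datasets with a single SET ... BY statement so that, within each k value, the OPE rows arrive just before the AA rows; each hash then only ever holds one k-group at a time:

data want_groupwise(keep=k TS value1 value2);
  /* sketch only: assumes both ope and aa are sorted by k */
  if _n_=1 then do;
    /* holds only the OPE rows of the current k-group (duplicate keys kept) */
    declare hash h_ope(multidata:'y');
    rc = h_ope.DefineKey('k');
    rc = h_ope.DefineData('TS','value2');
    rc = h_ope.DefineDone();

    /* holds the distinct result rows of the current k-group */
    declare hash h_result(ordered:'y');
    rc = h_result.DefineKey('k','TS','value1','value2');
    rc = h_result.DefineData('k','TS','value1','value2');
    rc = h_result.DefineDone();
    declare hiter i('h_result');
  end;

  /* interleaving: within each k value, rows from OPE (listed first) are read before rows from AA */
  set ope(in=inope keep=k TS value2) aa(in=inaa keep=k value1);
  by k;

  if inope then rc = h_ope.add();            /* fill the current group's OPE hash   */
  else if inaa then do;                      /* AA row: look up every OPE match     */
    if 0 = h_ope.find() then do until(h_ope.find_next());
      if h_result.find() then rc = h_result.add();   /* keep distinct rows only     */
    end;
  end;

  if last.k then do;                         /* end of the k-group: write it out    */
    do while (i.next() = 0);
      output;
    end;
    rc = h_result.clear();                   /* reset both hashes for the next k    */
    rc = h_ope.clear();
  end;
run;

Because both hashes are cleared at every last.k, memory never needs to hold more than the largest single k-group rather than the whole OPE table.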

 

mariopellegrini (Pyrite | Level 9)

Thanks Tom, it works now. Just one question, if you can hazard a guess: applying this code to the complete tables, the processing time is only slightly reduced, from 3'10" to 2'37". We are talking about a table of about 40 million rows and one of 38 million rows with 14 variables. Maybe in the end it's not very advantageous to apply hash tables in this specific case? Do you think it is possible to significantly improve the processing time?

Tom (Super User)

The best way to improve performance is to avoid a Cartesian product of two large tables.

 

If you have the data sorted by the key variables and you just want a normal one-to-one or one-to-many merge, then a DATA step MERGE will be much faster than anything you could build with SQL.

data want ;
  merge ope(in=in1) aa(in=in2 keep=k value1);
  by k;
  if in1 and in2;
run;

Whether or not that is possible depends on what you are actually trying to do.
