Solved: Creating a Hash Table and Using it in the Same Data Step

theponcer · Posted 07-17-2019 06:14 PM

I am trying to do this in one step.

I could do something like this, but it is not dynamic or efficient:

*ACTUAL ATTEMPT;
*This table shows what each gardener has in their garden;
data have1;
   input ID $1 Name $3-7 Item $9-20;
   datalines;
1 Billy Carrots
1 Billy Peas
1 Billy Tomatoes
2 Willy Peas
2 Willy Rutabegas
2 Willy Celery
3 Jilly Carrots
3 Jilly Peas
3 Jilly Parsley
;
run;

*These are the vegetables I am interested in;
data have2;
   input Item $1-10;
   datalines;
Carrots
Rutabega
Parsley
;
run;

proc sql; create table gardeners as select distinct
Name
from have1;
quit;

%macro indicator (field);
data &field.;
set have1;
if Item = "&field." then
output &field.;
run;
%mend;

%indicator(Carrots);
%indicator(Rutabega);
%indicator(Parsley);

data gardeners_v2;
if _n_=1 then do;
	if 0 then set carrots (drop=ID Item);
		declare hash carrots(dataset:"carrots");
		carrots.definekey("Name");
		carrots.definedata("Name");
		carrots.definedone();
end;
set gardeners;
rc_carrots=carrots.find(); 
if rc_carrots=0 then rc_carrots=1;
else rc_carrots = 0;

if _n_=1 then do;
	if 0 then set rutabega (drop=ID Item);
		declare hash rutabega(dataset:"rutabega");
		rutabega.definekey("Name");
		rutabega.definedata("Name");
		rutabega.definedone();
end;
set gardeners;
rc_rutabega=rutabega.find(); 
if rc_rutabega=0 then rc_rutabega=1;
else rc_rutabega = 0;

if _n_=1 then do;
	if 0 then set parsley (drop=ID Item);
		declare hash parsley(dataset:"parsley");
		parsley.definekey("Name");
		parsley.definedata("Name");
		parsley.definedone();
end;
set gardeners;
rc_parsley=parsley.find(); 
if rc_parsley=0 then rc_parsley=1;
else rc_parsley = 0;

if sum(rc_carrots,rc_rutabega,rc_parsley) >0 then output;
run;

data final;
if _n_=1 then do;
	if 0 then set gardeners_v2;
	dcl hash gardeners(dataset:"gardeners_v2");
	gardeners.definekey("Name");
	gardeners.definedata(all:"yes");
	gardeners.definedone();
 end;
 set have1;
 if gardeners.find()=0;
 run;

Here's what I actually want to do, although I don't know how:

A) Create a dynamic hash table
	1) Get a distinct list of gardeners who have vegetables in the vegetable list
	2) Create an indicator field for each vegetable in the vegetable list
	3) Populate the indicator field if that gardener has that vegetable

B) Join the hash table on the previous table, including only gardeners that match

*This table shows what each gardener has in their garden;
data have1;
   input ID $1 Name $3-7 Item $9-20;
   datalines;
1 Billy Carrots
1 Billy Peas
1 Billy Tomatoes
2 Willy Peas
2 Willy Rutabegas
2 Willy Celery
3 Jilly Carrots
3 Jilly Peas
3 Jilly Parsley
;
run;

*These are the vegetables I am interested in;
data have2;
   input Item $1-10;
   datalines;
Carrots
Rutabega
Parsley
;
run;

*This is the hash table that gets created - it shows only the gardeners that have the vegetables of 
interest, and which ones they have

data want_hash;
	input ID $1 Name $3-7 Carrots $9 Rutabega $11 Parsley $13;
	datalines;
1 Billy 1 0 0
3 Jilly 1 0 1
;
run;

*The final table contains all vegetables from the gardeners who had the vegetables of interest, as well 
as indicating which of the vegetables they had

data want_final;
	input ID $1 Name $3-7 Item $9-16 Carrots $17 Rutabega $19 Parsley $21;
	datalines;
1 Billy Carrots  1 0 0
1 Billy Peas     1 0 0
1 Billy Tomatoes 1 0 0
3 Jilly Carrots  1 0 1
3 Jilly Peas     1 0 1
3 Jilly Parsley  1 0 1
;
run;

Any ideas?

Tom · Posted 07-19-2019 12:27 PM

Is there some reason to want a HASH solution to this problem? Are you just trying to learn how HASH objects work?

If the list of possible names is small enough to fit into a macro variable then you can just use a couple of macro variables to generate a data step that can just process the data.

proc sql noprint;
  select distinct nliteral(item),quote(upcase(trim(item)))
    into :namelist separated by ' ',:itemlist separated by ' '
  from have2
  ;
%let nitems=&sqlobs;
run;

data want ;
do until (last.id);
  set have1 ;
  by id ;
  array flags $1 &namelist ;
  array names[&nitems] $32 _temporary_ (&itemlist);
  if first.id then do _n_=1 to &nitems;
    flags[_n_]='0';
  end;
  _n_=whichc(upcase(item),of names[*]);
  if _n_ in (1:&nitems) then flags[_n_]='1';
end;
do until (last.id);
  set have1;
  by id;
  output;
end;
run;

Obs    ID    Name     Item         Carrots    Parsley    Rutabega

 1     1     Billy    Carrots         1          0          0
 2     1     Billy    Peas            1          0          0
 3     1     Billy    Tomatoes        1          0          0
 4     2     Willy    Peas            0          0          0
 5     2     Willy    Rutabegas       0          0          0
 6     2     Willy    Celery          0          0          0
 7     3     Jilly    Carrots         1          1          0
 8     3     Jilly    Peas            1          1          0
 9     3     Jilly    Parsley         1          1          0

If you want to eliminate Willy then make the OUTPUT statement conditional.

if whichc('1',of flags[*]) then output;

View solution in original post

ChrisNZ · Posted 07-18-2019 12:25 AM

Here is a totally dynamic method:

proc sql; 
  create table FLAGS as
  select ga.*, ve.* , (it.ID is not null) as FLAG
  from (select unique ID, NAME from HAVE1) as ga
         natural join
       (select unique ITEM     from HAVE2) as ve
         left join
       HAVE1                               as it
         on  ga.ID  =it.ID
         and ga.NAME=it.NAME
         and ve.ITEM=it.ITEM;
quit;

proc transpose data=FLAGS out=TR(drop=_NAME_);
  by  ID NAME ;
  id  ITEM;
  var FLAG; 
run;

proc sql; 
  create table FINAL as
  select  ve.* , tr.*
  from (select unique ID, NAME from HAVE1) as ga
         natural join
       (select unique ITEM     from HAVE2) as ve
         left join
       TR                                  as tr
         on  ga.ID  =tr.ID
         and ga.NAME=tr.NAME;    
quit;

ITEM	ID	NAME	Carrots	Parsley
Parsley	1	Billy	1	0
Carrots	1	Billy	1	0
Rutabega	1	Billy	1	0
Parsley	2	Willy	0	0
Carrots	2	Willy	0	0
Rutabega	2	Willy	0	0
Carrots	3	Jilly	1	1
Parsley	3	Jilly	1	1
Rutabega	3	Jilly	1	1

High-Performance SAS Coding - Third Edition

ChrisNZ · Posted 07-18-2019 12:32 AM

You can define table ga as

(select unique ID, NAME from HAVE1
        where ITEM in (select ITEM from HAVE2)) as ga

if you want to remove Willy.

High-Performance SAS Coding - Third Edition

KachiM · Posted 07-18-2019 04:36 AM

Here is one way using hash objects.

One issue with length of ITEM (for lookup in hash table) can be solved by using the same length ($9) in both the Data Sets. This is done as:

data have2;
   input Item :$9.;
datalines;
Carrots
Rutabega
Parsley
;
run;

data have1;
   input ID :$1. Name :$5. Item :$9.;
   datalines;
1 Billy Carrots
1 Billy Peas
1 Billy Tomatoes
2 Willy Peas
2 Willy Rutabegas
2 Willy Celery
3 Jilly Carrots
3 Jilly Peas
3 Jilly Parsley
;
run;

The absence of ID in HAVE2 and resetting an array intermediately used, (IND[ ]), with zeros makes the solution slightly complicated. Here is the Hash solution:

data want;
   if _n_ = 1 then do;
      declare hash h(ordered:'Y');
      h.definekey('Item');
      h.definedata('_iorc_');
      h.definedone();
      do _iorc_ = 1 by 1 until(eof);
         set have2 end = eof;
         h.add();
      end;
   end;
   do until(last.ID);
      set have1;
      by ID notsorted;
      array ind[*] i_Carrots i_Rutabega i_Parsley;
      if h.find() = 0 then ind[_iorc_] = 1;
   end;
   if nmiss(of ind[*]) ne dim(ind) then do;
      do i = 1 to dim(ind);
         if missing(ind[i]) then ind[i] = 0;
      end;
      output;
   end;
drop i Item;
run;

The last part of the program checks for the missing values in the array, IND[ ]. If all cells are missing (like in ID = 2) then it is ignored. Otherwise, cells having missing values are replaced by ZEROs. ( there is no SAS function similar to "call missing").

Best regards

DataSP

theponcer · Posted 07-18-2019 02:19 PM

Neither of these got me quite what I wanted. I will try breaking it up into smaller steps. The first thing I need to do is to get a distinct list of gardeners who have vegetables in the vegetable list. I believe I should be able to do this by creating an empty "gardeners" hash table, and then populating it by testing if the gardener identified has already been output to the gardener hash table.

To illustrate this more clearly, in my examples above, I am looking for gardeners who have any of these three vegetables: Carrots, Rutabegas (spelled wrong, so no match), or Parsley. At this point, my data table would look like this:

ID	Name	Item	rc_vegetables
1	Billy	Carrots	0
1	Billy	Peas	166666
1	Billy	Tomatoes	166666
2	Willy	Peas	166666
2	Willy	Rutabega	166666
2	Willy	Celery	166666
3	Jilly	Carrots	0
3	Jilly	Peas	166666
3	Jilly	Parsley	0

The first observation has rc_vegetables=0. There is no match for "Billy" in the gardeners hash, which is currently empty, so "Billy" is output to the gardeners hash, which now looks like this:

Name

Billy

The second observation has rc_vegetables=166666, so it is ignored. The seventh observation has rc_vegetables=0, and "Jilly" is not included in the "gardeners" hash table, so "Jilly" is output as well. Now the hash table looks like this:

Name

Billy

Jilly

The ninth observation has rc_vegetables=0, but "Jilly" is already included in the "gardeners" hash table, so we move to the next item. This is the end of the file, so the "gardeners" hash table is complete.

I am thinking that logic would look something like this to start with?

data test;
if _n_=1 then do;
*Create a blank hash table that we will populate later;
dcl hash gardeners();
gardeners.definekey("Name");
gardeners.definedata("Name");
gardeners.definedone();

dcl hash vegetables(dataset:"have2");
vegetables.definekey("Item");
vegetables.definedata("Item");
vegetables.definedone();
end;

*Identify gardeners who have the vegetable of interest;
 set have1 end=done;
 rc_vegetables=vegetables.find();

*The code works up to here, but the while loop causes SAS to freeze;

do while (not done);

*If a gardener has a vegetable of interest and has not already been added to the "gardeners" 
hash table, output the gardener to the "gardeners" hash table;

 if rc_vegetables=0 and gardeners.find() ne 0 then
 gardeners.add();

 end;
 run;

KachiM · Posted 07-18-2019 11:41 PM

What is your wanted output? You have shown in the previous post one output with two rows and another output with several rows. In your last post you seem to use 'NAME' as identifier not as ID as understood by me. Even a Data Step solution is possible. Anyway show your expected output so that some efficient solution can be thought of.

theponcer · Posted 07-19-2019 10:47 AM

My wanted output is this:

*The final table contains all vegetables from the gardeners who had the vegetables of interest, as well as indicating which of the vegetables they had
data want_final;
	input ID $1 Name $3-7 Item $9-16 Carrots $17 Rutabega $19 Parsley $21;
	datalines;
1 Billy Carrots  1 0 0
1 Billy Peas     1 0 0
1 Billy Tomatoes 1 0 0
3 Jilly Carrots  1 0 1
3 Jilly Peas     1 0 1
3 Jilly Parsley  1 0 1
;
run;

I think maybe I confused you by including the "want_hash" table as well - I was envisioning this as an intermediate step that would then be used to produce the final result.

Your solution gives me this:

ID	Name	rc_vegetables	Carrots	Rutabega	Parsley
1	Billy	166666	1	0	0
3	Jilly	0	1	0	1

Which is the hash table I want. Is there a way to use this within the same step to produce the final table? I was thinking that if I could define a blank hash table, use your code to populate the hash table, and then merge the hash table onto the original table to get the final desired result, that might work.

Here's my attempt at it using your code:

*These are just lists of the vegetables so that I can change these and make the column names dynamic;

proc sql noprint;
select distinct compress(Item) into :var_list separated by '","'
from have2;
quit;

proc sql noprint;
select distinct compress(Item) into :keep_list separated by ' '
from have2;
quit;


data want gardeners;
   if _n_ = 1 then do;
   *Define the gardeners hash table which will be populated later;
   	  declare hash gardeners();
	  gardeners.definekey("ID","Name");
	  gardeners.definedata("&var_list.");
	  gardeners.definedone();
	*Define the vegetables hash which will be used to determine which gardeners have the
	  vegetable of interest;
      declare hash h(ordered:'Y');
      h.definekey('Item');
      h.definedata('_iorc_');
      h.definedone();
      do _iorc_ = 1 by 1 until(eof);
         set have2 end = eof;
         h.add();
      end;
   end;
   *Determine which of the vegetables of interest each gardener who has at least one 
   vegetable of interest has (Willy is not included in this list because he does not
   have a vegetable of interest);
   do until(last.ID);
      set have1;
      by ID notsorted;
      array ind[*] &keep_list.;
      if h.find() = 0 then ind[_iorc_] = 1;
   end;
   if nmiss(of ind[*]) ne dim(ind) then do;
      do i = 1 to dim(ind);
         if missing(ind[i]) then ind[i] = 0;
      end;
	*Populate both an external table and a hash table that shows which gardeners have a 
	  vegetable of interest, and whihc vegetables they have;
      output gardeners;
	  gardeners.add();
   end;
*Use the newly populated gardeners hash table to create a new table that has all of the 
 vegetables from any gardener with a vegetable of interest, as well as indicating which
 vegetables of interest they have (This part doesn't work);
     set have1;
rc_gardeners=gardeners.find();
if rc_gardeners=0 then output want;
/*drop i Item;*/
run;

I don't think that I am populating the "gardeners" hash table correctly, but I am unable to see it. Is there a way that you can see the hash table? This code produces this result:

Item	ID	Name	rc_vegetables	Carrots	Rutabega	Parsley	i	rc_gardeners
Carrots	1	Billy	0	1	0	0	4	0
Peas	1	Billy	166666	1	0	0	.	0
Tomatoes	1	Billy	166666	1	0	0	4	0

Which is basically what I want, but Jilly should also be included in this table.

Sorry for the confusion, and thanks for the help.

KachiM · Posted 07-19-2019 12:32 PM

I saw your requirement as:

data want_final;
	input ID $1 Name $3-7 Item $9-16 Carrots $17 Rutabega $19 Parsley $21;
	datalines;
1 Billy Carrots  1 0 0
1 Billy Peas     1 0 0
1 Billy Tomatoes 1 0 0
3 Jilly Carrots  1 0 1
3 Jilly Peas     1 0 1
3 Jilly Parsley  1 0 1
;
run;

If you run this Data Step, you get ID, Name and Item. Other three variables are blanks. The two rows produced by my program carry all the information you need.

ID 	Name 	Carrots 	Rutabega 	Parsley
1 	Billy 	1 	0 	0
3 	Jilly 	1 	0 	1

I am very sorry to note that your wishful output ( rows 2 to 3 and 5 to 6) carry no useful information.

ChrisNZ · Posted 07-18-2019 11:56 PM

You should spend the time to ask the full question and show the full output for all cases in the first post, rather than asking us to spend the time to come up with successive solutions to a problem whose definition you keep changing.

High-Performance SAS Coding - Third Edition

Tom · Posted 07-19-2019 12:27 PM

Is there some reason to want a HASH solution to this problem? Are you just trying to learn how HASH objects work?

If the list of possible names is small enough to fit into a macro variable then you can just use a couple of macro variables to generate a data step that can just process the data.

proc sql noprint;
  select distinct nliteral(item),quote(upcase(trim(item)))
    into :namelist separated by ' ',:itemlist separated by ' '
  from have2
  ;
%let nitems=&sqlobs;
run;

data want ;
do until (last.id);
  set have1 ;
  by id ;
  array flags $1 &namelist ;
  array names[&nitems] $32 _temporary_ (&itemlist);
  if first.id then do _n_=1 to &nitems;
    flags[_n_]='0';
  end;
  _n_=whichc(upcase(item),of names[*]);
  if _n_ in (1:&nitems) then flags[_n_]='1';
end;
do until (last.id);
  set have1;
  by id;
  output;
end;
run;

Obs    ID    Name     Item         Carrots    Parsley    Rutabega

 1     1     Billy    Carrots         1          0          0
 2     1     Billy    Peas            1          0          0
 3     1     Billy    Tomatoes        1          0          0
 4     2     Willy    Peas            0          0          0
 5     2     Willy    Rutabegas       0          0          0
 6     2     Willy    Celery          0          0          0
 7     3     Jilly    Carrots         1          1          0
 8     3     Jilly    Peas            1          1          0
 9     3     Jilly    Parsley         1          1          0

If you want to eliminate Willy then make the OUTPUT statement conditional.

if whichc('1',of flags[*]) then output;

theponcer · Posted 07-19-2019 02:29 PM

I was thinking that I would need to use a hash object because the true dataset is very large, and is unsorted. This worked perfectly!

hashman · Posted 07-19-2019 03:29 PM

@theponcer:

First, you cannot do it in a single step since your input files don't have the variables Carrots, Rutabega, Parsley, so HAVE2 must be pre-processed to get the variable list for the compiler in the final step. Second, it makes no sense to make these variables character of different length since by their nature they are Boolean, so they should be numeric. Having said that, there're many ways to skin this cat, including using the hash object if you insist on using it. For example:

data have1 ;                                              
  input ID $ 1 Name $ 3-7 Item $ 9-19 ;                   
  cards ;                                                 
1 Billy Carrots                                           
1 Billy Peas                                              
1 Billy Tomatoes                                          
2 Willy Peas                                              
2 Willy Rutabegas                                         
2 Willy Celery                                            
3 Jilly Carrots                                           
3 Jilly Peas                                              
3 Jilly Parsley                                           
run ;                                                     
                                                          
data have2 ;                                              
  input Item $1-10 ;                                      
  cards ;                                                 
Carrots                                                   
Rutabega                                                  
Parsley                                                   
run ;                                                     
                                                          
proc sql noprint ;                                        
  select unique item into :v separated by " " from have2 ;
quit ;                                                    
                                                          
data want ;                                               
  if _n_ = 1 then do ;                                    
    dcl hash h (dataset:"have2") ;                        
    h.definekey ("item") ;                                
    h.definedone () ;                                     
  end ;                                                   
  do _iorc_ = 1 by 1 until (last.id) ;                    
    set have1 ;                                           
    by id ;                                               
    if h.check() = 0 then _n_ = 0 ;                       
    array v &v ;                                          
    do over v ;                                           
      if vname (v) = item then v = 1 ;                    
    end ;                                                 
  end ;                                                   
  do over v ;                                             
    v = sum (v, 0) ;                                      
  end ;                                                   
  do _iorc_ = 1 to _iorc_ ;                               
    set have1 ;                                           
    if _n_ = 0 then output ;                              
  end ;                                                   
run ;

Kind regards

Paul D.

theponcer · Posted 07-19-2019 03:59 PM

Thanks so much! This is awesome!

hashman · Posted 07-20-2019 01:45 AM

@theponcer:

You're most welcome.

Another interesting question that might arise from your inquiry is how to satisfy it if the input file is not sorted by ID - without sorting it first. Note that most DATA step solutions, including mine, have taken advantage of the DoW loop by explicitly relying on the sorted order. However, it can be done against an unsorted file as well by using the dynamic nature of the hash object - namely, its ability to aggregate in memory on the fly. In order to do this, the algorithm has to be somewhat reversed by keying the hash table by ID and updating its data items for the hash variables like Carrots, Rutabega, Parsley whose names come from HAVE2 depending on the content of each record in HAVE1. One way of doing it may look thusly:

data have1 ;                                              
  input ID $ 1 Name $ 3-7 Item $ 9-19 ;                   
  cards ;                                                 
3 Jilly Parsley                                           
1 Billy Peas                                              
2 Willy Peas                                              
1 Billy Tomatoes                                          
2 Willy Celery                                            
3 Jilly Carrots                                           
2 Willy Rutabegas                                         
3 Jilly Peas                                              
1 Billy Carrots                                           
run ;                                                     
                                                          
data have2 ;                                              
  input Item $1-10 ;                                      
  cards ;                                                 
Carrots                                                   
Rutabega                                                  
Parsley                                                   
run ;                                                     
                                                          
proc sql noprint ;                                        
  select unique item into :v separated by " " from have2 ;
quit ;                                                    
                                                          
data want ;                                               
  if _n_ = 1 then do ;                                    
    dcl hash h() ;                                        
    h.definekey  ("id") ;                                 
    do _n_ = 1 to countw ("&v") ;                         
      h.definedata (scan ("&v", _n_)) ;                   
    end ;                                                 
    h.definedone () ;                                     
    do until (z) ;                                        
      set have1 end = z ;                                 
      array v &v ;                                        
      if h.find() ne 0 then do over v ;                   
        v = 0 ;                                           
      end ;                                               
      _n_ = 0 ;                                           
      do over v ;                                         
        if vname (v) ne item then continue ;              
        v = 1 ;                                           
        _n_ = 1 ;                                         
      end ;                                               
      if _n_ then h.replace() ;                           
    end ;                                                 
  end ;                                                   
  set have1 ;                                             
  if h.find() = 0 ;                                       
run ;

The idea is to aggregate the matching values in the hash table H on the first pass through HAVE1 and then attach them to every record on the second pass through HAVE1 if its ID and ID in the hash table match. Though the output is in the original input order, the requisite aggregates like Carrots, Rutabega, Parsley tagged to each record are as required. Note how the string V generated by SQL is used to dynamically define these hash data variables one at a time.

Kind regards

Paul D.

Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Re: Creating a Hash Table and Using it in the Same Data Step

Classroom Training Available!