Solved: Creating new variables by comparing values within same dataset

i_r · Posted 12-09-2019 04:51 PM

Hi. I’m reading a file with only two variables, id1 and id2. The file has parent id's that can split into multiple child id's and a child id can further split or produce another child id. There is no link on these records other than id2 can appear as id1 in a subsequent record. I would like to end up with a flattened dataset as the example below. Can you suggest the best way to get this accomplished? Thanks in advance.

data have;
infile datalines delimiter=',';
input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
;
run;

Want

id1 id2 id3

1909307070100103 1909307070103065 1909307070103087

1909307070100103 1909307070103064

1909307070100171 1909307070103061 1909307070103117

1909307070100171 1909307070103060

1909307070100196 1909307070103088

1909307070100196 1909307070103089

1909307070100196 1909307070103090

1909307070100196 1909307070103091

1909307070100201 1909307070103106

1909307070103062 1909307070103105

1909307070103067 1909307070103104

ErikLund_Jensen · Posted 12-10-2019 01:21 PM

Hi @i_r

You can do it with a series of Proc SQL steps. I have made an example and also shown how the SQL step coud be placed in a macro loop, so the code will handle an unknown number of levels in the tree structure.

* Test data - a 4. level is included as last record for test putposes;
data have;
  infile datalines delimiter=',';
  input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
1909307070103087,1909307070103099
run;

/*-----------------------------------------------------------------------
Bulding the tree with a SQL step for each level - steps are manually added
until last level is reached, i.e. no more childs are added as a next
level.
-----------------------------------------------------------------------*/

proc sql noprint;
	create table n1 as 
		select distinct id1 as n1 from have
		where id2 not in (select id1 from have);
quit;

proc sql;
	create table n2 as 
		select a.n1, b.id2 as n2
		from n1 as a left join have as b
		on a.n1 = b.id1 ;
quit;

proc sql;
	create table n3 as 
		select a.n1, a.n2, b.id2 as n3
		from n2 as a left join have as b
		on a.n2 = b.id1 ;
quit;

proc sql;
	create table n4 as 
		select a.n1, a.n2, a.n3, b.id2 as n4
		from n3 as a left join have as b
		on a.n3 = b.id1 ;
quit;

proc sql;
	create table n5 as 
		select a.n1, a.n2, a.n3, a.n4, b.id2 as n5
		from n4 as a left join have as b
		on a.n4 = b.id1 ;
quit;


/*-----------------------------------------------------------------------
Same principle, but with a macro loop to repeat SQL step until last level 
is reached, so an unknown number of levels is processed.
-----------------------------------------------------------------------*/

%macro buildtree;

	* Level 1;
	proc sql noprint;
		create table n1 as 
			select distinct id1 as n1 from have
			where id2 not in (select id1 from have);
	quit;

	* Loop and add levels until the tree is exhausted - next level is empty;
	%let i = 0;
	%do %until (&nextfound = 0);
		%let i = %eval(&i+1);
		proc sql noprint;
			create table n%eval(&i+1) as 
				select a.*, b.id2 as n%eval(&i+1)
				from n&i as a left join have as b
				on a.n&i = b.id1 ;

			select count(*) into :nextfound from n%eval(&i+1)
				where n%eval(&i+1) ne ''; 
		quit;
	%end;

	* Keep last data set with a new level found as Want;
	data want; set n&i; run;

	* Drop intermediate data sets;
	proc datasets lib=work nolist;
		delete n:;
	quit;

%mend;
%buildtree;

View solution in original post

mkeintz · Posted 12-10-2019 01:51 AM

I see you stop at ID3, so I assume that no record can have both a parent record and a child record. If so, then the program below will work.

One note: to me the key phrase in your request is "id2 can appear as id1 in a subsequent record". This means that if you link an ID2 to some upcoming ID1, then when that ID1 is encountered you can skip outputting that record:

data have;
  infile datalines delimiter=',';
  input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
run;

data want (drop=_:);
 merge have have (obs=0 rename=(id1=_idkey id2=id3));
 if _n_=1 then do;
    declare hash lookups(dataset:'have (rename=(id1=_idkey id2=id3))');
      lookups.definekey('_idkey');
      lookups.definedata(all:'Y');
      lookups.definedone();
    declare hash already_found ();
      already_found.definekey('_idkey');
      already_found.definedone();
 end;

 /*If child id found, add _IDKEY (=id2) to the hash of found id's */
 if lookups.find(key:id2)=0 then already_found.add();   

 /* Use "subsettinng IF" to filter only records whose ID1 was not previously found as a child */
 if already_found.find(key:id1)^=0;  
run;

Thank you for generating a working DATA HAVE step.
This program sets up a LOOKUP hash, containing all the data from HAVE. It is keyed on ID1 (renamed as _IDKEY in the hash.
For every incoming record, the ID2 value is searched via the FIND method against all those _IDKEYs. If one is found, then ID3 is will have been successfully retrieved.
If the FIND was successful (lookups.find(key:id2)=0),then the ID2 value is put in the already_found hash.
Any observation whose ID1 is in already_found,will not be output.
The MERGE statement has HAVE a 2nd time with "obs=0". This forces SAS to set _IDKEY and ID3 (in the rename parameter) to be automatically reset to missing with each iteration of the data step.

The only problem with this is that it works only when a child has only 1 child of its own. It would not work with the data below, which has the record 4444444444444444,5555555555555555 is followed by the two records

5555555555555555,6666666666666666 and

5555555555555555,7777777777777777.

Presumable you would want output containing

4444444444444444,5555555555555555,6666666666666666 and

4444444444444444,5555555555555555,7777777777777777.

but the above would not capture the 2nd grandchild.

To accommodate, you can use:

data have;
  infile datalines delimiter=',';
  input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
4444444444444444,5555555555555555
5555555555555555,6666666666666666
5555555555555555,7777777777777777
run;
data want (drop=_:);
 merge have have (obs=0 rename=(id1=_idkey id2=id3));
 if _n_=1 then do;
    declare hash lookups(dataset:'have (rename=(id1=_idkey id2=id3))',multidata:'Y');
      lookups.definekey('_idkey');
      lookups.definedata(all:'Y');
      lookups.definedone();
    declare hash already_found ();
      already_found.definekey('_idkey');
      already_found.definedone();
 end;

 /*If child id found in LOOKUPS, add _IDKEY (=id2) to the hash of found id's */
 if lookups.find(key:id2)=0 then do;
   already_found.add();
   do until (lookups.find_next()^=0);
     output;
   end;
 end;
 /* Output a record if its ID1 was not previously found as a child */
 else if already_found.find(key:id1)^=0 then output;
run;

This changes the LOOKUPS hash to "multidata:'Y'", which allows the hash to contain multiple dataitems (think multiple rows) with tied key values. That in turn allows the "find_next" method to traverse all the tied key dataitems.

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

s_lassen · Posted 12-10-2019 03:38 AM

SQL is probably the simplest:

proc sql;
  create table want as select
    parent.id1,parent.id2,child.id2 as id3
  from have parent left join have child on parent.id2=child.id1
  where parent.id1 not in(select id2 from have);
quit;

But you should consider a couple of questions:

What if there are branches deeper than 1 (an id2 points to a record, and the id2 in that record points to yet another record)? Should you then have an id4 variable?
What do you want if an id2 value points to more than one other record? In the solution shown here, you will get all the combinations, but is that what you want?

i_r · Posted 12-10-2019 10:50 AM

Thanks for your response. There will be branches deeper than my example shows and there could be id4, id5, etc. I would want every unique branch created.

ErikLund_Jensen · Posted 12-10-2019 01:21 PM

Hi @i_r

You can do it with a series of Proc SQL steps. I have made an example and also shown how the SQL step coud be placed in a macro loop, so the code will handle an unknown number of levels in the tree structure.

* Test data - a 4. level is included as last record for test putposes;
data have;
  infile datalines delimiter=',';
  input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
1909307070103087,1909307070103099
run;

/*-----------------------------------------------------------------------
Bulding the tree with a SQL step for each level - steps are manually added
until last level is reached, i.e. no more childs are added as a next
level.
-----------------------------------------------------------------------*/

proc sql noprint;
	create table n1 as 
		select distinct id1 as n1 from have
		where id2 not in (select id1 from have);
quit;

proc sql;
	create table n2 as 
		select a.n1, b.id2 as n2
		from n1 as a left join have as b
		on a.n1 = b.id1 ;
quit;

proc sql;
	create table n3 as 
		select a.n1, a.n2, b.id2 as n3
		from n2 as a left join have as b
		on a.n2 = b.id1 ;
quit;

proc sql;
	create table n4 as 
		select a.n1, a.n2, a.n3, b.id2 as n4
		from n3 as a left join have as b
		on a.n3 = b.id1 ;
quit;

proc sql;
	create table n5 as 
		select a.n1, a.n2, a.n3, a.n4, b.id2 as n5
		from n4 as a left join have as b
		on a.n4 = b.id1 ;
quit;


/*-----------------------------------------------------------------------
Same principle, but with a macro loop to repeat SQL step until last level 
is reached, so an unknown number of levels is processed.
-----------------------------------------------------------------------*/

%macro buildtree;

	* Level 1;
	proc sql noprint;
		create table n1 as 
			select distinct id1 as n1 from have
			where id2 not in (select id1 from have);
	quit;

	* Loop and add levels until the tree is exhausted - next level is empty;
	%let i = 0;
	%do %until (&nextfound = 0);
		%let i = %eval(&i+1);
		proc sql noprint;
			create table n%eval(&i+1) as 
				select a.*, b.id2 as n%eval(&i+1)
				from n&i as a left join have as b
				on a.n&i = b.id1 ;

			select count(*) into :nextfound from n%eval(&i+1)
				where n%eval(&i+1) ne ''; 
		quit;
	%end;

	* Keep last data set with a new level found as Want;
	data want; set n&i; run;

	* Drop intermediate data sets;
	proc datasets lib=work nolist;
		delete n:;
	quit;

%mend;
%buildtree;

i_r · Posted 12-10-2019 04:20 PM

Thank you Erik! This works perfectly for the results I need. You made my day.

Creating new variables by comparing values within same dataset

Re: Creating new variables by comparing values within same dataset

Re: Creating new variables by comparing values within same dataset

Re: Creating new variables by comparing values within same dataset

Re: Creating new variables by comparing values within same dataset

Re: Creating new variables by comparing values within same dataset

Re: Creating new variables by comparing values within same dataset

Classroom Training Available!