Hi. I’m reading a file with only two variables, id1 and id2. The file has parent id's that can split into multiple child id's and a child id can further split or produce another child id. There is no link on these records other than id2 can appear as id1 in a subsequent record. I would like to end up with a flattened dataset as the example below. Can you suggest the best way to get this accomplished? Thanks in advance.
data have;
infile datalines delimiter=',';
input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
;
run;
Want
id1 id2 id3
1909307070100103 1909307070103065 1909307070103087
1909307070100103 1909307070103064
1909307070100171 1909307070103061 1909307070103117
1909307070100171 1909307070103060
1909307070100196 1909307070103088
1909307070100196 1909307070103089
1909307070100196 1909307070103090
1909307070100196 1909307070103091
1909307070100201 1909307070103106
1909307070103062 1909307070103105
1909307070103067 1909307070103104
Hi @i_r
You can do it with a series of Proc SQL steps. I have made an example and also shown how the SQL step coud be placed in a macro loop, so the code will handle an unknown number of levels in the tree structure.
* Test data - a 4. level is included as last record for test putposes;
data have;
infile datalines delimiter=',';
input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
1909307070103087,1909307070103099
run;
/*-----------------------------------------------------------------------
Bulding the tree with a SQL step for each level - steps are manually added
until last level is reached, i.e. no more childs are added as a next
level.
-----------------------------------------------------------------------*/
proc sql noprint;
create table n1 as
select distinct id1 as n1 from have
where id2 not in (select id1 from have);
quit;
proc sql;
create table n2 as
select a.n1, b.id2 as n2
from n1 as a left join have as b
on a.n1 = b.id1 ;
quit;
proc sql;
create table n3 as
select a.n1, a.n2, b.id2 as n3
from n2 as a left join have as b
on a.n2 = b.id1 ;
quit;
proc sql;
create table n4 as
select a.n1, a.n2, a.n3, b.id2 as n4
from n3 as a left join have as b
on a.n3 = b.id1 ;
quit;
proc sql;
create table n5 as
select a.n1, a.n2, a.n3, a.n4, b.id2 as n5
from n4 as a left join have as b
on a.n4 = b.id1 ;
quit;
/*-----------------------------------------------------------------------
Same principle, but with a macro loop to repeat SQL step until last level
is reached, so an unknown number of levels is processed.
-----------------------------------------------------------------------*/
%macro buildtree;
* Level 1;
proc sql noprint;
create table n1 as
select distinct id1 as n1 from have
where id2 not in (select id1 from have);
quit;
* Loop and add levels until the tree is exhausted - next level is empty;
%let i = 0;
%do %until (&nextfound = 0);
%let i = %eval(&i+1);
proc sql noprint;
create table n%eval(&i+1) as
select a.*, b.id2 as n%eval(&i+1)
from n&i as a left join have as b
on a.n&i = b.id1 ;
select count(*) into :nextfound from n%eval(&i+1)
where n%eval(&i+1) ne '';
quit;
%end;
* Keep last data set with a new level found as Want;
data want; set n&i; run;
* Drop intermediate data sets;
proc datasets lib=work nolist;
delete n:;
quit;
%mend;
%buildtree;
I see you stop at ID3, so I assume that no record can have both a parent record and a child record. If so, then the program below will work.
One note: to me the key phrase in your request is "id2 can appear as id1 in a subsequent record". This means that if you link an ID2 to some upcoming ID1, then when that ID1 is encountered you can skip outputting that record:
data have;
infile datalines delimiter=',';
input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
run;
data want (drop=_:);
merge have have (obs=0 rename=(id1=_idkey id2=id3));
if _n_=1 then do;
declare hash lookups(dataset:'have (rename=(id1=_idkey id2=id3))');
lookups.definekey('_idkey');
lookups.definedata(all:'Y');
lookups.definedone();
declare hash already_found ();
already_found.definekey('_idkey');
already_found.definedone();
end;
/*If child id found, add _IDKEY (=id2) to the hash of found id's */
if lookups.find(key:id2)=0 then already_found.add();
/* Use "subsettinng IF" to filter only records whose ID1 was not previously found as a child */
if already_found.find(key:id1)^=0;
run;
The only problem with this is that it works only when a child has only 1 child of its own. It would not work with the data below, which has the record 4444444444444444,5555555555555555 is followed by the two records
5555555555555555,6666666666666666 and
5555555555555555,7777777777777777.
Presumable you would want output containing
4444444444444444,5555555555555555,6666666666666666 and
4444444444444444,5555555555555555,7777777777777777.
but the above would not capture the 2nd grandchild.
To accommodate, you can use:
data have;
infile datalines delimiter=',';
input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
4444444444444444,5555555555555555
5555555555555555,6666666666666666
5555555555555555,7777777777777777
run;
data want (drop=_:);
merge have have (obs=0 rename=(id1=_idkey id2=id3));
if _n_=1 then do;
declare hash lookups(dataset:'have (rename=(id1=_idkey id2=id3))',multidata:'Y');
lookups.definekey('_idkey');
lookups.definedata(all:'Y');
lookups.definedone();
declare hash already_found ();
already_found.definekey('_idkey');
already_found.definedone();
end;
/*If child id found in LOOKUPS, add _IDKEY (=id2) to the hash of found id's */
if lookups.find(key:id2)=0 then do;
already_found.add();
do until (lookups.find_next()^=0);
output;
end;
end;
/* Output a record if its ID1 was not previously found as a child */
else if already_found.find(key:id1)^=0 then output;
run;
This changes the LOOKUPS hash to "multidata:'Y'", which allows the hash to contain multiple dataitems (think multiple rows) with tied key values. That in turn allows the "find_next" method to traverse all the tied key dataitems.
SQL is probably the simplest:
proc sql;
create table want as select
parent.id1,parent.id2,child.id2 as id3
from have parent left join have child on parent.id2=child.id1
where parent.id1 not in(select id2 from have);
quit;
But you should consider a couple of questions:
Thanks for your response. There will be branches deeper than my example shows and there could be id4, id5, etc. I would want every unique branch created.
Hi @i_r
You can do it with a series of Proc SQL steps. I have made an example and also shown how the SQL step coud be placed in a macro loop, so the code will handle an unknown number of levels in the tree structure.
* Test data - a 4. level is included as last record for test putposes;
data have;
infile datalines delimiter=',';
input id1:$16. id2:$16.;
datalines;
1909307070100103,1909307070103065
1909307070100103,1909307070103064
1909307070100171,1909307070103061
1909307070100171,1909307070103060
1909307070100196,1909307070103088
1909307070100196,1909307070103089
1909307070100196,1909307070103090
1909307070100196,1909307070103091
1909307070100201,1909307070103106
1909307070103061,1909307070103117
1909307070103062,1909307070103105
1909307070103065,1909307070103087
1909307070103067,1909307070103104
1909307070103087,1909307070103099
run;
/*-----------------------------------------------------------------------
Bulding the tree with a SQL step for each level - steps are manually added
until last level is reached, i.e. no more childs are added as a next
level.
-----------------------------------------------------------------------*/
proc sql noprint;
create table n1 as
select distinct id1 as n1 from have
where id2 not in (select id1 from have);
quit;
proc sql;
create table n2 as
select a.n1, b.id2 as n2
from n1 as a left join have as b
on a.n1 = b.id1 ;
quit;
proc sql;
create table n3 as
select a.n1, a.n2, b.id2 as n3
from n2 as a left join have as b
on a.n2 = b.id1 ;
quit;
proc sql;
create table n4 as
select a.n1, a.n2, a.n3, b.id2 as n4
from n3 as a left join have as b
on a.n3 = b.id1 ;
quit;
proc sql;
create table n5 as
select a.n1, a.n2, a.n3, a.n4, b.id2 as n5
from n4 as a left join have as b
on a.n4 = b.id1 ;
quit;
/*-----------------------------------------------------------------------
Same principle, but with a macro loop to repeat SQL step until last level
is reached, so an unknown number of levels is processed.
-----------------------------------------------------------------------*/
%macro buildtree;
* Level 1;
proc sql noprint;
create table n1 as
select distinct id1 as n1 from have
where id2 not in (select id1 from have);
quit;
* Loop and add levels until the tree is exhausted - next level is empty;
%let i = 0;
%do %until (&nextfound = 0);
%let i = %eval(&i+1);
proc sql noprint;
create table n%eval(&i+1) as
select a.*, b.id2 as n%eval(&i+1)
from n&i as a left join have as b
on a.n&i = b.id1 ;
select count(*) into :nextfound from n%eval(&i+1)
where n%eval(&i+1) ne '';
quit;
%end;
* Keep last data set with a new level found as Want;
data want; set n&i; run;
* Drop intermediate data sets;
proc datasets lib=work nolist;
delete n:;
quit;
%mend;
%buildtree;
Thank you Erik! This works perfectly for the results I need. You made my day.
Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 25. Read more here about why you should contribute and what is in it for you!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.