Solved: Re: Finding depth and last child

amol123 · Posted 01-17-2018 11:35 AM

I have below dataset 'test' which has two columns name and parent:

Explanation=> a has no parent, a is parent of b, b is parent of c, c is parent of d, e has no parent, e is parent of f, f is parent of g,

so want to found the depth of 'a' and 'e':

i have given the output for your reference:

data test;

input name $ parent $;

datalines;

a

b a

c b

d c

e

f e

g f

;

run;

****Expected Output*****

name depth last_child

a 3 d

e 2 g

I have tried to solve this but didn't get correct output.

s_lassen · Posted 01-18-2018 06:26 AM

I would try something like this:

proc sql;
  create index parent on test(parent);
quit;

data want;
  set test(where=(parent is null));
  parent=name;
  do depth=0 to 100; /* we stop at 100 in case there are circular references in data */
    set test(rename=(name=last_child)) key=parent;
    if _iorc_ then leave;
    parent=last_child;
    end;
  _error_=0; /* _ERROR_ is set when a key is not found, this is not an error here */
  keep name depth last_child;
run;

Using SET with KEY= is normally the easiest way to traverse trees. The value 100 is used as I assume that there are no trees with a depth greater than that - so if you find that value it may mean that you have a circular reference in your data.

View solution in original post

Reeza · Posted 01-17-2018 11:39 AM

Can you show what you tried so we know what approach you're taking?

There are several ways to solve this type of problem.

@amol123 wrote:

I have below dataset 'test' which has two columns name and parent:

Explanation=> a has no parent, a is parent of b, b is parent of c, c is parent of d, e has no parent, e is parent of f, f is parent of g,

so want to found the depth of 'a' and 'e':

i have given the output for your reference:

data test;

input name $ parent $;

datalines;

a

b a

c b

d c

e

f e

g f

;

run;

****Expected Output*****

name depth last_child

a 3 d

e 2 g

I have tried to solve this but didn't get correct output.

s_lassen · Posted 01-18-2018 06:26 AM

I would try something like this:

proc sql;
  create index parent on test(parent);
quit;

data want;
  set test(where=(parent is null));
  parent=name;
  do depth=0 to 100; /* we stop at 100 in case there are circular references in data */
    set test(rename=(name=last_child)) key=parent;
    if _iorc_ then leave;
    parent=last_child;
    end;
  _error_=0; /* _ERROR_ is set when a key is not found, this is not an error here */
  keep name depth last_child;
run;

Using SET with KEY= is normally the easiest way to traverse trees. The value 100 is used as I assume that there are no trees with a depth greater than that - so if you find that value it may mean that you have a circular reference in your data.

amol123 · Posted 01-18-2018 12:16 PM

Thank you so much!!!!

s_lassen · Posted 01-18-2018 07:02 AM

Yet another way of doing this - starting from the bottom. May be better if several nodes can have the same parent:

proc sql;
  create index name on test(name);
  create table want as select
    name as last_child,
    0 as depth,
    parent
  from test
  where name not in(select parent from test);
quit;

data want;
  modify want;
  do while(parent ne ' ');
    name=parent;
    set test key=name;
    depth=depth+1;
    end;
  parent=name;
  replace;
run;

Patrick · Posted 01-18-2018 07:06 AM

@amol123

Have you already searched the communities here. I know for sure that similar questions have been asked in the past (as I've been one of the people asking) and that solutions have been provided.

Ksharp · Posted 01-18-2018 08:12 AM

Does parent could have multiple child , if it was ,then you could end with multiple last child .


data test;
input name $ parent $;
datalines;
a .
b a
c b
d c
e .
f e
g f
;
run;


data have;
 set test(rename=(parent=_start name=_end));
run;

proc sql;
create table parent as
 select *
  from have 
   where _start in (select _end from have where _start is missing);
quit;



data want(keep=path);
if _n_ eq 1 then do;
length path _path  $ 700 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();

declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();

end;


set parent;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
*putlog 'WARNING:Found  ' _end;
   
pa.add();
do while(hi_path.next()=0);
 if n ne 1 then pa.remove(key:_n);_n=n;
 _path=path;   
 _start=scan(path,-1,'|');
 rc=ha.find(); if rc ne 0 then output;
 do while(rc=0);
  if not findw(path,strip(_end),'|') then do;
   if length(path)+length(_end)+1 gt lengthc(path) then do;
    putlog 'ERROR: The length of path and _path are set too short';
    stop;
   end;
   
  * putlog 'WARNING:Found  ' _end;
   count+1;n=count;
   path=catx('|',path,_end);
   pa.add(); 
   path=_path;
 end;
  rc=ha.find_next();
end;
end;
pa.clear();

run;

Registration is open

SAS Training: Just a Click Away