Solved: Re: Grouping entities - Parent Subsidiary relationship

TaneMahuta · Posted 09-29-2019 05:13 PM

Hello,

I have a data set with 2 variables. Parent and Subsidiary. There are approx. 12 levels of parent-subsidiary relationships which I want to be grouped. In the example data there are two groups and ten entities. I am using SAS EG 7.1

Example Have:

Parent	Subsidiary
1	3
1	4
2	5
2	6
3	7
3	8
4	9
4	10

Example Want:

Group_Key	Entity_Key
1	1
1	3
1	4
1	7
1	8
1	9
1	10
2	2
2	5
2	6

Patrick · Posted 09-29-2019 06:56 PM

Here you go

data have;
  infile datalines truncover dlm=' ';
  input Parent  Subsidiary;
  datalines;
1 3
1 4
2 5
2 6
3 7
3 8
4 9
4 10
11 12
12 13
13 11
;

data want(drop=Child);
  if _n_=1 then 
    do;
      if 0 then set have(rename=(Parent=Root Subsidiary=Child));
      dcl hash h1(dataset:'have(rename=(Parent=Root Subsidiary=Child) where=(Root ne Child))');
      h1.defineKey('Child');
      h1.defineData('Root');
      h1.defineDone();
    end;
  set have;

  Root=Parent;
  Depth=1;
  do while(h1.check(key:Root)=0);
    h1.find(key:Root);
    Depth+1;
    if Depth=1000 then 
      do;
        put 'WARNING: Circular relationship. Process will not continue';
        call missing(root);
        leave;
      end;
  end;

run;

proc print;
run;

I haven't added the rows where parent and child have the same value as this didn't make sense to me. It would be easy enough though to add code which constructs such additional rows.

View solution in original post

Patrick · Posted 09-29-2019 06:56 PM

Here you go

data have;
  infile datalines truncover dlm=' ';
  input Parent  Subsidiary;
  datalines;
1 3
1 4
2 5
2 6
3 7
3 8
4 9
4 10
11 12
12 13
13 11
;

data want(drop=Child);
  if _n_=1 then 
    do;
      if 0 then set have(rename=(Parent=Root Subsidiary=Child));
      dcl hash h1(dataset:'have(rename=(Parent=Root Subsidiary=Child) where=(Root ne Child))');
      h1.defineKey('Child');
      h1.defineData('Root');
      h1.defineDone();
    end;
  set have;

  Root=Parent;
  Depth=1;
  do while(h1.check(key:Root)=0);
    h1.find(key:Root);
    Depth+1;
    if Depth=1000 then 
      do;
        put 'WARNING: Circular relationship. Process will not continue';
        call missing(root);
        leave;
      end;
  end;

run;

proc print;
run;

I haven't added the rows where parent and child have the same value as this didn't make sense to me. It would be easy enough though to add code which constructs such additional rows.

TaneMahuta · Posted 09-29-2019 09:06 PM

Thank you Patrick, your answer resolves my problem.

The reason for including rows where parent and child have the same value is so I am able to query all entities within the group.

Thanks again.

Ksharp · Posted 10-01-2019 01:03 AM

Patrick,

What if there two parent have the same child . Like:

Parent	Subsidiary
1	3
1	4
2	4
2	6

Patrick · Posted 10-01-2019 06:04 AM

@Ksharp

Then obviously my code wouldn't work. I've had something like a hierarchy in mind when providing code.

Neither the sample data nor the narrative did indicate that there can be more than one parent.

Ksharp · Posted 10-01-2019 12:58 AM


data have;
infile cards expandtabs ;
input from $  to $ ;
cards;
1	3
1	4
2	5
2	6
3	7
3	8
4	9
4	10
;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

TaneMahuta · Posted 10-01-2019 10:47 PM

Hi Ksharp,

Thank you for your reply.

In your solution a new id is created, what I'm looking to do is retain the first parent as the group id. For example, company_a is the parent of company_b who is the parent of company_c and company_d. Company a, b,c and d therefore form a group headed by company_a.

Have:

From_Key	To_Key
a	b
b	c
b	d

Want:

Group_Key	To_Key
a	a
a	b
a	c
a	d

You raise a good point in reply to Patrick. In the data there should not be any entity with more than one parent, although there are errors in the data where the relationship between entities has been recorded in error. This results in circular relationships (as Patrick mentions) and entities with 2 or more parents.

Ksharp · Posted 10-03-2019 08:28 AM

OK. Try this one.

data have;
infile cards expandtabs;
input _start $ _end $;
cards;
1	3
1	4
2	5
2	6
3	7
3	8
4	9
4	10
a	b
;
run;
 
proc sql;
create table ancentor as
select * from have 
 where _start not in (select _end from have);
quit;


data want(keep=path);
if _n_ eq 1 then do;
length path _path  $ 800 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();

declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();

end;


set ancentor ;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
*putlog 'WARNING:Found  ' _end;
   
pa.add();
do while(hi_path.next()=0);
 if n ne 1 then pa.remove(key:_n);_n=n;
 _path=path;   
 _start=scan(path,-1,'|');
 rc=ha.find();  if rc ne 0 then output; 
 do while(rc=0);
  if not findw(path,strip(_end),'|') then do;
   if length(path)+length(_end)+1 gt lengthc(path) then do;
    putlog 'ERROR: The length of path and _path are set too short';
    stop;
   end;
   
   *putlog 'WARNING:Found  ' _end;
   count+1;n=count;
   path=catx('|',path,_end);
   pa.add(); 
   path=_path;
 end;   else output; 
  rc=ha.find_next();
end;
end;
pa.clear();

run;
data want1;
 set want;
 group_key=scan(path,1,'|');
 do i=1 to countw(path,'|');
  to_key=scan(path,i,'|');
  output;
 end;
drop i path;
run;
proc sort data=want1 out=want2 nodupkey;
by group_key to_key;
run;

SAS Innovate 2025: Call for Content