BookmarkSubscribeRSS Feed
gyambqt
Obsidian | Level 7

Hi Expects,

 

Assume I have dataset  below:

 

ID v1 v2 ….. vn
1 a      
2 a b    
3   b   d
4       d
5 a      

 

the desired output I wanted:

Group_id ID v1 v2 ….. vn
1 1 a      
1 2 a b    
1 3   b   d
1 4       d
1 5 a      

 

I want the same value from each column resolved together. ID=1,2,5 resolve together due to same value 'a' in V1. ID=2,3 resolve together in V2 due to same value 'b'. ID=3,4 resolve together in Vn due to same value 'd'.  Because ID=1,2,5  and ID=2,3 and ID=3,4 resolve by v1, v2, Vn respectively so ID=1,2,3,4,5 all resolved together due to indirect resolution among V1, V2,... Vn. This is because ID=1,2,5 and ID=2,3 share same ID=2. ID=2,3 and ID=3,4 share same ID=3. So they all should be assigned same ID.

 

Can someone help me on this? the number of variables used to link rows are not confirmed so I need a program flexible enough to adjust for more or less.

 

 

4 REPLIES 4
PGStats
Opal | Level 21

If you have access to SAS/OR, calculate connected components (groups of connected IDs) with proc optnet:

 

data have;
input ID    (v1  v2 v3) (:$1.);
datalines;
1     a    . .        
2     a   b   .     
3     .    b       d
4      .  .         d
5     a    . .    
;

data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
    if not missing(_v{i}) then do;
        v = _v{i};
        output;
        end;
    end;
keep id v;
run;

proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
quit;

proc optnet data_links=links graph_direction=undirected out_nodes=nodes;
concomp;
run;

proc sql;
create table want as
select 
    a.concomp as group_id,
    b.*
from nodes as a inner join 
    have as b on a.node=b.id
order by b.id;
select * from want;
quit;

PGStats_0-1621997893187.png

 

PG
gyambqt
Obsidian | Level 7
There is no license for SAS/OR. Any idea?
s_lassen
Meteorite | Level 14

If there is no SAS/OR license, you can use keyed access to the links table:

data have;
input ID    (v1  v2 v3) (:$1.);
datalines;
1     a    . .        
2     a   b   .     
3     .    b       d
4      .  .         d
5     a    . .    
;

data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
    if not missing(_v{i}) then do;
        v = _v{i};
        output;
        end;
    end;
keep id v;
run;

proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
create index to on links(to);
quit;

data groups;
  set have;
  to=id;
  ancestor=id;
  do while(1);
    set links key=to;
    if _iorc_ then do;
      _error_=0;
      leave;
      end;
    ancestor=from;
    to=from;
    end;
  drop to from;
run;

proc sort;
  by ancestor;
run;

data want;
   set groups;
   by ancestor;
   group_id+first.ancestor;
   drop ancestor;
run;

  

 

Ksharp
Super User
data temp;
input ID    (v1  v2 v3) (:$1.);
datalines;
1     a    . .        
2     a   b   .     
3     .    b       d
4      .  .         d
5     a    . .    
;

data have(rename=(_id=from v=to));
set temp;
_id=put(id,best. -l);
array _v v:;
do i = 1 to dim(_v);
    if not missing(_v{i}) then do;
        v = _v{i};
        output;
        end;
    end;
keep _id v;
run;




data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;



data final_want;
 if _n_=1 then do;
   if 0 then set want;
   declare hash h(dataset:'want');
   h.definekey('node');
   h.definedata('household');
   h.definedone();
 end;

 set temp;
_id=put(id,best. -l);
if not missing(_id) then h.find(key:_id);
array _v v:;
do i = 1 to dim(_v);
    if not missing(_v{i}) then h.find(key:_v{i});
end;

drop _id i node;
run;

SAS Innovate 2025: Register Today!

 

Join us for SAS Innovate 2025, our biggest and most exciting global event of the year, in Orlando, FL, from May 6-9. Sign up by March 14 for just $795.


Register now!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

SAS Training: Just a Click Away

 Ready to level-up your skills? Choose your own adventure.

Browse our catalog!

Discussion stats
  • 4 replies
  • 657 views
  • 1 like
  • 4 in conversation