BookmarkSubscribeRSS Feed
gyambqt
Obsidian | Level 7

Hi Expects,

 

Assume I have dataset  below:

 

ID v1 v2 ….. vn
1 a      
2 a b    
3   b   d
4       d
5 a      

 

the desired output I wanted:

Group_id ID v1 v2 ….. vn
1 1 a      
1 2 a b    
1 3   b   d
1 4       d
1 5 a      

 

I want the same value from each column resolved together. ID=1,2,5 resolve together due to same value 'a' in V1. ID=2,3 resolve together in V2 due to same value 'b'. ID=3,4 resolve together in Vn due to same value 'd'.  Because ID=1,2,5  and ID=2,3 and ID=3,4 resolve by v1, v2, Vn respectively so ID=1,2,3,4,5 all resolved together due to indirect resolution among V1, V2,... Vn. This is because ID=1,2,5 and ID=2,3 share same ID=2. ID=2,3 and ID=3,4 share same ID=3. So they all should be assigned same ID.

 

Can someone help me on this? the number of variables used to link rows are not confirmed so I need a program flexible enough to adjust for more or less.

 

 

4 REPLIES 4
PGStats
Opal | Level 21

If you have access to SAS/OR, calculate connected components (groups of connected IDs) with proc optnet:

 

data have;
input ID    (v1  v2 v3) (:$1.);
datalines;
1     a    . .        
2     a   b   .     
3     .    b       d
4      .  .         d
5     a    . .    
;

data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
    if not missing(_v{i}) then do;
        v = _v{i};
        output;
        end;
    end;
keep id v;
run;

proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
quit;

proc optnet data_links=links graph_direction=undirected out_nodes=nodes;
concomp;
run;

proc sql;
create table want as
select 
    a.concomp as group_id,
    b.*
from nodes as a inner join 
    have as b on a.node=b.id
order by b.id;
select * from want;
quit;

PGStats_0-1621997893187.png

 

PG
gyambqt
Obsidian | Level 7
There is no license for SAS/OR. Any idea?
s_lassen
Meteorite | Level 14

If there is no SAS/OR license, you can use keyed access to the links table:

data have;
input ID    (v1  v2 v3) (:$1.);
datalines;
1     a    . .        
2     a   b   .     
3     .    b       d
4      .  .         d
5     a    . .    
;

data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
    if not missing(_v{i}) then do;
        v = _v{i};
        output;
        end;
    end;
keep id v;
run;

proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
create index to on links(to);
quit;

data groups;
  set have;
  to=id;
  ancestor=id;
  do while(1);
    set links key=to;
    if _iorc_ then do;
      _error_=0;
      leave;
      end;
    ancestor=from;
    to=from;
    end;
  drop to from;
run;

proc sort;
  by ancestor;
run;

data want;
   set groups;
   by ancestor;
   group_id+first.ancestor;
   drop ancestor;
run;

  

 

Ksharp
Super User
data temp;
input ID    (v1  v2 v3) (:$1.);
datalines;
1     a    . .        
2     a   b   .     
3     .    b       d
4      .  .         d
5     a    . .    
;

data have(rename=(_id=from v=to));
set temp;
_id=put(id,best. -l);
array _v v:;
do i = 1 to dim(_v);
    if not missing(_v{i}) then do;
        v = _v{i};
        output;
        end;
    end;
keep _id v;
run;




data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;



data final_want;
 if _n_=1 then do;
   if 0 then set want;
   declare hash h(dataset:'want');
   h.definekey('node');
   h.definedata('household');
   h.definedone();
 end;

 set temp;
_id=put(id,best. -l);
if not missing(_id) then h.find(key:_id);
array _v v:;
do i = 1 to dim(_v);
    if not missing(_v{i}) then h.find(key:_v{i});
end;

drop _id i node;
run;

Ready to join fellow brilliant minds for the SAS Hackathon?

Build your skills. Make connections. Enjoy creative freedom. Maybe change the world. Registration is now open through August 30th. Visit the SAS Hackathon homepage.

Register today!
How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

Click image to register for webinarClick image to register for webinar

Classroom Training Available!

Select SAS Training centers are offering in-person courses. View upcoming courses for:

View all other training opportunities.

Discussion stats
  • 4 replies
  • 470 views
  • 1 like
  • 4 in conversation