Hi Expects,
Assume I have dataset below:
ID | v1 | v2 | ….. | vn |
1 | a | |||
2 | a | b | ||
3 | b | d | ||
4 | d | |||
5 | a |
the desired output I wanted:
Group_id | ID | v1 | v2 | ….. | vn |
1 | 1 | a | |||
1 | 2 | a | b | ||
1 | 3 | b | d | ||
1 | 4 | d | |||
1 | 5 | a |
I want the same value from each column resolved together. ID=1,2,5 resolve together due to same value 'a' in V1. ID=2,3 resolve together in V2 due to same value 'b'. ID=3,4 resolve together in Vn due to same value 'd'. Because ID=1,2,5 and ID=2,3 and ID=3,4 resolve by v1, v2, Vn respectively so ID=1,2,3,4,5 all resolved together due to indirect resolution among V1, V2,... Vn. This is because ID=1,2,5 and ID=2,3 share same ID=2. ID=2,3 and ID=3,4 share same ID=3. So they all should be assigned same ID.
Can someone help me on this? the number of variables used to link rows are not confirmed so I need a program flexible enough to adjust for more or less.
If you have access to SAS/OR, calculate connected components (groups of connected IDs) with proc optnet:
data have;
input ID (v1 v2 v3) (:$1.);
datalines;
1 a . .
2 a b .
3 . b d
4 . . d
5 a . .
;
data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
if not missing(_v{i}) then do;
v = _v{i};
output;
end;
end;
keep id v;
run;
proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
quit;
proc optnet data_links=links graph_direction=undirected out_nodes=nodes;
concomp;
run;
proc sql;
create table want as
select
a.concomp as group_id,
b.*
from nodes as a inner join
have as b on a.node=b.id
order by b.id;
select * from want;
quit;
If there is no SAS/OR license, you can use keyed access to the links table:
data have;
input ID (v1 v2 v3) (:$1.);
datalines;
1 a . .
2 a b .
3 . b d
4 . . d
5 a . .
;
data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
if not missing(_v{i}) then do;
v = _v{i};
output;
end;
end;
keep id v;
run;
proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
create index to on links(to);
quit;
data groups;
set have;
to=id;
ancestor=id;
do while(1);
set links key=to;
if _iorc_ then do;
_error_=0;
leave;
end;
ancestor=from;
to=from;
end;
drop to from;
run;
proc sort;
by ancestor;
run;
data want;
set groups;
by ancestor;
group_id+first.ancestor;
drop ancestor;
run;
data temp; input ID (v1 v2 v3) (:$1.); datalines; 1 a . . 2 a b . 3 . b d 4 . . d 5 a . . ; data have(rename=(_id=from v=to)); set temp; _id=put(id,best. -l); array _v v:; do i = 1 to dim(_v); if not missing(_v{i}) then do; v = _v{i}; output; end; end; keep _id v; run; data full; set have end=last; if _n_ eq 1 then do; declare hash h(); h.definekey('node'); h.definedata('node'); h.definedone(); end; output; node=from; h.replace(); from=to; to=node; output; node=from; h.replace(); if last then h.output(dataset:'node'); drop node; run; data want(keep=node household); declare hash ha(ordered:'a'); declare hiter hi('ha'); ha.definekey('count'); ha.definedata('last'); ha.definedone(); declare hash _ha(hashexp: 20); _ha.definekey('key'); _ha.definedone(); if 0 then set full; declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y'); from_to.definekey('from'); from_to.definedata('to'); from_to.definedone(); if 0 then set node; declare hash no(dataset:'node'); declare hiter hi_no('no'); no.definekey('node'); no.definedata('node'); no.definedone(); do while(hi_no.next()=0); household+1; output; count=1; key=node;_ha.add(); last=node;ha.add(); rc=hi.first(); do while(rc=0); from=last;rx=from_to.find(); do while(rx=0); key=to;ry=_ha.check(); if ry ne 0 then do; node=to;output;rr=no.remove(key:node); key=to;_ha.add(); count+1; last=to;ha.add(); end; rx=from_to.find_next(); end; rc=hi.next(); end; ha.clear();_ha.clear(); end; stop; run; data final_want; if _n_=1 then do; if 0 then set want; declare hash h(dataset:'want'); h.definekey('node'); h.definedata('household'); h.definedone(); end; set temp; _id=put(id,best. -l); if not missing(_id) then h.find(key:_id); array _v v:; do i = 1 to dim(_v); if not missing(_v{i}) then h.find(key:_v{i}); end; drop _id i node; run;
Join us for SAS Innovate 2025, our biggest and most exciting global event of the year, in Orlando, FL, from May 6-9. Sign up by March 14 for just $795.
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.