Hi Expects,
Assume I have dataset below:
ID | v1 | v2 | ….. | vn |
1 | a | |||
2 | a | b | ||
3 | b | d | ||
4 | d | |||
5 | a |
the desired output I wanted:
Group_id | ID | v1 | v2 | ….. | vn |
1 | 1 | a | |||
1 | 2 | a | b | ||
1 | 3 | b | d | ||
1 | 4 | d | |||
1 | 5 | a |
I want the same value from each column resolved together. ID=1,2,5 resolve together due to same value 'a' in V1. ID=2,3 resolve together in V2 due to same value 'b'. ID=3,4 resolve together in Vn due to same value 'd'. Because ID=1,2,5 and ID=2,3 and ID=3,4 resolve by v1, v2, Vn respectively so ID=1,2,3,4,5 all resolved together due to indirect resolution among V1, V2,... Vn. This is because ID=1,2,5 and ID=2,3 share same ID=2. ID=2,3 and ID=3,4 share same ID=3. So they all should be assigned same ID.
Can someone help me on this? the number of variables used to link rows are not confirmed so I need a program flexible enough to adjust for more or less.
If you have access to SAS/OR, calculate connected components (groups of connected IDs) with proc optnet:
data have;
input ID (v1 v2 v3) (:$1.);
datalines;
1 a . .
2 a b .
3 . b d
4 . . d
5 a . .
;
data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
if not missing(_v{i}) then do;
v = _v{i};
output;
end;
end;
keep id v;
run;
proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
quit;
proc optnet data_links=links graph_direction=undirected out_nodes=nodes;
concomp;
run;
proc sql;
create table want as
select
a.concomp as group_id,
b.*
from nodes as a inner join
have as b on a.node=b.id
order by b.id;
select * from want;
quit;
If there is no SAS/OR license, you can use keyed access to the links table:
data have;
input ID (v1 v2 v3) (:$1.);
datalines;
1 a . .
2 a b .
3 . b d
4 . . d
5 a . .
;
data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
if not missing(_v{i}) then do;
v = _v{i};
output;
end;
end;
keep id v;
run;
proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
create index to on links(to);
quit;
data groups;
set have;
to=id;
ancestor=id;
do while(1);
set links key=to;
if _iorc_ then do;
_error_=0;
leave;
end;
ancestor=from;
to=from;
end;
drop to from;
run;
proc sort;
by ancestor;
run;
data want;
set groups;
by ancestor;
group_id+first.ancestor;
drop ancestor;
run;
data temp; input ID (v1 v2 v3) (:$1.); datalines; 1 a . . 2 a b . 3 . b d 4 . . d 5 a . . ; data have(rename=(_id=from v=to)); set temp; _id=put(id,best. -l); array _v v:; do i = 1 to dim(_v); if not missing(_v{i}) then do; v = _v{i}; output; end; end; keep _id v; run; data full; set have end=last; if _n_ eq 1 then do; declare hash h(); h.definekey('node'); h.definedata('node'); h.definedone(); end; output; node=from; h.replace(); from=to; to=node; output; node=from; h.replace(); if last then h.output(dataset:'node'); drop node; run; data want(keep=node household); declare hash ha(ordered:'a'); declare hiter hi('ha'); ha.definekey('count'); ha.definedata('last'); ha.definedone(); declare hash _ha(hashexp: 20); _ha.definekey('key'); _ha.definedone(); if 0 then set full; declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y'); from_to.definekey('from'); from_to.definedata('to'); from_to.definedone(); if 0 then set node; declare hash no(dataset:'node'); declare hiter hi_no('no'); no.definekey('node'); no.definedata('node'); no.definedone(); do while(hi_no.next()=0); household+1; output; count=1; key=node;_ha.add(); last=node;ha.add(); rc=hi.first(); do while(rc=0); from=last;rx=from_to.find(); do while(rx=0); key=to;ry=_ha.check(); if ry ne 0 then do; node=to;output;rr=no.remove(key:node); key=to;_ha.add(); count+1; last=to;ha.add(); end; rx=from_to.find_next(); end; rc=hi.next(); end; ha.clear();_ha.clear(); end; stop; run; data final_want; if _n_=1 then do; if 0 then set want; declare hash h(dataset:'want'); h.definekey('node'); h.definedata('household'); h.definedone(); end; set temp; _id=put(id,best. -l); if not missing(_id) then h.find(key:_id); array _v v:; do i = 1 to dim(_v); if not missing(_v{i}) then h.find(key:_v{i}); end; drop _id i node; run;
Build your skills. Make connections. Enjoy creative freedom. Maybe change the world. Registration is now open through August 30th. Visit the SAS Hackathon homepage.
Register today!Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Select SAS Training centers are offering in-person courses. View upcoming courses for: