Hi Expects,
Assume I have dataset below:
| ID | v1 | v2 | ….. | vn |
| 1 | a | |||
| 2 | a | b | ||
| 3 | b | d | ||
| 4 | d | |||
| 5 | a |
the desired output I wanted:
| Group_id | ID | v1 | v2 | ….. | vn |
| 1 | 1 | a | |||
| 1 | 2 | a | b | ||
| 1 | 3 | b | d | ||
| 1 | 4 | d | |||
| 1 | 5 | a |
I want the same value from each column resolved together. ID=1,2,5 resolve together due to same value 'a' in V1. ID=2,3 resolve together in V2 due to same value 'b'. ID=3,4 resolve together in Vn due to same value 'd'. Because ID=1,2,5 and ID=2,3 and ID=3,4 resolve by v1, v2, Vn respectively so ID=1,2,3,4,5 all resolved together due to indirect resolution among V1, V2,... Vn. This is because ID=1,2,5 and ID=2,3 share same ID=2. ID=2,3 and ID=3,4 share same ID=3. So they all should be assigned same ID.
Can someone help me on this? the number of variables used to link rows are not confirmed so I need a program flexible enough to adjust for more or less.
If you have access to SAS/OR, calculate connected components (groups of connected IDs) with proc optnet:
data have;
input ID (v1 v2 v3) (:$1.);
datalines;
1 a . .
2 a b .
3 . b d
4 . . d
5 a . .
;
data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
if not missing(_v{i}) then do;
v = _v{i};
output;
end;
end;
keep id v;
run;
proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
quit;
proc optnet data_links=links graph_direction=undirected out_nodes=nodes;
concomp;
run;
proc sql;
create table want as
select
a.concomp as group_id,
b.*
from nodes as a inner join
have as b on a.node=b.id
order by b.id;
select * from want;
quit;
If there is no SAS/OR license, you can use keyed access to the links table:
data have;
input ID (v1 v2 v3) (:$1.);
datalines;
1 a . .
2 a b .
3 . b d
4 . . d
5 a . .
;
data attr;
set have;
array _v v:;
do i = 1 to dim(_v);
if not missing(_v{i}) then do;
v = _v{i};
output;
end;
end;
keep id v;
run;
proc sql;
create table links as
select a.id as from, b.id as to
from attr as a inner join attr as b on a.id<b.id and a.v=b.v;
create index to on links(to);
quit;
data groups;
set have;
to=id;
ancestor=id;
do while(1);
set links key=to;
if _iorc_ then do;
_error_=0;
leave;
end;
ancestor=from;
to=from;
end;
drop to from;
run;
proc sort;
by ancestor;
run;
data want;
set groups;
by ancestor;
group_id+first.ancestor;
drop ancestor;
run;
data temp;
input ID (v1 v2 v3) (:$1.);
datalines;
1 a . .
2 a b .
3 . b d
4 . . d
5 a . .
;
data have(rename=(_id=from v=to));
set temp;
_id=put(id,best. -l);
array _v v:;
do i = 1 to dim(_v);
if not missing(_v{i}) then do;
v = _v{i};
output;
end;
end;
keep _id v;
run;
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node;
output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
data final_want;
if _n_=1 then do;
if 0 then set want;
declare hash h(dataset:'want');
h.definekey('node');
h.definedata('household');
h.definedone();
end;
set temp;
_id=put(id,best. -l);
if not missing(_id) then h.find(key:_id);
array _v v:;
do i = 1 to dim(_v);
if not missing(_v{i}) then h.find(key:_v{i});
end;
drop _id i node;
run;
April 27 – 30 | Gaylord Texan | Grapevine, Texas
Walk in ready to learn. Walk out ready to deliver. This is the data and AI conference you can't afford to miss.
Register now and lock in 2025 pricing—just $495!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.