Hi,
I have a business problem where i want to generate group id against every row. Grouping will be done based on 3 parameters. sample input and expected output is below.
input.
custid | var1 | var2 | var3 |
1 | a11 | b11 | c11 |
2 | a12 | b11 | c12 |
3 | a13 | b13 | c12 |
4 | a14 | b14 | c14 |
Output:
custid | var1 | var2 | var3 | groupid |
1 | a11 | b11 | c11 | G1 |
2 | a12 | b11 | c12 | G1 |
3 | a13 | b13 | c12 | G1 |
4 | a14 | b14 | c14 | G2 |
And what is the rule or rules involved.
With the example you have provided we have no idea what the "group id" value would be if a value of a15, b15 or c15 appears
May I ask how big is your data or how many custids are there?
and how many groups, approximately, you expect?
I have the feeling you'll need a complex compare, maybe recursive, to assign the final group - to your sample test:
comparing var1 there are four different groups,
comparing var2 will eliminate it into 3 groups,
comparing var3 will eliminate it into 2 groups.
If I add a 5th custid as: a11-b15-c16 - will it be assigned as G1?
The technical code may depend on data-set size.
Use proc optnet to find connected components:
data have;
input custid var1$ var2$ var3$;
datalines;
1 a11 b11 c11
2 a12 b11 c12
3 a13 b13 c12
4 a14 b14 c14
;
proc transpose data=have out=temp1;
by custid;
var var1-var3;
run;
proc sql;
create table links as
select unique
a.custid as from,
b.custid as to
from
temp1 as a inner join
temp1 as b on a.col1=b.col1 and a.custid <= b.custid;
quit;
proc optnet data_links=links GRAPH_DIRECTION=UNDIRECTED out_nodes=groups;
concomp;
run;
proc sql;
create table want as
select
a.*,
cats("G", b.concomp) as groupid
from
have as a left join
groups as b on a.custid=b.node;
*drop table temp1 links groups;
quit;
You are trying to get linked components .
data temp;
input custid var1$ var2$ var3$;
datalines;
1 a11 b11 c11
2 a12 b11 c12
3 a13 b13 c12
4 a14 b14 c14
;
data have;
set temp;
array x{*} $ var1-var3;
do i=1 to dim(x)-1;
from=x{i};
do j=i+1 to dim(x);
to=x{j};output;
end;
end;
keep from to;
run;
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node;
output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
data final_want;
if _n_=1 then do;
if 0 then set want;
declare hash h(dataset:'want',hashexp:20);
h.definekey('node');
h.definedata('household');
h.definedone();
end;
set temp;
call missing(household);
rc=h.find(key:var1);
length groupid $ 20 ;
groupid=cats('G',household);
drop rc node household;
run;
Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 25. Read more here about why you should contribute and what is in it for you!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.