Re: Generate group id based on multiple parameters in "or" condition

arpitagarwal512 · Posted 04-13-2020 11:19 AM

Hi,

I have a business problem where i want to generate group id against every row. Grouping will be done based on 3 parameters. sample input and expected output is below.

input.

custid	var1	var2	var3
1	a11	b11	c11
2	a12	b11	c12
3	a13	b13	c12
4	a14	b14	c14

Output:

custid	var1	var2	var3	groupid
1	a11	b11	c11	G1
2	a12	b11	c12	G1
3	a13	b13	c12	G1
4	a14	b14	c14	G2

ballardw · Posted 04-13-2020 11:25 AM

And what is the rule or rules involved.

With the example you have provided we have no idea what the "group id" value would be if a value of a15, b15 or c15 appears

arpitagarwal512 · Posted 04-13-2020 11:28 AM

group id is a sequence no prefix with "G" and starts from 1. value inside is a static value

arpitagarwal512 · Posted 04-13-2020 11:34 AM

here in first row, column var2 value is "b11" and similar value available in second row so ideally row1 and row 2 belongs to same group. now in second row, column v3 have same value in third row v3 column value, so second row and third row also belongs to same group.
row 4 column value don't have common value in any of the row so row 4 belongs to different group.
Now,
row1 and row 2 belongs to same group . row2 and row 3 belongs to same group
i.e row 1, row 2 and row 3 belongs to same group . So G1 group id assigned
rest row4 assigned with G2 group is

Shmuel · Posted 04-13-2020 12:32 PM

May I ask how big is your data or how many custids are there?

and how many groups, approximately, you expect?

I have the feeling you'll need a complex compare, maybe recursive, to assign the final group - to your sample test:

comparing var1 there are four different groups,

comparing var2 will eliminate it into 3 groups,

comparing var3 will eliminate it into 2 groups.

If I add a 5th custid as: a11-b15-c16 - will it be assigned as G1?

The technical code may depend on data-set size.

arpitagarwal512 · Posted 04-13-2020 12:37 PM

data will be around 3 million rows and expected group will be around .7 million.

If I add a 5th custid as: a11-b15-c16 - will it be assigned as G1
above statement is correct because a11 is common in first and fifth row for column var1

PGStats · Posted 04-13-2020 12:50 PM

Use proc optnet to find connected components:

data have;
input custid  var1$    var2$    var3$;
datalines;
1   a11 b11 c11
2   a12 b11 c12
3   a13 b13 c12
4   a14 b14 c14
;

proc transpose data=have out=temp1;
by custid;
var var1-var3;
run;

proc sql;
create table links as
select unique
	a.custid as from,
	b.custid as to
from 
	temp1 as a inner join
	temp1 as b on a.col1=b.col1 and a.custid <= b.custid;
quit;

proc optnet data_links=links GRAPH_DIRECTION=UNDIRECTED out_nodes=groups;
concomp;
run;

proc sql;
create table want as
select
	a.*,
	cats("G", b.concomp) as groupid
from 
	have as a left join
	groups as b on a.custid=b.node;
*drop table temp1 links groups;
quit;

PG

arpitagarwal512 · Posted 04-14-2020 05:44 AM

I don't have license of proc optnet instead i am using sas viya where i got to know proc optnetwork procedure. does it will same as proc optnet?

Ksharp · Posted 04-14-2020 08:22 AM

You are trying to get linked components .

data temp;
input custid  var1$    var2$    var3$;
datalines;
1   a11 b11 c11
2   a12 b11 c12
3   a13 b13 c12
4   a14 b14 c14
;
data have;
 set temp;
 array x{*} $ var1-var3;
 do i=1 to dim(x)-1;
   from=x{i};
   do j=i+1 to dim(x);
     to=x{j};output;
   end;
 end;
 keep from to;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

data final_want;
if _n_=1 then do;
  if 0 then set want;
  declare hash h(dataset:'want',hashexp:20);
  h.definekey('node');
  h.definedata('household');
  h.definedone();
end;
 set temp;
call missing(household);
rc=h.find(key:var1);
length groupid $ 20 ;
groupid=cats('G',household);
drop rc node household;
run;

Catch up on SAS Innovate 2026

SAS Training: Just a Click Away