Hi! given these data:
data have;
infile datalines delimiter=',';
input id1 $ id2 $;
datalines;
M,A
A,B
C,D
B,C
V,W
W,X
X,Y
Y,Z
;
I am hoping to create a grouper ID that will associate M to A, A to B, B to C, and C to D. Same thing with V to W, W to X, X to Y and Y to Z. Please see below for the data I want:
data want;
infile datalines delimiter=',';
input id1 $ id2 $ grouper;
datalines;
M,A,1
A,B,1
B,C,1
C,D,1
V,W,2
W,X,2
X,Y,2
Y,Z,2
;
So, in the output, there are 2 groups that are established. Is this possible using SAS 9.4? Please note that I cannot guarantee sort order so ideally the solution does not use first/last/lag commands.
TIA!
data have;
infile datalines delimiter=',';
input id1 $ id2 $;
datalines;
M,A
X,Y
A,B
C,D
B,C
Z,S
V,W
W,X
Y,Z
;
data want ;
if _n_=1 then do;
if 0 then set have;
dcl hash H (dataset:'have') ;
h.definekey ("id2") ;
h.definedata ("id2","id1") ;
h.definedone () ;
dcl hash H1 (dataset:'have') ;
h1.definekey ("id1") ;
h1.definedata ("id1","id2") ;
h1.definedone () ;
end;
set have( rename=(id1=v1 id2=v2)) ;
array t(999) $ _temporary_;
if v1 not in t and v2 not in t;
grp+1;
id1=v1;
id2=v2;
output;
do while(h1.find(key:v2)=0);
output;
if v2 not in t then do;
_iorc_+1;
t(_iorc_)=v2;
end;
v2=id2;
end;
do while(h.find(key:v1)=0);
output;
if v1 not in t then do;
_iorc_+1;
t(_iorc_)=v1;
end;
v1=id1;
end;
drop v:;
run;
Hi @paulsonalec
How would you describe the rule here:
for example, two records belong to the same group if the last value of id2= the value of id1.
is that right?
data want;
set have;
_lag = lag(ID2);
if _lag ne ID1 then count+1;
retain count;
drop _lag;
run;
Hi @paulsonalec
Here is the output with the same code:
-> 2 groups are created as the condition is based on the lag value of ID2.
Best,
Right, sorry for the confusion. I guess I'm interested in this example, since sort order cannot be guaranteed here (see how I've changed row order of 'C,D' and 'B,C'). Again, I'm sorry for being annoying/not including this detail in the beginning:
data have;
infile datalines delimiter=',';
input id1 $ id2 $;
datalines;
M,A
A,B
C,D
B,C
V,W
W,X
X,Y
Y,Z
;
data want;
set have;
_lag = lag(ID2);
if _lag ne ID1 then count+1;
retain count;
drop _lag;
run;
Hi @paulsonalec
It seems to be better if you run a proc sort before the data step:
data have;
infile datalines delimiter=',';
input id1 $ id2 $;
datalines;
M,A
X,Y
A,B
C,D
B,C
V,W
W,X
Y,Z
;
proc sort data=have out=have_sorted;
by id2 id1;
run;
data want;
set have_sorted;
_lag = lag(ID2);
if _lag ne ID1 then count+1;
retain count;
drop _lag;
run;
Thanks, again, lag/sort isn't an option here unfortunately. Consider this possibility (I've added row Z,S):
data have;
infile datalines delimiter=',';
input id1 $ id2 $;
datalines;
M,A
X,Y
A,B
C,D
B,C
Z,S
V,W
W,X
Y,Z
;
proc sort data=have out=have_sorted;
by id2 id1;
run;
data want;
set have_sorted;
_lag = lag(ID2);
if _lag ne ID1 then count+1;
retain count;
drop _lag;
run;
data have;
infile datalines delimiter=',';
input id1 $ id2 $;
datalines;
M,A
X,Y
A,B
C,D
B,C
Z,S
V,W
W,X
Y,Z
;
data want ;
if _n_=1 then do;
if 0 then set have;
dcl hash H (dataset:'have') ;
h.definekey ("id2") ;
h.definedata ("id2","id1") ;
h.definedone () ;
dcl hash H1 (dataset:'have') ;
h1.definekey ("id1") ;
h1.definedata ("id1","id2") ;
h1.definedone () ;
end;
set have( rename=(id1=v1 id2=v2)) ;
array t(999) $ _temporary_;
if v1 not in t and v2 not in t;
grp+1;
id1=v1;
id2=v2;
output;
do while(h1.find(key:v2)=0);
output;
if v2 not in t then do;
_iorc_+1;
t(_iorc_)=v2;
end;
v2=id2;
end;
do while(h.find(key:v1)=0);
output;
if v1 not in t then do;
_iorc_+1;
t(_iorc_)=v1;
end;
v1=id1;
end;
drop v:;
run;
This is unbelievable, thank you!
It is one to one match or one to many match ?
data have;
infile cards ;
input from $ to $ ;
cards;
1 2
1 3
4 5
5 2
9 4
6 7
8 7
;
run;
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node;
output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
Very good curiosity and thinking @Ksharp
Registration is now open for SAS Innovate 2025 , our biggest and most exciting global event of the year! Join us in Orlando, FL, May 6-9.
Sign up by Dec. 31 to get the 2024 rate of just $495.
Register now!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.