I have a dataset where the same individuals typically have multiple observations. Like this:
ID | Startdate | Enddate
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
What I want is for the data to be transformed, so there is only one observation for each individual, like this:
ID | Startdate1 | Enddate1 | Startdate2 | Enddate2 | Startdate3 | Enddate3
1 01/01/2001 31/12/2001 01/01/2002 04/05/2002 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001 07/07/2001 15/10/2001 . .
3 01/04/2002 08/09/2003 . . . .
The actual dataset contains hundreds of thousands of observations, so I need a "general" code for conpressing observations for the same individual without losing any observations.
Can someone help?
Hi @niki0209
You can ty this, using an array function:
data have;
input ID Startdate Enddate;
informat Startdate Enddate ddmmyy10.;
format Startdate Enddate ddmmyy10.;
cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;
proc sql noprint;
select max(nb_obs) into:nb_obs from (select count(*) as nb_obs from have group by ID);
quit;
data want;
set have (rename=(Startdate=_Startdate Enddate=_Enddate));
array Startdate(&nb_obs);
array Enddate(&nb_obs);
by ID;
retain Startdate;
retain Enddate;
if first.ID then do;
counter=0;
call missing (of Startdate(*));
call missing (of Enddate(*));
end;
counter+1;
Startdate(counter) = _Startdate;
Enddate(counter) = _Enddate;
if last.ID then output;
drop _Startdate _Enddate counter;
format Startdate: Enddate: ddmmyy10.;
run;
Hi @niki0209
You can ty this, using an array function:
data have;
input ID Startdate Enddate;
informat Startdate Enddate ddmmyy10.;
format Startdate Enddate ddmmyy10.;
cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;
proc sql noprint;
select max(nb_obs) into:nb_obs from (select count(*) as nb_obs from have group by ID);
quit;
data want;
set have (rename=(Startdate=_Startdate Enddate=_Enddate));
array Startdate(&nb_obs);
array Enddate(&nb_obs);
by ID;
retain Startdate;
retain Enddate;
if first.ID then do;
counter=0;
call missing (of Startdate(*));
call missing (of Enddate(*));
end;
counter+1;
Startdate(counter) = _Startdate;
Enddate(counter) = _Enddate;
if last.ID then output;
drop _Startdate _Enddate counter;
format Startdate: Enddate: ddmmyy10.;
run;
Thanks! Completely solves my problem :-).
You're welcome @niki0209 !
Have a great day
Thanks - you too :-).
MERGE skill.
data have;
input ID Startdate Enddate;
informat Startdate Enddate ddmmyy10.;
format Startdate Enddate ddmmyy10.;
cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;
data temp;
set have;
by id;
if first.id then n=0;
n+1;
run;
proc sql noprint;
select distinct catt('temp(where=(n=',n,')
rename=(Startdate=Startdate_',n,' Enddate=Enddate_',n,'))') into : merge separated by ' '
from temp ;
quit;
data want;
merge &merge;
by id;
drop n;
run;
A double transpose can reshape the data into the form you desire. A double transpose is needed because two variables are being pivoted into columns.
data have;
attrib id length=8 startdate enddate informat=ddmmyy10. format=ddmmyy10.;
input ID Startdate Enddate; datalines;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
data have_view / view=have_view;
set have;
by id;
if first.id then seq=1; else seq+1;
run;
proc transpose data=have_view out=name_value;
by id seq;
var startdate enddate;
run;
proc transpose data=name_value out=want(drop=_name_);
by id;
id _name_ seq;
run;
A second way is to perform an 'array-based transposed'. This requires an apriori scan of the data to find the size of the largest id group.
proc sql noprint;
select max (subselect.freq) into :across_count trimmed from
(select id, count(*) as freq from have group by id) as subselect;
%put NOTE: &=across_count;
data want2;
do _n_ = 1 by 1 until (last.id);
set have;
by id;
array starts startdate1-startdate&across_count;
array ends enddate1 -enddate&across_count;
starts [_n_] = startdate;
ends [_n_] = enddate;
end;
attrib startdate: enddate: format=ddmmyy10. informat=ddmmyy10.;
drop startdate enddate;
run;
If you want the array based solution to have a column order that is like start1, end1, start2, end2, etc... you will need a macro to generate the variable names in that order. Example:
%macro enum(min=1, max=, template=);
%local index;
%do index = &min %to &max;
%unquote(&template)
%end;
%mend;
data want3;
do _n_ = 1 by 1 until (last.id);
set have;
by id;
attrib
%enum(max=&across_count, template=%nrstr(startdate&index enddate&index))
format=ddmmyy10. informat=ddmmyy10.
;
array starts startdate1-startdate&across_count;
array ends enddate1 -enddate&across_count;
starts [_n_] = startdate;
ends [_n_] = enddate;
end;
drop startdate enddate;
run;
Join us for SAS Innovate 2025, our biggest and most exciting global event of the year, in Orlando, FL, from May 6-9. Sign up by March 14 for just $795.
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.