I have a dataset where the same individuals typically have multiple observations. Like this:
ID | Startdate | Enddate
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
What I want is for the data to be transformed, so there is only one observation for each individual, like this:
ID | Startdate1 | Enddate1 | Startdate2 | Enddate2 | Startdate3 | Enddate3
1 01/01/2001 31/12/2001 01/01/2002 04/05/2002 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001 07/07/2001 15/10/2001 . .
3 01/04/2002 08/09/2003 . . . .
The actual dataset contains hundreds of thousands of observations, so I need a "general" code for conpressing observations for the same individual without losing any observations.
Can someone help?
Hi @niki0209
You can ty this, using an array function:
data have;
input ID Startdate Enddate;
informat Startdate Enddate ddmmyy10.;
format Startdate Enddate ddmmyy10.;
cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;
proc sql noprint;
select max(nb_obs) into:nb_obs from (select count(*) as nb_obs from have group by ID);
quit;
data want;
set have (rename=(Startdate=_Startdate Enddate=_Enddate));
array Startdate(&nb_obs);
array Enddate(&nb_obs);
by ID;
retain Startdate;
retain Enddate;
if first.ID then do;
counter=0;
call missing (of Startdate(*));
call missing (of Enddate(*));
end;
counter+1;
Startdate(counter) = _Startdate;
Enddate(counter) = _Enddate;
if last.ID then output;
drop _Startdate _Enddate counter;
format Startdate: Enddate: ddmmyy10.;
run;
Hi @niki0209
You can ty this, using an array function:
data have;
input ID Startdate Enddate;
informat Startdate Enddate ddmmyy10.;
format Startdate Enddate ddmmyy10.;
cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;
proc sql noprint;
select max(nb_obs) into:nb_obs from (select count(*) as nb_obs from have group by ID);
quit;
data want;
set have (rename=(Startdate=_Startdate Enddate=_Enddate));
array Startdate(&nb_obs);
array Enddate(&nb_obs);
by ID;
retain Startdate;
retain Enddate;
if first.ID then do;
counter=0;
call missing (of Startdate(*));
call missing (of Enddate(*));
end;
counter+1;
Startdate(counter) = _Startdate;
Enddate(counter) = _Enddate;
if last.ID then output;
drop _Startdate _Enddate counter;
format Startdate: Enddate: ddmmyy10.;
run;
Thanks! Completely solves my problem :-).
You're welcome @niki0209 !
Have a great day
Thanks - you too :-).
MERGE skill.
data have;
input ID Startdate Enddate;
informat Startdate Enddate ddmmyy10.;
format Startdate Enddate ddmmyy10.;
cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;
data temp;
set have;
by id;
if first.id then n=0;
n+1;
run;
proc sql noprint;
select distinct catt('temp(where=(n=',n,')
rename=(Startdate=Startdate_',n,' Enddate=Enddate_',n,'))') into : merge separated by ' '
from temp ;
quit;
data want;
merge &merge;
by id;
drop n;
run;
A double transpose can reshape the data into the form you desire. A double transpose is needed because two variables are being pivoted into columns.
data have;
attrib id length=8 startdate enddate informat=ddmmyy10. format=ddmmyy10.;
input ID Startdate Enddate; datalines;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
data have_view / view=have_view;
set have;
by id;
if first.id then seq=1; else seq+1;
run;
proc transpose data=have_view out=name_value;
by id seq;
var startdate enddate;
run;
proc transpose data=name_value out=want(drop=_name_);
by id;
id _name_ seq;
run;
A second way is to perform an 'array-based transposed'. This requires an apriori scan of the data to find the size of the largest id group.
proc sql noprint;
select max (subselect.freq) into :across_count trimmed from
(select id, count(*) as freq from have group by id) as subselect;
%put NOTE: &=across_count;
data want2;
do _n_ = 1 by 1 until (last.id);
set have;
by id;
array starts startdate1-startdate&across_count;
array ends enddate1 -enddate&across_count;
starts [_n_] = startdate;
ends [_n_] = enddate;
end;
attrib startdate: enddate: format=ddmmyy10. informat=ddmmyy10.;
drop startdate enddate;
run;
If you want the array based solution to have a column order that is like start1, end1, start2, end2, etc... you will need a macro to generate the variable names in that order. Example:
%macro enum(min=1, max=, template=);
%local index;
%do index = &min %to &max;
%unquote(&template)
%end;
%mend;
data want3;
do _n_ = 1 by 1 until (last.id);
set have;
by id;
attrib
%enum(max=&across_count, template=%nrstr(startdate&index enddate&index))
format=ddmmyy10. informat=ddmmyy10.
;
array starts startdate1-startdate&across_count;
array ends enddate1 -enddate&across_count;
starts [_n_] = startdate;
ends [_n_] = enddate;
end;
drop startdate enddate;
run;
Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 25. Read more here about why you should contribute and what is in it for you!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.