Solved: Re: How do I generate a variable for each observation of each ID?

niki0209 · Posted 12-09-2019 08:14 AM

I have a dataset where the same individuals typically have multiple observations. Like this:

ID | Startdate | Enddate

1 01/01/2001 31/12/2001

1 01/01/2002 04/05/2002

1 06/07/2002 30/11/2002

2 05/05/2001 05/06/2001

2 07/07/2001 15/10/2001

3 01/04/2002 08/09/2003

What I want is for the data to be transformed, so there is only one observation for each individual, like this:

1 01/01/2001 31/12/2001 01/01/2002 04/05/2002 06/07/2002 30/11/2002

2 05/05/2001 05/06/2001 07/07/2001 15/10/2001 . .

3 01/04/2002 08/09/2003 . . . .

The actual dataset contains hundreds of thousands of observations, so I need a "general" code for conpressing observations for the same individual without losing any observations.

Can someone help?

ed_sas_member · Posted 12-09-2019 08:40 AM

Hi @niki0209

You can ty this, using an array function:

data have;
	input ID Startdate Enddate;
	informat Startdate Enddate ddmmyy10.;
	format Startdate Enddate ddmmyy10.;
	cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;

proc sql noprint;
	select max(nb_obs) into:nb_obs from (select count(*) as nb_obs from have group by ID);
quit;

data want;

	set have (rename=(Startdate=_Startdate Enddate=_Enddate));

	array Startdate(&nb_obs);
	array Enddate(&nb_obs);

	by ID;
	retain Startdate;
	retain Enddate;
	
 	if first.ID then do;
	 		counter=0;
	 		call missing (of Startdate(*));
 			call missing (of Enddate(*));
 		end;

 	counter+1;
	
	Startdate(counter) = _Startdate;
	Enddate(counter) = _Enddate;
		
	if last.ID then output;
	
	drop _Startdate _Enddate counter;
	format Startdate: Enddate: ddmmyy10.;

run;

View solution in original post

ed_sas_member · Posted 12-09-2019 08:40 AM

Hi @niki0209

You can ty this, using an array function:

data have;
	input ID Startdate Enddate;
	informat Startdate Enddate ddmmyy10.;
	format Startdate Enddate ddmmyy10.;
	cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;

proc sql noprint;
	select max(nb_obs) into:nb_obs from (select count(*) as nb_obs from have group by ID);
quit;

data want;

	set have (rename=(Startdate=_Startdate Enddate=_Enddate));

	array Startdate(&nb_obs);
	array Enddate(&nb_obs);

	by ID;
	retain Startdate;
	retain Enddate;
	
 	if first.ID then do;
	 		counter=0;
	 		call missing (of Startdate(*));
 			call missing (of Enddate(*));
 		end;

 	counter+1;
	
	Startdate(counter) = _Startdate;
	Enddate(counter) = _Enddate;
		
	if last.ID then output;
	
	drop _Startdate _Enddate counter;
	format Startdate: Enddate: ddmmyy10.;

run;

niki0209 · Posted 12-09-2019 09:03 AM

Thanks! Completely solves my problem :-).

ed_sas_member · Posted 12-09-2019 09:05 AM

You're welcome @niki0209 !

Have a great day

niki0209 · Posted 12-09-2019 09:06 AM

Thanks - you too :-).

Ksharp · Posted 12-10-2019 07:46 AM

MERGE skill.

data have;
	input ID Startdate Enddate;
	informat Startdate Enddate ddmmyy10.;
	format Startdate Enddate ddmmyy10.;
	cards;
1 01/01/2001 31/12/2001
1 01/01/2002 04/05/2002
1 06/07/2002 30/11/2002
2 05/05/2001 05/06/2001
2 07/07/2001 15/10/2001
3 01/04/2002 08/09/2003
;
run;

data temp;
 set have;
 by id;
 if first.id then n=0;
 n+1;
run;
proc sql noprint;
select distinct catt('temp(where=(n=',n,')
rename=(Startdate=Startdate_',n,' Enddate=Enddate_',n,'))')	into : merge separated by ' '
 from temp ;
quit;
data want;
 merge &merge;
 by id;
 drop n;
run;

RichardDeVen · Posted 02-12-2020 06:05 AM

A double transpose can reshape the data into the form you desire. A double transpose is needed because two variables are being pivoted into columns.

data have;
attrib id length=8 startdate enddate informat=ddmmyy10. format=ddmmyy10.;
input ID  Startdate    Enddate; datalines;
1         01/01/2001      31/12/2001
1         01/01/2002      04/05/2002
1         06/07/2002      30/11/2002
2         05/05/2001      05/06/2001
2         07/07/2001      15/10/2001
3         01/04/2002      08/09/2003
;

data have_view / view=have_view;    
  set have;
  by id;
  if first.id then seq=1; else seq+1;
run;

proc transpose data=have_view out=name_value;
  by id seq; 
  var startdate enddate;
run;

proc transpose data=name_value out=want(drop=_name_);
  by id;
  id _name_ seq;
run;

A second way is to perform an 'array-based transposed'. This requires an apriori scan of the data to find the size of the largest id group.

proc sql noprint;
  select max (subselect.freq) into :across_count trimmed from 
  (select id, count(*) as freq from have group by id) as subselect;

%put NOTE: &=across_count;

data want2;
  do _n_ = 1 by 1 until (last.id);
    set have;
    by id;

    array starts startdate1-startdate&across_count;  
    array ends   enddate1  -enddate&across_count;

    starts [_n_] = startdate;
    ends   [_n_] = enddate;
  end;

  attrib startdate: enddate: format=ddmmyy10. informat=ddmmyy10.;
  drop startdate enddate;
run;

If you want the array based solution to have a column order that is like start1, end1, start2, end2, etc... you will need a macro to generate the variable names in that order. Example:

%macro enum(min=1, max=, template=);
  %local index;
  %do index = &min %to &max;
    %unquote(&template)
  %end;
%mend;

data want3;
  do _n_ = 1 by 1 until (last.id);
    set have;
    by id;

    attrib 
      %enum(max=&across_count, template=%nrstr(startdate&index enddate&index))
      format=ddmmyy10. informat=ddmmyy10.
    ;

    array starts startdate1-startdate&across_count;  
    array ends   enddate1  -enddate&across_count;

    starts [_n_] = startdate;
    ends   [_n_] = enddate;
  end;

  drop startdate enddate;
run;

Register Today!

SAS Training: Just a Click Away