Solved: Transform a dataset from long to wide format and fill values

NewUsrStat · Posted 02-07-2025 03:43 AM

Hi guys,

suppose to have the following table:

data DB;
  input ID :$20. Admission :date09. Discharge :date09. Age_class Age_end Index Value Total; 
  format Admission date9. Discharge date9.;
cards;
0001 01JUL2014  16AUG2014    1    4   1   2.3  11.9
0001 13MAY2018  22JUN2018    3    4   0   1.4   .
0001 23JAN2019  25JAN2019    4    4   0    3.2  .
0002 13MAY2016  22SEP2016    1    5   1    2   7.9
0002 09JUL2023  10JUL2023    2    5   0   0.3   .
0002 12SEP2024  15SEP2024    3    5   0   0.2   .
0003 01JUL2014  18AUG2014    1    3   1   12  17.3
0003 07DEC2023  16DEC2023    2    3   0   0.3   .
0004 12JAN2014  15JAN2014    1    2   1    2    2.1
0004 30MAY2019  13JUL2019    2    2   0   0.1   . 
0005 30JUN2019  13OCT2019    5    5   0   4.1   . 
; 
run;

Is there a way to get the following?


data DB1;
  input ID :$20. Admission :date09. Discharge :date09. Age_class Age_end Index Value Total Age_class1 Age_class2 Age_class3 Age_class4 Age_class5; 
  format Admission date9. Discharge date9.;
cards;
0001 01JUL2014  16AUG2014    1    4   1   2.3  11.9   2.3   5   1.4   3.2    .
0001 13MAY2018  22JUN2018    3    4   0   1.4   .      .     .    .   .    .
0001 23JAN2019  25JAN2019    4    4   0   3.2   .      .     .    .   .    .
0002 13MAY2016  22SEP2016    1    5   1    2   7.9     2    0.3   0.2   5    0.4
0002 09JUL2023  10JUL2023    2    5   0   0.3   .      .     .    .   .    .
0002 12SEP2024  15SEP2024    3    5   0   0.2   .      .     .    .   .    .
0003 01JUL2014  18AUG2014    1    3   1    12  17.3    12   0.3   5   .    .
0003 07DEC2023  16DEC2023    2    3   0    0.3  .      .     .    .   .    . 
0004 12JAN2014  15JAN2014    1    2   1     2   2.1    2    0.1   .   .    .
0004 30MAY2019  13JUL2019    2    2   0    0.1   .     .     .    .   .    .
0005 30JUN2019  13OCT2019    5    5   1    4.1  4.1    .     .    .   .    4.1
; 
run;

In other words, for each ID in DB there is the Age_class at each admission and an Age_class at exit (Age_end) from the study. Then, there is the "Value" variable that is the time the patient stay in the corresponding Age_class (before exit). There is also a "Total" time in the study. What I need is to transform the dataset DB into a wide format where there are columns corresponding to the age classes filled by the corresponding Value. Note that in this new dataset the row-wise total must be reached and must be equal to "Total" and so: if there is a jump in the age classes (Age_class variable) a value = 5 years must be added because the age-class interval is of max 5 years.

Finally, in the new wide dataset (DB1) rows must be filled only where Index = 1

Can anyone help me please?

Thank you very much in advance

Patrick · Posted 02-07-2025 05:51 AM

Does below return what you're after?

data have;
  input ID :$20. Admission :date09. Discharge :date09. Age_class Age_end Index Value Total Age_class1 Age_class2 Age_class3 Age_class4 Age_class5; 
  format Admission date9. Discharge date9.;
cards;
0001 01JUL2014  16AUG2014    1    4   1   2.3  11.9   2.3   1.4   5   3.2    .
0001 13MAY2018  22JUN2018    3    4   0   1.4   .      .     .    .   .    .
0001 23JAN2019  25JAN2019    4    4   0   3.2   .      .     .    .   .    .
0002 13MAY2016  22SEP2016    1    5   1    2   7.9     2    0.3   0.2   5    0.4
0002 09JUL2023  10JUL2023    2    5   0   0.3   .      .     .    .   .    .
0002 12SEP2024  15SEP2024    3    5   0   0.2   .      .     .    .   .    .
0003 01JUL2014  18AUG2014    1    3   1    12  17.3    12   0.3   5   .    .
0003 07DEC2023  16DEC2023    2    3   0    0.3  .      .     .    .   .    . 
0004 12JAN2014  15JAN2014    1    2   1     2   2.1    2    0.1   .   .    .
0004 30MAY2019  13JUL2019    2    2   0    0.1   .     .     .    .   .    .
0005 30JUN2019  13OCT2019    5    5   1    4.1  4.1    .     .    .   .    4.1
0006 30JUN2019  13OCT2019    5    5   1    0  0    .     .    .   .    .
; 
run;

data want(drop=_:);
  if _n_=1 then 
    do;
      _val=0;
      dcl hash h1(dataset:'have(rename=(value=_val))', ordered:'y');
      h1.defineKey('id','age_class');
      h1.defineData('_val');
      h1.defineDone();
    end;
    set have;
    array age_class_derived{5} 8;

    if index=1 then
      do;
        /* distribute existing values */
        do _i=1 to dim(age_class_derived);
          if h1.find(key:id,key:_i)=0 then age_class_derived[_i]=_val;
        end;
        
        /* fill-up with max 5 until total reached */
        _total=round(sum(of age_class_derived[*]),.0000001);
        do _i=1 to dim(age_class_derived) while( _total<total );
          if missing(age_class_derived[_i]) then 
            do;
              age_class_derived[_i]=min(5,abs(sum(total,-_total)));
            end;
            _total=round(sum(of age_class_derived[*]),.0000001);
        end;
      end;
run;

proc print data=want;
run;

I've made the assumption that the marked cells in your sample data were typos.

View solution in original post

Patrick · Posted 02-07-2025 05:51 AM

Does below return what you're after?

data have;
  input ID :$20. Admission :date09. Discharge :date09. Age_class Age_end Index Value Total Age_class1 Age_class2 Age_class3 Age_class4 Age_class5; 
  format Admission date9. Discharge date9.;
cards;
0001 01JUL2014  16AUG2014    1    4   1   2.3  11.9   2.3   1.4   5   3.2    .
0001 13MAY2018  22JUN2018    3    4   0   1.4   .      .     .    .   .    .
0001 23JAN2019  25JAN2019    4    4   0   3.2   .      .     .    .   .    .
0002 13MAY2016  22SEP2016    1    5   1    2   7.9     2    0.3   0.2   5    0.4
0002 09JUL2023  10JUL2023    2    5   0   0.3   .      .     .    .   .    .
0002 12SEP2024  15SEP2024    3    5   0   0.2   .      .     .    .   .    .
0003 01JUL2014  18AUG2014    1    3   1    12  17.3    12   0.3   5   .    .
0003 07DEC2023  16DEC2023    2    3   0    0.3  .      .     .    .   .    . 
0004 12JAN2014  15JAN2014    1    2   1     2   2.1    2    0.1   .   .    .
0004 30MAY2019  13JUL2019    2    2   0    0.1   .     .     .    .   .    .
0005 30JUN2019  13OCT2019    5    5   1    4.1  4.1    .     .    .   .    4.1
0006 30JUN2019  13OCT2019    5    5   1    0  0    .     .    .   .    .
; 
run;

data want(drop=_:);
  if _n_=1 then 
    do;
      _val=0;
      dcl hash h1(dataset:'have(rename=(value=_val))', ordered:'y');
      h1.defineKey('id','age_class');
      h1.defineData('_val');
      h1.defineDone();
    end;
    set have;
    array age_class_derived{5} 8;

    if index=1 then
      do;
        /* distribute existing values */
        do _i=1 to dim(age_class_derived);
          if h1.find(key:id,key:_i)=0 then age_class_derived[_i]=_val;
        end;
        
        /* fill-up with max 5 until total reached */
        _total=round(sum(of age_class_derived[*]),.0000001);
        do _i=1 to dim(age_class_derived) while( _total<total );
          if missing(age_class_derived[_i]) then 
            do;
              age_class_derived[_i]=min(5,abs(sum(total,-_total)));
            end;
            _total=round(sum(of age_class_derived[*]),.0000001);
        end;
      end;
run;

proc print data=want;
run;

I've made the assumption that the marked cells in your sample data were typos.

NewUsrStat · Posted 02-07-2025 06:14 AM

Really really sorry: for ID 0001 age class 3 = 1.4 while age_class = 2 is 5. I will edit my question soon.

Tom · Posted 02-07-2025 10:07 AM

The desired result does not make much sense.

Either make a wide dataset with one observation per ID.

So perhaps something like this?

Or leave it in the tall format.

PaigeMiller · Posted 02-07-2025 10:15 AM

Agreeing with @Tom , I don't see the point of doing this transformation from long to wide. What is the benefit here of creating a wide data set as you stated above? What can you do with such a wide data set that you can't do with the long data set?

As things stand so far in this thread, this is the XY problem, where it seems like you are tightly focused on one specific programming task, and possibly you have chosen to go down a path this is inefficient and hard to program; and if we knew where you wanted to go, we could help figure out a better path that is easier to program.

--
Paige Miller

ballardw · Posted 02-07-2025 02:55 PM

@PaigeMiller wrote:

Agreeing with @Tom , I don't see the point of doing this transformation from long to wide. What is the benefit here of creating a wide data set as you stated above? What can you do with such a wide data set that you can't do with the long data set?

As things stand so far in this thread, this is the XY problem, where it seems like you are tightly focused on one specific programming task, and possibly you have chosen to go down a path this is inefficient and hard to program; and if we knew where you wanted to go, we could help figure out a better path that is easier to program.

From several of the other posts by the OP I would say this set might be related to interpolating between the "wide" values prior to transposing back to a long form.

I would say that OP posts have all the symptoms of an XY problem with never describing actual starting data and where to go but keeps asking how to do one task and often poorly described at that.

Ksharp · Posted 02-08-2025 02:54 AM

Just for having some fun.

data DB;
  input ID :$20. Admission :date09. Discharge :date09. Age_class Age_end Index Value Total; 
  format Admission date9. Discharge date9.;
cards;
0001 01JUL2014  16AUG2014    1    4   1   2.3  11.9
0001 13MAY2018  22JUN2018    3    4   0   1.4   .
0001 23JAN2019  25JAN2019    4    4   0    3.2  .
0002 13MAY2016  22SEP2016    1    5   1    2   7.9
0002 09JUL2023  10JUL2023    2    5   0   0.3   .
0002 12SEP2024  15SEP2024    3    5   0   0.2   .
0003 01JUL2014  18AUG2014    1    3   1   12  17.3
0003 07DEC2023  16DEC2023    2    3   0   0.3   .
0004 12JAN2014  15JAN2014    1    2   1    2    2.1
0004 30MAY2019  13JUL2019    2    2   0   0.1   . 
0005 30JUN2019  13OCT2019    5    5   0   4.1   . 
; 
run;
proc sql noprint;
create table summary as
select ID,sum(Total)-sum(Value) as balance
 from DB
  group by ID;
select max(Age_class) into :n trimmed from DB;
quit;
data token;
set summary;
temp=balance;n=0;
if balance<5 then do;n=1;output;end;
 else do;
   do until(balance<5);
     n+1;balance=5;output;
     balance=temp-5;
     temp=temp-5;
   end;
   n+1;output;
 end;
drop temp;
run;
proc transpose data=DB out=transpose(drop=_name_) prefix=Age_class;
by id ;
id Age_class;
var value;
run;
data want;
if _n_=1 then do;
if 0 then set DB;
retain Age_class1-Age_class&n.; *Change the order of variables;
if 0 then set token;
declare hash h(dataset:'token');
h.definekey('id','n');
h.definedata('balance');
h.definedone();
end;
merge DB transpose;
by id;
array x{*} Age_class1-Age_class&n.;
n=0;
if first.id then do;
do i=1 to dim(x);
  if missing(x{i}) then do;
    n+1;call missing(balance);rc=h.find();x{i}=balance;rc=h.remove();
  end;
end;
end;
drop rc n i balance;
output;
call missing(of _all_);
run;

Transform a dataset from long to wide format and fill values

Re: Transform a dataset from long to wide format and fill values

Re: Transform a dataset from long to wide format and fill values

Re: Transform a dataset from long to wide format and fill values

Re: Transform a dataset from long to wide format and fill values

Re: Transform a dataset from long to wide format and fill values

Re: Transform a dataset from long to wide format and fill values

Re: Transform a dataset from long to wide format and fill values

Registration is open