Solved: Fill in gaps in a monthly array

ModeratelyWise

I have a very wide and very long table of monthly risk scores for clients that looks something like this:

ID	Mo_1	Mo_2	Mo_3	Mo_4	Mo_5	Mo_6	Mo_7	Mo_8	Mo_9	Mo_10	Mo_11	Mo_12	Mo_13	Mo_14	Mo_15	Mo_16	Mo_17	Mo_18	Mo_19	Mo_20
a	-	-	3	3	3	3	-	-	-	3	3.2	3.3	-	3.4	3.3	3.2	3.2	3.2	-	-
b	0	0	-	0	0.2	0.2	0.2	0.2	-	-	-	-	-	-	-	-	-	-	-	0.3
c	-	2	2	-	-	-	-	-	-	-	-	-	-	-	-	-	-	2	-	-
d	0.3	0.3	0.3	0.3	0.3	0.3	0.3	0.3	-	-	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6
e	-	-	-	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2
f	4	-	-	-	3	3	-	3.2	-	3.2	-	-	-	3.3	3.4	4	-	-	-	3.2
g	-	-	-	-	-	-	-	-	-	0	0	-	0	0	0	0	0	0	0	0

With the first column being the client's ID and the subsequent columns being their risk score for that given month. The table above is much smaller than the actual one I am working with obviously. There are many more months and clients in the real data set.

I want to do two things with these data. The first is to create a table that summarizes any gaps between months that have a value. I want it to look something like this:

ID	Gap_Num	Gap_length	Start_Mo	End_Mo	Start_Val	End_Val	Mean
a	1	3	Mo_6	Mo_10	3	3	3
a	2	1	Mo_12	Mo_14	3.3	3.4	3.35
b	1	1	Mo_2	Mo_4	0	0	0
b	2	11	Mo_8	Mo_19	0.2	0.3	0.25
c	1	14	Mo_3	Mo_18	2	2	2
d	1	2	Mo_8	Mo_11	0.3	0.6	0.45
f	1	3	Mo_1	Mo_5	4	3	3.5
f	2	1	Mo_6	Mo_8	3	3.2	3.1
f	3	1	Mo_8	Mo_10	3.2	3.2	3.2
f	4	3	Mo_10	Mo_14	3.2	3.3	3.25
f	5	3	Mo_16	Mo_20	4	3.2	3.6
g	1	1	Mo_11	Mo_13	0	0	0

Where each gap is listed for each client along with the size of the gap, the month before the gap began, the month after the gap ended, the risk scores for those months and the average of those risk scores.

The second thing would be to go in and fill the gaps with mean risk scores calculated above. The table below shows the filled in gaps in red font.

ID	Mo_1	Mo_2	Mo_3	Mo_4	Mo_5	Mo_6	Mo_7	Mo_8	Mo_9	Mo_10	Mo_11	Mo_12	Mo_13	Mo_14	Mo_15	Mo_16	Mo_17	Mo_18	Mo_19	Mo_20
a	-	-	3	3	3	3	3	3	3	3	3.2	3.3	3.35	3.4	3.3	3.2	3.2	3.2	-	-
b	0	0	0	0	0.2	0.2	0.2	0.2	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.3
c	-	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	-	-
d	0.3	0.3	0.3	0.3	0.3	0.3	0.3	0.3	0.45	0.45	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6
e	-	-	-	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2
f	4	3.5	3.5	3.5	3	3	3.1	3.2	3.2	3.2	3.25	3.25	3.25	3.3	3.4	4	3.6	3.6	3.6	3.2
g	-	-	-	-	-	-	-	-	-	0	0	0	0	0	0	0	0	0	0	0

Thanks in advance. Any help would be appreciated.

Best,

Marc

Ksharp

data have;
  input ID $ 	Mo_1 	Mo_2 	Mo_3 	Mo_4 	Mo_5 	Mo_6 	Mo_7 	Mo_8 	Mo_9 	Mo_10 	Mo_11 	Mo_12 	Mo_13 	Mo_14 	Mo_15 	Mo_16 	Mo_17 	Mo_18 	Mo_19 	Mo_20;
datalines;
a 	. 	. 	3 	3 	3 	3 	. 	. 	. 	3 	3.2 	3.3 	. 	3.4 	3.3 	3.2 	3.2 	3.2 	. 	.
b 	0 	0 	. 	0 	0.2 	0.2 	0.2 	0.2 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	0.3
c 	. 	2 	2 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	2 	. 	.
d 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	. 	. 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6
e 	. 	. 	. 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2
f 	4 	. 	. 	. 	3 	3 	. 	3.2 	. 	3.2 	. 	. 	. 	3.3 	3.4 	4 	. 	. 	. 	3.2
g 	. 	. 	. 	. 	. 	. 	. 	. 	. 	0 	0 	. 	0 	0 	0 	0 	0 	0 	0 	0
;
proc transpose data=have out=temp;
by id;
var Mo_:;
run;
data temp2;
 set temp;
 missing=missing(col1);
run;
data temp3;
 set temp2;
 by id missing notsorted;
 if first.id then group=0;
 if first.missing then group+1;

 lag_col1=lag(col1);
 lag_name_=lag(_name_);
 if first.id then call missing(lag_col1,lag_name_);
run;
data temp4;
 merge temp3 temp3(keep=id _name_ col1 rename=(id=_id _name_=next_name_ col1=next_col1) firstobs=2);
 if id ne _id then call missing(next_name_,next_col1);
 drop _id;
run;
data temp5;
do gap_length=1 by 1 until(last.group);
 set temp4;
 by id group;
 if first.group then do;start_mo=lag_name_;start_val=lag_col1;end;
 if last.group then do;end_mo=next_name_;end_val=next_col1;end;
end;
if missing;
keep gap_length id group start_mo end_mo  start_val end_val;
run;
data temp6;
 set temp5;
 mean=mean(start_val,end_val);
 if not missing(start_val) and not missing(end_val);
run;
data want;
 set temp6;
 by id;
 if first.id then gap_num=0;
 gap_num+1;
run;


data temp7;
 merge temp3 want(keep=id group mean);
 by id group;
 col1=coalesce(col1,mean);
 keep id _name_ col1;
run;
proc transpose data=temp7 out=want2(drop=_name_);
by id ;
id _name_;
var col1;
run;

View solution in original post

ModeratelyWise

ID	Mo_1	Mo_2	Mo_3	Mo_4	Mo_5	Mo_6	Mo_7	Mo_8	Mo_9	Mo_10	Mo_11	Mo_12	Mo_13	Mo_14	Mo_15	Mo_16	Mo_17	Mo_18	Mo_19	Mo_20
a	-	-	3	3	3	3	3	3	3	3	3.2	3.3	3.35	3.4	3.3	3.2	3.2	3.2	-	-
b	0	0	0	0	0.2	0.2	0.2	0.2	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.25	0.3
c	-	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	-	-
d	0.3	0.3	0.3	0.3	0.3	0.3	0.3	0.3	0.45	0.45	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6	0.6
e	-	-	-	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2	2.2
f	4	3.5	3.5	3.5	3	3	3.1	3.2	3.2	3.2	3.25	3.25	3.25	3.3	3.4	4	3.6	3.6	3.6	3.2
g	-	-	-	-	-	-	-	-	-	0	0	0	0	0	0	0	0	0	0	0

ballardw

Please provide starting data in the form of a working data step pasted into a text box opened with the </> icon above the message window such as:

data have;
  input ID $ 	Mo_1 	Mo_2 	Mo_3 	Mo_4 	Mo_5 	Mo_6 	Mo_7 	Mo_8 	Mo_9 	Mo_10 	Mo_11 	Mo_12 	Mo_13 	Mo_14 	Mo_15 	Mo_16 	Mo_17 	Mo_18 	Mo_19 	Mo_20;
datalines;
a 	. 	. 	3 	3 	3 	3 	. 	. 	. 	3 	3.2 	3.3 	. 	3.4 	3.3 	3.2 	3.2 	3.2 	. 	.
b 	0 	0 	. 	0 	0.2 	0.2 	0.2 	0.2 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	0.3
c 	. 	2 	2 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	2 	. 	.
d 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	. 	. 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6
e 	. 	. 	. 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2
f 	4 	. 	. 	. 	3 	3 	. 	3.2 	. 	3.2 	. 	. 	. 	3.3 	3.4 	4 	. 	. 	. 	3.2
g 	. 	. 	. 	. 	. 	. 	. 	. 	. 	0 	0 	. 	0 	0 	0 	0 	0 	0 	0 	0
;

No need for the table with the "gap" information.

data want;
   set have;
   array m(*) mo:;
   do i= 1 to ( dim(m) - 1 );
      if not missing(m[i]) and missing(m[i+1]) and missing(gapstart) then gapstart= i+1;
      if missing(m[i]) and not missing(m[i+1]) and not missing(gapstart) then gapend = i;
      if gapstart and gapend then do;
         
         do k= gapstart to gapend;
            m[k] = mean(m[gapstart-1],m[gapend+1]);
         end;
         call missing (gapstart,gapend);
      end;

   end;
   drop i k gapstart gapend;
run;

The above identifies the start and ends of the gaps uses that information after it is determined to calculate the mean of the endpoints.

Ksharp

data have;
  input ID $ 	Mo_1 	Mo_2 	Mo_3 	Mo_4 	Mo_5 	Mo_6 	Mo_7 	Mo_8 	Mo_9 	Mo_10 	Mo_11 	Mo_12 	Mo_13 	Mo_14 	Mo_15 	Mo_16 	Mo_17 	Mo_18 	Mo_19 	Mo_20;
datalines;
a 	. 	. 	3 	3 	3 	3 	. 	. 	. 	3 	3.2 	3.3 	. 	3.4 	3.3 	3.2 	3.2 	3.2 	. 	.
b 	0 	0 	. 	0 	0.2 	0.2 	0.2 	0.2 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	0.3
c 	. 	2 	2 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	2 	. 	.
d 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	0.3 	. 	. 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6 	0.6
e 	. 	. 	. 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2 	2.2
f 	4 	. 	. 	. 	3 	3 	. 	3.2 	. 	3.2 	. 	. 	. 	3.3 	3.4 	4 	. 	. 	. 	3.2
g 	. 	. 	. 	. 	. 	. 	. 	. 	. 	0 	0 	. 	0 	0 	0 	0 	0 	0 	0 	0
;
proc transpose data=have out=temp;
by id;
var Mo_:;
run;
data temp2;
 set temp;
 missing=missing(col1);
run;
data temp3;
 set temp2;
 by id missing notsorted;
 if first.id then group=0;
 if first.missing then group+1;

 lag_col1=lag(col1);
 lag_name_=lag(_name_);
 if first.id then call missing(lag_col1,lag_name_);
run;
data temp4;
 merge temp3 temp3(keep=id _name_ col1 rename=(id=_id _name_=next_name_ col1=next_col1) firstobs=2);
 if id ne _id then call missing(next_name_,next_col1);
 drop _id;
run;
data temp5;
do gap_length=1 by 1 until(last.group);
 set temp4;
 by id group;
 if first.group then do;start_mo=lag_name_;start_val=lag_col1;end;
 if last.group then do;end_mo=next_name_;end_val=next_col1;end;
end;
if missing;
keep gap_length id group start_mo end_mo  start_val end_val;
run;
data temp6;
 set temp5;
 mean=mean(start_val,end_val);
 if not missing(start_val) and not missing(end_val);
run;
data want;
 set temp6;
 by id;
 if first.id then gap_num=0;
 gap_num+1;
run;


data temp7;
 merge temp3 want(keep=id group mean);
 by id group;
 col1=coalesce(col1,mean);
 keep id _name_ col1;
run;
proc transpose data=temp7 out=want2(drop=_name_);
by id ;
id _name_;
var col1;
run;

Fill in gaps in a monthly array

Re: Fill in gaps in a monthly array

Re: Fill in gaps in a monthly array

Re: Fill in gaps in a monthly array

Re: Fill in gaps in a monthly array

Fill in gaps in a monthly array

Re: Fill in gaps in a monthly array

Re: Fill in gaps in a monthly array

Re: Fill in gaps in a monthly array

Re: Fill in gaps in a monthly array

Ready to join fellow brilliant minds for the SAS Hackathon?

Click image to register for webinar

Classroom Training Available!