I have a very wide and very long table of monthly risk scores for clients that looks something like this:
ID | Mo_1 | Mo_2 | Mo_3 | Mo_4 | Mo_5 | Mo_6 | Mo_7 | Mo_8 | Mo_9 | Mo_10 | Mo_11 | Mo_12 | Mo_13 | Mo_14 | Mo_15 | Mo_16 | Mo_17 | Mo_18 | Mo_19 | Mo_20 |
a | - | - | 3 | 3 | 3 | 3 | - | - | - | 3 | 3.2 | 3.3 | - | 3.4 | 3.3 | 3.2 | 3.2 | 3.2 | - | - |
b | 0 | 0 | - | 0 | 0.2 | 0.2 | 0.2 | 0.2 | - | - | - | - | - | - | - | - | - | - | - | 0.3 |
c | - | 2 | 2 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | 2 | - | - |
d | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | - | - | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 |
e | - | - | - | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 |
f | 4 | - | - | - | 3 | 3 | - | 3.2 | - | 3.2 | - | - | - | 3.3 | 3.4 | 4 | - | - | - | 3.2 |
g | - | - | - | - | - | - | - | - | - | 0 | 0 | - | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
With the first column being the client's ID and the subsequent columns being their risk score for that given month. The table above is much smaller than the actual one I am working with obviously. There are many more months and clients in the real data set.
I want to do two things with these data. The first is to create a table that summarizes any gaps between months that have a value. I want it to look something like this:
ID | Gap_Num | Gap_length | Start_Mo | End_Mo | Start_Val | End_Val | Mean |
a | 1 | 3 | Mo_6 | Mo_10 | 3 | 3 | 3 |
a | 2 | 1 | Mo_12 | Mo_14 | 3.3 | 3.4 | 3.35 |
b | 1 | 1 | Mo_2 | Mo_4 | 0 | 0 | 0 |
b | 2 | 11 | Mo_8 | Mo_19 | 0.2 | 0.3 | 0.25 |
c | 1 | 14 | Mo_3 | Mo_18 | 2 | 2 | 2 |
d | 1 | 2 | Mo_8 | Mo_11 | 0.3 | 0.6 | 0.45 |
f | 1 | 3 | Mo_1 | Mo_5 | 4 | 3 | 3.5 |
f | 2 | 1 | Mo_6 | Mo_8 | 3 | 3.2 | 3.1 |
f | 3 | 1 | Mo_8 | Mo_10 | 3.2 | 3.2 | 3.2 |
f | 4 | 3 | Mo_10 | Mo_14 | 3.2 | 3.3 | 3.25 |
f | 5 | 3 | Mo_16 | Mo_20 | 4 | 3.2 | 3.6 |
g | 1 | 1 | Mo_11 | Mo_13 | 0 | 0 | 0 |
Where each gap is listed for each client along with the size of the gap, the month before the gap began, the month after the gap ended, the risk scores for those months and the average of those risk scores.
The second thing would be to go in and fill the gaps with mean risk scores calculated above. The table below shows the filled in gaps in red font.
ID | Mo_1 | Mo_2 | Mo_3 | Mo_4 | Mo_5 | Mo_6 | Mo_7 | Mo_8 | Mo_9 | Mo_10 | Mo_11 | Mo_12 | Mo_13 | Mo_14 | Mo_15 | Mo_16 | Mo_17 | Mo_18 | Mo_19 | Mo_20 |
a | - | - | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3.2 | 3.3 | 3.35 | 3.4 | 3.3 | 3.2 | 3.2 | 3.2 | - | - |
b | 0 | 0 | 0 | 0 | 0.2 | 0.2 | 0.2 | 0.2 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.3 |
c | - | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | - | - |
d | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.45 | 0.45 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 |
e | - | - | - | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 |
f | 4 | 3.5 | 3.5 | 3.5 | 3 | 3 | 3.1 | 3.2 | 3.2 | 3.2 | 3.25 | 3.25 | 3.25 | 3.3 | 3.4 | 4 | 3.6 | 3.6 | 3.6 | 3.2 |
g | - | - | - | - | - | - | - | - | - | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Thanks in advance. Any help would be appreciated.
Best,
Marc
data have;
input ID $ Mo_1 Mo_2 Mo_3 Mo_4 Mo_5 Mo_6 Mo_7 Mo_8 Mo_9 Mo_10 Mo_11 Mo_12 Mo_13 Mo_14 Mo_15 Mo_16 Mo_17 Mo_18 Mo_19 Mo_20;
datalines;
a . . 3 3 3 3 . . . 3 3.2 3.3 . 3.4 3.3 3.2 3.2 3.2 . .
b 0 0 . 0 0.2 0.2 0.2 0.2 . . . . . . . . . . . 0.3
c . 2 2 . . . . . . . . . . . . . . 2 . .
d 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 . . 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6
e . . . 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2
f 4 . . . 3 3 . 3.2 . 3.2 . . . 3.3 3.4 4 . . . 3.2
g . . . . . . . . . 0 0 . 0 0 0 0 0 0 0 0
;
proc transpose data=have out=temp;
by id;
var Mo_:;
run;
data temp2;
set temp;
missing=missing(col1);
run;
data temp3;
set temp2;
by id missing notsorted;
if first.id then group=0;
if first.missing then group+1;
lag_col1=lag(col1);
lag_name_=lag(_name_);
if first.id then call missing(lag_col1,lag_name_);
run;
data temp4;
merge temp3 temp3(keep=id _name_ col1 rename=(id=_id _name_=next_name_ col1=next_col1) firstobs=2);
if id ne _id then call missing(next_name_,next_col1);
drop _id;
run;
data temp5;
do gap_length=1 by 1 until(last.group);
set temp4;
by id group;
if first.group then do;start_mo=lag_name_;start_val=lag_col1;end;
if last.group then do;end_mo=next_name_;end_val=next_col1;end;
end;
if missing;
keep gap_length id group start_mo end_mo start_val end_val;
run;
data temp6;
set temp5;
mean=mean(start_val,end_val);
if not missing(start_val) and not missing(end_val);
run;
data want;
set temp6;
by id;
if first.id then gap_num=0;
gap_num+1;
run;
data temp7;
merge temp3 want(keep=id group mean);
by id group;
col1=coalesce(col1,mean);
keep id _name_ col1;
run;
proc transpose data=temp7 out=want2(drop=_name_);
by id ;
id _name_;
var col1;
run;
ID | Mo_1 | Mo_2 | Mo_3 | Mo_4 | Mo_5 | Mo_6 | Mo_7 | Mo_8 | Mo_9 | Mo_10 | Mo_11 | Mo_12 | Mo_13 | Mo_14 | Mo_15 | Mo_16 | Mo_17 | Mo_18 | Mo_19 | Mo_20 |
a | - | - | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3.2 | 3.3 | 3.35 | 3.4 | 3.3 | 3.2 | 3.2 | 3.2 | - | - |
b | 0 | 0 | 0 | 0 | 0.2 | 0.2 | 0.2 | 0.2 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.25 | 0.3 |
c | - | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | - | - |
d | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.45 | 0.45 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 | 0.6 |
e | - | - | - | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 | 2.2 |
f | 4 | 3.5 | 3.5 | 3.5 | 3 | 3 | 3.1 | 3.2 | 3.2 | 3.2 | 3.25 | 3.25 | 3.25 | 3.3 | 3.4 | 4 | 3.6 | 3.6 | 3.6 | 3.2 |
g | - | - | - | - | - | - | - | - | - | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Please provide starting data in the form of a working data step pasted into a text box opened with the </> icon above the message window such as:
data have; input ID $ Mo_1 Mo_2 Mo_3 Mo_4 Mo_5 Mo_6 Mo_7 Mo_8 Mo_9 Mo_10 Mo_11 Mo_12 Mo_13 Mo_14 Mo_15 Mo_16 Mo_17 Mo_18 Mo_19 Mo_20; datalines; a . . 3 3 3 3 . . . 3 3.2 3.3 . 3.4 3.3 3.2 3.2 3.2 . . b 0 0 . 0 0.2 0.2 0.2 0.2 . . . . . . . . . . . 0.3 c . 2 2 . . . . . . . . . . . . . . 2 . . d 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 . . 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 e . . . 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 f 4 . . . 3 3 . 3.2 . 3.2 . . . 3.3 3.4 4 . . . 3.2 g . . . . . . . . . 0 0 . 0 0 0 0 0 0 0 0 ;
No need for the table with the "gap" information.
data want; set have; array m(*) mo:; do i= 1 to ( dim(m) - 1 ); if not missing(m[i]) and missing(m[i+1]) and missing(gapstart) then gapstart= i+1; if missing(m[i]) and not missing(m[i+1]) and not missing(gapstart) then gapend = i; if gapstart and gapend then do; do k= gapstart to gapend; m[k] = mean(m[gapstart-1],m[gapend+1]); end; call missing (gapstart,gapend); end; end; drop i k gapstart gapend; run;
The above identifies the start and ends of the gaps uses that information after it is determined to calculate the mean of the endpoints.
data have;
input ID $ Mo_1 Mo_2 Mo_3 Mo_4 Mo_5 Mo_6 Mo_7 Mo_8 Mo_9 Mo_10 Mo_11 Mo_12 Mo_13 Mo_14 Mo_15 Mo_16 Mo_17 Mo_18 Mo_19 Mo_20;
datalines;
a . . 3 3 3 3 . . . 3 3.2 3.3 . 3.4 3.3 3.2 3.2 3.2 . .
b 0 0 . 0 0.2 0.2 0.2 0.2 . . . . . . . . . . . 0.3
c . 2 2 . . . . . . . . . . . . . . 2 . .
d 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 . . 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6
e . . . 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2 2.2
f 4 . . . 3 3 . 3.2 . 3.2 . . . 3.3 3.4 4 . . . 3.2
g . . . . . . . . . 0 0 . 0 0 0 0 0 0 0 0
;
proc transpose data=have out=temp;
by id;
var Mo_:;
run;
data temp2;
set temp;
missing=missing(col1);
run;
data temp3;
set temp2;
by id missing notsorted;
if first.id then group=0;
if first.missing then group+1;
lag_col1=lag(col1);
lag_name_=lag(_name_);
if first.id then call missing(lag_col1,lag_name_);
run;
data temp4;
merge temp3 temp3(keep=id _name_ col1 rename=(id=_id _name_=next_name_ col1=next_col1) firstobs=2);
if id ne _id then call missing(next_name_,next_col1);
drop _id;
run;
data temp5;
do gap_length=1 by 1 until(last.group);
set temp4;
by id group;
if first.group then do;start_mo=lag_name_;start_val=lag_col1;end;
if last.group then do;end_mo=next_name_;end_val=next_col1;end;
end;
if missing;
keep gap_length id group start_mo end_mo start_val end_val;
run;
data temp6;
set temp5;
mean=mean(start_val,end_val);
if not missing(start_val) and not missing(end_val);
run;
data want;
set temp6;
by id;
if first.id then gap_num=0;
gap_num+1;
run;
data temp7;
merge temp3 want(keep=id group mean);
by id group;
col1=coalesce(col1,mean);
keep id _name_ col1;
run;
proc transpose data=temp7 out=want2(drop=_name_);
by id ;
id _name_;
var col1;
run;
Build your skills. Make connections. Enjoy creative freedom. Maybe change the world. Registration is now open through August 30th. Visit the SAS Hackathon homepage.
Register today!Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Select SAS Training centers are offering in-person courses. View upcoming courses for: