Hello @mhoward2,
@mhoward2 wrote:
The problem is the code is ugly and I feel like there should be a better way to do this. Here is the code:
PROC SQL;
CREATE TABLE TMP1DAY.MONTHLY_SUMMARY AS
SELECT t1.ID_VARIABLE,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201901" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201902", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201901,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201902" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201903", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201902,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201903" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201904", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201903,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201904" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201905", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201904,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201905" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201906", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201905,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201906" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201907", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201906,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201907" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201908", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201907,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201908" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201909", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201908,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201909" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201910", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201909,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201910" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201911", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201910,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201911" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201912", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201911,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201912" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "202001", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201912_01,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201912" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "202002", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201912_02,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202001" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202002", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202001,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202002" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202003", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202002,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202003" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202004", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202003,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202004" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202005", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202004,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202005" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202006", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202005,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202006" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202007", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202006,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202007" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202008", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202007,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202008" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202009", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202008,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202009" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202010", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202009,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202010" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202011", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202010,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202011" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202012", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202011,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202012" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202101", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202012_01,
SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202012" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202102", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202012_02
FROM WORK.PRE_MONTHLY_SUMMARY t1
GROUP BY 1;
QUIT;
You could write a macro to shorten the code and to allow for arbitrary ranges of years:
%macro sumcode(start_yr, final_yr);
%local y y2 m m2 zm zm2;
%do y=&start_yr %to &final_yr;
%let y2=%eval(&y+1);
%do m=1 %to 11;
%let m2=%eval(&m+1);
%let zm=%sysfunc(putn(&m,z2.));
%let zm2=%sysfunc(putn(&m2,z2.));
, SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y&zm" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y&zm2", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y&zm
%end;
, SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y.12" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y2.01", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y.12_01
, SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y.12" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y2.02", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y.12_02
%end;
%mend sumcode;
Then your PROC SQL step, extended to, say, years 2018 - 2023, would look like this:
PROC SQL;
CREATE TABLE TMP1DAY.MONTHLY_SUMMARY AS
SELECT t1.ID_VARIABLE
%sumcode(2018,2023)
FROM WORK.PRE_MONTHLY_SUMMARY t1
GROUP BY 1;
QUIT;
It turned out that this PROC SQL step is relatively fast. I have tested it on the sample dataset (with approx. 550,000 observations) created below:
/* Create sample data for demonstration */
data pre_monthly_summary;
call streaminit(27182818);
do id_variable=1 to 100000;
d=intnx('month','01JAN2018'd,rand('integer',0,40));
do _n_=1 to rand('integer',10);
first_yyyymm=put(d,yymmn.);
second_yyyymm=put(intnx('month',d,rand('integer',0,9)),yymmn.);
sum_column=5*rand('integer',0,30);
output;
d=intnx('month',d,rand('integer',0,9));
end;
end;
drop d;
run;
%let first_yr=2018;
%let final_yr=2023;
Feel free to modify this code to make it more realistic.
I have also tried to implement a different approach, inspired by Reeza's ideas, but with the sample data above the final PROC TRANSPOSE step alone is much slower than your PROC SQL step (on my old workstation), likely due to my poor implementation. Here is the code anyway. At least it reproduces the results of your PROC SQL step for the sample data. Maybe you can use it for validation purposes.
/* Aggregation preparing subsequent calculation */
proc sql;
create table agg(drop=bd) as
select id_variable, input(first_yyyymm,4.) as yr,
mdy(1,1,calculated yr) as bd,
max(intck('month',calculated bd,input(first_yyyymm,yymmn6.)),intck('month',calculated bd,input(second_yyyymm,yymmn6.))-1) as md,
sum(sum_column) as s
from pre_monthly_summary
where 0<=calculated md<=12
group by 1,2,4;
quit;
/* Compute cumulative sums of SUM_COLUMN */
data cumul(drop=s);
set agg;
by id_variable yr;
where &first_yr<=yr<=&final_yr;
if first.yr then c=s;
else c+s;
run;
/* Create template with all combinations of IDs and month differences */
data all_dates;
do yr=&first_yr to &final_yr;
do md=0 to 12;
output;
end;
end;
run;
proc sql;
create view all_comb as
select distinct id_variable, yr, md
from pre_monthly_summary, all_dates;
quit;
/* Insert missing months into CUMUL and reformat grouping variable */
data cumul2;
merge cumul
all_comb;
by id_variable yr md;
length m $9;
if md=11 then m=cat(yr,'12_01');
else if md=12 then m=cat(yr,'12_02');
else m=cat(yr,put(md+1,z2.));
drop md;
run;
/* Impute missing values of the cumulative sums */
data cumul2LOCF;
update cumul2(obs=0) cumul2;
by id_variable yr;
if c=. then c=0;
output;
run;
/* Create final wide dataset */
proc transpose data=cumul2LOCF out=want(drop=_:) prefix=sum_column_;
by id_variable;
id m;
var c;
run;
... View more