BookmarkSubscribeRSS Feed
☑ This topic is solved. Need further help from the community? Please sign in and ask a new question.
mhoward2
Obsidian | Level 7

Hi, I have some code that sums up monthly values and assigns each month as a new column. The problem is the code is ugly and I feel like there should be a better way to do this. Here is the code:

 

PROC SQL;
	CREATE TABLE TMP1DAY.MONTHLY_SUMMARY AS
	SELECT t1.ID_VARIABLE,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201901" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201902", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201901,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201902" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201903", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201902,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201903" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201904", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201903,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201904" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201905", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201904,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201905" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201906", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201905,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201906" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201907", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201906,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201907" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201908", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201907,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201908" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201909", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201908,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201909" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201910", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201909,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201910" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201911", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201910,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201911" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201912", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201911,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201912" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "202001", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201912_01,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201912" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "202002", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201912_02,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202001" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202002", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202001,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202002" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202003", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202002,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202003" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202004", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202003,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202004" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202005", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202004,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202005" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202006", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202005,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202006" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202007", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202006,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202007" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202008", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202007,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202008" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202009", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202008,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202009" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202010", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202009,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202010" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202011", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202010,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202011" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202012", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202011,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202012" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202101", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202012_01,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202012" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202102", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202012_02
	FROM WORK.PRE_MONTHLY_SUMMARY t1
	GROUP BY 1;
QUIT;

How would I instead write this to do the same thing but programmatically/with macro variables? its already out of hand and I will need to add more years which will just make it even worse. Any and all advice is appreciated.

 

Thanks in advance!

1 ACCEPTED SOLUTION

Accepted Solutions
FreelanceReinh
Jade | Level 19

Hello @mhoward2,

 


@mhoward2 wrote:

The problem is the code is ugly and I feel like there should be a better way to do this. Here is the code:

PROC SQL;
	CREATE TABLE TMP1DAY.MONTHLY_SUMMARY AS
	SELECT t1.ID_VARIABLE,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201901" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201902", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201901,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201902" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201903", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201902,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201903" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201904", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201903,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201904" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201905", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201904,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201905" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201906", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201905,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201906" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201907", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201906,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201907" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201908", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201907,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201908" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201909", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201908,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201909" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201910", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201909,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201910" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201911", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201910,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201911" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201912", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201911,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201912" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "202001", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201912_01,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201912" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "202002", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201912_02,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202001" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202002", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202001,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202002" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202003", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202002,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202003" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202004", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202003,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202004" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202005", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202004,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202005" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202006", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202005,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202006" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202007", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202006,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202007" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202008", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202007,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202008" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202009", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202008,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202009" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202010", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202009,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202010" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202011", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202010,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202011" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202012", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202011,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202012" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202101", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202012_01,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202012" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202102", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202012_02
	FROM WORK.PRE_MONTHLY_SUMMARY t1
	GROUP BY 1;
QUIT;

You could write a macro to shorten the code and to allow for arbitrary ranges of years:

%macro sumcode(start_yr, final_yr);
%local y y2 m m2 zm zm2;
%do y=&start_yr %to &final_yr;
  %let y2=%eval(&y+1);
  %do m=1 %to 11;
    %let m2=%eval(&m+1);
    %let zm=%sysfunc(putn(&m,z2.));
    %let zm2=%sysfunc(putn(&m2,z2.));
    , SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y&zm" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y&zm2", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y&zm
  %end;
    , SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y.12" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y2.01", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y.12_01
    , SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y.12" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y2.02", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y.12_02
%end;
%mend sumcode;

 

Then your PROC SQL step, extended to, say, years 2018 - 2023, would look like this:

PROC SQL;
    CREATE TABLE TMP1DAY.MONTHLY_SUMMARY AS
    SELECT t1.ID_VARIABLE
           %sumcode(2018,2023)
    FROM WORK.PRE_MONTHLY_SUMMARY t1
    GROUP BY 1;
QUIT;

It turned out that this PROC SQL step is relatively fast. I have tested it on the sample dataset (with approx. 550,000 observations) created below:

/* Create sample data for demonstration */

data pre_monthly_summary;
call streaminit(27182818);
do id_variable=1 to 100000;
  d=intnx('month','01JAN2018'd,rand('integer',0,40));
  do _n_=1 to rand('integer',10);
    first_yyyymm=put(d,yymmn.);
    second_yyyymm=put(intnx('month',d,rand('integer',0,9)),yymmn.);
    sum_column=5*rand('integer',0,30);
    output;
    d=intnx('month',d,rand('integer',0,9));
  end;
end;
drop d;
run;

%let first_yr=2018;
%let final_yr=2023;

Feel free to modify this code to make it more realistic.

 

I have also tried to implement a different approach, inspired by Reeza's ideas, but with the sample data above the final PROC TRANSPOSE step alone is much slower than your PROC SQL step (on my old workstation), likely due to my poor implementation. Here is the code anyway. At least it reproduces the results of your PROC SQL step for the sample data. Maybe you can use it for validation purposes.

Spoiler
/* Aggregation preparing subsequent calculation */

proc sql;
create table agg(drop=bd) as
select id_variable, input(first_yyyymm,4.) as yr,
       mdy(1,1,calculated yr) as bd,
       max(intck('month',calculated bd,input(first_yyyymm,yymmn6.)),intck('month',calculated bd,input(second_yyyymm,yymmn6.))-1) as md,
       sum(sum_column) as s
from pre_monthly_summary
where 0<=calculated md<=12
group by 1,2,4;
quit;

/* Compute cumulative sums of SUM_COLUMN */

data cumul(drop=s);
set agg;
by id_variable yr;
where &first_yr<=yr<=&final_yr;
if first.yr then c=s;
else c+s;
run;

/* Create template with all combinations of IDs and month differences */

data all_dates;
do yr=&first_yr to &final_yr;
  do md=0 to 12;
    output;
  end;
end;
run;

proc sql;
create view all_comb as
select distinct id_variable, yr, md
from pre_monthly_summary, all_dates;
quit;

/* Insert missing months into CUMUL and reformat grouping variable */

data cumul2;
merge cumul
      all_comb;
by id_variable yr md;
length m $9;
if md=11 then m=cat(yr,'12_01');
else if md=12 then m=cat(yr,'12_02');
else m=cat(yr,put(md+1,z2.));
drop md;
run;

/* Impute missing values of the cumulative sums */

data cumul2LOCF;
update cumul2(obs=0) cumul2;
by id_variable yr;
if c=. then c=0;
output;
run;

/* Create final wide dataset */

proc transpose data=cumul2LOCF out=want(drop=_:) prefix=sum_column_;
by id_variable;
id m;
var c;
run;

 

 

View solution in original post

12 REPLIES 12
ballardw
Super User

 

Can you provide some dummy example data that behaves like your data set?

And provide the rules involved. Some of this appears to be a cumulative total for a calendar year and then you throw in SUM_COLUMN_201912_01 and similar. Are those two the only exceptions of the cumulative year?

 

Without seeing an example data set I'm not sure I want to make suggestions.

 

Instructions here: https://communities.sas.com/t5/SAS-Communities-Library/How-to-create-a-data-step-version-of-your-dat... will show how to turn an existing SAS data set into data step code that can be pasted into a forum code box using the </> icon or attached as text to show exactly what you have and that we can test code against. Does not have to actual data, just similar variables and behave the same.

 

mhoward2
Obsidian | Level 7

Hi, thank you for responding, my source data looks like this below. I understand the output of each being its own column is a little weird, but I have been told that is how it must be for now.

 

ID_VARIABLE FIRST_YYYYMM SECOND_YYYYMM SUM_COLUMN
1 201905 201905 15
2 201901 201901 0
2 201901 201902 0
2 201902 201902 20
2 201904 201904 0
2 201907 201907 0
2 201910 201910 0
3 201901 201901 70
3 201908 201908 0
3 201910 201912 30
3 201911 201911 110

 

As for your other question, yes there will always be 2 "full year" periods where the FIRST_YYYYMM is between 01 and 12 and the SECOND_YYYYMM is 01 and 02 of the next year.

 

I am still looking to see how I would accomplish this easily with less lines of code in the select statement. Any and all help would be appreciated, thank you!

Patrick
Opal | Level 21
So it also must be proc sql and can’t be proc means? WHY?
mhoward2
Obsidian | Level 7
if there is a way to accomplish this without proc sql, im all ears! Thanks!
Quentin
Super User

Can you show an example of the data in PRE_MONTHLY_SUMMARY, and maybe if there is an earlier table with unsummarized data as well?  And also describe why you're calculating all these variables?  What will you do next after you've got all these variables calculated?

 

One challenge is you're creating a bunch of monthly variables.  Almost always you are better off with a more vertical structure where you have a single column for date (or month, or whatever).  And each month would get a row.

 

While you could use the macro language to generate the code you showed, I think you'll probably be better off restructuring your data to a more vertical (i.e. normalized) format.

BASUG is hosting free webinars Next up: Mark Keintz presenting History Carried Forward, Future Carried Back: Mixing Time Series of Differing Frequencies on May 8. Register now at the Boston Area SAS Users Group event page: https://www.basug.org/events.
PaigeMiller
Diamond | Level 26

Certainly a macro could be used, but as stated by @ballardw we would need to know the exact rules.

 

My first thought was to re-arrange the data as suggested by @Quentin , in general I think long data sets are much easier to program than wide data sets; and use something like PROC SUMMARY or PROC REPORT. But because this problem need to know dates of two variables to determine if the value in t1.sum_column gets summed or not, I haven't been able to think of a way to re-arrange the data and then use PROC SUMMARY or PROC REPORT.

--
Paige Miller
mhoward2
Obsidian | Level 7
I agree the output data structure is not ideal, but I have been told it needs to be that way. I have posted an example of the source data above in my response to ballardw. Any and all help is appreciated!
mhoward2
Obsidian | Level 7
I agree the output data structure is not ideal, but I have been told it needs to be that way. I have posted an example of the source data above in my response to ballardw. Any and all help is appreciated! Thank you!
Tom
Super User Tom
Super User

Data seems strange. 

Why are your "month" variables character strings?

Why are you filtering on both "month" variables?  What cases are you excluding by doing take?  What would a case with the first month value as 201901 but the second month value as 201903 mean?  Where would that be counted?

 

Why not just make a multilevel format and 

proc format ;
value $monthly (multilabel)
  "201901"-"201901" = "201901"
  "201901"-"201902" = "201902"
  "201901"-"201903" = "201903"
  "201901"-"201903" = "201904"
...
 ;
run;

You can fill in the rest. Or use a data step to generate an CNTLIN dataset to pass to PROC FORMAT to make the format.

Then  use PROC SUMMARY to get the totals.  You can post process the results to eliminate the combinations you don't want.

proc summary data=PRE_MONTHLY_SUMMARY nway ;
  class FIRST_YYYYMM SECOND_YYYYMM / mlf ;
  format FIRST_YYYYMM SECOND_YYYYMM $monthly. ;
  var sum_column ;
  output out=want sum=;
run;

data want;
  set want;
  if put(first_yyyymm,$monthly.)=put(second_yyyymm,$monthly.);
run;

 

Something like:

 

Reeza
Super User
Running totals are much easier in a data step, and likely a two step data step where you first summarize by month and then calculate the running total with a transpose.

Likely the same amount of code AND dynamic once you expand to more years.
FreelanceReinh
Jade | Level 19

Hello @mhoward2,

 


@mhoward2 wrote:

The problem is the code is ugly and I feel like there should be a better way to do this. Here is the code:

PROC SQL;
	CREATE TABLE TMP1DAY.MONTHLY_SUMMARY AS
	SELECT t1.ID_VARIABLE,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201901" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201902", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201901,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201902" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201903", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201902,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201903" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201904", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201903,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201904" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201905", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201904,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201905" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201906", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201905,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201906" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201907", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201906,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201907" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201908", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201907,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201908" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201909", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201908,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201909" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201910", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201909,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201910" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201911", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201910,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201911" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "201912", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201911,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201912" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "202001", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201912_01,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "201901" AND "201912" AND t1.SECOND_YYYYMM BETWEEN "201901" AND "202002", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_201912_02,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202001" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202002", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202001,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202002" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202003", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202002,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202003" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202004", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202003,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202004" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202005", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202004,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202005" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202006", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202005,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202006" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202007", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202006,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202007" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202008", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202007,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202008" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202009", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202008,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202009" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202010", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202009,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202010" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202011", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202010,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202011" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202012", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202011,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202012" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202101", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202012_01,
		   SUM(IFN(t1.FIRST_YYYYMM BETWEEN "202001" AND "202012" AND t1.SECOND_YYYYMM BETWEEN "202001" AND "202102", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_202012_02
	FROM WORK.PRE_MONTHLY_SUMMARY t1
	GROUP BY 1;
QUIT;

You could write a macro to shorten the code and to allow for arbitrary ranges of years:

%macro sumcode(start_yr, final_yr);
%local y y2 m m2 zm zm2;
%do y=&start_yr %to &final_yr;
  %let y2=%eval(&y+1);
  %do m=1 %to 11;
    %let m2=%eval(&m+1);
    %let zm=%sysfunc(putn(&m,z2.));
    %let zm2=%sysfunc(putn(&m2,z2.));
    , SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y&zm" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y&zm2", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y&zm
  %end;
    , SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y.12" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y2.01", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y.12_01
    , SUM(IFN(t1.FIRST_YYYYMM BETWEEN "&y.01" AND "&y.12" AND t1.SECOND_YYYYMM BETWEEN "&y.01" AND "&y2.02", t1.SUM_COLUMN, 0)) AS SUM_COLUMN_&y.12_02
%end;
%mend sumcode;

 

Then your PROC SQL step, extended to, say, years 2018 - 2023, would look like this:

PROC SQL;
    CREATE TABLE TMP1DAY.MONTHLY_SUMMARY AS
    SELECT t1.ID_VARIABLE
           %sumcode(2018,2023)
    FROM WORK.PRE_MONTHLY_SUMMARY t1
    GROUP BY 1;
QUIT;

It turned out that this PROC SQL step is relatively fast. I have tested it on the sample dataset (with approx. 550,000 observations) created below:

/* Create sample data for demonstration */

data pre_monthly_summary;
call streaminit(27182818);
do id_variable=1 to 100000;
  d=intnx('month','01JAN2018'd,rand('integer',0,40));
  do _n_=1 to rand('integer',10);
    first_yyyymm=put(d,yymmn.);
    second_yyyymm=put(intnx('month',d,rand('integer',0,9)),yymmn.);
    sum_column=5*rand('integer',0,30);
    output;
    d=intnx('month',d,rand('integer',0,9));
  end;
end;
drop d;
run;

%let first_yr=2018;
%let final_yr=2023;

Feel free to modify this code to make it more realistic.

 

I have also tried to implement a different approach, inspired by Reeza's ideas, but with the sample data above the final PROC TRANSPOSE step alone is much slower than your PROC SQL step (on my old workstation), likely due to my poor implementation. Here is the code anyway. At least it reproduces the results of your PROC SQL step for the sample data. Maybe you can use it for validation purposes.

Spoiler
/* Aggregation preparing subsequent calculation */

proc sql;
create table agg(drop=bd) as
select id_variable, input(first_yyyymm,4.) as yr,
       mdy(1,1,calculated yr) as bd,
       max(intck('month',calculated bd,input(first_yyyymm,yymmn6.)),intck('month',calculated bd,input(second_yyyymm,yymmn6.))-1) as md,
       sum(sum_column) as s
from pre_monthly_summary
where 0<=calculated md<=12
group by 1,2,4;
quit;

/* Compute cumulative sums of SUM_COLUMN */

data cumul(drop=s);
set agg;
by id_variable yr;
where &first_yr<=yr<=&final_yr;
if first.yr then c=s;
else c+s;
run;

/* Create template with all combinations of IDs and month differences */

data all_dates;
do yr=&first_yr to &final_yr;
  do md=0 to 12;
    output;
  end;
end;
run;

proc sql;
create view all_comb as
select distinct id_variable, yr, md
from pre_monthly_summary, all_dates;
quit;

/* Insert missing months into CUMUL and reformat grouping variable */

data cumul2;
merge cumul
      all_comb;
by id_variable yr md;
length m $9;
if md=11 then m=cat(yr,'12_01');
else if md=12 then m=cat(yr,'12_02');
else m=cat(yr,put(md+1,z2.));
drop md;
run;

/* Impute missing values of the cumulative sums */

data cumul2LOCF;
update cumul2(obs=0) cumul2;
by id_variable yr;
if c=. then c=0;
output;
run;

/* Create final wide dataset */

proc transpose data=cumul2LOCF out=want(drop=_:) prefix=sum_column_;
by id_variable;
id m;
var c;
run;

 

 

mhoward2
Obsidian | Level 7
This is perfect! Thank you so much, I would not have known how to do this, your work is very much appreciated!!!

sas-innovate-2024.png

Available on demand!

Missed SAS Innovate Las Vegas? Watch all the action for free! View the keynotes, general sessions and 22 breakouts on demand.

 

Register now!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

Click image to register for webinarClick image to register for webinar

Classroom Training Available!

Select SAS Training centers are offering in-person courses. View upcoming courses for:

View all other training opportunities.

Discussion stats
  • 12 replies
  • 542 views
  • 2 likes
  • 8 in conversation