Solved: Re: Good way to summarise this data (counting and summing)

yellowyellowred · Posted 06-23-2022 01:27 AM

I have a data set that looks like this:

Person_ID	Plan_ID	Age_group	Type	Timepoint	Payment	Result
901	101	10to20	A	0	500	0
901	119	10to20	A	1	400	1
901	129	10to20	A	3	250	0
905	203	21to30	B	0	100	0
905	287	21to30	B	1	150	0
905	301	21to30	B	2	100	1
910	401	21to30	B	0	900	0
910	450	21to30	B	1	800	1
910	500	21to30	B	2	100	1
910	600	21to30	B	3	150	1
920	700	10to20	A	0	300	1
920	800	10to20	A	1	100	1

What I want is to output a table like this:

Age_group	Type	Time_period	Number of people	Proportion of people with result=1 before payment	Proportion of people with result = 1 after payment	Total marginal payment
10to20	A	0 to 1	2	50%	100%	800 (500 + 300)
21to30	B	0 to 1	2	0%	50%	1000 (100 + 900)
21to30	B	1 to 2	2	50%	100%	950 (800 + 150)
21to30	B	2 to 3	1	100%	100%	100 (the payment for plan_ID = 500)

So the logic is this:
Each Person_ID can have multiple plan_ID's, which show as a unique row. For each Person_ID, their Age_group and Type will remain the same for different Plan_ID's. For each subsequent Plan_ID, the timepoint increments by 1. The first timepoint for a Person_ID will always start at 0.

I am only looking at 1-step jumps, so from 0 to 1, 1 to 2, 2 to 3 etc, which is why the jump from 1 to 3 is excluded.

The "number of people" column in the result table is the count of how many unique Person_ID's there are with the same Type and Time_period. The Time_period variable e.g. "0 to 1" represents data for people who have a "0" timepoint and a "1" timepoint.

The "Proportion of people with result=1 before payment" column calculates the proportion of unique Person_ID's with "Result" = 1 at the beginning time point, e.g. for Time_period = "0 to 1", we look at the Timepoint = 0. Similarly for the following column.

The total marginal payment sums up the total payment received by the age_group with that particular Type and Time_period which would have caused any change in the proportion. E.g. The first calculation of 800 (500 + 300) is for the Time_period = "0 to 1" meaning we only sum the payments at timepoint = 0. Similarly, for Time_period = "1 to 2", we only look at payments at Timepoint = 1.

Can someone please show how I'd do this? I'm really bad at SAS. Hopefully I was clear in this exercise.

Note that the first four variables in the starting table can have a possibly infinite amount of values.

Here is code used to generate that starting table:

data person;
   input Person_ID $ Plan_ID $ Age_group $ Type $ Timepoint Payment Result;
   datalines;
901 101 10to20 A 0 500 0
901 119 10to20 A 1 400 1
901 129 10to20 A 3 250 0
905 203 21to30 B 0 100 0
905 287 21to30 B 1 150 0
905 301 21to30 B 2 100 1
910 401 21to30 B 0 900 0
910 450 21to30 B 1 800 1
910 500 21to30 B 2 100 1
920 700 10to20 A 0 300 1
920 800 10to20 A 1 100 1
;

Ksharp · Posted 06-24-2022 07:25 AM

OK. Maybe I understand what you mean now. Try this one :

data person;
   input Person_ID $ Plan_ID $ Age_group $ Type $ Timepoint Payment Result;
   datalines;
901 101 10to20 A 0 500 0
901 119 10to20 A 1 400 1
901 129 10to20 A 3 250 0
905 203 21to30 B 0 100 0
905 287 21to30 B 1 150 0
905 301 21to30 B 2 100 1
910 401 21to30 B 0 900 0
910 450 21to30 B 1 800 1
910 500 21to30 B 2 100 1
910 600 21to30 B 3 150 1
920 700 10to20 A 0 300 1
920 800 10to20 A 1 100 1
;

proc sql;
create table level as
select distinct Person_ID,Age_group , Type , Timepoint
 from person
  order by 1,2,3,4;
quit;
data level1;
 set level;
 lag_Timepoint=lag(Timepoint);
 if Person_ID=lag(Person_ID) and Age_group=lag(Age_group) and Type=lag(Type) and dif(Timepoint)=1 then do; 
   first=lag_Timepoint; last=Timepoint;output;
 end;
 drop lag_Timepoint Timepoint;
run;

proc sql;
create table level2 as
select distinct Age_group , Type , first,last
 from level1;
quit;

%macro report(Age_group=,Type=,first=,last=);
data key;
 set level1;
 if  Age_group="&Age_group." and Type="&Type."  and first=&first. and last=&last.;
run;
data key1;
 set key;
Timepoint=first;output;
Timepoint=last;output;
drop first last;
run;

data have;
if _n_=1 then do;
  if 0 then set key;
  declare hash h(dataset:'key1');
  h.definekey('Person_ID','Age_group','Type','Timepoint');
  h.definedone();
end;
 set person;
 if h.check()=0;
run;
proc sql;
create table temp as
select "&Age_group." as Age_group length=80,"&Type." as Type length=80,
 catx(' ',&first.,'to',&last.) as Time_period length=80,
 (select count(distinct Person_ID) from have) as n_people,
 (select count(distinct Person_ID) from have where Timepoint=&first. and result=1)/
 (select count(distinct Person_ID) from have where Timepoint=&first. ) as prop_before_pay format=percent8.2,
 (select count(distinct Person_ID) from have where Timepoint=&last. and result=1)/
 (select count(distinct Person_ID) from have where Timepoint=&last. ) as prop_after_pay format=percent8.2,
 (select sum(payment) from have where Timepoint=&first.) as Total_payment
 from have(obs=1);
quit;
proc append base=want data=temp force;run;
run;
%mend;




proc delete data=want;run;
data _null_;
 set level2;
 call execute(catt('%report(Age_group=',Age_group,',Type=',Type,',first=',first,',last=',last,')'));
run;

View solution in original post

Ksharp · Posted 06-23-2022 09:19 AM

data person;
   input Person_ID $ Plan_ID $ Age_group $ Type $ Timepoint Payment Result;
   datalines;
901 101 10to20 A 0 500 0
901 119 10to20 A 1 400 1
901 129 10to20 A 3 250 0
905 203 21to30 B 0 100 0
905 287 21to30 B 1 150 0
905 301 21to30 B 2 100 1
910 401 21to30 B 0 900 0
910 450 21to30 B 1 800 1
910 500 21to30 B 2 100 1
910	600 21to30 B 3 150 1
920 700 10to20 A 0 300 1
920 800 10to20 A 1 100 1
;

proc sql;
create table level as
select distinct Age_group , Type , Timepoint
 from person
  order by 1,2,3;
quit;
data level1;
 set level;
 lag_Timepoint=lag(Timepoint);
 if Age_group=lag(Age_group) and Type=lag(Type) and dif(Timepoint)=1 then do; 
   first=lag_Timepoint; last=Timepoint;output;
 end;
 drop lag_Timepoint Timepoint;
run;


%macro report(Age_group=,Type=,first=,last=);
data have;
 set person;
 if Age_group="&Age_group." and Type="&Type."  and Timepoint in (&first. &last.);
run;
proc sql;
create table temp as
select "&Age_group." as Age_group length=80,"&Type." as Type length=80,
 catx(' ',&first.,'to',&last.) as Time_period length=80,
 (select count(distinct Person_ID) from have) as n_people,
 (select count(distinct Person_ID) from have where Timepoint=&first. and result=1)/
 (select count(distinct Person_ID) from have where Timepoint=&first. ) as prop_before_pay format=percent8.2,
 (select count(distinct Person_ID) from have where Timepoint=&last. and result=1)/
 (select count(distinct Person_ID) from have where Timepoint=&last. ) as prop_after_pay format=percent8.2,
 (select sum(payment) from have where Timepoint=&first.) as Total_payment
 from have(obs=1);
quit;
proc append base=want data=temp force;run;
run;
%mend;




proc delete data=want;run;
data _null_;
 set level1;
 call execute(catt('%report(Age_group=',Age_group,',Type=',Type,',first=',first,',last=',last,')'));
run;

yellowyellowred · Posted 06-23-2022 12:10 PM

Hi @Ksharp , thanks so much for your help.
It almost worked for me, when I run your code exactly, the final entry (4th row) gives me:

Age_group	Type	Time_period	n_people	...	total_payment
...	...	...	...	...	...
21to30	B	2 to 3	2	...	200

but from the starting table, there is only one Person_ID (Person_ID = 910) with a Timepoint = 2 and Timepoint = 3, so for Time_period = "2 to 3", there is only n_people = 1, and also the total_payment is 100 (9th row in the starting table). I tried adjusting and debugging but I couldn't figure out what caused this double up

Ksharp · Posted 06-24-2022 07:25 AM

OK. Maybe I understand what you mean now. Try this one :

data person;
   input Person_ID $ Plan_ID $ Age_group $ Type $ Timepoint Payment Result;
   datalines;
901 101 10to20 A 0 500 0
901 119 10to20 A 1 400 1
901 129 10to20 A 3 250 0
905 203 21to30 B 0 100 0
905 287 21to30 B 1 150 0
905 301 21to30 B 2 100 1
910 401 21to30 B 0 900 0
910 450 21to30 B 1 800 1
910 500 21to30 B 2 100 1
910 600 21to30 B 3 150 1
920 700 10to20 A 0 300 1
920 800 10to20 A 1 100 1
;

proc sql;
create table level as
select distinct Person_ID,Age_group , Type , Timepoint
 from person
  order by 1,2,3,4;
quit;
data level1;
 set level;
 lag_Timepoint=lag(Timepoint);
 if Person_ID=lag(Person_ID) and Age_group=lag(Age_group) and Type=lag(Type) and dif(Timepoint)=1 then do; 
   first=lag_Timepoint; last=Timepoint;output;
 end;
 drop lag_Timepoint Timepoint;
run;

proc sql;
create table level2 as
select distinct Age_group , Type , first,last
 from level1;
quit;

%macro report(Age_group=,Type=,first=,last=);
data key;
 set level1;
 if  Age_group="&Age_group." and Type="&Type."  and first=&first. and last=&last.;
run;
data key1;
 set key;
Timepoint=first;output;
Timepoint=last;output;
drop first last;
run;

data have;
if _n_=1 then do;
  if 0 then set key;
  declare hash h(dataset:'key1');
  h.definekey('Person_ID','Age_group','Type','Timepoint');
  h.definedone();
end;
 set person;
 if h.check()=0;
run;
proc sql;
create table temp as
select "&Age_group." as Age_group length=80,"&Type." as Type length=80,
 catx(' ',&first.,'to',&last.) as Time_period length=80,
 (select count(distinct Person_ID) from have) as n_people,
 (select count(distinct Person_ID) from have where Timepoint=&first. and result=1)/
 (select count(distinct Person_ID) from have where Timepoint=&first. ) as prop_before_pay format=percent8.2,
 (select count(distinct Person_ID) from have where Timepoint=&last. and result=1)/
 (select count(distinct Person_ID) from have where Timepoint=&last. ) as prop_after_pay format=percent8.2,
 (select sum(payment) from have where Timepoint=&first.) as Total_payment
 from have(obs=1);
quit;
proc append base=want data=temp force;run;
run;
%mend;




proc delete data=want;run;
data _null_;
 set level2;
 call execute(catt('%report(Age_group=',Age_group,',Type=',Type,',first=',first,',last=',last,')'));
run;

Good way to summarise this data (counting and summing)

Re: Good way to summarise this data (counting and summing)

Re: Good way to summarise this data (counting and summing)

Re: Good way to summarise this data (counting and summing)

Re: Good way to summarise this data (counting and summing)

SAS Innovate 2025: Save the Date

SAS Training: Just a Click Away