Hello,
I want to add a new column to data set "ttt1" that is called "Sum_Z_date1_till_date2".
In this new column need to calculate sum of Z between date 1 and date 2.
What is the way to do it please?
Expected results is : for ID=1 get 50 and for ID=2 get 120
data ttt1;
format date1 date2 date9.;
input ID date1 :date9. date2 :date9.;
cards;
1 '01MAR2021'd '05MAR2021'd
2 '08MAR2021'd '15MAR2021'd
;
Run;
data ttt2;
input ID date3 :date9. Z;
cards;
1 '28FEB2021'd 10
1 '02MAR2021'd 20
1 '04MAR2021'd 30
1 '06MAR2021'd 10
2 '08MAR2021'd 50
2 '14MAR2021'd 70
2 '18MAR2021'd 80
;
Run;
I would move the date condition into the ON of the LEFT JOIN:
proc sql;
create table want as
select
t1.id,
t1.date1,
t1.date2,
sum(t2.z) as sum_z
from ttt1 t1 left join ttt2 t2
on t1.id=t2.id and t1.date1 le t2.date3 le t1.date2
group by t1.id, t1.date1, t1.date2
;
quit;
data ttt1;
format date1 date2 date9.;
input ID date1 :date9. date2 :date9.;
cards;
1 01MAR2021 05MAR2021
2 08MAR2021 15MAR2021
;
Run;
data ttt2;
input ID date3 :date9. Z;
cards;
1 28FEB2021 10
1 02MAR2021 20
1 04MAR2021 30
1 06MAR2021 10
2 08MAR2021 50
2 14MAR2021 70
2 18MAR2021 80
;
Run;
proc sql;
create table ttt3 as select
a.*,sum(b.z*(a.date1<=b.date3<=a.date2)) as sum_z
from ttt1 as a left join ttt2 as b on a.id=b.id
group by a.id
having b.date3=max(b.date3);
quit;
@Ronein wrote:
Why is it needed to write the statement
having b.date3=max(b.date3);?
So you get only one output record from SQL for each row of the table ttt1
Hello,
I run your code but results is not correct.
Please compare results
data ttt1;
format date1 date2 date9.;
input ID date1 :date9. date2 :date9.;
cards;
1 01MAR2021 05MAR2021
2 08MAR2021 15MAR2021
1 27FEB2021 28FEB2021
;
Run;
data ttt2;
format date3 date9.;
input ID date3 :date9. Z;
cards;
1 28FEB2021 10
1 02MAR2021 20
1 04MAR2021 30
1 06MAR2021 10
2 08MAR2021 50
2 14MAR2021 70
2 18MAR2021 80
;
Run;
/**My solution*/
/*Cartesian product:Many to many merge*/
proc sql;
create table ttt3 as
select *,
case when Date1<=Date3<=Date2 then 1 else 0 end as Indicator,
calculated Indicator*Z as Z_new
from ttt1 a
left join ttt2 b
on a.ID=b.ID
order by ID,date1,date2
;
quit;
proc sql;
create table wanted as
select ID,date1,date2,
sum(Z_new) as Z_new
from ttt3
group by Id,Date1,Date2;
quit;
/**PaigeMiller solution*/
proc sql;
create table ttt3 as select
a.*,sum(b.z*(a.date1<=b.date3<=a.date2)) as sum_z
from ttt1 as a left join ttt2 as b on a.id=b.id
group by a.id
having b.date3=max(b.date3);
quit;
This is easily fixed. The problem with a Cartesian join is that if you have large data sets, you will need a huge amount of time to process the Cartesian join.
data ttt1;
format date1 date2 date9.;
input ID date1 :date9. date2 :date9.;
n=_n_;
cards;
1 01MAR2021 05MAR2021
2 08MAR2021 15MAR2021
1 27FEB2021 28FEB2021
;
Run;
data ttt2;
format date3 date9.;
input ID date3 :date9. Z;
cards;
1 28FEB2021 10
1 02MAR2021 20
1 04MAR2021 30
1 06MAR2021 10
2 08MAR2021 50
2 14MAR2021 70
2 18MAR2021 80
;
Run;
proc sql;
create table ttt3 as select
a.*,sum(b.z*(a.date1<=b.date3<=a.date2)) as sum_z
from ttt1 as a left join ttt2 as b on a.id=b.id
group by a.id,a.n
having b.date3=max(b.date3)
order by a.n;
quit;
Thanks
I would move the date condition into the ON of the LEFT JOIN:
proc sql;
create table want as
select
t1.id,
t1.date1,
t1.date2,
sum(t2.z) as sum_z
from ttt1 t1 left join ttt2 t2
on t1.id=t2.id and t1.date1 le t2.date3 le t1.date2
group by t1.id, t1.date1, t1.date2
;
quit;
Just for fun, with hash tables:
data ttt1;
format date1 date2 date9.;
input ID date1 :date9. date2 :date9.;
cards;
1 '01MAR2021'd '05MAR2021'd
2 '08MAR2021'd '15MAR2021'd
;
Run;
data ttt2;
input ID date3 :date9. Z;
cards;
1 '28FEB2021'd 10
1 '02MAR2021'd 20
1 '04MAR2021'd 30
1 '06MAR2021'd 10
2 '08MAR2021'd 50
2 '14MAR2021'd 70
2 '18MAR2021'd 80
;
Run;
data WANT;
if 0 then set ttt1 ttt2;
declare hash H(dataset: "ttt2");
H.defineKey("ID","date3");
H.defineData("Z");
H.defineDone();
do until(eof);
set ttt1 end=eof;
by ID;
do date3 = date1 to date2;
if H.find()=0 then sum + Z;
end;
if last.ID then
do;
output;
sum = 0;
end;
end;
drop date3 z;
stop;
run;
proc print;
run;
Bart
Update, for duplicated data in ttt1 and ttt2:
data ttt1;
format date1 date2 date9.;
input ID date1 :date9. date2 :date9.;
cards;
1 '06MAR2021'd '15MAR2021'd
1 '01MAR2021'd '05MAR2021'd
2 '08MAR2021'd '15MAR2021'd
;
Run;
data ttt2;
input ID date3 :date9. Z;
cards;
1 '28FEB2021'd 10
1 '02MAR2021'd 20
1 '04MAR2021'd 1
1 '04MAR2021'd 29
1 '06MAR2021'd 10
2 '08MAR2021'd 50
2 '14MAR2021'd 70
2 '18MAR2021'd 80
;
Run;
data WANT;
if 0 then set ttt1 ttt2;
declare hash H();
H.defineKey("ID","date3");
H.defineData("Z");
H.defineDone();
do until(end);
set ttt2(rename=(z=z2)) end=end;
if H.find() then z = z2;
else z + z2;
H.replace();
end;
do until(eof);
set ttt1 end=eof;
by ID;
do date3 = date1 to date2;
if H.find()=0 then sum + Z;
end;
if last.ID then
do;
output;
sum = 0;
end;
end;
drop date3 z:;
stop;
run;
proc print;
run;
Bart
I'm a great enthusiast for the hash object, but allow me to propagate a "conditional set" apprauch to accommodate multiple date ranges per id:
data ttt1;
format date1 date2 date9.;
input ID date1 :date9. date2 :date9.;
cards;
1 '01MAR2021'd '05MAR2021'd
1 '06MAR2021'd '15MAR2021'd
2 '08MAR2021'd '15MAR2021'd
;
Run;
data ttt2;
input ID date3 :date9. Z;
cards;
1 '28FEB2021'd 10
1 '02MAR2021'd 20
1 '04MAR2021'd 1
1 '04MAR2021'd 29
1 '06MAR2021'd 10
2 '08MAR2021'd 50
2 '14MAR2021'd 70
2 '18MAR2021'd 80
;
Run;
data want (drop=z date3);
set ttt1 (keep=id date1 rename=(date1=date3) in=start)
ttt2
ttt1 (rename=(date2=date3) in=end);
by id date3;
if start=1 then do;
set ttt1; /* Get DATE1 and DATE2, automatically retained*/
zsum=.;
end;
else if (date1<=date3<=date2) then zsum+z;
if end=1;
run;
This assumes that TTT1 is (1) sorted by ID/DATE1, and (2) has no intersecting date ranges.
The SET statement forces the DATE1 (renamed to DATE3) to appear in chronological order with all the DATE3 values in TTT2, followed by DATE2 (from TTT1 and renamed to DATE3). The conditional set statement (in the "if start" do group) reads in (unrenamed) DATE1 and DATE2. These variables will be retained until the next conditional set statement, (i.e. throughout all the qualifying records in TTT2), thereby enabling the "if (date1<=date3<=date2) ..." test for accumulating ZSUM.
Great code Mark! Beautiful use-case of interleaving.
In defence of hash approach, it requires only that ttt1 be sorted by ID, data ranges can overlap. And I just realised that hash aggregates on the ID level, not ID/date (but that's easy to fix by doing:
/*if last.ID then*/
).
Bart
If TTT1 and TTT2 are both sorted by ID, and TTT1 has one record per id, then there is a straightforward DATA step solution:
data want (drop=z date3);
merge ttt2 ttt1;
by id;
if first.id then zsum=.;
if (date1<=date3<=date2) then zsum+z;
if last.id;
run;
data ttt1; format date1 date2 date9.; input ID date1 :date9. date2 :date9.; format date1 date2 date9.; cards; 1 '01MAR2021'd '05MAR2021'd 2 '08MAR2021'd '15MAR2021'd ; Run; data ttt2; input ID date3 :date9. Z; format date3 date9.; cards; 1 '28FEB2021'd 10 1 '02MAR2021'd 20 1 '04MAR2021'd 30 1 '06MAR2021'd 10 2 '08MAR2021'd 50 2 '14MAR2021'd 70 2 '18MAR2021'd 80 ; Run; proc summary data=ttt2 nway; class id date3; var z; output out=sum(drop=_:) sum=; run; data want; if _n_=1 then do; if 0 then set sum; declare hash h(dataset:'sum',hashexp:20); h.definekey('id','date3'); h.definedata('z'); h.definedone(); end; set ttt1; sum=0; do temp=date1 to date2; if h.find(key:id,key:temp)=0 then sum+z; end; drop temp z date3; run;
Good news: We've extended SAS Hackathon registration until Sept. 12, so you still have time to be part of our biggest event yet – our five-year anniversary!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.