Hi all,
I have two datasets. One is at the person level (one row per ID) and has a date var. The other has multiple rows per person and has a date var and a categorical var, each of which can be different on each row:
Dataset 1
ID date_var1
1 2/5/2013
2 9/13/2014
3 6/4/2012
Dataset 2
ID date_var2 categ_var2
1 1/4/2013 abc
1 1/10/2013 def
1 2/1/2014 ghi
3 8/17/2012 abc
3 9/12/2013 jkl
What I want is to join the two datasets such that:
1. If someone from Dataset 1 (like ID 2) is not found in Dataset 2, they remain and all their Dataset 1 vars stay the same.
2. If someone from Dataset 1 is in Dataset 2 and at least one of their date_var2s is prior to date_var1, they join to the row in Dataset2 with the most recent date prior to date_var1 (so for ID1, this would be the second row with date_var2=1/10/2013 and categ_var2=def
3. If someone from Dataset 1 is in Dataset 2 but none of their date_var2s is prior to their date_var1 (like ID=3), they should remain in the resulting dataset with all their variables from Dataset 1 the same and the variables from Dataset 2 set to missing.
Any help is much appreciated.
Here's one way to do it. I'm sure there may be a way to create only 1 data set using sub queries and such, but this should get you what you're looking for.
data dataset1;
input ID$ date_var1;
informat ID $2. date_var1 mmddyy10.;
format ID $2. date_var1 mmddyy10.;
datalines;
1 2/5/2013
2 9/13/2014
3 6/4/2012
;
run;
data dataset2;
input ID$ date_var2 categ_var2$;
informat ID $2. date_var2 mmddyy10. categ_var2 $3.;
format ID $2. date_var2 mmddyy10. categ_var2 $3.;
datalines;
1 1/4/2013 abc
1 1/10/2013 def
1 2/1/2014 ghi
3 8/17/2012 abc
3 9/12/2013 jkl
;
run;
proc sql;
create table date as
select A.ID, max(A.date_var2) as date_var2 format=mmddyy10.
from dataset2 A, dataset1 B
where (A.ID=B.ID AND A.date_var2 < B.date_var1)
group by A.ID;
create table new_dataset2 as
select A.*
from dataset2 A, date B
where (A.ID=B.ID AND A.date_var2=B.date_var2);
create table want as
select A.*,
B.date_var2,
B.categ_var2
from dataset1 A left join new_dataset2 B
on (A.ID=B.ID);
quit;
This is what I understood in small steps:
/* 0.) Create datasets */
data work.dataset1;
input ID date_var1 mmddyy10.;
datalines;
1 02/05/2013
2 09/13/2014
3 06/04/2012
;
run;
data work.dataset2;
input ID date_var2 mmddyy10. categ_var2 $ ;
datalines;
1 01/04/2013 abc
1 01/10/2013 def
1 02/01/2014 ghi
3 08/17/2012 abc
3 09/12/2013 jkl
;
run;
/************************************************/
/* 1.) Condition one */
proc sql;
create table work.cond_one as (
select * from work.dataset1 as a
left outer join work.dataset2 as b
on a.ID = b.ID
);
quit;
/************************************************/
/* 2.) Condition two */
proc sql;
create table work.cond_two_a as (
select * from work.dataset1 as a
inner join work.dataset2 as b
on a.ID = b.ID
);
quit;
data work.cond_two_b;
set work.cond_two_a;
length relation $14.;
if date_var2 < date_var1 then relation = 'smaller';
else relation = 'biggerorequal';
run;
proc sql;
create table work.cond_two_c as (
select * from work.cond_two_b as b
where 'smaller' in (
select distinct relation from work.cond_two_b as c
where b.id = c.id
)
and b.date_var2 < b.date_var1
);
quit;
proc sql;
create table work.cond_two as (
select id, date_var1, date_var2, categ_var2 from work.cond_two_c
group by id
having date_var2 = max(date_var2)
);
quit;
/************************************************/
/* 3.) Condition three */
proc sql;
create table work.cond_three_c as (
select * from work.cond_two_b as b
where 'smaller' not in (
select distinct relation from work.cond_two_b as c
where b.id = c.id
)
);
quit;
data work.cond_three;
set work.cond_three_c (drop=relation);
date_var2 = .;
categ_var2 = "";
run;
/************************************************/
/* 4.) combine all */
/* if a union is what you mean */
data work.alltogethernow;
set work.cond_one
work.cond_two
work.cond_three;
run;
Join us for SAS Innovate April 16-19 at the Aria in Las Vegas. Bring the team and save big with our group pricing for a limited time only.
Pre-conference courses and tutorials are filling up fast and are always a sellout. Register today to reserve your seat.
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.