DATA Step, Macro, Functions and more

Joining two datasets based on relative dates

Reply
Frequent Contributor
Posts: 138

Joining two datasets based on relative dates

Hi all,

 

I have two datasets. One is at the person level (one row per ID) and has a date var. The other has multiple rows per person and has a date var and a categorical var, each of which can be different on each row: 

 

Dataset 1

ID  date_var1

1    2/5/2013

2    9/13/2014

3    6/4/2012

 

Dataset 2

ID  date_var2  categ_var2

1    1/4/2013        abc

1    1/10/2013      def

1     2/1/2014       ghi

3    8/17/2012      abc

3    9/12/2013      jkl

 

What I want is to join the two datasets such that:

 

1. If someone from Dataset 1 (like ID 2) is not found in Dataset 2, they remain and all their Dataset 1 vars stay the same.

2. If someone from Dataset 1 is in Dataset 2 and at least one of their date_var2s is prior to date_var1, they join to the row in Dataset2 with the most recent date prior to date_var1 (so for ID1, this would be the second row with date_var2=1/10/2013 and categ_var2=def

3. If someone from Dataset 1 is in Dataset 2 but none of their date_var2s is prior to their date_var1 (like ID=3), they should remain in the resulting dataset with all their variables from Dataset 1 the same and the variables from Dataset 2 set to missing.

 

Any help is much appreciated.

 

Frequent Contributor
Posts: 130

Re: Joining two datasets based on relative dates

Posted in reply to Walternate

Here's one way to do it.  I'm sure there may be a way to create only 1 data set using sub queries and such, but this should get you what you're looking for.

data dataset1;
input ID$ date_var1;
informat ID $2. date_var1 mmddyy10.;
format ID $2. date_var1 mmddyy10.;
datalines;
1 2/5/2013
2 9/13/2014
3 6/4/2012
;
run;

data dataset2;
input ID$ date_var2 categ_var2$;
informat ID $2. date_var2 mmddyy10. categ_var2 $3.;
format ID $2. date_var2 mmddyy10. categ_var2 $3.;
datalines;
1 1/4/2013 abc
1 1/10/2013 def
1 2/1/2014 ghi
3 8/17/2012 abc
3 9/12/2013 jkl
;
run;

proc sql;
create table date as
select A.ID, max(A.date_var2) as date_var2 format=mmddyy10.
from dataset2 A, dataset1 B
where (A.ID=B.ID AND A.date_var2 < B.date_var1)
group by A.ID;

create table new_dataset2 as
select A.*
from dataset2 A, date B
where (A.ID=B.ID AND A.date_var2=B.date_var2);

create table want as
select A.*,
	   B.date_var2,
	   B.categ_var2
from dataset1 A left join new_dataset2 B
on (A.ID=B.ID);
quit;
Regular Contributor
Regular Contributor
Posts: 156

Re: Joining two datasets based on relative dates

This is what I understood in small steps:

 

/* 0.) Create datasets */

data work.dataset1;
input ID date_var1 mmddyy10.;
datalines;
1 02/05/2013
2 09/13/2014
3 06/04/2012
;
run;

data work.dataset2;
input ID date_var2 mmddyy10. categ_var2 $ ;
datalines;
1 01/04/2013 abc
1 01/10/2013 def
1 02/01/2014 ghi
3 08/17/2012 abc
3 09/12/2013 jkl
;
run;

/************************************************/

/* 1.) Condition one */

proc sql;
	create table work.cond_one as (
	select * from work.dataset1 as a
	left outer join work.dataset2 as b
	on a.ID = b.ID
	);
quit;

/************************************************/

/* 2.) Condition two */

proc sql;
	create table work.cond_two_a as (
	select * from work.dataset1 as a
	inner join work.dataset2 as b
	on a.ID = b.ID
	);
quit;

data work.cond_two_b;
	set work.cond_two_a;
	length relation $14.;
	if date_var2 < date_var1 then relation = 'smaller';
	else relation = 'biggerorequal';
run;

proc sql;
	create table work.cond_two_c as (
		select * from work.cond_two_b as b
		where 'smaller' in (
		select distinct relation from work.cond_two_b as c
		where b.id = c.id
		)
		and b.date_var2 < b.date_var1
	);
quit;

proc sql;
	create table work.cond_two as (
		select id, date_var1, date_var2, categ_var2 from work.cond_two_c 
		group by id
		having date_var2 = max(date_var2)
	);
quit;

/************************************************/

/* 3.) Condition three */

proc sql;
	create table work.cond_three_c as (
		select * from work.cond_two_b as b
		where 'smaller' not in (
		select distinct relation from work.cond_two_b as c
		where b.id = c.id
		)
	);
quit;

data work.cond_three;
	set work.cond_three_c (drop=relation);
	date_var2  = .;
	categ_var2 = "";
run;

/************************************************/

/* 4.) combine all */
/* if a union is what you mean */

data work.alltogethernow;
	set work.cond_one
	work.cond_two
	work.cond_three;
run;
Ask a Question
Discussion stats
  • 2 replies
  • 197 views
  • 0 likes
  • 3 in conversation