BookmarkSubscribeRSS Feed
Walternate
Obsidian | Level 7

Hi all,

 

I have two datasets. One is at the person level (one row per ID) and has a date var. The other has multiple rows per person and has a date var and a categorical var, each of which can be different on each row: 

 

Dataset 1

ID  date_var1

1    2/5/2013

2    9/13/2014

3    6/4/2012

 

Dataset 2

ID  date_var2  categ_var2

1    1/4/2013        abc

1    1/10/2013      def

1     2/1/2014       ghi

3    8/17/2012      abc

3    9/12/2013      jkl

 

What I want is to join the two datasets such that:

 

1. If someone from Dataset 1 (like ID 2) is not found in Dataset 2, they remain and all their Dataset 1 vars stay the same.

2. If someone from Dataset 1 is in Dataset 2 and at least one of their date_var2s is prior to date_var1, they join to the row in Dataset2 with the most recent date prior to date_var1 (so for ID1, this would be the second row with date_var2=1/10/2013 and categ_var2=def

3. If someone from Dataset 1 is in Dataset 2 but none of their date_var2s is prior to their date_var1 (like ID=3), they should remain in the resulting dataset with all their variables from Dataset 1 the same and the variables from Dataset 2 set to missing.

 

Any help is much appreciated.

 

2 REPLIES 2
dcruik
Lapis Lazuli | Level 10

Here's one way to do it.  I'm sure there may be a way to create only 1 data set using sub queries and such, but this should get you what you're looking for.

data dataset1;
input ID$ date_var1;
informat ID $2. date_var1 mmddyy10.;
format ID $2. date_var1 mmddyy10.;
datalines;
1 2/5/2013
2 9/13/2014
3 6/4/2012
;
run;

data dataset2;
input ID$ date_var2 categ_var2$;
informat ID $2. date_var2 mmddyy10. categ_var2 $3.;
format ID $2. date_var2 mmddyy10. categ_var2 $3.;
datalines;
1 1/4/2013 abc
1 1/10/2013 def
1 2/1/2014 ghi
3 8/17/2012 abc
3 9/12/2013 jkl
;
run;

proc sql;
create table date as
select A.ID, max(A.date_var2) as date_var2 format=mmddyy10.
from dataset2 A, dataset1 B
where (A.ID=B.ID AND A.date_var2 < B.date_var1)
group by A.ID;

create table new_dataset2 as
select A.*
from dataset2 A, date B
where (A.ID=B.ID AND A.date_var2=B.date_var2);

create table want as
select A.*,
	   B.date_var2,
	   B.categ_var2
from dataset1 A left join new_dataset2 B
on (A.ID=B.ID);
quit;
Eva
Quartz | Level 8 Eva
Quartz | Level 8

This is what I understood in small steps:

 

/* 0.) Create datasets */

data work.dataset1;
input ID date_var1 mmddyy10.;
datalines;
1 02/05/2013
2 09/13/2014
3 06/04/2012
;
run;

data work.dataset2;
input ID date_var2 mmddyy10. categ_var2 $ ;
datalines;
1 01/04/2013 abc
1 01/10/2013 def
1 02/01/2014 ghi
3 08/17/2012 abc
3 09/12/2013 jkl
;
run;

/************************************************/

/* 1.) Condition one */

proc sql;
	create table work.cond_one as (
	select * from work.dataset1 as a
	left outer join work.dataset2 as b
	on a.ID = b.ID
	);
quit;

/************************************************/

/* 2.) Condition two */

proc sql;
	create table work.cond_two_a as (
	select * from work.dataset1 as a
	inner join work.dataset2 as b
	on a.ID = b.ID
	);
quit;

data work.cond_two_b;
	set work.cond_two_a;
	length relation $14.;
	if date_var2 < date_var1 then relation = 'smaller';
	else relation = 'biggerorequal';
run;

proc sql;
	create table work.cond_two_c as (
		select * from work.cond_two_b as b
		where 'smaller' in (
		select distinct relation from work.cond_two_b as c
		where b.id = c.id
		)
		and b.date_var2 < b.date_var1
	);
quit;

proc sql;
	create table work.cond_two as (
		select id, date_var1, date_var2, categ_var2 from work.cond_two_c 
		group by id
		having date_var2 = max(date_var2)
	);
quit;

/************************************************/

/* 3.) Condition three */

proc sql;
	create table work.cond_three_c as (
		select * from work.cond_two_b as b
		where 'smaller' not in (
		select distinct relation from work.cond_two_b as c
		where b.id = c.id
		)
	);
quit;

data work.cond_three;
	set work.cond_three_c (drop=relation);
	date_var2  = .;
	categ_var2 = "";
run;

/************************************************/

/* 4.) combine all */
/* if a union is what you mean */

data work.alltogethernow;
	set work.cond_one
	work.cond_two
	work.cond_three;
run;

hackathon24-white-horiz.png

The 2025 SAS Hackathon has begun!

It's finally time to hack! Remember to visit the SAS Hacker's Hub regularly for news and updates.

Latest Updates

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

SAS Training: Just a Click Away

 Ready to level-up your skills? Choose your own adventure.

Browse our catalog!

Discussion stats
  • 2 replies
  • 1355 views
  • 0 likes
  • 3 in conversation