Solved: Re: Identifying dates for treatment

stellapersis7 · Posted 05-30-2024 11:30 AM

https://communities.sas.com/t5/SAS-Programming/Overlapping-drugs/m-p/930264#M365992

My query is in continuation of the above question which I posted few weeks back, I was unable to continue the same topic as it was closed. How can I know the difference between 'SGA_START' and 'start'? I am trying to merge the dataset 'want' with 'SGA2', how ever I cannot identifying the difference/no. of days between SGA_START and start dates. Can someone please help me with the code.

Thank you

mkeintz · Posted 06-01-2024 10:51 AM

If there are no overlaps WITHIN sga or WITHIN dm, and both are sorted by ID sga_start/dm_start, then there is a single step solution using conditional SET statements, as in:

data sga;
input id	(sga_start	sga_end) (:mmddyy10.);
format sga_start	sga_end mmddyy10. ;
cards;
1	1/1/17	7/1/17	 	 
1	10/1/17	12/1/17	 	 
2	2/1/17	4/1/17	 	 
2	9/1/17	11/1/17	
;
data dm;
input id	(dm_start	dm_end) (:mmddyy10.);
format dm_start	dm_end mmddyy10. ;
cards;
1	4/1/17	5/1/17	 	 
1	6/1/17	12/1/17	 	 
2	3/1/17	4/1/17	 	 
2	9/1/17	12/1/17
;


data want (drop=_:);
  merge sga (keep=id sga_start rename=(sga_start=_date) in=insga)
        dm  (keep=id dm_start  rename=(dm_start=_date)  in=indm) ;
  by id _date;


  if insga=1 then set sga ;
  if indm=1 then set dm ;

  if (sga_start <= dm_start <= sga_end)  or
     (sga_start <= dm_end   <= sga_end) then do;
    overlap+1;
    overlap_start=max(dm_start,sga_start);
    overlap_end=min(sga_end,dm_end);
    overlap_length=1 + overlap_end - overlap_start;
    pre_overlap_duration=overlap_start-sga_start;
    format overlap_start overlap_end mmddyy10. ;
    output;
  end;
  if last.id then call missing(of _all_);
run;

If you want to know more about the underlying logic of using conditional SET statements, please see my recent presentation at VIDEOS | BASUG.

Edited note: the code above assumes that the [sga_start,sga_end] interval is NEVER within the [dm_start,dm_end] interval, since that is how the sample data was presented. But if it is possible that dm_start <= sga_start <= sga_end <= dm_end, then change

  if (sga_start <= dm_start <= sga_end)  or
     (sga_start <= dm_end   <= sga_end) then do;

to

  if max(sga_start,dm_start) <= min(sga_end,dm_end) then do;

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

View solution in original post

ballardw · Posted 05-30-2024 11:43 AM

Marked as having a solution does not "close" a thread on this forum. Though when a question is different enough than a new thread referencing the old one is appropriate.

However that likely means providing a different set of example data sets to discuss. So now you should provide something providing examples of the two (?) sets you expect to merge. You should not expect us to repeatedly rerun code in another thread to get something used in this one. Plus, I am not sure which sets would be needed.

Ksharp · Posted 05-30-2024 09:34 PM

Could you post the desired output based on your previous example.

It is hard to guess what you are looking for .

"difference between 'SGA_START' and 'start'?"

You mean then number of days between SGA_START and DM_START ?

data sga;
infile cards expandtabs;
input id	(sga_start	sga_end) (:mmddyy10.);
format sga_start	sga_end mmddyy10. ;
cards;
1	1/1/17	7/1/17	 	 
1	10/1/17	12/1/17	 	 
2	2/1/17	4/1/17	 	 
2	9/1/17	11/1/17	
;
data dm;
infile cards expandtabs;
input id	(dm_start	dm_end) (:mmddyy10.);
format dm_start	dm_end mmddyy10. ;
cards;
1	4/1/17	5/1/17	 	 
1	6/1/17	12/1/17	 	 
2	3/1/17	4/1/17	 	 
2	9/1/17	12/1/17
;


data SGA2;
 set sga;
 label='SGA';
 do date=SGA_START to SGA_END;
  output;
 end;
 keep  id date label;
 format date date9.;
run;
data DM2;
 set dm;
 label='DM ';
 do date=DM_START to DM_END;
  output;
 end;
 keep id date label;
 format date date9.;
run;
data temp;
 set SGA2 DM2;
run;
proc sort  data=temp nodupkey;
by id  date label;
run;
data temp2;
do until(last.date);
 set temp;
 by id date;
 length tag $ 40;
 tag=catx('|',tag,label);
end;
drop label;
run;
data temp3;
 set temp2;
 by id tag notsorted;
 if first.tag or dif(date) ne 1 then group+1;
run;
proc sql;
create table want as
select id,group,tag,scan(tag,-1,'|') as flag,min(date) as start format=mmddyy10.,max(date) as end format=mmddyy10.,count(*) as days
 from temp3
  group by id,group,tag
  ;
quit;
data want2;
 set want;
 by id flag notsorted;
 retain sga_start;
 if first.flag then sga_start=ifn(flag='SGA',start,.);
 difference=start-sga_start;  *the variable you want;
 if findc(tag,'|');
 format sga_start mmddyy10.;
 drop flag;
run;

stellapersis7 · Posted 05-31-2024 08:11 PM

So, I want the number of days between SGA_start and the start of overlap period for those patients who had overlap of SGA and DM.

data I have
id	sga_start	sga_end
1	1/1/17	7/1/17
1	10/1/17	12/1/17
2	2/1/17	4/1/17
2	9/1/17	11/1/17

id	dm_start	dm_end
1	4/1/17	5/1/17
1	6/1/17	12/1/17
2	3/1/17	4/1/17
2	9/1/17	12/1/17
Overlap of SGA and DM				data I want
id		start	end	duration between sga_start and start of overlap
1	overlap1	4/1/17	5/1/17	92 days
	overlap2	6/1/17	7/1/17	153 days
	overlap3	10/1/17	12/1/17	0 days
2	overlap1	3/1/17	4/1/17	28 days
	overlap2	9/1/17	11/1/17	0 days

I am posting the data sets again for convenience. ( I used the same code you posted before to get the overlap periods)

data WORK.SGA;

infile datalines dsd truncover;

input ENROLID:32. PERIOD:32. SGA_START:MMDDYY10. SGA_END:MMDDYY10.;

format SGA_START MMDDYY10. SGA_END MMDDYY10.;

label ENROLID="Enrollee ID";

datalines;

27335102 1 04/25/2017 06/21/2017

27335102 2 09/07/2017 12/12/2017

27681801 1 01/04/2017 11/21/2018

27681801 2 02/13/2019 08/28/2019

27681801 3 10/30/2019 12/30/2019

;;;;

data WORK.DM;

infile datalines dsd truncover;

input ENROLID:32. PERIOD:32. GLP_START:MMDDYY10. GLP_END:MMDDYY10.;

format GLP_START MMDDYY10. GLP_END MMDDYY10.;

label ENROLID="Enrollee ID";

datalines;

27335102 1 06/08/2017 06/08/2017

27335102 2 08/12/2017 08/12/2017

27335102 3 10/14/2017 10/14/2017

27335102 4 12/31/2017 12/31/2017

27681801 1 10/31/2017 10/31/2017

;;;;

data WORK.OVERLAP;

infile datalines dsd truncover;

input ENROLID:32. group:32. tag:$40. start:MMDDYY10. end:MMDDYY10. days:32.;

format start MMDDYY10. end MMDDYY10.;

label ENROLID="Enrollee ID";

datalines;

27335102 2 GLP|SGA 06/08/2017 06/08/2017 1

27335102 6 GLP|SGA 10/14/2017 10/14/2017 1

27681801 10 GLP|SGA 10/31/2017 10/31/2017 1

28593801 15 GLP|SGA 11/02/2018 11/29/2018 28

29791002 20 GLP|SGA 05/04/2017 05/04/2017 1

;;;;

Thank you.

Ksharp · Posted 06-01-2024 01:42 AM

So did you check my code ?

stellapersis7 · Posted 06-01-2024 12:55 PM

I did your previous code and got the dataset overlap. I merged back with dataset- SGA to get SGA_Start, and did (duration = start-SGA_Start). However, there are multiple durations for each 'start' because of multiple SGA_Start. For example, please check the screenshot below. for observations, 31-36, there is a same start and end dates which calculated duration against many SGA_Start dates. However, I want the difference with the nearest SGA_start date only. So for the ones between 31-36, I will need to capture 34 only where the duration is captured to the nearest SGA_Start date.

Ksharp · Posted 06-01-2024 08:36 PM

So you did not use the code I posted and use MERGE statement by yourself ?
Try running my code and open dataset WANT2 and check variable DIFFERENCE .
use the data in my code and explain where is wrong .

mkeintz · Posted 06-01-2024 10:51 AM

If there are no overlaps WITHIN sga or WITHIN dm, and both are sorted by ID sga_start/dm_start, then there is a single step solution using conditional SET statements, as in:

data sga;
input id	(sga_start	sga_end) (:mmddyy10.);
format sga_start	sga_end mmddyy10. ;
cards;
1	1/1/17	7/1/17	 	 
1	10/1/17	12/1/17	 	 
2	2/1/17	4/1/17	 	 
2	9/1/17	11/1/17	
;
data dm;
input id	(dm_start	dm_end) (:mmddyy10.);
format dm_start	dm_end mmddyy10. ;
cards;
1	4/1/17	5/1/17	 	 
1	6/1/17	12/1/17	 	 
2	3/1/17	4/1/17	 	 
2	9/1/17	12/1/17
;


data want (drop=_:);
  merge sga (keep=id sga_start rename=(sga_start=_date) in=insga)
        dm  (keep=id dm_start  rename=(dm_start=_date)  in=indm) ;
  by id _date;


  if insga=1 then set sga ;
  if indm=1 then set dm ;

  if (sga_start <= dm_start <= sga_end)  or
     (sga_start <= dm_end   <= sga_end) then do;
    overlap+1;
    overlap_start=max(dm_start,sga_start);
    overlap_end=min(sga_end,dm_end);
    overlap_length=1 + overlap_end - overlap_start;
    pre_overlap_duration=overlap_start-sga_start;
    format overlap_start overlap_end mmddyy10. ;
    output;
  end;
  if last.id then call missing(of _all_);
run;

If you want to know more about the underlying logic of using conditional SET statements, please see my recent presentation at VIDEOS | BASUG.

Edited note: the code above assumes that the [sga_start,sga_end] interval is NEVER within the [dm_start,dm_end] interval, since that is how the sample data was presented. But if it is possible that dm_start <= sga_start <= sga_end <= dm_end, then change

  if (sga_start <= dm_start <= sga_end)  or
     (sga_start <= dm_end   <= sga_end) then do;

to

  if max(sga_start,dm_start) <= min(sga_end,dm_end) then do;

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

SAS Training: Just a Click Away