Hello SAS experts,
I have a dataset and would like to identify an individual's true disease type based on the following:
Below is the dataset I have and the expected output.
data WORK.SUBTYPE_SAMPLE;
infile datalines dsd truncover;
input ID:BEST12. Type:$12. Reference_date:DATE9. service_start:DATE9. service_end:DATE9.;
format ID BEST12. Reference_date DATE9. service_start DATE9. service_end DATE9.;
datalines;
1 A 04JAN2016 10JAN2016 21JAN2016
1 B 04JAN2016 09JUL2018 09NOV2019
1 Unspecified 04JAN2016 06JAN2016 10FEB2016
2 B 08JUN2019 08DEC2019 19DEC2019
2 Unspecified 08JUN2019 22OCT2019 09AUG2019
3 Unspecified 02FEB2017 02APR2017 15APR2017
4 A 01JAN2020 03MAR2020 24MAR2020
4 A 01JAN2020 05MAY2018 10MAY2018
4 Unnspecified 01JAN2020 02JAN2020 03JAN2020
5 A 09SEP2016 11NOV2016 15NOV2016
5 B 09SEP2016 09SEP2016 10NOV2016
6 A 03MAR2016 30AUG2016 02NOV2016
6 A 03MAR2016 14OCT2016 19OCT2016
6 A 03MAR2016 26MAR2016 19DEC2016
6 Unspecified 03MAR2016 20OCT2016 21OCT2016
6 Unspecified 03MAR2016 12DEC2016 28DEC2016
6 B 03MAR2016 28JUN2016 15AUG2016
7 B 10OCT2022 11OCT2022 14NOV2022
8 Unspecified 01JAN2019 05MAY2019 06MAY2019
8 Unspecified 01JAN2019 07MAY2019 08MAY2019
;;;;
Want:
Id | true_type |
1 | A |
2 | B |
3 | Unspecified |
4 | A |
5 | B |
6 | A |
7 | B |
8 | Unspecified |
Thank you
The first part is just a slight re-work of your input dataset - it was producing all sorts of errors trying to read as it was:
data WORK.SUBTYPE_SAMPLE;
infile cards dsd truncover firstobs=1 dlm=',';
length ID $12 type $12 reference_date service_start service_end 4;
informat reference_date service_start service_end date9.;
format Reference_date DATE9. service_start DATE9. service_end DATE9.;
input ID Type Reference_date service_start service_end;
cards;
1,A,04JAN2016,10JAN2016,21JAN2016
1,B,04JAN2016,09JUL2018,09NOV2019
1,Unspecified,04JAN2016,06JAN2016,10FEB2016
2,B,08JUN2019,08DEC2019,19DEC2019
2,Unspecified,08JUN2019,22OCT2019,09AUG2019
3,Unspecified,02FEB2017,02APR2017,15APR2017
4,A,01JAN2020,03MAR2020,24MAR2020
4,A,01JAN2020,05MAY2018,10MAY2018
4,Unnspecified,01JAN2020,02JAN2020,03JAN2020
5,A,09SEP2016,11NOV2016,15NOV2016
5,B,09SEP2016,09SEP2016,10NOV2016
6,A,03MAR2016,30AUG2016,02NOV2016
6,A,03MAR2016,14OCT2016,19OCT2016
6,A,03MAR2016,26MAR2016,19DEC2016
6,Unspecified,03MAR2016,20OCT2016,21OCT2016
6,Unspecified,03MAR2016,12DEC2016,28DEC2016
6,B,03MAR2016,28JUN2016,15AUG2016
7,B,10OCT2022,11OCT2022,14NOV2022
8,Unspecified,01JAN2019,05MAY2019,06MAY2019
8,Unspecified,01JAN2019,07MAY2019,08MAY2019
;
run;
proc sort data=subtype_sample; by id; run;
data want;
set subtype_sample;
by ID;
length true_type $12 closest 4 anyAB 3;
retain true_type closest anyAB;
if first.ID then do;
closest=10000;
true_type='';
anyAB=0;
end;
dist=min(
abs(service_start-reference_date),
abs(service_end-reference_end)
);
if type in ('A', 'B') then do;
anyAB=1;
if dist<closest then do;
true_type=type;
closest=dist;
end;
end;
else if anyAB=0 then true_type='Unspecified';
if last.ID then output;
keep ID true_type closest;
run;
proc print data=want; run;
The first part is just a slight re-work of your input dataset - it was producing all sorts of errors trying to read as it was:
data WORK.SUBTYPE_SAMPLE;
infile cards dsd truncover firstobs=1 dlm=',';
length ID $12 type $12 reference_date service_start service_end 4;
informat reference_date service_start service_end date9.;
format Reference_date DATE9. service_start DATE9. service_end DATE9.;
input ID Type Reference_date service_start service_end;
cards;
1,A,04JAN2016,10JAN2016,21JAN2016
1,B,04JAN2016,09JUL2018,09NOV2019
1,Unspecified,04JAN2016,06JAN2016,10FEB2016
2,B,08JUN2019,08DEC2019,19DEC2019
2,Unspecified,08JUN2019,22OCT2019,09AUG2019
3,Unspecified,02FEB2017,02APR2017,15APR2017
4,A,01JAN2020,03MAR2020,24MAR2020
4,A,01JAN2020,05MAY2018,10MAY2018
4,Unnspecified,01JAN2020,02JAN2020,03JAN2020
5,A,09SEP2016,11NOV2016,15NOV2016
5,B,09SEP2016,09SEP2016,10NOV2016
6,A,03MAR2016,30AUG2016,02NOV2016
6,A,03MAR2016,14OCT2016,19OCT2016
6,A,03MAR2016,26MAR2016,19DEC2016
6,Unspecified,03MAR2016,20OCT2016,21OCT2016
6,Unspecified,03MAR2016,12DEC2016,28DEC2016
6,B,03MAR2016,28JUN2016,15AUG2016
7,B,10OCT2022,11OCT2022,14NOV2022
8,Unspecified,01JAN2019,05MAY2019,06MAY2019
8,Unspecified,01JAN2019,07MAY2019,08MAY2019
;
run;
proc sort data=subtype_sample; by id; run;
data want;
set subtype_sample;
by ID;
length true_type $12 closest 4 anyAB 3;
retain true_type closest anyAB;
if first.ID then do;
closest=10000;
true_type='';
anyAB=0;
end;
dist=min(
abs(service_start-reference_date),
abs(service_end-reference_end)
);
if type in ('A', 'B') then do;
anyAB=1;
if dist<closest then do;
true_type=type;
closest=dist;
end;
end;
else if anyAB=0 then true_type='Unspecified';
if last.ID then output;
keep ID true_type closest;
run;
proc print data=want; run;
Thank you!!
data WORK.SUBTYPE_SAMPLE;
infile cards dsd truncover firstobs=1 dlm=',';
length ID $12 type $12 reference_date service_start service_end 4;
informat reference_date service_start service_end date9.;
format Reference_date DATE9. service_start DATE9. service_end DATE9.;
input ID Type Reference_date service_start service_end;
cards;
1,A,04JAN2016,10JAN2016,21JAN2016,
1,B,04JAN2016,09JUL2018,09NOV2019,
1,Unspecified,04JAN2016,06JAN2016,10FEB2016
2,B,08JUN2019,08DEC2019,19DEC2019,
2,Unspecified,08JUN2019,22OCT2019,09AUG2019
3,Unspecified,02FEB2017,02APR2017,15APR2017
4,A,01JAN2020,03MAR2020,24MAR2020,
4,A,01JAN2020,05MAY2018,10MAY2018,
4,Unnspecified,01JAN2020,02JAN2020,03JAN2020
5,A,09SEP2016,11NOV2016,15NOV2016,
5,B,09SEP2016,09SEP2016,10NOV2016,
6,A,03MAR2016,30AUG2016,02NOV2016,
6,A,03MAR2016,14OCT2016,19OCT2016,
6,A,03MAR2016,26MAR2016,19DEC2016,
6,Unspecified,03MAR2016,20OCT2016,21OCT2016
6,Unspecified,03MAR2016,12DEC2016,28DEC2016
6,B,03MAR2016,28JUN2016,15AUG2016,
7,B,10OCT2022,11OCT2022,14NOV2022,
8,Unspecified,01JAN2019,05MAY2019,06MAY2019
8,Unspecified,01JAN2019,07MAY2019,08MAY2019
;
run;
proc sql;
create table want as
select distinct ID,type
from SUBTYPE_SAMPLE
where type in ('A' 'B')
group by ID
having abs(reference_date-service_start)=min(abs(reference_date-service_start))
union
select distinct ID,type
from (select * from SUBTYPE_SAMPLE group by ID having sum(type='Unspecified')=count(*))
group by ID
having abs(reference_date-service_start)=min(abs(reference_date-service_start))
order by ID
;
quit;
Given that the data are already sorted by ID, then you can read each ID group with all the 'A' and 'B' types preceding all 'Unspecified' types, via
data want;
set have (where=(type in ('A','B')))
have (where=(type ='Unspecified')) ;
...
;
Then:
data want;
set have (where=(type in ('A','B')))
have (where=(type ='Unspecified')) ;
by id;
retain min_dist . true_type ' ' ;
_dist=min(abs(service_start-reference_date)
,abs(service_end-reference_date));
if first.id=1 then do;
min_dist=_dist;
true_type=type;
end;
else if type in ('A','B') and _dist<min_dist then do;
true_type=type;
min_dist=_dist;
end;
if last.id;
keep id true_type ;
run;
It's finally time to hack! Remember to visit the SAS Hacker's Hub regularly for news and updates.
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.