I have the following dataset:
DATA have;
INPUT Person_ID Total_ID Treat ps_score;
CARDS;
1004 1 0 0.01
1004 2 0 0.03
1004 3 1 0.043
1004 4 1 .
1004 5 1 0.02
1004 6 0 .
1007 7 0 .
1007 8 1 0.032
1007 9 1 0.77
1007 10 0 0.041
1009 11 0 0.045
1009 12 0 0.11
1009 13 0 0.82
1009 14 1 0.94
;
RUN;
I want to obtain the following dataset:
DATA want;
INPUT Person_ID Total_ID Treat ps_score matched_ID_1 matched_ID_2
;
CARDS;
1004 1 0 0.01 . .
1004 2 0 0.03 . .
1004 3 1 0.043 10 11
1004 4 1 . . .
1004 5 1 0.02 10 .
1004 6 0 . . .
1007 7 0 . . .
1007 8 1 0.032 2 .
1007 9 1 0.77 13 .
1007 10 0 0.041 . .
1009 11 0 0.045 . .
1009 12 0 0.11 . .
1009 13 0 0.82 . .
1009 14 1 0.94 10 .
;
RUN;
To achieve the desired outcome, I want to do the following:
Thank you for any assistance!
Hi @TrueTears While I am not sure and of course can't gauge what your hardware capacity is- like RAM, OS, disk space and other factors for solutions that may have higher dependency on the mentioned factors , for what it's worth I am offering a solution using HASH. The solution may or may not be fast as I haven't really tested extensively, however it adds to the menu offering more choices for you to play with.
Your total_id being unique across observations, makes it a very convenient look up operation. Therefore, if I understand your requirement thoroughly, the logic is fairly straight forward regardless of the approach be it SQL or Datastep. Secondly, the TREAT variable being binary, offers the convenience of splitting the observations into a base table and a look up table. Well, the rest is a breeze IMHO>
DATA have;
INPUT Person_ID Total_ID Treat ps_score;
CARDS;
1004 1 0 0.01
1004 2 0 0.03
1004 3 1 0.043
1004 4 1 .
1004 5 1 0.02
1004 6 0 .
1007 7 0 .
1007 8 1 0.032
1007 9 1 0.77
1007 10 0 0.041
1009 11 0 0.045
1009 12 0 0.11
1009 13 0 0.82
1009 14 1 0.94
;
RUN;
data temp;
if _n_=1 then do;
if 0 then set have have(rename=(person_id=_person_id Total_ID=Total_ID_ ps_score=_ps_score));
dcl hash h(dataset:'have(where=(treat=0 and _ps_score)
rename=(person_id=_person_id Total_ID=Total_ID_ ps_score=_ps_score)))');
h.defineKey('Total_ID_');
h.definedata('_Person_ID','_ps_score','Total_ID_');
h.defineDone();
dcl hiter hi('h');
end;
do until(z);
set have end=z;
array t(9999)_temporary_;
call missing(of t(*),__n,Total_ID_);
if not(treat and ps_score) then do;
output;
continue;
end;
_n_=constant('big');
do while(hi.next()=0);
if person_id=_person_id then continue;
_n=round(abs(_ps_score-ps_score),0.0001);
if _n<=_n_ then do;
if _n<_n_ then call missing(of t(*),__n);
_n_=_n;
__n=sum(__n,1);
t(__n)=Total_ID_;
end;
end;
do _n_=1 by 1 while(t(_n_));
_rc=h.find(key:t(_n_));
output;
end;
end;
drop _:;
run;
/*Transpose to wide to meet your expected output*/
proc transpose data=temp out=want(drop=_name_) prefix=matched_ID_;
by person_id--ps_score;
var Total_ID_;
run;
proc print noobs;run;
Person_ID | Total_ID | Treat | ps_score | matched_ID_1 | matched_ID_2 |
---|---|---|---|---|---|
1004 | 1 | 0 | 0.010 | . | . |
1004 | 2 | 0 | 0.030 | . | . |
1004 | 3 | 1 | 0.043 | 10 | 11 |
1004 | 4 | 1 | . | . | . |
1004 | 5 | 1 | 0.020 | 10 | . |
1004 | 6 | 0 | . | . | . |
1007 | 7 | 0 | . | . | . |
1007 | 8 | 1 | 0.032 | 2 | . |
1007 | 9 | 1 | 0.770 | 13 | . |
1007 | 10 | 0 | 0.041 | . | . |
1009 | 11 | 0 | 0.045 | . | . |
1009 | 12 | 0 | 0.110 | . | . |
1009 | 13 | 0 | 0.820 | . | . |
1009 | 14 | 1 | 0.940 | 10 | . |
DATA want; INPUT Person_ID Total_ID Treat ps_score ; CARDS; 1004 1 0 0.01 . . 1004 2 0 0.03 . . 1004 3 1 0.043 10 11 1004 4 1 . . . 1004 5 1 0.02 10 . 1004 6 0 . . . 1007 7 0 . . . 1007 8 1 0.032 2 . 1007 9 1 0.77 13 . 1007 10 0 0.041 . . 1009 11 0 0.045 . . 1009 12 0 0.11 . . 1009 13 0 0.82 . . 1009 14 1 0.94 10 . ; RUN; proc sql; create table temp as select a.*,b.total_id as id,round(abs(a.ps_score-b.ps_score),0.0001) as d from want as a,want as b where a.person_id ne b.person_id and b.treat=0 group by a.person_id,a.total_id having d=min(d); quit; proc transpose data=temp out=temp1(drop=_name_) prefix=matched_id_; by person_id total_id treat ps_score; var id; run; data want; set temp1; if treat=0 or missing(ps_score) then call missing(of matched_id_:); run;
Thanks @Ksharp! I was wondering if there is a method that is faster? The code seems to hang for a long time when I apply it on a large dataset (with around 90,000 observations). Thank you.
EDIT: If it helps, any observations with ps_score that is missing do not need to be shown and can be removed from the dataset since it is not used in the calculations.
Create a view that excludes the obs with missing values, and use that in the SQL.
Or create a temporary dataset for that. Also make sure that you don't "carry around" unneeded columns.
Hi @TrueTears While I am not sure and of course can't gauge what your hardware capacity is- like RAM, OS, disk space and other factors for solutions that may have higher dependency on the mentioned factors , for what it's worth I am offering a solution using HASH. The solution may or may not be fast as I haven't really tested extensively, however it adds to the menu offering more choices for you to play with.
Your total_id being unique across observations, makes it a very convenient look up operation. Therefore, if I understand your requirement thoroughly, the logic is fairly straight forward regardless of the approach be it SQL or Datastep. Secondly, the TREAT variable being binary, offers the convenience of splitting the observations into a base table and a look up table. Well, the rest is a breeze IMHO>
DATA have;
INPUT Person_ID Total_ID Treat ps_score;
CARDS;
1004 1 0 0.01
1004 2 0 0.03
1004 3 1 0.043
1004 4 1 .
1004 5 1 0.02
1004 6 0 .
1007 7 0 .
1007 8 1 0.032
1007 9 1 0.77
1007 10 0 0.041
1009 11 0 0.045
1009 12 0 0.11
1009 13 0 0.82
1009 14 1 0.94
;
RUN;
data temp;
if _n_=1 then do;
if 0 then set have have(rename=(person_id=_person_id Total_ID=Total_ID_ ps_score=_ps_score));
dcl hash h(dataset:'have(where=(treat=0 and _ps_score)
rename=(person_id=_person_id Total_ID=Total_ID_ ps_score=_ps_score)))');
h.defineKey('Total_ID_');
h.definedata('_Person_ID','_ps_score','Total_ID_');
h.defineDone();
dcl hiter hi('h');
end;
do until(z);
set have end=z;
array t(9999)_temporary_;
call missing(of t(*),__n,Total_ID_);
if not(treat and ps_score) then do;
output;
continue;
end;
_n_=constant('big');
do while(hi.next()=0);
if person_id=_person_id then continue;
_n=round(abs(_ps_score-ps_score),0.0001);
if _n<=_n_ then do;
if _n<_n_ then call missing(of t(*),__n);
_n_=_n;
__n=sum(__n,1);
t(__n)=Total_ID_;
end;
end;
do _n_=1 by 1 while(t(_n_));
_rc=h.find(key:t(_n_));
output;
end;
end;
drop _:;
run;
/*Transpose to wide to meet your expected output*/
proc transpose data=temp out=want(drop=_name_) prefix=matched_ID_;
by person_id--ps_score;
var Total_ID_;
run;
proc print noobs;run;
Person_ID | Total_ID | Treat | ps_score | matched_ID_1 | matched_ID_2 |
---|---|---|---|---|---|
1004 | 1 | 0 | 0.010 | . | . |
1004 | 2 | 0 | 0.030 | . | . |
1004 | 3 | 1 | 0.043 | 10 | 11 |
1004 | 4 | 1 | . | . | . |
1004 | 5 | 1 | 0.020 | 10 | . |
1004 | 6 | 0 | . | . | . |
1007 | 7 | 0 | . | . | . |
1007 | 8 | 1 | 0.032 | 2 | . |
1007 | 9 | 1 | 0.770 | 13 | . |
1007 | 10 | 0 | 0.041 | . | . |
1009 | 11 | 0 | 0.045 | . | . |
1009 | 12 | 0 | 0.110 | . | . |
1009 | 13 | 0 | 0.820 | . | . |
1009 | 14 | 1 | 0.940 | 10 | . |
Registration is now open for SAS Innovate 2025 , our biggest and most exciting global event of the year! Join us in Orlando, FL, May 6-9.
Sign up by Dec. 31 to get the 2024 rate of just $495.
Register now!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.