My test results, which are in line with my expectations:
data
WORK.DIAG
WORK.ENC (keep=ID)
;
length PAYLOAD $200;
PAYLOAD = repeat("X",199);
do ID=1 to 1e7;
output WORK.DIAG;
if mod(ID,5) = 0 then output WORK.ENC;
end;
run; /* 5.87 seconds */
proc sql;
create table WORK.DIAG_NEW1 as
select
ID,
PAYLOAD,
case
when ID in (select ID from WORK.ENC)
then ID
else .
end as ENC_ID
from WORK.DIAG;
quit; /* in (select ...) 27.59 seconds */
proc sql _method;
create table WORK.DIAG_NEW2 as
select
d.ID,
d.PAYLOAD,
E.ID as ID_ENC
from WORK.DIAG as d
left join WORK.ENC as e
on D.ID = E.ID;
quit; /* left join 24:05 */
data WORK.DIAG_NEW3;
set WORK.DIAG;
if _n_ = 1 then do;
declare hash h (dataset:"WORK.ENC",hashexp:7);
h.definekey("ID");
h.definedone();
end;
if h.check() = 0 then ENC_ID = ID;
run; /* hash 15.08 seconds */
data WORK.DIAG_NEW4;
merge
WORK.DIAG (in=D)
WORK.ENC (in=E)
;
by ID;
if D;
if E then ENC_ID = ID;
run; /* merge by 10.41 seconds */
data
WORK.DIAG(sortedby=ID)
WORK.ENC (sortedby=ID keep=ID)
;
length PAYLOAD $200;
PAYLOAD = repeat("X",199);
do ID=1 to 1e7;
output WORK.DIAG;
if mod(ID,5) = 0 then output WORK.ENC;
end;
run; /* 5.24 seconds */
proc sql _method;
create table WORK.DIAG_NEW2 as
select
d.ID,
d.PAYLOAD,
e.ID as ID_ENC
from WORK.DIAG as d
left join WORK.ENC as e
on D.ID = E.ID;
quit; /* left join sorted 12.24 seconds */
The worst time is: in (select...), which is very inefficient as expected (it's basically a Cartesian product).
Then a SQL join with bad metadata (proc sql doesn't know the data is already sorted so sorts both tables again).
Then hash.
Then the SQL join without sort
Then the data step merge, which leverages the sorted data the best.
Note: WORK library uses the BASE engine.
When using SPDE, the data step takes longer than proc sql.
... View more