I'm trying to create data set WANT from HAVE based on the condition that there must be a match on at least 2keys. I have two approaches that I have tried. Thanks to some SAS information at www2.sas.com/proceedings/sugi27/p075-27.pdf The SQL approach finds the match but becomes a time consuming and resource intensive approach when dealing with large data set. Approach 2 does work but definitely a lengthy process and I find it a bit difficult to follow and replicate.
Question: 1) Anyone has a different (3rd) approach to solving this problem? OR
2) Could anyone provide an enhancement to approach 2 by adding a new variable country (‘US’ for all participants) to data set work.HAVE and generate a data set work.WANT with the condition that there’s a match on at least 2 keys. Note: The client_ID value should not change (I think). Thanks
HAVE
ADNUM SSN NAME DOB
101 856116534 ABCD 18760323
187 856176354 ABCD 18760323
233 856176354 ABCD 18670323
456 850176534 ABCD 18760323
490 856176534 ABEF 18670323
535 856176534 ABGF 18760323
601 856176534 ABEF 18760323
632 856176534 ABEE 18760323
879 856116534 ABEF 18760323
911 856176534 ABGF 18760323
919 123456789 JOHN 19000224
200 000089712 DAVE 20000212
WANT (ALLDATA8)
ADNUM SSN NAME DOB Client_id
101 856116534 ABCD 18760323 3
187 856176354 ABCD 18760323 3
233 856176354 ABCD 18670323 3
456 850176534 ABCD 18760323 3
490 856176534 ABEF 18670323 3
535 856176534 ABGF 18760323 3
601 856176534 ABEF 18760323 3
632 856176534 ABEE 18760323 3
879 856116534 ABEF 18760323 3
911 856176534 ABGF 18760323 3
919 123456789 JOHN 19000224 2
200 000089712 DAVE 20000212 1
/* SQL approach */
PROC SQL;
CREATE TABLE work.match AS
SELECT X.ADNUM AS ADNUM1,
Y.ADNUM AS ADNUM2
FROM work.have X, work.have Y
WHERE ( (X.SSN EQ Y.SSN)
+ (X.NAME EQ Y.NAME)
+ (X.DOB EQ Y.DOB) ) >= 2
AND (X.ADNUM < Y.ADNUM)
ORDER BY ADNUM1, ADNUM2;
QUIT;
/* Approach 2*/
DATA ALLDATA1 /*(KEEP = ADNUM NSID SDID NDID)*/;
SET work.have;/* imported data or created data set*/
LENGTH NSID $ 13 SDID $ 17 NDID $ 12
DOBC $ 8;
DOBC = TRIM(LEFT(COMPRESS(PUT(DOB,
BEST12.),'-')));
NSID = TRIM(NAME)||TRIM(SSN);
NDID = TRIM(NAME)||DOBC;
SDID = TRIM(SSN)||DOBC;
RUN;
%MACRO GENKEY(INDATA_ = , /* INPUT DATA */
OUTDATA_ = , /* OUTPUT DATA */
KEYID_ = /* BY VAR. */);
%LOCAL INDATA_ OUTDATA_ KEYID_;
PROC SORT DATA = &INDATA_;
BY &KEYID_.ID;
RUN;
DATA &OUTDATA_(DROP = &KEYID_.ID);
SET &INDATA_;
BY &KEYID_.ID;
IF FIRST.&KEYID_.ID THEN &KEYID_.KEY+1;
RUN;
%MEND GENKEY;
%GENKEY(INDATA_ = ALLDATA1,
OUTDATA_ = ALLDATA2,
KEYID_ = NS)
%GENKEY(INDATA_ = ALLDATA2,
OUTDATA_ = ALLDATA3,
KEYID_ = ND)
%GENKEY(INDATA_ = ALLDATA3,
OUTDATA_ = ALLDATA4,
KEYID_ = SD)
%MACRO GENFMTS(INDATA_ = , /* INPUT DATA */
FMTS_ = , /* FORMATS NAME */
KEY1_ = , /* WITHIN */
KEY2_ = /* TRANSLATE */);
%LOCAL INDATA_ FMTS_ KEY1_ KEY2_;
PROC SORT DATA = &INDATA_;
BY &KEY1_.KEY &KEY2_.KEY;
RUN;
DATA &FMTS_.0(KEEP = FMTNAME START LABEL
KEY);
SET &INDATA_;
BY &KEY1_.KEY;
RETAIN FMTNAME "&FMTS_" &KEY2_.KEYF;
IF FIRST.&KEY1_.KEY
THEN &KEY2_.KEYF = &KEY2_.KEY;
IF &KEY2_.KEY NE &KEY2_.KEYF THEN
DO;
START = &KEY2_.KEY;
LABEL = &KEY2_.KEYF;
KEY + 1;
OUTPUT;
END;
RUN;
PROC SORT DATA = &FMTS_.0 NODUPKEY;
BY START LABEL;
RUN;
%MEND GENFMTS;
%GENFMTS(INDATA_ = ALLDATA4,
FMTS_ = TRANSND,
KEY1_ = SD,
KEY2_ = ND) /* Tested to this level */
%MACRO TRANS(INDATA_ = ,
OUTDATA_ = ,
INIFMTS_ = ,
FINFMTS_ = ,
TRANS_ = );
%LOCAL
INDATA_ /* INPUT DATA SET */
OUTDATA_ /* OUTPUT DATA SET */
INIFMTS_ /* INITIAL FORMATS */
FINFMTS_ /* FINAL FORMATS */
TRANS_ /* **KEY TO BE TRANSLATED */
NEWOBS /* OBS. IN INITIAL FORMATS */
OLDOBS /* OBS. IN FINAL FORMATS */
LKOBS /* NUM OF LINK CASES TO BE
UPDATED IN EACH ITERATION */
REOBS /* NUM OF REPEAT CASES TO BE
UPDATED IN EACH ITERATION */
ITERS /* NUM OF ITERATIONS */
UP /* SERIES NUMBER FOR THE
TEMPORARY UPDATE DATA SET */
LK /* SERIES NUMBER FOR THE
TEMPORARY LINK DATA SET */
RE /* SERIES NUMBER FOR THE
TEMPORARY REPEAT DATA SET */
;
%LET UP = 1;
%LET LK = 1;
%LET RE = 1;
%LET ITERS = 0;
DATA _NULL_;
CALL SYMPUT('OLDOBS',
TRIM(LEFT(PUT(OLDOBS,10.))));
SET &INIFMTS_ NOBS = OLDOBS;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET &INIFMTS_ HAS &OLDOBS
OBSERVATION PAIRS.;
PROC SQL;
CREATE TABLE LINK&LK AS
SELECT Y.KEY AS KEY,
Y.START AS START,
X.LABEL AS LABEL
FROM &INIFMTS_ X, &INIFMTS_ Y
WHERE X.START EQ Y.LABEL
ORDER BY KEY, START, LABEL;
QUIT;
PROC SQL;
CREATE TABLE REPEAT&RE AS
SELECT Y.KEY AS KEY,
Y.LABEL AS START,
X.LABEL AS LABEL
FROM &INIFMTS_ X, &INIFMTS_ Y
WHERE (X.START EQ Y.START)
AND (Y.LABEL > X.LABEL)
ORDER BY KEY, START, LABEL;
QUIT;
DATA _NULL_;
CALL SYMPUT('LKOBS',
TRIM(LEFT(PUT(LINK,10.))));
SET LINK&LK NOBS = LINK;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET LINK&LK HAS &LKOBS
OBSERVATION PAIRS.;
DATA _NULL_;
CALL SYMPUT('REOBS',
TRIM(LEFT(PUT(REPEAT,10.))));
SET REPEAT&RE NOBS = REPEAT;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET REPEAT&RE HAS &REOBS
OBSERVATION PAIRS.;
%DO %WHILE ((&LKOBS NE 0)
OR (&REOBS NE 0));
PROC SORT DATA = &INIFMTS_;
BY KEY;
RUN;
DATA UPDT&UP;
UPDATE &INIFMTS_ LINK&LK;
BY KEY;
RUN;
DATA UPDT%EVAL(&UP+1);
UPDATE UPDT&UP REPEAT&RE;
BY KEY;
RUN;
PROC SORT DATA = UPDT%EVAL(&UP+1)
NODUPKEY;
BY START LABEL;
RUN;
PROC SQL;
CREATE TABLE LINK%EVAL(&LK+1) AS
SELECT Y.KEY AS KEY,
Y.START AS START,
X.LABEL AS LABEL
FROM UPDT%EVAL(&UP+1) X,
UPDT%EVAL(&UP+1) Y
WHERE X.START EQ Y.LABEL
ORDER BY KEY, START, LABEL;
QUIT;
PROC SQL;
CREATE TABLE REPEAT%EVAL(&RE+1) AS
SELECT Y.KEY AS KEY,
Y.LABEL AS START,
X.LABEL AS LABEL
FROM UPDT%EVAL(&UP+1) X,
UPDT%EVAL(&UP+1) Y
WHERE (X.START EQ Y.START)
AND (Y.LABEL > X.LABEL)
ORDER BY KEY, START, LABEL;
QUIT;
DATA _NULL_;
CALL SYMPUT('LKOBS',
TRIM(LEFT(PUT(LINK,10.))));
SET LINK%EVAL(&LK+1) NOBS=LINK;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET LINK%EVAL(&LK+1)
HAS &LKOBS OBSERVATION PAIRS.;
DATA _NULL_;
CALL SYMPUT('REOBS',
TRIM(LEFT(PUT(REPEAT,10.))));
SET REPEAT%EVAL(&RE+1) NOBS=REPEAT;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET REPEAT%EVAL(&RE+1)
HAS &REOBS OBSERVATION PAIRS.;
%LET ITERS = %EVAL(&ITERS+1);
%LET INIFMTS_ = UPDT%EVAL(&UP+1);
%LET UP = %EVAL(&UP+2);
%LET LK = %EVAL(&LK+1);
%LET RE = %EVAL(&RE+1);
%END; /* END OF %DO %WHILE LOOP */
DATA _NULL_;
CALL SYMPUT('NEWOBS',
TRIM(LEFT(PUT(NEWOBS,10.))));
SET &INIFMTS_ NOBS=NEWOBS;
STOP;
RUN;
%PUT NOTE: TRANS MADE &ITERS ITERATIONS.;
%PUT NOTE: %EVAL(&OLDOBS - &NEWOBS)
DUPLICATE MAPPINGS WERE REMOVED.;
OPTIONS NOTES;
%PUT NOTE: SORTING &NEWOBS OBSERVATIONS IN
THE UPDATES DATA INTO &FINFMTS_..;
PROC SORT DATA = &INIFMTS_ OUT = &FINFMTS_
NODUPKEY;
BY START LABEL;
RUN;
%IF &NEWOBS NE 0 %THEN
%DO;
PROC FORMAT CNTLIN = &FINFMTS_;
DATA &OUTDATA_;
SET &INDATA_;
&TRANS_=PUT(&TRANS_,&FINFMTS_..);
RUN;
%END;
%ELSE
%DO;
PROC DATA SETS;
CHANGE &INDATA_ = &OUTDATA;
RUN;
%END;
%MEND TRANS;
%TRANS(INDATA_ = ALLDATA4,
OUTDATA_ = ALLDATA5,
INIFMTS_ = TRANSND0,
FINFMTS_ = TRANSND,
TRANS_ = NDKEY)
%GENFMTS(INDATA_ = ALLDATA5,
FMTS_ = TRANSNS,
KEY1_ = ND,
KEY2_ = NS)
%TRANS(INDATA_ = ALLDATA5,
OUTDATA_ = ALLDATA6,
INIFMTS_ = TRANSNS0,
FINFMTS_ = TRANSNS,
TRANS_ = NSKEY)
%GENFMTS(INDATA_ = ALLDATA6,
FMTS_ = TRANSSD,
KEY1_ = NS,
KEY2_ = SD)
%TRANS(INDATA_ = ALLDATA6,
OUTDATA_ = ALLDATA7,
INIFMTS_ = TRANSSD0,
FINFMTS_ = TRANSSD,
TRANS_ = SDKEY)
PROC SORT DATA = ALLDATA7;
BY SDKEY;
RUN;
DATA ALLDATA8(KEEP = ADNUM CLIENTID)/*remove keep statement if you want to*/;/* This step adds the client id */
SET ALLDATA7;
BY SDKEY;
IF FIRST.SDKEY THEN CLIENTID + 1;
RUN;
wow...sort of difficult to follow...
best I could do was build on the first approach.
data HAVE;
input ADNUM SSN NAME $ DOB;
format dob datetime20;
match1 = catx('',put(ssn,z9.),trim(name));
match2=catx('',put(ssn,z9.),put(dob,best12.));
match3=catx('',trim(name),put(dob,best12.));
cards;
101 856116534 ABCD 18760323
187 856176354 ABCD 18760323
233 856176354 ABCD 18670323
456 850176534 ABCD 18760323
490 856176534 ABEF 18670323
535 856176534 ABGF 18760323
601 856176534 ABEF 18760323
632 856176534 ABEE 18760323
879 856116534 ABEF 18760323
911 856176534 ABGF 18760323
919 123456789 JOHN 19000224
200 000089712 DAVE 20000212
;
run;
proc sql;
create table want as
select
t1.adnum as adnum1
,t2.adnum as adnum2
from
have t1
inner join have t2
on (t1.match1 = t2.match1 or t1.match2=t2.match2 or t1.match3=t2.match3)
and t1.adnum < t2.adnum;
quit;
Registration is now open for SAS Innovate 2025 , our biggest and most exciting global event of the year! Join us in Orlando, FL, May 6-9.
Sign up by Dec. 31 to get the 2024 rate of just $495.
Register now!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.