I'm trying to create data set WANT from HAVE based on the condition that there must be a match on at least 2keys. I have two approaches that I have tried. Thanks to some SAS information at www2.sas.com/proceedings/sugi27/p075-27.pdf The SQL approach finds the match but becomes a time consuming and resource intensive approach when dealing with large data set. Approach 2 does work but definitely a lengthy process and I find it a bit difficult to follow and replicate. Question: 1) Anyone has a different (3rd) approach to solving this problem? OR 2) Could anyone provide an enhancement to approach 2 by adding a new variable country (‘US’ for all participants) to data set work.HAVE and generate a data set work.WANT with the condition that there’s a match on at least 2 keys. Note: The client_ID value should not change (I think). Thanks HAVE ADNUM SSN NAME DOB 101 856116534 ABCD 18760323 187 856176354 ABCD 18760323 233 856176354 ABCD 18670323 456 850176534 ABCD 18760323 490 856176534 ABEF 18670323 535 856176534 ABGF 18760323 601 856176534 ABEF 18760323 632 856176534 ABEE 18760323 879 856116534 ABEF 18760323 911 856176534 ABGF 18760323 919 123456789 JOHN 19000224 200 000089712 DAVE 20000212 WANT (ALLDATA8) ADNUM SSN NAME DOB Client_id 101 856116534 ABCD 18760323 3 187 856176354 ABCD 18760323 3 233 856176354 ABCD 18670323 3 456 850176534 ABCD 18760323 3 490 856176534 ABEF 18670323 3 535 856176534 ABGF 18760323 3 601 856176534 ABEF 18760323 3 632 856176534 ABEE 18760323 3 879 856116534 ABEF 18760323 3 911 856176534 ABGF 18760323 3 919 123456789 JOHN 19000224 2 200 000089712 DAVE 20000212 1 /* SQL approach */ PROC SQL; CREATE TABLE work.match AS SELECT X.ADNUM AS ADNUM1, Y.ADNUM AS ADNUM2 FROM work.have X, work.have Y WHERE ( (X.SSN EQ Y.SSN) + (X.NAME EQ Y.NAME) + (X.DOB EQ Y.DOB) ) >= 2 AND (X.ADNUM < Y.ADNUM) ORDER BY ADNUM1, ADNUM2; QUIT; /* Approach 2*/ DATA ALLDATA1 /*(KEEP = ADNUM NSID SDID NDID)*/; SET work.have;/* imported data or created data set*/ LENGTH NSID $ 13 SDID $ 17 NDID $ 12 DOBC $ 8; DOBC = TRIM(LEFT(COMPRESS(PUT(DOB, BEST12.),'-'))); NSID = TRIM(NAME)||TRIM(SSN); NDID = TRIM(NAME)||DOBC; SDID = TRIM(SSN)||DOBC; RUN; %MACRO GENKEY(INDATA_ = , /* INPUT DATA */ OUTDATA_ = , /* OUTPUT DATA */ KEYID_ = /* BY VAR. */); %LOCAL INDATA_ OUTDATA_ KEYID_; PROC SORT DATA = &INDATA_; BY &KEYID_.ID; RUN; DATA &OUTDATA_(DROP = &KEYID_.ID); SET &INDATA_; BY &KEYID_.ID; IF FIRST.&KEYID_.ID THEN &KEYID_.KEY+1; RUN; %MEND GENKEY; %GENKEY(INDATA_ = ALLDATA1, OUTDATA_ = ALLDATA2, KEYID_ = NS) %GENKEY(INDATA_ = ALLDATA2, OUTDATA_ = ALLDATA3, KEYID_ = ND) %GENKEY(INDATA_ = ALLDATA3, OUTDATA_ = ALLDATA4, KEYID_ = SD) %MACRO GENFMTS(INDATA_ = , /* INPUT DATA */ FMTS_ = , /* FORMATS NAME */ KEY1_ = , /* WITHIN */ KEY2_ = /* TRANSLATE */); %LOCAL INDATA_ FMTS_ KEY1_ KEY2_; PROC SORT DATA = &INDATA_; BY &KEY1_.KEY &KEY2_.KEY; RUN; DATA &FMTS_.0(KEEP = FMTNAME START LABEL KEY); SET &INDATA_; BY &KEY1_.KEY; RETAIN FMTNAME "&FMTS_" &KEY2_.KEYF; IF FIRST.&KEY1_.KEY THEN &KEY2_.KEYF = &KEY2_.KEY; IF &KEY2_.KEY NE &KEY2_.KEYF THEN DO; START = &KEY2_.KEY; LABEL = &KEY2_.KEYF; KEY + 1; OUTPUT; END; RUN; PROC SORT DATA = &FMTS_.0 NODUPKEY; BY START LABEL; RUN; %MEND GENFMTS; %GENFMTS(INDATA_ = ALLDATA4, FMTS_ = TRANSND, KEY1_ = SD, KEY2_ = ND) /* Tested to this level */ %MACRO TRANS(INDATA_ = , OUTDATA_ = , INIFMTS_ = , FINFMTS_ = , TRANS_ = ); %LOCAL INDATA_ /* INPUT DATA SET */ OUTDATA_ /* OUTPUT DATA SET */ INIFMTS_ /* INITIAL FORMATS */ FINFMTS_ /* FINAL FORMATS */ TRANS_ /* **KEY TO BE TRANSLATED */ NEWOBS /* OBS. IN INITIAL FORMATS */ OLDOBS /* OBS. IN FINAL FORMATS */ LKOBS /* NUM OF LINK CASES TO BE UPDATED IN EACH ITERATION */ REOBS /* NUM OF REPEAT CASES TO BE UPDATED IN EACH ITERATION */ ITERS /* NUM OF ITERATIONS */ UP /* SERIES NUMBER FOR THE TEMPORARY UPDATE DATA SET */ LK /* SERIES NUMBER FOR THE TEMPORARY LINK DATA SET */ RE /* SERIES NUMBER FOR THE TEMPORARY REPEAT DATA SET */ ; %LET UP = 1; %LET LK = 1; %LET RE = 1; %LET ITERS = 0; DATA _NULL_; CALL SYMPUT('OLDOBS', TRIM(LEFT(PUT(OLDOBS,10.)))); SET &INIFMTS_ NOBS = OLDOBS; STOP; RUN; %PUT %STR( ); %PUT NOTE: DATA SET &INIFMTS_ HAS &OLDOBS OBSERVATION PAIRS.; PROC SQL; CREATE TABLE LINK&LK AS SELECT Y.KEY AS KEY, Y.START AS START, X.LABEL AS LABEL FROM &INIFMTS_ X, &INIFMTS_ Y WHERE X.START EQ Y.LABEL ORDER BY KEY, START, LABEL; QUIT; PROC SQL; CREATE TABLE REPEAT&RE AS SELECT Y.KEY AS KEY, Y.LABEL AS START, X.LABEL AS LABEL FROM &INIFMTS_ X, &INIFMTS_ Y WHERE (X.START EQ Y.START) AND (Y.LABEL > X.LABEL) ORDER BY KEY, START, LABEL; QUIT; DATA _NULL_; CALL SYMPUT('LKOBS', TRIM(LEFT(PUT(LINK,10.)))); SET LINK&LK NOBS = LINK; STOP; RUN; %PUT %STR( ); %PUT NOTE: DATA SET LINK&LK HAS &LKOBS OBSERVATION PAIRS.; DATA _NULL_; CALL SYMPUT('REOBS', TRIM(LEFT(PUT(REPEAT,10.)))); SET REPEAT&RE NOBS = REPEAT; STOP; RUN; %PUT %STR( ); %PUT NOTE: DATA SET REPEAT&RE HAS &REOBS OBSERVATION PAIRS.; %DO %WHILE ((&LKOBS NE 0) OR (&REOBS NE 0)); PROC SORT DATA = &INIFMTS_; BY KEY; RUN; DATA UPDT&UP; UPDATE &INIFMTS_ LINK&LK; BY KEY; RUN; DATA UPDT%EVAL(&UP+1); UPDATE UPDT&UP REPEAT&RE; BY KEY; RUN; PROC SORT DATA = UPDT%EVAL(&UP+1) NODUPKEY; BY START LABEL; RUN; PROC SQL; CREATE TABLE LINK%EVAL(&LK+1) AS SELECT Y.KEY AS KEY, Y.START AS START, X.LABEL AS LABEL FROM UPDT%EVAL(&UP+1) X, UPDT%EVAL(&UP+1) Y WHERE X.START EQ Y.LABEL ORDER BY KEY, START, LABEL; QUIT; PROC SQL; CREATE TABLE REPEAT%EVAL(&RE+1) AS SELECT Y.KEY AS KEY, Y.LABEL AS START, X.LABEL AS LABEL FROM UPDT%EVAL(&UP+1) X, UPDT%EVAL(&UP+1) Y WHERE (X.START EQ Y.START) AND (Y.LABEL > X.LABEL) ORDER BY KEY, START, LABEL; QUIT; DATA _NULL_; CALL SYMPUT('LKOBS', TRIM(LEFT(PUT(LINK,10.)))); SET LINK%EVAL(&LK+1) NOBS=LINK; STOP; RUN; %PUT %STR( ); %PUT NOTE: DATA SET LINK%EVAL(&LK+1) HAS &LKOBS OBSERVATION PAIRS.; DATA _NULL_; CALL SYMPUT('REOBS', TRIM(LEFT(PUT(REPEAT,10.)))); SET REPEAT%EVAL(&RE+1) NOBS=REPEAT; STOP; RUN; %PUT %STR( ); %PUT NOTE: DATA SET REPEAT%EVAL(&RE+1) HAS &REOBS OBSERVATION PAIRS.; %LET ITERS = %EVAL(&ITERS+1); %LET INIFMTS_ = UPDT%EVAL(&UP+1); %LET UP = %EVAL(&UP+2); %LET LK = %EVAL(&LK+1); %LET RE = %EVAL(&RE+1); %END; /* END OF %DO %WHILE LOOP */ DATA _NULL_; CALL SYMPUT('NEWOBS', TRIM(LEFT(PUT(NEWOBS,10.)))); SET &INIFMTS_ NOBS=NEWOBS; STOP; RUN; %PUT NOTE: TRANS MADE &ITERS ITERATIONS.; %PUT NOTE: %EVAL(&OLDOBS - &NEWOBS) DUPLICATE MAPPINGS WERE REMOVED.; OPTIONS NOTES; %PUT NOTE: SORTING &NEWOBS OBSERVATIONS IN THE UPDATES DATA INTO &FINFMTS_..; PROC SORT DATA = &INIFMTS_ OUT = &FINFMTS_ NODUPKEY; BY START LABEL; RUN; %IF &NEWOBS NE 0 %THEN %DO; PROC FORMAT CNTLIN = &FINFMTS_; DATA &OUTDATA_; SET &INDATA_; &TRANS_=PUT(&TRANS_,&FINFMTS_..); RUN; %END; %ELSE %DO; PROC DATA SETS; CHANGE &INDATA_ = &OUTDATA; RUN; %END; %MEND TRANS; %TRANS(INDATA_ = ALLDATA4, OUTDATA_ = ALLDATA5, INIFMTS_ = TRANSND0, FINFMTS_ = TRANSND, TRANS_ = NDKEY) %GENFMTS(INDATA_ = ALLDATA5, FMTS_ = TRANSNS, KEY1_ = ND, KEY2_ = NS) %TRANS(INDATA_ = ALLDATA5, OUTDATA_ = ALLDATA6, INIFMTS_ = TRANSNS0, FINFMTS_ = TRANSNS, TRANS_ = NSKEY) %GENFMTS(INDATA_ = ALLDATA6, FMTS_ = TRANSSD, KEY1_ = NS, KEY2_ = SD) %TRANS(INDATA_ = ALLDATA6, OUTDATA_ = ALLDATA7, INIFMTS_ = TRANSSD0, FINFMTS_ = TRANSSD, TRANS_ = SDKEY) PROC SORT DATA = ALLDATA7; BY SDKEY; RUN; DATA ALLDATA8(KEEP = ADNUM CLIENTID)/*remove keep statement if you want to*/;/* This step adds the client id */ SET ALLDATA7; BY SDKEY; IF FIRST.SDKEY THEN CLIENTID + 1; RUN;
... View more