Match on at least 2 keys

Reply
Occasional Contributor
Posts: 11

Match on at least 2 keys

I'm trying to create data set WANT from HAVE based on the condition that there must be a match on at least 2keys. I have two approaches that I have tried. Thanks to some SAS information at www2.sas.com/proceedings/sugi27/p075-27.pdf The SQL approach finds the match but becomes a time consuming and resource intensive approach when dealing with large data set. Approach 2 does work but definitely a lengthy process and I find it a bit difficult to follow and replicate.

Question: 1) Anyone has a different (3rd) approach to solving this problem? OR
2) Could anyone provide an enhancement to approach 2 by adding a new variable country (‘US’ for all participants) to data set work.HAVE and generate a data set work.WANT with the condition that there’s a match on at least 2 keys. Note: The client_ID value should not change (I think). Thanks


HAVE


ADNUM SSN NAME DOB
101 856116534 ABCD 18760323
187 856176354 ABCD 18760323
233 856176354 ABCD 18670323
456 850176534 ABCD 18760323
490 856176534 ABEF 18670323
535 856176534 ABGF 18760323
601 856176534 ABEF 18760323
632 856176534 ABEE 18760323
879 856116534 ABEF 18760323
911 856176534 ABGF 18760323
919 123456789 JOHN 19000224
200 000089712 DAVE 20000212


WANT (ALLDATA8)

ADNUM SSN NAME DOB Client_id
101 856116534 ABCD 18760323 3
187 856176354 ABCD 18760323 3
233 856176354 ABCD 18670323 3
456 850176534 ABCD 18760323 3
490 856176534 ABEF 18670323 3
535 856176534 ABGF 18760323 3
601 856176534 ABEF 18760323 3
632 856176534 ABEE 18760323 3
879 856116534 ABEF 18760323 3
911 856176534 ABGF 18760323 3
919 123456789 JOHN 19000224 2
200 000089712 DAVE 20000212 1

 
/* SQL approach */
PROC SQL;
CREATE TABLE work.match AS
SELECT X.ADNUM AS ADNUM1,
Y.ADNUM AS ADNUM2
FROM work.have X, work.have Y
WHERE ( (X.SSN EQ Y.SSN)
+ (X.NAME EQ Y.NAME)
+ (X.DOB EQ Y.DOB) ) >= 2
AND (X.ADNUM < Y.ADNUM)
ORDER BY ADNUM1, ADNUM2;
QUIT;


/* Approach 2*/
DATA ALLDATA1 /*(KEEP = ADNUM NSID SDID NDID)*/;
SET work.have;/* imported data or created data set*/
LENGTH NSID $ 13 SDID $ 17 NDID $ 12
DOBC $ 8;
DOBC = TRIM(LEFT(COMPRESS(PUT(DOB,
BEST12.),'-')));
NSID = TRIM(NAME)||TRIM(SSN);
NDID = TRIM(NAME)||DOBC;
SDID = TRIM(SSN)||DOBC;
RUN;
%MACRO GENKEY(INDATA_ = , /* INPUT DATA */
OUTDATA_ = , /* OUTPUT DATA */
KEYID_ = /* BY VAR. */);
%LOCAL INDATA_ OUTDATA_ KEYID_;
PROC SORT DATA = &INDATA_;
BY &KEYID_.ID;
RUN;
DATA &OUTDATA_(DROP = &KEYID_.ID);
SET &INDATA_;
BY &KEYID_.ID;
IF FIRST.&KEYID_.ID THEN &KEYID_.KEY+1;
RUN;
%MEND GENKEY;
%GENKEY(INDATA_ = ALLDATA1,
OUTDATA_ = ALLDATA2,
KEYID_ = NS)
%GENKEY(INDATA_ = ALLDATA2,
OUTDATA_ = ALLDATA3,
KEYID_ = ND)
%GENKEY(INDATA_ = ALLDATA3,
OUTDATA_ = ALLDATA4,
KEYID_ = SD)
%MACRO GENFMTS(INDATA_ = , /* INPUT DATA */
FMTS_ = , /* FORMATS NAME */
KEY1_ = , /* WITHIN */
KEY2_ = /* TRANSLATE */);
%LOCAL INDATA_ FMTS_ KEY1_ KEY2_;
PROC SORT DATA = &INDATA_;
BY &KEY1_.KEY &KEY2_.KEY;
RUN;
DATA &FMTS_.0(KEEP = FMTNAME START LABEL
KEY);
SET &INDATA_;
BY &KEY1_.KEY;
RETAIN FMTNAME "&FMTS_" &KEY2_.KEYF;
IF FIRST.&KEY1_.KEY
THEN &KEY2_.KEYF = &KEY2_.KEY;
IF &KEY2_.KEY NE &KEY2_.KEYF THEN
DO;
START = &KEY2_.KEY;
LABEL = &KEY2_.KEYF;
KEY + 1;
OUTPUT;
END;
RUN;
PROC SORT DATA = &FMTS_.0 NODUPKEY;
BY START LABEL;
RUN;
%MEND GENFMTS;
%GENFMTS(INDATA_ = ALLDATA4,
FMTS_ = TRANSND,
KEY1_ = SD,
KEY2_ = ND) /* Tested to this level */
%MACRO TRANS(INDATA_ = ,
OUTDATA_ = ,
INIFMTS_ = ,
FINFMTS_ = ,
TRANS_ = );
%LOCAL
INDATA_ /* INPUT DATA SET */
OUTDATA_ /* OUTPUT DATA SET */
INIFMTS_ /* INITIAL FORMATS */
FINFMTS_ /* FINAL FORMATS */
TRANS_ /* **KEY TO BE TRANSLATED */
NEWOBS /* OBS. IN INITIAL FORMATS */
OLDOBS /* OBS. IN FINAL FORMATS */
LKOBS /* NUM OF LINK CASES TO BE
UPDATED IN EACH ITERATION */
REOBS /* NUM OF REPEAT CASES TO BE
UPDATED IN EACH ITERATION */
ITERS /* NUM OF ITERATIONS */
UP /* SERIES NUMBER FOR THE
TEMPORARY UPDATE DATA SET */
LK /* SERIES NUMBER FOR THE
TEMPORARY LINK DATA SET */
RE /* SERIES NUMBER FOR THE
TEMPORARY REPEAT DATA SET */
;
%LET UP = 1;
%LET LK = 1;
%LET RE = 1;
%LET ITERS = 0;
DATA _NULL_;
CALL SYMPUT('OLDOBS',
TRIM(LEFT(PUT(OLDOBS,10.))));
SET &INIFMTS_ NOBS = OLDOBS;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET &INIFMTS_ HAS &OLDOBS
OBSERVATION PAIRS.;
PROC SQL;
CREATE TABLE LINK&LK AS
SELECT Y.KEY AS KEY,
Y.START AS START,
X.LABEL AS LABEL
FROM &INIFMTS_ X, &INIFMTS_ Y
WHERE X.START EQ Y.LABEL
ORDER BY KEY, START, LABEL;
QUIT;
PROC SQL;
CREATE TABLE REPEAT&RE AS
SELECT Y.KEY AS KEY,
Y.LABEL AS START,
X.LABEL AS LABEL
FROM &INIFMTS_ X, &INIFMTS_ Y
WHERE (X.START EQ Y.START)
AND (Y.LABEL > X.LABEL)
ORDER BY KEY, START, LABEL;
QUIT;
DATA _NULL_;
CALL SYMPUT('LKOBS',
TRIM(LEFT(PUT(LINK,10.))));
SET LINK&LK NOBS = LINK;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET LINK&LK HAS &LKOBS
OBSERVATION PAIRS.;
DATA _NULL_;
CALL SYMPUT('REOBS',
TRIM(LEFT(PUT(REPEAT,10.))));
SET REPEAT&RE NOBS = REPEAT;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET REPEAT&RE HAS &REOBS
OBSERVATION PAIRS.;
%DO %WHILE ((&LKOBS NE 0)
OR (&REOBS NE 0));
PROC SORT DATA = &INIFMTS_;
BY KEY;
RUN;
DATA UPDT&UP;
UPDATE &INIFMTS_ LINK&LK;
BY KEY;
RUN;
DATA UPDT%EVAL(&UP+1);
UPDATE UPDT&UP REPEAT&RE;
BY KEY;
RUN;
PROC SORT DATA = UPDT%EVAL(&UP+1)
NODUPKEY;
BY START LABEL;
RUN;
PROC SQL;
CREATE TABLE LINK%EVAL(&LK+1) AS
SELECT Y.KEY AS KEY,
Y.START AS START,
X.LABEL AS LABEL
FROM UPDT%EVAL(&UP+1) X,
UPDT%EVAL(&UP+1) Y
WHERE X.START EQ Y.LABEL
ORDER BY KEY, START, LABEL;
QUIT;
PROC SQL;
CREATE TABLE REPEAT%EVAL(&RE+1) AS
SELECT Y.KEY AS KEY,
Y.LABEL AS START,
X.LABEL AS LABEL
FROM UPDT%EVAL(&UP+1) X,
UPDT%EVAL(&UP+1) Y
WHERE (X.START EQ Y.START)
AND (Y.LABEL > X.LABEL)
ORDER BY KEY, START, LABEL;
QUIT;
DATA _NULL_;
CALL SYMPUT('LKOBS',
TRIM(LEFT(PUT(LINK,10.))));
SET LINK%EVAL(&LK+1) NOBS=LINK;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET LINK%EVAL(&LK+1)
HAS &LKOBS OBSERVATION PAIRS.;
DATA _NULL_;
CALL SYMPUT('REOBS',
TRIM(LEFT(PUT(REPEAT,10.))));
SET REPEAT%EVAL(&RE+1) NOBS=REPEAT;
STOP;
RUN;
%PUT %STR( );
%PUT NOTE: DATA SET REPEAT%EVAL(&RE+1)
HAS &REOBS OBSERVATION PAIRS.;
%LET ITERS = %EVAL(&ITERS+1);
%LET INIFMTS_ = UPDT%EVAL(&UP+1);
%LET UP = %EVAL(&UP+2);
%LET LK = %EVAL(&LK+1);
%LET RE = %EVAL(&RE+1);
%END; /* END OF %DO %WHILE LOOP */
DATA _NULL_;
CALL SYMPUT('NEWOBS',
TRIM(LEFT(PUT(NEWOBS,10.))));
SET &INIFMTS_ NOBS=NEWOBS;
STOP;
RUN;
%PUT NOTE: TRANS MADE &ITERS ITERATIONS.;
%PUT NOTE: %EVAL(&OLDOBS - &NEWOBS)
DUPLICATE MAPPINGS WERE REMOVED.;
OPTIONS NOTES;
%PUT NOTE: SORTING &NEWOBS OBSERVATIONS IN
THE UPDATES DATA INTO &FINFMTS_..;
PROC SORT DATA = &INIFMTS_ OUT = &FINFMTS_
NODUPKEY;
BY START LABEL;
RUN;
%IF &NEWOBS NE 0 %THEN
%DO;
PROC FORMAT CNTLIN = &FINFMTS_;
DATA &OUTDATA_;
SET &INDATA_;
&TRANS_=PUT(&TRANS_,&FINFMTS_..);
RUN;
%END;
%ELSE
%DO;
PROC DATA SETS;
CHANGE &INDATA_ = &OUTDATA;
RUN;
%END;
%MEND TRANS;
%TRANS(INDATA_ = ALLDATA4,
OUTDATA_ = ALLDATA5,
INIFMTS_ = TRANSND0,
FINFMTS_ = TRANSND,
TRANS_ = NDKEY)
%GENFMTS(INDATA_ = ALLDATA5,
FMTS_ = TRANSNS,
KEY1_ = ND,
KEY2_ = NS)
%TRANS(INDATA_ = ALLDATA5,
OUTDATA_ = ALLDATA6,
INIFMTS_ = TRANSNS0,
FINFMTS_ = TRANSNS,
TRANS_ = NSKEY)
%GENFMTS(INDATA_ = ALLDATA6,
FMTS_ = TRANSSD,
KEY1_ = NS,
KEY2_ = SD)
%TRANS(INDATA_ = ALLDATA6,
OUTDATA_ = ALLDATA7,
INIFMTS_ = TRANSSD0,
FINFMTS_ = TRANSSD,
TRANS_ = SDKEY)
PROC SORT DATA = ALLDATA7;
BY SDKEY;
RUN;
DATA ALLDATA8(KEEP = ADNUM CLIENTID)/*remove keep statement if you want to*/;/* This step adds the client id */
SET ALLDATA7;
BY SDKEY;
IF FIRST.SDKEY THEN CLIENTID + 1;
RUN;

Super Contributor
Posts: 578

Re: Match on at least 2 keys

wow...sort of difficult to follow...

best I could do was build on the first approach.

data HAVE;

input ADNUM SSN  NAME $ DOB;

format dob datetime20;

match1 = catx('',put(ssn,z9.),trim(name));

match2=catx('',put(ssn,z9.),put(dob,best12.));

match3=catx('',trim(name),put(dob,best12.));

cards;

101 856116534 ABCD 18760323

187 856176354 ABCD 18760323

233 856176354 ABCD 18670323

456 850176534 ABCD 18760323

490 856176534 ABEF 18670323

535 856176534 ABGF 18760323

601 856176534 ABEF 18760323

632 856176534 ABEE 18760323

879 856116534 ABEF 18760323

911 856176534 ABGF 18760323

919 123456789 JOHN 19000224

200 000089712 DAVE 20000212

;

run;

proc sql;

create table want as

select  

    t1.adnum as adnum1

    ,t2.adnum as adnum2

from

    have t1

    inner join have t2

        on (t1.match1 = t2.match1 or t1.match2=t2.match2 or t1.match3=t2.match3)

           and t1.adnum < t2.adnum;

quit;

Ask a Question
Discussion stats
  • 1 reply
  • 223 views
  • 0 likes
  • 2 in conversation