Thank you everyone for your help. I wasn't able to get the best performance from any of the solutions which led me to realize that my address data needed to be cleaned up. Here's the 15 step process I put together which recreates the address table giving priority to the residential address. I'm still verifying the data in my "new" table but based on initial review and test, this works incredibly well. Any comments or suggestions are certainly welcome. I'm always trying to learn! /*PART 1 - FIND ALL GAPS BETWEEN THE RESIDENTIAL ADDRESSES*/
DATA WORK.NEW_ADDR_P1;
SYSECHO "PART 1";
SET WORK.NEW_ADDR_R;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
IF DIF>1 THEN DO;
NEWEFF=EFFECTIVE_DT-(DIF-1);
NEWEND=EFFECTIVE_DT-1;
END;
RUN;
/*PART 2 - DELETE RESIDENTIAL RECORDS FROM THE P1 TABLE WHERE NO GAPS EXISTS BETWEEN RESIDENTIAL ADDRESSES*/
PROC SQL;
SYSECHO "PART 2";
DELETE * FROM WORK.NEW_ADDR_P1 WHERE NEWEFF IS MISSING;
QUIT;
/*PART 3 - CAPTURE MAILING ADDRESS INFO BETWEEN THE RESIDENTIAL GAPS*/
DATA WORK.NEW_ADDR_P22;
SYSECHO "PART 3";
MERGE WORK.NEW_ADDR_P1 (IN=INA) WORK.NEW_ADDR_M (IN=INB);
BY CLIENT_ID;
IF INA AND INB;
RUN;
/*PART 4 - CALCULATE THE CORRECT EFFECTIVE AND END DATES FOR RESIDENTIAL GAPS*/
PROC SQL;
SYSECHO "PART 4";
CREATE TABLE WORK.NEW_ADDR_P2 AS
SELECT CLIENT_ID,
CASE
WHEN NEWEND < EFFECTIVE_DT THEN .
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN NEWEFF
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT >= NEWEND THEN NEWEFF
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN EFFECTIVE_DT
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT >= NEWEND THEN EFFECTIVE_DT
ELSE .
END AS EFFECTIVE_DT2,
CASE
WHEN CALCULATED EFFECTIVE_DT2 IS MISSING THEN .
WHEN NEWEFF > EFFECTIVE_DT AND END_DT <= NEWEND THEN END_DT
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN NEWEND
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT > NEWEND THEN NEWEND
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN END_DT
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT >= NEWEND THEN NEWEND
ELSE .
END AS END_DT2,
POSTAL_AREA_CD, ADDRESS_TYPE_CD
FROM WORK.NEW_ADDR_P22
WHERE CALCULATED EFFECTIVE_DT2 IS NOT MISSING;
QUIT;
/*PART 5 - DATA CLEANUP (RENAMING VARIABLES AND SORTING DATASET*/
PROC SORT DATA=WORK.NEW_ADDR_P2 (RENAME=(EFFECTIVE_DT2=EFFECTIVE_DT) RENAME=(END_DT2=END_DT));
SYSECHO "PART 5";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 6 - CREATE THE BASE TABLE AND APPEND THE RESIDENTIAL ADDRESS INFO TABLES AND SORT*/
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_R;
SYSECHO "PART 6";
RUN;
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P2;
RUN;
PROC SORT DATA=WORK.NEW_ADDR_P3;
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 7 - CAPTURE THE MIN AND MAX EFFECTIVE DATE AND MAX END DATE FROM THE NEW ADDRESS TABLE*/
PROC SQL;
SYSECHO "PART 7";
CREATE TABLE WORK.NEW_ADDR_P4 AS
SELECT CLIENT_ID, MIN(EFFECTIVE_DT) AS MIN_EFFDATE, MAX(EFFECTIVE_DT) AS MAX_EFFDATE, MAX(END_DT) AS MAX_ENDDATE
FROM WORK.NEW_ADDR_P3
GROUP BY CLIENT_ID;
QUIT;
/*PART 8 - CAPTURE MAILING ADDRESSES EARLIER THAN THE MIN EFFECTIVE DATE*/
PROC SQL;
SYSECHO "PART 8";
CREATE TABLE WORK.NEW_ADDR_P5 AS
SELECT T1.*, T2.MIN_EFFDATE
FROM WORK.NEW_ADDR_M T1 INNER JOIN WORK.NEW_ADDR_P4 T2
ON T1.CLIENT_ID=T2.CLIENT_ID AND T2.MIN_EFFDATE> T1.EFFECTIVE_DT
WHERE T2.MIN_EFFDATE > T1.END_DT;
QUIT;
/*PART 9 - UPDATE THE END DATE OF THE LAST MAILING ADDRESS RECORD TO PREVENT OVERLAPPING DATES*/
DATA WORK.NEW_ADDR_P52 (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 9";
SET WORK.NEW_ADDR_P5;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
IF LAST.CLIENT_ID AND DIF>1 THEN END_DT=MIN_EFFDATE-1;
RUN;
/*PART 10 - CAPTURE MAILING ADDRESSES LATER THAN THE MAX EFFECTIVE DATE*/
PROC SQL;
SYSECHO "PART 10";
CREATE TABLE WORK.NEW_ADDR_P6 AS
SELECT T1.*, T2.MAX_EFFDATE, T2.MAX_ENDDATE
FROM WORK.NEW_ADDR_M T1 INNER JOIN WORK.NEW_ADDR_P4 T2
ON T1.CLIENT_ID=T2.CLIENT_ID AND T2.MAX_EFFDATE < T1.EFFECTIVE_DT AND T2.MAX_ENDDATE < T1.END_DT;
QUIT;
/*PART 11 - UPDATE THE EFFECTIVE DATE OF THE LATER ADDRESSES TO PREVENT OVERLAPPING DATES*/
DATA WORK.NEW_ADDR_P62 (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 11";
SET WORK.NEW_ADDR_P6;
BY CLIENT_ID;
IF FIRST.CLIENT_ID THEN DO;
IF EFFECTIVE_DT=MAX_ENDDATE THEN EFFECTIVE_DT=MAX_ENDDATE+1;
END;
RUN;
/*PART 12 - APPEND MAILING ADDRESS INFO ON RECORD BEFORE AND AFTER THE RESIDENTIAL THEN SORT*/
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P52;
SYSECHO "PART 12-1";
RUN;
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P62;
SYSECHO "PART 12-2";
RUN;
PROC SORT DATA=WORK.NEW_ADDR_P3;
SYSECHO "PART 12-3";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 13 - CHECK TABLE FOR OVERLAPPING DATES CAUSED BY INVALID PRODUCTION DATA*/
DATA WORK.NEW_ADDR_CHECK;
SYSECHO "PART 13";
SET WORK.NEW_ADDR_P3;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
RUN;
/*PART 14 - DELETE RECORDS WITH NEGATIVE DATE DIF VALUES*/
PROC SQL;
SYSECHO "PART 14";
DELETE * FROM WORK.NEW_ADDR_CHECK WHERE DIF<0;
QUIT;
/*PART 15 - CREATE FINAL TABLE WITH CORRECT VARIABLES AND SORT*/
PROC SORT DATA=WORK.NEW_ADDR_CHECK OUT=WORK.NEW_ADDR_FINAL (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 15";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
... View more