Hi All,
I'm looking for help to join my accounting data with address information at the time of the transaction. The address information for the customer needs to be prioritized so that I use the 'R' address only even if the customer has both an 'R' and 'M' address at the time of the transaction.
This is what I need the data to look like
ID | TRANS_DT | POSTAL_AT_TIME_OF_TRANS | ADDRESS_TYPE |
2125 | 2004-03-01 | M6S4A1 | R |
You can see from the table below with the address information that both the 'R' and 'M' address were "effective" at the time of the transaction.
BTW: The transaction data is just the first two columns (ID and TRANS_DT). This is what the address information looks like
ID | EFFECTIVE_DT | END_DATE_CALC | ID_POSTAL | ADDRESS_TYPE_CD |
2125 | 1960-01-01 | 1999-01-26 | M3C1C2 | M |
2125 | 1960-01-01 | 1999-01-26 | M6N1Z2 | R |
2125 | 1999-01-27 | 2006-10-31 | P0H1T0 | M |
2125 | 1999-01-27 | 2007-06-29 | M6S4A1 | R |
2125 | 2006-11-01 | 2011-12-24 | M6N2M1 | M |
2125 | 2012-10-16 | 2018-03-31 | M3L2H7 | M |
Thank you everyone for your help. I wasn't able to get the best performance from any of the solutions which led me to realize that my address data needed to be cleaned up. Here's the 15 step process I put together which recreates the address table giving priority to the residential address. I'm still verifying the data in my "new" table but based on initial review and test, this works incredibly well.
Any comments or suggestions are certainly welcome. I'm always trying to learn!
/*PART 1 - FIND ALL GAPS BETWEEN THE RESIDENTIAL ADDRESSES*/
DATA WORK.NEW_ADDR_P1;
SYSECHO "PART 1";
SET WORK.NEW_ADDR_R;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
IF DIF>1 THEN DO;
NEWEFF=EFFECTIVE_DT-(DIF-1);
NEWEND=EFFECTIVE_DT-1;
END;
RUN;
/*PART 2 - DELETE RESIDENTIAL RECORDS FROM THE P1 TABLE WHERE NO GAPS EXISTS BETWEEN RESIDENTIAL ADDRESSES*/
PROC SQL;
SYSECHO "PART 2";
DELETE * FROM WORK.NEW_ADDR_P1 WHERE NEWEFF IS MISSING;
QUIT;
/*PART 3 - CAPTURE MAILING ADDRESS INFO BETWEEN THE RESIDENTIAL GAPS*/
DATA WORK.NEW_ADDR_P22;
SYSECHO "PART 3";
MERGE WORK.NEW_ADDR_P1 (IN=INA) WORK.NEW_ADDR_M (IN=INB);
BY CLIENT_ID;
IF INA AND INB;
RUN;
/*PART 4 - CALCULATE THE CORRECT EFFECTIVE AND END DATES FOR RESIDENTIAL GAPS*/
PROC SQL;
SYSECHO "PART 4";
CREATE TABLE WORK.NEW_ADDR_P2 AS
SELECT CLIENT_ID,
CASE
WHEN NEWEND < EFFECTIVE_DT THEN .
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN NEWEFF
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT >= NEWEND THEN NEWEFF
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN EFFECTIVE_DT
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT >= NEWEND THEN EFFECTIVE_DT
ELSE .
END AS EFFECTIVE_DT2,
CASE
WHEN CALCULATED EFFECTIVE_DT2 IS MISSING THEN .
WHEN NEWEFF > EFFECTIVE_DT AND END_DT <= NEWEND THEN END_DT
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN NEWEND
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT > NEWEND THEN NEWEND
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN END_DT
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT >= NEWEND THEN NEWEND
ELSE .
END AS END_DT2,
POSTAL_AREA_CD, ADDRESS_TYPE_CD
FROM WORK.NEW_ADDR_P22
WHERE CALCULATED EFFECTIVE_DT2 IS NOT MISSING;
QUIT;
/*PART 5 - DATA CLEANUP (RENAMING VARIABLES AND SORTING DATASET*/
PROC SORT DATA=WORK.NEW_ADDR_P2 (RENAME=(EFFECTIVE_DT2=EFFECTIVE_DT) RENAME=(END_DT2=END_DT));
SYSECHO "PART 5";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 6 - CREATE THE BASE TABLE AND APPEND THE RESIDENTIAL ADDRESS INFO TABLES AND SORT*/
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_R;
SYSECHO "PART 6";
RUN;
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P2;
RUN;
PROC SORT DATA=WORK.NEW_ADDR_P3;
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 7 - CAPTURE THE MIN AND MAX EFFECTIVE DATE AND MAX END DATE FROM THE NEW ADDRESS TABLE*/
PROC SQL;
SYSECHO "PART 7";
CREATE TABLE WORK.NEW_ADDR_P4 AS
SELECT CLIENT_ID, MIN(EFFECTIVE_DT) AS MIN_EFFDATE, MAX(EFFECTIVE_DT) AS MAX_EFFDATE, MAX(END_DT) AS MAX_ENDDATE
FROM WORK.NEW_ADDR_P3
GROUP BY CLIENT_ID;
QUIT;
/*PART 8 - CAPTURE MAILING ADDRESSES EARLIER THAN THE MIN EFFECTIVE DATE*/
PROC SQL;
SYSECHO "PART 8";
CREATE TABLE WORK.NEW_ADDR_P5 AS
SELECT T1.*, T2.MIN_EFFDATE
FROM WORK.NEW_ADDR_M T1 INNER JOIN WORK.NEW_ADDR_P4 T2
ON T1.CLIENT_ID=T2.CLIENT_ID AND T2.MIN_EFFDATE> T1.EFFECTIVE_DT
WHERE T2.MIN_EFFDATE > T1.END_DT;
QUIT;
/*PART 9 - UPDATE THE END DATE OF THE LAST MAILING ADDRESS RECORD TO PREVENT OVERLAPPING DATES*/
DATA WORK.NEW_ADDR_P52 (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 9";
SET WORK.NEW_ADDR_P5;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
IF LAST.CLIENT_ID AND DIF>1 THEN END_DT=MIN_EFFDATE-1;
RUN;
/*PART 10 - CAPTURE MAILING ADDRESSES LATER THAN THE MAX EFFECTIVE DATE*/
PROC SQL;
SYSECHO "PART 10";
CREATE TABLE WORK.NEW_ADDR_P6 AS
SELECT T1.*, T2.MAX_EFFDATE, T2.MAX_ENDDATE
FROM WORK.NEW_ADDR_M T1 INNER JOIN WORK.NEW_ADDR_P4 T2
ON T1.CLIENT_ID=T2.CLIENT_ID AND T2.MAX_EFFDATE < T1.EFFECTIVE_DT AND T2.MAX_ENDDATE < T1.END_DT;
QUIT;
/*PART 11 - UPDATE THE EFFECTIVE DATE OF THE LATER ADDRESSES TO PREVENT OVERLAPPING DATES*/
DATA WORK.NEW_ADDR_P62 (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 11";
SET WORK.NEW_ADDR_P6;
BY CLIENT_ID;
IF FIRST.CLIENT_ID THEN DO;
IF EFFECTIVE_DT=MAX_ENDDATE THEN EFFECTIVE_DT=MAX_ENDDATE+1;
END;
RUN;
/*PART 12 - APPEND MAILING ADDRESS INFO ON RECORD BEFORE AND AFTER THE RESIDENTIAL THEN SORT*/
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P52;
SYSECHO "PART 12-1";
RUN;
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P62;
SYSECHO "PART 12-2";
RUN;
PROC SORT DATA=WORK.NEW_ADDR_P3;
SYSECHO "PART 12-3";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 13 - CHECK TABLE FOR OVERLAPPING DATES CAUSED BY INVALID PRODUCTION DATA*/
DATA WORK.NEW_ADDR_CHECK;
SYSECHO "PART 13";
SET WORK.NEW_ADDR_P3;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
RUN;
/*PART 14 - DELETE RECORDS WITH NEGATIVE DATE DIF VALUES*/
PROC SQL;
SYSECHO "PART 14";
DELETE * FROM WORK.NEW_ADDR_CHECK WHERE DIF<0;
QUIT;
/*PART 15 - CREATE FINAL TABLE WITH CORRECT VARIABLES AND SORT*/
PROC SORT DATA=WORK.NEW_ADDR_CHECK OUT=WORK.NEW_ADDR_FINAL (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 15";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
@chrishull wrote:
When adding the flag, how do you recommend that I identify the address that I should use? I think I understand the concept but I'm not sure how I handle the priority flag when I have overlapping dates.
You're right, that approach won't work with overlapping dates. Just to clarify for overlapping dates you want to take the M, not R record?
I would merge first, ignoring the dual record issue. You'll get multiples. Then sort within a data step so that M's are always first and take the top record.
@Reeza Any clue why this adds a zero to the date value of the second date column?
data temp;
length ID 8. effective_dt end_date_calc 8.;
format effective_dt yymmdd10. end_date_calc yymmdd10.;
input ID effective_dt yymmdd10. end_date_calc yymmdd10. ;
datalines;
2125 1960-01-01 1999-01-26
2125 1960-01-01 1999-01-26
2125 1999-01-27 2006-10-31
2125 1999-01-27 2007-06-29
2125 2006-11-01 2011-12-24
2125 2012-10-16 2018-03-31
;;;
run;
This gives:
Register today and join us virtually on June 16!
sasglobalforum.com | #SASGF
View now: on-demand content for SAS users
data temp; length ID 8. effective_dt end_date_calc 8.; informat effective_dt yymmdd10. end_date_calc yymmdd10.; format effective_dt yymmdd10. end_date_calc yymmdd10.; input ID effective_dt end_date_calc ; datalines; 2125 1960-01-01 1999-01-26 2125 1960-01-01 1999-01-26 2125 1999-01-27 2006-10-31 2125 1999-01-27 2007-06-29 2125 2006-11-01 2011-12-24 2125 2012-10-16 2018-03-31 ;;; run;
This works.
@noling wrote:
@Reeza Any clue why this adds a zero to the date value of the second date column?
data temp; length ID 8. effective_dt end_date_calc 8.; format effective_dt yymmdd10. end_date_calc yymmdd10.; input ID effective_dt yymmdd10. end_date_calc yymmdd10. ; datalines; 2125 1960-01-01 1999-01-26 2125 1960-01-01 1999-01-26 2125 1999-01-27 2006-10-31 2125 1999-01-27 2007-06-29 2125 2006-11-01 2011-12-24 2125 2012-10-16 2018-03-31 ;;; run;
This gives:
@noling , use the colon modifier so that the input field starts with the next available non-space character.
input ID effective_dt :yymmdd10. end_date_calc :yymmdd10. ;
Because you told it to read exactly 10 characters, but since you were starting at the space between two dates the one's place of the last date was not read.
I guess you need something like
select ID, ..., ( select id_postal from Addresses where id=t.id and t.trans_dt between effective_dt and end_date_calc group by id having address_type_cd=max(address_type_cd)) as postal_at_time_of_trans from Transactions as t;
Thank you, PG. I'm running this now on approx. 300,000 transaction records (1/100th of my total transaction data volume). It's been running for over 30 minutes. Do you think performance is impacted because the address dataset has 60+ million rows?
Any ideas on how to speed it up?
It's going to be a slow lookup because you're not doing an exact match, you'll have to check all records within the date ranges. Do your data sets have indexes? That would likely help speed things up.
Also, you would have to post the code actually used to comment on efficiency.
@chrishull wrote:
Thank you, PG. I'm running this now on approx. 300,000 transaction records (1/100th of my total transaction data volume). It's been running for over 30 minutes. Do you think performance is impacted because the address dataset has 60+ million rows?
Any ideas on how to speed it up?
Thank you everyone for your help. I wasn't able to get the best performance from any of the solutions which led me to realize that my address data needed to be cleaned up. Here's the 15 step process I put together which recreates the address table giving priority to the residential address. I'm still verifying the data in my "new" table but based on initial review and test, this works incredibly well.
Any comments or suggestions are certainly welcome. I'm always trying to learn!
/*PART 1 - FIND ALL GAPS BETWEEN THE RESIDENTIAL ADDRESSES*/
DATA WORK.NEW_ADDR_P1;
SYSECHO "PART 1";
SET WORK.NEW_ADDR_R;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
IF DIF>1 THEN DO;
NEWEFF=EFFECTIVE_DT-(DIF-1);
NEWEND=EFFECTIVE_DT-1;
END;
RUN;
/*PART 2 - DELETE RESIDENTIAL RECORDS FROM THE P1 TABLE WHERE NO GAPS EXISTS BETWEEN RESIDENTIAL ADDRESSES*/
PROC SQL;
SYSECHO "PART 2";
DELETE * FROM WORK.NEW_ADDR_P1 WHERE NEWEFF IS MISSING;
QUIT;
/*PART 3 - CAPTURE MAILING ADDRESS INFO BETWEEN THE RESIDENTIAL GAPS*/
DATA WORK.NEW_ADDR_P22;
SYSECHO "PART 3";
MERGE WORK.NEW_ADDR_P1 (IN=INA) WORK.NEW_ADDR_M (IN=INB);
BY CLIENT_ID;
IF INA AND INB;
RUN;
/*PART 4 - CALCULATE THE CORRECT EFFECTIVE AND END DATES FOR RESIDENTIAL GAPS*/
PROC SQL;
SYSECHO "PART 4";
CREATE TABLE WORK.NEW_ADDR_P2 AS
SELECT CLIENT_ID,
CASE
WHEN NEWEND < EFFECTIVE_DT THEN .
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN NEWEFF
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT >= NEWEND THEN NEWEFF
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN EFFECTIVE_DT
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT >= NEWEND THEN EFFECTIVE_DT
ELSE .
END AS EFFECTIVE_DT2,
CASE
WHEN CALCULATED EFFECTIVE_DT2 IS MISSING THEN .
WHEN NEWEFF > EFFECTIVE_DT AND END_DT <= NEWEND THEN END_DT
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN NEWEND
WHEN EFFECTIVE_DT <= NEWEFF AND END_DT > NEWEND THEN NEWEND
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT BETWEEN NEWEFF AND NEWEND THEN END_DT
WHEN EFFECTIVE_DT >= NEWEFF AND END_DT >= NEWEND THEN NEWEND
ELSE .
END AS END_DT2,
POSTAL_AREA_CD, ADDRESS_TYPE_CD
FROM WORK.NEW_ADDR_P22
WHERE CALCULATED EFFECTIVE_DT2 IS NOT MISSING;
QUIT;
/*PART 5 - DATA CLEANUP (RENAMING VARIABLES AND SORTING DATASET*/
PROC SORT DATA=WORK.NEW_ADDR_P2 (RENAME=(EFFECTIVE_DT2=EFFECTIVE_DT) RENAME=(END_DT2=END_DT));
SYSECHO "PART 5";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 6 - CREATE THE BASE TABLE AND APPEND THE RESIDENTIAL ADDRESS INFO TABLES AND SORT*/
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_R;
SYSECHO "PART 6";
RUN;
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P2;
RUN;
PROC SORT DATA=WORK.NEW_ADDR_P3;
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 7 - CAPTURE THE MIN AND MAX EFFECTIVE DATE AND MAX END DATE FROM THE NEW ADDRESS TABLE*/
PROC SQL;
SYSECHO "PART 7";
CREATE TABLE WORK.NEW_ADDR_P4 AS
SELECT CLIENT_ID, MIN(EFFECTIVE_DT) AS MIN_EFFDATE, MAX(EFFECTIVE_DT) AS MAX_EFFDATE, MAX(END_DT) AS MAX_ENDDATE
FROM WORK.NEW_ADDR_P3
GROUP BY CLIENT_ID;
QUIT;
/*PART 8 - CAPTURE MAILING ADDRESSES EARLIER THAN THE MIN EFFECTIVE DATE*/
PROC SQL;
SYSECHO "PART 8";
CREATE TABLE WORK.NEW_ADDR_P5 AS
SELECT T1.*, T2.MIN_EFFDATE
FROM WORK.NEW_ADDR_M T1 INNER JOIN WORK.NEW_ADDR_P4 T2
ON T1.CLIENT_ID=T2.CLIENT_ID AND T2.MIN_EFFDATE> T1.EFFECTIVE_DT
WHERE T2.MIN_EFFDATE > T1.END_DT;
QUIT;
/*PART 9 - UPDATE THE END DATE OF THE LAST MAILING ADDRESS RECORD TO PREVENT OVERLAPPING DATES*/
DATA WORK.NEW_ADDR_P52 (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 9";
SET WORK.NEW_ADDR_P5;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
IF LAST.CLIENT_ID AND DIF>1 THEN END_DT=MIN_EFFDATE-1;
RUN;
/*PART 10 - CAPTURE MAILING ADDRESSES LATER THAN THE MAX EFFECTIVE DATE*/
PROC SQL;
SYSECHO "PART 10";
CREATE TABLE WORK.NEW_ADDR_P6 AS
SELECT T1.*, T2.MAX_EFFDATE, T2.MAX_ENDDATE
FROM WORK.NEW_ADDR_M T1 INNER JOIN WORK.NEW_ADDR_P4 T2
ON T1.CLIENT_ID=T2.CLIENT_ID AND T2.MAX_EFFDATE < T1.EFFECTIVE_DT AND T2.MAX_ENDDATE < T1.END_DT;
QUIT;
/*PART 11 - UPDATE THE EFFECTIVE DATE OF THE LATER ADDRESSES TO PREVENT OVERLAPPING DATES*/
DATA WORK.NEW_ADDR_P62 (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 11";
SET WORK.NEW_ADDR_P6;
BY CLIENT_ID;
IF FIRST.CLIENT_ID THEN DO;
IF EFFECTIVE_DT=MAX_ENDDATE THEN EFFECTIVE_DT=MAX_ENDDATE+1;
END;
RUN;
/*PART 12 - APPEND MAILING ADDRESS INFO ON RECORD BEFORE AND AFTER THE RESIDENTIAL THEN SORT*/
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P52;
SYSECHO "PART 12-1";
RUN;
PROC APPEND BASE=WORK.NEW_ADDR_P3 DATA=WORK.NEW_ADDR_P62;
SYSECHO "PART 12-2";
RUN;
PROC SORT DATA=WORK.NEW_ADDR_P3;
SYSECHO "PART 12-3";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
/*PART 13 - CHECK TABLE FOR OVERLAPPING DATES CAUSED BY INVALID PRODUCTION DATA*/
DATA WORK.NEW_ADDR_CHECK;
SYSECHO "PART 13";
SET WORK.NEW_ADDR_P3;
BY CLIENT_ID;
LAGDATE=LAG(END_DT);
RETAIN LAGDATE;
IF FIRST.CLIENT_ID THEN DIF=0;
ELSE DIF=INTCK("DAY",LAGDATE,EFFECTIVE_DT);
RUN;
/*PART 14 - DELETE RECORDS WITH NEGATIVE DATE DIF VALUES*/
PROC SQL;
SYSECHO "PART 14";
DELETE * FROM WORK.NEW_ADDR_CHECK WHERE DIF<0;
QUIT;
/*PART 15 - CREATE FINAL TABLE WITH CORRECT VARIABLES AND SORT*/
PROC SORT DATA=WORK.NEW_ADDR_CHECK OUT=WORK.NEW_ADDR_FINAL (KEEP= CLIENT_ID EFFECTIVE_DT END_DT POSTAL_AREA_CD ADDRESS_TYPE_CD);
SYSECHO "PART 15";
BY CLIENT_ID EFFECTIVE_DT;
RUN;
SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.