And One more question, If data like this what you gonna do ? 7801669 7801669 97082679 1-Nov-15 7801669 6368875 106930088 2-Nov-15 7801669 7801669 107207912 3-Nov-15
Hi,
If this is the data:
7801669 7801669 97082679 1-Nov-15
7801669 6368875 106930088 2-Nov-15
7801669 7801669 107207912 3-Nov-15
Since the 2nd record has one pre-match and one post-match consecutively, the minimum value of both their dates will be taken i.e. in this case 1-Nov-2015.
Expected Result would be:
7801669 7801669 97082679 1-Nov-15
7801669 6368875 97082679 1-Nov-15
7801669 7801669 107207912 3-Nov-15
data one; infile datalines expandtabs truncover; input CURR_RK ORIG_RK CUST_SK VALID_DTTM: anydtdte10. ; format VALID_DTTM date9.; datalines; 41 2673 2342321 1-Sep-12 41 41 12559928 1-Jan-13 41 4615581 14452649 1-Feb-13 41 4615581 67316104 1-Aug-14 41 41 72971041 1-Oct-14 41 4615581 74341618 1-Nov-14 41 8828883 63829111 1-Jul-15 9217 9217 21128775 1-Jan-13 9217 10124544 15142897 1-Jan-13 9217 9217 30874351 1-Mar-13 9217 9217 71276102 1-Sep-14 9217 10124544 74885982 1-Nov-14 346372 346372 6478392 1-Sep-14 7801669 6368875 2591876 1-Dec-12 7801669 7801669 3829995 1-Jan-13 7801669 7801669 97082679 1-Nov-15 7801669 6368875 106930088 2-Nov-15 7801669 7801669 107207912 3-Nov-15 ; run; proc sql; create table key as select *,max(VALID_DTTM) as max_date,min(VALID_DTTM) as min_date from one group by curr_rk order by curr_rk,valid_dttm; quit; data want; if _n_=1 then do; if 0 then set key(rename=(CUST_SK=new_sk VALID_DTTM=new_dt)); declare hash h1(hashexp:20); h1.definekey('CURR_RK','new_dt'); h1.definedata('new_sk','new_dt'); h1.definedone(); end; do until(last.curr_rk); set key; by curr_rk; if CURR_RK=ORIG_RK then do; new_sk=CUST_SK; new_dt=VALID_DTTM; rc=h1.add(); end; end; do until(last.curr_rk); set key; by curr_rk; if CURR_RK=ORIG_RK then do;want_sk=CUST_SK;want_dt=VALID_DTTM; end; else do; dif=99999; do i=min_date to max_date; call missing(new_sk,new_dt); rc=h1.find(key:CURR_RK,key:i); if rc=0 then do; abs=abs(VALID_DTTM-new_dt); if abs lt dif then do; dif=abs;want_sk=new_sk;want_dt=new_dt; end; end; end; end; output; end; h1.clear(); format want_dt date9.; drop rc dif i min_date max_date abs new_sk new_dt; run;
You need test code on your own. There are too many scenarios need to be consider . Like : what you gonna do ? 7801669 7801669 97082679 1-Nov-15 7801669 7801669 77082680 1-Nov-15 7801669 6368875 106930088 1-Nov-15 <--------- 7801669 7801669 22207912 1-Nov-15 7801669 7801669 107207912 1-Nov-15
Hi,
That is the reason for me trying to create date range variables "max_before_valid_dttm" & "min_after_valid_dttm" for each record based on its pre-match and post-match valid_dttm values, we will consider the min value of them as final_dttm.
Apart from that, the code provided now is not as expected and causing some wrong cust_sk and date assignments.
The code given yesterday was perfectly working on subset data. But failing due to HASH memory issue in SAS when run on full table.
If we can resolve that, it will be absolutely fine.
Working SAS Code with HASH object program:
data one;
infile datalines;
input CURR_RK ORIG_RK CUST_SK VALID_DTTM: anydtdte10. ;
format VALID_DTTM date9.;
datalines;
41 2673 2342321 1-Sep-12
41 41 12559928 1-Jan-13
41 4615581 14452649 1-Feb-13
41 4615581 67316104 1-Aug-14
41 41 72971041 1-Oct-14
41 4615581 74341618 1-Nov-14
41 8828883 63829111 1-Jul-15
9217 9217 21128775 1-Jan-13
9217 10124544 15142897 1-Jan-13
9217 9217 30874351 1-Mar-13
9217 9217 71276102 1-Sep-14
9217 10124544 74885982 1-Nov-14
346372 346372 6478392 1-Sep-14
7801669 6368875 2591876 1-Dec-12
7801669 7801669 3829995 1-Jan-13
7801669 7801669 97082679 1-Aug-15
7801669 6368875 106930088 1-Oct-15
7801669 7801669 107207912 1-Nov-15
;
run;
data one;
set one;
by curr_rk;
if first.curr_rk then n=0;
n+1;
run;
data want;
if _n_=1 then do;
if 0 then set one(rename=(CUST_SK=new_sk VALID_DTTM=new_dt));
declare hash h1(dataset:'one(rename=(CUST_SK=new_sk VALID_DTTM=new_dt)
where=(CURR_RK=ORIG_RK))');
h1.definekey('CURR_RK','ORIG_RK');
h1.definedata('new_sk','new_dt');
h1.definedone();
if 0 then set one(keep=ORIG_RK rename=(ORIG_RK=_ORIG_RK));
declare hash h2(dataset:'one(rename=(ORIG_RK=_ORIG_RK
CUST_SK=new_sk VALID_DTTM=new_dt))');
h2.definekey('CURR_RK','n');
h2.definedata('new_sk','new_dt','_ORIG_RK');
h2.definedone();
end;
set one;
by curr_rk;
if first.curr_rk then do;
if curr_rk = orig_rk then do;new_sk=CUST_SK; new_dt=VALID_DTTM;end;
else do;call missing(new_sk,new_dt);rc=h1.find(key:CURR_RK,key:CURR_RK);end;
end;
else do;
if curr_rk = orig_rk then do;new_sk=CUST_SK; new_dt=VALID_DTTM;end;
else do;
do k=n-1 to 1 by -1;
call missing(new_sk,new_dt);
rc=h2.find(key:CURR_RK,key:k);
if CURR_RK=_ORIG_RK then leave;
end;
end;
end;
drop rc k n _ORIG_RK;
run;
But you want the nearest date , no matter it is PRE or POST , right ? data one; infile datalines expandtabs truncover; input CURR_RK ORIG_RK CUST_SK VALID_DTTM: anydtdte10. ; format VALID_DTTM date9.; datalines; 41 2673 2342321 1-Sep-12 41 41 12559928 1-Jan-13 41 4615581 14452649 1-Feb-13 41 4615581 67316104 1-Aug-14 41 41 72971041 1-Oct-14 41 4615581 74341618 1-Nov-14 41 8828883 63829111 1-Jul-15 9217 9217 21128775 1-Jan-13 9217 10124544 15142897 1-Jan-13 9217 9217 30874351 1-Mar-13 9217 9217 71276102 1-Sep-14 9217 10124544 74885982 1-Nov-14 346372 346372 6478392 1-Sep-14 7801669 6368875 2591876 1-Dec-12 7801669 7801669 3829995 1-Jan-13 7801669 7801669 97082679 1-Aug-15 7801669 6368875 106930088 1-Nov-15 7801669 7801669 107207912 1-Nov-15 ; run; proc sql; create table key as select *,max(VALID_DTTM) as max_date,min(VALID_DTTM) as min_date from one group by curr_rk order by curr_rk,valid_dttm; quit; data have; do total=1 by 1 until(last.curr_rk); set one; by curr_rk; end; do n=1 by 1 until(last.curr_rk); set one; by curr_rk; output; end; run; data want; if _n_=1 then do; if 0 then set have(rename=(CUST_SK=new_sk VALID_DTTM=new_dt ORIG_RK=_ORIG_RK)); declare hash h2(dataset:'have(rename=(CUST_SK=new_sk VALID_DTTM=new_dt ORIG_RK=_ORIG_RK)'); h2.definekey('CURR_RK','n'); h2.definedata('new_sk','new_dt','_ORIG_RK'); h2.definedone(); end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do; new_sk=CUST_SK; new_dt=VALID_DTTM;_ORIG_RK=ORIG_RK;rc=h2.add(); end; end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do;want_sk=CUST_SK;want_dt=VALID_DTTM; end; else do; do k=n-1 to 1 by -1; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if CURR_RK=_ORIG_RK then do; dif_pre=abs(new_dt-VALID_DTTM); want_sk=new_sk; want_dt=new_dt; leave; end; end; do k=n+1 to total; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if CURR_RK=_ORIG_RK then do; dif_post=abs(new_dt-VALID_DTTM); if dif_post lt dif_pre or missing(dif_pre) then do; want_sk=new_sk; want_dt=new_dt; end; leave; end; end; end; output; end; h2.clear(); format want_dt date9.; drop rc dif_: k new_sk new_dt _ORIG_RK n total; run;
Sorry. Try This code : data one; infile datalines expandtabs truncover; input CURR_RK ORIG_RK CUST_SK VALID_DTTM: anydtdte10. ; format VALID_DTTM date9.; datalines; 41 2673 2342321 1-Sep-12 41 41 12559928 1-Jan-13 41 4615581 14452649 1-Feb-13 41 4615581 67316104 1-Aug-14 41 41 72971041 1-Oct-14 41 4615581 74341618 1-Nov-14 41 8828883 63829111 1-Jul-15 9217 9217 21128775 1-Jan-13 9217 10124544 15142897 1-Jan-13 9217 9217 30874351 1-Mar-13 9217 9217 71276102 1-Sep-14 9217 10124544 74885982 1-Nov-14 346372 346372 6478392 1-Sep-14 7801669 6368875 2591876 1-Dec-12 7801669 7801669 3829995 1-Jan-13 7801669 7801669 97082679 1-Aug-15 7801669 6368875 106930088 1-Nov-15 7801669 7801669 107207912 1-Nov-15 ; run; data have; do total=1 by 1 until(last.curr_rk); set one; by curr_rk; end; do n=1 by 1 until(last.curr_rk); set one; by curr_rk; output; end; run; data want; if _n_=1 then do; if 0 then set have(rename=(CUST_SK=new_sk VALID_DTTM=new_dt ORIG_RK=_ORIG_RK)); declare hash h2(dataset:'have(rename=(CUST_SK=new_sk VALID_DTTM=new_dt ORIG_RK=_ORIG_RK)'); h2.definekey('CURR_RK','n'); h2.definedata('new_sk','new_dt','_ORIG_RK'); h2.definedone(); end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do; new_sk=CUST_SK; new_dt=VALID_DTTM;_ORIG_RK=ORIG_RK;rc=h2.add(); end; end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do;want_sk=CUST_SK;want_dt=VALID_DTTM; end; else do; do k=n-1 to 1 by -1; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if CURR_RK=_ORIG_RK then do; dif_pre=abs(new_dt-VALID_DTTM); want_sk=new_sk; want_dt=new_dt; leave; end; end; do k=n+1 to total; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if CURR_RK=_ORIG_RK then do; dif_post=abs(new_dt-VALID_DTTM); if dif_post lt dif_pre or missing(dif_pre) then do; want_sk=new_sk; want_dt=new_dt; end; leave; end; end; end; output; end; h2.clear(); format want_dt date9.; drop rc dif_: k new_sk new_dt _ORIG_RK n total; run;
Hi,
Thanks for the help. But I see one issue in the results. Please refer below. The record highlighted below in red should have the CUST_SK= 12559928, VALID_DTTM=01Jan2013. Not sure how to correct that issue from your code. Can you suggest?
Except that everything else seems fine.
CURR_RK | ORIG_RK | CUST_SK | VALID_DTTM | want_sk | want_dt |
41 | 2673 | 2342321 | 01Sep2012 | 12559928 | 01Jan2013 |
41 | 41 | 12559928 | 01Jan2013 | 12559928 | 01Jan2013 |
41 | 4615581 | 14452649 | 01Feb2013 | 12559928 | 01Jan2013 |
41 | 4615581 | 67316104 | 01Aug2014 | 72971041 | 01Oct2014 |
41 | 41 | 72971041 | 01Oct2014 | 72971041 | 01Oct2014 |
41 | 4615581 | 74341618 | 01Nov2014 | 72971041 | 01Oct2014 |
41 | 8828883 | 63829111 | 01Jul2015 | 72971041 | 01Oct2014 |
9217 | 9217 | 21128775 | 01Jan2013 | 21128775 | 01Jan2013 |
9217 | 10124544 | 15142897 | 01Jan2013 | 21128775 | 01Jan2013 |
9217 | 9217 | 30874351 | 01Mar2013 | 30874351 | 01Mar2013 |
9217 | 9217 | 71276102 | 01Sep2014 | 71276102 | 01Sep2014 |
9217 | 10124544 | 74885982 | 01Nov2014 | 71276102 | 01Sep2014 |
346372 | 346372 | 6478392 | 01Sep2014 | 6478392 | 01Sep2014 |
7801669 | 6368875 | 2591876 | 01Dec2012 | 3829995 | 01Jan2013 |
7801669 | 7801669 | 3829995 | 01Jan2013 | 3829995 | 01Jan2013 |
7801669 | 7801669 | 97082679 | 01Aug2015 | 97082679 | 01Aug2015 |
7801669 | 6368875 | 106930088 | 01Nov2015 | 107207912 | 01Nov2015 |
7801669 | 7801669 | 107207912 | 01Nov2015 | 107207912 | 01Nov2015 |
Why ? I don't understand . 41 41 12559928 01Jan2013 12559928 01Jan2013 41 4615581 14452649 01Feb2013 12559928 01Jan2013 41 4615581 67316104 01Aug2014 72971041 01Oct2014 41 41 72971041 01Oct2014 72971041 01Oct2014 abs(01Jan2013 - 01Aug2014) is greater than abs(01Oct2014 - 01Aug2014) Don't you want the most near date ?
No, we always want the previous matching record valid date, unless the immediate post-matching record's valid date is same as current record valid date.
In this case of 3rd record, since its pre-matching record date is 01Jan2013 and post matching record's date is 01Oct2014, we should take the minimum of them i.e. 01Jan2013 and its corresponding CUST_SK i.e. 12559928.
If the date of 3rd record had been 01Oct2014 (instead of 01Aug20014) then it is same as in the next matching record (4th) in sequence. In such case 3rd record should get date=01Oct2014 and SK=72971041.
Expected results highlighted in green:
41 41 12559928 01Jan2013 12559928 01Jan2013 => match=y, prevmatchdate=01Jan2013, postmatchdate=01Oct2014
41 4615581 14452649 01Feb2013 12559928 01Jan2013 => match=n, prevmatchdate=01Jan2013, postmatchdate=01Oct2014
41 4615581 67316104 01Aug2014 12559928 01Jan2013 => match=n, prevmatchdate=01Jan2013, postmatchdate=01Oct2014
41 41 72971041 01Oct2014 72971041 01Oct2014 => match=y, prevmatchdate=01Oct2014, postmatchdate=blank
OK. Maybe I understand what you mean. You only want PRE date which is less than or equal with the current obs , Do not want any POST date ? data one; infile datalines expandtabs truncover; input CURR_RK ORIG_RK CUST_SK VALID_DTTM: anydtdte10. ; format VALID_DTTM date9.; datalines; 41 2673 2342321 1-Sep-12 41 41 12559928 1-Jan-13 41 4615581 14452649 1-Feb-13 41 4615581 67316104 1-Aug-14 41 41 72971041 1-Oct-14 41 4615581 74341618 1-Nov-14 41 8828883 63829111 1-Jul-15 9217 9217 21128775 1-Jan-13 9217 10124544 15142897 1-Jan-13 9217 9217 30874351 1-Mar-13 9217 9217 71276102 1-Sep-14 9217 10124544 74885982 1-Nov-14 346372 346372 6478392 1-Sep-14 7801669 6368875 2591876 1-Dec-12 7801669 7801669 3829995 1-Jan-13 7801669 7801669 97082679 1-Aug-15 7801669 6368875 106930088 1-Nov-15 7801669 7801669 107207912 1-Nov-15 ; run; data have; do total=1 by 1 until(last.curr_rk); set one; by curr_rk; end; do n=1 by 1 until(last.curr_rk); set one; by curr_rk; output; end; run; data want; if _n_=1 then do; if 0 then set have(rename=(CUST_SK=new_sk VALID_DTTM=new_dt)); declare hash h2(); h2.definekey('CURR_RK','n'); h2.definedata('new_sk','new_dt'); h2.definedone(); end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do; new_sk=CUST_SK; new_dt=VALID_DTTM;h2.add(); end; end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do;want_sk=CUST_SK;want_dt=VALID_DTTM; end; else do; do k=n-1 to 1 by -1; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if rc=0 then do; want_sk=new_sk; want_dt=new_dt; leave; end; end; do k=n+1 to total; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if rc=0 then do; if new_dt=VALID_DTTM or first.curr_rk then do; want_sk=new_sk; want_dt=new_dt; end; leave; end; end; end; output; end; h2.clear(); format want_dt date9.; drop rc new_sk new_dt n k total; run;
OK. Maybe I understand what you mean. You only want PRE date which is less than or equal with the current obs , Do not want any POST date ? data one; infile datalines expandtabs truncover; input CURR_RK ORIG_RK CUST_SK VALID_DTTM: anydtdte10. ; format VALID_DTTM date9.; datalines; 41 2673 2342321 1-Sep-12 41 41 12559928 1-Jan-13 41 4615581 14452649 1-Feb-13 41 4615581 67316104 1-Aug-14 41 41 72971041 1-Oct-14 41 4615581 74341618 1-Nov-14 41 8828883 63829111 1-Jul-15 9217 9217 21128775 1-Jan-13 9217 10124544 15142897 1-Jan-13 9217 9217 30874351 1-Mar-13 9217 9217 71276102 1-Sep-14 9217 10124544 74885982 1-Nov-14 346372 346372 6478392 1-Sep-14 7801669 6368875 2591876 1-Dec-12 7801669 7801669 3829995 1-Jan-13 7801669 7801669 97082679 1-Aug-15 7801669 6368875 106930088 1-Nov-15 7801669 7801669 107207912 1-Nov-15 ; run; data have; do total=1 by 1 until(last.curr_rk); set one; by curr_rk; end; do n=1 by 1 until(last.curr_rk); set one; by curr_rk; output; end; run; data want; if _n_=1 then do; if 0 then set have(rename=(CUST_SK=new_sk VALID_DTTM=new_dt)); declare hash h2(); h2.definekey('CURR_RK','n'); h2.definedata('new_sk','new_dt'); h2.definedone(); end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do; new_sk=CUST_SK; new_dt=VALID_DTTM;h2.add(); end; end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do;want_sk=CUST_SK;want_dt=VALID_DTTM; end; else do; do k=n-1 to 1 by -1; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if rc=0 then do; want_sk=new_sk; want_dt=new_dt; leave; end; end; do k=n+1 to total; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if rc=0 then do; if new_dt=VALID_DTTM or first.curr_rk then do; want_sk=new_sk; want_dt=new_dt; end; leave; end; end; end; output; end; h2.clear(); format want_dt date9.; drop rc new_sk new_dt n k total; run;
Sorry. My code still don't work for this scenario: 8801669 8801669 97082679 1-Nov-15 8801669 6368875 106930088 1-Nov-15 8801669 8801669 107207912 1-Nov-15 TRY THIS CODE: data one; infile datalines expandtabs truncover; input CURR_RK ORIG_RK CUST_SK VALID_DTTM: anydtdte10. ; format VALID_DTTM date9.; datalines; 41 2673 2342321 1-Sep-12 41 41 12559928 1-Jan-13 41 4615581 14452649 1-Feb-13 41 4615581 67316104 1-Aug-14 41 41 72971041 1-Oct-14 41 4615581 74341618 1-Nov-14 41 8828883 63829111 1-Jul-15 9217 9217 21128775 1-Jan-13 9217 10124544 15142897 1-Jan-13 9217 9217 30874351 1-Mar-13 9217 9217 71276102 1-Sep-14 9217 10124544 74885982 1-Nov-14 346372 346372 6478392 1-Sep-14 7801669 6368875 2591876 1-Dec-12 7801669 7801669 3829995 1-Jan-13 7801669 7801669 97082679 1-Aug-15 7801669 6368875 106930088 1-Nov-15 7801669 7801669 107207912 1-Nov-15 8801669 8801669 97082679 1-Nov-15 8801669 6368875 106930088 1-Nov-15 8801669 8801669 107207912 1-Nov-15 ; run; data have; do total=1 by 1 until(last.curr_rk); set one; by curr_rk; end; do n=1 by 1 until(last.curr_rk); set one; by curr_rk; output; end; run; data want; if _n_=1 then do; if 0 then set have(rename=(CUST_SK=new_sk VALID_DTTM=new_dt)); declare hash h2(); h2.definekey('CURR_RK','n'); h2.definedata('new_sk','new_dt'); h2.definedone(); end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do; new_sk=CUST_SK; new_dt=VALID_DTTM;h2.add(); end; end; do until(last.curr_rk); set have; by curr_rk; if CURR_RK=ORIG_RK then do;want_sk=CUST_SK;want_dt=VALID_DTTM; end; else do; do k=n-1 to 1 by -1; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if rc=0 then do; want_sk=new_sk; want_dt=new_dt; leave; end; end; do k=n+1 to total; call missing(new_sk,new_dt); rc=h2.find(key:CURR_RK,key:k); if rc=0 then do; if (new_dt=VALID_DTTM and want_dt ne new_dt) /*<------*/ or first.curr_rk then do; want_sk=new_sk; want_dt=new_dt; end; leave; end; end; end; output; end; h2.clear(); format want_dt date9.; drop rc new_sk new_dt n k total; run;
Why ? I don't understand . 41 41 12559928 01Jan2013 12559928 01Jan2013 41 4615581 14452649 01Feb2013 12559928 01Jan2013 41 4615581 67316104 01Aug2014 72971041 01Oct2014 41 41 72971041 01Oct2014 72971041 01Oct2014 abs(01Jan2013 - 01Aug2014) is greater than abs(01Oct2014 - 01Aug2014) Don't you want the most near date ?
Save $250 on SAS Innovate and get a free advance copy of the new SAS For Dummies book! Use the code "SASforDummies" to register. Don't miss out, May 6-9, in Orlando, Florida.
Learn the difference between classical and Bayesian statistical approaches and see a few PROC examples to perform Bayesian analysis in this video.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.