Art,
Thanks. I know it's work to do all that testing. Here's one final test case, to see if all solutions are equal.
In PHONE, startdt=01jan2012 enddt=30jan2012
In PAID, paystart=14jan2012 payend=21jan2012
It's possible that this is an unrealistic situation, but in real life data are often much stranger than what logic or imagination might suggest.
I tried to see if your data tests this type of condition, but the eyesight is starting to blur here!
First of all huge thanks to everyone for your help, really appreciated. I tried all versions but the format one works the best. The hash version misses the example above from Astounding which sadly does happen in the data. I'll do some more testing on live data and let you know how it goes.
I dont really like to single out a 'winner' as such as all answers helped the next person but I'm going to try to give Astounding the points for the format approach and helpfuls to Art and Hai.Kuo? Please let me know if this upsets anyone!
Thanks again and have a great xmas
Steve
: I threw in your extra test case as well as one additional one. Both the format and JackHammer approaches ended up with the same (correct) results, where the hash approach failed on both of the extra cases. Your format approach ran significantly faster than my JackHammer approach.
: I completely agree with your designations of which solution was "correct."
data phone ;
infile datalines delimiter=',';
input clientid $ sharerid $ phone $ startdt enddt ;
informat startdt enddt date9. ;
format startdt enddt date9. ;
datalines;
client1,sharer1,555-6532,21Nov2011,10Dec2012
client1,sharer1,444-5655,29Nov2010,14Feb2011
client1,sharer1,333-1234,20May1993,17Aug1993
client1,sharer1,333-1234,08Sep1993,08Sep1993
client2,sharer2,111-5658,02Nov1992,12Aug1993
client2,sharer2,888-7845,12Aug1993,23Aug1993
client2,sharer2,999-9988,31Dec1993,26Mar1994
client2,sharer2,999-9988,28Mar1994,28Mar1994
client2,sharer2,789-9876,10Aug1994,05Sep1994
client2,sharer2,789-9876,22Jun1995,10Jul1995
client2,sharer2,363-3636,19Jun1998,12Nov1998
client2,sharer2,656-8989,12Nov1998,26Feb1999
client2,sharer2,656-8989,10Dec1999,10Jan2000
client2,sharer2,222-6589,10Jan2000,31Jan2000
client2,sharer2,777-8755,31Jan2000,03Feb2000
client2,sharer2,141-1414,23Aug2000,26Mar2001
client2,sharer2,778-6589,07Jun2001,10Jul2001
client2,sharer2,666-6548,10Jul2001,12Nov2001
client2,sharer2,141-1414,07Jun2001,10Jul2001
client2,sharer2,666-6548,10Apr2002,06Aug2002
client2,sharer2,321-6544,18Dec2003,08Apr2004
client2,sharer2,951-6235,08Apr2004,10Aug2004
client2,sharer2,753-1245,25Jan2007,18Jul2007
client2,sharer2,777-8755,25Jun2009,14Sep2009
client2,sharer2,852-8525,18Jun2009,02Jun2010
client2,sharer2,852-8525,20Oct2010,16May2011
client2,sharer2,852-8525,31May2012,10Dec2012
client2,sharer2,565-5656,05Sep1994,01Nov1994
client2,sharer2,565-5656,14Nov1994,14Nov1994
client3,sharer3,777-7777,01jan2012,30jan2012
client4,sharer4,888-8888,01jan2012,30jan2012
;
proc sort data=phone;
by clientid sharerid phone startdt;
run;
data paid_periods;
infile datalines delimiter=',';
input clientid $ sharerid $ paystart payend ;
informat paystart payend date9. ;
format paystart payend date9. ;
datalines;
client9,sharer9,09Aug1999,09Aug2009
client2,sharer2,31Aug1992,23Aug1993
client2,sharer2,25Dec1993,26Mar1994
client2,sharer2,10Aug1994,01Nov1994
client2,sharer2,15Mar1995,20Mar1995
client2,sharer2,19Jun1998,18Feb1999
client2,sharer2,10Dec1999,31Jan2000
client2,sharer2,20Jun2001,12Nov2001
client2,sharer2,10Apr2002,27Jul2002
client2,sharer2,18Dec2003,06Aug2004
client2,sharer2,11Dec2006,17Jul2007
client2,sharer2,18Jun2009,01Jun2010
client2,sharer2,20Oct2010,13May2011
client3,sharer3,14jan2012,21jan2012
client4,sharer4,14jan2012,21jan2012
;
/*JackHammer approach*/
proc sort data=phone out=unique_phone (keep=clientid sharerid phone) nodupkey;
by clientid sharerid phone;
run;
proc sort data=phone ;
by clientid sharerid phone;
run;
proc sql;
create table paid_periods_expanded
as select a.paystart,a.payend,b.phone as phone2,
b.clientid,b.sharerid
from unique_phone b left join paid_periods a
on a.clientid=b.clientid and
a.sharerid=b.sharerid
order by clientid,sharerid,phone,paystart
;
quit;
data paid_periods_expanded;
set paid_periods_expanded;
if missing(paystart) then do;
paystart='02jan1960'd;
payend='02jan1960'd;
end;
run;
data want_1 (keep=clientid sharerid phone date_range);
set phone;
by clientid sharerid phone;
array dates(40000);
retain dates: lowest highest;
if first.phone then do;
call missing(of dates(*));
call missing(lowest);
call missing(highest);
call missing(first);
call missing(last);
end;
lowest=min(lowest,startdt);
highest=max(highest,enddt);
do i=startdt to enddt;
dates(i)=i;
end;
if last.phone then do;
do until (last.phone2);
set paid_periods_expanded
(rename=(clientid=clientid2 sharerid=sharerid2));
by clientid2 sharerid2 phone2;
do i=paystart to payend;
if not missing(dates(i)) then call missing(dates(i));
end;
if last.phone2 then do;
do i=lowest to highest;
if missing(first) and not missing(dates(i)) then first=dates(i);
if not missing(first) and missing(dates(i+1)) then do;
last=dates(i);
date_range=catx("-",put(first,date9.),put(last,date9.));
output;
call missing(first);
call missing(last);
end;
end;
end;
end;
end;
run;
/*hash approach*/
proc sort data=paid_periods;
by clientid sharerid paystart;
run;
data want_2;
if _n_=1 then do;
if 0 then set paid_periods;
declare hash h(dataset:'paid_periods',multidata:'y');
h.definekey('clientid', 'sharerid');
h.definedata('paystart', 'payend');
h.definedone();
end;
set phone;
_rc=h.find();
do _rc=0 by 0 while (_rc=0);
if paystart<= startdt <= payend then startdt=payend+1;
if paystart<= enddt <= payend then enddt=paystart-1;
if startdt > enddt then leave;
_rc=h.find_next();
end;
if enddt>= startdt then do;
date_range=catx("-",put(startdt,date9.),put(enddt,date9.));
output;
end;
drop paystart payend _rc startdt enddt;
run;
/*format approach*/
data paid_periods2;
set paid_periods;
retain fmtname '$paid' label 'Paid';
start = clientid || sharerid || put(paystart, yymmddn8.);
end = clientid || sharerid || put(payend, yymmddn8.);
keep fmtname start end label;
run;
proc format cntlin=paid_periods2;
run;
data individual_unpaid_days;
set phone;
do day=startdt to enddt;
if put(clientid || sharerid || put(day, yymmddn8.), $paid.) ne 'Paid' then output;
end;
drop startdt enddt;
run;
proc sort data=individual_unpaid_days;
by clientid sharerid phone;
run;
data assign_group;
set individual_unpaid_days;
by clientid sharerid phone;
prior_day = lag(day);
if first.phone or prior_day + 1 ne day then groupnum + 1;
drop prior_day;
run;
data want_3;
set assign_group;
by clientid sharerid phone groupnum;
if first.groupnum then start_of_range = day;
retain start_of_range;
if last.groupnum then do;
end_of_range=day;
date_range=catx("-",put(start_of_range,date9.),put(end_of_range,date9.));
output;
end;
drop day groupnum start_: end_:;
run;
Hi all,
Have been doing live testing of the code and it works but my big concern is the very large filesize that occurs. I then wondered if putting all the dates on a timeline might be a better way of doing it and have written the following code, is there a way to use this approach to get accurate dates? I've included the test data at the top. If this could be used then filesizes are dramatically reduced. Basically I've got this far but cant work out how to get the dates from it.
data phone ;
infile datalines delimiter=',';
input clientid $ sharerid $ phone $ startdt enddt ;
informat startdt enddt date9. ;
format startdt enddt date9. ;
datalines;
client1,sharer1,555-6532,21Nov2011,10Dec2012
client1,sharer1,444-5655,29Nov2010,14Feb2011
client1,sharer1,333-1234,20May1993,17Aug1993
client1,sharer1,333-1234,08Sep1993,08Sep1993
client2,sharer2,666-6548,10Jul2001,12Nov2001
client2,sharer2,666-6548,10Apr2002,06Aug2002
client2,sharer2,111-5658,02Nov1992,12Aug1993
client2,sharer2,222-6589,10Jan2000,31Jan2000
client2,sharer2,777-8755,31Jan2000,03Feb2000
client2,sharer2,777-8755,25Jun2009,14Sep2009
client2,sharer2,321-6544,18Dec2003,08Apr2004
client2,sharer2,778-6589,07Jun2001,10Jul2001
client2,sharer2,999-9988,31Dec1993,26Mar1994
client2,sharer2,999-9988,28Mar1994,28Mar1994
client2,sharer2,888-7845,12Aug1993,23Aug1993
client2,sharer2,789-9876,10Aug1994,05Sep1994
client2,sharer2,789-9876,22Jun1995,10Jul1995
client2,sharer2,951-6235,08Apr2004,10Aug2004
client2,sharer2,753-1245,25Jan2007,18Jul2007
client2,sharer2,656-8989,12Nov1998,26Feb1999
client2,sharer2,656-8989,10Dec1999,10Jan2000
client2,sharer2,141-1414,23Aug2000,26Mar2001
client2,sharer2,141-1414,07Jun2001,10Jul2001
client2,sharer2,363-3636,19Jun1998,12Nov1998
client2,sharer2,852-8525,18Jun2009,02Jun2010
client2,sharer2,852-8525,20Oct2010,16May2011
client2,sharer2,852-8525,31May2012,10Dec2012
client2,sharer2,565-5656,05Sep1994,01Nov1994
client2,sharer2,565-5656,14Nov1994,14Nov1994
client2,sharer2,221-2212,01jan2012,30jan2012
;
data paid_periods;
infile datalines delimiter=',';
input clientid $ sharerid $ paystart payend ;
informat paystart payend date9. ;
format paystart payend date9. ;
datalines;
client2,sharer2,31Aug1992,23Aug1993
client2,sharer2,25Dec1993,26Mar1994
client2,sharer2,10Aug1994,01Nov1994
client2,sharer2,15Mar1995,20Mar1995
client2,sharer2,19Jun1998,18Feb1999
client2,sharer2,10Dec1999,31Jan2000
client2,sharer2,20Jun2001,12Nov2001
client2,sharer2,10Apr2002,27Jul2002
client2,sharer2,18Dec2003,06Aug2004
client2,sharer2,11Dec2006,17Jul2007
client2,sharer2,18Jun2009,01Jun2010
client2,sharer2,20Oct2010,13May2011
client2,sharer2,14jan2012,21jan2012
;
proc sql; |
create table ph as
select distinct clientid,sharerid,phone | |
from phone |
;
create table pays as
select distinct rs.*,ph.phone | |
from paid_periods rs, ph | |
where rs.clientid=ph.clientid and rs.sharerid=ph.sharerid |
;
quit;
data phprd(keep=phone clientid sharerid phone_flag date) ;
set phone ;
phone_flag=1;
date=startdt;
output;
phone_flag=0;
date=enddt;
output;
run;
proc sort data=phprd;
by phone clientid sharerid date;
run;
data payprd(keep = phone clientid sharerid pay_flag date) ;
set pays;
pay_flag=1;
date=paystart;
output;
pay_flag=0;
date=payend;
output;
run;
proc sort data=payprd;
by phone clientid sharerid date;
run;
data timeline ;
set payprd phprd;
format date ddmmyy10. clientid sharerid 12.;
by phone clientid sharerid date;
run;
Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 25. Read more here about why you should contribute and what is in it for you!
Use this tutorial as a handy guide to weigh the pros and cons of these commonly used machine learning algorithms.
Find more tutorials on the SAS Users YouTube channel.