BookmarkSubscribeRSS Feed
🔒 This topic is solved and locked. Need further help from the community? Please sign in and ask a new question.
Astounding
PROC Star

Art,

Thanks.  I know it's work to do all that testing.  Here's one final test case, to see if all solutions are equal.

In PHONE, startdt=01jan2012 enddt=30jan2012

In PAID, paystart=14jan2012 payend=21jan2012

It's possible that this is an unrealistic situation, but in real life data are often much stranger than what logic or imagination might suggest.

I tried to see if your data tests this type of condition, but the eyesight is starting to blur here!

SteveNZ
Obsidian | Level 7

First of all huge thanks to everyone for your help, really appreciated. I tried all versions but the format one works the best. The hash version misses the example above from Astounding which sadly does happen in the data. I'll do some more testing on live data and let you know how it goes.

I dont really like to single out a 'winner' as such as all answers helped the next person but I'm going to try to give Astounding the points for the format approach and helpfuls to Art and Hai.Kuo? Please let me know if this upsets anyone!

Thanks again and have a great xmas

Steve

art297
Opal | Level 21

: I threw in your extra test case as well as one additional one.  Both the format and JackHammer approaches ended up with the same (correct) results, where the hash approach failed on both of the extra cases.  Your format approach ran significantly faster than my JackHammer approach.

: I completely agree with your designations of which solution was "correct."  

data phone ;

  infile datalines delimiter=',';

  input clientid $ sharerid $ phone $ startdt enddt ;

  informat startdt enddt date9. ;

  format startdt enddt date9. ;

  datalines;

client1,sharer1,555-6532,21Nov2011,10Dec2012

client1,sharer1,444-5655,29Nov2010,14Feb2011

client1,sharer1,333-1234,20May1993,17Aug1993

client1,sharer1,333-1234,08Sep1993,08Sep1993

client2,sharer2,111-5658,02Nov1992,12Aug1993

client2,sharer2,888-7845,12Aug1993,23Aug1993

client2,sharer2,999-9988,31Dec1993,26Mar1994

client2,sharer2,999-9988,28Mar1994,28Mar1994

client2,sharer2,789-9876,10Aug1994,05Sep1994

client2,sharer2,789-9876,22Jun1995,10Jul1995

client2,sharer2,363-3636,19Jun1998,12Nov1998

client2,sharer2,656-8989,12Nov1998,26Feb1999

client2,sharer2,656-8989,10Dec1999,10Jan2000

client2,sharer2,222-6589,10Jan2000,31Jan2000

client2,sharer2,777-8755,31Jan2000,03Feb2000

client2,sharer2,141-1414,23Aug2000,26Mar2001

client2,sharer2,778-6589,07Jun2001,10Jul2001

client2,sharer2,666-6548,10Jul2001,12Nov2001

client2,sharer2,141-1414,07Jun2001,10Jul2001

client2,sharer2,666-6548,10Apr2002,06Aug2002

client2,sharer2,321-6544,18Dec2003,08Apr2004

client2,sharer2,951-6235,08Apr2004,10Aug2004

client2,sharer2,753-1245,25Jan2007,18Jul2007

client2,sharer2,777-8755,25Jun2009,14Sep2009

client2,sharer2,852-8525,18Jun2009,02Jun2010

client2,sharer2,852-8525,20Oct2010,16May2011

client2,sharer2,852-8525,31May2012,10Dec2012

client2,sharer2,565-5656,05Sep1994,01Nov1994

client2,sharer2,565-5656,14Nov1994,14Nov1994

client3,sharer3,777-7777,01jan2012,30jan2012

client4,sharer4,888-8888,01jan2012,30jan2012

;

proc sort data=phone;

  by clientid sharerid phone startdt;

run;

data paid_periods;

  infile datalines delimiter=',';

  input clientid $ sharerid $ paystart payend ;

  informat paystart payend date9. ;

  format paystart payend date9. ;

  datalines;

client9,sharer9,09Aug1999,09Aug2009

client2,sharer2,31Aug1992,23Aug1993

client2,sharer2,25Dec1993,26Mar1994

client2,sharer2,10Aug1994,01Nov1994

client2,sharer2,15Mar1995,20Mar1995

client2,sharer2,19Jun1998,18Feb1999

client2,sharer2,10Dec1999,31Jan2000

client2,sharer2,20Jun2001,12Nov2001

client2,sharer2,10Apr2002,27Jul2002

client2,sharer2,18Dec2003,06Aug2004

client2,sharer2,11Dec2006,17Jul2007

client2,sharer2,18Jun2009,01Jun2010

client2,sharer2,20Oct2010,13May2011

client3,sharer3,14jan2012,21jan2012

client4,sharer4,14jan2012,21jan2012

;

/*JackHammer approach*/

proc sort data=phone out=unique_phone (keep=clientid sharerid phone) nodupkey;

  by clientid sharerid phone;

run;

proc sort data=phone ;

  by clientid sharerid phone;

run;

proc sql;

  create table paid_periods_expanded

    as select a.paystart,a.payend,b.phone as phone2,

           b.clientid,b.sharerid

      from unique_phone b left join paid_periods a

         on a.clientid=b.clientid and

                a.sharerid=b.sharerid

        order by clientid,sharerid,phone,paystart

  ;

quit;

data paid_periods_expanded;

  set paid_periods_expanded;

  if missing(paystart) then do;

    paystart='02jan1960'd;

    payend='02jan1960'd;

  end;

run;

data want_1 (keep=clientid sharerid phone date_range);

  set phone;

  by clientid sharerid phone;

  array dates(40000);

  retain dates: lowest highest;

  if first.phone then do;

    call missing(of dates(*));

    call missing(lowest);

    call missing(highest);

    call missing(first);

    call missing(last);

  end;

  lowest=min(lowest,startdt);

  highest=max(highest,enddt);

  do i=startdt to enddt;

    dates(i)=i;

  end;

  if last.phone then do;

    do until (last.phone2);

      set paid_periods_expanded

          (rename=(clientid=clientid2 sharerid=sharerid2));

      by clientid2 sharerid2 phone2;

      do i=paystart to payend;

        if not missing(dates(i)) then call missing(dates(i));

      end;

      if last.phone2 then do;

        do i=lowest to highest;

          if missing(first) and not missing(dates(i)) then first=dates(i);

          if not missing(first) and missing(dates(i+1)) then do;

            last=dates(i);

            date_range=catx("-",put(first,date9.),put(last,date9.));

            output;

            call missing(first);

            call missing(last);

          end;

        end;

      end;

    end;

  end;

run;

/*hash approach*/

proc sort data=paid_periods;

  by clientid sharerid paystart;

run;

data want_2;

  if _n_=1 then do;

    if 0 then set paid_periods;

    declare hash h(dataset:'paid_periods',multidata:'y');

    h.definekey('clientid', 'sharerid');

    h.definedata('paystart', 'payend');

    h.definedone();

  end;

  set phone;

  _rc=h.find();

  do _rc=0 by 0 while (_rc=0);

    if paystart<= startdt <= payend then startdt=payend+1;

    if  paystart<= enddt <= payend then enddt=paystart-1;

    if startdt > enddt then leave;

    _rc=h.find_next();

  end;

  if enddt>= startdt then do;

    date_range=catx("-",put(startdt,date9.),put(enddt,date9.));

    output;

  end;

  drop paystart payend _rc startdt enddt;

run;

/*format approach*/

data paid_periods2;

   set paid_periods;

   retain fmtname '$paid' label 'Paid';

   start = clientid || sharerid || put(paystart, yymmddn8.);

   end = clientid || sharerid || put(payend, yymmddn8.);

   keep fmtname start end label;

run;

proc format cntlin=paid_periods2;

run;

data individual_unpaid_days;

   set phone;

   do day=startdt to enddt;

        if put(clientid || sharerid || put(day, yymmddn8.), $paid.) ne 'Paid' then output;

   end;

   drop startdt enddt;

run;

proc sort data=individual_unpaid_days;

  by clientid sharerid phone;

run;

data assign_group;

   set individual_unpaid_days;

   by clientid sharerid phone;

   prior_day = lag(day);

   if first.phone or prior_day + 1 ne day then groupnum + 1;

   drop prior_day;

run;

data want_3;

   set assign_group;

   by clientid sharerid phone groupnum;

   if first.groupnum then start_of_range = day;

   retain start_of_range;

   if last.groupnum then do;

      end_of_range=day;

      date_range=catx("-",put(start_of_range,date9.),put(end_of_range,date9.));

      output;

   end;

   drop day groupnum start_: end_:;

run;

SteveNZ
Obsidian | Level 7

Hi all,

Have been doing live testing of the code and it works but my big concern is the very large filesize that occurs. I then wondered if putting all the dates on a timeline might be a better way of doing it and have written the following code, is there a way to use this approach to get accurate dates? I've included the test data at the top. If this could be used then filesizes are dramatically reduced. Basically I've got this far but cant work out how to get the dates from it.

data phone ;

  infile datalines delimiter=',';

  input clientid $ sharerid $ phone $ startdt enddt ;

  informat startdt enddt date9. ;

  format startdt enddt date9. ;

  datalines;

client1,sharer1,555-6532,21Nov2011,10Dec2012

client1,sharer1,444-5655,29Nov2010,14Feb2011

client1,sharer1,333-1234,20May1993,17Aug1993

client1,sharer1,333-1234,08Sep1993,08Sep1993

client2,sharer2,666-6548,10Jul2001,12Nov2001

client2,sharer2,666-6548,10Apr2002,06Aug2002

client2,sharer2,111-5658,02Nov1992,12Aug1993

client2,sharer2,222-6589,10Jan2000,31Jan2000

client2,sharer2,777-8755,31Jan2000,03Feb2000

client2,sharer2,777-8755,25Jun2009,14Sep2009

client2,sharer2,321-6544,18Dec2003,08Apr2004

client2,sharer2,778-6589,07Jun2001,10Jul2001

client2,sharer2,999-9988,31Dec1993,26Mar1994

client2,sharer2,999-9988,28Mar1994,28Mar1994

client2,sharer2,888-7845,12Aug1993,23Aug1993

client2,sharer2,789-9876,10Aug1994,05Sep1994

client2,sharer2,789-9876,22Jun1995,10Jul1995

client2,sharer2,951-6235,08Apr2004,10Aug2004

client2,sharer2,753-1245,25Jan2007,18Jul2007

client2,sharer2,656-8989,12Nov1998,26Feb1999

client2,sharer2,656-8989,10Dec1999,10Jan2000

client2,sharer2,141-1414,23Aug2000,26Mar2001

client2,sharer2,141-1414,07Jun2001,10Jul2001

client2,sharer2,363-3636,19Jun1998,12Nov1998

client2,sharer2,852-8525,18Jun2009,02Jun2010

client2,sharer2,852-8525,20Oct2010,16May2011

client2,sharer2,852-8525,31May2012,10Dec2012

client2,sharer2,565-5656,05Sep1994,01Nov1994

client2,sharer2,565-5656,14Nov1994,14Nov1994

client2,sharer2,221-2212,01jan2012,30jan2012

;

data paid_periods;

  infile datalines delimiter=',';

  input clientid $ sharerid $ paystart payend ;

  informat paystart payend date9. ;

  format paystart payend date9. ;

  datalines;

client2,sharer2,31Aug1992,23Aug1993

client2,sharer2,25Dec1993,26Mar1994

client2,sharer2,10Aug1994,01Nov1994

client2,sharer2,15Mar1995,20Mar1995

client2,sharer2,19Jun1998,18Feb1999

client2,sharer2,10Dec1999,31Jan2000

client2,sharer2,20Jun2001,12Nov2001

client2,sharer2,10Apr2002,27Jul2002

client2,sharer2,18Dec2003,06Aug2004

client2,sharer2,11Dec2006,17Jul2007

client2,sharer2,18Jun2009,01Jun2010

client2,sharer2,20Oct2010,13May2011

client2,sharer2,14jan2012,21jan2012

;

proc sql;

      create table ph as

    select distinct clientid,sharerid,phone
      from phone

      ;

      create table pays as

    select distinct rs.*,ph.phone
    from paid_periods rs, ph
    where rs.clientid=ph.clientid and rs.sharerid=ph.sharerid

      ;

    quit;

    data phprd(keep=phone clientid sharerid phone_flag date)   ;

      set phone ;

      phone_flag=1;

      date=startdt;

      output;

      phone_flag=0;

      date=enddt;

      output;

    run;

    proc sort data=phprd;

      by phone clientid sharerid date;

    run;

    data payprd(keep = phone clientid sharerid pay_flag date)   ;

      set pays;

      pay_flag=1;

      date=paystart;

      output;

      pay_flag=0;

      date=payend;

      output;

    run;

    proc sort data=payprd;

      by phone clientid sharerid date;

    run;

    data timeline ;

      set payprd phprd;

      format date ddmmyy10. clientid sharerid 12.;

      by phone clientid sharerid date;

    run;

art297
Opal | Level 21

: I would suggest posting this as a new question so that others have a chance to get points for answering it. Not that I have any need for more points, but it does entice others to spend some time answering such questions.

Ready to join fellow brilliant minds for the SAS Hackathon?

Build your skills. Make connections. Enjoy creative freedom. Maybe change the world. Registration is now open through August 30th. Visit the SAS Hackathon homepage.

Register today!
How to choose a machine learning algorithm

Use this tutorial as a handy guide to weigh the pros and cons of these commonly used machine learning algorithms.

Find more tutorials on the SAS Users YouTube channel.

Discussion stats
  • 20 replies
  • 1659 views
  • 6 likes
  • 4 in conversation