Re: Construct continuous intervals from non-contiguous validity ranges

legends1337 · Posted 04-04-2022 08:41 AM

Given

data have;
    infile datalines missover delimiter="|" dsd;
    input id :$20. (start_date end_date) (:date9.) (attribute_1 attribute_2 attribute_3 attribute_4) ($);
    format start_date end_date date9.;
    datalines;
ID1|01MAR2014|31DEC9999|BIG|YES|| 
ID2|01SEP2015|30NOV2020|||TWO|
ID2|01SEP2015|31DEC9999|SMALL|||
ID2|01AUG2021|31DEC9999|||TWO|
ID3|01DEC2014|31MAY2016||YES||
ID3|01DEC2014|29JUN2017||||OK
ID3|01DEC2014|31DEC9999|MEDIUM|||
ID3|31MAR2015|29SEP2017|||ONE|
ID3|30JUN2017|31DEC9999||YES||TBD
ID3|30SEP2017|31DEC9999|||ONE, TWO|
;

I would like to get continuous validity ranges for each id with the correct attributes at each of them.
The desired output would be like this:

+-----+------------+-----------+-------------+-------------+-------------+-------------+
| id  | start_date | end_date  | attribute_1 | attribute_2 | attribute_3 | attribute_4 |
+-----+------------+-----------+-------------+-------------+-------------+-------------+
| ID1 | 01MAR2014  | 31DEC9999 | BIG         | YES         |             |             |
| ID2 | 01SEP2015  | 30NOV2020 | SMALL       |             | TWO         |             |
| ID2 | 01DEC2020  | 31JUL2021 | SMALL       |             |             |             |
| ID2 | 01AUG2021  | 31DEC9999 | SMALL       |             | TWO         |             |
| ID3 | 01DEC2014  | 30MAR2015 | MEDIUM      | YES         |             | OK          |
| ID3 | 31MAR2015  | 31MAY2016 | MEDIUM      | YES         | ONE         | OK          |
| ID3 | 01JUN2016  | 29JUN2016 | MEDIUM      |             | ONE         | OK          |
| ID3 | 30JUN2016  | 29SEP2017 | MEDIUM      | YES         | ONE         | TBD         |
| ID3 | 30SEP2017  | 31DEC9999 | MEDIUM      | YES         | ONE, TWO    | TBD         |
+-----+------------+-----------+-------------+-------------+-------------+-------------+

I found a way of doing it using joins, but would like to know if there exist a better way of doing the following:

data all_intervals;
    set have(keep= id start_date end_date);
    _start = start_date; output;
    _end = start_date-1; output;
    _end = end_date; output;
    if end_date < '31DEC9999'd then do;
        _start = end_date+1; output;
    end;
run;

proc sql;
    create table all_intervals as
    select distinct t1.id, t1._start, t2._end
    from all_intervals t1, all_intervals t2
    where t1.id = t2.id and t2._end > t1._start
;
quit;

data all_intervals;
set all_intervals;
by id _start;
if first.id or first._start;
run;

proc sql noprint;
    select 'max(t1.'||NAME||') as '||NAME into :attributes separated by ','
    from sashelp.vcolumn
    where libname = "WORK" and memname = "HAVE" and upcase(name) not in ('ID', "_START", "_END")
;
quit;

proc sql;
    create table merge as
    select t2.id, t2._start as start_date, t2._end as end_date, &attributes.
    from have t1 right join all_intervals t2
    on t1.id = t2.id
    and ((t2._start <= t1.start_date <= t2._end)
        or (t2._start <= t1.end_date <=  t2._end)
        or (t2._start >= t1.start_date and t2._end <= t1.end_date))
    group by t2.id, t2._start
    order by t2.id, t2._start
;
quit;

proc sort data=merge out=want nodupkey; by id start_date end_date; run;

The above produce the expected output.

ballardw · Posted 04-04-2022 11:18 AM

It would help if you stated some rules so that we don't have to decipher to code to determine what the rules you have implemented might actually be.

A minor point: You want to be cautious with DATALINES data that does not start on the left margin. Every so often you can make a logic error in the input and have odd results.

This works for your example data. Assumption is that there is ONLY one value populated for the Attribute_1 variable per group. If that is not the case then you need to provide data that shows the behavior with multiple values and what the actual result for that example would be.

data example;
   merge have have(keep = id attribute_1  rename=(attribute_1=attr1)
                   where= (not missing(attr1))
                  )
  ;
  by id;
  attribute_1 = coalescec(attribute_1,attr1);
  drop attr1;

run;

This relies on the default behavior of Merge. A similar one step left join in SQL, left to the interested reader, joining the data set on itself on the ID variable and only using the Attribute_1 variable from a reflex join.

legends1337 · Posted 04-04-2022 12:00 PM

Thank you for the swift reply @ballardw

Sorry if I was not clear enough.

Basically, the idea is that I have multiple records with different validity ranges for each ID. In each of those observations, some attributes are populated (at least one). The problem is that the validity ranges are not continuous.

Example:

+-----+------------+-----------+-------------+-------------+-------------+-------------+
| id  | start_date | end_date  | attribute_1 | attribute_2 | attribute_3 | attribute_4 |
+-----+------------+-----------+-------------+-------------+-------------+-------------+
| ID2 | 01SEP2015  | 30NOV2020 |             |             | TWO         |             | -----> Attribute 3 is valid from 01SEP2015 to 30NOV2020
| ID2 | 01DEC2020  | 31JUL2021 | SMALL       |             |             |             | -----> Attribute 1 is valid from 01SEP2015 to 31DEC9999

What I would like to end up with is a table that contains the "status" (meaning: what are the attributes values) of an ID at one point in time (from 01JAN2014 to 31DEC9999). So instead of overlapping dates, I would have continuous validity ranges:

+-----+------------+-----------+-------------+-------------+-------------+-------------+
| id  | start_date | end_date  | attribute_1 | attribute_2 | attribute_3 | attribute_4 |
+-----+------------+-----------+-------------+-------------+-------------+-------------+
| ID2 | 01SEP2015  | 30NOV2020 | SMALL       |             | TWO         |             | ----> until 30NOV2020
| ID2 | 01DEC2020  | 31JUL2021 | SMALL       |             |             |             | ----> Start 30NOV2020 + 1 day
| ID2 | 01AUG2021  | 31DEC9999 | SMALL       |             | TWO         |             | ----> Start 31JUL2021 + 1 day
+-----+------------+-----------+-------------+-------------+-------------+-------------+

Having this table as output will help me when I am trying to find the exact status of an ID at a given point in time (it will output only a single observation).

Example:

where id="ID2" and start_date <= "28SEP2020"d <= end_date should only output one observation which is:

+-----+------------+-----------+-------------+-------------+-------------+-------------+
| id  | start_date | end_date  | attribute_1 | attribute_2 | attribute_3 | attribute_4 |
+-----+------------+-----------+-------------+-------------+-------------+-------------+
| ID2 | 01SEP2015  | 30NOV2020 | SMALL       |             | TWO         |             |
+-----+------------+-----------+-------------+-------------+-------------+-------------+

----> Attribute_1 = "SMALL" and attribute_3 = "TWO" at that specific time.

instead of

+-----+------------+-----------+-------------+-------------+-------------+-------------+
| id  | start_date | end_date  | attribute_1 | attribute_2 | attribute_3 | attribute_4 |
+-----+------------+-----------+-------------+-------------+-------------+-------------+
| ID2 | 01SEP2015  | 30NOV2020 |             |             | TWO         |             |
| ID2 | 01SEP2015  | 31DEC9999 | SMALL       |             |             |             |
+-----+------------+-----------+-------------+-------------+-------------+-------------+

Also, your code does not produce the expected output for the example: the validity ranges are not continuous (they remain unchanged)

+-----+-------------+-----------+--------------+--------------+--------------+--------------+
| id  |  start_date |  end_date |  attribute_1 |  attribute_2 |  attribute_3 |  attribute_4 |
+-----+-------------+-----------+--------------+--------------+--------------+--------------+
| ID1 | 01MAR2014   | 31DEC9999 | BIG          | YES          |              |              |
| ID2 | 01SEP2015   | 30NOV2020 | SMALL        |              | TWO          |              |
| ID2 | 01SEP2015   | 31DEC9999 | SMALL        |              |              |              |
| ID2 | 01AUG2021   | 31DEC9999 | SMALL        |              | TWO          |              |
| ID3 | 01DEC2014   | 31MAY2016 | MEDIUM       | YES          |              |              |
| ID3 | 01DEC2014   | 29JUN2017 | MEDIUM       |              |              | OK           |
| ID3 | 01DEC2014   | 31DEC9999 | MEDIUM       |              |              |              |
| ID3 | 31MAR2015   | 29SEP2017 | MEDIUM       |              | ONE          |              |
| ID3 | 30JUN2017   | 31DEC9999 | MEDIUM       | YES          |              | TBD          |
| ID3 | 30SEP2017   | 31DEC9999 | MEDIUM       |              | ONE, TWO     |              |
+-----+-------------+-----------+--------------+--------------+--------------+--------------+

Please let me know if that is still unclear.

mkeintz · Posted 04-04-2022 02:54 PM

For each ID, you want to construct a set of non-overlapping date ranges, that preserve all the attribute information history that came from a set of overlapping ranges. Is that correct?

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

legends1337 · Posted 04-04-2022 03:02 PM

@mkeintz Correct - this is exactly what I am searching for.

Tom · Posted 04-04-2022 04:01 PM

You could just brute force it.

Expand each observation into one row per date. Use UPDATE statement to collapse the away the missing values. Then use BY group processing to find the new intervals.

data have;
    infile datalines truncover dlm="|" dsd;
    input id :$20. (start_date end_date) (:date9.) (attribute_1 - attribute_4) ($);
    format start_date end_date date9.;
datalines;
ID1|01MAR2014|31DEC9999|BIG|YES||
ID2|01SEP2015|30NOV2020|||TWO|
ID2|01SEP2015|31DEC9999|SMALL|||
ID2|01AUG2021|31DEC9999|||TWO|
ID3|01DEC2014|31MAY2016||YES||
ID3|01DEC2014|29JUN2017||||OK
ID3|01DEC2014|31DEC9999|MEDIUM|||
ID3|31MAR2015|29SEP2017|||ONE|
ID3|30JUN2017|31DEC9999||YES||TBD
ID3|30SEP2017|31DEC9999|||ONE, TWO|
;

data step1;
  set have;
  do date=start_date to end_date;
    output;
  end;
  format date date9.;
  drop start_date end_date;
run;

proc sort; by id date; run;

data step2;
  update step1(obs=0) step1;
  by id date;
run;

data want;
  retain id start_date date;
  set step2;
  by id attribute_1 - attribute_4 notsorted;
  if first.attribute_4 then start_date=date;
  if last.attribute_4;
  rename date=end_date;
  format start_date date9.;
run;

Results

                 start_                 attribute_    attribute_    attribute_    attribute_
Obs    id          date     end_date        1             2             3             4

 1     ID1    01MAR2014    31DEC9999      BIG            YES
 2     ID2    01SEP2015    30NOV2020      SMALL                      TWO
 3     ID2    01DEC2020    31JUL2021      SMALL
 4     ID2    01AUG2021    31DEC9999      SMALL                      TWO
 5     ID3    01DEC2014    30MAR2015      MEDIUM         YES                         OK
 6     ID3    31MAR2015    31MAY2016      MEDIUM         YES         ONE             OK
 7     ID3    01JUN2016    29JUN2017      MEDIUM                     ONE             OK
 8     ID3    30JUN2017    29SEP2017      MEDIUM         YES         ONE             TBD
 9     ID3    30SEP2017    31DEC9999      MEDIUM         YES         ONE, TWO        TBD

Your expected output does not look right for the ID3.

Value Comparison Results for Variables

______________________________________________________________________
                       ||       Base    Compare
 id                    ||  start_dat  start_dat      Diff.     % Diff
                       ||          e          e
 ____________________  ||  _________  _________  _________  _________
                       ||
 ID3                   ||  30JUN2017  30JUN2016  -365.0000    -1.7381
______________________________________________________________________


______________________________________________________________________
                       ||       Base    Compare
 id                    ||   end_date   end_date      Diff.     % Diff
 ____________________  ||  _________  _________  _________  _________
                       ||
 ID3                   ||  29JUN2017  29JUN2016  -365.0000    -1.7382
______________________________________________________________________

Ksharp · Posted 04-05-2022 08:09 AM

data have;
    infile datalines truncover dlm="|" dsd;
    input id :$20. (start_date end_date) (:date9.) (attribute_1 - attribute_4) ($);
    format start_date end_date date9.;
datalines;
ID1|01MAR2014|31DEC9999|BIG|YES||
ID2|01SEP2015|30NOV2020|||TWO|
ID2|01SEP2015|31DEC9999|SMALL|||
ID2|01AUG2021|31DEC9999|||TWO|
ID3|01DEC2014|31MAY2016||YES||
ID3|01DEC2014|29JUN2017||||OK
ID3|01DEC2014|31DEC9999|MEDIUM|||
ID3|31MAR2015|29SEP2017|||ONE|
ID3|30JUN2017|31DEC9999||YES||TBD
ID3|30SEP2017|31DEC9999|||ONE, TWO|
;

data step1;
  set have;
  length attrib $ 80;
  attrib=catx('|',of attribute_1-attribute_4);
  do date=start_date to end_date;
    output;
  end;
  format date date9.;
  drop start_date end_date attribute_1 - attribute_4;
run;

proc sort data=step1; by id date; run;

data step2;
  n=0;
 do until(last.date);
  set step1;
  by id date ;
  array x{20} $ 20  ;
  length unique $ 200;
  do i=1 to countw(attrib,'|');
    temp=scan(attrib,i,'|');
	if temp not in x then do;n+1;x{n}=temp;end;
  end;
 end;
 call sortc(of x{i});
 unique=catx("|",of x{*});
 drop i temp attrib x: n;
 run;

data step3;
 do until(last.unique);
  set step2;
  by id unique notsorted;
  if first.unique then start=date;
end;
end=date;
drop date;
format start end date9.;
 run;

 data step4;
 if _n_=1 then do;
  if 0 then set have;
  declare hash h1(dataset:'have(keep=attribute_1 where=(attribute_1 is not missing))');
  h1.definekey('attribute_1');
  h1.definedone();

  declare hash h2(dataset:'have(keep=attribute_2 where=(attribute_2 is not missing))');
  h2.definekey('attribute_2');
  h2.definedone();

  declare hash h3(dataset:'have(keep=attribute_3 where=(attribute_3 is not missing))');
  h3.definekey('attribute_3');
  h3.definedone();

  declare hash h4(dataset:'have(keep=attribute_4 where=(attribute_4 is not missing))');
  h4.definekey('attribute_4');
  h4.definedone();
 end;
  set step3;
  length key $ 20;
  do i=1 to countw(unique,'|');
   key=scan(unique,i,'|');
   if h1.check(key:key)=0 then n=1;
   if h2.check(key:key)=0 then n=2;
   if h3.check(key:key)=0 then n=3;
   if h4.check(key:key)=0 then n=4;
   output;
  end;
drop unique i start_date end_date attribute_:;
run;

proc transpose data=step4 out=want(drop=_name_) prefix=attribute_;
by id start end;
var key;
id n;
run;

Construct continuous intervals from non-contiguous validity ranges