Re: Match rows on multiple variables

teamlinerek · Posted 08-21-2024 03:23 PM

I HAVE the following:

DIRECTORID COMPANYID DATESTARTROLE DATEENDROLE

1 1001 01JUL1998 05MAR2021

2 1001 01APR2020 05MAR2021

3 1001 01JUL2010 01MAY2011

1 1002 06MAR2021 07MAR2024

2 1002 06JUL2023 05MAY2024

4 1002 01APR1988 15JUN2023

I WANT the following:

DIRECTORID MATCHES

1 3

2 1

3 1

4 1

Where MATCHES captures the unique number of DIRECTORIDs where employment at the COMPANYID overlaps.

I have over 3,000,000 observations.

Are there any suggestions on how to get my expected output?

Thank you!!

ballardw · Posted 08-21-2024 03:34 PM

I think you may need to walk us through the logic of how you get Directorid=1 and a result of 3, as in all the variables compared. I only see 2 values of Directorid=1 so there is obviously something else going on.

teamlinerek · Posted 08-22-2024 11:24 AM

Thank you for asking!

The logic is that for director 1, employment overlaps with directors 2, 3, 4 for a total of 3 matches.

For director 2, employment overlaps with only director 1.

For director 3, employment overlaps with only director 1.

For director 4, employment overlaps with only director 1.

Ksharp · Posted 08-22-2024 02:23 AM

data have;
input DIRECTORID            COMPANYID             DATESTARTROLE      :date9.         DATEENDROLE :date9.;
format   DATESTARTROLE        DATEENDROLE :date9.;
cards;
1                                              1001                            01JUL1998                 05MAR2021
2                                              1001                            01APR2020                05MAR2021
3                                              1001                            01JUL2010                 01MAY2011
1                                              1002                            06MAR2021               07MAR2024
2                                              1002                            06JUL2023                 05MAY2024
4                                              1002                            01APR1988                15JUN2023
;
data temp;
 set have(keep=DIRECTORID  COMPANYID DATESTARTROLE  DATEENDROLE);
 do date=DATESTARTROLE to DATEENDROLE;
   output;
 end;
 keep COMPANYID  date DIRECTORID;
 format date date9.;
run;
proc sort data=temp nodupkey;
by  COMPANYID date DIRECTORID;
run;
data temp2;
 do until(last.date);
   set temp;
   by COMPANYID date;
   length list $ 200;
   list=catx('|',list,DIRECTORID);
 end;
 if findc(list,'|');
 drop DIRECTORID ;
 run;
 data temp3;
 if _n_=1 then do;
  if 0 then set temp2;
  declare hash h(dataset:'temp2',hashexp:20);
  h.definekey('COMPANYID','date');
  h.definedata('list');
  h.definedone();
 end;
set have;
length key $ 40;
 do date=DATESTARTROLE to DATEENDROLE;
   if h.find()=0 then do;
     do i=1 to countw(list,'|');
       key=scan(list,i,'|');
	   output;
	 end;
   end;
 end;
keep DIRECTORID key;
run;
proc sql;
create table want as
select DIRECTORID,count(distinct key)-1 as MATCHES
 from temp3
  group by DIRECTORID;
quit;

teamlinerek · Posted 08-22-2024 12:34 PM

Thank you!

I ran this code using the whole set and ran out of resources on the first "temp" query, even after eliminating unnecessary files. I ran the same code on a smaller subset, and it worked!

I am concerned about splitting the file and double counting directorids where they overlap in more than one companyid.

Do you have any suggestions for rerunning the code?

Ksharp · Posted 08-22-2024 08:12 PM

Yes. You could split this big table into several smaller subset dataset. But it is a little complicated.
/******First subset ******/
data have1;
set have;
if COMPANYID in (1001) ;
run;
data temp;
set have1;
..............until............
data temp3_1;
....................
keep DIRECTORID key;
run;

/******Second subset ******/
data have2;
set have;
if COMPANYID in (1002) ;
run;
data temp;
set have2;
..............until............
data temp3_2;
....................
keep DIRECTORID key;
run;

/******Combine these output datasets together And run the final PROC SQL********/
data temp3_all;
set temp3_1 temp3_2;
run;
proc sql;
create table want as
select DIRECTORID,count(distinct key)-1 as MATCHES
from temp3_all
group by DIRECTORID;
quit;

teamlinerek · Posted 09-18-2024 10:36 AM

Can this code be rewritten for overlaps in years instead of days?

Thank you!

Ksharp · Posted 09-18-2024 08:22 PM

You could change raw data from DATE into YEAR, no need to change code at all.

Like:

data have;
input DIRECTORID            COMPANYID             DATESTARTROLE             DATEENDROLE ;
cards;
1                                              1001                            1998                2021
2                                              1001                            2020                2021
3                                              1001                            2010                2011
1                                              1002                            2021                2024
2                                              1002                            2023                2024
4                                              1002                            1988                2023
;
data temp;
 set have(keep=DIRECTORID  COMPANYID DATESTARTROLE  DATEENDROLE);
 do date=DATESTARTROLE to DATEENDROLE;
   output;
 end;
 keep COMPANYID  date DIRECTORID;
run;
proc sort data=temp nodupkey;
by  COMPANYID date DIRECTORID;
run;
data temp2;
 do until(last.date);
   set temp;
   by COMPANYID date;
   length list $ 200;
   list=catx('|',list,DIRECTORID);
 end;
 if findc(list,'|');
 drop DIRECTORID ;
 run;
 data temp3;
 if _n_=1 then do;
  if 0 then set temp2;
  declare hash h(dataset:'temp2',hashexp:20);
  h.definekey('COMPANYID','date');
  h.definedata('list');
  h.definedone();
 end;
set have;
length key $ 40;
 do date=DATESTARTROLE to DATEENDROLE;
   if h.find()=0 then do;
     do i=1 to countw(list,'|');
       key=scan(list,i,'|');
	   output;
	 end;
   end;
 end;
keep DIRECTORID key;
run;
proc sql;
create table want as
select DIRECTORID,count(distinct key)-1 as MATCHES
 from temp3
  group by DIRECTORID;
quit;

mkeintz · Posted 09-27-2024 10:35 PM

Sort your data by companyid/datestartrole, then you can discover the matches one company at a time:

data have;
input DIRECTORID COMPANYID DATESTARTROLE :date9.  DATEENDROLE :date9.;
format   DATESTARTROLE        DATEENDROLE :date9.;
cards;
1 1001 01JUL1998 05MAR2021
2 1001 01APR2020 05MAR2021
3 1001 01JUL2010 01MAY2011
1 1002 06MAR2021 07MAR2024
2 1002 06JUL2023 05MAY2024
4 1002 01APR1988 15JUN2023
;

proc sort data=have out=need;
  by companyid datestartrole;
run;

%let max_size=50;  /*Max possible N of concurrent directors*/
data _null_;
  array curr_dirs {&max_size} _temporary_;
  array exdates   {&max_size} _temporary_;
  array exdirs    {&max_size} _temporary_;

  if _n_=1 then do;
    declare hash overlaps();
      overlaps.definekey('dir1','dir2');
      overlaps.definedone();
    declare hash hwant(ordered:'a');
      hwant.definekey('directorid');
      hwant.definedata('directorid','matches');
      hwant.definedone();
  end;

  set need end=end_of_need;
  by companyid datestartrole;

  if first.companyid then call missing(of curr_dirs{*},of exdates{*},of exdirs{*});

  /* First obs for this dir?  Make a new hwant entry with matches=0*/
  if hwant.find(key:directorid)^=0 then hwant.add(key:directorid,data:directorid,data:0);

  /* Remove dirs from CURR_DIRS with exit dates prior to datestartrole*/
  min_exdate=min(of exdates{*},'31dec9999'd);
  do while (min_exdate<datestartrole);
    e=whichn(min(of exdates{*}),of exdates{*});
    d=whichn(exdirs{e},of curr_dirs{*});
    call missing (curr_dirs{d},exdates{e},exdirs{e});
    min_exdate=min(of exdates{*},'31dec9999'd);
  end;

  /*Add the new director's exitdate to upcoming exit list */
  do e=1 by 1 until(exdates{e}=.); end; /*Identify leftmost missing exdate */
  exdates{e}=dateendrole;
  exdirs{e}=directorid;
  /* Add this director to curr_dirs array*/
  do d=1 by 1 until(curr_dirs{d}=.); end; 
  curr_dirs{d}=directorid;

  do d=1 to n(of curr_dirs{*});
    dir1=directorid;
    dir2=smallest(d,of curr_dirs{*});
    if dir2=dir1 then continue;
    if overlaps.find()=0 then continue; /*Skip below If this overlap already extablished*/
    overlaps.add(key:dir1,key:dir2,data:dir1,data:dir2);    /* dir1/dir2 */
    overlaps.add(key:dir2,key:dir1,data:dir2,data:dir1);    /* dir2/dir1 */
    do directorid=dir2,dir1;
      hwant.find();
      matches+1;
      hwant.replace();
    end;
  end;

  if end_of_need then hwant.output(dataset:'want');
run;

One company at a time, this program dynamically updates arrays listing the current directors, and the upcoming director exit dates.

It uses those arrays to update two hashes over all companies: (1) OVERLAPS which records each pair of directors having an overlap, and (2) HWANT, which keeps track of the count of overlaps for each dir. If a pair of directors overlaps more than once, it counts as only one in the MATHES variable.

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

Catch up on SAS Innovate 2026

SAS Training: Just a Click Away