I HAVE the following:
DIRECTORID COMPANYID DATESTARTROLE DATEENDROLE
1 1001 01JUL1998 05MAR2021
2 1001 01APR2020 05MAR2021
3 1001 01JUL2010 01MAY2011
1 1002 06MAR2021 07MAR2024
2 1002 06JUL2023 05MAY2024
4 1002 01APR1988 15JUN2023
I WANT the following:
DIRECTORID MATCHES
1 3
2 1
3 1
4 1
Where MATCHES captures the unique number of DIRECTORIDs where employment at the COMPANYID overlaps.
I have over 3,000,000 observations.
Are there any suggestions on how to get my expected output?
Thank you!!
I think you may need to walk us through the logic of how you get Directorid=1 and a result of 3, as in all the variables compared. I only see 2 values of Directorid=1 so there is obviously something else going on.
Thank you for asking!
The logic is that for director 1, employment overlaps with directors 2, 3, 4 for a total of 3 matches.
For director 2, employment overlaps with only director 1.
For director 3, employment overlaps with only director 1.
For director 4, employment overlaps with only director 1.
data have;
input DIRECTORID COMPANYID DATESTARTROLE :date9. DATEENDROLE :date9.;
format DATESTARTROLE DATEENDROLE :date9.;
cards;
1 1001 01JUL1998 05MAR2021
2 1001 01APR2020 05MAR2021
3 1001 01JUL2010 01MAY2011
1 1002 06MAR2021 07MAR2024
2 1002 06JUL2023 05MAY2024
4 1002 01APR1988 15JUN2023
;
data temp;
set have(keep=DIRECTORID COMPANYID DATESTARTROLE DATEENDROLE);
do date=DATESTARTROLE to DATEENDROLE;
output;
end;
keep COMPANYID date DIRECTORID;
format date date9.;
run;
proc sort data=temp nodupkey;
by COMPANYID date DIRECTORID;
run;
data temp2;
do until(last.date);
set temp;
by COMPANYID date;
length list $ 200;
list=catx('|',list,DIRECTORID);
end;
if findc(list,'|');
drop DIRECTORID ;
run;
data temp3;
if _n_=1 then do;
if 0 then set temp2;
declare hash h(dataset:'temp2',hashexp:20);
h.definekey('COMPANYID','date');
h.definedata('list');
h.definedone();
end;
set have;
length key $ 40;
do date=DATESTARTROLE to DATEENDROLE;
if h.find()=0 then do;
do i=1 to countw(list,'|');
key=scan(list,i,'|');
output;
end;
end;
end;
keep DIRECTORID key;
run;
proc sql;
create table want as
select DIRECTORID,count(distinct key)-1 as MATCHES
from temp3
group by DIRECTORID;
quit;
Yes. You could split this big table into several smaller subset dataset. But it is a little complicated.
/******First subset ******/
data have1;
set have;
if COMPANYID in (1001) ;
run;
data temp;
set have1;
..............until............
data temp3_1;
....................
keep DIRECTORID key;
run;
/******Second subset ******/
data have2;
set have;
if COMPANYID in (1002) ;
run;
data temp;
set have2;
..............until............
data temp3_2;
....................
keep DIRECTORID key;
run;
/******Combine these output datasets together And run the final PROC SQL********/
data temp3_all;
set temp3_1 temp3_2;
run;
proc sql;
create table want as
select DIRECTORID,count(distinct key)-1 as MATCHES
from temp3_all
group by DIRECTORID;
quit;
You could change raw data from DATE into YEAR, no need to change code at all.
Like:
data have;
input DIRECTORID COMPANYID DATESTARTROLE DATEENDROLE ;
cards;
1 1001 1998 2021
2 1001 2020 2021
3 1001 2010 2011
1 1002 2021 2024
2 1002 2023 2024
4 1002 1988 2023
;
data temp;
set have(keep=DIRECTORID COMPANYID DATESTARTROLE DATEENDROLE);
do date=DATESTARTROLE to DATEENDROLE;
output;
end;
keep COMPANYID date DIRECTORID;
run;
proc sort data=temp nodupkey;
by COMPANYID date DIRECTORID;
run;
data temp2;
do until(last.date);
set temp;
by COMPANYID date;
length list $ 200;
list=catx('|',list,DIRECTORID);
end;
if findc(list,'|');
drop DIRECTORID ;
run;
data temp3;
if _n_=1 then do;
if 0 then set temp2;
declare hash h(dataset:'temp2',hashexp:20);
h.definekey('COMPANYID','date');
h.definedata('list');
h.definedone();
end;
set have;
length key $ 40;
do date=DATESTARTROLE to DATEENDROLE;
if h.find()=0 then do;
do i=1 to countw(list,'|');
key=scan(list,i,'|');
output;
end;
end;
end;
keep DIRECTORID key;
run;
proc sql;
create table want as
select DIRECTORID,count(distinct key)-1 as MATCHES
from temp3
group by DIRECTORID;
quit;
Sort your data by companyid/datestartrole, then you can discover the matches one company at a time:
data have;
input DIRECTORID COMPANYID DATESTARTROLE :date9. DATEENDROLE :date9.;
format DATESTARTROLE DATEENDROLE :date9.;
cards;
1 1001 01JUL1998 05MAR2021
2 1001 01APR2020 05MAR2021
3 1001 01JUL2010 01MAY2011
1 1002 06MAR2021 07MAR2024
2 1002 06JUL2023 05MAY2024
4 1002 01APR1988 15JUN2023
;
proc sort data=have out=need;
by companyid datestartrole;
run;
%let max_size=50; /*Max possible N of concurrent directors*/
data _null_;
array curr_dirs {&max_size} _temporary_;
array exdates {&max_size} _temporary_;
array exdirs {&max_size} _temporary_;
if _n_=1 then do;
declare hash overlaps();
overlaps.definekey('dir1','dir2');
overlaps.definedone();
declare hash hwant(ordered:'a');
hwant.definekey('directorid');
hwant.definedata('directorid','matches');
hwant.definedone();
end;
set need end=end_of_need;
by companyid datestartrole;
if first.companyid then call missing(of curr_dirs{*},of exdates{*},of exdirs{*});
/* First obs for this dir? Make a new hwant entry with matches=0*/
if hwant.find(key:directorid)^=0 then hwant.add(key:directorid,data:directorid,data:0);
/* Remove dirs from CURR_DIRS with exit dates prior to datestartrole*/
min_exdate=min(of exdates{*},'31dec9999'd);
do while (min_exdate<datestartrole);
e=whichn(min(of exdates{*}),of exdates{*});
d=whichn(exdirs{e},of curr_dirs{*});
call missing (curr_dirs{d},exdates{e},exdirs{e});
min_exdate=min(of exdates{*},'31dec9999'd);
end;
/*Add the new director's exitdate to upcoming exit list */
do e=1 by 1 until(exdates{e}=.); end; /*Identify leftmost missing exdate */
exdates{e}=dateendrole;
exdirs{e}=directorid;
/* Add this director to curr_dirs array*/
do d=1 by 1 until(curr_dirs{d}=.); end;
curr_dirs{d}=directorid;
do d=1 to n(of curr_dirs{*});
dir1=directorid;
dir2=smallest(d,of curr_dirs{*});
if dir2=dir1 then continue;
if overlaps.find()=0 then continue; /*Skip below If this overlap already extablished*/
overlaps.add(key:dir1,key:dir2,data:dir1,data:dir2); /* dir1/dir2 */
overlaps.add(key:dir2,key:dir1,data:dir2,data:dir1); /* dir2/dir1 */
do directorid=dir2,dir1;
hwant.find();
matches+1;
hwant.replace();
end;
end;
if end_of_need then hwant.output(dataset:'want');
run;
One company at a time, this program dynamically updates arrays listing the current directors, and the upcoming director exit dates.
It uses those arrays to update two hashes over all companies: (1) OVERLAPS which records each pair of directors having an overlap, and (2) HWANT, which keeps track of the count of overlaps for each dir. If a pair of directors overlaps more than once, it counts as only one in the MATHES variable.
Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 25. Read more here about why you should contribute and what is in it for you!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.