Solved: Re: Create an edge list

niam · Posted 02-18-2020 05:00 PM

Hello,

I need to create an edge list from a table that includes the affiliation of each doctor to up to 5 hospitals. Data looks like below:

Obs	doctor	hosp1	hosp2	hosp3	hosp5
1	1003000522	100072			.
2	1003002106	100072			.
3	1003004870	.			.
4	1003005810	100128			.
5	1003006552	100121	100157		.
6	1003008533	100018	100012	100244	.

Doctors and hospitals are all identified by unique codes. In the example above, I have 6 doctors and 7 hospitals (100018, 100012, 100244, 100121, 100157, 100128, and 100072) . Note that doctor in row 3 are not affiliated with any hospitals.

I need to create a new table, with 3 columns. First and second column should show pairs of hospitals and the third column show show the number of doctors that are shared between them. In this example, as shown in row 5, hospitals 100121 and 100157 share 1 physician with each other, also, as shown in row 6, hospitals 100018, 10012 and 100244 each share one physician with each other, therefore the resulting data set should look like this, also please note that in the output, where both hospitals in column 1 and 2 are eqaul, the value in column 3 shows the total number of physicians that are affiliated with that hospital. For example, hospital code 100072 has 2 physicians affiliated with it (1003000522 and 1003002106)

Hosp1 Hosp2 #Shared_Docs

100072 100072 2

100128 100128 1

100121 100121 1

100121 100157 1

100018 100018 1

100018 100012 1

100018 100244 1

100012 100012 1

100012 100244 1

100244 100244 1

mkeintz · Posted 02-18-2020 10:21 PM

This is really a two-step task: (1) write out all the pairs, and (2) get a frequency of all pairs. The second step is trivial - it's what PROC FREQ is made for. The first step is straightforward:

data have;
infile cards truncover;
input Obs	doctor	hosp1	hosp2	hosp3	hosp4	hosp5;
cards;
1	1003000522	100072	 	 	 	.
2	1003002106	100072	 	 	 	.
3	1003004870	.	 	 	 	.
4	1003005810	100128	 	 	 	.
5	1003006552	100121	100157	 	 	.
6	1003008533	100018	100012	100244	 	.
;

data temp  (keep=obs doctor hosp1 hosp2);
  set have (rename=(hosp1=dummy1 hosp2=dummy2));
  array h {*} dummy1-dummy2 hosp3-hosp5;
  nh=n(of h{*});
  if nh>0;
  do h1=1 to nh;
    do h2=h1 to nh;
      hosp1=min(h{h1},h{h2});
      hosp2=max(h{h1},h{h2});
      output;
    end;
  end;
run;
proc freq data=temp  noprint ;
  tables hosp1 * hosp2 /out=want (keep=hosp1 hosp2 count);
run;

In the DATA TEMP step, notice I rename HOSP1 and HOSP2 to DUMMY1 DUMMY2 respectively. That's just so I can re-use the names HOSP1 and HOSP2 in the output.
The array is there just to provide a framework for finding all pairs (including self-pairs).
Because the hospitals are not sorted within each observation, you need the MIN and MAX functions when assigning members of each pair to HOSP1 and HOSP2.

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

View solution in original post

novinosrin · Posted 02-18-2020 06:20 PM

data have;
infile cards truncover;
input Obs	doctor	hosp1	hosp2	hosp3	hosp4	hosp5;
cards;
1	1003000522	100072	 	 	 	.
2	1003002106	100072	 	 	 	.
3	1003004870	.	 	 	 	.
4	1003005810	100128	 	 	 	.
5	1003006552	100121	100157	 	 	.
6	1003008533	100018	100012	100244	 	.
;

proc transpose data=have out=temp(drop=_name_ where=(col1 ne .));
by doctor;
var hosp:;
run;

data want ;
 if _n_=1 then do;
  if 0 then set temp(rename=(col1=hosp1)) temp(rename=(col1=hosp2));
  dcl hash H (dataset:'temp(rename=(col1=hosp2))',multidata:'y') ;
  h.definekey  ("doctor") ;
  h.definedata ("hosp2") ;
  h.definedone () ;
 end;
 do Shared_Docs=1 by 1 until(last.hosp1);
  set temp(rename=(col1=hosp1));
  by hosp1 notsorted;
  do until(not _t) ; 
   do rc = h.find() by 0 while (rc = 0) ; 
    h.has_next(result:_t) ;  
    if hosp1=hosp2 then h.removedup() ; 
	if last.hosp1 then output;
    rc=h.find_next();
   end ;
  end ;
 end;
 drop _t rc doctor;
run;

/*Or perhaps a better bet to Sort the transposed dataset before the Hash*/

data have;
infile cards truncover;
input Obs	doctor	hosp1	hosp2	hosp3	hosp4	hosp5;
cards;
1	1003000522	100072	 	 	 	.
2	1003002106	100072	 	 	 	.
3	1003004870	.	 	 	 	.
4	1003005810	100128	 	 	 	.
5	1003006552	100121	100157	 	 	.
6	1003008533	100018	100012	100244	 	.
;

proc transpose data=have out=temp(drop=_name_ where=(col1 ne .));
by doctor;
var hosp:;
run;

proc sort data=temp ;
by col1;
run;

data want ;
 if _n_=1 then do;
  if 0 then set temp(rename=(col1=hosp1)) temp(rename=(col1=hosp2));
  dcl hash H (dataset:'temp(rename=(col1=hosp2))',multidata:'y') ;
  h.definekey  ("doctor") ;
  h.definedata ("hosp2") ;
  h.definedone () ;
 end;
 do Shared_Docs=1 by 1 until(last.hosp1);
  set temp(rename=(col1=hosp1));
  by hosp1 ;
  do until(not _t) ; 
   do rc = h.find() by 0 while (rc = 0) ; 
    h.has_next(result:_t) ;  
    if hosp1=hosp2 then h.removedup() ; 
	if last.hosp1 then output;
    rc=h.find_next();
   end ;
  end ;
 end;
 drop _t rc doctor;
run;

mkeintz · Posted 02-18-2020 10:21 PM

This is really a two-step task: (1) write out all the pairs, and (2) get a frequency of all pairs. The second step is trivial - it's what PROC FREQ is made for. The first step is straightforward:

data have;
infile cards truncover;
input Obs	doctor	hosp1	hosp2	hosp3	hosp4	hosp5;
cards;
1	1003000522	100072	 	 	 	.
2	1003002106	100072	 	 	 	.
3	1003004870	.	 	 	 	.
4	1003005810	100128	 	 	 	.
5	1003006552	100121	100157	 	 	.
6	1003008533	100018	100012	100244	 	.
;

data temp  (keep=obs doctor hosp1 hosp2);
  set have (rename=(hosp1=dummy1 hosp2=dummy2));
  array h {*} dummy1-dummy2 hosp3-hosp5;
  nh=n(of h{*});
  if nh>0;
  do h1=1 to nh;
    do h2=h1 to nh;
      hosp1=min(h{h1},h{h2});
      hosp2=max(h{h1},h{h2});
      output;
    end;
  end;
run;
proc freq data=temp  noprint ;
  tables hosp1 * hosp2 /out=want (keep=hosp1 hosp2 count);
run;

In the DATA TEMP step, notice I rename HOSP1 and HOSP2 to DUMMY1 DUMMY2 respectively. That's just so I can re-use the names HOSP1 and HOSP2 in the output.
The array is there just to provide a framework for finding all pairs (including self-pairs).
Because the hospitals are not sorted within each observation, you need the MIN and MAX functions when assigning members of each pair to HOSP1 and HOSP2.

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

niam · Posted 02-19-2020 10:03 AM

Thank you very much for your help. Your solution worked! I wish I could code like you!

Patrick · Posted 02-19-2020 12:44 AM

Here another option using a hash object.

data have;
  infile cards truncover;
  input Obs doctor hosp1 hosp2 hosp3 hosp4 hosp5;
  cards;
1 1003000522 100072       .
2 1003002106 100072       .
3 1003004870 .       .
4 1003005810 100128       .
5 1003006552 100121 100157     .
6 1003008533 100018 100012 100244   .
;

data _null_;
  if _n_=1 then
    do;
      dcl hash h1(ordered:'y', suminc: 'count');
      h1.defineKey('hospital1','hospital2');
      h1.defineData('hospital1','hospital2','n_docs');
      h1.defineDone();
      retain count 1;
    end;

  set have end=last;
  array hosp {*} hosp1-hosp5;

  do _i=1 to dim(hosp) while(not missing(hosp[_i]));
    hospital1=hosp[_i];
    do _k=_i to dim(hosp) while(not missing(hosp[_k]));
      hospital2=hosp[_k];
      _rc=h1.sum(sum: n_docs);
      n_docs+1;
      _rc=h1.replace();
    end;
  end;

  if last then h1.output(dataset:'want');

run;

proc print data=want;
run;