I'd like to perform one-to-one mentor-to-student match/linkage. One mentor can take only one student and also only if nearest in their residential location.
Program below works in this demo settings. However, have no idea how many passes would do the job in my real data where I have around N=500 mentors and N=3500 students. And this ratio can be substantially different for different counties (N=62). What would be more efficient programming dealing with multiple passes? (SAS 9.4)
/*MENTORS*/
data mentor;
input mentor_ID $User_X User_Y;
cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;
run;
/*STUDENTS*/
data students;
input student_ID $ User_X User_Y;
cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;
run;
/*CALCULATION OF DISTANCE BETWEEN MENTORS AND STUDENTS*/
proc sql;
create table distance(keep=mentor_id student_id distance) as
select t1.*,
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor t1, students t2
group by t1.mentor_id
having (calculated distance)=calculated distance;
quit;
/*KEEP MINIMUM DISTANCE*/
/*minimum distance*/
proc sql;
create table dist_min(keep=mentor_id student_id distance) as
select t1.*,
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor t1, students t2
group by t1.mentor_id
having min(calculated distance)=calculated distance;
quit;
/*CHECK OUTPUTS*/
proc print data=distance; run;
proc print data=dist_min; run;
/*MENTOR TO STUDENT / ONE-TO-ONE MATCHES*/;
/**pass1 */
proc sort data=dist_min;
by student_id distance;
run;
data dist_min_keep; set dist_min;
by student_id distance;
if first.student_id then output;
run;
/*MENTORS MATCHED TO MANY STUDENTS*/
*PASS2;
proc sort data=mentor; by mentor_id; run;
proc sort data=dist_min_keep; by mentor_id distance; run;
data mentor2(where=(distance = .));
merge mentor (in=a) dist_min_keep (in=b);
by mentor_id;
drop student_id;
if a;
run;
data mentor2a(drop=distance); set mentor2;
run;
/*STUDENTS MATCHED TO SAME MENTORS*/
proc sort data=students; by student_id; run;
proc sort data=dist_min_keep; by student_id; run;
data student2(where=(distance=.));
merge students (in=a) dist_min_keep (in=b);
by student_id;
drop mentor_id;
if a;
run;
data student2a(drop=distance); set student2;
run;
*PASS2;
/*CALCULATE AND KEEP MINIMUM DISTANCE AMONG NOT DISTINCTLY MATCHED MENTORS AND STUDENTS FROM THE FIRST PASS */
proc sql;
create table dist_2p (keep=mentor_id student_id distance) as
select
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t1.mentor_id,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor2a t1, student2a t2
group by t1.mentor_id
having min(calculated distance)=calculated distance;
quit;
data final_matches; set Dist_min_keep dist_2p; run;
proc print data=mentor;
proc print data=students;
proc print data=final_matches; run;
... View more