I'd like to perform one-to-one mentor-to-student match/linkage. One mentor can take only one student and also only if nearest in their residential location.
Program below works in this demo settings. However, have no idea how many passes would do the job in my real data where I have around N=500 mentors and N=3500 students. And this ratio can be substantially different for different counties (N=62). What would be more efficient programming dealing with multiple passes? (SAS 9.4)
/*MENTORS*/
data mentor;
input mentor_ID $User_X User_Y;
cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;
run;
/*STUDENTS*/
data students;
input student_ID $ User_X User_Y;
cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;
run;
/*CALCULATION OF DISTANCE BETWEEN MENTORS AND STUDENTS*/
proc sql;
create table distance(keep=mentor_id student_id distance) as
select t1.*,
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor t1, students t2
group by t1.mentor_id
having (calculated distance)=calculated distance;
quit;
/*KEEP MINIMUM DISTANCE*/
/*minimum distance*/
proc sql;
create table dist_min(keep=mentor_id student_id distance) as
select t1.*,
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor t1, students t2
group by t1.mentor_id
having min(calculated distance)=calculated distance;
quit;
/*CHECK OUTPUTS*/
proc print data=distance; run;
proc print data=dist_min; run;
/*MENTOR TO STUDENT / ONE-TO-ONE MATCHES*/;
/**pass1 */
proc sort data=dist_min;
by student_id distance;
run;
data dist_min_keep; set dist_min;
by student_id distance;
if first.student_id then output;
run;
/*MENTORS MATCHED TO MANY STUDENTS*/
*PASS2;
proc sort data=mentor; by mentor_id; run;
proc sort data=dist_min_keep; by mentor_id distance; run;
data mentor2(where=(distance = .));
merge mentor (in=a) dist_min_keep (in=b);
by mentor_id;
drop student_id;
if a;
run;
data mentor2a(drop=distance); set mentor2;
run;
/*STUDENTS MATCHED TO SAME MENTORS*/
proc sort data=students; by student_id; run;
proc sort data=dist_min_keep; by student_id; run;
data student2(where=(distance=.));
merge students (in=a) dist_min_keep (in=b);
by student_id;
drop mentor_id;
if a;
run;
data student2a(drop=distance); set student2;
run;
*PASS2;
/*CALCULATE AND KEEP MINIMUM DISTANCE AMONG NOT DISTINCTLY MATCHED MENTORS AND STUDENTS FROM THE FIRST PASS */
proc sql;
create table dist_2p (keep=mentor_id student_id distance) as
select
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t1.mentor_id,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor2a t1, student2a t2
group by t1.mentor_id
having min(calculated distance)=calculated distance;
quit;
data final_matches; set Dist_min_keep dist_2p; run;
proc print data=mentor;
proc print data=students;
proc print data=final_matches; run;
Your input datasteps don't include the other three variables and you are multiplying a number by 1 in your proc sql call.
Are you trying to do something like the following?:
/*MENTORS*/ data mentor; input mentor_ID $User_X User_Y a b c; cards; U1 53.8 -32.1 0 23 1 U2 23.8 -96.3 1 25 2 U3 34.5 -70.7 0 28 1 U4 28.7 -76.5 1 35 2 ; /*STUDENTS*/ data students; input student_ID $ User_X User_Y a b c; cards; A1 53.6 -32.1 1 63 1 A2 35.6 -12.3 0 25 2 A3 63.4 -85.4 1 69 1 A4 34.5 -70.9 0 45 2 A5 37.8 -77.7 1 55 1 ; proc sql; create table distance(keep=mentor_id student_id distance) as select t1.*, t1.user_x as x1, t1.user_y as y1, t2.user_x as x2, t2.user_y as y2, t2.student_id, geodist(x1, y1, x2, y2) as distance from mentor t1, students t2 order by mentor_id, calculated distance /* group by t1.mentor_id */ /* having (calculated distance)=calculated distance */ ; quit; data want (keep=mentor_id student_id distance); array students(999) $; set distance; retain i needone students:; by mentor_id; if _n_ eq 1 then i=0; if first.mentor_id then needone=1; if needone then do; if student_id not in students then do; i+1; needone=0; students(i)=student_id; output; end; end; run;
Art, CEO, AnalystFinder.com
Your input datasteps don't include the other three variables and you are multiplying a number by 1 in your proc sql call.
Are you trying to do something like the following?:
/*MENTORS*/ data mentor; input mentor_ID $User_X User_Y a b c; cards; U1 53.8 -32.1 0 23 1 U2 23.8 -96.3 1 25 2 U3 34.5 -70.7 0 28 1 U4 28.7 -76.5 1 35 2 ; /*STUDENTS*/ data students; input student_ID $ User_X User_Y a b c; cards; A1 53.6 -32.1 1 63 1 A2 35.6 -12.3 0 25 2 A3 63.4 -85.4 1 69 1 A4 34.5 -70.9 0 45 2 A5 37.8 -77.7 1 55 1 ; proc sql; create table distance(keep=mentor_id student_id distance) as select t1.*, t1.user_x as x1, t1.user_y as y1, t2.user_x as x2, t2.user_y as y2, t2.student_id, geodist(x1, y1, x2, y2) as distance from mentor t1, students t2 order by mentor_id, calculated distance /* group by t1.mentor_id */ /* having (calculated distance)=calculated distance */ ; quit; data want (keep=mentor_id student_id distance); array students(999) $; set distance; retain i needone students:; by mentor_id; if _n_ eq 1 then i=0; if first.mentor_id then needone=1; if needone then do; if student_id not in students then do; i+1; needone=0; students(i)=student_id; output; end; end; run;
Art, CEO, AnalystFinder.com
Amazing art297 as usual.
If I want mentors and students match on other variables a,b,c would I just be able to throw them into sql like below?
proc sql;
create table distance(keep=mentor_id student_id distance) as
select t1.*,
t1.user_x as x1,
t1.user_y as y1,
t2.user_x as x2,
t2.user_y as y2,
t2.student_id,
geodist(x1, y1, x2, y2) as distance
from mentor t1 join students t2
on t1.a=t2.a and t1.b=t2.b and t1.c=t2.c
order by mentor_id, calculated distance
;
quit;
I would think so, but would need some example data in order to test it.
Art, CEO, AnalystFinder.com
Please see data as suggested. I think it worked out. But I'm curious if any comments.
Does it below linkage make sense if i needed impose certain conditions on the linkage?
/*MENTORS*/
data mentor;
input mentor_ID $User_X User_Y group age gender;
cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;
/*STUDENTS*/
data students;
input student_ID $ User_X User_Y group age gender;
cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;
proc sql;
create table distance(drop=x1 x2 y1 y2 user_x user_y) as
select t1.*,
t1.user_x as x1,
t1.user_y as y1,
t2.user_x as x2,
t2.user_y as y2,
t2.student_id,
t2.age as student_age,
t2.gender as student_gender,
t2.group as student_group,
geodist(x1, y1, x2, y2) as distance
from mentor t1, students t2
order by mentor_id, calculated distance
/* group by t1.mentor_id */
/* having (calculated distance)=calculated distance */
;
quit;
data want(keep=mentor_ID group age gender student_ID distance student_age student_gender student_group);
array students(999) $;
set distance;
retain i needone students:;
by mentor_id;
if _n_ eq 1 then i=0;
if first.mentor_id then needone=1;
if needone then do;
if student_id not in students then do;
i+1;
needone=0;
students(i)=student_id;
output;
end;
end;
run;
/*MENTORS*/
data mentor;
input mentor_ID $User_X User_Y group age gender;
cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;
/*STUDENTS*/
data students;
input student_ID $ User_X User_Y group age gender;
cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 35 2
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 25 2
;
proc sql;
create table distance1 as
select t1.*,
t1.user_x as x1,
t1.user_y as y1,
t2.user_x as x2,
t2.user_y as y2,
t2.student_id,
t2.age as student_age,
t2.gender as student_gender,
t2.group as student_group,
geodist(x1, y1, x2, y2) as distance
from mentor t1 join students t2
on t1.age=t2.age and t1.group=t2.group and t1.gender=t2.gender
order by mentor_id, calculated distance
/* group by t1.mentor_id */
/* having (calculated distance)=calculated distance */
;
quit;
data want1 (keep=mentor_id student_id distance age group gender student_age student_gender student_group);
array students(999) $;
set distance1;
retain i needone students:;
by mentor_id;
if _n_ eq 1 then i=0;
if first.mentor_id then needone=1;
if needone then do;
if student_id not in students then do;
i+1;
needone=0;
students(i)=student_id;
output;
end;
end;
run;
proc print data=want;
title "Linked by nearest distance";
run;
proc print data=want1;
title "Linked by nearest distance given other conditions met";
run;
Not enough data to give it a sufficient test but, again, I don't see why it wouldn't work with the conditions you added.
With your example, it selects the two pairs that match.
Art, CEO, AnalystFinder.com
SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!
Learn the difference between classical and Bayesian statistical approaches and see a few PROC examples to perform Bayesian analysis in this video.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.