Auto-suggest helps you quickly narrow down your search results by suggesting possible matches as you type.

Showing results for

Find a Community

- Home
- /
- SAS Programming
- /
- SAS Procedures
- /
- Match/Link with multiple passes

- RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page

- Mark as New
- Bookmark
- Subscribe
- RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

12-04-2017 12:56 PM

I'd perform one-to-one mentor-to-student match/linkage. One mentor can take only one student and also only if nearest in their residential location.

Program below works in this demo settings. However, have no idea how many passes would do the job in my real data where I have around N=500 mentors and N=3500 students. And this ratio can be substantially different for different counties (N=62). What would be more efficient programming dealing with multiple passes? (SAS 9.4)

```
/*MENTORS*/
data mentor;
input mentor_ID $User_X User_Y;
cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;
run;
/*STUDENTS*/
data students;
input student_ID $ User_X User_Y;
cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;
run;
/*CALCULATION OF DISTANCE BETWEEN MENTORS AND STUDENTS*/
proc sql;
create table distance(keep=mentor_id student_id distance) as
select t1.*,
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor t1, students t2
group by t1.mentor_id
having (calculated distance)=calculated distance;
quit;
/*KEEP MINIMUM DISTANCE*/
/*minimum distance*/
proc sql;
create table dist_min(keep=mentor_id student_id distance) as
select t1.*,
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor t1, students t2
group by t1.mentor_id
having min(calculated distance)=calculated distance;
quit;
/*CHECK OUTPUTS*/
proc print data=distance; run;
proc print data=dist_min; run;
/*MENTOR TO STUDENT / ONE-TO-ONE MATCHES*/;
/**pass1 */
proc sort data=dist_min;
by student_id distance;
run;
data dist_min_keep; set dist_min;
by student_id distance;
if first.student_id then output;
run;
/*MENTORS MATCHED TO MANY STUDENTS*/
*PASS2;
proc sort data=mentor; by mentor_id; run;
proc sort data=dist_min_keep; by mentor_id distance; run;
data mentor2(where=(distance = .));
merge mentor (in=a) dist_min_keep (in=b);
by mentor_id;
drop student_id;
if a;
run;
data mentor2a(drop=distance); set mentor2;
run;
/*STUDENTS MATCHED TO SAME MENTORS*/
proc sort data=students; by student_id; run;
proc sort data=dist_min_keep; by student_id; run;
data student2(where=(distance=.));
merge students (in=a) dist_min_keep (in=b);
by student_id;
drop mentor_id;
if a;
run;
data student2a(drop=distance); set student2;
run;
*PASS2;
/*CALCULATE AND KEEP MINIMUM DISTANCE AMONG NOT DISTINCTLY MATCHED MENTORS AND STUDENTS FROM THE FIRST PASS */
proc sql;
create table dist_2p (keep=mentor_id student_id distance) as
select
t1.user_x*1 as x1,
t1.user_y*1 as y1,
t1.mentor_id,
t2.user_x*1 as x2,
t2.user_y*1 as y2,
t2.student_id,
geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance
from mentor2a t1, student2a t2
group by t1.mentor_id
having min(calculated distance)=calculated distance;
quit;
data final_matches; set Dist_min_keep dist_2p; run;
proc print data=mentor;
proc print data=students;
proc print data=final_matches; run;
```

Accepted Solutions

Solution

12-04-2017
06:33 PM

- Mark as New
- Bookmark
- Subscribe
- RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Posted in reply to Cruise

12-04-2017 01:34 PM - edited 12-04-2017 02:06 PM

Your input datasteps don't include the other three variables and you are multiplying a number by 1 in your proc sql call.

Are you trying to do something like the following?:

/*MENTORS*/ data mentor; input mentor_ID $User_X User_Y a b c; cards; U1 53.8 -32.1 0 23 1 U2 23.8 -96.3 1 25 2 U3 34.5 -70.7 0 28 1 U4 28.7 -76.5 1 35 2 ; /*STUDENTS*/ data students; input student_ID $ User_X User_Y a b c; cards; A1 53.6 -32.1 1 63 1 A2 35.6 -12.3 0 25 2 A3 63.4 -85.4 1 69 1 A4 34.5 -70.9 0 45 2 A5 37.8 -77.7 1 55 1 ; proc sql; create table distance(keep=mentor_id student_id distance) as select t1.*, t1.user_x as x1, t1.user_y as y1, t2.user_x as x2, t2.user_y as y2, t2.student_id, geodist(x1, y1, x2, y2) as distance from mentor t1, students t2 order by mentor_id, calculated distance /* group by t1.mentor_id */ /* having (calculated distance)=calculated distance */ ; quit; data want (keep=mentor_id student_id distance); array students(999) $; set distance; retain i needone students:; by mentor_id; if _n_ eq 1 then i=0; if first.mentor_id then needone=1; if needone then do; if student_id not in students then do; i+1; needone=0; students(i)=student_id; output; end; end; run;

Art, CEO, AnalystFinder.com

All Replies

Solution

12-04-2017
06:33 PM

- Mark as New
- Bookmark
- Subscribe
- RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Posted in reply to Cruise

12-04-2017 01:34 PM - edited 12-04-2017 02:06 PM

Your input datasteps don't include the other three variables and you are multiplying a number by 1 in your proc sql call.

Are you trying to do something like the following?:

/*MENTORS*/ data mentor; input mentor_ID $User_X User_Y a b c; cards; U1 53.8 -32.1 0 23 1 U2 23.8 -96.3 1 25 2 U3 34.5 -70.7 0 28 1 U4 28.7 -76.5 1 35 2 ; /*STUDENTS*/ data students; input student_ID $ User_X User_Y a b c; cards; A1 53.6 -32.1 1 63 1 A2 35.6 -12.3 0 25 2 A3 63.4 -85.4 1 69 1 A4 34.5 -70.9 0 45 2 A5 37.8 -77.7 1 55 1 ; proc sql; create table distance(keep=mentor_id student_id distance) as select t1.*, t1.user_x as x1, t1.user_y as y1, t2.user_x as x2, t2.user_y as y2, t2.student_id, geodist(x1, y1, x2, y2) as distance from mentor t1, students t2 order by mentor_id, calculated distance /* group by t1.mentor_id */ /* having (calculated distance)=calculated distance */ ; quit; data want (keep=mentor_id student_id distance); array students(999) $; set distance; retain i needone students:; by mentor_id; if _n_ eq 1 then i=0; if first.mentor_id then needone=1; if needone then do; if student_id not in students then do; i+1; needone=0; students(i)=student_id; output; end; end; run;

Art, CEO, AnalystFinder.com

- Mark as New
- Bookmark
- Subscribe
- RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Posted in reply to art297

12-06-2017 01:28 PM

Amazing art297 as usual.

If I want mentors and students match on other variables a,b,c would I just be able to throw them into sql like below?

```
proc sql;
create table distance(keep=mentor_id student_id distance) as
select t1.*,
t1.user_x as x1,
t1.user_y as y1,
t2.user_x as x2,
t2.user_y as y2,
t2.student_id,
geodist(x1, y1, x2, y2) as distance
from mentor t1 join students t2
on t1.a=t2.a and t1.b=t2.b and t1.c=t2.c
order by mentor_id, calculated distance
;
quit;
```

- Mark as New
- Bookmark
- Subscribe
- RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Posted in reply to Cruise

12-06-2017 01:57 PM

I would think so, but would need some example data in order to test it.

Art, CEO, AnalystFinder.com

- Mark as New
- Bookmark
- Subscribe
- RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Posted in reply to Cruise

12-06-2017 02:49 PM

Please see data as suggested. I think it worked out. But I'm curious if any comments.

Does it below linkage make sense if i needed impose certain conditions on the linkage?

```
/*MENTORS*/
data mentor;
input mentor_ID $User_X User_Y group age gender;
cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;
/*STUDENTS*/
data students;
input student_ID $ User_X User_Y group age gender;
cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;
proc sql;
create table distance(drop=x1 x2 y1 y2 user_x user_y) as
select t1.*,
t1.user_x as x1,
t1.user_y as y1,
t2.user_x as x2,
t2.user_y as y2,
t2.student_id,
t2.age as student_age,
t2.gender as student_gender,
t2.group as student_group,
geodist(x1, y1, x2, y2) as distance
from mentor t1, students t2
order by mentor_id, calculated distance
/* group by t1.mentor_id */
/* having (calculated distance)=calculated distance */
;
quit;
data want(keep=mentor_ID group age gender student_ID distance student_age student_gender student_group);
array students(999) $;
set distance;
retain i needone students:;
by mentor_id;
if _n_ eq 1 then i=0;
if first.mentor_id then needone=1;
if needone then do;
if student_id not in students then do;
i+1;
needone=0;
students(i)=student_id;
output;
end;
end;
run;
/*MENTORS*/
data mentor;
input mentor_ID $User_X User_Y group age gender;
cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;
/*STUDENTS*/
data students;
input student_ID $ User_X User_Y group age gender;
cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 35 2
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 25 2
;
proc sql;
create table distance1 as
select t1.*,
t1.user_x as x1,
t1.user_y as y1,
t2.user_x as x2,
t2.user_y as y2,
t2.student_id,
t2.age as student_age,
t2.gender as student_gender,
t2.group as student_group,
geodist(x1, y1, x2, y2) as distance
from mentor t1 join students t2
on t1.age=t2.age and t1.group=t2.group and t1.gender=t2.gender
order by mentor_id, calculated distance
/* group by t1.mentor_id */
/* having (calculated distance)=calculated distance */
;
quit;
data want1 (keep=mentor_id student_id distance age group gender student_age student_gender student_group);
array students(999) $;
set distance1;
retain i needone students:;
by mentor_id;
if _n_ eq 1 then i=0;
if first.mentor_id then needone=1;
if needone then do;
if student_id not in students then do;
i+1;
needone=0;
students(i)=student_id;
output;
end;
end;
run;
proc print data=want;
title "Linked by nearest distance";
run;
proc print data=want1;
title "Linked by nearest distance given other conditions met";
run;
```

- Mark as New
- Bookmark
- Subscribe
- RSS Feed
- Permalink
- Email to a Friend
- Report Inappropriate Content

Posted in reply to Cruise

12-06-2017 03:11 PM

Not enough data to give it a sufficient test but, again, I don't see why it wouldn't work with the conditions you added.

With your example, it selects the two pairs that match.

Art, CEO, AnalystFinder.com