Solved: Re: Match/Link with multiple passes

Cruise · Posted 12-04-2017 12:56 PM

I'd like to perform one-to-one mentor-to-student match/linkage. One mentor can take only one student and also only if nearest in their residential location.

Program below works in this demo settings. However, have no idea how many passes would do the job in my real data where I have around N=500 mentors and N=3500 students. And this ratio can be substantially different for different counties (N=62). What would be more efficient programming dealing with multiple passes? (SAS 9.4)

/*MENTORS*/
data mentor;
 input mentor_ID $User_X User_Y;
 cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;
run;

/*STUDENTS*/
data students;
 input student_ID $ User_X	User_Y;
 cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;
run;

/*CALCULATION OF DISTANCE BETWEEN MENTORS AND STUDENTS*/
proc sql;
create table distance(keep=mentor_id student_id distance) as
select t1.*, 
    t1.user_x*1 as x1,
    t1.user_y*1 as y1,
    t2.user_x*1 as x2,
    t2.user_y*1 as y2,
	t2.student_id,
    geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance

from mentor t1, students t2
group by t1.mentor_id
having (calculated distance)=calculated distance;
quit; 

/*KEEP MINIMUM DISTANCE*/

/*minimum distance*/
proc sql;
create table dist_min(keep=mentor_id student_id distance) as
select t1.*, 
    t1.user_x*1 as x1,
    t1.user_y*1 as y1,
    t2.user_x*1 as x2,
    t2.user_y*1 as y2,
	t2.student_id,
    geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance

from mentor t1, students t2
group by t1.mentor_id
having min(calculated distance)=calculated distance;
quit;

/*CHECK OUTPUTS*/
proc print data=distance; run;
proc print data=dist_min; run; 

/*MENTOR TO STUDENT / ONE-TO-ONE MATCHES*/;
/**pass1 */
proc sort data=dist_min;
by student_id distance;
run;
data dist_min_keep; set dist_min;
by student_id distance;
if first.student_id then output;
run;

/*MENTORS MATCHED TO MANY STUDENTS*/
*PASS2;
proc sort data=mentor; by mentor_id; run;
proc sort data=dist_min_keep; by mentor_id distance; run;
data mentor2(where=(distance = .));
merge mentor (in=a) dist_min_keep (in=b);
by mentor_id;
drop student_id;
if a;
run;
data mentor2a(drop=distance); set mentor2;
run;

/*STUDENTS MATCHED TO SAME MENTORS*/
proc sort data=students; by student_id; run;
proc sort data=dist_min_keep; by student_id; run;
data student2(where=(distance=.));
merge students (in=a) dist_min_keep (in=b);
by student_id;
drop mentor_id;
if a;
run;
data student2a(drop=distance); set student2;
run;

*PASS2;
/*CALCULATE AND KEEP MINIMUM DISTANCE AMONG NOT DISTINCTLY MATCHED MENTORS AND STUDENTS FROM THE FIRST PASS */
proc sql;
create table dist_2p (keep=mentor_id student_id distance) as
select
    t1.user_x*1 as x1,
    t1.user_y*1 as y1,
	t1.mentor_id,
    t2.user_x*1 as x2,
    t2.user_y*1 as y2,
	t2.student_id,
    geodist(calculated x1, calculated y1, calculated x2, calculated y2) as distance

from mentor2a t1, student2a t2
group by t1.mentor_id
having min(calculated distance)=calculated distance;
quit;

data final_matches; set Dist_min_keep dist_2p; run;  

proc print data=mentor;
proc print data=students;
proc print data=final_matches; run;

art297 · Posted 12-04-2017 01:34 PM

Your input datasteps don't include the other three variables and you are multiplying a number by 1 in your proc sql call.

Are you trying to do something like the following?:

/*MENTORS*/
data mentor;
 input mentor_ID $User_X User_Y a b c;
 cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;

/*STUDENTS*/
data students;
 input student_ID $ User_X	User_Y a b c;
 cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;

proc sql;
  create table distance(keep=mentor_id student_id distance) as
    select t1.*, 
    t1.user_x as x1,
    t1.user_y as y1,
    t2.user_x as x2,
    t2.user_y as y2,
	t2.student_id,
    geodist(x1, y1, x2, y2) as distance

      from mentor t1, students t2
        order by mentor_id, calculated distance
/*         group by t1.mentor_id */
/*           having (calculated distance)=calculated distance */
  ;
quit;

data want (keep=mentor_id student_id distance);
  array students(999) $;
  set distance;
  retain i needone students:;
  by mentor_id;
  if _n_ eq 1 then i=0;
  if first.mentor_id then needone=1;
  if needone then do;
    if student_id not in students then do;
      i+1;
      needone=0;
      students(i)=student_id;
      output;
    end;
  end;
run;

Art, CEO, AnalystFinder.com

View solution in original post

art297 · Posted 12-04-2017 01:34 PM

Your input datasteps don't include the other three variables and you are multiplying a number by 1 in your proc sql call.

Are you trying to do something like the following?:

/*MENTORS*/
data mentor;
 input mentor_ID $User_X User_Y a b c;
 cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;

/*STUDENTS*/
data students;
 input student_ID $ User_X	User_Y a b c;
 cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;

proc sql;
  create table distance(keep=mentor_id student_id distance) as
    select t1.*, 
    t1.user_x as x1,
    t1.user_y as y1,
    t2.user_x as x2,
    t2.user_y as y2,
	t2.student_id,
    geodist(x1, y1, x2, y2) as distance

      from mentor t1, students t2
        order by mentor_id, calculated distance
/*         group by t1.mentor_id */
/*           having (calculated distance)=calculated distance */
  ;
quit;

data want (keep=mentor_id student_id distance);
  array students(999) $;
  set distance;
  retain i needone students:;
  by mentor_id;
  if _n_ eq 1 then i=0;
  if first.mentor_id then needone=1;
  if needone then do;
    if student_id not in students then do;
      i+1;
      needone=0;
      students(i)=student_id;
      output;
    end;
  end;
run;

Art, CEO, AnalystFinder.com

Cruise · Posted 12-06-2017 01:28 PM

@art297

Amazing art297 as usual.

If I want mentors and students match on other variables a,b,c would I just be able to throw them into sql like below?

proc sql;
  create table distance(keep=mentor_id student_id distance) as
    select t1.*, 
    t1.user_x as x1,
    t1.user_y as y1,
    t2.user_x as x2,
    t2.user_y as y2,
	t2.student_id,
    geodist(x1, y1, x2, y2) as distance

      from mentor t1 join students t2
      on t1.a=t2.a and t1.b=t2.b and t1.c=t2.c  
        order by mentor_id, calculated distance
  ;
quit;

art297 · Posted 12-06-2017 01:57 PM

I would think so, but would need some example data in order to test it.

Art, CEO, AnalystFinder.com

Cruise · Posted 12-06-2017 02:49 PM

Please see data as suggested. I think it worked out. But I'm curious if any comments.

Does it below linkage make sense if i needed impose certain conditions on the linkage?

@art297

/*MENTORS*/
data mentor;
 input mentor_ID $User_X User_Y group age gender;
 cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;

/*STUDENTS*/
data students;
 input student_ID $ User_X	User_Y group age gender;
 cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 69 1
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 55 1
;

proc sql;
  create table distance(drop=x1 x2 y1 y2 user_x user_y) as
    select t1.*, 
    t1.user_x as x1,
    t1.user_y as y1,
    t2.user_x as x2,
    t2.user_y as y2,
	t2.student_id,
	t2.age as student_age,
	t2.gender as student_gender,
	t2.group as student_group,
    geodist(x1, y1, x2, y2) as distance

      from mentor t1, students t2
        order by mentor_id, calculated distance
/*         group by t1.mentor_id */
/*           having (calculated distance)=calculated distance */
  ;
quit;

data want(keep=mentor_ID group age gender student_ID distance student_age student_gender student_group);
  array students(999) $;
  set distance;
  retain i needone students:;
  by mentor_id;
  if _n_ eq 1 then i=0;
  if first.mentor_id then needone=1;
  if needone then do;
    if student_id not in students then do;
      i+1;
      needone=0;
      students(i)=student_id;
      output;
    end;
  end;
run; 


/*MENTORS*/
data mentor;
 input mentor_ID $User_X User_Y group age gender;
 cards;
U1 53.8 -32.1 0 23 1
U2 23.8 -96.3 1 25 2
U3 34.5 -70.7 0 28 1
U4 28.7 -76.5 1 35 2
;

/*STUDENTS*/
data students;
 input student_ID $ User_X	User_Y group age gender;
 cards;
A1 53.6 -32.1 1 63 1
A2 35.6 -12.3 0 25 2
A3 63.4 -85.4 1 35 2
A4 34.5 -70.9 0 45 2
A5 37.8 -77.7 1 25 2
;

proc sql;
  create table distance1 as
    select t1.*, 
    t1.user_x as x1,
    t1.user_y as y1,
    t2.user_x as x2,
    t2.user_y as y2,
	t2.student_id,
	t2.age as student_age,
	t2.gender as student_gender,
	t2.group as student_group,
    geodist(x1, y1, x2, y2) as distance

      from mentor t1 join students t2
	  on t1.age=t2.age and t1.group=t2.group and t1.gender=t2.gender 
        order by mentor_id, calculated distance
/*         group by t1.mentor_id */
/*           having (calculated distance)=calculated distance */
  ;
quit;

data want1 (keep=mentor_id student_id distance age group gender student_age student_gender student_group);
  array students(999) $;
  set distance1;
  retain i needone students:;
  by mentor_id;
  if _n_ eq 1 then i=0;
  if first.mentor_id then needone=1;
  if needone then do;
    if student_id not in students then do;
      i+1;
      needone=0;
      students(i)=student_id;
      output;
    end;
  end;
run; 

proc print data=want;
title "Linked by nearest distance";
run;

proc print data=want1;
title "Linked by nearest distance given other conditions met";
run;

art297 · Posted 12-06-2017 03:11 PM

Not enough data to give it a sufficient test but, again, I don't see why it wouldn't work with the conditions you added.

With your example, it selects the two pairs that match.

Art, CEO, AnalystFinder.com

Ready to join fellow brilliant minds for the SAS Hackathon?

Classroom Training Available!