A proposed solution. Some of the hints are already given in this forum. I am looking to match pairs with the smallest score differences. One can at some later point remove the pairs with a score difference greater than the minimum wanted. Same sample datasets. /*********************/ /***** DataSet 1 *****/ /*********************/ data t_a(keep=id1 zScore); do id = 1 to 10000; id1 = compress('A'||put(id, z5.)); zScore= ceil(100000*ranuni(3)); output; end; run; /*********************/ /***** DataSet 2 *****/ /*********************/ data t_b(keep=id2 zScore); do id = 1 to 50000; id2 = compress('B'||put(id, z5.)); zScore= ceil(200000*ranuni(3)); output; end; run; /**************************************/ /** One can do a full Cartesian Join **/ /** or one can adjust the range **/ /**************************************/ PROC SQL; create table t_c as select a.id1, b.id2, a.zScore as zS1, b.zScore as zS2, (a.zScore-b.zScore) as difx, abs(a.zScore-b.zScore) as difa from t_a a, t_b b where (-100 <= (a.zScore-b.zScore) <=100); /*********************************/ /** check if all id1 are in t_c **/ /*********************************/ select count(distinct id1) as c_id1 from t_c; QUIT; proc sort data=t_c; by difa difx; run; /*****************************/ /**** the 1-to-1 matching ****/ /*****************************/ data t_want(keep=id1 id2 difa difx zS1 zS2); if _N_=1 then do; declare hash ha(multidata:'N'); ha.definekey('mbr1'); ha.definedata('mbr1'); ha.definedone(); declare hash hb(multidata:'N'); hb.definekey('mbr2'); hb.definedata('mbr2'); hb.definedone(); end; do until (aDone); set t_c; mbr1=id1; mbr2=id2; k1=ha.find(); k2=hb.find(); if (k1 and k2) then do; ha.add(); hb.add(); output; end; end; run; /***************************/ /**** some idiot checks ****/ /***************************/ proc sql; select count(distinct id1) as c_id1, count(distinct id2) as c_id2 from t_want; select difa,difx, sum(1) as c_pairs from t_want group by difa, difx; quit;
... View more