Here is a fuzz matched algorithm I wrote before (could be apply to Chinese or Japanese ..... Characters).
data a;
infile cards truncover;
input a $200.;
cards;
Jon
John
Johnny
Will
Willy
Willie
Milly
;
data b;
infile cards truncover;
input b $200.;
cards;
Johnathan
William
;
proc fcmp outlib=work.math.func;
function fuzz_match(a $,b $) ;
length new_a new_b str_short str_long $ 200;
new_a=kcompress(a,,'kad');
new_b=kcompress(b,,'kad');
if klength(a)<klength(b) then do;
str_short=new_a;str_long=new_b;
end;
else do;
str_short=new_b;str_long=new_a;
end;
array count{10} _temporary_ (0 0 0 0 0 0 0 0 0 0); /*The count of one string ,two string , three string....*/
array weight{10} _temporary_ (2 4 8 16 32 64 128 256 512 1024); /*The weight of one string ,two string , three string....*/
max_str_len=min(10,klength(a),klength(b)); /*Search the max length of str is 10*/
do len=1 to max_str_len; /*Search one string ,two string , three string .......*/
n=0;
do start=1 to klength(str_short)+1-len; /*Search sub-string in long string*/
if kfind(str_long,strip(ksubstr(str_short,start,len))) then n+1;
end;
count{len}=n;
end;
sum=0;w=0;mean=0;
do k=1 to max_str_len;
if count{k} ne 0 then do;sum+count{k}*weight{k}; w+weight{k};end;
end;
/*Calculated weight mean
if w=0 then mean=0;
else mean=sum/w;
*/
if a=b then sum=constant('exactint');
return (sum); /*return weight sum or weight mean(mean)*/
endsub;
run;
options cmplib=work.math;
proc sql;
create table want as
select a,b,fuzz_match(a,b) as weight
from a, b
order by a,weight desc;
quit;
... View more