The first step is to combine all the variables into a single variable,
i.e. build one character column that holds the whole row.
The code below should give you a starting point.
/* Build WORK.HAVE: one record per input row, holding a sequential id (obs)
   and a single '|'-delimited string (row) made from all input variables. */
data have;
  set sashelp.heart(obs=100);
  length row $ 200;            /* values longer than 200 characters are truncated */
  /* _ALL_ only covers variables already defined at this point of the step,
     so row must be built BEFORE the obs counter is introduced; otherwise the
     row id would be appended to the text and distort the comparison. */
  row=catx('|',of _all_);
  obs+1;                       /* sum statement: retained row counter 1, 2, 3, ... */
  keep obs row;
run;
proc fcmp outlib=work.math.func;
/*
  fuzz_match(a, b) - similarity score between two character strings.

  Both arguments are first reduced to letters, digits and the '|' delimiter.
  For every substring length 1..10, the function counts how many substrings
  of the shorter cleaned string occur somewhere in the longer cleaned string.
  Each non-zero count is multiplied by the weight for that length
  (2, 4, 8, ..., 1024) and the products are summed.

  Arguments : a, b - character strings (only the first 200 characters of the
                     cleaned text are used).
  Returns   : the weighted sum; constant('exactint') when a and b are equal.
*/
function fuzz_match(a $,b $) ;
  length new_a new_b str_short str_long $ 200; /* maximum supported string length is 200 */

  /* 'kad' = Keep Alphabetic characters and Digits (plus the listed '|'). */
  new_a=kcompress(a,'|','kad');
  new_b=kcompress(b,'|','kad');

  /* Lengths must be taken from the cleaned strings, because those are the
     strings that are actually scanned below. */
  len_a=klength(new_a);
  len_b=klength(new_b);

  if len_a<len_b then do;
    str_short=new_a;str_long=new_b;
  end;
  else do;
    str_short=new_b;str_long=new_a;
  end;

  array count{10} _temporary_ (0 0 0 0 0 0 0 0 0 0); /* matches found per substring length 1..10 */
  array weight{10} _temporary_ (2 4 8 16 32 64 128 256 512 1024); /* weight per substring length 1..10 */

  max_str_len=min(10,len_a,len_b); /* substrings longer than 10 characters are not searched */

  do len=1 to max_str_len; /* substring length 1, 2, 3, ... */
    n=0;
    do start=1 to klength(str_short)+1-len; /* slide a window of this length over the short string */
      if kfind(str_long,strip(ksubstr(str_short,start,len))) then n+1;
    end;
    count{len}=n; /* entries 1..max_str_len are overwritten on every call */
  end;

  sum=0;w=0;mean=0;
  do k=1 to max_str_len;
    if count{k} ne 0 then do;sum+count{k}*weight{k}; w+weight{k};end;
  end;

  /* Alternative result: weighted mean instead of weighted sum
  if w=0 then mean=0;
  else mean=sum/w;
  */

  /* Identical inputs always get the highest possible score. */
  if a=b then sum=constant('exactint');
  return (sum); /* return the weighted sum (or use mean for the weighted mean) */
endsub;
run;
/* Tell SAS where to look for the compiled FCMP functions. */
options cmplib=work.math;

/* Compare every row of HAVE with every other row and score each pair.
   For each obs the candidate rows are listed with the highest score first. */
proc sql;
  create table want as
  select
    a.obs,
    a.row as row1,
    b.obs as obs2,
    b.row as row2,
    fuzz_match(a.row,b.row) as weight
  from have as a
  inner join have as b
    on a.obs ne b.obs   /* all pairs except a row with itself */
  order by a.obs, weight desc;
quit;
... View more