Here is what I have at this point. It is not exactly what Peter Norvig does, but it is where I am so far. * Based on : http://norvig.com/spell-correct.html;
/* Inputs: the misspelled word to correct (&word) and the two text sources
   that are read below to build the word-frequency table. */
%let word=speling;
filename big '/temp/big.txt'; * http://norvig.com/big.txt;
filename words '/usr/share/dict/words'; * unix default dictionary (provided in my other word puzzle related posts);
/* Load the dictionary file: one record per line, upper-cased on input
   by the $UPCASE informat, at most 48 characters kept. */
data words;
  infile words truncover;
  input
    word $upcase48.
  ;
run;
/* Tokenise the corpus file into one upper-cased word per output row. */
data big;
  length word $48;
  infile big lrecl=1024 truncover;

  /* Load the record into _INFILE_ without reading any variables. */
  input @;

  /* Every non-letter becomes a blank; runs of blanks collapse to one. */
  _infile_ = compbl(prxchange('s/[^A-Z]/ /i', -1, _infile_));

  if not (_infile_ = '') then do pos = 1 to countw(_infile_, ' ');
    word = upcase(scan(_infile_, pos, ' '));
    if not (word = '') then output;
  end;

  drop pos;
run;
/* Stack the dictionary words and the corpus words into a single table. */
data words;
  set words
      big;
  word = strip(word);
run;

/* One row per distinct word with its frequency in COUNT; nothing is printed. */
proc freq data=words;
  tables word / list noprint out=wfreq(drop=percent);
run;
%macro wf_find;
  %* Shared lookup step - avoids repeating this code block for each correction type. ;
  %* Expects the calling DATA step to provide hash WF keyed on WORD, plus ORIG_WORD. ;
  if wf.find() = 0 then do;
    /* Candidate exists in the dictionary: keep it when within two edits. */
    clev = complev(orig_word, word);
    if clev <= 2 then output;
  end;
%mend wf_find;
/* Build the candidate corrections for &word.

   Reads : WORK.WFREQ (WORD, COUNT), loaded into a hash keyed on WORD.
   Writes: WORK.CORRECTIONS with ORIG_WORD, WORD, COUNT and CLEV, where CLEV
           is the COMPLEV edit distance between ORIG_WORD and WORD.
           A word can be output more than once (once by an edit loop and
           once by the full scan), so consumers should de-duplicate.

   Each edit loop rebuilds WORD from ORIG_WORD, applies one edit, and calls
   %wf_find, which outputs the candidate when it is a dictionary word within
   distance 2. */
data corrections;
  length word a b c $48;
  orig_word = upcase("&word");
  alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';

  /* Length of the misspelled word, computed once. Every edit loop is bounded
     by the ORIGINAL word, never by the scratch variable WORD, which the
     edits and wf.find() keep overwriting. */
  n = length(orig_word);

  /* Never executes; only makes the WFREQ variables known to the DATA step
     so the hash object can use them as host variables. */
  if 0 then set wfreq;
  declare hash wf(hashexp:10, dataset:'wfreq');
  declare hiter wfi('wf');
  wf.definekey('word');
  wf.definedata(all:'Y');
  wf.definedone();

  /* replaces: substitute each position with each letter */
  do i = 1 to n;
    do ii = 1 to 26;
      word = orig_word;
      substr(word, i, 1) = substr(alphabet, ii, 1);
      %wf_find
    end;
  end;

  /* deletes: blank out one position, then squeeze the blank away */
  do i = 1 to n;
    word = orig_word;
    substr(word, i, 1) = '';
    word = compress(word);
    %wf_find
  end;

  /* transposes: swap each pair of adjacent letters */
  do i = 1 to n - 1;
    word = orig_word;
    a = substr(word, i, 1);
    b = substr(word, i + 1, 1);
    substr(word, i, 1) = b;
    substr(word, i + 1, 1) = a;
    %wf_find
  end;

  /* inserts: split after position i (0 = before the first letter) and put
     each letter in the gap. SUBPAD is used because it accepts a zero-length
     piece at either end. */
  do i = 0 to n;
    word = orig_word;
    a = subpad(word, 1, i);
    b = subpad(word, i + 1, n - i);
    do ii = 1 to 26;
      c = substr(alphabet, ii, 1);
      word = cats(of a c b);
      %wf_find
    end;
  end;

  /* brute: scan the whole dictionary for words with an edit distance <= 2.
     The loops above only generate candidates that are a single replace,
     delete, transpose or insert away from ORIG_WORD, so this scan is what
     finds the remaining distance-2 words. It also re-emits the candidates
     already found above. */
  do while (wfi.next() = 0);
    clev = complev(orig_word, word);
    if clev <= 2 then output;
  end;

  keep orig_word word count clev;

  /* The SET above never runs, so end-of-file is never reached; stop the
     step explicitly after this single pass. */
  stop;
run;
/* Report the suggestion(s): among the candidates at the smallest edit
   distance, keep those whose frequency equals the highest frequency found
   at that distance. */
proc sql;
  select distinct 'Did you mean: ' || strip(best.word)
    from (select word, count
            from corrections
           where clev = (select min(clev) from corrections)) as best
   where best.count = (select max(c.count)
                         from corrections as c
                        where c.clev = (select min(clev) from corrections));
quit;
... View more