Probably not the most elegant solutions, but it works. /*make dummy dataset:*/
data work.dummy_data;
input index names $20. corrected_names $20.;
format names $20. corrected_names $20.;
datalines;
1 Tom Hanks
2 John Trovolta
3 Mel Gibson
4 Sylvester Stalone
5 Mel Gibbon
6 Tim Honks
7 Johnny Travolta
8 Sylvester Stallone
9 Mel Gibson
10 Sylvester Stallone
11 Tom Hawks
12 John Travolta
13 John Travolta
14 Tommy Hanks
15 Tom Hanks
16 Mel Gibson
;
run;
%macro clean_names;
/*macro variable with rows that still need to be processed (to-do list):*/
proc sql noprint;
select index into :to_do separated by ', '
from dummy_data
;
quit;
/*%put &to_do;*/
/*start loop, go on until there are no more rows to process):*/
%do %until(%sysfunc(countw("&to_do", ",")) = 1);
/*select rows that need to be processed:*/
proc sql;
create table work.temp1 as
select *
from dummy_data
where index in (&to_do)
;
quit;
/*take value from first row:*/
proc sql noprint;
select names into :name
from temp1
;
quit;
/*%put &name;*/
/*calculate Levenshtein distance relative to values in other rows:*/
proc sql;
create table work.temp2 as
select *,
(COMPLEV("&name", names)) as distance
from temp1
;
quit;
/*select all rows that meet criterion:*/
proc sql;
create table work.temp3 as
select *
from temp2
where distance <= 4
;
quit;
/*determine index, these rows will be corrected in this iteration:*/
proc sql noprint;
select index into :to_fill separated by ', '
from temp3
;
quit;
/*%put &to_fill;*/
/*make frequency table with various spellings:*/
proc freq data=work.temp3 order=freq noprint;
tables names / nocol norow nopercent out = work.temp_freq1;
run;
/*add index (for next step):*/
proc sql;
create table work.temp_freq2 as
select *,
(monotonic()) as row_number
from temp_freq1
;
quit;
/*macro variable with most frequently occuring spelling (=pick):*/
proc sql noprint;
select names into :pick
from temp_freq2
where row_number = 1
;
quit;
/*%put &pick;*/
/*replace various spellings with most frequent spelling:*/
proc sql;
update dummy_data
set corrected_names =
case
when index in (&to_fill) then "&pick"
else corrected_names
end;
quit;
%let to_do = 0; /* silly bug fix.*/
/*update to-do list:*/
proc sql noprint;
select index into :to_do separated by ', '
from dummy_data
where corrected_names is missing
;
quit;
%end;
%mend clean_names;
%clean_names
... View more