<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Correct inconsistent spelling (using Levenshtein distances) in New SAS User</title>
    <link>https://communities.sas.com/t5/New-SAS-User/Correct-inconsistent-spelling-using-Levenshtein-distances/m-p/519971#M3908</link>
    <description>&lt;P&gt;&lt;SPAN&gt;Dear SAS community members,&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;I am struggling with the following, hopefully someone can help.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;I have a fairly large table (6 million rows) with a column with names. Unfortunately, spelling is not always consistent. So, for example, I might have ‘Mel Gibson’ and ‘Mel Gibbon’ as two different spellings of the same name. Now, what I would like is to correct this by replacing all these different spellings with a single one (e.g. the most frequently occuring). Also, since it is a fairly large number of rows I am looking for something efficient.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;My idea was to first make a list of indices &lt;EM&gt;I&lt;/EM&gt;&amp;nbsp;of all names that need to be processed. Initially this list runs from 1 to num_rows (i.e, all entries). Then start a loop, pick the first name and calculate a criterion (e.g. Levenshtein distance &amp;lt; 3) for all remaining names. For all names where the criterion is met replace the spelling with the most frequently occuring spelling. Then update &lt;EM&gt;I&lt;/EM&gt; and do the same for the next name in line. Keep doing this until length of &lt;EM&gt;I&lt;/EM&gt; is zero. The problem is that I don’t know how to accomplish this. Help would be greatly appreciated!&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Below is a small dummy dataset I have been playing around with.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Many thanks in advance for any help. Best regards,&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Ruud&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*make dummy dataset:*/
data work.dummy_data;
input index names $20. corrected_names $20.;
format names $20. corrected_names $20.; 
datalines;
1 Tom Hanks
2 John Trovolta
3 Mel Gibson
4 Sylvester Stalone
5 Mel Gibbon
6 Tim Honks
7 Johnny Travolta
8 Sylvester Stallone
9 Mel Gibson
10 Sylvester Stallone
11 Tom Hawks
12 John Travolta
13 John Travolta
14 Tommy Hanks
15 Tom Hanks
16 Mel Gibson
;
run;&lt;/CODE&gt;&amp;nbsp;&lt;/PRE&gt;</description>
    <pubDate>Fri, 14 Dec 2018 14:40:04 GMT</pubDate>
    <dc:creator>RuudB83</dc:creator>
    <dc:date>2018-12-14T14:40:04Z</dc:date>
    <item>
      <title>Correct inconsistent spelling (using Levenshtein distances)</title>
      <link>https://communities.sas.com/t5/New-SAS-User/Correct-inconsistent-spelling-using-Levenshtein-distances/m-p/519971#M3908</link>
      <description>&lt;P&gt;&lt;SPAN&gt;Dear SAS community members,&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;I am struggling with the following, hopefully someone can help.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;I have a fairly large table (6 million rows) with a column with names. Unfortunately, spelling is not always consistent. So, for example, I might have ‘Mel Gibson’ and ‘Mel Gibbon’ as two different spellings of the same name. Now, what I would like is to correct this by replacing all these different spellings with a single one (e.g. the most frequently occuring). Also, since it is a fairly large number of rows I am looking for something efficient.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;My idea was to first make a list of indices &lt;EM&gt;I&lt;/EM&gt;&amp;nbsp;of all names that need to be processed. Initially this list runs from 1 to num_rows (i.e, all entries). Then start a loop, pick the first name and calculate a criterion (e.g. Levenshtein distance &amp;lt; 3) for all remaining names. For all names where the criterion is met replace the spelling with the most frequently occuring spelling. Then update &lt;EM&gt;I&lt;/EM&gt; and do the same for the next name in line. Keep doing this until length of &lt;EM&gt;I&lt;/EM&gt; is zero. The problem is that I don’t know how to accomplish this. Help would be greatly appreciated!&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Below is a small dummy dataset I have been playing around with.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Many thanks in advance for any help. Best regards,&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Ruud&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*make dummy dataset:*/
data work.dummy_data;
input index names $20. corrected_names $20.;
format names $20. corrected_names $20.; 
datalines;
1 Tom Hanks
2 John Trovolta
3 Mel Gibson
4 Sylvester Stalone
5 Mel Gibbon
6 Tim Honks
7 Johnny Travolta
8 Sylvester Stallone
9 Mel Gibson
10 Sylvester Stallone
11 Tom Hawks
12 John Travolta
13 John Travolta
14 Tommy Hanks
15 Tom Hanks
16 Mel Gibson
;
run;&lt;/CODE&gt;&amp;nbsp;&lt;/PRE&gt;</description>
      <pubDate>Fri, 14 Dec 2018 14:40:04 GMT</pubDate>
      <guid>https://communities.sas.com/t5/New-SAS-User/Correct-inconsistent-spelling-using-Levenshtein-distances/m-p/519971#M3908</guid>
      <dc:creator>RuudB83</dc:creator>
      <dc:date>2018-12-14T14:40:04Z</dc:date>
    </item>
    <item>
      <title>Re: Correct inconsistent spelling (using Levenshtein distances)</title>
      <link>https://communities.sas.com/t5/New-SAS-User/Correct-inconsistent-spelling-using-Levenshtein-distances/m-p/519983#M3911</link>
      <description>&lt;P&gt;I don't have time to code this for you, but some guidance:&lt;/P&gt;
&lt;P&gt;Soundex function can be used to encode text:&lt;/P&gt;
&lt;P&gt;&lt;A href="http://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000245948.htm" target="_blank"&gt;http://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000245948.htm&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;This may get you some similarity.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;You can use proc freq to to caculate frequencies of names.&lt;/P&gt;</description>
      <pubDate>Mon, 10 Dec 2018 15:21:32 GMT</pubDate>
      <guid>https://communities.sas.com/t5/New-SAS-User/Correct-inconsistent-spelling-using-Levenshtein-distances/m-p/519983#M3911</guid>
      <dc:creator>RW9</dc:creator>
      <dc:date>2018-12-10T15:21:32Z</dc:date>
    </item>
    <item>
      <title>Re: Correct inconsistent spelling (using Levenshtein distances)</title>
      <link>https://communities.sas.com/t5/New-SAS-User/Correct-inconsistent-spelling-using-Levenshtein-distances/m-p/521473#M4203</link>
      <description>&lt;P&gt;Probably not the most elegant solutions, but it works.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*make dummy dataset:*/
data work.dummy_data;
input index names $20. corrected_names $20.;
format names $20. corrected_names $20.; 
datalines;
1 Tom Hanks
2 John Trovolta
3 Mel Gibson
4 Sylvester Stalone
5 Mel Gibbon
6 Tim Honks
7 Johnny Travolta
8 Sylvester Stallone
9 Mel Gibson
10 Sylvester Stallone
11 Tom Hawks
12 John Travolta
13 John Travolta
14 Tommy Hanks
15 Tom Hanks
16 Mel Gibson
;
run;

%macro clean_names;

/*macro variable with rows that still need to be processed (to-do list):*/
proc sql noprint;
select index into :to_do separated by ', '
from dummy_data
;
quit;

/*%put &amp;amp;to_do;*/

/*start loop, go on until there are no more rows to process):*/
%do %until(%sysfunc(countw("&amp;amp;to_do", ",")) = 1);

/*select rows that need to be processed:*/
proc sql;
create table work.temp1 as
select *
from dummy_data
where index in (&amp;amp;to_do)
;
quit;

/*take value from first row:*/
proc sql noprint;
select names into :name
from temp1
;
quit;

/*%put &amp;amp;name;*/

/*calculate Levenshtein distance relative to values in other rows:*/
proc sql;
create table work.temp2 as
select *,
(COMPLEV("&amp;amp;name", names)) as distance
from temp1
;
quit;

/*select all rows that meet criterion:*/
proc sql;
create table work.temp3 as
select *
from temp2
where distance &amp;lt;= 4
;
quit;

/*determine index, these rows will be corrected in this iteration:*/
proc sql noprint;
select index into :to_fill separated by ', '
from temp3
;
quit;

/*%put &amp;amp;to_fill;*/

/*make frequency table with various spellings:*/
proc freq data=work.temp3 order=freq noprint;
tables names / nocol norow nopercent out = work.temp_freq1;
run;

/*add index (for next step):*/
proc sql;
create table work.temp_freq2 as
select *,
(monotonic()) as row_number	
from temp_freq1
;
quit;

/*macro variable with most frequently occuring spelling (=pick):*/
proc sql noprint;
select names into :pick
from temp_freq2
where row_number = 1
;
quit;

/*%put &amp;amp;pick;*/

/*replace various spellings with most frequent spelling:*/
proc sql;
update dummy_data
set corrected_names = 
case 
when index in (&amp;amp;to_fill) then "&amp;amp;pick" 
else corrected_names
end;
quit;

%let to_do = 0; /* silly bug fix.*/

/*update to-do list:*/
proc sql noprint;
select index into :to_do separated by ', '
from dummy_data
where corrected_names is missing
;
quit;

%end;
%mend clean_names;

%clean_names&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 14 Dec 2018 14:38:19 GMT</pubDate>
      <guid>https://communities.sas.com/t5/New-SAS-User/Correct-inconsistent-spelling-using-Levenshtein-distances/m-p/521473#M4203</guid>
      <dc:creator>RuudB83</dc:creator>
      <dc:date>2018-12-14T14:38:19Z</dc:date>
    </item>
  </channel>
</rss>

