Inspired by @novinosrin, here is a solution using the COMPLEV function (Levenshtein edit distance):
/* Sample input: one row per recorded first name / last name / address.
   Address contains single embedded blanks, so it is read with the &
   modifier (the field ends at two consecutive blanks or at end of line). */
data have;
    length First Last $ 8 Address $ 15;
    input First Last Address &;
    datalines;
BOB SMITH 100 SAS RD
BOB SMITH 100 N SAS RD
BOB JONES 1000 R RD
;
/* Within each person (Last, First), put the longest address first so
   that it becomes the reference ("true") address in the next step.
   Ties on length are broken alphabetically by the address itself. */
proc sql;
    create table temp as
        select h.First,
               h.Last,
               h.Address
        from have as h
        order by h.Last,
                 h.First,
                 length(h.Address) desc,
                 h.Address;
quit;
/* Assign an id to each distinct address of a person.
   addr holds the current reference address and is carried across rows.
   A new id starts either on the first row of a (Last, First) group or
   when the row's address differs from the reference by a Levenshtein
   distance of at least 5. COMPLEV's third argument is a cutoff: larger
   distances are returned as 5, hence the ">= 5" test. */
data want;
    set temp;
    by last first;
    retain addr;

    select;
        /* first row for this person: its address becomes the reference */
        when (first.first) do;
            addr = address;
            id + 1;
        end;
        /* same person, but the address is too different: treat as a new one */
        when (complev(addr, address, 5) >= 5) do;
            addr = address;
            id + 1;
        end;
        /* otherwise a spelling variant: keep the current addr and id */
        otherwise;
    end;
run;
/* List the result: each row with its reference address (addr) and id */
proc print data=want;
run;
Obs. First Last Address addr id
1 BOB JONES 1000 R RD 1000 R RD 1
2 BOB SMITH 100 N SAS RD 100 N SAS RD 2
3 BOB SMITH 100 SAS RD 100 N SAS RD 2
... View more