Hello,
First count the records as they exists without any data cleansing, this will give all distinct names, from this you might need to find the fuzzy matches. There are several techniques to find out the fuzzy matches, it all depends on how your data look. I would recommend scoring with SPEDIS & COMPGED (great functions for fuzzy patterns). Why not both for better results. Check this blog post for more ways.
Here is one of my example for fuzzy pattern matching:
data have;
input biz_name $1-25;
datalines;
ABC Limited
ABC Limited
ABC Limited
ABC Ltd
ABC Ltd
Test Holding LLC
Test Holding LLC.
XYZ Ltd
;
run;
/* This will count the number of records per company (company names might be duplicated due to fuzzy) & assign a row number for each distinct company name */
proc sql;
create table test as
select monotonic() as row_id,biz_name,rec_cnt
from (
select biz_name,count(*) as rec_cnt
from have
group by 1) ;
quit;
/* Cartesian product, to cross compare each names */
proc sql;
create table test2 as
select a.row_id,
b.row_id as row_id_c,
a.biz_name,
b.biz_name as biz_name_c,
a.rec_cnt,
b.rec_cnt as rec_cnt_c
from test as a,test as b
where a.row_id<>b.row_id /* excluding join by itself */
and spedis(a.biz_name,b.biz_name)<50 and compged(a.biz_name,b.biz_name)<500 /* Fuzzy match comparsion */
order by 1,2
;
quit;
/* must exist in pairs, this will make sure the scores are acceptable both ways */
/* i.e. ABC Limited compared with ABC Ltd & ABC Ltd compared with ABC Limited (scores differ) */
proc sql;
create table test3 as
select *
from test2
where row_id in (select row_id_c from test2)
and row_id_c in (select row_id from test2);
quit;
/* records that have pair of sets */
data test4(drop=lag_row_id_c);
set test3;
lag_row_id_c=lag(row_id_c);
if row_id=lag_row_id_c;
run;
/* Join back the matched names to count the final values */
proc sql;
create table final as
select a.row_id,
a.biz_name,
a.rec_cnt,
b.row_id as merge_row_id,
b.biz_name as merge_biz_name,
b.rec_cnt as merge_rec_cnt,
case when a.rec_cnt=. then 0 else a.rec_cnt end + case when b.rec_cnt=. then 0 else b.rec_cnt end as total_cnt
from test as a
left join test4 as b on a.row_id=b.row_id_c
where a.row_id not in (select row_id from test4);
quit;
... View more