Fuzzy String Matching with inconsistent data

Accepted Solution Solved
Reply
Contributor
Posts: 57
Accepted Solution

Fuzzy String Matching with inconsistent data

 

Dear Community Members:

 

I would like to seek your input in merging two data sets using strings. I have data set #1 with approximately 4.2 million names, and data set #2 with approximately 60,000 names. I normally would create a full cartesian product and use compged function to narrow down possible matches. In data set #1, some names (company names) are not always written consistently. For example, it is possible to run into:

LEHMAN BROTHERS

LEHMAN BROTERS INC

EMPLOYEES OF LEHMAN BROTHERS, etc.

 

When the company name is stated towards the end of the string (as in the last one above), COMPGED assigns too high of a cost and returns false matches with significantly lower costs. I provided a small sample to illustrate this point. I tried reversing the order of names in COMPGED, but it did not help much. 

 

I was wondering if there was a way to look for overlaps between two strings irrespective whether this overlapping string is at the beginning, in the middle or at the end of a string. If you have other advice, I would love to hear them too.

 

Thank you

 

/*This is the data that cause issues*/

proc IMPORT
datafile="C:\Users\.....\sample1.xlsx"
dbms=xlsx
out=sample1
replace;
run;

 

/*this is the data that have no known issues*/
proc IMPORT
datafile="C:\Users\.....\sample2.xlsx"
dbms=xlsx
out=sample2
replace;
run;

proc sql;
    create table x as
        select *, compged(sample1.name1, sample2.name2, 'L') as cost1, compged(sample2.name2, sample1.name1, 'L') as cost2
        from sample1, sample2 ;
        /*where calculated cost1 le 750 or calculated cost2 le 750;*/
quit;

proc sort data=x; by name1 cost2; run;


Accepted Solutions
Solution
‎05-17-2016 11:51 AM
Respected Advisor
Posts: 4,828

Re: Fuzzy String Matching with inconsistent data

You could probably trim the problem down considerably by eliminating good matches first. Something like this:

 

data s1;
length name $50;
input name &;
id = _n_;
datalines;
EMPLOYEES OF LEHMAN BROTHERS
ACTION FUND OF LEHMAN BROTHERS
POLITICAL ACTION COMMITTEE OF LEHMAN BROTHERS
SUPER PAC LEHMAN BROTHERS
COMPAMY PAC LEHMAN BROTHERS
LEHMAN BROS
LEHMAN BROTHERS
L. BROS
;

data s2;
length name $50;
input name &;
id = _n_;
datalines;
LEHMAN BROTHERS INC
NORTHROP GRUMMAN COROPROATION 
3M COMPANY
3M   
ZIP
ABC
GENERAL MOTORS
LEHMAN AND SON
;

data s1w;
set s1;
do i = 1 to countw(name);
    word = scan(name,i);
    if length(word) > 2 then output;
    end;
keep id word;
run;

data s2w;
set s2;
do i = 1 to countw(name);
    word = scan(name,i);
    if  length(word) > 2  and
        word not in ("INC", "COMPANY", "CORP", "AND")  /* Etc. Meaningless matches */
    then output;
    end;
keep id word;
run;

proc sql;

create table wordMatches as
select id1, id2, nbWords
from
    (select 
        s1w.id as id1, 
        s2w.id as id2, 
        count(*) as nbWords
    from 
        s1w inner join 
        s2w on s1w.word=s2w.word
    group by s1w.id, s2w.id )
group by id1
having nbWords = max(nbWords);

create table uniqueBestMatches as
select id1, id2, nbWords
from wordMatches
group by id1
having count(id2) = 1;

create table multipleBestMatches as
select id1, id2, nbWords
from wordMatches
group by id1
having count(id2) > 1;

create table unMatched as
select * 
from s1
where id not in (select id1 from wordMatches);

quit;

proc sql;

Title "Good unique matches";
select s1.name as name1, s2.name as name2
from 
    uniqueBestMatches as a inner join
    s1 on a.id1=s1.id inner join
    s2 on a.id2=s2.id;

Title "Multiple matches, investigate with compged";
select s1.name as name1, s2.name as name2
from 
    multipleBestMatches as a inner join
    s1 on a.id1=s1.id inner join
    s2 on a.id2=s2.id
order by name1;

Title "Unmatched names";
select name from unMatched;

quit;
PG

View solution in original post


All Replies
Solution
‎05-17-2016 11:51 AM
Respected Advisor
Posts: 4,828

Re: Fuzzy String Matching with inconsistent data

You could probably trim the problem down considerably by eliminating good matches first. Something like this:

 

data s1;
length name $50;
input name &;
id = _n_;
datalines;
EMPLOYEES OF LEHMAN BROTHERS
ACTION FUND OF LEHMAN BROTHERS
POLITICAL ACTION COMMITTEE OF LEHMAN BROTHERS
SUPER PAC LEHMAN BROTHERS
COMPAMY PAC LEHMAN BROTHERS
LEHMAN BROS
LEHMAN BROTHERS
L. BROS
;

data s2;
length name $50;
input name &;
id = _n_;
datalines;
LEHMAN BROTHERS INC
NORTHROP GRUMMAN COROPROATION 
3M COMPANY
3M   
ZIP
ABC
GENERAL MOTORS
LEHMAN AND SON
;

data s1w;
set s1;
do i = 1 to countw(name);
    word = scan(name,i);
    if length(word) > 2 then output;
    end;
keep id word;
run;

data s2w;
set s2;
do i = 1 to countw(name);
    word = scan(name,i);
    if  length(word) > 2  and
        word not in ("INC", "COMPANY", "CORP", "AND")  /* Etc. Meaningless matches */
    then output;
    end;
keep id word;
run;

proc sql;

create table wordMatches as
select id1, id2, nbWords
from
    (select 
        s1w.id as id1, 
        s2w.id as id2, 
        count(*) as nbWords
    from 
        s1w inner join 
        s2w on s1w.word=s2w.word
    group by s1w.id, s2w.id )
group by id1
having nbWords = max(nbWords);

create table uniqueBestMatches as
select id1, id2, nbWords
from wordMatches
group by id1
having count(id2) = 1;

create table multipleBestMatches as
select id1, id2, nbWords
from wordMatches
group by id1
having count(id2) > 1;

create table unMatched as
select * 
from s1
where id not in (select id1 from wordMatches);

quit;

proc sql;

Title "Good unique matches";
select s1.name as name1, s2.name as name2
from 
    uniqueBestMatches as a inner join
    s1 on a.id1=s1.id inner join
    s2 on a.id2=s2.id;

Title "Multiple matches, investigate with compged";
select s1.name as name1, s2.name as name2
from 
    multipleBestMatches as a inner join
    s1 on a.id1=s1.id inner join
    s2 on a.id2=s2.id
order by name1;

Title "Unmatched names";
select name from unMatched;

quit;
PG
Super User
Posts: 9,878

Re: Fuzzy String Matching with inconsistent data

Did you try other functions? Like SPEDIS() which retures spell distance. COMPGED() returns the EDIT distance.
Contributor
Posts: 57

Re: Fuzzy String Matching with inconsistent data

Thank you for your help!

☑ This topic is solved.

Need further help from the community? Please ask a new question.

Discussion stats
  • 3 replies
  • 706 views
  • 1 like
  • 3 in conversation