Hi,
Give the built-in SAS functions COMPGED and COMPLEV a try: they compute the edit distance between two strings. See if they help you.
Example:
/* DS1: one-row table holding the reference description (String1). */
DATA ds1;
    /* Declare the variable up front so list input stores up to 200 characters. */
    length String1 $200;
    /* DSD: the value is quoted, so its embedded commas are data, not delimiters,
       and the surrounding quotation marks are removed when the value is stored. */
    infile datalines dsd;
    input String1;
    datalines;
"2QG850-19 natural topaz stone, size 1099 x 648 mm, 2cm thick (62 PCE), 100% brand new"
;
RUN;
/* DS2: candidate descriptions (String2), one per data line. */
DATA ds2;
    /* Declare the variable up front so list input stores up to 200 characters. */
    length String2 $200;
    /* DSD: each value is quoted, so its embedded commas are data, not delimiters,
       and the surrounding quotation marks are removed when the value is stored. */
    infile datalines dsd;
    input String2;
    datalines;
"EGNL01 natural topaz stone, thickness 2cm, size 2990x1280mm (25 PCE), 100% brand new"
"Natural topaz stone, size 578 mm x 648 mm, thickness 30 mm, 100% new"
"natural stone, size 991 mm x 610 mm, thickness 30 mm, 100% new"
"topaz stone, size 1905 mm x 921 mm, thickness 30 mm, 100% new"
"ABC"
"I have two datasets that contain a date, a customer name and a narrative description of merchandise"
;
RUN;
/* HAVE: pair every row of DS1 with every row of DS2.
   The Cartesian product is deliberate, so it is written as an explicit
   CROSS JOIN with a named column list rather than a comma join with SELECT *. */
PROC SQL;
    CREATE TABLE have AS
    SELECT
        ds1.String1,
        ds2.String2
    FROM ds1
    CROSS JOIN ds2
    ;
QUIT;
/* WANT: score each String1/String2 pair in HAVE with two edit-distance measures
   and a combined score (smaller = more similar). */
DATA want;
    set have;

    /* Generalized edit distance between the two strings.
       The 'LN' modifiers match the COMPLEV call below, so both measures ignore
       case and leading blanks before they are added together. */
    GED = compged(string1, string2, 'LN');

    /* Levenshtein edit distance between the two strings, same modifiers. */
    LEV = complev(string1, string2, 'LN');

    /* Combined score used for ranking.
       NOTE(review): COMPGED's default operation costs are on a much larger scale
       than COMPLEV's per-edit count, so SUM is dominated by GED - confirm this
       weighting is intended. */
    SUM = GED + LEV;
RUN;
/* Order WANT in place by the combined distance, ascending (smallest SUM first). */
PROC SORT data=want;
    by SUM;
RUN;
... View more