this is what I cam up with till now - but it still not giving accurate result /* Initial Data Setup */ data dataset1; input Product $ x $15.; datalines; via1 . via2 003 via3 014 via4 GA4 via5 GA015 via6 319 via7 23456 via8 10101010198765 via9 22201 via10 6631 ; run; data dataset2; input Name $ x $15.; datalines; a 2 b 3 c 14 d 4 e 15 f GF319 g 23456 h 201 i 98765 j 31 ; run; /* Initial Cleaning Steps for dataset1 */ data d1clean; set dataset1; length x1 $15; x1 = compress(x, '', 'kd'); if not missing(x1) then do; if input(x1, best32.) = 0 then do; x1 = '0'; end; else do; x1 = cats(input(x1, best32.)); end; end; else do; x1 = ''; /* Ensure blank if no digits */ end; run; /* Initial Cleaning Steps for dataset2 */ data d2clean; set dataset2; length x1 $15; x1 = compress(x, '', 'kd'); if not missing(x1) then do; if input(x1, best32.) = 0 then do; x1 = '0'; end; else do; x1 = cats(input(x1, best32.)); end; end; else do; x1 = ''; /* Ensure blank if no digits */ end; run; PROC SQL; /* Create the final transformed version of d1clean */ CREATE TABLE d1_final_cleaned AS SELECT d1.Product, d1.x AS original_dataset1_x, d1.x1 AS x1_after_initial_cleaning, COALESCE( ( /* Subquery to find the shortest d2.x1 that is a suffix of d1.x1 (if d1.x1 is longer) */ SELECT MIN(d2.x1) FROM d2clean d2 WHERE LENGTH(TRIM(d2.x1)) > 0 AND LENGTH(TRIM(d1.x1)) > LENGTH(TRIM(d2.x1)) AND /* Check if d1.x1 ends with d2.x1 */ INDEX(TRIM(d1.x1), TRIM(d2.x1)) = (LENGTH(TRIM(d1.x1)) - LENGTH(TRIM(d2.x1)) + 1) AND LENGTH(TRIM(d2.x1)) = ( SELECT MIN(LENGTH(TRIM(d2_inner.x1))) FROM d2clean d2_inner WHERE LENGTH(TRIM(d2_inner.x1)) > 0 AND LENGTH(TRIM(d1.x1)) > LENGTH(TRIM(d2_inner.x1)) AND INDEX(TRIM(d1.x1), TRIM(d2_inner.x1)) = (LENGTH(TRIM(d1.x1)) - LENGTH(TRIM(d2_inner.x1)) + 1) ) ), d1.x1 ) AS x1 FROM d1clean d1; /* Create the final transformed version of d2clean */ CREATE TABLE d2_final_cleaned AS SELECT d2.Name, d2.x AS original_dataset2_x, d2.x1 AS x1_after_initial_cleaning, COALESCE( ( /* Subquery to find the shortest d1.x1 that is a suffix of d2.x1 (if d2.x1 is longer) */ SELECT MIN(d1.x1) FROM d1clean d1 WHERE LENGTH(TRIM(d1.x1)) > 0 AND LENGTH(TRIM(d2.x1)) > LENGTH(TRIM(d1.x1)) AND INDEX(TRIM(d2.x1), TRIM(d1.x1)) = (LENGTH(TRIM(d2.x1)) - LENGTH(TRIM(d1.x1)) + 1) AND LENGTH(TRIM(d1.x1)) = ( SELECT MIN(LENGTH(TRIM(d1_inner.x1))) FROM d1clean d1_inner WHERE LENGTH(TRIM(d1_inner.x1)) > 0 AND LENGTH(TRIM(d2.x1)) > LENGTH(TRIM(d1_inner.x1)) AND INDEX(TRIM(d2.x1), TRIM(d1_inner.x1)) = (LENGTH(TRIM(d2.x1)) - LENGTH(TRIM(d1_inner.x1)) + 1) ) ), d2.x1 ) AS x1 FROM d2clean d2; QUIT; /* Turn options back on if you need them for subsequent steps */ /* OPTIONS NOTES STIMER SOURCE SYNTAXCHECK; */ /* Display the results */ PROC PRINT DATA=d1_final_cleaned NOOBS; TITLE "d1_final_cleaned: x1 transformed based on d2clean suffixes"; RUN; PROC PRINT DATA=d2_final_cleaned NOOBS; TITLE "d2_final_cleaned: x1 transformed based on d1clean suffixes"; RUN;
... View more