Hi, this is a follow up to an earlier post which I marked successful, today I realized the data is being matched on missing values and that is not the result I wanted, all the other matching is wonderful but I cannot show matches on the missing values for acct in rule 4, missing values for id in rule 5 and I also noticed that in rule 3 for acct/last name also matched for missing values on account. How do I avoid this result. I have been reading everything I can for hash tables for hours and tested multiple ways but cannot figure out how to modify the code to avoid the matching on missing values in those scenarios.
Any assistance will be greatly appreciated.
Here is my sample code I have been working with:
data work.main_data;
infile datalines dlm=',' dsd truncover;
length lastname firstname $25 store $5 pendingno $13 id $10 acct $16;
input lastname firstname store pendingno id acct $ assigndate :date9.;
format assigndate date9.;
datalines;
JONES,JACK,05522,A123456B85946,079458678,4569874589375158,04JUN2021
LYANS,LOIS,04437,A124457B85946,129458646,,22JAN2022
WILLIAMS,JOHN,05522,A123478C25946,379429678,5429874589374758,07AUG2021
BETZINA,ANN,04437,D983456B85946,469457578,4569874589375894,15OCT2021
JOYES,JACK,00301,A123456B85946,279258678,3729874589375158,04SEP2021
SAMS,MACK,02747,A128547B85946,654458746,455874589375321,27MAY2022
WILLIS,JANE,00301,A123458X25946,379447878,5412345589374758,13AUG2021
BETXINA,RUTH,04437,D983426B85946,269453578,4569874589387651,25DEC2021
DAVIS,DAVID,05522,B123458X25136,379442778,5418445589374758,13APR2021
BATHE,ROBERT,04437,D983716B85946,264853578,4569891589387657,25MAR2022
;
data work.match_data;
infile datalines dlm=',' dsd truncover;
length last_name first_name $25 refno $13 sid $10 acctno $16;
input last_name first_name refno sid acctno $ recevdate :date9.;
format recevdate date9.;
datalines;
JONES,JACK,A123456B85946,079458678,4569874589375158,05JUN2021
LYONS,LOIS,A123357B85946,,6559874589375321,23JAN2022
WILLIAMS,JAMES,,379429678,5429874589374758,09AUG2021
BETZINA,ANN,D983456B85946,469457578,4569874589375158,18OCT2021
JOYES,JACK,A123456B85946,279258678,3729874589375158,07SEP2021
SAMSA,MACK,A128547B95946,654458646,,28MAY2022
WILLS,JADE,A123458X25946,379447878,5412345589374758,15AUG2021
BETXINA,RUTH,D983426B85946,269453578,4569874589387651,27DEC2021
WILLIAMS,JOHN,A123478C25946,379429678,5429874589374758,08AUG2021
;
data work.v_match_data /view=work.v_match_data;
set work.match_data;
rename
refno=pendingno
last_name=lastname
first_name=firstname
acctno = acct
sid = id
;
run;
data want;
format rule 1.;
rule = 0;
if _n_=1 then
do;
if 0 then set work.v_match_data(keep=recevdate);
dcl hash h_rule1(dataset:'work.v_match_data');
h_rule1.defineKey('pendingno','lastname');
h_rule1.defineData('recevdate');
h_rule1.defineDone();
dcl hash h_rule2(dataset:'work.v_match_data');
h_rule2.defineKey('pendingno');
h_rule2.defineData('recevdate');
h_rule2.defineDone();
dcl hash h_rule3(dataset:'work.v_match_data');
h_rule3.defineKey('acct','lastname');
h_rule3.defineData('recevdate');
h_rule3.defineDone();
dcl hash h_rule4(dataset:'work.v_match_data ');
h_rule4.defineKey ('acct');
h_rule4.defineData('recevdate');
h_rule4.defineDone();
dcl hash h_rule5(dataset:'work.v_match_data');
h_rule5.defineKey('id');
h_rule5.defineData('recevdate');
h_rule5.defineDone();
dcl hash h_rule6(dataset:'work.v_match_data');
h_rule6.defineKey('lastname','firstname');
h_rule6.defineData('recevdate');
h_rule6.defineDone();
end;
call missing(of _all_);
set work.main_data;
if h_rule1.find()=0 then rule=1;
else if h_rule2.find()=0 then rule=2;
else if h_rule3.find()=0 then rule=3 ;
else if h_rule4.find()=0 then rule=4;
else if h_rule5.find()=0 then rule=5;
else if h_rule6.find()=0 then rule=6;
else rule=0;
run;
I presume all you need to do is remove dataitems (i.e. "rows") with missing values for the respective hash key. Apparently the only problem is rule 4, with key ACCT. So change
dcl hash h_rule4(dataset:'work.v_match_data ');
to
dcl hash h_rule4(dataset:'work.v_match_data (where=(nmiss(acct))=0)');
No missing acct entry will be in the hash object.
Edit: @andreas_lds noted the misplaced closing parent above. The suggested code should have been
dcl hash h_rule4(dataset:‘work.v_match_data(where=(nmiss(acct)=0))’)
I presume all you need to do is remove dataitems (i.e. "rows") with missing values for the respective hash key. Apparently the only problem is rule 4, with key ACCT. So change
dcl hash h_rule4(dataset:'work.v_match_data ');
to
dcl hash h_rule4(dataset:'work.v_match_data (where=(nmiss(acct))=0)');
No missing acct entry will be in the hash object.
Edit: @andreas_lds noted the misplaced closing parent above. The suggested code should have been
dcl hash h_rule4(dataset:‘work.v_match_data(where=(nmiss(acct)=0))’)
Actually this also happens on ID(rule 5), it is matching missing values there also.
I have tried the suggested correction, but I get this error:
NOTE: There were 9 observations read from the data set WORK.MATCH_DATA.
ERROR: Invalid option name =.
ERROR: Invalid data set option string at line 147 column 7.
ERROR: DATA STEP Component Object failure. Aborted during the EXECUTION phase.
I have attached my code and if I can get the image in, the results, you will see the last 2 records match on rule 4 and rule 5 when those values were missing.
data work.main_data;
infile datalines dlm=',' dsd truncover;
length lastname firstname $25 store $5 pendingno $13 id $10 acct $16;
input lastname firstname store pendingno id acct $ assigndate :date9.;
format assigndate date9.;
datalines;
JONES,JACK,05522,A123456B85946,079458678,4569874589375158,04JUN2021
LYANS,LOIS,04437,A124457B85946,129458646,6559874589375321,22JAN2022
WILLIAMS,JOHN,05522,A123478B25946,379429668,5429842709374758,07AUG2021
BETZINA,ANN,04437,D983456B85946,469457578,4569874589375894,15OCT2021
JOYES,JACK,00301,A123456B85946,279258678,3729874589375158,04SEP2021
SAMS,MACK,02747,A128547B85946,654458746,455874589375321,27MAY2022
WILLIS,JANE,00301,A123458X25946,379447878,5412345589374758,13AUG2021
BETXINA,RUTH,04437,D983426B85946,269453578,4569874589387651,25DEC2021
DAVIS,DAVID,05522,B123458X25136,,5418445589374758,13APR2021
BATHE,ROBERT,04437,D983716B85946,264853578,,25MAR2022
;
data work.match_data;
infile datalines dlm=',' dsd truncover;
length last_name first_name $25 refno $13 sid $10 acctno $16;
input last_name first_name refno sid acctno $ recevdate :date9.;
format recevdate date9.;
datalines;
JONES,JACK,A123456B85946,079458678,4569874589375158,05JUN2021
LYONS,LOIS,A123357B85946,,,23JAN2022
WILLIAMS,JAMES,A874857B85946,379429678,5428794589374758,09AUG2021
BETZINA,ANN,D983456B85946,469457578,4569874589375158,18OCT2021
JOYES,JACK,A123456B85946,279258678,3729874589375158,07SEP2021
SAMSA,MACK,A128547B95946,654458646,,28MAY2022
WILLS,JADE,A123458X25946,379447878,5412345589374758,15AUG2021
BETXINA,RUTH,D983426B85946,269453578,4569874589387651,27DEC2021
WILLIAMS,JOHN,A123478C25946,379429678,,08AUG2021
;
data work.v_match_data /view=work.v_match_data;
set work.match_data;
rename
refno=pendingno
last_name=lastname
first_name=firstname
acctno = acct
sid = id
;
run;
data want;
format rule 1.;
rule = 0;
if _n_=1 then
do;
if 0 then set work.v_match_data(keep=recevdate);
dcl hash h_rule1(dataset:'work.v_match_data');
h_rule1.defineKey('pendingno','lastname');
h_rule1.defineData('recevdate');
h_rule1.defineDone();
dcl hash h_rule2(dataset:'work.v_match_data');
h_rule2.defineKey('pendingno');
h_rule2.defineData('recevdate');
h_rule2.defineDone();
dcl hash h_rule3(dataset:'work.v_match_data');
h_rule3.defineKey('acct','lastname');
h_rule3.defineData('recevdate');
h_rule3.defineDone();
dcl hash h_rule4(dataset:'work.v_match_data (where=(nmiss(acct))=0)');
h_rule4.defineKey ('acct');
h_rule4.defineData('recevdate');
h_rule4.defineDone();
/* (where=(nmiss(acct))=0) */
dcl hash h_rule5(dataset:'work.v_match_data');
h_rule5.defineKey('id');
h_rule5.defineData('recevdate');
h_rule5.defineDone();
/* (where=(nmiss(id))=0) */
dcl hash h_rule6(dataset:'work.v_match_data');
h_rule6.defineKey('lastname','firstname');
h_rule6.defineData('recevdate');
h_rule6.defineDone();
end;
call missing(of _all_);
set work.main_data;
if h_rule1.find()=0 then rule=1;
else if h_rule2.find()=0 then rule=2;
else if h_rule3.find()=0 then rule=3 ;
else if h_rule4.find()=0 then rule=4;
else if h_rule5.find()=0 then rule=5;
else if h_rule6.find()=0 then rule=6;
else rule=0;
run;
One closing bracket is misplaced:
'work.v_match_data (where=(nmiss(acct))=0)'
move it behind the zero.
Thanks, funny thing is I tried so many different ways but never hit on the correct one. Thank you very much!
SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.