@ballardw, thank you for the help. I used your suggestion of FINDW for state (I think I'm using the optional arguments correctly to treat anything that isn't a letter as a word boundary?). I couldn't select only on zip code though, because the zip code sometimes doesn't match the city and state, and the data step that follows what I included here ranks records based on best match over all three variables. Also, thanks for the tip about EXCLUDE in PROC SQL. I've never seen that before and have since used it in another program.
I ended up using a hash, which I've never done before, but (after many versions) it seems to be working.
data Residences;
length Residence $50;
do Residence=
"MIAMI GARDENS/FL/33055",
"SARASOTA FL/ZIP)",
"TITUSVILLE, FL)",
"ADJUNTAS PRTE/ZIP)",
"OPA LOCKA/TARPON SPRINGS",
"SAINT PETERSBURG/FL/33710",
"DELAND,FL 32720)",
"DORAL, FL 33172)",
"MIAMI FL 33168P)",
"ADEL, GA 31620)",
"OSTRICA, LA 70041)",
"KINGSLAND, GA )",
"MILTON FL 32570)",
"LARGO,FL 32773 )",
"FORT PIERCE/FL/34945",
"+IAMI,FL",
"HOMELESS (850)",
"SABASTIAN, FLZIP)",
"PLANT CITY/TAMPA AREA",
"BUSHNELL, FZIP)"
;
output;
end;
run;
proc sql;
create table ParseThese as
select distinct find(compress(Residence),compress(city),'i') as CityFound
,case when find(compress(Residence),compress(stname(statecode)),'i') then 1
when findw(Residence,statecode,'z','aki') then 2
else 0
end as StateFound
,find(compress(Residence),cats(put(zip,z5.))) as ZipFound
,rp.Residence
,city
,statecode
,zip
from sashelp.zipcode,Residences as rp
where findW(Residence,statecode,'z','aki')
or find(compress(Residence),compress(stname(statecode)),'i')
or find(compress(Residence),compress(city),'i')
or find(compress(Residence),cats(put(zip,z5.)))
order by Residence,city,statecode,zip,CityFound,StateFound,ZipFound
;
quit;
data Parsed;
length Residence $50 CITY $35 STATECODE $2 ZIP 8 CityFound StateFound ZipFound 8 CityPattern StatePattern ZipPattern $250 CityParser StateParser ZipParser 8 CityMatch StateMatch ZipMatch 8;
if _n_ = 1 then do;
declare hash myhash(dataset:'ParseThese', multidata: 'y', ordered: 'y');
rc = myhash.DEFINEKEY('Residence','City','StateCode','ZIP');
rc = myhash.DEFINEDATA('Residence','CityFound','StateFound','ZipFound','City','StateCode','ZIP');
rc = myhash.DEFINEDONE();
CALL MISSING(Residence,CityFound,StateFound,ZipFound,City,StateCode,ZIP,
CityPattern,CityParser,CityMatch,StatePattern,StateParser,StateMatch,ZipPattern,ZipParser,ZipMatch);
DECLARE hiter myiter('myhash');
end;
rc = myiter.first();
do while (rc eq 0);
if CityFound then do;
CityPattern=cats("/\b",upcase(city),"\b/");
CityParser=prxparse(CityPattern);
CityMatch=ifn(missing(Residence)=missing(city)=0,prxmatch(CityParser,compbl(Residence)),0);
call prxfree(CityParser);
end;
else call missing(CityPattern,CityParser,CityMatch);
if StateFound then do;
StatePattern=cats("/\b",upcase(statecode),"\b|\b",stname(statecode),"\b/");
StateParser=prxparse(StatePattern);
StateMatch=ifn(missing(Residence)=missing(statecode)=0,prxmatch(StateParser,compbl(Residence)),0);
call prxfree(StateParser);
end;
else call missing(StatePattern,StateParser,StateMatch);
if ZipFound then do;
ZipPattern=cats("/\b",put(zip,z5.),"\b/");
ZipParser=prxparse(ZipPattern);
ZipMatch=ifn(missing(Residence)=missing(zip)=0,prxmatch(ZipParser,compbl(Residence)),0);
call prxfree(ZipParser);
end;
else call missing(ZipPattern,ZipParser,ZipMatch);
if sum(CityMatch,StateMatch,ZipMatch) then output;
rc = myiter.next();
end;
run;
... View more