/* Temp data set has 2594 unique names (i.e., 2594 unique observations) */
/* Create a table for matching names and retrieve firm ID (permno) from CSRP database */
/* Permno is the best way to search CRSP database. */
/* Data set Matches has 1277 observations */
/* Bobnamesoriginal is the company name I was given by my coauthor. */
/* Bobnamesnew is the company name after I clean the name to increase chances of */
/* finding a match in the CSRP database. */
/* Company_Name_Header is the name of the company inf the CSRP database. */
proc sql;
create table Matches as select b.permno, a.bobnamesnew, b.company_name_header,
a.bobnamesoriginal from work.Temp as a inner join names.crspnames as b on
a.bobnamesnew=b.company_name_header;
quit;
/* searching for duplicates */
/* No duplicates found */
/* 1277 unique permno */
proc sort data=matches nodupkey;
by permno;
run;
/* Create a table for nonmatches */
/* Since temp has 2594 unique names and I was able to match 1277 names, */
/* there should be 1317 = 2594 - 1277 nonmatch names */
/* However, the nonmatches data set has 1375 observations/nonmatches */
/* What am I doing wrong? I can't figure it out. */
proc sql;
create table nonmatches as select a.bobnamesnew from work.temp as a left join
names.crspnames as b on a.bobnamesnew=b.company_name_header where
b.company_name_header is null;
quit;
/* searching for duplicates */
/* No duplicates found */
/* Nonmatches still has 1375 observations. */
proc sort data=nonmatches nodupkey;
by bobnamesnew;
run;