Hello,
@Astounding and others (@art297, @Tom, @ballardw, to name a few) have been super helpful in helping me through the data cleaning process in SAS. My program looks good, and I can finally run a regression using the data set we've created.
I thought I had all of the control variables I needed in ceo_firm, my merged ceo compensation and firm performance dataset, but it looks like I only used some of the variables I thought I needed from the firm performance data. Namely, I need to control for firm size with a variable such as market cap or trading volume, and these are likely in earlier iterations of the CRSP dataset.
Here's my question: is there a way to merge in a single variable from an earlier dataset to my ceo_firm dataset? Like to essentially single the variable out and include it? Or would this require going back and revising the earlier actions taken?
Here's my program:
libname paper "~/425/425_Final_Paper";
*import compustat dataset;
PROC IMPORT out=paper.CompuStat_Execucomp
datafile="~/425/425_Final_Paper/CompuStat_Execucomp.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=2000;
run;
*keep only CEOs;
data paper.Compustat_ExecuComp2;
set paper.Compustat_ExecuComp;
if CEOANN='CEO';
run;
*** extra DATA step for checking previous data step results by comparing number of records selected;
data check_CEOANN;
set paper.Compustat_ExecuComp;
if CEOANN=: 'CEO';
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*import csrp dataset;
PROC IMPORT out=paper.CSRP_Monthly_Stock_char
datafile="~/425/425_Final_Paper/CSRP_MonthlyStock_char.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=max;
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
*remove bad data;
data paper.CSRP_Monthly_Stock_char2;
set paper.CSRP_Monthly_Stock_char (rename=(ret=character_ret));
drop character_ret;
if cusip=' ' then
delete;
ret=input(character_ret, ??8.);
if ret=. then
delete;
date=input(put(date, z8.), yymmdd8.);
format date yymmdd10.;
year=year(date);
month=month(date);
if cusip=: '?' then
cusip=substr(cusip, 2);
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
proc contents data=paper.CSRP_Monthly_Stock_char2;
run;
proc contents data=paper.multiple_CEOs;
run;
proc sort data=paper.CSRP_Monthly_Stock_char2;
by ticker year;
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*Remove all bad years from both data sources;
data paper.Compustat_ExecuComp3;
length ticker $5;
merge paper.Compustat_ExecuComp2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
data paper.CSRP_Monthly_Stock_char3;
merge paper.CSRP_Monthly_Stock_char2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
*find additional bad data: multiple return records for the same month/year;
proc freq data=paper.CSRP_Monthly_Stock_char3;
tables ticker * year * month / noprint
out=paper.multiple_returns (where=(count > 1));
run;
*Remove all matching year data for multiple returns;
proc sort data=paper.multiple_returns out=multiple_returns (keep=ticker year)
NODUPKEY;
by ticker year;
run;
data paper.Compustat_ExecuComp4;
merge paper.Compustat_ExecuComp3 (in=keepme)
multiple_returns (in=had_multiple_returns);
by ticker year;
if keepme;
if had_multiple_returns then
delete;
run;
data paper.CSRP_Monthly_Stock_char4;
length ticker $5;
merge paper.CSRP_Monthly_Stock_char3
multiple_returns (in=had_multiple_returns);
by ticker year;
if had_multiple_returns then
delete;
run;
proc contents data=paper.CSRP_Monthly_Stock_char3;
run;
proc contents data=paper.CSRP_Monthly_Stock_char4;
run;
*create new variable annualized growth;
data paper.CSRP_annual_returns;
set paper.CSRP_Monthly_Stock_char4;
by ticker year;
retain annual_return 1;
annual_return=annual_return * (1 + RET);
if month=12 or last.ticker;
annual_return=(annual_return - 1) * 100;
output;
annual_return=1;
keep ticker year annual_return;
run;
*use proc contents to see if there is a type mismatch;
proc contents data=paper.CSRP_annual_returns;
run;
proc contents data=paper.Compustat_ExecuComp4;
run;
*MERGE statement has more than one data set with repeats of BY values;
proc sort data=paper.CSRP_annual_returns out=paper.CSRP_annual_returns2
nodupkey;
by ticker;
run;
*Merge CEO data and firm data;
DATA paper.ceo_firm;
length ticker $5;
merge paper.CSRP_annual_returns2 (in=in1) paper.compustat_execucomp4 (in=in2);
by ticker;
if in1 and in2;
run;
proc contents data=paper.ceo_firm;
run;
*remove missing return data;
data paper.ceo_firm;
set paper.ceo_firm;
if annual_return=. then
delete;
run;
proc means data=paper.CSRP_Monthly_Stock_char3 n nmiss;
var ret;
run;
proc means data=paper.Compustat_ExecuComp4 n nmiss;
var salary;
run;
proc means data=paper.CSRP_annual_returns n nmiss;
var annual_return;
run;
data paper.ceo_firm;
set paper.ceo_firm (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS));
BONUS=input(_BONUS, ?? 8.);
STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.);
run;
*create dummy variable that indicates whether the firm's market value increases in a given year;
data paper.ceo_firm;
set paper.ceo_firm;
by ticker year;
last_return=lag(annual_return);
if not first.ticker then
increase=(annual_return > last_return);
else
increase=.;
run;
data paper.ceo_firm;
set paper.ceo_firm;
if gender='MALE' then male=1; else male=0;
run;
Any suggestions are appreciated!
-SAStuck
Assuming you want to add one more variable from paper.<crsp_table> to last created paper.ceo_firm
and that both tables are already sorted by tickr year then:
proc sql;
create table want as
select a.* , b.<varname>
from paper.ceo_firm as a
left join paper.<crsp table>
on a.ticker = b.ticker and
a.year = b.year;
quit;
check number of final output observations - it shall be the same as in paper.ceo_firm.
@Shmuel, thanks for the message! Any idea what's going on here?
*bring variable in from other data set;
proc sql;
create table want as
select a.* , b.<VOL>
from paper.ceo_firm as a
left join paper.CSRP_Monthly_Stock_char4
on a.ticker = b.ticker and
a.year = b.year;
quit;
Here's the log:
1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK; 70 71 *bring variable in from other data set; 72 proc sql; 73 create table want as 74 select a.* , b.<VOL> _ 22 ERROR 22-322: Syntax error, expecting one of the following: a name, *. 75 from paper.ceo_firm as a _____ 22 ERROR 22-322: Syntax error, expecting one of the following: !, !!, &, (, *, **, +, ',', -, /, <, <=, <>, =, >, >=, AND, EQ, EQT, FROM, GE, GET, GT, GTT, LE, LET, LT, LTT, NE, NET, OR, ^=, |, ||, ~=. 76 left join paper.CSRP_Monthly_Stock_char4 ____ 22 76 ERROR 22-322: Syntax error, expecting one of the following: a quoted string, ',', AS, FORMAT, FROM, INFORMAT, INTO, LABEL, LEN, LENGTH, TRANSCODE. ERROR 76-322: Syntax error, statement will be ignored. 77 on a.ticker = b.ticker and 78 a.year = b.year; NOTE: PROC SQL set option NOEXEC and will continue to check the syntax of statements. 79 quit; NOTE: The SAS System stopped processing this step because of errors. NOTE: PROCEDURE SQL used (Total process time): real time 0.00 seconds user cpu time 0.00 seconds system cpu time 0.01 seconds memory 49.00k OS Memory 23460.00k Timestamp 04/11/2018 06:02:30 PM Step Count 53 Switch Count 0 Page Faults 0 Page Reclaims 15 Page Swaps 0 Voluntary Context Switches 0 Involuntary Context Switches 0 Block Input Operations 0 Block Output Operations 0 80 81 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK; 93
Drop the < > signs:
*bring variable in from other data set;
proc sql;
create table want as
select a.* , b.VOL
from paper.ceo_firm as a
left join paper.CSRP_Monthly_Stock_char4
on a.ticker = b.ticker and
a.year = b.year;
quit;
Hey, i'm still having some issues here:
1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK; 70 71 *bring variable in from other data set; 72 proc sql; 73 create table want as 74 select a.* , b.VOL 75 from paper.ceo_firm as a 76 left join paper.CSRP_Monthly_Stock_char 77 on a.ticker = b.ticker and 78 a.year = b.year; ERROR: Unresolved reference to table/correlation name b. ERROR: Unresolved reference to table/correlation name b. ERROR: Unresolved reference to table/correlation name b. ERROR: Expression using equals (=) has components that are of different data types. NOTE: PROC SQL set option NOEXEC and will continue to check the syntax of statements. 79 quit; NOTE: The SAS System stopped processing this step because of errors. NOTE: PROCEDURE SQL used (Total process time): real time 0.00 seconds user cpu time 0.01 seconds system cpu time 0.00 seconds memory 8607.00k OS Memory 35504.00k Timestamp 04/12/2018 12:09:02 AM Step Count 29 Switch Count 0 Page Faults 0 Page Reclaims 536 Page Swaps 0 Voluntary Context Switches 10 Involuntary Context Switches 0 Block Input Operations 0 Block Output Operations 8 80 81 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK; 93
Sory, it's my typo:
*bring variable in from other data set;
proc sql;
create table want as
select a.* , b.VOL
from paper.ceo_firm as a
left join paper.CSRP_Monthly_Stock_char4 as b
on a.ticker = b.ticker and
a.year = b.year;
quit;
Thanks for the correction. Based on this error:
1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK; 70 71 proc reg data=paper.ceo_firm plots(maxpoints=none); 72 model salary=annual_return bonus stock_awards VOL; ERROR: Variable VOL not found. NOTE: The previous statement has been deleted. 73 run; WARNING: No variables specified for an SSCP matrix. Execution terminating. NOTE: PROCEDURE REG used (Total process time): real time 0.00 seconds user cpu time 0.01 seconds system cpu time 0.00 seconds memory 3410.21k OS Memory 32960.00k Timestamp 04/12/2018 10:59:44 AM Step Count 41 Switch Count 0 Page Faults 0 Page Reclaims 482 Page Swaps 0 Voluntary Context Switches 6 Involuntary Context Switches 0 Block Input Operations 0 Block Output Operations 48 74 75 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK; 87
Do I need to call the table ceo_firm? Or reference work.want in my model statement instead of ceo_firm? Thanks!
Check your logic - which file includes the VOL variable.
If you ran the code the same as I wrote it then you added it to WANT.
If you prefer use another name, addapt the code.
Thanks for the headsup. This is something ive been wondering about actually: what is work.want? Or just want? Is this like a shorthand sort of naming convention for data sets when one doesn't know the specific name? Just wondering.
Additionally,
Somehow my number of observations has skyrocketed since adding this code. My proc reg had been reading 12,000-some observations and now it is reading 142,234. Is there any explanation for this?
Thanks again!
When I started help within the forum I found others using table names like HAVE for input and WANT for desired output
and accpeted it as very convenient.
The difference in number of observations means that the combination of ticker and year are not uniqe.
SQL join makes a cartesin join thus multiplies output.
If you know what other variables make it unique - add them to the conditions:
....
left join paper.CSRP_Monthly_Stock_char4 as b
on a.ticker = b.ticker and
a.year = b.year and
a.new_key = b.new_key; /* adapt name and add more lines as need */
@Shmuel, is there any chance that the addition of this code into my program would push me over my storage quota? My home directory is full and now I need to delete files. This wasn't an issue until I ran this code. Maybe it's a coincidence? I'm just wondering if there is something about the nature of this code or the amount of observations it pulled into my dataset that could result in this. I believe I am using a sort of university/free version of SAS for academics . . . something along those lines. Thanks!
The last dataset in your original code was paper.ceo_firm.
Have you changed the output dataset name (instead work.want) to a new name or named it again as paper.ceo_firm ?
Usually, I prefer that each step creates a unique name (new one).
When you use the same dataset name again and again and you need to change the code you have to run the code from start,
otherwise you can run only those steps starting the changed code.
You asked -
is there any chance that the addition of this code into my program would push me over my storage quota?
the answer depends on your physical storage, but the real question should be - do you realy need the cartesian multiply of observations? you wanted to add just one variable not add observations ?!
I suggest:
1) check again is there other variable(s) to make the matching unique ?
2) Fix the code to use unique matching variables and run that part that creates paper.ceo_firm dataset.
You can submit selected steps. No must to run all code from start.
I'm afraid i'm having trouble wrapping my head around what you mean by unique . . . this whole program essentially has revolved around the combination of ticker and year. I'm not sure why "VOL" for ticker and year would not be unique. Unless you have another suggestion, is there a way to undo this and recreate the ceo_firm data set as it was? With the earlier amount of observations? At this point i'm just concerned with preserving the data. Unfortunately, overwriting datasets unintentionally has been a theme of mine during this go-round.
In the next code I have copied that part of your code (line 160 on) to recreate paper.ceo_firm,
merged few steps to one, added code to check duplicates and code to add the
required variable to above dataset.
read carefully my notes in between /*...*/.
You can submit step by step to run and check each that ran OK.
Wish you success.
/********************************************************************/ /* START RERUN TO REPRODUCE PAPER.CEO_FIRM DATASET */ /********************************************************************/ /* IF YOU HAVE STORAGE ISSUE DELETE THE LAST CREATED PAPER.CEO_FIRM */ proc datasets lib=paper; delete ceo_firm nolist; quit; /* ============= RECREATE PAPER.CEO_FIRM ========================== */ *Merge CEO data and firm data; DATA paper.ceo_firm_1; length ticker $5; merge paper.CSRP_annual_returns2 (in=in1) paper.compustat_execucomp4 (in=in2); by ticker; if in1 and in2; run; *remove missing return data; *create dummy variable that indicates whether the firm's market value increases in a given year; data paper.ceo_firm_2; set paper.ceo_firm_1 (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS)); BONUS=input(_BONUS, ?? 8.); STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.); if annual_return=. then delete; run; data paper.ceo_firm; set paper.ceo_firm_temp_2; by ticker year; last_return=lag(annual_return); if not first.ticker then increase=(annual_return > last_return); else increase=.; if gender='MALE' then male=1; else male=0; run; /* DELETE TEMPORARY DATASETS */ proc datasets lib=paper; delete ceo_firm_1 ceo_firm_2 nolist; quit; /* CHECK FOR DUPLICATES */ data is_dup; set paper.CSRP_Monthly_Stock_char4; /* dataset origin containing var to add VOL ? */ by ticker year; if not (firt.year and last.year); run; * are there duplicates ? ; * if positive sort the dataset with NODUPKEY option ; * alternativeley add more key variables to to join in next sql; proc sort data=paper.CSRP_Monthly_Stock_char4 (keep ticker year VOL) /* replace VOL to the required var */ out=crsp_nodup NODUPKEY; by ticker year; run; * add the required var to paper.ceo_firm dataset ; *bring variable in from other data set; proc sql; create table want as select a.* , b.VOL from paper.ceo_firm as a left join paper.
crsp_nodup /*
CSRP_Monthly_Stock_char4*/ as b
on a.ticker = b.ticker
and a.year = b.year
/* and a.key = b.key ?????? */
; quit;
Hey @Shmuel, thanks for the revised program.
Here are some issues I ran into:
1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK; 70 71 /********************************************************************/ 72 /* START RERUN TO REPRODUCE PAPER.CEO_FIRM DATASET */ 73 /********************************************************************/ 74 75 /* IF YOU HAVE STORAGE ISSUE DELETE THE LAST CREATED PAPER.CEO_FIRM */ 76 proc datasets lib=paper; 76 ! delete ceo_firm nolist; quit; NOTE: The file PAPER.NOLIST (memtype=DATA) was not found, but appears on a DELETE statement. NOTE: Deleting PAPER.CEO_FIRM (memtype=DATA). NOTE: PROCEDURE DATASETS used (Total process time): real time 0.05 seconds user cpu time 0.05 seconds system cpu time 0.00 seconds memory 2160.40k OS Memory 28072.00k Timestamp 04/14/2018 02:44:04 PM Step Count 23 Switch Count 1 Page Faults 0 Page Reclaims 1090 Page Swaps 0 Voluntary Context Switches 25 Involuntary Context Switches 0 Block Input Operations 32 Block Output Operations 16 77 78 /* ============= RECREATE PAPER.CEO_FIRM ========================== */ 79 80 *Merge CEO data and firm data; 81 DATA paper.ceo_firm_1; 82 length ticker $5; 83 merge paper.CSRP_annual_returns2 (in=in1) 84 paper.compustat_execucomp4 (in=in2); 85 by ticker; 86 87 if in1 and in2; 88 run; NOTE: There were 10586 observations read from the data set PAPER.CSRP_ANNUAL_RETURNS2. NOTE: There were 13355 observations read from the data set PAPER.COMPUSTAT_EXECUCOMP4. NOTE: The data set PAPER.CEO_FIRM_1 has 12727 observations and 108 variables. NOTE: DATA statement used (Total process time): real time 0.13 seconds user cpu time 0.02 seconds system cpu time 0.05 seconds memory 4219.21k OS Memory 33968.00k Timestamp 04/14/2018 02:44:04 PM Step Count 24 Switch Count 5 Page Faults 0 Page Reclaims 1136 Page Swaps 0 Voluntary Context Switches 1072 Involuntary Context Switches 3 Block Input Operations 55104 Block Output Operations 51976 89 90 *remove missing return data; 91 *create dummy variable that indicates whether the firm's market value increases in a given year; 92 93 data paper.ceo_firm_2; 94 set paper.ceo_firm_1 (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS)); 95 BONUS=input(_BONUS, ?? 8.); 96 STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.); 97 98 if annual_return=. then 99 delete; 100 run; NOTE: There were 12727 observations read from the data set PAPER.CEO_FIRM_1. NOTE: The data set PAPER.CEO_FIRM_2 has 12727 observations and 110 variables. NOTE: DATA statement used (Total process time): real time 0.12 seconds user cpu time 0.01 seconds system cpu time 0.03 seconds memory 3640.34k OS Memory 33452.00k Timestamp 04/14/2018 02:44:04 PM Step Count 25 Switch Count 1 Page Faults 0 Page Reclaims 566 Page Swaps 0 Voluntary Context Switches 1082 Involuntary Context Switches 1 Block Input Operations 52000 Block Output Operations 52744 101 102 data paper.ceo_firm; 103 set paper.ceo_firm_temp_2; ERROR: File PAPER.CEO_FIRM_TEMP_2.DATA does not exist. 104 by ticker year; 105 last_return=lag(annual_return); 106 107 if not first.ticker then 108 increase=(annual_return > last_return); 109 else 110 increase=.; 111 112 if gender='MALE' then male=1; else male=0; 113 run; NOTE: The SAS System stopped processing this step because of errors. WARNING: The data set PAPER.CEO_FIRM may be incomplete. When this step was stopped there were 0 observations and 5 variables. NOTE: DATA statement used (Total process time): real time 0.01 seconds user cpu time 0.00 seconds system cpu time 0.00 seconds memory 498.06k OS Memory 30632.00k Timestamp 04/14/2018 02:44:04 PM Step Count 26 Switch Count 1 Page Faults 0 Page Reclaims 101 Page Swaps 0 Voluntary Context Switches 30 Involuntary Context Switches 0 Block Input Operations 0 Block Output Operations 264 114 115 /* DELETE TEMPORARY DATASETS */ 116 proc datasets lib=paper; 116 ! delete ceo_firm_1 ceo_firm_2 nolist; quit; NOTE: The file PAPER.NOLIST (memtype=DATA) was not found, but appears on a DELETE statement. NOTE: Deleting PAPER.CEO_FIRM_1 (memtype=DATA). NOTE: Deleting PAPER.CEO_FIRM_2 (memtype=DATA). NOTE: PROCEDURE DATASETS used (Total process time): real time 0.05 seconds user cpu time 0.04 seconds system cpu time 0.01 seconds memory 636.75k OS Memory 30888.00k Timestamp 04/14/2018 02:44:04 PM Step Count 27 Switch Count 2 Page Faults 0 Page Reclaims 103 Page Swaps 0 Voluntary Context Switches 50 Involuntary Context Switches 0 Block Input Operations 32 Block Output Operations 24 117 118 /* CHECK FOR DUPLICATES */ 119 data is_dup; 120 set paper.CSRP_Monthly_Stock_char4; /* dataset origin containing var to add VOL ? */ 121 by ticker year; 122 if not (firt.year and last.year); _________ 557 ERROR: DATA STEP Component Object failure. Aborted during the COMPILATION phase. ERROR 557-185: Variable firt is not an object. NOTE: The SAS System stopped processing this step because of errors. NOTE: DATA statement used (Total process time): real time 0.00 seconds user cpu time 0.00 seconds system cpu time 0.00 seconds memory 1979.90k OS Memory 31656.00k Timestamp 04/14/2018 02:44:04 PM Step Count 28 Switch Count 0 Page Faults 0 Page Reclaims 257 Page Swaps 0 Voluntary Context Switches 9 Involuntary Context Switches 0 Block Input Operations 288 Block Output Operations 0 123 run; 124 125 * are there duplicates ? ; 126 * if positive sort the dataset with NODUPKEY option ; 127 * alternativeley add more key variables to to join in next sql; 128 129 proc sort data=paper.CSRP_Monthly_Stock_char4 (keep ticker year VOL) ____ ______ 12 12 ERROR 12-63: Missing '=' for option KEEP. 130 /* replace VOL to the required var */ 131 out=crsp_nodup NODUPKEY; 132 by ticker year; 133 run; NOTE: The SAS System stopped processing this step because of errors. WARNING: The data set WORK.CRSP_NODUP may be incomplete. When this step was stopped there were 0 observations and 0 variables. NOTE: PROCEDURE SORT used (Total process time): real time 0.00 seconds user cpu time 0.00 seconds system cpu time 0.00 seconds memory 1658.37k OS Memory 31656.00k Timestamp 04/14/2018 02:44:04 PM Step Count 29 Switch Count 1 Page Faults 0 Page Reclaims 265 Page Swaps 0 Voluntary Context Switches 8 Involuntary Context Switches 0 Block Input Operations 0 Block Output Operations 160 134 135 * add the required var to paper.ceo_firm dataset ; 136 *bring variable in from other data set; 137 proc sql; 138 create table want as 139 select a.* , b.VOL 140 from paper.ceo_firm as a 141 left join paper.crsp_nodup /*CSRP_Monthly_Stock_char4*/ as b on a.ticker = b.ticker and a.year = b.year 141 ! /* and a.key = b.key ?????? */ ; quit; ERROR: File PAPER.CRSP_NODUP.DATA does not exist. NOTE: PROC SQL set option NOEXEC and will continue to check the syntax of statements. NOTE: The SAS System stopped processing this step because of errors. NOTE: PROCEDURE SQL used (Total process time): real time 0.00 seconds user cpu time 0.00 seconds system cpu time 0.01 seconds memory 323.43k OS Memory 30888.00k Timestamp 04/14/2018 02:44:04 PM Step Count 30 Switch Count 0 Page Faults 0 Page Reclaims 84 Page Swaps 0 Voluntary Context Switches 12 Involuntary Context Switches 0 Block Input Operations 288 Block Output Operations 0 142 143 144 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK; 156
Here is the code again:
libname paper "~/425/425_Final_Paper";
*import compustat dataset;
PROC IMPORT out=paper.CompuStat_Execucomp
datafile="~/425/425_Final_Paper/CompuStat_Execucomp.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=2000;
run;
*keep only CEOs;
data paper.Compustat_ExecuComp2;
set paper.Compustat_ExecuComp;
if CEOANN='CEO';
run;
*** extra DATA step for checking previous data step results by comparing number of records selected;
data check_CEOANN;
set paper.Compustat_ExecuComp;
if CEOANN=: 'CEO';
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*import csrp dataset;
PROC IMPORT out=paper.CSRP_Monthly_Stock_char
datafile="~/425/425_Final_Paper/CSRP_MonthlyStock_char.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=max;
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
*remove bad data;
data paper.CSRP_Monthly_Stock_char2;
set paper.CSRP_Monthly_Stock_char (rename=(ret=character_ret));
drop character_ret;
if cusip=' ' then
delete;
ret=input(character_ret, ??8.);
if ret=. then
delete;
date=input(put(date, z8.), yymmdd8.);
format date yymmdd10.;
year=year(date);
month=month(date);
if cusip=: '?' then
cusip=substr(cusip, 2);
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
proc contents data=paper.CSRP_Monthly_Stock_char2;
run;
proc contents data=paper.multiple_CEOs;
run;
proc sort data=paper.CSRP_Monthly_Stock_char2;
by ticker year;
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*Remove all bad years from both data sources;
data paper.Compustat_ExecuComp3;
length ticker $5;
merge paper.Compustat_ExecuComp2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
data paper.CSRP_Monthly_Stock_char3;
merge paper.CSRP_Monthly_Stock_char2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
*find additional bad data: multiple return records for the same month/year;
proc freq data=paper.CSRP_Monthly_Stock_char3;
tables ticker * year * month / noprint
out=paper.multiple_returns (where=(count > 1));
run;
*Remove all matching year data for multiple returns;
proc sort data=paper.multiple_returns out=multiple_returns (keep=ticker year)
NODUPKEY;
by ticker year;
run;
data paper.Compustat_ExecuComp4;
merge paper.Compustat_ExecuComp3 (in=keepme)
multiple_returns (in=had_multiple_returns);
by ticker year;
if keepme;
if had_multiple_returns then
delete;
run;
data paper.CSRP_Monthly_Stock_char4;
length ticker $5;
merge paper.CSRP_Monthly_Stock_char3
multiple_returns (in=had_multiple_returns);
by ticker year;
if had_multiple_returns then
delete;
run;
proc contents data=paper.CSRP_Monthly_Stock_char3;
run;
proc contents data=paper.CSRP_Monthly_Stock_char4;
run;
*create new variable annualized growth;
data paper.CSRP_annual_returns;
set paper.CSRP_Monthly_Stock_char4;
by ticker year;
retain annual_return 1;
annual_return=annual_return * (1 + RET);
if month=12 or last.ticker;
annual_return=(annual_return - 1) * 100;
output;
annual_return=1;
keep ticker year annual_return;
run;
*use proc contents to see if there is a type mismatch;
proc contents data=paper.CSRP_annual_returns;
run;
proc contents data=paper.Compustat_ExecuComp4;
run;
*MERGE statement has more than one data set with repeats of BY values;
proc sort data=paper.CSRP_annual_returns out=paper.CSRP_annual_returns2
nodupkey;
by ticker;
run;
/********************************************************************/
/* START RERUN TO REPRODUCE PAPER.CEO_FIRM DATASET */
/********************************************************************/
/* IF YOU HAVE STORAGE ISSUE DELETE THE LAST CREATED PAPER.CEO_FIRM */
proc datasets lib=paper; delete ceo_firm nolist; quit;
/* ============= RECREATE PAPER.CEO_FIRM ========================== */
*Merge CEO data and firm data;
DATA paper.ceo_firm_1;
length ticker $5;
merge paper.CSRP_annual_returns2 (in=in1)
paper.compustat_execucomp4 (in=in2);
by ticker;
if in1 and in2;
run;
*remove missing return data;
*create dummy variable that indicates whether the firm's market value increases in a given year;
data paper.ceo_firm_2;
set paper.ceo_firm_1 (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS));
BONUS=input(_BONUS, ?? 8.);
STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.);
if annual_return=. then
delete;
run;
data paper.ceo_firm;
set paper.ceo_firm_temp_2;
by ticker year;
last_return=lag(annual_return);
if not first.ticker then
increase=(annual_return > last_return);
else
increase=.;
if gender='MALE' then male=1; else male=0;
run;
/* DELETE TEMPORARY DATASETS */
proc datasets lib=paper; delete ceo_firm_1 ceo_firm_2 nolist; quit;
/* CHECK FOR DUPLICATES */
data is_dup;
set paper.CSRP_Monthly_Stock_char4; /* dataset origin containing var to add VOL ? */
by ticker year;
if not (firt.year and last.year);
run;
* are there duplicates ? ;
* if positive sort the dataset with NODUPKEY option ;
* alternativeley add more key variables to to join in next sql;
proc sort data=paper.CSRP_Monthly_Stock_char4 (keep ticker year VOL)
/* replace VOL to the required var */
out=crsp_nodup NODUPKEY;
by ticker year;
run;
* add the required var to paper.ceo_firm dataset ;
*bring variable in from other data set;
proc sql;
create table want as
select a.* , b.VOL
from paper.ceo_firm as a
left join paper.crsp_nodup /*CSRP_Monthly_Stock_char4*/ as b on a.ticker = b.ticker and a.year = b.year /* and a.key = b.key ?????? */ ; quit;
I really appreciate the help!
Good news: We've extended SAS Hackathon registration until Sept. 12, so you still have time to be part of our biggest event yet – our five-year anniversary!
What’s the difference between SAS Enterprise Guide and SAS Studio? How are they similar? Just ask SAS’ Danny Modlin.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.