Hello,
@Astounding and others (@art297, @Tom, @ballardw, to name a few) have been super helpful in helping me through the data cleaning process in SAS. My program looks good, and I can finally run a regression using the data set we've created.
I thought I had all of the control variables I needed in ceo_firm, my merged ceo compensation and firm performance dataset, but it looks like I only used some of the variables I thought I needed from the firm performance data. Namely, I need to control for firm size with a variable such as market cap or trading volume, and these are likely in earlier iterations of the CRSP dataset.
Here's my question: is there a way to merge in a single variable from an earlier dataset to my ceo_firm dataset? Like to essentially single the variable out and include it? Or would this require going back and revising the earlier actions taken?
Here's my program:
libname paper "~/425/425_Final_Paper";
*import compustat dataset;
PROC IMPORT out=paper.CompuStat_Execucomp
datafile="~/425/425_Final_Paper/CompuStat_Execucomp.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=2000;
run;
*keep only CEOs;
data paper.Compustat_ExecuComp2;
set paper.Compustat_ExecuComp;
if CEOANN='CEO';
run;
*** extra DATA step for checking previous data step results by comparing number of records selected;
data check_CEOANN;
set paper.Compustat_ExecuComp;
if CEOANN=: 'CEO';
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*import csrp dataset;
PROC IMPORT out=paper.CSRP_Monthly_Stock_char
datafile="~/425/425_Final_Paper/CSRP_MonthlyStock_char.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=max;
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
*remove bad data;
data paper.CSRP_Monthly_Stock_char2;
set paper.CSRP_Monthly_Stock_char (rename=(ret=character_ret));
drop character_ret;
if cusip=' ' then
delete;
ret=input(character_ret, ??8.);
if ret=. then
delete;
date=input(put(date, z8.), yymmdd8.);
format date yymmdd10.;
year=year(date);
month=month(date);
if cusip=: '?' then
cusip=substr(cusip, 2);
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
proc contents data=paper.CSRP_Monthly_Stock_char2;
run;
proc contents data=paper.multiple_CEOs;
run;
proc sort data=paper.CSRP_Monthly_Stock_char2;
by ticker year;
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*Remove all bad years from both data sources;
data paper.Compustat_ExecuComp3;
length ticker $5;
merge paper.Compustat_ExecuComp2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
data paper.CSRP_Monthly_Stock_char3;
merge paper.CSRP_Monthly_Stock_char2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
*find additional bad data: multiple return records for the same month/year;
proc freq data=paper.CSRP_Monthly_Stock_char3;
tables ticker * year * month / noprint
out=paper.multiple_returns (where=(count > 1));
run;
*Remove all matching year data for multiple returns;
proc sort data=paper.multiple_returns out=multiple_returns (keep=ticker year)
NODUPKEY;
by ticker year;
run;
data paper.Compustat_ExecuComp4;
merge paper.Compustat_ExecuComp3 (in=keepme)
multiple_returns (in=had_multiple_returns);
by ticker year;
if keepme;
if had_multiple_returns then
delete;
run;
data paper.CSRP_Monthly_Stock_char4;
length ticker $5;
merge paper.CSRP_Monthly_Stock_char3
multiple_returns (in=had_multiple_returns);
by ticker year;
if had_multiple_returns then
delete;
run;
proc contents data=paper.CSRP_Monthly_Stock_char3;
run;
proc contents data=paper.CSRP_Monthly_Stock_char4;
run;
*create new variable annualized growth;
data paper.CSRP_annual_returns;
set paper.CSRP_Monthly_Stock_char4;
by ticker year;
retain annual_return 1;
annual_return=annual_return * (1 + RET);
if month=12 or last.ticker;
annual_return=(annual_return - 1) * 100;
output;
annual_return=1;
keep ticker year annual_return;
run;
*use proc contents to see if there is a type mismatch;
proc contents data=paper.CSRP_annual_returns;
run;
proc contents data=paper.Compustat_ExecuComp4;
run;
*MERGE statement has more than one data set with repeats of BY values;
proc sort data=paper.CSRP_annual_returns out=paper.CSRP_annual_returns2
nodupkey;
by ticker;
run;
*Merge CEO data and firm data;
DATA paper.ceo_firm;
length ticker $5;
merge paper.CSRP_annual_returns2 (in=in1) paper.compustat_execucomp4 (in=in2);
by ticker;
if in1 and in2;
run;
proc contents data=paper.ceo_firm;
run;
*remove missing return data;
data paper.ceo_firm;
set paper.ceo_firm;
if annual_return=. then
delete;
run;
proc means data=paper.CSRP_Monthly_Stock_char3 n nmiss;
var ret;
run;
proc means data=paper.Compustat_ExecuComp4 n nmiss;
var salary;
run;
proc means data=paper.CSRP_annual_returns n nmiss;
var annual_return;
run;
data paper.ceo_firm;
set paper.ceo_firm (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS));
BONUS=input(_BONUS, ?? 8.);
STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.);
run;
*create dummy variable that indicates whether the firm's market value increases in a given year;
data paper.ceo_firm;
set paper.ceo_firm;
by ticker year;
last_return=lag(annual_return);
if not first.ticker then
increase=(annual_return > last_return);
else
increase=.;
run;
data paper.ceo_firm;
set paper.ceo_firm;
if gender='MALE' then male=1; else male=0;
run;
Any suggestions are appreciated!
-SAStuck
Assuming you want to add one more variable from paper.<crsp_table> to last created paper.ceo_firm
and that both tables are already sorted by tickr year then:
proc sql;
create table want as
select a.* , b.<varname>
from paper.ceo_firm as a
left join paper.<crsp table>
on a.ticker = b.ticker and
a.year = b.year;
quit;
check number of final output observations - it shall be the same as in paper.ceo_firm.
@Shmuel, thanks for the message! Any idea what's going on here?
*bring variable in from other data set;
proc sql;
create table want as
select a.* , b.<VOL>
from paper.ceo_firm as a
left join paper.CSRP_Monthly_Stock_char4
on a.ticker = b.ticker and
a.year = b.year;
quit;Here's the log:
1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
70
71 *bring variable in from other data set;
72 proc sql;
73 create table want as
74 select a.* , b.<VOL>
_
22
ERROR 22-322: Syntax error, expecting one of the following: a name, *.
75 from paper.ceo_firm as a
_____
22
ERROR 22-322: Syntax error, expecting one of the following: !, !!, &, (, *, **, +, ',', -, /, <, <=, <>, =, >, >=, AND, EQ, EQT,
FROM, GE, GET, GT, GTT, LE, LET, LT, LTT, NE, NET, OR, ^=, |, ||, ~=.
76 left join paper.CSRP_Monthly_Stock_char4
____
22
76
ERROR 22-322: Syntax error, expecting one of the following: a quoted string, ',', AS, FORMAT, FROM, INFORMAT, INTO, LABEL, LEN,
LENGTH, TRANSCODE.
ERROR 76-322: Syntax error, statement will be ignored.
77 on a.ticker = b.ticker and
78 a.year = b.year;
NOTE: PROC SQL set option NOEXEC and will continue to check the syntax of statements.
79 quit;
NOTE: The SAS System stopped processing this step because of errors.
NOTE: PROCEDURE SQL used (Total process time):
real time 0.00 seconds
user cpu time 0.00 seconds
system cpu time 0.01 seconds
memory 49.00k
OS Memory 23460.00k
Timestamp 04/11/2018 06:02:30 PM
Step Count 53 Switch Count 0
Page Faults 0
Page Reclaims 15
Page Swaps 0
Voluntary Context Switches 0
Involuntary Context Switches 0
Block Input Operations 0
Block Output Operations 0
80
81 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
93
Drop the < > signs:
*bring variable in from other data set;
proc sql;
create table want as
select a.* , b.VOL
from paper.ceo_firm as a
left join paper.CSRP_Monthly_Stock_char4
on a.ticker = b.ticker and
a.year = b.year;
quit;
Hey, i'm still having some issues here:
1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
70
71 *bring variable in from other data set;
72 proc sql;
73 create table want as
74 select a.* , b.VOL
75 from paper.ceo_firm as a
76 left join paper.CSRP_Monthly_Stock_char
77 on a.ticker = b.ticker and
78 a.year = b.year;
ERROR: Unresolved reference to table/correlation name b.
ERROR: Unresolved reference to table/correlation name b.
ERROR: Unresolved reference to table/correlation name b.
ERROR: Expression using equals (=) has components that are of different data types.
NOTE: PROC SQL set option NOEXEC and will continue to check the syntax of statements.
79 quit;
NOTE: The SAS System stopped processing this step because of errors.
NOTE: PROCEDURE SQL used (Total process time):
real time 0.00 seconds
user cpu time 0.01 seconds
system cpu time 0.00 seconds
memory 8607.00k
OS Memory 35504.00k
Timestamp 04/12/2018 12:09:02 AM
Step Count 29 Switch Count 0
Page Faults 0
Page Reclaims 536
Page Swaps 0
Voluntary Context Switches 10
Involuntary Context Switches 0
Block Input Operations 0
Block Output Operations 8
80
81 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
93
Sory, it's my typo:
*bring variable in from other data set;
proc sql;
create table want as
select a.* , b.VOL
from paper.ceo_firm as a
left join paper.CSRP_Monthly_Stock_char4 as b
on a.ticker = b.ticker and
a.year = b.year;
quit;
Thanks for the correction. Based on this error:
1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
70
71 proc reg data=paper.ceo_firm plots(maxpoints=none);
72 model salary=annual_return bonus stock_awards VOL;
ERROR: Variable VOL not found.
NOTE: The previous statement has been deleted.
73 run;
WARNING: No variables specified for an SSCP matrix. Execution terminating.
NOTE: PROCEDURE REG used (Total process time):
real time 0.00 seconds
user cpu time 0.01 seconds
system cpu time 0.00 seconds
memory 3410.21k
OS Memory 32960.00k
Timestamp 04/12/2018 10:59:44 AM
Step Count 41 Switch Count 0
Page Faults 0
Page Reclaims 482
Page Swaps 0
Voluntary Context Switches 6
Involuntary Context Switches 0
Block Input Operations 0
Block Output Operations 48
74
75 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
87 Do I need to call the table ceo_firm? Or reference work.want in my model statement instead of ceo_firm? Thanks!
Check your logic - which file includes the VOL variable.
If you ran the code the same as I wrote it then you added it to WANT.
If you prefer use another name, addapt the code.
Thanks for the headsup. This is something ive been wondering about actually: what is work.want? Or just want? Is this like a shorthand sort of naming convention for data sets when one doesn't know the specific name? Just wondering.
Additionally,
Somehow my number of observations has skyrocketed since adding this code. My proc reg had been reading 12,000-some observations and now it is reading 142,234. Is there any explanation for this?
Thanks again!
When I started help within the forum I found others using table names like HAVE for input and WANT for desired output
and accpeted it as very convenient.
The difference in number of observations means that the combination of ticker and year are not uniqe.
SQL join makes a cartesin join thus multiplies output.
If you know what other variables make it unique - add them to the conditions:
....
left join paper.CSRP_Monthly_Stock_char4 as b
on a.ticker = b.ticker and
a.year = b.year and
a.new_key = b.new_key; /* adapt name and add more lines as need */
@Shmuel, is there any chance that the addition of this code into my program would push me over my storage quota? My home directory is full and now I need to delete files. This wasn't an issue until I ran this code. Maybe it's a coincidence? I'm just wondering if there is something about the nature of this code or the amount of observations it pulled into my dataset that could result in this. I believe I am using a sort of university/free version of SAS for academics . . . something along those lines. Thanks!
The last dataset in your original code was paper.ceo_firm.
Have you changed the output dataset name (instead work.want) to a new name or named it again as paper.ceo_firm ?
Usually, I prefer that each step creates a unique name (new one).
When you use the same dataset name again and again and you need to change the code you have to run the code from start,
otherwise you can run only those steps starting the changed code.
You asked -
is there any chance that the addition of this code into my program would push me over my storage quota?
the answer depends on your physical storage, but the real question should be - do you realy need the cartesian multiply of observations? you wanted to add just one variable not add observations ?!
I suggest:
1) check again is there other variable(s) to make the matching unique ?
2) Fix the code to use unique matching variables and run that part that creates paper.ceo_firm dataset.
You can submit selected steps. No must to run all code from start.
I'm afraid i'm having trouble wrapping my head around what you mean by unique . . . this whole program essentially has revolved around the combination of ticker and year. I'm not sure why "VOL" for ticker and year would not be unique. Unless you have another suggestion, is there a way to undo this and recreate the ceo_firm data set as it was? With the earlier amount of observations? At this point i'm just concerned with preserving the data. Unfortunately, overwriting datasets unintentionally has been a theme of mine during this go-round.
In the next code I have copied that part of your code (line 160 on) to recreate paper.ceo_firm,
merged few steps to one, added code to check duplicates and code to add the
required variable to above dataset.
read carefully my notes in between /*...*/.
You can submit step by step to run and check each that ran OK.
Wish you success.
/********************************************************************/ /* START RERUN TO REPRODUCE PAPER.CEO_FIRM DATASET */ /********************************************************************/ /* IF YOU HAVE STORAGE ISSUE DELETE THE LAST CREATED PAPER.CEO_FIRM */ proc datasets lib=paper; delete ceo_firm nolist; quit; /* ============= RECREATE PAPER.CEO_FIRM ========================== */ *Merge CEO data and firm data; DATA paper.ceo_firm_1; length ticker $5; merge paper.CSRP_annual_returns2 (in=in1) paper.compustat_execucomp4 (in=in2); by ticker; if in1 and in2; run; *remove missing return data; *create dummy variable that indicates whether the firm's market value increases in a given year; data paper.ceo_firm_2; set paper.ceo_firm_1 (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS)); BONUS=input(_BONUS, ?? 8.); STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.); if annual_return=. then delete; run; data paper.ceo_firm; set paper.ceo_firm_temp_2; by ticker year; last_return=lag(annual_return); if not first.ticker then increase=(annual_return > last_return); else increase=.; if gender='MALE' then male=1; else male=0; run; /* DELETE TEMPORARY DATASETS */ proc datasets lib=paper; delete ceo_firm_1 ceo_firm_2 nolist; quit; /* CHECK FOR DUPLICATES */ data is_dup; set paper.CSRP_Monthly_Stock_char4; /* dataset origin containing var to add VOL ? */ by ticker year; if not (firt.year and last.year); run; * are there duplicates ? ; * if positive sort the dataset with NODUPKEY option ; * alternativeley add more key variables to to join in next sql; proc sort data=paper.CSRP_Monthly_Stock_char4 (keep ticker year VOL) /* replace VOL to the required var */ out=crsp_nodup NODUPKEY; by ticker year; run; * add the required var to paper.ceo_firm dataset ; *bring variable in from other data set; proc sql; create table want as select a.* , b.VOL from paper.ceo_firm as a left join paper.crsp_nodup /*CSRP_Monthly_Stock_char4*/ as b
on a.ticker = b.ticker
and a.year = b.year
/* and a.key = b.key ?????? */
; quit;
Hey @Shmuel, thanks for the revised program.
Here are some issues I ran into:
1 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
70
71 /********************************************************************/
72 /* START RERUN TO REPRODUCE PAPER.CEO_FIRM DATASET */
73 /********************************************************************/
74
75 /* IF YOU HAVE STORAGE ISSUE DELETE THE LAST CREATED PAPER.CEO_FIRM */
76 proc datasets lib=paper;
76 ! delete ceo_firm nolist; quit;
NOTE: The file PAPER.NOLIST (memtype=DATA) was not found, but appears on a DELETE statement.
NOTE: Deleting PAPER.CEO_FIRM (memtype=DATA).
NOTE: PROCEDURE DATASETS used (Total process time):
real time 0.05 seconds
user cpu time 0.05 seconds
system cpu time 0.00 seconds
memory 2160.40k
OS Memory 28072.00k
Timestamp 04/14/2018 02:44:04 PM
Step Count 23 Switch Count 1
Page Faults 0
Page Reclaims 1090
Page Swaps 0
Voluntary Context Switches 25
Involuntary Context Switches 0
Block Input Operations 32
Block Output Operations 16
77
78 /* ============= RECREATE PAPER.CEO_FIRM ========================== */
79
80 *Merge CEO data and firm data;
81 DATA paper.ceo_firm_1;
82 length ticker $5;
83 merge paper.CSRP_annual_returns2 (in=in1)
84 paper.compustat_execucomp4 (in=in2);
85 by ticker;
86
87 if in1 and in2;
88 run;
NOTE: There were 10586 observations read from the data set PAPER.CSRP_ANNUAL_RETURNS2.
NOTE: There were 13355 observations read from the data set PAPER.COMPUSTAT_EXECUCOMP4.
NOTE: The data set PAPER.CEO_FIRM_1 has 12727 observations and 108 variables.
NOTE: DATA statement used (Total process time):
real time 0.13 seconds
user cpu time 0.02 seconds
system cpu time 0.05 seconds
memory 4219.21k
OS Memory 33968.00k
Timestamp 04/14/2018 02:44:04 PM
Step Count 24 Switch Count 5
Page Faults 0
Page Reclaims 1136
Page Swaps 0
Voluntary Context Switches 1072
Involuntary Context Switches 3
Block Input Operations 55104
Block Output Operations 51976
89
90 *remove missing return data;
91 *create dummy variable that indicates whether the firm's market value increases in a given year;
92
93 data paper.ceo_firm_2;
94 set paper.ceo_firm_1 (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS));
95 BONUS=input(_BONUS, ?? 8.);
96 STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.);
97
98 if annual_return=. then
99 delete;
100 run;
NOTE: There were 12727 observations read from the data set PAPER.CEO_FIRM_1.
NOTE: The data set PAPER.CEO_FIRM_2 has 12727 observations and 110 variables.
NOTE: DATA statement used (Total process time):
real time 0.12 seconds
user cpu time 0.01 seconds
system cpu time 0.03 seconds
memory 3640.34k
OS Memory 33452.00k
Timestamp 04/14/2018 02:44:04 PM
Step Count 25 Switch Count 1
Page Faults 0
Page Reclaims 566
Page Swaps 0
Voluntary Context Switches 1082
Involuntary Context Switches 1
Block Input Operations 52000
Block Output Operations 52744
101
102 data paper.ceo_firm;
103 set paper.ceo_firm_temp_2;
ERROR: File PAPER.CEO_FIRM_TEMP_2.DATA does not exist.
104 by ticker year;
105 last_return=lag(annual_return);
106
107 if not first.ticker then
108 increase=(annual_return > last_return);
109 else
110 increase=.;
111
112 if gender='MALE' then male=1; else male=0;
113 run;
NOTE: The SAS System stopped processing this step because of errors.
WARNING: The data set PAPER.CEO_FIRM may be incomplete. When this step was stopped there were 0 observations and 5 variables.
NOTE: DATA statement used (Total process time):
real time 0.01 seconds
user cpu time 0.00 seconds
system cpu time 0.00 seconds
memory 498.06k
OS Memory 30632.00k
Timestamp 04/14/2018 02:44:04 PM
Step Count 26 Switch Count 1
Page Faults 0
Page Reclaims 101
Page Swaps 0
Voluntary Context Switches 30
Involuntary Context Switches 0
Block Input Operations 0
Block Output Operations 264
114
115 /* DELETE TEMPORARY DATASETS */
116 proc datasets lib=paper;
116 ! delete ceo_firm_1 ceo_firm_2 nolist; quit;
NOTE: The file PAPER.NOLIST (memtype=DATA) was not found, but appears on a DELETE statement.
NOTE: Deleting PAPER.CEO_FIRM_1 (memtype=DATA).
NOTE: Deleting PAPER.CEO_FIRM_2 (memtype=DATA).
NOTE: PROCEDURE DATASETS used (Total process time):
real time 0.05 seconds
user cpu time 0.04 seconds
system cpu time 0.01 seconds
memory 636.75k
OS Memory 30888.00k
Timestamp 04/14/2018 02:44:04 PM
Step Count 27 Switch Count 2
Page Faults 0
Page Reclaims 103
Page Swaps 0
Voluntary Context Switches 50
Involuntary Context Switches 0
Block Input Operations 32
Block Output Operations 24
117
118 /* CHECK FOR DUPLICATES */
119 data is_dup;
120 set paper.CSRP_Monthly_Stock_char4; /* dataset origin containing var to add VOL ? */
121 by ticker year;
122 if not (firt.year and last.year);
_________
557
ERROR: DATA STEP Component Object failure. Aborted during the COMPILATION phase.
ERROR 557-185: Variable firt is not an object.
NOTE: The SAS System stopped processing this step because of errors.
NOTE: DATA statement used (Total process time):
real time 0.00 seconds
user cpu time 0.00 seconds
system cpu time 0.00 seconds
memory 1979.90k
OS Memory 31656.00k
Timestamp 04/14/2018 02:44:04 PM
Step Count 28 Switch Count 0
Page Faults 0
Page Reclaims 257
Page Swaps 0
Voluntary Context Switches 9
Involuntary Context Switches 0
Block Input Operations 288
Block Output Operations 0
123 run;
124
125 * are there duplicates ? ;
126 * if positive sort the dataset with NODUPKEY option ;
127 * alternativeley add more key variables to to join in next sql;
128
129 proc sort data=paper.CSRP_Monthly_Stock_char4 (keep ticker year VOL)
____ ______
12 12
ERROR 12-63: Missing '=' for option KEEP.
130 /* replace VOL to the required var */
131 out=crsp_nodup NODUPKEY;
132 by ticker year;
133 run;
NOTE: The SAS System stopped processing this step because of errors.
WARNING: The data set WORK.CRSP_NODUP may be incomplete. When this step was stopped there were 0 observations and 0 variables.
NOTE: PROCEDURE SORT used (Total process time):
real time 0.00 seconds
user cpu time 0.00 seconds
system cpu time 0.00 seconds
memory 1658.37k
OS Memory 31656.00k
Timestamp 04/14/2018 02:44:04 PM
Step Count 29 Switch Count 1
Page Faults 0
Page Reclaims 265
Page Swaps 0
Voluntary Context Switches 8
Involuntary Context Switches 0
Block Input Operations 0
Block Output Operations 160
134
135 * add the required var to paper.ceo_firm dataset ;
136 *bring variable in from other data set;
137 proc sql;
138 create table want as
139 select a.* , b.VOL
140 from paper.ceo_firm as a
141 left join paper.crsp_nodup /*CSRP_Monthly_Stock_char4*/ as b on a.ticker = b.ticker and a.year = b.year
141 ! /* and a.key = b.key ?????? */ ; quit;
ERROR: File PAPER.CRSP_NODUP.DATA does not exist.
NOTE: PROC SQL set option NOEXEC and will continue to check the syntax of statements.
NOTE: The SAS System stopped processing this step because of errors.
NOTE: PROCEDURE SQL used (Total process time):
real time 0.00 seconds
user cpu time 0.00 seconds
system cpu time 0.01 seconds
memory 323.43k
OS Memory 30888.00k
Timestamp 04/14/2018 02:44:04 PM
Step Count 30 Switch Count 0
Page Faults 0
Page Reclaims 84
Page Swaps 0
Voluntary Context Switches 12
Involuntary Context Switches 0
Block Input Operations 288
Block Output Operations 0
142
143
144 OPTIONS NONOTES NOSTIMER NOSOURCE NOSYNTAXCHECK;
156 Here is the code again:
libname paper "~/425/425_Final_Paper";
*import compustat dataset;
PROC IMPORT out=paper.CompuStat_Execucomp
datafile="~/425/425_Final_Paper/CompuStat_Execucomp.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=2000;
run;
*keep only CEOs;
data paper.Compustat_ExecuComp2;
set paper.Compustat_ExecuComp;
if CEOANN='CEO';
run;
*** extra DATA step for checking previous data step results by comparing number of records selected;
data check_CEOANN;
set paper.Compustat_ExecuComp;
if CEOANN=: 'CEO';
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*import csrp dataset;
PROC IMPORT out=paper.CSRP_Monthly_Stock_char
datafile="~/425/425_Final_Paper/CSRP_MonthlyStock_char.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=max;
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
*remove bad data;
data paper.CSRP_Monthly_Stock_char2;
set paper.CSRP_Monthly_Stock_char (rename=(ret=character_ret));
drop character_ret;
if cusip=' ' then
delete;
ret=input(character_ret, ??8.);
if ret=. then
delete;
date=input(put(date, z8.), yymmdd8.);
format date yymmdd10.;
year=year(date);
month=month(date);
if cusip=: '?' then
cusip=substr(cusip, 2);
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
proc contents data=paper.CSRP_Monthly_Stock_char2;
run;
proc contents data=paper.multiple_CEOs;
run;
proc sort data=paper.CSRP_Monthly_Stock_char2;
by ticker year;
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*Remove all bad years from both data sources;
data paper.Compustat_ExecuComp3;
length ticker $5;
merge paper.Compustat_ExecuComp2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
data paper.CSRP_Monthly_Stock_char3;
merge paper.CSRP_Monthly_Stock_char2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
*find additional bad data: multiple return records for the same month/year;
proc freq data=paper.CSRP_Monthly_Stock_char3;
tables ticker * year * month / noprint
out=paper.multiple_returns (where=(count > 1));
run;
*Remove all matching year data for multiple returns;
proc sort data=paper.multiple_returns out=multiple_returns (keep=ticker year)
NODUPKEY;
by ticker year;
run;
data paper.Compustat_ExecuComp4;
merge paper.Compustat_ExecuComp3 (in=keepme)
multiple_returns (in=had_multiple_returns);
by ticker year;
if keepme;
if had_multiple_returns then
delete;
run;
data paper.CSRP_Monthly_Stock_char4;
length ticker $5;
merge paper.CSRP_Monthly_Stock_char3
multiple_returns (in=had_multiple_returns);
by ticker year;
if had_multiple_returns then
delete;
run;
proc contents data=paper.CSRP_Monthly_Stock_char3;
run;
proc contents data=paper.CSRP_Monthly_Stock_char4;
run;
*create new variable annualized growth;
data paper.CSRP_annual_returns;
set paper.CSRP_Monthly_Stock_char4;
by ticker year;
retain annual_return 1;
annual_return=annual_return * (1 + RET);
if month=12 or last.ticker;
annual_return=(annual_return - 1) * 100;
output;
annual_return=1;
keep ticker year annual_return;
run;
*use proc contents to see if there is a type mismatch;
proc contents data=paper.CSRP_annual_returns;
run;
proc contents data=paper.Compustat_ExecuComp4;
run;
*MERGE statement has more than one data set with repeats of BY values;
proc sort data=paper.CSRP_annual_returns out=paper.CSRP_annual_returns2
nodupkey;
by ticker;
run;
/********************************************************************/
/* START RERUN TO REPRODUCE PAPER.CEO_FIRM DATASET */
/********************************************************************/
/* IF YOU HAVE STORAGE ISSUE DELETE THE LAST CREATED PAPER.CEO_FIRM */
proc datasets lib=paper; delete ceo_firm nolist; quit;
/* ============= RECREATE PAPER.CEO_FIRM ========================== */
*Merge CEO data and firm data;
DATA paper.ceo_firm_1;
length ticker $5;
merge paper.CSRP_annual_returns2 (in=in1)
paper.compustat_execucomp4 (in=in2);
by ticker;
if in1 and in2;
run;
*remove missing return data;
*create dummy variable that indicates whether the firm's market value increases in a given year;
data paper.ceo_firm_2;
set paper.ceo_firm_1 (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS));
BONUS=input(_BONUS, ?? 8.);
STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.);
if annual_return=. then
delete;
run;
data paper.ceo_firm;
set paper.ceo_firm_temp_2;
by ticker year;
last_return=lag(annual_return);
if not first.ticker then
increase=(annual_return > last_return);
else
increase=.;
if gender='MALE' then male=1; else male=0;
run;
/* DELETE TEMPORARY DATASETS */
proc datasets lib=paper; delete ceo_firm_1 ceo_firm_2 nolist; quit;
/* CHECK FOR DUPLICATES */
data is_dup;
set paper.CSRP_Monthly_Stock_char4; /* dataset origin containing var to add VOL ? */
by ticker year;
if not (firt.year and last.year);
run;
* are there duplicates ? ;
* if positive sort the dataset with NODUPKEY option ;
* alternativeley add more key variables to to join in next sql;
proc sort data=paper.CSRP_Monthly_Stock_char4 (keep ticker year VOL)
/* replace VOL to the required var */
out=crsp_nodup NODUPKEY;
by ticker year;
run;
* add the required var to paper.ceo_firm dataset ;
*bring variable in from other data set;
proc sql;
create table want as
select a.* , b.VOL
from paper.ceo_firm as a
left join paper.crsp_nodup /*CSRP_Monthly_Stock_char4*/ as b on a.ticker = b.ticker and a.year = b.year /* and a.key = b.key ?????? */ ; quit;I really appreciate the help!
It's finally time to hack! Remember to visit the SAS Hacker's Hub regularly for news and updates.
What’s the difference between SAS Enterprise Guide and SAS Studio? How are they similar? Just ask SAS’ Danny Modlin.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.