Fair enough. Could you please inform me on how to recreate the data set which I have destroyed? I would appreciate that. Here's my program: libname paper "~/425/425_Final_Paper";
*import compustat dataset;
PROC IMPORT out=paper.CompuStat_Execucomp
datafile="~/425/425_Final_Paper/CompuStat_Execucomp.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=2000;
run;
*keep only CEOs;
data paper.Compustat_ExecuComp2;
set paper.Compustat_ExecuComp;
if CEOANN='CEO';
run;
*** extra DATA step for checking previous data step results by comparing number of records selected;
data check_CEOANN;
set paper.Compustat_ExecuComp;
if CEOANN=: 'CEO';
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*import csrp dataset;
PROC IMPORT out=paper.CSRP_Monthly_Stock_char
datafile="~/425/425_Final_Paper/CSRP_MonthlyStock_char.csv" DBMS=CSV replace;
getnames=yes;
guessingrows=max;
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
*remove bad data;
data paper.CSRP_Monthly_Stock_char2;
set paper.CSRP_Monthly_Stock_char (rename=(ret=character_ret));
drop character_ret;
if cusip=' ' then
delete;
ret=input(character_ret, ??8.);
if ret=. then
delete;
date=input(put(date, z8.), yymmdd8.);
format date yymmdd10.;
year=year(date);
month=month(date);
if cusip=: '?' then
cusip=substr(cusip, 2);
run;
proc contents data=paper.CSRP_Monthly_Stock_char;
run;
proc contents data=paper.CSRP_Monthly_Stock_char2;
run;
proc contents data=paper.multiple_CEOs;
run;
proc sort data=paper.CSRP_Monthly_Stock_char2;
by ticker year;
run;
proc sort data=paper.Compustat_ExecuComp2;
by ticker year;
run;
*Remove all bad years from both data sources;
data paper.Compustat_ExecuComp3;
length ticker $5;
merge paper.Compustat_ExecuComp2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
data paper.CSRP_Monthly_Stock_char3;
merge paper.CSRP_Monthly_Stock_char2 paper.multiple_CEOs (keep=ticker year
in=had_multiple_CEOs);
by ticker year;
if had_multiple_CEOs then
delete;
run;
*find additional bad data: multiple return records for the same month/year;
proc freq data=paper.CSRP_Monthly_Stock_char3;
tables ticker * year * month / noprint
out=paper.multiple_returns (where=(count > 1));
run;
*Remove all matching year data for multiple returns;
proc sort data=paper.multiple_returns out=multiple_returns (keep=ticker year)
NODUPKEY;
by ticker year;
run;
data paper.Compustat_ExecuComp4;
merge paper.Compustat_ExecuComp3 (in=keepme)
multiple_returns (in=had_multiple_returns);
by ticker year;
if keepme;
if had_multiple_returns then
delete;
run;
data paper.CSRP_Monthly_Stock_char4;
length ticker $5;
merge paper.CSRP_Monthly_Stock_char3
multiple_returns (in=had_multiple_returns);
by ticker year;
if had_multiple_returns then
delete;
run;
proc contents data=paper.CSRP_Monthly_Stock_char3;
run;
proc contents data=paper.CSRP_Monthly_Stock_char4;
run;
*create new variable annualized growth;
data paper.CSRP_annual_returns;
set paper.CSRP_Monthly_Stock_char4;
by ticker year;
retain annual_return 1;
annual_return=annual_return * (1 + RET);
if month=12 or last.ticker;
annual_return=(annual_return - 1) * 100;
output;
annual_return=1;
keep ticker year annual_return;
run;
*use proc contents to see if there is a type mismatch;
proc contents data=paper.CSRP_annual_returns;
run;
proc contents data=paper.Compustat_ExecuComp4;
run;
*MERGE statement has more than one data set with repeats of BY values;
proc sort data=paper.CSRP_annual_returns out=paper.CSRP_annual_returns2
nodupkey;
by ticker;
run;
*Merge CEO data and firm data;
DATA paper.ceo_firm;
length ticker $5;
merge paper.CSRP_annual_returns2 (in=in1) paper.compustat_execucomp4 (in=in2);
by ticker;
if in1 and in2;
run;
proc contents data=paper.ceo_firm;
run;
proc contents data=paper.ceo_firm2;
run;
*remove missing return data;
data paper.ceo_firm;
set paper.ceo_firm;
if annual_return=. then
delete;
run;
proc means data=paper.CSRP_Monthly_Stock_char3 n nmiss;
var ret;
run;
proc means data=paper.Compustat_ExecuComp4 n nmiss;
var salary;
run;
proc means data=paper.CSRP_annual_returns n nmiss;
var annual_return;
run;
data paper.ceo_firm;
set paper.ceo_firm (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS));
BONUS=input(_BONUS, ?? 8.);
STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.);
run;
data paper.ceo_firm2;
set paper.ceo_firm2 (rename=(othcomp=_othcomp));
othcomp=input(_othcomp, ?? 8.);
run;
*create dummy variable that indicates whether the firm's market value increases in a given year;
data paper.ceo_firm;
set paper.ceo_firm;
by ticker year;
last_return=lag(annual_return);
if not first.ticker then
increase=(annual_return > last_return);
else
increase=.;
run;
*create dummy variable that indicates whether the CEO is male or female;
data paper.ceo_firm;
set paper.ceo_firm;
if gender='MALE' then male=1; else male=0;
run;
*bring variable in from other data set;
proc sql;
create table paper.ceo_firm2 as
select a.* , b.VOL
from paper.ceo_firm as a
left join
(select ticker
,year
,sum(VOL) as VOL
from paper.CSRP_Monthly_Stock_char4
group by ticker
,year
) as b
on a.ticker = b.ticker and
a.year = b.year;
quit;
/* CHECK FOR DUPLICATES */
data is_dup;
set paper.CSRP_Monthly_Stock_char4; /* dataset origin containing var to add VOL ? */
by ticker year;
if not (first.year and last.year);
run;
*find duplicates: ticker/year combo;
proc freq data=paper.CSRP_Monthly_Stock_char4;
tables ticker * year / noprint
out=paper.multiple_tyCSRP (where=(count > 1));
run;
proc freq data=paper.ceo_firm;
tables ticker * year / noprint
out=paper.multiple_tyfirm (where=(count > 1));
run;
*create variable log_salary;
data paper.ceo_firm;
set paper.ceo_firm;
if salary=0 then log_salary=0;
else log_salary=log(salary);
run;
*create variable log_annual_return;
data paper.ceo_firm;
set paper.ceo_firm;
if annual_return=0 then log_annual_return=0;
else log_annual_return=log(annual_return);
run;
*create variable log_bonus;
data paper.ceo_firm;
set paper.ceo_firm;
if bonus=0 then log_bonus=0;
else log_bonus=log(bonus);
run;
*create variable log_TOTAL_SEC;
data paper.ceo_firm;
set paper.ceo_firm;
if TOTAL_SEC=0 then log_TOTAL_SEC=0;
else log_TOTAL_SEC=log(TOTAL_SEC);
run;
*create interaction term;
data paper.ceo_firm2;
set paper.ceo_firm2;
annual_return_VOL=(annual_return*VOL);
run;
proc print data=paper.ceo_firm2 (obs=100);
run;
proc print data=paper.compustat_execucomp4 (obs=50);
run;
*create categorical variable for industry;
data paper.ceo_firm3;
set paper.ceo_firm2;
if NAICS=11 then industry=outdoors;
if NAICS=21 then industry=mining;
if NAICS=22 then industry=utilities;
if NAICS=23 then industry=construction;
if NAICS=31 then industry=manufacturing;
if NAICS=32 then industry=manufacturing;
if NAICS=33 then industry=manufacturing;
if NAICS=42 then industry=wholesale;
if NAICS=44 then industry=retail;
if NAICS=45 then industry=retail;
if NAICS=48 then industry=transportation;
if NAICS=49 then industry=transportation;
if NAICS=51 then industry=information;
if NAICS=52 then industry=finance;
if NAICS=53 then industry=real_estate;
if NAICS=54 then industry=professional;
if NAICS=55 then industry=management;
if NAICS=56 then industry=waste;
if NAICS=61 then industry=education;
if NAICS=62 then industry=healthcare;
if NAICS=71 then industry=arts;
if NAICS=72 then industry=food;
if NAICS=81 then industry=other;
if NAICS=92 then industry=public_admin;
run;
***DESCRIPTIVE_STATS***
*find max;
proc sql;
select * from paper.ceo_firm2
having vol = max(vol);
quit;
*find max;
proc sql;
select * from paper.ceo_firm2
having salary = max(salary);
quit;
*find max;
proc sql;
select * from paper.ceo_firm2
having bonus = max(bonus);
quit;
*PROC UNIVARIATE with and ID statement should pretty well for a quick list.;
proc univariate data=paper.ceo_firm2;
id coname ;
var VOL ;
run;
*Histogram;
title "Figure 1. CEO Salary";
title2 "2000-2017";
PROC UNIVARIATE data=paper.ceo_firm;
var salary;
histogram salary;
label salary="Salary (thousands)";
run;
*Histogram;
title "Figure 1. log(salary)";
title2 "2000-2017";
PROC UNIVARIATE data=paper.ceo_firm;
var log_salary;
histogram log_salary;
label log_salary="log(salary) (thousands)";
run;
*Histogram;
title "Figure 2. Annual Return";
title2 "2000-2017";
PROC UNIVARIATE data=paper.ceo_firm;
var annual_return;
histogram annual_return/kernel overlay;
label annual_return="annual return (%)";
run;
data paper.ceo_firm;
set paper.ceo_firm (rename=(BONUS=_BONUS STOCK_AWARDS=_STOCK_AWARDS));
BONUS=input(_BONUS, ?? 8.);
STOCK_AWARDS=input(_STOCK_AWARDS, ?? 8.);
run;
data paper.ceo_firm2;
set paper.ceo_firm2 (rename=(option_awards=_option_awards));
option_awards=input(_option_awards, ?? 8.);
run;
data paper.ceo_firm2;
set paper.ceo_firm2 (rename=(NAICS=_NAICS));
NAICS=input(_NAICS, ?? 8.);
run;
title "Table 1.";
title2 "Summary Statistics";
title3 "CEO Compensation";
PROC MEANS data=paper.ceo_firm n mean median std min max skew maxdec=2;
var salary bonus stock_awards;
run;
title "Table 1.";
title2 "Summary Statistics";
title3 "Total Compensation";
PROC MEANS data=paper.ceo_firm n mean median std min max skew maxdec=2;
var TOTAL_SEC;
run;
*Histogram;
title "Figure 2. Annual Return";
title2 "2000-2017";
PROC UNIVARIATE data=paper.ceo_firm;
var TOTAL_SEC;
histogram TOTAL_SEC/kernel overlay;
label TOTAL_SEC="Total Compensation";
run;
*Histogram;
title "Figure 2. Annual Return";
title2 "2000-2017";
PROC UNIVARIATE data=paper.ceo_firm;
var log_TOTAL_SEC;
histogram log_TOTAL_SEC/kernel overlay;
label log_TOTAL_SEC="Total Compensation";
run;
title "Table 2.";
title2 "Summary Statistics";
title3 "Firm Performance";
PROC MEANS data=paper.ceo_firm n mean median std min max skew maxdec=2;
var annual_return;
run;
*PROC SGPLOT;
title "Figure 3. CEO Salary and Annual Return";
PROC SGPLOT DATA=PAPER.CEO_FIRM;
SCATTER X=SALARY Y=ANNUAL_RETURN;
LABEL SALARY="CEO SALARY";
LABEL ANNUAL_RETURN="FIRM PERFORMANCE";
RUN;
title "Figure 4. CEO Salary and Annual Return";
title2 "Correlation Coefficient";
PROC CORR DATA=paper.ceo_firm PEARSON;
VAR annual_return;
WITH salary;
RUN;
*series plot salary;
proc sort data=paper.Compustat_ExecuComp4 out=sorted;
by year;
run;
proc means data=sorted;
by year;
var salary;
output out=avg mean=avg_salary;
run;
proc sgplot data=avg;
series x=year y=avg_salary;
title "Average CEO salary (2010-2017)";
run;
*Bubble plot;
title '';
proc sgplot data=paper.ceo_firm2;
bubble x=annual_return y=salary size=VOL /
transparency=0.4 datalabelattrs=(size=9 weight=bold);
inset "Bubble size represents average volume" / position=bottomright textattrs=(size=11);
yaxis grid;
xaxis grid;
run;
*Bubble plot;
title '';
proc sgplot data=paper.ceo_firm2;
bubble x=annual_return y=salary size=bonus /
transparency=0.4 datalabelattrs=(size=9 weight=bold);
inset "Bubble size represents average volume" / position=bottomright textattrs=(size=11);
yaxis grid;
xaxis grid;
run;
***OLS REGRESSION***;
title "";
ods graphics on;
proc glm data=paper.ceo_firm plots(maxpoints=none);
model salary=annual_return annual_return*increase;
run;
ods graphics off;
ods graphics on;
proc reg data=paper.ceo_firm plots(maxpoints=none);
model salary=annual_return bonus stock_awards;
run;
ods graphics off;
ods graphics on;
proc reg data=paper.ceo_firm plots(maxpoints=none);
model salary=annual_return bonus stock_awards opt_exer_num OPT_EXER_VAL OPTION_AWARDS_NUM;
run;
ods graphics off;
proc reg data=paper.ceo_firm plots(maxpoints=none);
model log_salary=annual_return bonus stock_awards;
run;
proc reg data=paper.ceo_firm plots(maxpoints=none);
model TOTAL_SEC=annual_return age male;
run;
proc reg data=paper.ceo_firm plots(maxpoints=none);
model salary=annual_return bonus stock_awards VOL;
run;
proc reg data=paper.ceo_firm plots(maxpoints=none);
model log_TOTAL_SEC=annual_return age male;
run;
proc reg data=paper.ceo_firm plots(maxpoints=none);
model salary=annual_return bonus stock_awards age male;
run;
proc reg data=paper.ceo_firm2 plots(maxpoints=none);
model salary=annual_return bonus stock_awards age male VOL;
run;
proc reg data=paper.ceo_firm2 plots(maxpoints=none);
model salary=annual_return bonus stock_awards age male VOL;
run;
proc reg data=paper.ceo_firm2 plots(maxpoints=none);
model salary=annual_return bonus stock_awards option_awards othcomp age male VOL _NAICS / acov clb;
run;
proc reg data=paper.ceo_firm2 plots(maxpoints=none);
model TOTAL_SEC=annual_return age male VOL ZIP _NAICS / acov clb;
run;
proc reg data=paper.ceo_firm2 plots(maxpoints=none);
model salary=annual_return bonus stock_awards option_awards othcomp age male VOL _NAICS annual_return_VOL / acov clb;
run;
proc reg data=paper.ceo_firm2 plots(maxpoints=none);
model TOTAL_SEC=annual_return age male VOL ZIP _NAICS annual_return_VOL / acov clb;
run;
*proc glm data=paper.ceo_firm2 plots(maxpoints=none);
*model salary=annual_return bonus stock_awards age male VOL annual_return*VOL;
*run;
*proc glm data=paper.ceo_firm2 plots(maxpoints=none);
*model TOTAL_SEC=annual_return age male VOL annual_return*VOL;
*run;
*** end regression section***;
proc format;
value sal
.='Missing' low-4999='<5000' 5000-39999='< 40000' 40000-high='40000 or more';
value return
.='missing' low-.299='< .3'
.3-.499='<.5'
.5-high='.5 or more';
run;
proc freq data=paper.ceo_firm;
tables year*(annual_return salary)/ missing;
format salary sal.;
format annual_return return.;
run;
... View more