Thanks. This is very insightful. This is part of my code (the dates are there just for test. The actual dates will be year by year for the reason I mentioned). Basically each year I will get data from different tables (the tables are in the cloud of data provider) some of them are big (like 10 GB each year), then clean,merge,sort, aggregate them. At the end for each year one table will be the final output. If I do it year by year I should kind of put the final output tables on top of each other. Each final output table has small size (~50MB). This is a sample of first lines of the code: proc sql;
create table hld as select eff_dt,coupon,ticker,nbr_shares,report_dt,
percent_tna,market_val,cusip,crsp_portno,crsp_company_key,permno,permco from crsp.holdings
where eff_dt>='01DEC2002'd and eff_dt<'31DEC2020'd ;
*quit;
RUN;
data hld ;
set hld;
if missing(cusip) then delete;
y=year(eff_dt);
q=qtr(eff_dt);
m=month(eff_dt);
run;
*crspd is the daily prices CRSP dataset;
proc sql;
create table crspd2 as
select cusip,shrout,ret,date,permno,permco,prc From crsp.dsf
where date>='01DEC2002'd and date<'31DEC2020'd ;
*quit;
RUN;
proc sql;
create table crsp_indx as
select vwretd,date,spindx From crsp.dsi
where date>='01DEC2002'd and date<'31DEC2020'd ;
*quit;
RUN;
proc sql;
create table crspd as select *
From crspd2 inner join crsp_indx on
(crspd2.date = crsp_indx.date) ;
*quit;
RUN;
proc datasets ;
delete crspd2 crsp_indx;
run;
data crspd ;
set crspd;
if missing(cusip) then delete;
y=year(date);
q=qtr(date);
m=month(date);
relP=PRC/spindx;
run;
proc sort data= crspd out= crspd ;
by cusip date ;
run;
data crspd;
set crspd;
by cusip;
lagW_relP=lag7(relP);
RETw=(relP-lagW_relP)/lagW_relP;
run;
Proc sql;
create table crspd as
Select *, std(RETw) as std1, mean(RETw) as mean1
From crspd
GROUP BY cusip,y
ORDER BY cusip,y;
*quit;
RUN;
and proc sql;
create table cmpstat as
select datadate,fyearq,fqtr,fyr,cusip,gvkey From compd.fundq
where datadate>='01DEC2010'd and datadate<'31DEC2020'd ;
*quit;
RUN;
data cmpstat;
set cmpstat;
by gvkey ;
if first.gvkey then output;
run;
proc sort data= cmpstat ;
by cusip fyearq fqtr;
run;
data cmpstat_q;
set cmpstat;
by cusip fyearq fqtr;
if first.fqtr then output;
run;
proc sql;
create table sicx as
select sic,gvkey From compa.company;
*quit;
RUN;
proc sql;
create table cmp_sic_q as select *
From cmpstat_q inner join sicx on
( cmpstat_q.gvkey = sicx.gvkey) ;
*quit;
RUN;
proc sql;
create table cmp_sic as select *
From cmpstat inner join sicx on
( cmpstat.gvkey = sicx.gvkey) ;
*quit;
RUN;
*map contains fund names and so on.The holding databse contains
only portfo numbers and not names. Most importantly the map
let's us aggregate everything on fund level;
proc sql;
create table map as
select crsp_fundno,crsp_portno,fund_name,mgmt_name,mgmt_cd,ncusip From crsp.portnomap;
*quit;
RUN;
... View more