aggregating data and regression with dummy variables (challenging)

Accepted Solution Solved
Reply
Frequent Contributor
Posts: 141
Accepted Solution

aggregating data and regression with dummy variables (challenging)

hello everybody

I have a little problem and appreciate if anyone could help me.

I have intraday data (Trading data of each second):

Sample of my data:

 

TRD_EVENT_DTTRD_EVENT_TMTRD_EVENT_MSTRD_STCK_IDTRD_STCK_CDTRD_INSTR_NAMETRD_STCK_GRP_ID_CDTRD_PRTRD_TUROVRTRD_TRA_NR
3/24/200812:19:35 IRB5IKCO8751IKCQ1187191011N21100122197901
3/24/200812:28:01 IRO1ALBZ0001ALBZ1والبر1N1153710000122198037
3/24/200813:13:44 IRO1ALBZ0001ALBZ1والبر1N11567109999198173
3/24/200812:20:38 IRO1AZAB0001AZAB1فاذر1N268310000122197911
3/24/200813:13:44 IRO1AZAB0001AZAB1فاذر1N2695109999198167
3/24/200809:14:57 IRO1BALI0001BALI1وبوعلي1N28509260122197242
3/24/200809:15:06 IRO1BALI0001BALI1وبوعلي1N28502000122197245
3/24/200809:15:14 IRO1BALI0001BALI1وبوعلي1N285010000122197247
3/24/200809:15:24 IRO1BALI0001BALI1وبوعلي1N28506000122197249
3/24/200809:29:27 IRO1BALI0001BALI1وبوعلي1N285010000122197296
3/24/200812:28:00 IRO1BALI0001BALI1وبوعلي1N285010000122198036
3/24/200812:28:07 IRO1BALI0001BALI1وبوعلي1N285010000122198041
3/24/200813:13:44 IRO1BALI0001BALI1وبوعلي1N2865109999198133

Explanation of headers of some columns:

[TRD_EVENT_DT] Month / Day / Year
,[TRD_EVENT_TM] Time: hh:mm:ss
,[TRD_EVENT_MS] Millisecond
,[TRD_STCK_ID] 
,[TRD_STCK_CD] 
,[TRD_INSTR_NAME] 
,[TRD_STCK_GRP_ID_CD] 
,[TRD_PR] Stock Price
,[TRD_TUROVR] Turnover
,[TRD_TRA_NR] 

 

 

First:

I want to categorize this data in half an hour periods:

First half an hour: 9:00 << Volume_1 < 9:30

Second half an hour: 9:30 << Volume_2 < 10:00

Third half an hour: 10:00 << Volume_3 < 10:30

Fourth half an hour: 10:30 << Volume_4 < 11

Fifth half an hour: 11:00 << Volume_5 < 11:30

Sixth half an hour: 11:30 << Volume_6 << 12

 

Second:

I want to normalize my data by below equation:

 

Volume = Stock Price * Turnover

 

NormalizedVolume.PNG 

Third:

I want to analyze this data by the regression specified below:

Normalized Volume of half an hour = intercept + dummy var_1 [First half an hou] + dummy var_2 [Socend half an hour] + dummy var_3 [Third half an hour] + dummy var_4 [Fourth half an hour] + dummy var_5 [Fifth half an hour] + dummy var_6 [Sixth half an hour] + Residual

 

My question is how can I do each step using SAS.

 

Thanks for your attention.


Accepted Solutions
Solution
‎05-28-2017 05:01 PM
Frequent Contributor
Posts: 141

Re: aggregating data and regression with dummy variables (challenging)

[ Edited ]
Posted in reply to aminkarimid
*Importing Data;
/*
PROC IMPORT DATAFILE= 'D:\SampleData\Table87.xls'
OUT= SampleData87
DBMS= XLS;
RUN;
*/
proc contents data= Sampledata87;
run;
***********************************
*STEP 1: ROUNDING TIME;
***********************************
;
data Sampledata87_RT;
set Sampledata87;
TRD_EVENT_TIME = INPUT(TRD_EVENT_TM,time16.);
TRD_EVENT_ROUNDED = ROUND(TRD_EVENT_TIME,'00:30't);
TRD_EVENT_ROUFOR = PUT(TRD_EVENT_ROUNDED,hhmm.);

***********************************
*STEP 2: CALCULATION Of VOLUME;
***********************************
;
CountedVOLUME = TRD_PR*TRD_TUROVR;

***********************************
*STEP 3: NORMALIZED VOLUME;
***********************************
;

*Denominator
/*Sort by TRD_STCK_CD and temporal variables.*/;
proc sort data=Sampledata87_RT out=Sampledata87_SumVol;
    by  TRD_EVENT_DT;
run;

/*Sum VOLUME until the last of each TRD_STCK_CD is reached.*/
data Sampledata87_SumVolSo;
    set Sampledata87_SumVol;
    by  TRD_EVENT_DT
		TRD_STCK_CD notsorted;
	format TRD_STCK_CD  $5.;
	informat TRD_STCK_CD  $5.;
    retain tmp_volume_sum;
    tmp_volume_sum + CountedVOLUME;
    if last.TRD_STCK_CD then do;
        DailyVolume = tmp_volume_sum;
        call missing(tmp_volume_sum);
    end;
    drop tmp_:;
run;

*The numerator
/*Sum VOLUME until the last of each TRD_STCK_CD is reached.*/;
data Sampledata87_SumVolSo;
    set Sampledata87_SumVolSo;
    by  TRD_EVENT_DT
		TRD_STCK_CD
		TRD_EVENT_ROUFOR notsorted;
    retain tmp_intradayvolume_sum;
    tmp_intradayvolume_sum + CountedVOLUME;
    if last.TRD_EVENT_ROUFOR then do;
        IntradayVolume = tmp_intradayvolume_sum;
        call missing(tmp_intradayvolume_sum);
    end;
    drop tmp_:;
run;

* Another way for calculating daily volume based on data set;
/*
proc sql noprint;
	create table sums as
	select TRD_STCK_CD, TRD_EVENT_DT, sum(CountedVOLUME) as volume_sum
	from Sampledata87_SumVolSo
	group by TRD_STCK_CD, TRD_EVENT_DT;

	create index TRD_STCK_CD on sums;
quit;

data Sampledata87_SumVolSo02;
	set Sampledata87_SumVolSo;
	by  TRD_EVENT_DT
		TRD_STCK_CD notsorted;
	volume_sum = .;
	if last.TRD_STCK_CD then
    set sums key=TRD_STCK_CD;
run;
*/;

*Calculating Volume by Data set;

*Division for calculating of adjusted volume in approach 1;
proc sort data=sampledata87_sumvolso out=sampledata87_sumvolso;
by TRD_STCK_CD TRD_EVENT_DT;
run;
 
data sampledata87_adjvol;
     do until(last.TRD_STCK_CD);
           do until(last.TRD_EVENT_DT);
                set sampledata87_sumvolso;
                by TRD_STCK_CD TRD_EVENT_DT;
 
                if first.TRD_STCK_CD then
                     n=0;
 
                if first.TRD_EVENT_DT then
                     n+1;
 
                if n>1 then
                     do;
                           if not missing(IntradayVolume) then
                                adjusted_volume=divide(IntradayVolume,temp);
                                else call missing(adjusted_volume);
                     end;
 
                if last.TRD_EVENT_DT then
                     temp=dailyvolume;
                output;
           end;
     end;
     drop temp n;
run;
 
proc sort data = sampledata87_adjvol;
by TRD_EVENT_DT TRD_STCK_CD;
run;


***********************************
STEP 4: RUNNING REGRESSION
***********************************
;

*Generating Dummy Variable Automatically;
* Regressing dummy variables on normalized volume variable using calculated volume;
proc genmod data=Sampledata87_adjvol;
   class TRD_EVENT_ROUFOR / param=effect;
   model adjusted_volume = TRD_EVENT_ROUFOR / noscale;
   ods select ParameterEstimates;
run;

/* Same analysis by using the CLASS statement */
proc glm data=Sampledata87_adjvol;
   class TRD_EVENT_ROUFOR;              /* Generates dummy variables internally */
   model adjusted_volume = TRD_EVENT_ROUFOR / solution;
   ods select ParameterEstimates;
quit;




View solution in original post


All Replies
Super User
Posts: 19,787

Re: aggregating data and regression with dummy variables (challenging)

[ Edited ]
Posted in reply to aminkarimid

Re: #1

Is it a SAS time? If so, because SAS stores time as numbers so you can use functions to round it, ie ROUND() or FLOOR(), or MOD().It would be slightly easier if you could just round but that would be 9:15 to 10:14 intervals rather than 9:00 to 9:30. 

 

EDIT:

Re #3

Once your time is rounded, use PROC TRANSPOSE to flip your data and then use PROC REG. You can also look at the procedures under SAS/ETS if you're doing time series analysis. Especially PROC TIMESERIES, may help with #1 as well.

 

 

Frequent Contributor
Posts: 141

Re: aggregating data and regression with dummy variables (challenging)

[ Edited ]

Hello,

Thanks for your attention;

A sample of my data has been attached to previous massege.

Format of my data is mdb (Access). Please instruct me how to use Round statement for rounding time.

Super User
Posts: 19,787

Re: aggregating data and regression with dummy variables (challenging)

Posted in reply to aminkarimid
round(time, 30) as time_rounded

You can do that within a SQL query to round the time. 

Solution
‎05-28-2017 05:01 PM
Frequent Contributor
Posts: 141

Re: aggregating data and regression with dummy variables (challenging)

[ Edited ]
Posted in reply to aminkarimid
*Importing Data;
/*
PROC IMPORT DATAFILE= 'D:\SampleData\Table87.xls'
OUT= SampleData87
DBMS= XLS;
RUN;
*/
proc contents data= Sampledata87;
run;
***********************************
*STEP 1: ROUNDING TIME;
***********************************
;
data Sampledata87_RT;
set Sampledata87;
TRD_EVENT_TIME = INPUT(TRD_EVENT_TM,time16.);
TRD_EVENT_ROUNDED = ROUND(TRD_EVENT_TIME,'00:30't);
TRD_EVENT_ROUFOR = PUT(TRD_EVENT_ROUNDED,hhmm.);

***********************************
*STEP 2: CALCULATION Of VOLUME;
***********************************
;
CountedVOLUME = TRD_PR*TRD_TUROVR;

***********************************
*STEP 3: NORMALIZED VOLUME;
***********************************
;

*Denominator
/*Sort by TRD_STCK_CD and temporal variables.*/;
proc sort data=Sampledata87_RT out=Sampledata87_SumVol;
    by  TRD_EVENT_DT;
run;

/*Sum VOLUME until the last of each TRD_STCK_CD is reached.*/
data Sampledata87_SumVolSo;
    set Sampledata87_SumVol;
    by  TRD_EVENT_DT
		TRD_STCK_CD notsorted;
	format TRD_STCK_CD  $5.;
	informat TRD_STCK_CD  $5.;
    retain tmp_volume_sum;
    tmp_volume_sum + CountedVOLUME;
    if last.TRD_STCK_CD then do;
        DailyVolume = tmp_volume_sum;
        call missing(tmp_volume_sum);
    end;
    drop tmp_:;
run;

*The numerator
/*Sum VOLUME until the last of each TRD_STCK_CD is reached.*/;
data Sampledata87_SumVolSo;
    set Sampledata87_SumVolSo;
    by  TRD_EVENT_DT
		TRD_STCK_CD
		TRD_EVENT_ROUFOR notsorted;
    retain tmp_intradayvolume_sum;
    tmp_intradayvolume_sum + CountedVOLUME;
    if last.TRD_EVENT_ROUFOR then do;
        IntradayVolume = tmp_intradayvolume_sum;
        call missing(tmp_intradayvolume_sum);
    end;
    drop tmp_:;
run;

* Another way for calculating daily volume based on data set;
/*
proc sql noprint;
	create table sums as
	select TRD_STCK_CD, TRD_EVENT_DT, sum(CountedVOLUME) as volume_sum
	from Sampledata87_SumVolSo
	group by TRD_STCK_CD, TRD_EVENT_DT;

	create index TRD_STCK_CD on sums;
quit;

data Sampledata87_SumVolSo02;
	set Sampledata87_SumVolSo;
	by  TRD_EVENT_DT
		TRD_STCK_CD notsorted;
	volume_sum = .;
	if last.TRD_STCK_CD then
    set sums key=TRD_STCK_CD;
run;
*/;

*Calculating Volume by Data set;

*Division for calculating of adjusted volume in approach 1;
proc sort data=sampledata87_sumvolso out=sampledata87_sumvolso;
by TRD_STCK_CD TRD_EVENT_DT;
run;
 
data sampledata87_adjvol;
     do until(last.TRD_STCK_CD);
           do until(last.TRD_EVENT_DT);
                set sampledata87_sumvolso;
                by TRD_STCK_CD TRD_EVENT_DT;
 
                if first.TRD_STCK_CD then
                     n=0;
 
                if first.TRD_EVENT_DT then
                     n+1;
 
                if n>1 then
                     do;
                           if not missing(IntradayVolume) then
                                adjusted_volume=divide(IntradayVolume,temp);
                                else call missing(adjusted_volume);
                     end;
 
                if last.TRD_EVENT_DT then
                     temp=dailyvolume;
                output;
           end;
     end;
     drop temp n;
run;
 
proc sort data = sampledata87_adjvol;
by TRD_EVENT_DT TRD_STCK_CD;
run;


***********************************
STEP 4: RUNNING REGRESSION
***********************************
;

*Generating Dummy Variable Automatically;
* Regressing dummy variables on normalized volume variable using calculated volume;
proc genmod data=Sampledata87_adjvol;
   class TRD_EVENT_ROUFOR / param=effect;
   model adjusted_volume = TRD_EVENT_ROUFOR / noscale;
   ods select ParameterEstimates;
run;

/* Same analysis by using the CLASS statement */
proc glm data=Sampledata87_adjvol;
   class TRD_EVENT_ROUFOR;              /* Generates dummy variables internally */
   model adjusted_volume = TRD_EVENT_ROUFOR / solution;
   ods select ParameterEstimates;
quit;




☑ This topic is solved.

Need further help from the community? Please ask a new question.

Discussion stats
  • 4 replies
  • 178 views
  • 0 likes
  • 2 in conversation