Hello,
I am trying to write a macro that will drop variables from my regression model, one at a time, based on VIF. I want it to run, drop the highest VIF variable, run again, drop the highest, then repeat until all VIFs are 5 or less. Please help. My data set is named "save.dafinal", dependent variable is "gaaverage", and independent variables are saved as a macro variable "%radius1demo". Thanks.
Oh. Sorry. I'm new to posting in a community. Usually I just google my way to an answer from the SAS website, but this one has been rough. This is what I've done thus far.
%MACRO MULTICOLLINEARITY(YVAR,FIELDS,MAX_VIF);
/*
 * Backward elimination on VIF: fit the model, and if the largest VIF is
 * above MAX_VIF, drop that predictor and invoke this macro again on the
 * shortened list. Stops when the largest VIF is at or below MAX_VIF or
 * when no predictors would remain.
 *
 * Parameters
 *   YVAR    - dependent variable name
 *   FIELDS  - blank-separated list of predictor names
 *   MAX_VIF - largest VIF allowed to stay in the model
 *
 * Reads  : save.dafinal
 * Writes : PAREST2 (ParameterEstimates of the most recent fit, sorted by
 *          descending VIF)
 */
ods _all_ close;
%put Running with &fields;

/* Remove any PAREST2 left over from an earlier run, so that a failed
   PROC REG cannot feed stale estimates into the step below and make the
   macro call itself forever. */
proc datasets library=work nolist nowarn;
delete parest2;
quit;

PROC REG DATA=save.dafinal;
MODEL &YVAR = &FIELDS / VIF COLLIN NOINT;
ODS OUTPUT PARAMETERESTIMATES=PAREST2;
RUN;
quit;

proc sort data=parest2;
by descending varianceinflation;
run;

data _null_;
/* Explicit lengths: without them the list and the generated call would
   default to 200 characters and long predictor lists would be cut off. */
length fields_run call_string $32767 word $32;
set parest2(obs=1);
if varianceinflation > &max_vif then do;
/* Rebuild the list word by word. A substring replace would also damage
   longer names that contain the dropped name (dropping AGE would turn
   AGE2 into 2), and a case-sensitive match could remove nothing. */
fields_run = ' ';
do i = 1 to countw("&fields", ' ');
word = scan("&fields", i, ' ');
if upcase(word) ne upcase(strip(variable)) then fields_run = catx(' ', fields_run, word);
end;
if not missing(fields_run) then do;
/* The nrstr wrapper defers the macro call until this DATA step has
   finished instead of executing it while the step is still running. */
call_string = cats('%nrstr(%multicollinearity(',"&yvar.,",fields_run,",&max_vif.))");
call execute(call_string);
end;
end;
else do;
put "Stopped with Max VIF:" variable "=" varianceinflation;
end;
run;
ods preferences;
%MEND MULTICOLLINEARITY;
/* NOTE(review): the percent sign below invokes a MACRO named radius1demovars.
   If the predictor list is stored in a macro VARIABLE, it has to be
   referenced with an ampersand instead - confirm against the setup code. */
%MULTICOLLINEARITY(yvar=gaaverage, fields=%radius1demovars,MAX_VIF=5)
/* Author PRAKASH HULLATHI */
/*CREATING THE DATA FOR MULTICOLLINEARITY*/
%MACRO REMOVE_MULTICOLLINEARITY(DATASET=,YVAR=,VIF_CUTOFF=);
/*
 * Repeatedly fits YVAR on the numeric variables of DATASET and drops the
 * predictor with the largest VIF until the largest VIF is at or below
 * VIF_CUTOFF.
 *
 * Parameters
 *   DATASET    - input data set
 *   YVAR       - dependent variable name (matched case-insensitively)
 *   VIF_CUTOFF - largest VIF allowed to stay in the model
 *
 * Writes : T, T1, T11, PAREST2_SORT, PAREST2_SORT_2 (scratch) and PAREST2,
 *          which on exit holds the ParameterEstimates of the last model
 *          that was fitted.
 */
/* Keep the working macro variables out of the caller's symbol tables. */
%LOCAL XVARS MAX_VIF REMOVE_VAR DONE;

/*taking output of all variables*/
PROC CONTENTS DATA=&DATASET. VARNUM OUT=T;
RUN;

/*filter only numeric variables excluding dependent and date variables*/
DATA T1;
SET T;
KEEP NAME ;
/* Compare names in upper case so the dependent variable is excluded no
   matter how the caller typed it. */
WHERE TYPE=1 AND UPCASE(NAME) NE UPCASE("&YVAR.") AND FORMAT NOT IN('DATE') ;
RUN;

/*creating macro for independent variables*/
%LET XVARS=;
PROC SQL NOPRINT;
SELECT NAME INTO : XVARS SEPARATED BY ' '
FROM T1;
QUIT;
%PUT independent_variables=&XVARS;

%IF %LENGTH(&XVARS.) = 0 %THEN %DO;
%PUT ERROR: REMOVE_MULTICOLLINEARITY found no numeric predictors in &DATASET.;
%RETURN;
%END;

/*running the regression model till independent variables vif <= specified vif_cutoff*/
%LET DONE=0;
%DO %UNTIL (&DONE.);

/* Start every pass from a known state so that a failed fit is detected
   instead of silently reusing the previous pass. */
%LET MAX_VIF=.;
%LET REMOVE_VAR=;
PROC DATASETS LIBRARY=WORK NOLIST NOWARN;
DELETE PAREST2;
QUIT;

/*taking the output of independent variables vif*/
ODS OUTPUT PARAMETERESTIMATES=PAREST2;
PROC REG DATA=&DATASET. ;
MODEL &YVAR.= &XVARS. / VIF ;
RUN;
QUIT;

/*dropping the independent variables with missing vif value*/
DATA T11;
SET PAREST2;
IF VarianceInflation NOT IN(.) ;
RUN;

/*sorting the vif value by descending order*/
PROC SORT DATA=PAREST2 OUT=PAREST2_SORT ;
BY DESCENDING VarianceInflation;
RUN;

/*considering the highest vif value */
DATA PAREST2_SORT_2;
SET PAREST2_SORT;
IF _N_=1;
RUN;

/*name of the variable with the highest vif, and the highest vif itself*/
PROC SQL NOPRINT;
SELECT VARIABLE INTO: REMOVE_VAR
FROM PAREST2_SORT_2;
SELECT MAX(VarianceInflation) INTO: MAX_VIF
FROM PAREST2;
QUIT;
/* Re-assigning strips the leading and trailing blanks left by INTO. */
%LET REMOVE_VAR=&REMOVE_VAR.;
%LET MAX_VIF=&MAX_VIF.;

%IF %SUPERQ(MAX_VIF) = %STR(.) %THEN %DO;
/* No usable VIF came back (for example PROC REG failed) - stop. */
%PUT WARNING: REMOVE_MULTICOLLINEARITY could not obtain a VIF from PROC REG - stopping.;
%LET DONE=1;
%END;
%ELSE %IF %SYSEVALF(&MAX_VIF. <= &VIF_CUTOFF.) %THEN %DO;
/* The model just fitted already satisfies the cutoff: keep it as is. */
%PUT max_vif=&MAX_VIF. - no further variable removed;
%LET DONE=1;
%END;
%ELSE %DO;
/*dropping the highest vif value variable*/
%LET XVARS=;
PROC SQL NOPRINT ;
SELECT Variable INTO: XVARS SEPARATED BY ' '
FROM T11
WHERE VARIABLE NOT IN("&REMOVE_VAR.","Intercept") ;
QUIT;
%PUT max_vif=&MAX_VIF. variable_removed=&REMOVE_VAR. ;
%IF %LENGTH(&XVARS.) = 0 %THEN %DO;
/* Nothing left to fit - leave the loop rather than refit the same model. */
%PUT WARNING: REMOVE_MULTICOLLINEARITY has no predictors left to fit - stopping.;
%LET DONE=1;
%END;
%END;

%END;
%MEND REMOVE_MULTICOLLINEARITY;
%REMOVE_MULTICOLLINEARITY(VIF_CUTOFF=10,YVAR=MPG_City,DATASET=sashelp.cars)
/* Run this part only after the macro call above has finished:
   PAREST2 then holds the parameter estimates of the last fitted model. */

/* Names of the terms that survived the VIF screening. */
data NO_MULTICOLLINEARITY (keep=Variable);
  set PAREST2;
run;

/* Collect the surviving predictors (everything except the intercept row)
   into one blank-separated macro variable. */
proc sql noprint;
  select Variable
    into :XVARS_final separated by ' '
    from NO_MULTICOLLINEARITY
    where Variable ne "Intercept";
quit;

/* Refit the model on the surviving predictors and report their VIFs. */
proc reg data=SASHELP.CARS;
  model MPG_City = &XVARS_final. / vif;
run;
quit;
/* Author PRAKASH HULLATHI */
/*CREATING THE DATA FOR MULTICOLLINEARITY*/
%MACRO REMOVE_MULTICOLLINEARITY(DATASET=,YVAR=,VIF_CUTOFF=);
/*
 * Repeatedly fits YVAR on the numeric variables of DATASET and drops the
 * predictor with the largest VIF until the largest VIF is at or below
 * VIF_CUTOFF.
 *
 * Parameters
 *   DATASET    - input data set
 *   YVAR       - dependent variable name (matched case-insensitively)
 *   VIF_CUTOFF - largest VIF allowed to stay in the model
 *
 * Writes : T, T1, T11, PAREST2_SORT, PAREST2_SORT_2 (scratch) and PAREST2,
 *          which on exit holds the ParameterEstimates of the last model
 *          that was fitted.
 */
/* Keep the working macro variables out of the caller's symbol tables. */
%LOCAL XVARS MAX_VIF REMOVE_VAR DONE;

/*taking output of all variables*/
PROC CONTENTS DATA=&DATASET. VARNUM OUT=T;
RUN;

/*filter only numeric variables excluding dependent and date variables*/
DATA T1;
SET T;
KEEP NAME ;
/* Compare names in upper case so the dependent variable is excluded no
   matter how the caller typed it. */
WHERE TYPE=1 AND UPCASE(NAME) NE UPCASE("&YVAR.") AND FORMAT NOT IN('DATE') ;
RUN;

/*creating macro for independent variables*/
%LET XVARS=;
PROC SQL NOPRINT;
SELECT NAME INTO : XVARS SEPARATED BY ' '
FROM T1;
QUIT;
%PUT independent_variables=&XVARS;

%IF %LENGTH(&XVARS.) = 0 %THEN %DO;
%PUT ERROR: REMOVE_MULTICOLLINEARITY found no numeric predictors in &DATASET.;
%RETURN;
%END;

/*running the regression model till independent variables vif <= specified vif_cutoff*/
%LET DONE=0;
%DO %UNTIL (&DONE.);

/* Start every pass from a known state so that a failed fit is detected
   instead of silently reusing the previous pass. */
%LET MAX_VIF=.;
%LET REMOVE_VAR=;
PROC DATASETS LIBRARY=WORK NOLIST NOWARN;
DELETE PAREST2;
QUIT;

/*taking the output of independent variables vif*/
ODS OUTPUT PARAMETERESTIMATES=PAREST2;
PROC REG DATA=&DATASET. ;
MODEL &YVAR.= &XVARS. / VIF ;
RUN;
QUIT;

/*dropping the independent variables with missing vif value*/
DATA T11;
SET PAREST2;
IF VarianceInflation NOT IN(.) ;
RUN;

/*sorting the vif value by descending order*/
PROC SORT DATA=PAREST2 OUT=PAREST2_SORT ;
BY DESCENDING VarianceInflation;
RUN;

/*considering the highest vif value */
DATA PAREST2_SORT_2;
SET PAREST2_SORT;
IF _N_=1;
RUN;

/*name of the variable with the highest vif, and the highest vif itself*/
PROC SQL NOPRINT;
SELECT VARIABLE INTO: REMOVE_VAR
FROM PAREST2_SORT_2;
SELECT MAX(VarianceInflation) INTO: MAX_VIF
FROM PAREST2;
QUIT;
/* Re-assigning strips the leading and trailing blanks left by INTO. */
%LET REMOVE_VAR=&REMOVE_VAR.;
%LET MAX_VIF=&MAX_VIF.;

%IF %SUPERQ(MAX_VIF) = %STR(.) %THEN %DO;
/* No usable VIF came back (for example PROC REG failed) - stop. */
%PUT WARNING: REMOVE_MULTICOLLINEARITY could not obtain a VIF from PROC REG - stopping.;
%LET DONE=1;
%END;
%ELSE %IF %SYSEVALF(&MAX_VIF. <= &VIF_CUTOFF.) %THEN %DO;
/* The model just fitted already satisfies the cutoff: keep it as is. */
%PUT max_vif=&MAX_VIF. - no further variable removed;
%LET DONE=1;
%END;
%ELSE %DO;
/*dropping the highest vif value variable*/
%LET XVARS=;
PROC SQL NOPRINT ;
SELECT Variable INTO: XVARS SEPARATED BY ' '
FROM T11
WHERE VARIABLE NOT IN("&REMOVE_VAR.","Intercept") ;
QUIT;
%PUT max_vif=&MAX_VIF. variable_removed=&REMOVE_VAR. ;
%IF %LENGTH(&XVARS.) = 0 %THEN %DO;
/* Nothing left to fit - leave the loop rather than refit the same model. */
%PUT WARNING: REMOVE_MULTICOLLINEARITY has no predictors left to fit - stopping.;
%LET DONE=1;
%END;
%END;

%END;
%MEND REMOVE_MULTICOLLINEARITY;
%REMOVE_MULTICOLLINEARITY(YVAR=MPG_City,DATASET=sashelp.cars,VIF_CUTOFF=10)
/* Everything below expects the macro call above to have completed, because
   it reads the PAREST2 table that the call leaves behind. */

/* Reduce PAREST2 to the column of term names. */
data NO_MULTICOLLINEARITY;
  set PAREST2 (keep=Variable);
run;

/* Build the final predictor list, skipping the intercept row. */
proc sql noprint;
  select Variable into :XVARS_final separated by ' '
  from NO_MULTICOLLINEARITY
  where not (Variable = "Intercept");
quit;

/* Final regression on the retained predictors, with VIFs printed. */
proc reg data=SASHELP.CARS;
  model MPG_City = &XVARS_final. / vif;
run;
quit;
Hi Team,
I am providing the correct solution for removing multicollinearity sequentially.
Regards
Prakash Hullathi
Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 25. Read more here about why you should contribute and what is in it for you!
ANOVA, or Analysis Of Variance, is used to compare the averages or means of two or more populations to better understand how they differ. Watch this tutorial for more.
Find more tutorials on the SAS Users YouTube channel.