Hi everyone,
I have created macro to test for multicollinearity but I have a couple problems that I need help from the following code.
%macro vif_automated(target,num);
%let VIF_limit=2;
%let VIF_val=100;
/*Variable list*/
proc sql noprint;
select Variable
into :varlist
separated by " "
from PD.cluster_&segment._select
where variable not in ("&target.");
quit;
/*Create a blank table for removed variable*/
proc reg data=PD.MEF_ALL_&segment.;
model &target.=&varlist. /vif;
ods output parameterestimates=vif;
quit;
data removed_variable_list;
set vif;
if _n_=0 then output removed_variable_list;
run;
/*Loop*/
%do %while (%sysevalf(&vif_val.>&VIF_limit.));
proc reg data=PD.MEF_ALL_&segment.;
model &target.=&varlist. /vif;
ods output parameterestimates=vif;
quit;
dm 'odsresults; clear';
dm 'log; clear';
proc sort data=vif;
by descending VarianceInflation;
quit;
data vif_top vif_others;
set vif;
if _n_=1 then output vif_top;
if _n_>1 then output vif_others;
run;
data vif_top;
set vif_top;
call symput(compress("vif_val"),compress(VarianceInflation));
run;
data removed_variable_list;
set vif_top removed_variable_list;
run;
proc sql;
select distinct variable
into: varlist
separated by " "
from vif_others
where variable^="Intercept";
quit;
%end;
data final_variable_vif_&num.;
set vif;
run;
%mend;
%vif_automated(ConLoanc_M12_lg2,1);
%vif_automated(X_CVM_M12_lg4,2);
%vif_automated(C_CVM_M6_lg1,3);
%vif_automated(Auto_sale_comm_C,4);
%vif_automated(C_CVM_M12_lg3,5);
%vif_automated(C_CVM_M6_lg4,6);
%vif_automated(C_CVM_M6_lg3,7);
%vif_automated(Farm_C_lg1,8);
%vif_automated(X_CVM_M6_lg2,9);
%vif_automated(UR_M6,10);
%vif_automated(X_CVM_LN_lg4,11);
%vif_automated(X_CVM_M3_lg4,12);
%vif_automated(Auto_sale_Pass_C,13);
%vif_automated(CCI_C_lg2,14);
%vif_automated(UR_LN,15);
%vif_automated(UR_M3,16);
%vif_automated(C_CVM_lg3,17);
%vif_automated(Farm_C,18);
I try to avoid to type Y variable name as a target by write code like above. Are there any faster way to set one of the variables in PD.cluster_&segment._select file as Y and the rest of them are X.
Another thing is that when I manually set Y variable, I have to put the number 1,2,3,... to keep result. Are there any way to create the final results file as one time after all combination.
Thank you so much
You could re-write the macro as big loop, which has all of the Y variables as input. This seems to solve both of your questions. However, it may not be worth the work now that you have written the existing macro, re-writing the macro may simply be extra work for no benefit.
For example, something like this
%macro vif_automated(targets=);
...
%mend;
%vif_automated(targets=ConcLoanc_M12_lg2 X_CBM_M12_lg4 C_CVM_Mg_lg1 ...)
Note: the forum software has mangled the formatting of the SAS text above...
As a side issue, there are many different existing automated ways to deal with multicollinearity that are already built into SAS, such as PROC PLS, that are superior to the method you are creating.
/* Author PRAKASH HULLATHI */
/*CREATING THE DATA FOR MULTICOLLINEARITY*/
%MACRO REMOVE_MULTICOLLINEARITY(DATASET=,YVAR=,VIF_CUTOFF=);
/*taking output of all variables*/
PROC CONTENTS DATA=&DATASET. VARNUM OUT=T;
RUN;
/*filter only numeric variables excluding dependent and date variables*/
DATA T1;
SET T;
KEEP NAME ;
WHERE TYPE=1 AND NAME NOT IN("&yvar.") and FORMAT not in('DATE') ;/* Remove target and date variables*/
RUN;
/*creating macro for independent variables*/
PROC SQL NOPRINT;
SELECT NAME INTO : XVARS SEPARATED BY ' '
FROM T1;
QUIT;
%PUT independent_variables=&XVARS;
/*running the regression model till independent variables vif< specified vif_cutoff*/
%DO %UNTIL (%SYSEVALF(&MAX_VIF.<=&VIF_CUTOFF.) );
/*taking the output of independent variables vif by removing the intercept*/
ODS OUTPUT PARAMETERESTIMATES=PAREST2;
PROC REG DATA=&DATASET. ;
MODEL &YVAR.= &XVARS. / VIF ;
RUN;
QUIT;
/*dropping the independent variables with missing vif value*/
DATA T11;
SET PAREST2;
IF VarianceInflation NOT IN(.) ;
RUN;
/*sorting the vif value by descending order*/
PROC SORT DATA=PAREST2 OUT=PAREST2_SORT ;
BY DESCENDING VarianceInflation;
RUN;
/*considering the highest vif value */
DATA PAREST2_SORT_2;
SET PAREST2_SORT;
IF _N_=1;
RUN;
/*creating the macro for highest vif value*/
PROC SQL NOPRINT;
SELECT VARIABLE INTO: REMOVE_VAR
FROM PAREST2_SORT_2;
QUIT;
/*dropping the highest vif value variable*/
PROC SQL NOPRINT ;
SELECT Variable INTO: XVARS SEPARATED BY ' '
FROM T11
WHERE VARIABLE NOT IN("&REMOVE_VAR.","Intercept") ;
QUIT;
/*getiing the highest vif value*/
PROC SQL NOPRINT;
SELECT MAX(VarianceInflation) INTO: MAX_VIF
FROM PAREST2;
QUIT;
%PUT max_vif=&MAX_VIF. variable_removed=&REMOVE_VAR. ;
%END;
%MEND REMOVE_MULTICOLLINEARITY;
%REMOVE_MULTICOLLINEARITY(DATASET=sashelp.cars,YVAR=MPG_City,VIF_CUTOFF=10)
/* after running the above sas macro code
for multicollinearity then run the below code */
/* final data set with no multicollinearity variables*/
data NO_MULTICOLLINEARITY;
SET PAREST2;
KEEP VARIABLE;
RUN;
PROC SQL NOPRINT;
SELECT VARIABLE INTO : XVARS_final SEPARATED BY ' '
FROM NO_MULTICOLLINEARITY
WHERE VARIABLE NOT IN("Intercept");
QUIT;
PROC REG DATA=SASHELP.CARS;
MODEL MPG_City=&XVARS_final./ VIF ;
RUN;
QUIT;
/* Author PRAKASH HULLATHI */
/*CREATING THE DATA FOR MULTICOLLINEARITY*/
%MACRO REMOVE_MULTICOLLINEARITY(DATASET=,YVAR=,VIF_CUTOFF=);
/*taking output of all variables*/
PROC CONTENTS DATA=&DATASET. VARNUM OUT=T;
RUN;
/*filter only numeric variables excluding dependent and date variables*/
DATA T1;
SET T;
KEEP NAME ;
WHERE TYPE=1 AND NAME NOT IN("&yvar.") and FORMAT not in('DATE') ;/* Remove target and date variables*/
RUN;
/*creating macro for independent variables*/
PROC SQL NOPRINT;
SELECT NAME INTO : XVARS SEPARATED BY ' '
FROM T1;
QUIT;
%PUT independent_variables=&XVARS;
/*running the regression model till independent variables vif< specified vif_cutoff*/
%DO %UNTIL (%SYSEVALF(&MAX_VIF.<=&VIF_CUTOFF.) );
/*taking the output of independent variables vif by removing the intercept*/
ODS OUTPUT PARAMETERESTIMATES=PAREST2;
PROC REG DATA=&DATASET. ;
MODEL &YVAR.= &XVARS. / VIF ;
RUN;
QUIT;
/*dropping the independent variables with missing vif value*/
DATA T11;
SET PAREST2;
IF VarianceInflation NOT IN(.) ;
RUN;
/*sorting the vif value by descending order*/
PROC SORT DATA=PAREST2 OUT=PAREST2_SORT ;
BY DESCENDING VarianceInflation;
RUN;
/*considering the highest vif value */
DATA PAREST2_SORT_2;
SET PAREST2_SORT;
IF _N_=1;
RUN;
/*creating the macro for highest vif value*/
PROC SQL NOPRINT;
SELECT VARIABLE INTO: REMOVE_VAR
FROM PAREST2_SORT_2;
QUIT;
/*dropping the highest vif value variable*/
PROC SQL NOPRINT ;
SELECT Variable INTO: XVARS SEPARATED BY ' '
FROM T11
WHERE VARIABLE NOT IN("&REMOVE_VAR.","Intercept") ;
QUIT;
/*getiing the highest vif value*/
PROC SQL NOPRINT;
SELECT MAX(VarianceInflation) INTO: MAX_VIF
FROM PAREST2;
QUIT;
%PUT max_vif=&MAX_VIF. variable_removed=&REMOVE_VAR. ;
%END;
%MEND REMOVE_MULTICOLLINEARITY;
%REMOVE_MULTICOLLINEARITY(DATASET=sashelp.cars,YVAR=MPG_City,VIF_CUTOFF=10)
/* after running the above sas macro code
for multicollinearity then run the below code */
/* final data set with no multicollinearity variables*/
data NO_MULTICOLLINEARITY;
SET PAREST2;
KEEP VARIABLE;
RUN;
PROC SQL NOPRINT;
SELECT VARIABLE INTO : XVARS_final SEPARATED BY ' '
FROM NO_MULTICOLLINEARITY
WHERE VARIABLE NOT IN("Intercept");
QUIT;
PROC REG DATA=SASHELP.CARS;
MODEL MPG_City=&XVARS_final./ VIF ;
RUN;
QUIT;
/* Author PRAKASH HULLATHI */
/*CREATING THE DATA FOR MULTICOLLINEARITY*/
%MACRO REMOVE_MULTICOLLINEARITY(DATASET=,YVAR=,VIF_CUTOFF=);
/*taking output of all variables*/
PROC CONTENTS DATA=&DATASET. VARNUM OUT=T;
RUN;
/*filter only numeric variables excluding dependent and date variables*/
DATA T1;
SET T;
KEEP NAME ;
WHERE TYPE=1 AND NAME NOT IN("&yvar.") and FORMAT not in('DATE') ;/* Remove target and date variables*/
RUN;
/*creating macro for independent variables*/
PROC SQL NOPRINT;
SELECT NAME INTO : XVARS SEPARATED BY ' '
FROM T1;
QUIT;
%PUT independent_variables=&XVARS;
/*running the regression model till independent variables vif< specified vif_cutoff*/
%DO %UNTIL (%SYSEVALF(&MAX_VIF.<=&VIF_CUTOFF.) );
/*taking the output of independent variables vif by removing the intercept*/
ODS OUTPUT PARAMETERESTIMATES=PAREST2;
PROC REG DATA=&DATASET. ;
MODEL &YVAR.= &XVARS. / VIF ;
RUN;
QUIT;
/*dropping the independent variables with missing vif value*/
DATA T11;
SET PAREST2;
IF VarianceInflation NOT IN(.) ;
RUN;
/*sorting the vif value by descending order*/
PROC SORT DATA=PAREST2 OUT=PAREST2_SORT ;
BY DESCENDING VarianceInflation;
RUN;
/*considering the highest vif value */
DATA PAREST2_SORT_2;
SET PAREST2_SORT;
IF _N_=1;
RUN;
/*creating the macro for highest vif value*/
PROC SQL NOPRINT;
SELECT VARIABLE INTO: REMOVE_VAR
FROM PAREST2_SORT_2;
QUIT;
/*dropping the highest vif value variable*/
PROC SQL NOPRINT ;
SELECT Variable INTO: XVARS SEPARATED BY ' '
FROM T11
WHERE VARIABLE NOT IN("&REMOVE_VAR.","Intercept") ;
QUIT;
/*getiing the highest vif value*/
PROC SQL NOPRINT;
SELECT MAX(VarianceInflation) INTO: MAX_VIF
FROM PAREST2;
QUIT;
%PUT max_vif=&MAX_VIF. variable_removed=&REMOVE_VAR. ;
%END;
%MEND REMOVE_MULTICOLLINEARITY;
%REMOVE_MULTICOLLINEARITY(DATASET=sashelp.cars,YVAR=MPG_City,VIF_CUTOFF=10)
/* after running the above sas macro code
for multicollinearity then run the below code */
/* final data set with no multicollinearity variables*/
data NO_MULTICOLLINEARITY;
SET PAREST2;
KEEP VARIABLE;
RUN;
PROC SQL NOPRINT;
SELECT VARIABLE INTO : XVARS_final SEPARATED BY ' '
FROM NO_MULTICOLLINEARITY
WHERE VARIABLE NOT IN("Intercept");
QUIT;
PROC REG DATA=SASHELP.CARS;
MODEL MPG_City=&XVARS_final./ VIF ;
RUN;
QUIT;
/* Author PRAKASH HULLATHI */
/*CREATING THE DATA FOR MULTICOLLINEARITY*/
%MACRO REMOVE_MULTICOLLINEARITY(DATASET=,YVAR=,VIF_CUTOFF=);
/*taking output of all variables*/
PROC CONTENTS DATA=&DATASET. VARNUM OUT=T;
RUN;
/*filter only numeric variables excluding dependent and date variables*/
DATA T1;
SET T;
KEEP NAME ;
WHERE TYPE=1 AND NAME NOT IN("&yvar.") and FORMAT not in('DATE') ;/* Remove target and date variables*/
RUN;
/*creating macro for independent variables*/
PROC SQL NOPRINT;
SELECT NAME INTO : XVARS SEPARATED BY ' '
FROM T1;
QUIT;
%PUT independent_variables=&XVARS;
/*running the regression model till independent variables vif< specified vif_cutoff*/
%DO %UNTIL (%SYSEVALF(&MAX_VIF.<=&VIF_CUTOFF.) );
/*taking the output of independent variables vif by removing the intercept*/
ODS OUTPUT PARAMETERESTIMATES=PAREST2;
PROC REG DATA=&DATASET. ;
MODEL &YVAR.= &XVARS. / VIF ;
RUN;
QUIT;
/*dropping the independent variables with missing vif value*/
DATA T11;
SET PAREST2;
IF VarianceInflation NOT IN(.) ;
RUN;
/*sorting the vif value by descending order*/
PROC SORT DATA=PAREST2 OUT=PAREST2_SORT ;
BY DESCENDING VarianceInflation;
RUN;
/*considering the highest vif value */
DATA PAREST2_SORT_2;
SET PAREST2_SORT;
IF _N_=1;
RUN;
/*creating the macro for highest vif value*/
PROC SQL NOPRINT;
SELECT VARIABLE INTO: REMOVE_VAR
FROM PAREST2_SORT_2;
QUIT;
/*dropping the highest vif value variable*/
PROC SQL NOPRINT ;
SELECT Variable INTO: XVARS SEPARATED BY ' '
FROM T11
WHERE VARIABLE NOT IN("&REMOVE_VAR.","Intercept") ;
QUIT;
/*getiing the highest vif value*/
PROC SQL NOPRINT;
SELECT MAX(VarianceInflation) INTO: MAX_VIF
FROM PAREST2;
QUIT;
%PUT max_vif=&MAX_VIF. variable_removed=&REMOVE_VAR. ;
%END;
%MEND REMOVE_MULTICOLLINEARITY;
%REMOVE_MULTICOLLINEARITY(DATASET=sashelp.cars,YVAR=MPG_City,VIF_CUTOFF=10)
/* after running the above sas macro code
for multicollinearity then run the below code */
/* final data set with no multicollinearity variables*/
data NO_MULTICOLLINEARITY;
SET PAREST2;
KEEP VARIABLE;
RUN;
PROC SQL NOPRINT;
SELECT VARIABLE INTO : XVARS_final SEPARATED BY ' '
FROM NO_MULTICOLLINEARITY
WHERE VARIABLE NOT IN("Intercept");
QUIT;
PROC REG DATA=SASHELP.CARS;
MODEL MPG_City=&XVARS_final./ VIF ;
RUN;
QUIT;
SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.