I have a GLM model:
/* Gamma GLM with a log link: MONEY is modelled on the two
   categorical predictors COUNTRY and JOB. */
proc genmod data=mydata;
    class country job;
    model money = country job / link=log dist=gamma;
run;
I want to estimate the prediction error. How can I do 10-fold cross-validation on my data with SAS?
Here is an example I wrote before, but it is for logistic regression.
/****** K-Fold CV ****/
/*
 * k_fold_cv: one pass of k-fold cross-validation for a logistic model
 * fitted to SASHELP.HEART.
 *
 * Parameters
 *   k    - number of folds (must be at least 2). Default 10.
 *   seed - optional positive integer passed to PROC SURVEYSELECT so the
 *          fold assignment can be reproduced. With the default (0) no
 *          SEED= option is generated and the procedure picks its own seed.
 *
 * Output
 *   WORK.K_FOLD_CV - one row per fold with
 *                      native   : c statistic from the fit on the training folds
 *                      true     : AUC from scoring the held-out fold
 *                      optimism : native - true
 *                      id       : fold number
 *
 * Side effects
 *   Overwrites WORK.HAVE, TRAINING, TEST, TRUE, NATIVE, EST and
 *   WORK.SCORE1-SCORE&k.
 */
%macro k_fold_cv(k=10, seed=0);
    /* Keep the loop counter out of the calling scope. */
    %local i;

    /* With fewer than two folds there is no training data left. */
    %if %sysevalf(&k < 2) %then %do;
        %put ERROR: k_fold_cv requires k >= 2 (got k=&k).;
        %return;
    %end;

    ods select none;

    /* Randomly assign every observation to one of &k groups (GroupID). */
    proc surveyselect data=sashelp.heart group=&k out=have
        %if &seed > 0 %then %do; seed=&seed %end;
        ;
    run;

    %do i=1 %to &k;
        /* Drop leftovers from the previous fold (or an earlier run) so a
           failed fit cannot silently reuse old statistics. */
        proc datasets lib=work nolist nowarn;
            delete true native est score&i;
        quit;

        /* Fold &i is held out; all other folds form the training set. */
        data training;
            set have(where=(groupid ne &i));
        run;

        data test;
            set have(where=(groupid eq &i));
        run;

        /* Capture the training c statistic and the held-out AUC. */
        ods output
            Association=native(keep=label2 nvalue2 rename=(nvalue2=native) where=(label2='c'))
            ScoreFitStat=true(keep=dataset freq auc rename=(auc=true));

        proc logistic data=training
                      outest=est(keep=_status_ _name_);
            class sex;
            model status(event='Alive')=sex height weight;
            score data=test fitstat;
        run;

        /* Combine the one-row tables and tag the result with the fold number. */
        data score&i;
            merge true native est;
            retain id &i;
            optimism=native-true;
        run;
    %end;

    /* Stack the per-fold results. */
    data k_fold_cv;
        set score1-score&k;
    run;

    ods select all;
%mend;
%k_fold_cv(k=10)
/*************************************/
/*
 * k_fold_cv_rep: repeated k-fold cross-validation for a logistic model
 * fitted to SASHELP.HEART. The data are re-partitioned on every repetition.
 *
 * Parameters
 *   r    - number of repetitions. Default 1.
 *   k    - number of folds per repetition (must be at least 2). Default 10.
 *   seed - optional positive integer base seed for PROC SURVEYSELECT.
 *          Repetition n uses seed + n - 1 so each repetition gets a
 *          different but reproducible partition. With the default (0) no
 *          SEED= option is generated.
 *
 * Output
 *   WORK.K_FOLD_CV_REP - one row per repetition and fold with
 *                          native   : c statistic from the training fit
 *                          true     : AUC from scoring the held-out fold
 *                          optimism : native - true
 *                          rep, id  : repetition and fold numbers
 *
 * Side effects
 *   Deletes every WORK data set whose name starts with SCORE_R, and
 *   overwrites WORK.HAVE, TRAINING, TEST, TRUE, NATIVE and EST.
 */
%macro k_fold_cv_rep(r=1, k=10, seed=0);
    /* Dedicated local counters: the original reused the parameter name R
       as the loop index, which overwrote the parameter while looping. */
    %local rep i;

    /* With fewer than two folds there is no training data left. */
    %if %sysevalf(&k < 2) %then %do;
        %put ERROR: k_fold_cv_rep requires k >= 2 (got k=&k).;
        %return;
    %end;

    ods select none;

    /* The final SET uses the SCORE_R: prefix list, so results left behind
       by an earlier call with larger r or k must be removed first. */
    proc datasets lib=work nolist nowarn;
        delete score_r: ;
    quit;

    %do rep=1 %to &r;
        /* Fresh random assignment of observations to &k groups (GroupID). */
        proc surveyselect data=sashelp.heart group=&k out=have
            %if &seed > 0 %then %do; seed=%eval(&seed + &rep - 1) %end;
            ;
        run;

        %do i=1 %to &k;
            /* Drop the previous fold's tables so a failed fit cannot
               silently reuse old statistics. */
            proc datasets lib=work nolist nowarn;
                delete true native est;
            quit;

            /* Fold &i is held out; all other folds form the training set. */
            data training;
                set have(where=(groupid ne &i));
            run;

            data test;
                set have(where=(groupid eq &i));
            run;

            /* Capture the training c statistic and the held-out AUC. */
            ods output
                Association=native(keep=label2 nvalue2 rename=(nvalue2=native) where=(label2='c'))
                ScoreFitStat=true(keep=dataset freq auc rename=(auc=true));

            proc logistic data=training
                          outest=est(keep=_status_ _name_);
                class sex;
                model status(event='Alive')=sex height weight;
                score data=test fitstat;
            run;

            /* Combine the one-row tables and tag with repetition and fold. */
            data score_r&rep._&i;
                merge true native est;
                retain rep &rep id &i;
                optimism=native-true;
            run;
        %end;
    %end;

    /* Stack every repetition/fold result produced by this call. */
    data k_fold_cv_rep;
        set score_r:;
    run;

    ods select all;
%mend;
%k_fold_cv_rep(r=20,k=10);
/********************/
/* Stack the single-run and repeated-run results and record, for every
   row, the name of the data set it was read from. */
data all;
    set k_fold_cv
        k_fold_cv_rep
        indsname=_source;
    length indsname $ 32;
    indsname = _source;
run;

/* One output row per source data set: mean optimism together with the
   lower and upper confidence limits of that mean. */
proc summary data=all nway;
    class indsname;
    var optimism;
    output out=want
        mean(optimism)=mean
        lclm(optimism)=lclm
        uclm(optimism)=uclm;
run;
As far as I know, there is no built-in way in SAS to cross-validate such a model. You would have to write your own cross-validation code, or find a macro that someone else has written.
Here is an example I wrote before, but it is for logistic regression.
/****** K-Fold CV ****/
/*
 * k_fold_cv: one pass of k-fold cross-validation for a logistic model
 * fitted to SASHELP.HEART.
 *
 * Parameters
 *   k    - number of folds (must be at least 2). Default 10.
 *   seed - optional positive integer passed to PROC SURVEYSELECT so the
 *          fold assignment can be reproduced. With the default (0) no
 *          SEED= option is generated and the procedure picks its own seed.
 *
 * Output
 *   WORK.K_FOLD_CV - one row per fold with
 *                      native   : c statistic from the fit on the training folds
 *                      true     : AUC from scoring the held-out fold
 *                      optimism : native - true
 *                      id       : fold number
 *
 * Side effects
 *   Overwrites WORK.HAVE, TRAINING, TEST, TRUE, NATIVE, EST and
 *   WORK.SCORE1-SCORE&k.
 */
%macro k_fold_cv(k=10, seed=0);
    /* Keep the loop counter out of the calling scope. */
    %local i;

    /* With fewer than two folds there is no training data left. */
    %if %sysevalf(&k < 2) %then %do;
        %put ERROR: k_fold_cv requires k >= 2 (got k=&k).;
        %return;
    %end;

    ods select none;

    /* Randomly assign every observation to one of &k groups (GroupID). */
    proc surveyselect data=sashelp.heart group=&k out=have
        %if &seed > 0 %then %do; seed=&seed %end;
        ;
    run;

    %do i=1 %to &k;
        /* Drop leftovers from the previous fold (or an earlier run) so a
           failed fit cannot silently reuse old statistics. */
        proc datasets lib=work nolist nowarn;
            delete true native est score&i;
        quit;

        /* Fold &i is held out; all other folds form the training set. */
        data training;
            set have(where=(groupid ne &i));
        run;

        data test;
            set have(where=(groupid eq &i));
        run;

        /* Capture the training c statistic and the held-out AUC. */
        ods output
            Association=native(keep=label2 nvalue2 rename=(nvalue2=native) where=(label2='c'))
            ScoreFitStat=true(keep=dataset freq auc rename=(auc=true));

        proc logistic data=training
                      outest=est(keep=_status_ _name_);
            class sex;
            model status(event='Alive')=sex height weight;
            score data=test fitstat;
        run;

        /* Combine the one-row tables and tag the result with the fold number. */
        data score&i;
            merge true native est;
            retain id &i;
            optimism=native-true;
        run;
    %end;

    /* Stack the per-fold results. */
    data k_fold_cv;
        set score1-score&k;
    run;

    ods select all;
%mend;
%k_fold_cv(k=10)
/*************************************/
/*
 * k_fold_cv_rep: repeated k-fold cross-validation for a logistic model
 * fitted to SASHELP.HEART. The data are re-partitioned on every repetition.
 *
 * Parameters
 *   r    - number of repetitions. Default 1.
 *   k    - number of folds per repetition (must be at least 2). Default 10.
 *   seed - optional positive integer base seed for PROC SURVEYSELECT.
 *          Repetition n uses seed + n - 1 so each repetition gets a
 *          different but reproducible partition. With the default (0) no
 *          SEED= option is generated.
 *
 * Output
 *   WORK.K_FOLD_CV_REP - one row per repetition and fold with
 *                          native   : c statistic from the training fit
 *                          true     : AUC from scoring the held-out fold
 *                          optimism : native - true
 *                          rep, id  : repetition and fold numbers
 *
 * Side effects
 *   Deletes every WORK data set whose name starts with SCORE_R, and
 *   overwrites WORK.HAVE, TRAINING, TEST, TRUE, NATIVE and EST.
 */
%macro k_fold_cv_rep(r=1, k=10, seed=0);
    /* Dedicated local counters: the original reused the parameter name R
       as the loop index, which overwrote the parameter while looping. */
    %local rep i;

    /* With fewer than two folds there is no training data left. */
    %if %sysevalf(&k < 2) %then %do;
        %put ERROR: k_fold_cv_rep requires k >= 2 (got k=&k).;
        %return;
    %end;

    ods select none;

    /* The final SET uses the SCORE_R: prefix list, so results left behind
       by an earlier call with larger r or k must be removed first. */
    proc datasets lib=work nolist nowarn;
        delete score_r: ;
    quit;

    %do rep=1 %to &r;
        /* Fresh random assignment of observations to &k groups (GroupID). */
        proc surveyselect data=sashelp.heart group=&k out=have
            %if &seed > 0 %then %do; seed=%eval(&seed + &rep - 1) %end;
            ;
        run;

        %do i=1 %to &k;
            /* Drop the previous fold's tables so a failed fit cannot
               silently reuse old statistics. */
            proc datasets lib=work nolist nowarn;
                delete true native est;
            quit;

            /* Fold &i is held out; all other folds form the training set. */
            data training;
                set have(where=(groupid ne &i));
            run;

            data test;
                set have(where=(groupid eq &i));
            run;

            /* Capture the training c statistic and the held-out AUC. */
            ods output
                Association=native(keep=label2 nvalue2 rename=(nvalue2=native) where=(label2='c'))
                ScoreFitStat=true(keep=dataset freq auc rename=(auc=true));

            proc logistic data=training
                          outest=est(keep=_status_ _name_);
                class sex;
                model status(event='Alive')=sex height weight;
                score data=test fitstat;
            run;

            /* Combine the one-row tables and tag with repetition and fold. */
            data score_r&rep._&i;
                merge true native est;
                retain rep &rep id &i;
                optimism=native-true;
            run;
        %end;
    %end;

    /* Stack every repetition/fold result produced by this call. */
    data k_fold_cv_rep;
        set score_r:;
    run;

    ods select all;
%mend;
%k_fold_cv_rep(r=20,k=10);
/********************/
/* Stack the single-run and repeated-run results and record, for every
   row, the name of the data set it was read from. */
data all;
    set k_fold_cv
        k_fold_cv_rep
        indsname=_source;
    length indsname $ 32;
    indsname = _source;
run;

/* One output row per source data set: mean optimism together with the
   lower and upper confidence limits of that mean. */
proc summary data=all nway;
    class indsname;
    var optimism;
    output out=want
        mean(optimism)=mean
        lclm(optimism)=lclm
        uclm(optimism)=uclm;
run;
Join us for SAS Innovate 2025, our biggest and most exciting global event of the year, in Orlando, FL, from May 6-9. Sign up by March 14 for just $795.
Use this tutorial as a handy guide to weigh the pros and cons of these commonly used machine learning algorithms.
Find more tutorials on the SAS Users YouTube channel.