/* Sampling and Management Manual (Korea Institute for Health and Social Affairs) */
/* MODEL VALIDATION TECHNIQUES
   http://www.listendata.com/2015/01/model-validation-in-logistic-regression.html
   Don't Be Loopy: Re-Sampling and Simulation the SAS® Way
   http://www2.sas.com/proceedings/forum2007/183-2007.pdf
   Lab Eight: implementing the bootstrap and 10-fold CV in SAS
   https://web.stanford.edu/~kcobb/hrp261/lab8.doc */
/* Example data: https://web.stanford.edu/~kcobb/hrp261/kyphosis.sas7bdat */
/* Library that holds the example data set (kyphosis.sas7bdat). */
LIBNAME lab8 'C:\work\SAS\기타';

/* Number of bootstrap replicates to draw. */
%LET nsamples=20;

/* Work copy sorted by the response, as required by the STRATA statement below. */
PROC SORT DATA=lab8.kyphosis OUT=kyphosis;
    BY Kyphosis;
RUN;

/*
 * Stratified bootstrap: &nsamples replicates written to WORK.BOOT.
 *
 *   METHOD=URS  sampling with replacement (SRS would be without replacement).
 *   STRATA      resampling is done inside each Kyphosis level so that every
 *               replicate keeps the event rate of the source data.
 *   SAMPRATE=1  draw 100% of each stratum, i.e. as many hits as the stratum
 *               has rows. The previous N=83 is a PER-STRATUM size when a
 *               STRATA statement is present, so it drew 83 hits from each
 *               level and produced replicates with a 50/50 class mix.
 *   OUTHITS     intentionally not specified: BOOT holds one row per distinct
 *               selected unit and the number of times it was drawn is stored
 *               in NumberHits (used later as a FREQ variable).
 *
 * ODS output is switched off around the step to suppress the printed summary.
 */
ods select none;
proc surveyselect data=kyphosis method=urs samprate=1 rep=&nsamples. out=boot seed=1234;
    strata Kyphosis;
run;
ods select all;
/* Event rate (share of rows with Kyphosis=1) in the source data. */
PROC SQL;
    SELECT SUM(CASE WHEN Kyphosis=1 THEN 1 ELSE 0 END) / COUNT(*) AS RATIO
    FROM lab8.kyphosis;
QUIT;

/*
 * Event rate per bootstrap replicate, for comparison with the value above.
 * BOOT carries one row per distinct selected unit together with NumberHits
 * (the number of times that unit was drawn), so the rate has to be weighted
 * by NumberHits. An unweighted COUNT(*) would describe the distinct units
 * only, not the bootstrap sample that the models are fitted on.
 */
PROC SQL;
    SELECT Replicate,
           SUM(CASE WHEN Kyphosis=1 THEN NumberHits ELSE 0 END) / SUM(NumberHits) AS RATIO
    FROM boot
    GROUP BY Replicate;
QUIT;
/* Put the bootstrap rows in replicate order so BY-group processing is valid. */
PROC SORT DATA=boot;
    BY Replicate;
RUN;

/*
 * One logistic regression per replicate, modelling the probability of
 * Kyphosis = 1 from age, number and start.
 *   FREQ      each row counts NumberHits times.
 *   OUTEST=   parameter estimates go to EST.
 *   OUTMODEL= the fitted-model information goes to LOGIT_MODEL so it can be
 *             reused for scoring through INMODEL=.
 *   NOPRINT   no printed output.
 */
PROC LOGISTIC DATA=boot
              NOPRINT
              OUTEST=est
              OUTMODEL=LOGIT_MODEL;
    BY replicate;
    FREQ numberhits;
    MODEL kyphosis (event="1")= age number start;
RUN;
/* Compute scores for the test data -- variant 1: PROC SCORE. */
/*
 * Applies the coefficients stored in EST (TYPE=PARMS) to KYPHOSIS, by
 * replicate. The observed response is renamed to Kyphosis_1 on input; the
 * DATA step below then reads the computed score from a variable named
 * Kyphosis.
 * NOTE(review): KYPHOSIS is assumed not to contain a REPLICATE variable, in
 * which case the whole data set is scored once per BY group of EST -- confirm.
 */
PROC SCORE DATA=kyphosis(RENAME=(Kyphosis=Kyphosis_1))
           SCORE=est
           TYPE=parms
           OUT=SCORE_OUT;
    BY replicate;
    VAR age number start;
RUN;

/* Convert the linear score to a probability with the inverse logit. */
DATA RESULT_1;
    SET SCORE_OUT;
    prob=exp(Kyphosis)/(1+exp(Kyphosis));
RUN;
/* Compute scores for the test data -- variant 2: PROC LOGISTIC with INMODEL=. */
/*
 * Reuses the models saved in LOGIT_MODEL and scores KYPHOSIS per replicate;
 * the scored rows are written to RESULT_2. The observed response is renamed
 * to Kyphosis_1 on input.
 */
PROC LOGISTIC INMODEL=LOGIT_MODEL;
    BY replicate;
    SCORE DATA=kyphosis(RENAME=(Kyphosis=Kyphosis_1))
          OUT=RESULT_2;
RUN;
/*
 * Average, for every test-set ID, the probabilities in RESULT_1.
 * The output column keeps its existing name AVG_SOCRE so that anything
 * reading RESULT_PREV continues to work.
 */
PROC SQL;
    CREATE TABLE RESULT_PREV AS
        SELECT   ID,
                 MEAN(PROB) AS AVG_SOCRE
        FROM     RESULT_1
        GROUP BY ID;
QUIT;
/*********************************************/
/* CROSS-VALIDATION                          */
/*********************************************/
/*
 * Repeated random hold-out validation: K replicates, each training on a
 * (K-1)/K share of the rows and predicting the remaining 1/K.
 * Note: the hold-out sets are drawn independently per replicate, so they are
 * not a disjoint partition as in strict K-fold cross-validation.
 */
%let K=10;
%let rate=%sysevalf((&K-1)/&K);
%PUT &RATE;

/*
 * Generate the cross-validation samples.
 *   METHOD=SRS  sampling WITHOUT replacement, so exactly the requested share
 *               of rows is used for training and the rest is held out.
 *               (URS, used previously, samples with replacement: training
 *               rows get duplicated and the held-out share is no longer 1/K.)
 *   OUTALL      keep every row and flag the training rows with Selected=1.
 */
proc surveyselect data=kyphosis out=xv seed=495857
                  samprate=&RATE outall rep=&K method=srs;
run;

/* Copy the response for training rows only; held-out rows keep a missing
   new_Kyphosis and are therefore excluded from the fit. */
data xv;
    set xv;
    if selected then new_Kyphosis=Kyphosis;
run;

/*
 * Fit per replicate and keep the predicted probability (yhat) for the rows
 * whose new_Kyphosis is missing, i.e. the held-out rows.
 * No FREQ statement is needed because SRS selects each row at most once.
 * EST and LOGIT_MODEL reuse the data set names written by the bootstrap
 * section earlier in this file.
 */
proc logistic data=xv outest=est outmodel=LOGIT_MODEL noprint;
    by replicate;
    model new_Kyphosis (event="1")= age number start;
    output out=out1(where=(new_Kyphosis=.)) p=yhat;
run;
/*********************************************/
/* LOOCV (leave-one-out cross-validation)    */
/*********************************************/
/*
 * Build one replicate per row of KYPHOSIS. Replicate i contains every row,
 * with new_Kyphosis set to missing for row i (the held-out row) and copied
 * from Kyphosis for all other rows.
 * Rows are read by direct access (POINT=), hence the explicit STOP: without
 * it the DATA step would never reach end-of-file and would loop forever.
 */
data xv;
    do replicate = 1 to numrecs;
        do rec = 1 to numrecs;
            set kyphosis nobs=numrecs point=rec;
            if replicate ^= rec then new_Kyphosis=Kyphosis; else new_Kyphosis=.;
            output;
        end;
    end;
    stop;
run;

/*
 * Fit per replicate and keep the predicted probability (yhat) of the single
 * row whose new_Kyphosis is missing. OUTEST= is now specified once; it was
 * previously repeated on the PROC statement.
 */
proc logistic data=xv outest=est outmodel=LOGIT_MODEL noprint;
    by replicate;
    model new_Kyphosis (event="1")= age number start;
    output out=out1(where=(new_Kyphosis=.)) p=yhat;
run;
/*********************************************/
/* Reference                                 */
/*********************************************/
/*
 * Two equivalent ways of stacking identical copies of KYPHOSIS, each copy
 * tagged with a replicate number. Both steps share the same copy count so
 * they really produce the same number of replicates (previously one used
 * rep=10 and the other &nsamples).
 */
%let ntest=10;

PROC SORT DATA=kyphosis;
    BY ID;
RUN;

/*
 * Variant 1: PROC SURVEYSELECT. Sampling without replacement (SRS) at a
 * rate of 100% selects every row in every replicate; SAMPRATE=1 replaces the
 * hard-coded n=83 so the step does not depend on the size of the data set.
 */
proc surveyselect data=kyphosis method=SRS samprate=1 rep=&ntest out=TEST_DATA SEED=1234 NOPRINT;
run;

/*
 * Variant 2: DATA step. Reads every row by direct access (POINT=) once per
 * replicate; STOP is required because POINT= never signals end-of-file.
 */
data TEST_DATA_1;
    do replicate = 1 to &ntest;
        do rec = 1 to numrecs;
            set kyphosis nobs=numrecs point=rec;
            output;
        end;
    end;
    stop;
run;
/* Website footer captured with this code (not part of the program):
   Registration is now open for SAS Innovate 2025, our biggest and most exciting global event of the year! Join us in Orlando, FL, May 6-9.
   Sign up by Dec. 31 to get the 2024 rate of just $495.
   Register now! */