/*%let integrated_profile_full_ds = fpm05.integrated_profile;*/ /*%let integrated_profile_ds = fpm05.integrated_profile;*/ ***************************************************************************************************; * PLS MACROS ***************************************************************************************************; *pls_train macro; *Fits a Partial Least Squares (PLS) model and creates a scoring dataset for future use with new data; *input_ds: Input dataset with training data; *output_ds: Output dataset (observation-level predictions) *out_score: Output dataset (scoring dataset); *target_var: Dependent variable(s); *predictor_vars: Independent variables; *id_var: Customer ID; %macro pls_train ( input_ds, output_ds, target_var, predictor_vars, id_var, out_score = , method = pls, nfac = , xscore_prefix = factor, yscore_prefix = yfactor, h_var = h, tsquare_var = tsquare, XCenScale_ds = tmp_XCenScale_ds, XWeights_ds = tmp_XWeights_ds, desc = ); %if %length(&desc.)=0 %then %let desc = "PLS on &input_ds."; ods graphics on / discretemax=60000 imagemap=on; ods output XVariableCenScale = &XCenScale_ds.; ods output XWeights = &XWeights_ds.; proc pls data = &input_ds. method = &method. plot = (parmprofiles vip diagnostics dmod) cv = split cvtest(seed=12345) details censcale varscale %if %length(&nfac.) %then %do; nfac=&nfac. %end; ; title "&desc"; %if %length(&id_var.) %then %do; id &id_var.; %end; model &target_var. = &predictor_vars. / solution; output out = &output_ds. predicted = %preffix_variable_list(&target_var., pred_) yresidual = %preffix_variable_list(&target_var., yres_) yscore = &yscore_prefix. xscore = &xscore_prefix. h = &h_var. tsquare = &tsquare_var. ; *xresidual = %preffix_variable_list(&predictor_vars., res_); run; ods graphics off; *Remove factors completely missing; %local missing_factor_list output_record_total; %let output_record_total = ; proc means noprint data=&output_ds.; var &xscore_prefix.: &yscore_prefix.: ; output out=_tmp_outpls_nmiss (drop=_type_) nmiss=; run; proc sql noprint; select _FREQ_ into :output_record_total from _tmp_outpls_nmiss; quit; %put pls_score: output_record_total = {&output_record_total.}; %let missing_factor_list = ; proc transpose data=_tmp_outpls_nmiss (drop=_freq_) out=_tmp_outpls_nmiss (rename=(col1=nmiss)); run; data _tmp_outpls_nmiss; set _tmp_outpls_nmiss; where (nmiss = &output_record_total.); run; proc sql noprint; select _NAME_ into :missing_factor_list separated by ' ' from _tmp_outpls_nmiss; quit; %put pls_score: missing_factor_list = {&missing_factor_list.}; %delete_vars(&output_ds., &missing_factor_list.); %delete_ds(_tmp_outpls_nmiss); *Create scoring dataset; proc transpose data=&XCenScale_ds. out=_tmp_xcenscale_tx; id variable ; run; data _tmp_xcenscale_tx; if _n_=1 then _TYPE_='MEAN'; else _TYPE_='STD'; set _tmp_xcenscale_tx; run; data _tmp_xweights; length _TYPE_ $ 5 _NAME_ $32; set &XWeights_ds.; _TYPE_ = 'SCORE'; _NAME_ = compress("&xscore_prefix."||_N_); run; data &out_score.; length _TYPE_ $ 5 _NAME_ $32; set _tmp_xcenscale_tx _tmp_xweights; run; %delete_vars(&out_score., _LABEL_); %delete_ds(_tmp_xcenscale_tx _tmp_xweights); *Cleanup datasets (if name not explicitly changed by user); %delete_ds(tmp_XCenScale_ds tmp_XWeights_ds); %mend pls_train; *pls_score_factors macro; *Generates the "x factors" for new observations; *new_data_input_ds: new data (with the same original predictors that went into pls_train); *score_input_ds: scoring dataset produced by pls_train(); *new_data_output_ds: output dataset (new data + x factors) *variable_list: list of prediction variables; %macro pls_score_factors ( new_data_input_ds, score_input_ds, new_data_output_ds, variable_list ); proc score data = &new_data_input_ds. score = &score_input_ds. out = &new_data_output_ds. ; %if %length(&variable_list.) %then %do; var &variable_list.; %end; run; %mend pls_score_factors; ***************************************************************************************************; * BASIC HELP MACROS ***************************************************************************************************; *Generates a list of variables by adding preffixes to the variable name (resi_a resi_b resi_c...); %macro preffix_variable_list (var_list, preffix_list); %local v i; %local v2 i2; %let i = 1; %let v = %scan(&var_list., &i.,%str( )); %do %while(%length(&v.)); %let i2 = 1; %let v2 = %scan(&preffix_list., &i2.,%str( )); %do %while(%length(&v2.)); &v2.&v. %let i2 = %eval (&i2. + 1); %let v2 = %scan(&preffix_list., &i2.,%str( )); %end; %let i = %eval (&i. + 1); %let v = %scan(&var_list., &i.,%str( )); %end; %mend preffix_variable_list; *Deletes datasets checking first if they do exist; %macro delete_ds (ds_list); %local v i; %let i = 1; %let v = %scan(&ds_list., &i., %str( )); %do %while(%length(&v.)); %if %sysfunc(exist(&v.)) %then %do; proc delete data=&v.; run; %end; %let i = %eval (&i. + 1); %let v = %scan(&ds_list., &i., %str( )); %end; %mend delete_ds; *Deletes multiple variables from a dataset if they do exist; %macro delete_vars (ds, vars); %local varidx var vardelete varnotexist varfound; %if %sysfunc(exist(&ds.)) %then %do; %let varidx = 1; %let var = %scan(&vars., &varidx., %str( )); %let vardelete=; %let varnotexist=; %do %while (%length(&var.)); *Confirm existence of vars; data _null_; dsid=open("&ds."); if varnum(dsid,"&var.")~=0 then call symput('varfound','Y'); else call symput('varfound','N'); run; *Create lists of vars; %if &varfound.=Y %then %let vardelete = &vardelete. &var.; %else %let varnotexist = &varnotexist. &var.; %let varidx = %eval(&varidx. + 1); %let var = %scan(&vars., &varidx., %str( )); %end; %if %length(&varnotexist.) %then %do; %put delete_vars: variables {&varnotexist.} not found in dataset &ds.; %end; %if %length(&vardelete.) %then %do; *Drops vars; %put delete_vars: removing variables {&vardelete.} from dataset &ds.; data &ds.; set &ds. (drop=&vardelete.); run; %end; %end; %else %do; %put delete_vars: dataset &ds not found; %end; %mend delete_vars; ***************************************************************************************************; * EXECUTE THE TEST ***************************************************************************************************; *generate some simulated data; *x1 -- x4 are the real factors *x1b -- x3c are highly redundant variables which will hopefully be eliminated (if PLS works as expected); *y is the dependent variable; *also split the data in two sets (training and validation) randomly; data input_data; call streaminit(38125); do ID = 1 to 2000; x1 = rand('NORMAL', 20, 25); x2 = rand('NORMAL', -20, 30); x3 = rand('NORMAL', 100, 20); x4 = (5*x1 + x2)/4 + rand('NORMAL', 0, 20); x1b = x1 + rand('NORMAL',0, 0.1); x2b = x2 + rand('NORMAL',0, 0.1); x3b = x3 + rand('NORMAL',0, 0.1); x1c = x1 + rand('NORMAL',100, 2); x2c = x2 + rand('NORMAL',100, 2); x3c = x3 + rand('NORMAL',100, 2); y = 100 + 10*x1 + 5*x2 - 3*x3 + 0.1*x4 + + rand('NORMAL',0,25); if rand('UNIFORM')<0.70 then group = 'TRAIN'; else group = 'VALID'; output; end; run; proc sort data=input_data; by group ID; run; title 'Check input data'; proc corr data=input_data noprob; by group; var x: y: ; run; *RUN PLS MODEL (WITH TRAINING DATA ONLY); %pls_train ( input_data (where=(group in ('TRAIN'))), output_pls, y, x: , ID, method = RRR, out_score = scoring_data, XCenScale_ds = XCenScale, XWeights_ds = XWeights, desc = PLS model for Y ); *SCORE EVERYTHING (TRAINING AND VALIDATION); %pls_score_factors ( input_data, scoring_data, output_scoring, x: ); *COMPARE THE SCORED FACTORS WITH THE ORIGINAL PLS FACTORS (FOR TRAINING DATA ONLY... JUST TO CHECK); proc means data=output_pls n min p1 q1 median mean q3 p99 max; title 'output_pls'; var factor: ; where (group in ('TRAIN')); run; proc means data=output_scoring n min p1 q1 median mean q3 p99 max; title 'output_scoring'; var factor: ; where (group in ('TRAIN')); run; proc reg data=output_pls; title 'output_pls'; model y = factor: / vif; where (group in ('TRAIN')); run; proc reg data=output_scoring; title 'output_scoring'; model y = factor: / vif; where (group in ('TRAIN')); run; *CALCULATE DIFFERENCES (FOR TRAINING DATA ONLY... JUST TO CHECK); proc sort data=output_pls; by ID; run; proc sort data=output_scoring; by ID; run; data output_combined; merge output_pls (in=in1 keep=ID factor:) output_scoring (in=in2 keep = ID factor1 rename = (factor1=factor_scr1) ); by ID; if (in1 and in2); factor1_diff = factor_scr1 - factor1; /*factor2_diff = factor_scr2 - factor2;*/ /*factor3_diff = factor_scr3 - factor3;*/ factor1_prop = factor_scr1 / factor1; /*factor2_prop = factor_scr2 / factor2;*/ /*factor3_prop = factor_scr3 / factor3;*/ run; proc means data=output_combined n min p1 q1 median mean q3 p99 max; title 'output_combined'; var factor: ; run;