/*
Hi,
I am using PROC PLS to identify orthogonal factors from a set of predictors that has a lot of redundancy (and very high multicollinearity.) PLS works great. The factors and the regression models with 
those factors are fine.

I also have additional observations that were not used when generating the PLS model (e.g. a validation set, new data, etc.) These observations need to be "scored" using the model. Unfortunately, PLS does 
not produce a "scoring output dataset" but I have a macro that will generate one using the information in these two ODS tables: XVariableCenScale and XWeights.

When I use that "scoring output dataset" with PROC SCORE, I get correct predictions for the first factor, some differences in the second factor, bigger differences in the third factor, etc. The scored factors become worse and worse.

Perhaps the assumptions of this macro are not true. I beleive that I am using the correct ODS tables because the first factor is correct. Buy why not the other ones? I know PLS builds the factors in an iterative fashion. Maybe there is no way to derive those factors with a simple matrix multiplication (which is what proc score does.)  Maybe the scoring has to be implemented in an iterative 
fashion too (e.g. start with scaling/centering the data, then get the first factor... then operate on the remaining data, then get the second factor, etc.)

Does anybody has pointers to get this to work right for all factors? or does anybody know an existing procedure that I can use? (I dont want to reinvent the wheel if something is already out there...)

I am attaching the code of the macro and an example with simulated data for anybody that wants to reproduce this issue.

I would appreciate any contributions!

Thank you,
Carlos

*/




***************************************************************************************************;
* PLS MACROS
***************************************************************************************************;

*pls_train macro;
*Fits a Partial Least Squares (PLS) model and creates a scoring dataset for future use with new data;
*input_ds:			Input dataset with training data;
*output_ds: 		Output dataset (observation-level predictions)
*out_score:         Output dataset (scoring dataset);
*target_var:		Dependent variable(s);
*predictor_vars:	Independent variables;
*id_var:            Customer ID;
%macro pls_train (
			input_ds, 
			output_ds,
			target_var,
			predictor_vars,
			id_var,
			out_score      = ,
			nfac           = ,
			xscore_prefix  = factor,
			yscore_prefix  = yfactor,
			h_var          = h,
			tsquare_var    = tsquare,
			XCenScale_ds   = tmp_XCenScale_ds,
			XWeights_ds    = tmp_XWeights_ds,
			desc           =
			);

	%if %length(&desc.)=0 %then %let desc = "PLS on &input_ds.";

	ods graphics on / discretemax=60000 imagemap=on;
	ods output XVariableCenScale = &XCenScale_ds.; 
	ods output XWeights          = &XWeights_ds.; 
	proc pls 
			data    = &input_ds.
			method  = pls
			plot    = (parmprofiles vip diagnostics dmod)  
			cv      = split cvtest(seed=12345)
			details censcale varscale
			%if %length(&nfac.) %then %do; nfac=&nfac. %end;
			;
		title "&desc";
		%if %length(&id_var.) %then %do; id &id_var.;  %end;
		model &target_var. = &predictor_vars. / solution;
		output	out       = &output_ds.
				predicted = %preffix_variable_list(&target_var., pred_)
				yresidual = %preffix_variable_list(&target_var., yres_)
				yscore    = &yscore_prefix. 
				xscore    = &xscore_prefix. 
				h         = &h_var.
				tsquare   = &tsquare_var.
				;
				*xresidual = %preffix_variable_list(&predictor_vars., res_);
	run;
	ods graphics off;

	*Remove factors completely missing;
	%local missing_factor_list output_record_total;

	%let output_record_total = ;
	proc means noprint data=&output_ds.; var &xscore_prefix.: &yscore_prefix.: ; output out=_tmp_outpls_nmiss (drop=_type_) nmiss=; run;
	proc sql noprint; select _FREQ_ into :output_record_total from _tmp_outpls_nmiss; quit; 
	%put pls_score: output_record_total = {&output_record_total.};

	%let missing_factor_list = ;
	proc transpose data=_tmp_outpls_nmiss (drop=_freq_) out=_tmp_outpls_nmiss (rename=(col1=nmiss)); run;
	data _tmp_outpls_nmiss; set _tmp_outpls_nmiss; where (nmiss = &output_record_total.); run;
	proc sql noprint; select _NAME_ into :missing_factor_list separated by ' ' from _tmp_outpls_nmiss; quit;
	%put pls_score: missing_factor_list = {&missing_factor_list.};

	%delete_vars(&output_ds., &missing_factor_list.);
	%delete_ds(_tmp_outpls_nmiss);

	*Create scoring dataset;
	proc transpose data=&XCenScale_ds. out=_tmp_xcenscale_tx; id variable ; run;
	data _tmp_xcenscale_tx; if _n_=1 then _TYPE_='MEAN'; else _TYPE_='STD'; set _tmp_xcenscale_tx; run;
	data _tmp_xweights; length _TYPE_ $ 5 _NAME_ $32; set &XWeights_ds.; _TYPE_ = 'SCORE'; _NAME_ = compress("&xscore_prefix."||_N_); run;
	data &out_score.; length _TYPE_ $ 5 _NAME_ $32; set _tmp_xcenscale_tx _tmp_xweights; run;
	%delete_vars(&out_score., _LABEL_);
	%delete_ds(_tmp_xcenscale_tx _tmp_xweights);

	*Cleanup datasets (if name not explicitly changed by user);
	%delete_ds(tmp_XCenScale_ds tmp_XWeights_ds);

%mend pls_train;

*pls_score_factors macro; 
*Generates the "x factors" for new observations;
*new_data_input_ds:     new data (with the same original predictors that went into pls_train);
*score_input_ds:        scoring dataset produced by pls_train(); 
*new_data_output_ds:    output dataset (new data + x factors)
*variable_list:         list of prediction variables;  
%macro pls_score_factors (
			new_data_input_ds, 
			score_input_ds, 
			new_data_output_ds,
			variable_list
			);

	proc score 
			data  = &new_data_input_ds. 
			score = &score_input_ds. 
			out   = &new_data_output_ds.
			;
		%if %length(&variable_list.) %then %do; var &variable_list.;  %end;
	run;

%mend pls_score_factors;


***************************************************************************************************;
* BASIC HELP MACROS
***************************************************************************************************;

*Generates a list of variables by adding preffixes to the variable name (resi_a resi_b resi_c...);
%macro preffix_variable_list (var_list, preffix_list);
	%local v i;
	%local v2 i2;

	%let i = 1;
	%let v = %scan(&var_list., &i.,%str( ));
	%do %while(%length(&v.));

		%let i2 = 1;
		%let v2 = %scan(&preffix_list., &i2.,%str( ));
		%do %while(%length(&v2.));

			&v2.&v.

			%let i2 = %eval (&i2. + 1);
			%let v2 = %scan(&preffix_list., &i2.,%str( ));
		%end;

		%let i = %eval (&i. + 1);
		%let v = %scan(&var_list., &i.,%str( ));
	%end;
%mend preffix_variable_list;

*Deletes datasets checking first if they do exist;
%macro delete_ds (ds_list);
	%local v i;
	%let i = 1;
	%let v = %scan(&ds_list., &i., %str( ));
	%do %while(%length(&v.));

		%if %sysfunc(exist(&v.)) %then %do;
			proc delete data=&v.; run;
		%end; 

		%let i = %eval (&i. + 1);
		%let v = %scan(&ds_list., &i., %str( ));
	%end;
%mend delete_ds;

*Deletes multiple variables from a dataset if they do exist;
%macro delete_vars (ds, vars);
	%local varidx var vardelete varnotexist varfound;
	%if %sysfunc(exist(&ds.)) %then %do;

		%let varidx = 1;
		%let var = %scan(&vars., &varidx., %str( ));
		%let vardelete=;
		%let varnotexist=;
		%do %while (%length(&var.));

			*Confirm existence of vars;
			data _null_;
				dsid=open("&ds.");
				if varnum(dsid,"&var.")~=0 then 
					call symput('varfound','Y');
				else
					call symput('varfound','N');
			run;

			*Create lists of vars;
			%if &varfound.=Y %then 
				%let vardelete = &vardelete. &var.;
			%else 			
				%let varnotexist = &varnotexist. &var.;

			%let varidx = %eval(&varidx. + 1);
			%let var = %scan(&vars., &varidx., %str( ));
		%end;

		%if %length(&varnotexist.) %then %do;
			%put delete_vars: variables {&varnotexist.} not found in dataset &ds.;
		%end;

		%if %length(&vardelete.) %then %do;
			*Drops vars;
			%put delete_vars: removing variables {&vardelete.} from dataset &ds.;
			data &ds.; 
				set &ds. (drop=&vardelete.); 
			run;
		%end;

	%end; %else %do;
		%put delete_vars: dataset &ds not found;
	%end;
%mend delete_vars;


***************************************************************************************************;
* EXECUTE THE TEST
***************************************************************************************************;

*generate some simulated data;
*x1 -- x4 are the real factors 
*x1b -- x3c are highly redundant variables which will hopefully be eliminated (if PLS works as expected);
*y is the dependent variable;
*also split the data in two sets (training and validation) randomly;
data input_data;
	call streaminit(38125);
	do ID = 1 to 2000;
		x1 = rand('NORMAL', 20,  25);
		x2 = rand('NORMAL', -20, 30);
		x3 = rand('NORMAL', 100, 20);
		x4 = (5*x1 + x2)/4 + rand('NORMAL', 0, 20);
		x1b = x1 + rand('NORMAL',0, 0.1);
		x2b = x2 + rand('NORMAL',0, 0.1);
		x3b = x3 + rand('NORMAL',0, 0.1);
		x1c = x1 + rand('NORMAL',100, 2);
		x2c = x2 + rand('NORMAL',100, 2);
		x3c = x3 + rand('NORMAL',100, 2);
		y = 100 + 10*x1 + 5*x2 - 3*x3 + 0.1*x4 +  + rand('NORMAL',0,25);
		if rand('UNIFORM')<0.70 then group = 'TRAIN'; else group = 'VALID';
		output;
	end;
run;
proc sort data=input_data; by group ID; run;

title 'Check input data';
proc corr data=input_data noprob; by group; var x: y: ; run;

*RUN PLS MODEL (WITH TRAINING DATA ONLY);
%pls_train (
	input_data (where=(group in ('TRAIN'))),
	output_pls,
	y,
	x: ,
	ID,
	out_score     = scoring_data,
	XCenScale_ds  = XCenScale,
	XWeights_ds   = XWeights,
	desc          = PLS model for Y
	);

*SCORE EVERYTHING (TRAINING AND VALIDATION);
%pls_score_factors (
	input_data, 
	scoring_data,
	output_scoring,
	x: 
	);

*COMPARE THE SCORED FACTORS WITH THE ORIGINAL PLS FACTORS (FOR TRAINING DATA ONLY... JUST TO CHECK);
proc means data=output_pls     n min p1 q1 median mean q3 p99 max; title 'output_pls';     var factor: ; where (group in ('TRAIN')); run;
proc means data=output_scoring n min p1 q1 median mean q3 p99 max; title 'output_scoring'; var factor: ; where (group in ('TRAIN')); run;

proc reg data=output_pls;     title 'output_pls';     model y = factor: / vif; where (group in ('TRAIN')); run;
proc reg data=output_scoring; title 'output_scoring'; model y = factor: / vif; where (group in ('TRAIN')); run;

*CALCULATE DIFFERENCES (FOR TRAINING DATA ONLY... JUST TO CHECK);
proc sort data=output_pls;     by ID; run;
proc sort data=output_scoring; by ID; run;
data output_combined;
	merge	output_pls     (in=in1 keep=ID factor1--factor3)
			output_scoring (in=in2 keep=ID factor1--factor3 rename=(factor1=factor_scr1 factor2=factor_scr2 factor3=factor_scr3));
	by ID;
	if (in1 and in2);

	factor1_diff = factor_scr1 - factor1;
	factor2_diff = factor_scr2 - factor2;
	factor3_diff = factor_scr3 - factor3;

	factor1_prop = factor_scr1 / factor1;
	factor2_prop = factor_scr2 / factor2;
	factor3_prop = factor_scr3 / factor3;
run;

proc means data=output_combined n min p1 q1 median mean q3 p99 max; title 'output_combined';  var factor: ; run;

*NOTE: LOOK AT THE DIFFERENCES (TEHY GET LARGER FOR FACTOR2 and FACTOR3);