BookmarkSubscribeRSS Feed
🔒 This topic is solved and locked. Need further help from the community? Please sign in and ask a new question.
Phil_NZ
Barite | Level 11

Hi all, 

 

Today, when I winsorize, I notice that the missing variables all gone. I do not know what happened behind

My code is (I winsorize variables from dataset "concatenate" to have the output "winsorize") following method of @mkeintz :

proc univariate data=concatenate ;
var ACC_PAY ACC_PAY_TUR ACC_REC_DAY BOO_VAL_PER_SHA CAP_INT CAS_DIV_TOT CCC 
			CF_OPE_ACT COGS DELTACF DEP DIVIDEND DPO EARNINGS
			FCF FCF_PER_SHA FIRM_SIZE FIX_ASS_GROSS FIX_ASS_NET 
			FIX_ASS_TUR GOODWILL GROSS_INC GRO_MAR GRO_PRO_MAR INVESTMENT INV_CYC 
			INV_DAY INV_GRO_RAT INV_TOT INV_TUR LAGCF_OPE_ACT LAGS18 LAGS22 LAGS29 LAGS3 LAGS43	LAGS7
			LEVERAGE LOG_MVE MAR_CAP MAR_TO_BOO MTBV NET_CF_OPE_ACT NET_INC NET_PRO_MAR NET_SAL
			NI_BF_EXT_ITE OPE_INC OPE_LEA_EXP P PRI_VOLA PROFIT REC_TUR RETURN ROA ROE R_AND_D R_AND_D_OVER_SAL SAL_GRO
			SEL_GEN_ADM	TANGIBILITY TOT_ASS TOT_ASS_TUR TOT_DEB TOT_LIA 
			/*new var:*/LAG_COGS  LAG_GOODWILL  LAG_OPE_LEA  LAG_R_AND_D 
			LAG_SAL_GRO  LAG_SEL_GEN

   			;
ods output Quantiles=outlier Moments=num_of_obs;
*ods trace off; /* stop writing to log */
ods select all;

%let L=1;    %* 1th percentile *;

%let H=%eval(100 - &L);   %* 99th percentile*;


proc univariate data=concatenate noprint;

   var ACC_PAY ACC_PAY_TUR ACC_REC_DAY BOO_VAL_PER_SHA CAP_INT CAS_DIV_TOT CCC 
			CF_OPE_ACT COGS DELTACF DEP DIVIDEND DPO EARNINGS
			FCF FCF_PER_SHA FIRM_SIZE FIX_ASS_GROSS FIX_ASS_NET 
			FIX_ASS_TUR GOODWILL GROSS_INC GRO_MAR GRO_PRO_MAR INVESTMENT INV_CYC 
			INV_DAY INV_GRO_RAT INV_TOT INV_TUR LAGCF_OPE_ACT LAGS18 LAGS22 LAGS29 LAGS3 LAGS43	LAGS7
			LEVERAGE LOG_MVE MAR_CAP MAR_TO_BOO MTBV NET_CF_OPE_ACT NET_INC NET_PRO_MAR NET_SAL
			NI_BF_EXT_ITE OPE_INC OPE_LEA_EXP P PRI_VOLA PROFIT REC_TUR RETURN ROA ROE R_AND_D R_AND_D_OVER_SAL SAL_GRO
			SEL_GEN_ADM	TANGIBILITY TOT_ASS TOT_ASS_TUR TOT_DEB TOT_LIA LAG_COGS  LAG_GOODWILL  LAG_OPE_LEA  LAG_R_AND_D 
			LAG_SAL_GRO  LAG_SEL_GEN;

   output out=_winsor   pctlpts=&L  &H     pctlpre= __ACC_PAY __ACC_PAY_TUR __ACC_REC_DAY 
			__BOO_VAL_PER_SHA __CAP_INT __CAS_DIV_TOT __CCC __CF_OPE_ACT __COGS __DELTACF 
			__DEP __DIVIDEND __DPO __EARNINGS __FCF __FCF_PER_SHA __FIRM_SIZE __FIX_ASS_GROSS 
			__FIX_ASS_NET __FIX_ASS_TUR __GOODWILL __GROSS_INC __GRO_MAR __GRO_PRO_MAR 
			__INVESTMENT __INV_CYC __INV_DAY __INV_GRO_RAT __INV_TOT __INV_TUR __LAGCF_OPE_ACT 
			__LAGS18 __LAGS22 __LAGS29 __LAGS3 __LAGS43	__LAGS7 __LEVERAGE __LOG_MVE __MAR_CAP 
			__MAR_TO_BOO __MTBV __NET_CF_OPE_ACT __NET_INC __NET_PRO_MAR __NET_SAL
			__NI_BF_EXT_ITE __OPE_INC __OPE_LEA_EXP __P __PRI_VOLA __PROFIT __REC_TUR __RETURN 
			__ROA __ROE __R_AND_D __R_AND_D_OVER_SAL __SAL_GRO
			__SEL_GEN_ADM	__TANGIBILITY __TOT_ASS __TOT_ASS_TUR __TOT_DEB __TOT_LIA 
			__LAG_COGS  __LAG_GOODWILL  __LAG_OPE_LEA  __LAG_R_AND_D 
			__LAG_SAL_GRO  __LAG_SEL_GEN;

run;

data winsorize (drop=__:);

  set concatenate;

  if _n_=1 then set _winsor;

  array wlo  {*}  __ACC_PAY&L __ACC_PAY_TUR&L __ACC_REC_DAY&L 
			__BOO_VAL_PER_SHA&L __CAP_INT&L __CAS_DIV_TOT&L __CCC&L __CF_OPE_ACT&L __COGS&L __DELTACF&L 
			__DEP&L __DIVIDEND&L __DPO&L __EARNINGS&L __FCF&L __FCF_PER_SHA&L __FIRM_SIZE&L __FIX_ASS_GROSS&L 
			__FIX_ASS_NET&L __FIX_ASS_TUR&L __GOODWILL&L __GROSS_INC&L __GRO_MAR&L __GRO_PRO_MAR&L 
			__INVESTMENT&L __INV_CYC&L __INV_DAY&L __INV_GRO_RAT&L __INV_TOT&L __INV_TUR&L __LAGCF_OPE_ACT&L 
			__LAGS18&L __LAGS22&L __LAGS29&L __LAGS3&L __LAGS43&L	__LAGS7&L __LEVERAGE&L __LOG_MVE&L __MAR_CAP&L 
			__MAR_TO_BOO&L __MTBV&L __NET_CF_OPE_ACT&L __NET_INC&L __NET_PRO_MAR&L __NET_SAL&L
			__NI_BF_EXT_ITE&L __OPE_INC&L __OPE_LEA_EXP&L __P&L __PRI_VOLA&L __PROFIT&L __REC_TUR&L __RETURN&L 
			__ROA&L __ROE&L __R_AND_D&L __R_AND_D_OVER_SAL&L __SAL_GRO&L
			__SEL_GEN_ADM&L	__TANGIBILITY&L __TOT_ASS&L __TOT_ASS_TUR&L __TOT_DEB&L __TOT_LIA&L 
			__LAG_COGS&L  __LAG_GOODWILL&L  __LAG_OPE_LEA&L  __LAG_R_AND_D&L 
			__LAG_SAL_GRO&L  __LAG_SEL_GEN&L ;

  array whi  {*} __ACC_PAY&H __ACC_PAY_TUR&H __ACC_REC_DAY&H 
			__BOO_VAL_PER_SHA&H __CAP_INT&H __CAS_DIV_TOT&H __CCC&H __CF_OPE_ACT&H __COGS&H __DELTACF&H 
			__DEP&H __DIVIDEND&H __DPO&H __EARNINGS&H __FCF&H __FCF_PER_SHA&H __FIRM_SIZE&H __FIX_ASS_GROSS&H 
			__FIX_ASS_NET&H __FIX_ASS_TUR&H __GOODWILL&H __GROSS_INC&H __GRO_MAR&H __GRO_PRO_MAR&H 
			__INVESTMENT&H __INV_CYC&H __INV_DAY&H __INV_GRO_RAT&H __INV_TOT&H __INV_TUR&H __LAGCF_OPE_ACT&H 
			__LAGS18&H __LAGS22&H __LAGS29&H __LAGS3&H __LAGS43&H	__LAGS7&H __LEVERAGE&H __LOG_MVE&H __MAR_CAP&H 
			__MAR_TO_BOO&H __MTBV&H __NET_CF_OPE_ACT&H __NET_INC&H __NET_PRO_MAR&H __NET_SAL&H
			__NI_BF_EXT_ITE&H __OPE_INC&H __OPE_LEA_EXP&H __P&H __PRI_VOLA&H __PROFIT&H __REC_TUR&H __RETURN&H 
			__ROA&H __ROE&H __R_AND_D&H __R_AND_D_OVER_SAL&H __SAL_GRO&H
			__SEL_GEN_ADM&H	__TANGIBILITY&H __TOT_ASS&H __TOT_ASS_TUR&H __TOT_DEB&H __TOT_LIA&H  
			__LAG_COGS&H  __LAG_GOODWILL&H  __LAG_OPE_LEA&H  __LAG_R_AND_D&H 
			__LAG_SAL_GRO&H  __LAG_SEL_GEN&H ;

  array wval {*} wACC_PAY wACC_PAY_TUR wACC_REC_DAY wBOO_VAL_PER_SHA wCAP_INT wCAS_DIV_TOT wCCC 
			wCF_OPE_ACT wCOGS wDELTACF wDEP wDIVIDEND wDPO wEARNINGS
			wFCF wFCF_PER_SHA wFIRM_SIZE wFIX_ASS_GROSS wFIX_ASS_NET 
			wFIX_ASS_TUR wGOODWILL wGROSS_INC wGRO_MAR wGRO_PRO_MAR wINVESTMENT wINV_CYC 
			wINV_DAY wINV_GRO_RAT wINV_TOT wINV_TUR wLAGCF_OPE_ACT wLAGS18 wLAGS22 wLAGS29 wLAGS3 wLAGS43 wLAGS7
			wLEVERAGE wLOG_MVE wMAR_CAP wMAR_TO_BOO wMTBV wNET_CF_OPE_ACT wNET_INC wNET_PRO_MAR wNET_SAL
			wNI_BF_EXT_ITE wOPE_INC wOPE_LEA_EXP wP wPRI_VOLA wPROFIT wREC_TUR wRETURN wROA wROE wR_AND_D wR_AND_D_OVER_SAL wSAL_GRO
			wSEL_GEN_ADM wTANGIBILITY wTOT_ASS wTOT_ASS_TUR wTOT_DEB wTOT_LIA wLAG_COGS  wLAG_GOODWILL  wLAG_OPE_LEA  wLAG_R_AND_D 
			wLAG_SAL_GRO  wLAG_SEL_GEN ;

  array val   {*} ACC_PAY ACC_PAY_TUR ACC_REC_DAY BOO_VAL_PER_SHA CAP_INT CAS_DIV_TOT CCC 
			CF_OPE_ACT COGS DELTACF DEP DIVIDEND DPO EARNINGS
			FCF FCF_PER_SHA FIRM_SIZE FIX_ASS_GROSS FIX_ASS_NET 
			FIX_ASS_TUR GOODWILL GROSS_INC GRO_MAR GRO_PRO_MAR INVESTMENT INV_CYC 
			INV_DAY INV_GRO_RAT INV_TOT INV_TUR LAGCF_OPE_ACT LAGS18 LAGS22 LAGS29 LAGS3 LAGS43	LAGS7
			LEVERAGE LOG_MVE MAR_CAP MAR_TO_BOO MTBV NET_CF_OPE_ACT NET_INC NET_PRO_MAR NET_SAL
			NI_BF_EXT_ITE OPE_INC OPE_LEA_EXP P PRI_VOLA PROFIT REC_TUR RETURN ROA ROE R_AND_D R_AND_D_OVER_SAL SAL_GRO
			SEL_GEN_ADM	TANGIBILITY TOT_ASS TOT_ASS_TUR TOT_DEB TOT_LIA LAG_COGS  LAG_GOODWILL  LAG_OPE_LEA  LAG_R_AND_D 
			LAG_SAL_GRO  LAG_SEL_GEN;

  do _V=1 to dim(val);

     wval{_V}=min(max(val{_V},wlo{_V}),whi{_V});

  end;

run;

And the quick summary for dataset concatenate is :

My97_0-1613512683137.png

And the summary for dataset "winsorize" is:

My97_1-1613512764505.png

 

As you can see, Nmiss in the summary of winsorized variables equals to zero, it is really strange

While @Reeza  has mentioned:

 

MIN/MAX() functions ignore missing so you should test your assumption. Make up some sample data and see what happens when you have missing values.

 

Please let me know if I need to clarify my question more.

 

Many thanks and warmest regards.

Thank you for your help, have a fabulous and productive day! I am a novice today, but someday when I accumulate enough knowledge, I can help others in my capacity.
1 ACCEPTED SOLUTION

Accepted Solutions
ChrisNZ
Tourmaline | Level 20

Like this?

WVAL{_V} = ifn(VAL{_V}=., ., min( max(VAL{_V},WLO{_V}), WHI{_V}) );

 

View solution in original post

7 REPLIES 7
ChrisNZ
Tourmaline | Level 20

@Reeza gave you the answer:  your line of code purposefully removes missing values. Why then are your surprised missing values are removed?

Phil_NZ
Barite | Level 11

Hi @ChrisNZ .

 

Is there any way to keep the missing observation?

 

Thank you!

Thank you for your help, have a fabulous and productive day! I am a novice today, but someday when I accumulate enough knowledge, I can help others in my capacity.
ChrisNZ
Tourmaline | Level 20

What set of values would return missing? When anyone of them is missing?

Phil_NZ
Barite | Level 11

Hi @ChrisNZ 

Thank you for your prompt reply.

 

I mean, I want to filter only among the non-missing observation, so as the initial dataset summary

My97_0-1613512683137.png

 

For example, I want the result of winsorized acc_pay (wACC_PAY)should have N=392726 and Nmiss= 24352 rather than N=417078 as running the code above 

My97_1-1613512764505.png

 

The idea of the code above generated by @mkeintz also fills the lowest value observation (missing observation) by 1st percentile value from my understanding while I do not want to fill the value on these missing observations.

Warmest regards.

Thank you for your help, have a fabulous and productive day! I am a novice today, but someday when I accumulate enough knowledge, I can help others in my capacity.
ChrisNZ
Tourmaline | Level 20

Like this?

WVAL{_V} = ifn(VAL{_V}=., ., min( max(VAL{_V},WLO{_V}), WHI{_V}) );

 

Phil_NZ
Barite | Level 11

Hi @ChrisNZ 

Many thanks for your help, it is the answer that I am looking for!

 

Best regards.

Thank you for your help, have a fabulous and productive day! I am a novice today, but someday when I accumulate enough knowledge, I can help others in my capacity.
Reeza
Super User
Conditional logic like this is easier to read IMO.
if not missing(wval{_V}) then wval(_V)=min(max(val{_V},wlo{_V}),whi{_V});

hackathon24-white-horiz.png

The 2025 SAS Hackathon has begun!

It's finally time to hack! Remember to visit the SAS Hacker's Hub regularly for news and updates.

Latest Updates

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

SAS Training: Just a Click Away

 Ready to level-up your skills? Choose your own adventure.

Browse our catalog!

Discussion stats
  • 7 replies
  • 1745 views
  • 3 likes
  • 3 in conversation