Hello.
Let's say that I have a data set with many continuous variables (independent variables) and response variable (1/0 Default yes or no).
Let's say that for specific variable X I want to categorize this variable into categories .
I have some criteria:
1-Classify to 4 or 5 categories
2-Category 1 is the worse category (The category with highest bads rate)
3-In each category be at least 50 bads (people with Y=1)
4-There must be cardinally in failure rate between groups
5-Must have at least 1% increasing in bads rate from one category to another category.
6-If there are some possible ways that meet 1-5 then choose the option with highest Information value.
May anyone show a nice code that perform this?
What way would you use to perform the binning based on the criteria's I mentioned?
Here is SAS code that show If the conditions I mentioned were meet.
Here can see also IV and see default rate in each category.
As I said, the magic question is how to binning the categories?
PROC FORMAT ;
VALUE IV_fmt
low-<0.02='Very weak'
0.02 <-0.1 ='Weak'
0.1 <-0.3 ='Intermediate'
0.3 <- HIGH ='Strong'
;
RUN;
%macro inf_value(Raw_Data_tbl,var,Response_VAR);
PROC SUMMARY DATA=&Raw_Data_tbl. (keep= &var. &Response_VAR.);
VAR &Response_VAR.;
OUTPUT OUT=inf0 (DROP=_TYPE_)sum=all_bad;
RUN;
data _null_;
set inf0;
good=_freq_-all_bad;
call symput('all_good',good);
call symput('all_bad',all_bad);
run;
%put &all_good;
%put &all_bad;
PROC SUMMARY DATA=&Raw_Data_tbl. (keep= &var. &Response_VAR.)nway;
class &var.;
VAR &Response_VAR.;
OUTPUT OUT=inf1 (DROP=_TYPE_ rename=(_freq_=Nr_Customers))sum=Nr_bad;
RUN;
data inf2;
set inf1;
Nr_good=Nr_Customers-Nr_bad;
if &all_bad. not in (0,.) then PCT_Bads=Nr_bad/&all_bad.;
if &all_good. not in (0,.) then PCT_Goods=Nr_good/&all_good.;
res_helkey_non=PCT_Bads/PCT_Goods;
woe=log(res_helkey_non);**weight of evidance***;
iv_categ=(PCT_Bads-PCT_Goods)*woe;
run;
PROC SUMMARY DATA= inf2;
VAR iv_categ;
OUTPUT OUT=inf3 (DROP=_TYPE_)sum=inf_val;
RUN;
data _null_;
set inf3;
call symputx('inf_val',inf_val);
run;
%put &inf_val;/***SUM IV over all categories***/
data inf4;
length var $100. categ $100.;
set inf2;
categ=compress(&var.);
drop &var.;
var="&var.";
P_Default=Nr_bad/Nr_Customers;
label iv='information value(iv)';
run;
data inf5;
retain
var
categ
Nr_Customers
Nr_bad
Nr_good
PCT_bad
PCT_Bads
PCT_Goods
P_Default
res_helkey_non
woe
iv_categ
;
set inf4;
format PCT_Bads PCT_Goods res_helkey_non woe iv_categ P_Default percent10.3;
format Nr_Customers Nr_bad Nr_good comma15. ;
run;
PROC SUMMARY DATA= inf5;
VAR Nr_Customers Nr_bad Nr_good PCT_Bads PCT_Goods iv_categ;
OUTPUT OUT=Summary_IV (DROP=_TYPE_ _freq_) sum=;
RUN;
Data Summary_IV_b;
Retain field category ;
set Summary_IV;
VAR="&var.";
P_Default=Nr_bad/Nr_Customers;
format P_Default percent8.2;
Run;
Data Want;
set inf5 Summary_IV_b;
IF missing(categ) then IV_desc=put(iv_categ,IV_fmt.);
else IV_desc='';
Run;
title;
proc print data=Want noobs;Run;
%mend inf_value;
%inf_value(Raw_Data_tbl=ttt,var=X,Response_VAR=Ind_Default)
You are building a Score Card ?
The following is the code I used before for this purpose.
/*
proc import datafile='c:\temp\1--German Credit.xlsx' dbms=xlsx out=have replace;
run;
*/
%let var=weight; *the continuous variable you need to split;
%let group=4 ; *the number of group you want to bin to;
%let n_iter=100; *the number of iteration for Genetic Algorithm to get best WOE IV;
data temp;
set sashelp.heart(obs=200);
good_bad=ifc(status='Alive','good','bad ');
if not missing(&var);
keep &var good_bad ;
run;
proc sql noprint;
select sum(good_bad='bad'),sum(good_bad='good'),
floor(min(&var)),ceil(max(&var)) into : n_bad,: n_good,: min,: max
from temp;
quit;
%put &n_bad &n_good &min &max;
proc sort data=temp;by &var ;run;
proc iml;
use temp(where=(&var is not missing));
read all var {&var good_bad};
close;
start function(x) global(bin,&var ,good_bad,group,woe);
if countunique(x)=group-1 then do;
col_x=t(x);
call sort(col_x,1);
cutpoints= .M//col_x//.I ;
b=bin(&var ,cutpoints,'right');
if countunique(b)=group then do;
do i=1 to group;
idx=loc(b=i);
temp=good_bad[idx];
n_bad=sum(temp='bad');
n_good=sum(temp='good');
bad_dist=n_bad/&n_bad ;
good_dist=n_good/&n_good ;
if Bad_Dist>0.05 & Good_Dist>0.05 then woe[i]=log(Bad_Dist/Good_Dist);
else woe[i]=.;
end;
if countmiss(woe)=0 then do;
/*
xx=j(group,1,1)||woe||woe##2;
*/
xx=j(group,1,1)||woe;
beta=solve(xx`*xx,xx`*bin);
yhat=xx*beta;
sse=ssq(bin-yhat);
end;
else sse=999999;
end;
else sse=999999;
end;
else sse=999999;
return (sse);
finish;
group=&group ;
bin=t(1:group);
woe=j(group,1,.);
encoding=j(2,group-1,&min );
encoding[2,]=&max ;
id=gasetup(2,group-1,123456789);
call gasetobj(id,0,"function");
call gasetsel(id,10,1,1);
call gainit(id,1000,encoding);
niter = &n_iter ;
do i = 1 to niter;
call garegen(id);
call gagetval(value, id);
end;
call gagetmem(mem, value, id, 1);
col_mem=t(mem);
call sort(col_mem,1);
cutpoints= .M//col_mem//.I ;
b=bin(&var ,cutpoints,'right');
create cutpoints var {cutpoints};
append;
close;
create group var {b};
append;
close;
print value[l = "Min Value:"] ;
call gaend(id);
quit;
data all_group;
set temp(keep=&var rename=(&var=b) where=(b is missing)) group;
run;
data all;
merge all_group temp;
rename b=group;
run;
title "变量: &var" ;
proc sql;
create table woe_&var as
select group label=' ',
min(&var) as min label='最小值',max(&var) as max label='最大值',count(*) as n label='频数',
calculated n/(select count(*) from all) as per format=percent7.2 label='占比',
sum(good_bad='bad') as n_bad label='bad的个数',sum(good_bad='good') as n_good label='good的个数',
sum(good_bad='bad')/(select sum(good_bad='bad') from all ) as bad_dist label='bad的占比',
sum(good_bad='good')/(select sum(good_bad='good') from all ) as good_dist label='good的占比',
log(calculated Bad_Dist/calculated Good_Dist) as woe
from all
group by group
order by woe;
create index group on woe_&var;
select *,sum( (Bad_Dist-Good_Dist)*woe ) as iv
from woe_&var ;
quit;
title ' ';
/*
data fmt_&var ;
set cutpoints;
start=lag(cutpoints);
end=cutpoints;
if start=.M then hlo='IL';
if end=.I then hlo='IH';
if _n_ ne 1 then do;group+1;output;end;
run;
data fmt_&var(index=(group));
merge fmt_&var woe_&var(keep=group woe);
by group;
retain fmtname "&var" type 'I';
keep group fmtname type start end woe hlo;
rename woe=label;
label group=' ';
run;
proc format cntlin=fmt_&var library=z;
run;
proc print data=woe_&var noobs label;run;
proc sgplot data=woe_&var;
reg y=group x=woe/degree=2 cli clm jitter;
run;
*/
proc sgplot data=woe_&var noautolegend;
vbar group/response=woe nostatlabel missing;
vline group/response=woe nostatlabel missing markers MARKERATTRS=(symbol=circlefilled
size=12) MARKERFILLATTRS=(color=white) MARKEROUTLINEATTRS=graphdata1
FILLEDOUTLINEDMARKERS;
run;
Thank you so so much!
Yes, build credit risk model (ScoreCard).
Can I ask please regarding the criteria I mentioned:
Classify to 4 or 5 categories ----Defined in macro var %let group
Category 1 is the worse category (The category with highest bads rate)--Where did you define it?
In each category be at least 50 bads (people with Y=1)--Where did you define it?
Must have at least 1% increasing in bads rate from one category to another category.--Where did you define it?
Thanks,
If in data set have 100,000 rows.
Then how long approximately should it take to run the 100 iterations for one var only?
Which code should Modify that worse group be 1 and best group be 4 ( define 4 groups)?
Can you show the code please?
I run it more than 2 hours.......still didnt finish (Run for one variable only)
you mean 3 minutes for each iteration?
so for 100 iterations will have 300 minutes?
Yes. 3 minutes for one variable.
Since you tested it for more than two hours , try to reduce %let n_iter=100 .
Or get more people to run code, each for one variable?
P.S. I do not recommend to reduce "%let n_iter= 100", and it is other way around, I would have more than 100 to get better IV .
In summary, if you want fastest way to solve your question, post your question at OR forum and calling @RobPratt .
Thanks,
Not always number of groups is 4 .
I also want that worse group be 1 and not 0.
Here is the code that do it 100%
proc sql noprint;
select count(distinct b) as nr_groups into :nr_groups
from all_group
;
quit;
%put &nr_groups;
data all;
merge all_group temp;
b=(&nr_groups.-b)+1;
/***I want that worse group be group 1 and so on****/
rename b=group;
Run;
I run your original code:
In data set CUTPOINTS there is one column called CUTPOINTS (numeric var with no format ).
How come I see letter values when the column is numeric??
What is the meaning of values "M" and "I"?
I also added to final table percent of bads within each group
SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.