<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic cut_off_value using deciles in SAS Data Science</title>
    <link>https://communities.sas.com/t5/SAS-Data-Science/cut-off-value-using-deciles/m-p/753685#M8793</link>
    <description>&lt;P&gt;Hi, I need help getting a cut-off value from the deciles. I have built my model using the steps below:&lt;/P&gt;
&lt;P&gt;1. Split full_data into training and validation sets&lt;/P&gt;
&lt;P&gt;2. Oversampled the training dataset&lt;/P&gt;
&lt;P&gt;3. Trained the model on the oversampled dataset&lt;/P&gt;
&lt;P&gt;4. Binned the predicted probabilities from step 3 into 10 deciles&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;My question is: should I use only the oversampled dataset to determine the cut-off value, or should I use the entire non-oversampled dataset? See the code and data below.&lt;/P&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &lt;STRONG&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;Binning results&lt;/STRONG&gt;&lt;/P&gt;
&lt;TABLE width="369"&gt;
&lt;TBODY&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;bin&lt;/TD&gt;
&lt;TD width="113"&gt;HIV_Positive&lt;/TD&gt;
&lt;TD width="114"&gt;%of_HIV Positive&lt;/TD&gt;
&lt;TD width="78"&gt;Cumulative&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;1&lt;/TD&gt;
&lt;TD width="113"&gt;316&lt;/TD&gt;
&lt;TD&gt;0.162970603&lt;/TD&gt;
&lt;TD&gt;0.1629706&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;2&lt;/TD&gt;
&lt;TD width="113"&gt;293&lt;/TD&gt;
&lt;TD&gt;0.151108819&lt;/TD&gt;
&lt;TD&gt;0.31407942&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;3&lt;/TD&gt;
&lt;TD width="113"&gt;254&lt;/TD&gt;
&lt;TD&gt;0.130995358&lt;/TD&gt;
&lt;TD&gt;0.44507478&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;4&lt;/TD&gt;
&lt;TD width="113"&gt;227&lt;/TD&gt;
&lt;TD&gt;0.117070655&lt;/TD&gt;
&lt;TD&gt;0.56214544&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;5&lt;/TD&gt;
&lt;TD width="113"&gt;206&lt;/TD&gt;
&lt;TD&gt;0.10624033&lt;/TD&gt;
&lt;TD&gt;0.66838577&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;6&lt;/TD&gt;
&lt;TD width="113"&gt;173&lt;/TD&gt;
&lt;TD&gt;0.089221248&lt;/TD&gt;
&lt;TD&gt;0.75760701&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;7&lt;/TD&gt;
&lt;TD width="113"&gt;156&lt;/TD&gt;
&lt;TD&gt;0.080453842&lt;/TD&gt;
&lt;TD&gt;0.83806086&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;8&lt;/TD&gt;
&lt;TD width="113"&gt;131&lt;/TD&gt;
&lt;TD&gt;0.067560598&lt;/TD&gt;
&lt;TD&gt;0.90562145&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;9&lt;/TD&gt;
&lt;TD width="113"&gt;106&lt;/TD&gt;
&lt;TD&gt;0.054667354&lt;/TD&gt;
&lt;TD&gt;0.96028881&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;10&lt;/TD&gt;
&lt;TD width="113"&gt;77&lt;/TD&gt;
&lt;TD&gt;0.039711191&lt;/TD&gt;
&lt;TD&gt;1&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;Total&lt;/TD&gt;
&lt;TD&gt;1939&lt;/TD&gt;
&lt;TD&gt;&amp;nbsp;&lt;/TD&gt;
&lt;TD&gt;&amp;nbsp;&lt;/TD&gt;
&lt;/TR&gt;
&lt;/TBODY&gt;
&lt;/TABLE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc logistic data = oversample outmodel=hiv_scoring desc PLOTS=ALL;
class Alcohol_Consumption Existing_cover Marital_status Lead_type Education gender product_line_id SMOKING_STATUS;
model Status = SUM_INSURED Monthly_income AGE_AT_INCEPT Alcohol_Consumption Existing_cover Marital_status Lead_type Education gender product_line_id SMOKING_STATUS /selection=stepwise;
output out = outreg predprobs=individual p=predicted;
run;



proc sort data=outreg;
by predicted;
run;

data outreg;
set outreg;
if _n_&amp;lt;=3870;
run;

/*binning the data for finding proper cut-off value*/
data test;
do i=1 to 10;
do j=1 to 387;
Output;
end;
end;
drop j;
run;

data outreg;
merge outreg test;
run;

proc sql;
create table summary as
select i as bin,
sum(Status=1) as HIV_Positive
from outreg
group by i
;quit;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Tue, 13 Jul 2021 07:27:26 GMT</pubDate>
    <dc:creator>Solly7</dc:creator>
    <dc:date>2021-07-13T07:27:26Z</dc:date>
    <item>
      <title>cut_off_value using deciles</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/cut-off-value-using-deciles/m-p/753685#M8793</link>
      <description>&lt;P&gt;Hi, I need help getting a cut-off value from the deciles. I have built my model using the steps below:&lt;/P&gt;
&lt;P&gt;1. Split full_data into training and validation sets&lt;/P&gt;
&lt;P&gt;2. Oversampled the training dataset&lt;/P&gt;
&lt;P&gt;3. Trained the model on the oversampled dataset&lt;/P&gt;
&lt;P&gt;4. Binned the predicted probabilities from step 3 into 10 deciles&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;My question is: should I use only the oversampled dataset to determine the cut-off value, or should I use the entire non-oversampled dataset? See the code and data below.&lt;/P&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &lt;STRONG&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;Binning results&lt;/STRONG&gt;&lt;/P&gt;
&lt;TABLE width="369"&gt;
&lt;TBODY&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;bin&lt;/TD&gt;
&lt;TD width="113"&gt;HIV_Positive&lt;/TD&gt;
&lt;TD width="114"&gt;%of_HIV Positive&lt;/TD&gt;
&lt;TD width="78"&gt;Cumulative&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;1&lt;/TD&gt;
&lt;TD width="113"&gt;316&lt;/TD&gt;
&lt;TD&gt;0.162970603&lt;/TD&gt;
&lt;TD&gt;0.1629706&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;2&lt;/TD&gt;
&lt;TD width="113"&gt;293&lt;/TD&gt;
&lt;TD&gt;0.151108819&lt;/TD&gt;
&lt;TD&gt;0.31407942&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;3&lt;/TD&gt;
&lt;TD width="113"&gt;254&lt;/TD&gt;
&lt;TD&gt;0.130995358&lt;/TD&gt;
&lt;TD&gt;0.44507478&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;4&lt;/TD&gt;
&lt;TD width="113"&gt;227&lt;/TD&gt;
&lt;TD&gt;0.117070655&lt;/TD&gt;
&lt;TD&gt;0.56214544&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;5&lt;/TD&gt;
&lt;TD width="113"&gt;206&lt;/TD&gt;
&lt;TD&gt;0.10624033&lt;/TD&gt;
&lt;TD&gt;0.66838577&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;6&lt;/TD&gt;
&lt;TD width="113"&gt;173&lt;/TD&gt;
&lt;TD&gt;0.089221248&lt;/TD&gt;
&lt;TD&gt;0.75760701&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;7&lt;/TD&gt;
&lt;TD width="113"&gt;156&lt;/TD&gt;
&lt;TD&gt;0.080453842&lt;/TD&gt;
&lt;TD&gt;0.83806086&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;8&lt;/TD&gt;
&lt;TD width="113"&gt;131&lt;/TD&gt;
&lt;TD&gt;0.067560598&lt;/TD&gt;
&lt;TD&gt;0.90562145&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;9&lt;/TD&gt;
&lt;TD width="113"&gt;106&lt;/TD&gt;
&lt;TD&gt;0.054667354&lt;/TD&gt;
&lt;TD&gt;0.96028881&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD width="64"&gt;10&lt;/TD&gt;
&lt;TD width="113"&gt;77&lt;/TD&gt;
&lt;TD&gt;0.039711191&lt;/TD&gt;
&lt;TD&gt;1&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;Total&lt;/TD&gt;
&lt;TD&gt;1939&lt;/TD&gt;
&lt;TD&gt;&amp;nbsp;&lt;/TD&gt;
&lt;TD&gt;&amp;nbsp;&lt;/TD&gt;
&lt;/TR&gt;
&lt;/TBODY&gt;
&lt;/TABLE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc logistic data = oversample outmodel=hiv_scoring desc PLOTS=ALL;
class Alcohol_Consumption Existing_cover Marital_status Lead_type Education gender product_line_id SMOKING_STATUS;
model Status = SUM_INSURED Monthly_income AGE_AT_INCEPT Alcohol_Consumption Existing_cover Marital_status Lead_type Education gender product_line_id SMOKING_STATUS /selection=stepwise;
output out = outreg predprobs=individual p=predicted;
run;



proc sort data=outreg;
by predicted;
run;

data outreg;
set outreg;
if _n_&amp;lt;=3870;
run;

/*binning the data for finding proper cut-off value*/
data test;
do i=1 to 10;
do j=1 to 387;
Output;
end;
end;
drop j;
run;

data outreg;
merge outreg test;
run;

proc sql;
create table summary as
select i as bin,
sum(Status=1) as HIV_Positive
from outreg
group by i
;quit;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 13 Jul 2021 07:27:26 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/cut-off-value-using-deciles/m-p/753685#M8793</guid>
      <dc:creator>Solly7</dc:creator>
      <dc:date>2021-07-13T07:27:26Z</dc:date>
    </item>
    <item>
      <title>Re: cut_off_value using deciles</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/cut-off-value-using-deciles/m-p/753806#M8794</link>
      <description>Why are you binning your predicted probabilities?</description>
      <pubDate>Tue, 13 Jul 2021 16:08:51 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/cut-off-value-using-deciles/m-p/753806#M8794</guid>
      <dc:creator>Reeza</dc:creator>
      <dc:date>2021-07-13T16:08:51Z</dc:date>
    </item>
  </channel>
</rss>

