<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic How to replicate KNN results from Proc Discrim in SAS Data Science</title>
    <link>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676482#M8383</link>
    <description>&lt;P&gt;Hi there, I'm learning KNN. I found that my Proc Discrim procedure gives me much better results than my manual calculation for the KNN algorithm. I'm wondering if there is any expert here who can explain why Proc Discrim does so much better.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For example, below is the code that I use to classify the fish species in the SASHelp.fish data set.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;** Standardize Columns **;
proc standard data=sashelp.fish out=fish mean=0 std=1;
var weight length1 length2 length3 height width;
run;

data fish_train fish_test;
set fish;
rand = ranuni(100);
if rand &amp;lt;= 0.5 then output fish_train;
else output fish_test;
run;

** Using Built-in Proc Discrim **;
proc discrim data = fish_train test = fish_test 
  testout = _score1 method = npar k = 9 testlist;
  class species;
  var weight height length1 length2 length3 width;
run; &lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;The error rate is very low:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="i1001.png" style="width: 645px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/48194i6CE6B60D606DE82A/image-size/large?v=v2&amp;amp;px=999" role="button" title="i1001.png" alt="i1001.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Now, I'm doing it manually by calculating the distance between the points and finding the K nearest neighbors (k=9).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;** Manually build KNN **;
data train1 train2 (drop=num);
set fish_train;
num = _n_;
run;

proc sql;
create table train_combine as
select a.num, 
       a.species as species_a,
       b.species as species_b,
       sqrt((a.weight - b.weight)**2 + 
            (a.height - b.height)**2 +
            (a.length1 - b.length1)**2 +
            (a.length2 - b.length2)**2 +
            (a.length3 - b.length3)**2 +
            (a.width - b.width)**2       
            ) as distance
from train1 a, train2 b
order by a.num, distance;
quit;

data train_combine2;
set train_combine;
by num distance;
if first.num then i = 0;
i + 1;
if i &amp;lt;= 9;
run;
       
proc freq data=train_combine2 noprint;
table species_b / out = fish_freq;
by num species_a;
run;

proc sort data=fish_freq; by num count; run;

data fish_freq2;
set fish_freq;
by num count;
if last.num;

if species_a = species_b then match = "Y";
else match = "N";
run;

proc sql;
select species_a, match, count(*) as cnt
from fish_freq2
group by species_a, match
order by species_a, match;
quit;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;I used the Euclidean distance, and the results are not even close to being as good as those from Proc Discrim.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For example, my manual model misclassified every Parkki and got only one Roach right.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="i1002.png" style="width: 172px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/48199i3A56E96CD1B21BBA/image-size/large?v=v2&amp;amp;px=999" role="button" title="i1002.png" alt="i1002.png" /&gt;&lt;/span&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;In contrast, Proc Discrim classifies 4 Parkki and 9 Roach correctly.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="i1003.png" style="width: 645px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/48200i944193C8C86865D0/image-size/large?v=v2&amp;amp;px=999" role="button" title="i1003.png" alt="i1003.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;How does the Proc Discrim algorithm work such that it gives better classification results?&lt;/P&gt;</description>
    <pubDate>Thu, 13 Aug 2020 13:28:46 GMT</pubDate>
    <dc:creator>kisumsam</dc:creator>
    <dc:date>2020-08-13T13:28:46Z</dc:date>
    <item>
      <title>How to replicate KNN results from Proc Discrim</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676482#M8383</link>
      <description>&lt;P&gt;Hi there, I'm learning KNN. I found that my Proc Discrim procedure gives me much better results than my manual calculation for the KNN algorithm. I'm wondering if there is any expert here who can explain why Proc Discrim does so much better.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For example, below is the code that I use to classify the fish species in the SASHelp.fish data set.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;** Standardize Columns **;
proc standard data=sashelp.fish out=fish mean=0 std=1;
var weight length1 length2 length3 height width;
run;

data fish_train fish_test;
set fish;
rand = ranuni(100);
if rand &amp;lt;= 0.5 then output fish_train;
else output fish_test;
run;

** Using Built-in Proc Discrim **;
proc discrim data = fish_train test = fish_test 
  testout = _score1 method = npar k = 9 testlist;
  class species;
  var weight height length1 length2 length3 width;
run; &lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;The error rate is very low:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="i1001.png" style="width: 645px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/48194i6CE6B60D606DE82A/image-size/large?v=v2&amp;amp;px=999" role="button" title="i1001.png" alt="i1001.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Now, I'm doing it manually by calculating the distance between the points and finding the K nearest neighbors (k=9).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;** Manually build KNN **;
data train1 train2 (drop=num);
set fish_train;
num = _n_;
run;

proc sql;
create table train_combine as
select a.num, 
       a.species as species_a,
       b.species as species_b,
       sqrt((a.weight - b.weight)**2 + 
            (a.height - b.height)**2 +
            (a.length1 - b.length1)**2 +
            (a.length2 - b.length2)**2 +
            (a.length3 - b.length3)**2 +
            (a.width - b.width)**2       
            ) as distance
from train1 a, train2 b
order by a.num, distance;
quit;

data train_combine2;
set train_combine;
by num distance;
if first.num then i = 0;
i + 1;
if i &amp;lt;= 9;
run;
       
proc freq data=train_combine2 noprint;
table species_b / out = fish_freq;
by num species_a;
run;

proc sort data=fish_freq; by num count; run;

data fish_freq2;
set fish_freq;
by num count;
if last.num;

if species_a = species_b then match = "Y";
else match = "N";
run;

proc sql;
select species_a, match, count(*) as cnt
from fish_freq2
group by species_a, match
order by species_a, match;
quit;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;I used the Euclidean distance, and the results are not even close to being as good as those from Proc Discrim.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For example, my manual model misclassified every Parkki and got only one Roach right.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="i1002.png" style="width: 172px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/48199i3A56E96CD1B21BBA/image-size/large?v=v2&amp;amp;px=999" role="button" title="i1002.png" alt="i1002.png" /&gt;&lt;/span&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;In contrast, Proc Discrim classifies 4 Parkki and 9 Roach correctly.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="i1003.png" style="width: 645px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/48200i944193C8C86865D0/image-size/large?v=v2&amp;amp;px=999" role="button" title="i1003.png" alt="i1003.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;How does the Proc Discrim algorithm work such that it gives better classification results?&lt;/P&gt;</description>
      <pubDate>Thu, 13 Aug 2020 13:28:46 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676482#M8383</guid>
      <dc:creator>kisumsam</dc:creator>
      <dc:date>2020-08-13T13:28:46Z</dc:date>
    </item>
    <item>
      <title>Re: How to replicate KNN results from Proc Discrim</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676488#M8384</link>
      <description>Did you try PROC MODCLUS to run KNN?</description>
      <pubDate>Thu, 13 Aug 2020 13:46:48 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676488#M8384</guid>
      <dc:creator>Ksharp</dc:creator>
      <dc:date>2020-08-13T13:46:48Z</dc:date>
    </item>
    <item>
      <title>Re: How to replicate KNN results from Proc Discrim</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676492#M8385</link>
      <description>No. Is that a better procedure than Proc Discrim for KNN?</description>
      <pubDate>Thu, 13 Aug 2020 14:07:43 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676492#M8385</guid>
      <dc:creator>kisumsam</dc:creator>
      <dc:date>2020-08-13T14:07:43Z</dc:date>
    </item>
    <item>
      <title>Re: How to replicate KNN results from Proc Discrim</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676554#M8386</link>
      <description>&lt;P&gt;&lt;SPAN&gt;&amp;nbsp;&amp;gt;I found that my Proc Discrim procedure gives me a much better results than me doing the manual calculation for the KNN algorithm. &lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;&amp;gt;I'm wondering if there is any expert here who can explain why Proc Discrim does so much better.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;It is simple. Proc discrim was written by an expert. You are not replicating what discrim does.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;&lt;A href="https://documentation.sas.com/?docsetId=statug&amp;amp;docsetTarget=statug_discrim_details02.htm&amp;amp;docsetVersion=15.1&amp;amp;locale=en"&gt;https://documentation.sas.com/?docsetId=statug&amp;amp;docsetTarget=statug_discrim_details02.htm&amp;amp;docsetVersion=15.1&amp;amp;locale=en&lt;/A&gt;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 13 Aug 2020 17:18:45 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/How-to-replicate-KNN-results-from-Proc-Discrim/m-p/676554#M8386</guid>
      <dc:creator>WarrenKuhfeld</dc:creator>
      <dc:date>2020-08-13T17:18:45Z</dc:date>
    </item>
  </channel>
</rss>

