<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: CLUSTER ANALYSIS: Silhouette plots in Graphics Programming</title>
    <link>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/878229#M23872</link>
    <description>&lt;P&gt;Here's one way to do it, using the code Rick Wicklin shared to calculate the silhouette values, and then using some SAS/Graph code to plot the graphs (I tried to keep the graph code minimal/simple - you could enhance it a bit by annotating cluster numbers).&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*
To solve request:
https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/td-p/421768

Using Rick Wicklin's code to calculate the silhouette statistic:
https://blogs.sas.com/content/iml/2023/05/17/compute-silhouette-sas.html

Using Robert Allison's SAS/Graph code to plot the results.
*/


/* Create scatter and silhouette plots for the iris data */
/* k-means cluster analysis */
proc fastclus data=Sashelp.Iris maxclusters=3 out=ClusOut noprint;
  var SepalLength SepalWidth PetalLength PetalWidth;
run;

/* Implement the silhouette statistics for n points in k clusters. The silhouette statistic is defined in
   Peter Rousseeuw (1987) "Silhouettes: A graphical aid to the interpretation 
       and validation of cluster analysis", https://doi.org/10.1016/0377-0427(87)90125-7
*/
proc iml;
/* AvgDistSelf: For each point, p, in a cluster this function returns the 
   average distance between p and other points in the same cluster.
*/
start AvgDistSelf(Y);
   n = nrow(Y);
   if n=1 then return(0);    /* convention */
   D = distance(Y);          /* D[i,j] is distance from Y[i,] to Y[j,] */
   AvgDist = D[ ,+] / (n-1); /* exclude D[i,i] */
   return AvgDist;
finish;
/* AvgDistOther: Let Y contains points in one cluster and Z contain
   points in a different cluster. For each point p in Y, this function 
   returns the average distance between p and the points in Z.
*/
start AvgDistOther(Y, Z);
   D = distance(Y,Z);        /* D[i,j] is distance from Y[i,] to Z[j,] */
   return( D[ ,:] );         /* average of the D[i,j] */
finish;
 
/* Silhouette: The main computational routine. For each point, p:
   1. Define AvgDistIn = average distance between p and other points in the same cluster.
      Note: The divisor for this cluster is N-1, not N.
   2. Find average distance between p and points in each of the other clusters.
      Define AvgDistOut to be the MINIMUM of the average distance.
   3. Define s(p) = (AvgDistOut(p) - AvgDistIn(p)) / max(AvgDistOut(p), AvgDistIn(p))
 
   Input: X         is an (n x d) matrix of observations 
          ClusterID is an (n x 1) vector that associates each point to a cluster. Typically,
                    the values are in the range {1,2,...,k}, where k is the total number of clusters.
   Output: S        is an (n x 1) vector that contains the silhouette statistic for each point in X.
*/
start Silhouette(X, ClusterID);
   n = nrow(X);
   AvgDistIn  = j(n, 1, .);
   AvgDistOut = j(n, 1, .);
 
   /* Use the Unique-Loc technique to iterate over clusterID:
   https://blogs.sas.com/content/iml/2011/11/01/the-unique-loc-trick-a-real-treat.html */
   uID = unique(ClusterID);
   k = ncol(uID);
   do i = 1 to k;
      idxIn = loc(ClusterID = uID[i]);
      Y = X[idxIn, ];            /* obs in the i_th cluster */
      v = AvgDistSelf(Y);        /* average inter-cluster distance */
      AvgDistIn[idxIn,] = v;
      dMin = j(ncol(idxIn), 1, constant('BIG'));   /* retain the min distances */
      h = setdif(1:k, i);        /* indices for the other clusters */
      do j = 1 to ncol(h);       /* loop over the other clusters */
         jdx = loc(ClusterID = uID[h[j]]); 
         Z = X[jdx, ];           /* obs in the j_th cluster, j ^= i */
         v = AvgDistOther(Y, Z); /* average between cluster distances */
         dMin = (dMin &amp;gt;&amp;lt; v);     /* retain the elementwise minimum */
      end;
      AvgDistOut[idxIn,] = dMin; /* min average distance to another cluster */
   end;
   s = (AvgDistOut - AvgDistIn) / (AvgDistOut &amp;lt;&amp;gt; AvgDistIn);
   return s;
finish;
store module=(AvgDistSelf AvgDistOther Silhouette);
QUIT;

/* compute vector of silhouette statistics */
proc iml;
load module=(Silhouette);
varNames = {'SepalLength' 'SepalWidth' 'PetalLength' 'PetalWidth'};
use ClusOut;
   read all var varNames into X;
   read all var "Cluster" into ClusterID;
close;
silhouette = Silhouette(X, ClusterID);
create S var "Silhouette"; append; close; /* write silhouette values to data set */
QUIT;

/* The SilhouetteMergeSort macro
   1. Merges the original data and the silhouette statistics
   2. Computes the overall average silhouette measure and stores it in &amp;amp;SILH macro variable
   3. Sorts data by cluster and (descending) by silhouette value
   4. Adds an _ObsNum variable for ease of plotting the silhouette values as a bar chart
 
   Args: DSname      = Original data
         SilhDSName  = Output from PROC IML, containing silhouette values
         ClusName    = Name of indicator variable for cluster membership
         SilhVarName = Name of variable that contains silhouette values
         OutName     = Name of merged data set
*/
%macro SilhouetteMergeSort(DSname, SilhDSName, ClusName, SilhVarName, OutName);
   %GLOBAL Silh;
   data _temp; 
      merge &amp;amp;DSName &amp;amp;SilhDSName end=EOF; 
      _mean + &amp;amp;SilhVarName;
      if EOF then
         call symputx( "Silh", round(_mean/_N_, 0.001));
   run;
   proc sort data=_temp; by &amp;amp;ClusName  descending &amp;amp;SilhVarName; run;
   data &amp;amp;OutName / view=&amp;amp;OutName; set _temp; _ObsNum=_N_; label _ObsNum="Observation"; run;
%mend;

/* merge data and create macro variable */
%SilhouetteMergeSort(ClusOut, S, Cluster, Silhouette, _All);
%put &amp;amp;=Silh;

symbol1 value=circle height=5pt color=cx4c4c4c interpol=none;
symbol2 value=circle height=5pt color=cx4ca0e7 interpol=none;
symbol3 value=circle height=5pt color=cx4cd05c interpol=none;

axis1 label=(angle=90) minor=none;
axis2 minor=none;

legend1 position=top repeat=1;

title1 h=15pt font="albany amt"  "Silhouette Analysis of sashelp.iris data";
title2 h=12pt font="albany amt" "(SepalLength SepalWidth PetalLength PetalWidth)";

proc gplot data=_all;
plot petallength*sepallength=cluster / 
 vaxis=axis1 haxis=axis2 legend=legend1;
run;

options replace;
proc sort data=_all out=silplot;
by cluster silhouette;
run;
data silplot; set silplot;
ordervar+1;
run;
data silplot; set silplot;
output;
silhouette=0; output;
silhouette=.; output;
run;

symbol1 value=none height=5pt color=cx4c4c4c interpol=join;
symbol2 value=none height=5pt color=cx4ca0e7 interpol=join;
symbol3 value=none height=5pt color=cx4cd05c interpol=join;

axis1 label=(angle=90 "Cluster Label") value=none major=none minor=none;
axis2 label=('The Silhouette Coefficient Values') minor=none;

legend1 position=top;

proc gplot data=silplot;
plot ordervar*silhouette=cluster /
 vaxis=axis1 haxis=axis2 legend=legend1;

&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="sil1.png" style="width: 800px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/84482iC2160DB5AA5E478D/image-size/large?v=v2&amp;amp;px=999" role="button" title="sil1.png" alt="sil1.png" /&gt;&lt;/span&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="sil2.png" style="width: 800px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/84483i18F9C79B5735094A/image-size/large?v=v2&amp;amp;px=999" role="button" title="sil2.png" alt="sil2.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
    <pubDate>Tue, 30 May 2023 13:32:56 GMT</pubDate>
    <dc:creator>GraphGuy</dc:creator>
    <dc:date>2023-05-30T13:32:56Z</dc:date>
    <item>
      <title>CLUSTER ANALYSIS: Silhouette plots</title>
      <link>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/421768#M14502</link>
      <description>&lt;P&gt;Hello all.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I can't seem to be able to find the code to get silhouette plots in SAS, to complement my cluster analysis, like &lt;A href="http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html" target="_self"&gt;these&lt;/A&gt; here. Is there any way to get them using PROC CLUSTER or PROC FASTCLUS? Or do I need a different procedure altogether?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Most other statistical softwares can produce them.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Can anyone help me?&lt;/P&gt;</description>
      <pubDate>Sat, 16 Dec 2017 10:22:43 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/421768#M14502</guid>
      <dc:creator>LunaMinerva</dc:creator>
      <dc:date>2017-12-16T10:22:43Z</dc:date>
    </item>
    <item>
      <title>Re: CLUSTER ANALYSIS: Silhouette plots</title>
      <link>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/421788#M14504</link>
      <description>&lt;P&gt;CLUSTER and FASTCLUS definitely do not produce that plot.&amp;nbsp; You would need to use data manipulations, GTL and SGRENDER.&lt;/P&gt;</description>
      <pubDate>Sat, 16 Dec 2017 20:02:08 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/421788#M14504</guid>
      <dc:creator>WarrenKuhfeld</dc:creator>
      <dc:date>2017-12-16T20:02:08Z</dc:date>
    </item>
    <item>
      <title>Re: CLUSTER ANALYSIS: Silhouette plots</title>
      <link>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/421837#M14505</link>
      <description>&lt;P&gt;And what's the code for that? Or an example I can follow?&lt;/P&gt;</description>
      <pubDate>Sun, 17 Dec 2017 12:18:36 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/421837#M14505</guid>
      <dc:creator>LunaMinerva</dc:creator>
      <dc:date>2017-12-17T12:18:36Z</dc:date>
    </item>
    <item>
      <title>Re: CLUSTER ANALYSIS: Silhouette plots</title>
      <link>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/877523#M23859</link>
      <description>&lt;P&gt;See these two blog posts:&lt;/P&gt;
&lt;P&gt;&lt;A href="https://blogs.sas.com/content/iml/2023/05/15/silhouette-statistic-cluster.html" target="_blank" rel="nofollow noopener noreferrer"&gt;https://blogs.sas.com/content/iml/2023/05/15/silhouette-statistic-cluster.html&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&lt;A href="https://blogs.sas.com/content/iml/2023/05/17/compute-silhouette-sas.html" target="_blank" rel="nofollow noopener noreferrer"&gt;https://blogs.sas.com/content/iml/2023/05/17/compute-silhouette-sas.html&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 25 May 2023 14:54:45 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/877523#M23859</guid>
      <dc:creator>Rick_SAS</dc:creator>
      <dc:date>2023-05-25T14:54:45Z</dc:date>
    </item>
    <item>
      <title>Re: CLUSTER ANALYSIS: Silhouette plots</title>
      <link>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/878229#M23872</link>
      <description>&lt;P&gt;Here's one way to do it, using the code Rick Wicklin shared to calculate the silhouette values, and then using some SAS/Graph code to plot the graphs (I tried to keep the graph code minimal/simple - you could enhance it a bit by annotating cluster numbers).&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*
To solve request:
https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/td-p/421768

Using Rick Wicklin's code to calculate the silhouette statistic:
https://blogs.sas.com/content/iml/2023/05/17/compute-silhouette-sas.html

Using Robert Allison's SAS/Graph code to plot the results.
*/


/* Create scatter and silhouette plots for the iris data */
/* k-means cluster analysis */
proc fastclus data=Sashelp.Iris maxclusters=3 out=ClusOut noprint;
  var SepalLength SepalWidth PetalLength PetalWidth;
run;

/* Implement the silhouette statistics for n points in k clusters. The silhouette statistic is defined in
   Peter Rousseeuw (1987) "Silhouettes: A graphical aid to the interpretation 
       and validation of cluster analysis", https://doi.org/10.1016/0377-0427(87)90125-7
*/
proc iml;
/* AvgDistSelf: For each point, p, in a cluster this function returns the 
   average distance between p and other points in the same cluster.
*/
start AvgDistSelf(Y);
   n = nrow(Y);
   if n=1 then return(0);    /* convention */
   D = distance(Y);          /* D[i,j] is distance from Y[i,] to Y[j,] */
   AvgDist = D[ ,+] / (n-1); /* exclude D[i,i] */
   return AvgDist;
finish;
/* AvgDistOther: Let Y contains points in one cluster and Z contain
   points in a different cluster. For each point p in Y, this function 
   returns the average distance between p and the points in Z.
*/
start AvgDistOther(Y, Z);
   D = distance(Y,Z);        /* D[i,j] is distance from Y[i,] to Z[j,] */
   return( D[ ,:] );         /* average of the D[i,j] */
finish;
 
/* Silhouette: The main computational routine. For each point, p:
   1. Define AvgDistIn = average distance between p and other points in the same cluster.
      Note: The divisor for this cluster is N-1, not N.
   2. Find average distance between p and points in each of the other clusters.
      Define AvgDistOut to be the MINIMUM of the average distance.
   3. Define s(p) = (AvgDistOut(p) - AvgDistIn(p)) / max(AvgDistOut(p), AvgDistIn(p))
 
   Input: X         is an (n x d) matrix of observations 
          ClusterID is an (n x 1) vector that associates each point to a cluster. Typically,
                    the values are in the range {1,2,...,k}, where k is the total number of clusters.
   Output: S        is an (n x 1) vector that contains the silhouette statistic for each point in X.
*/
start Silhouette(X, ClusterID);
   n = nrow(X);
   AvgDistIn  = j(n, 1, .);
   AvgDistOut = j(n, 1, .);
 
   /* Use the Unique-Loc technique to iterate over clusterID:
   https://blogs.sas.com/content/iml/2011/11/01/the-unique-loc-trick-a-real-treat.html */
   uID = unique(ClusterID);
   k = ncol(uID);
   do i = 1 to k;
      idxIn = loc(ClusterID = uID[i]);
      Y = X[idxIn, ];            /* obs in the i_th cluster */
      v = AvgDistSelf(Y);        /* average inter-cluster distance */
      AvgDistIn[idxIn,] = v;
      dMin = j(ncol(idxIn), 1, constant('BIG'));   /* retain the min distances */
      h = setdif(1:k, i);        /* indices for the other clusters */
      do j = 1 to ncol(h);       /* loop over the other clusters */
         jdx = loc(ClusterID = uID[h[j]]); 
         Z = X[jdx, ];           /* obs in the j_th cluster, j ^= i */
         v = AvgDistOther(Y, Z); /* average between cluster distances */
         dMin = (dMin &amp;gt;&amp;lt; v);     /* retain the elementwise minimum */
      end;
      AvgDistOut[idxIn,] = dMin; /* min average distance to another cluster */
   end;
   s = (AvgDistOut - AvgDistIn) / (AvgDistOut &amp;lt;&amp;gt; AvgDistIn);
   return s;
finish;
store module=(AvgDistSelf AvgDistOther Silhouette);
QUIT;

/* compute vector of silhouette statistics */
proc iml;
load module=(Silhouette);
varNames = {'SepalLength' 'SepalWidth' 'PetalLength' 'PetalWidth'};
use ClusOut;
   read all var varNames into X;
   read all var "Cluster" into ClusterID;
close;
silhouette = Silhouette(X, ClusterID);
create S var "Silhouette"; append; close; /* write silhouette values to data set */
QUIT;

/* The SilhouetteMergeSort macro
   1. Merges the original data and the silhouette statistics
   2. Computes the overall average silhouette measure and stores it in &amp;amp;SILH macro variable
   3. Sorts data by cluster and (descending) by silhouette value
   4. Adds an _ObsNum variable for ease of plotting the silhouette values as a bar chart
 
   Args: DSname      = Original data
         SilhDSName  = Output from PROC IML, containing silhouette values
         ClusName    = Name of indicator variable for cluster membership
         SilhVarName = Name of variable that contains silhouette values
         OutName     = Name of merged data set
*/
%macro SilhouetteMergeSort(DSname, SilhDSName, ClusName, SilhVarName, OutName);
   %GLOBAL Silh;
   data _temp; 
      merge &amp;amp;DSName &amp;amp;SilhDSName end=EOF; 
      _mean + &amp;amp;SilhVarName;
      if EOF then
         call symputx( "Silh", round(_mean/_N_, 0.001));
   run;
   proc sort data=_temp; by &amp;amp;ClusName  descending &amp;amp;SilhVarName; run;
   data &amp;amp;OutName / view=&amp;amp;OutName; set _temp; _ObsNum=_N_; label _ObsNum="Observation"; run;
%mend;

/* merge data and create macro variable */
%SilhouetteMergeSort(ClusOut, S, Cluster, Silhouette, _All);
%put &amp;amp;=Silh;

symbol1 value=circle height=5pt color=cx4c4c4c interpol=none;
symbol2 value=circle height=5pt color=cx4ca0e7 interpol=none;
symbol3 value=circle height=5pt color=cx4cd05c interpol=none;

axis1 label=(angle=90) minor=none;
axis2 minor=none;

legend1 position=top repeat=1;

title1 h=15pt font="albany amt"  "Silhouette Analysis of sashelp.iris data";
title2 h=12pt font="albany amt" "(SepalLength SepalWidth PetalLength PetalWidth)";

proc gplot data=_all;
plot petallength*sepallength=cluster / 
 vaxis=axis1 haxis=axis2 legend=legend1;
run;

options replace;
proc sort data=_all out=silplot;
by cluster silhouette;
run;
data silplot; set silplot;
ordervar+1;
run;
data silplot; set silplot;
output;
silhouette=0; output;
silhouette=.; output;
run;

symbol1 value=none height=5pt color=cx4c4c4c interpol=join;
symbol2 value=none height=5pt color=cx4ca0e7 interpol=join;
symbol3 value=none height=5pt color=cx4cd05c interpol=join;

axis1 label=(angle=90 "Cluster Label") value=none major=none minor=none;
axis2 label=('The Silhouette Coefficient Values') minor=none;

legend1 position=top;

proc gplot data=silplot;
plot ordervar*silhouette=cluster /
 vaxis=axis1 haxis=axis2 legend=legend1;

&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="sil1.png" style="width: 800px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/84482iC2160DB5AA5E478D/image-size/large?v=v2&amp;amp;px=999" role="button" title="sil1.png" alt="sil1.png" /&gt;&lt;/span&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="sil2.png" style="width: 800px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/84483i18F9C79B5735094A/image-size/large?v=v2&amp;amp;px=999" role="button" title="sil2.png" alt="sil2.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 30 May 2023 13:32:56 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/m-p/878229#M23872</guid>
      <dc:creator>GraphGuy</dc:creator>
      <dc:date>2023-05-30T13:32:56Z</dc:date>
    </item>
  </channel>
</rss>

