Here's one way to do it, using the code Rick Wicklin shared to calculate the silhouette values, and then using some SAS/Graph code to plot the graphs (I tried to keep the graph code minimal/simple - you could enhance it a bit by annotating cluster numbers).
/*
To solve request:
https://communities.sas.com/t5/Graphics-Programming/CLUSTER-ANALYSIS-Silhouette-plots/td-p/421768
Using Rick Wicklin's code to calculate the silhouette statistic:
https://blogs.sas.com/content/iml/2023/05/17/compute-silhouette-sas.html
Using Robert Allison's SAS/Graph code to plot the results.
*/
/* Create scatter and silhouette plots for the iris data */
/* k-means cluster analysis */
proc fastclus data=Sashelp.Iris maxclusters=3 out=ClusOut noprint;
var SepalLength SepalWidth PetalLength PetalWidth;
run;
/* Implement the silhouette statistics for n points in k clusters. The silhouette statistic is defined in
Peter Rousseeuw (1987) "Silhouettes: A graphical aid to the interpretation
and validation of cluster analysis", https://doi.org/10.1016/0377-0427(87)90125-7
*/
proc iml;
/* AvgDistSelf: For each point, p, in a cluster this function returns the
average distance between p and other points in the same cluster.
*/
start AvgDistSelf(Y);
n = nrow(Y);
if n=1 then return(0); /* convention */
D = distance(Y); /* D[i,j] is distance from Y[i,] to Y[j,] */
AvgDist = D[ ,+] / (n-1); /* exclude D[i,i] */
return AvgDist;
finish;
/* AvgDistOther: Let Y contains points in one cluster and Z contain
points in a different cluster. For each point p in Y, this function
returns the average distance between p and the points in Z.
*/
start AvgDistOther(Y, Z);
D = distance(Y,Z); /* D[i,j] is distance from Y[i,] to Z[j,] */
return( D[ ,:] ); /* average of the D[i,j] */
finish;
/* Silhouette: The main computational routine. For each point, p:
1. Define AvgDistIn = average distance between p and other points in the same cluster.
Note: The divisor for this cluster is N-1, not N.
2. Find average distance between p and points in each of the other clusters.
Define AvgDistOut to be the MINIMUM of the average distance.
3. Define s(p) = (AvgDistOut(p) - AvgDistIn(p)) / max(AvgDistOut(p), AvgDistIn(p))
Input: X is an (n x d) matrix of observations
ClusterID is an (n x 1) vector that associates each point to a cluster. Typically,
the values are in the range {1,2,...,k}, where k is the total number of clusters.
Output: S is an (n x 1) vector that contains the silhouette statistic for each point in X.
*/
start Silhouette(X, ClusterID);
n = nrow(X);
AvgDistIn = j(n, 1, .);
AvgDistOut = j(n, 1, .);
/* Use the Unique-Loc technique to iterate over clusterID:
https://blogs.sas.com/content/iml/2011/11/01/the-unique-loc-trick-a-real-treat.html */
uID = unique(ClusterID);
k = ncol(uID);
do i = 1 to k;
idxIn = loc(ClusterID = uID[i]);
Y = X[idxIn, ]; /* obs in the i_th cluster */
v = AvgDistSelf(Y); /* average inter-cluster distance */
AvgDistIn[idxIn,] = v;
dMin = j(ncol(idxIn), 1, constant('BIG')); /* retain the min distances */
h = setdif(1:k, i); /* indices for the other clusters */
do j = 1 to ncol(h); /* loop over the other clusters */
jdx = loc(ClusterID = uID[h[j]]);
Z = X[jdx, ]; /* obs in the j_th cluster, j ^= i */
v = AvgDistOther(Y, Z); /* average between cluster distances */
dMin = (dMin >< v); /* retain the elementwise minimum */
end;
AvgDistOut[idxIn,] = dMin; /* min average distance to another cluster */
end;
s = (AvgDistOut - AvgDistIn) / (AvgDistOut <> AvgDistIn);
return s;
finish;
store module=(AvgDistSelf AvgDistOther Silhouette);
QUIT;
/* compute vector of silhouette statistics */
proc iml;
load module=(Silhouette);
varNames = {'SepalLength' 'SepalWidth' 'PetalLength' 'PetalWidth'};
use ClusOut;
read all var varNames into X;
read all var "Cluster" into ClusterID;
close;
silhouette = Silhouette(X, ClusterID);
create S var "Silhouette"; append; close; /* write silhouette values to data set */
QUIT;
/* The SilhouetteMergeSort macro
1. Merges the original data and the silhouette statistics
2. Computes the overall average silhouette measure and stores it in &SILH macro variable
3. Sorts data by cluster and (descending) by silhouette value
4. Adds an _ObsNum variable for ease of plotting the silhouette values as a bar chart
Args: DSname = Original data
SilhDSName = Output from PROC IML, containing silhouette values
ClusName = Name of indicator variable for cluster membership
SilhVarName = Name of variable that contains silhouette values
OutName = Name of merged data set
*/
%macro SilhouetteMergeSort(DSname, SilhDSName, ClusName, SilhVarName, OutName);
%GLOBAL Silh;
data _temp;
merge &DSName &SilhDSName end=EOF;
_mean + &SilhVarName;
if EOF then
call symputx( "Silh", round(_mean/_N_, 0.001));
run;
proc sort data=_temp; by &ClusName descending &SilhVarName; run;
data &OutName / view=&OutName; set _temp; _ObsNum=_N_; label _ObsNum="Observation"; run;
%mend;
/* merge data and create macro variable */
%SilhouetteMergeSort(ClusOut, S, Cluster, Silhouette, _All);
%put &=Silh;
symbol1 value=circle height=5pt color=cx4c4c4c interpol=none;
symbol2 value=circle height=5pt color=cx4ca0e7 interpol=none;
symbol3 value=circle height=5pt color=cx4cd05c interpol=none;
axis1 label=(angle=90) minor=none;
axis2 minor=none;
legend1 position=top repeat=1;
title1 h=15pt font="albany amt" "Silhouette Analysis of sashelp.iris data";
title2 h=12pt font="albany amt" "(SepalLength SepalWidth PetalLength PetalWidth)";
proc gplot data=_all;
plot petallength*sepallength=cluster /
vaxis=axis1 haxis=axis2 legend=legend1;
run;
options replace;
proc sort data=_all out=silplot;
by cluster silhouette;
run;
data silplot; set silplot;
ordervar+1;
run;
data silplot; set silplot;
output;
silhouette=0; output;
silhouette=.; output;
run;
symbol1 value=none height=5pt color=cx4c4c4c interpol=join;
symbol2 value=none height=5pt color=cx4ca0e7 interpol=join;
symbol3 value=none height=5pt color=cx4cd05c interpol=join;
axis1 label=(angle=90 "Cluster Label") value=none major=none minor=none;
axis2 label=('The Silhouette Coefficient Values') minor=none;
legend1 position=top;
proc gplot data=silplot;
plot ordervar*silhouette=cluster /
vaxis=axis1 haxis=axis2 legend=legend1;
... View more