/* =====================================================================
   PART 2 : QUESTION 2
   Cluster analysis of visitor preferences (question a) and a
   multinomial logit that predicts cluster membership (question b).
   All analyst notes are kept inside block comments so that no free
   text is parsed as SAS statements.
   ===================================================================== */

/* ---------------------------- Question a ---------------------------- */

/* Working copy of the survey data. id = _N_ gives every observation a
   unique key that is carried through PROC CLUSTER and PROC TREE. */
data temp;
    set econ324.ps2q2;
    id = _N_;
run;

proc means data=temp;
    var ressext culture actext nightlife;
run;

/* Answer: the most important factor, based on the highest mean, is
   ressext. This means that the sample of 200 people from Utah value
   "the quality of outdoor resources" the most. */

/* Ward hierarchical clustering on the four preference scores.
   age and sex are copied along so they are still available for the
   question b model (assumes both columns exist in econ324.ps2q2, as the
   question b model requires them).
   ClusterHistory is the documented ODS table name for the clustering
   history; it is saved as CRITERIA for the plots further down. */
proc cluster data=temp method=ward outtree=temp1 nonorm rsquare ccc;
    var ressext culture actext nightlife;
    copy id ressext culture actext nightlife age sex;
    ods output ClusterHistory=criteria;
run;

/* We pick 3 clusters: the BSS at 3 clusters (54.99) is the last small
   value before the big jump at 2 clusters, where the distance between
   clusters is 628.79. The 3-cluster solution also gives the highest
   CCC (11.8). */

/* Step 1: cut the tree at 3 clusters.
   Only one OUT= data set is requested; the following step reads TEMP2. */
proc tree data=temp1 out=temp2 nclusters=3;
    id id;                                            /* Label observations */
    copy ressext culture actext nightlife age sex;    /* Carry variables    */
run;

/* Step 2: assign descriptive labels to each cluster.
   NOTE(review): the cluster numbers come from PROC TREE; confirm the
   labels against the per-cluster means printed in Step 3. */
data labeled_clusters;
    set temp2;
    length ClusterLabel $30;    /* Variable holding the cluster name */
    if cluster = 1 then ClusterLabel = "Outdoor Adventure Seekers";
    else if cluster = 2 then ClusterLabel = "Nature & Culture Enthusiasts";
    else if cluster = 3 then ClusterLabel = "Urban Culture Aficionados";
run;

/* Step 3: sort by cluster (optional) */
proc sort data=labeled_clusters out=sorted_clusters;
    by cluster;
run;

/* (Optional) summary statistics per cluster, shown with labels
   instead of cluster numbers */
proc means data=labeled_clusters n mean stddev min max;
    class ClusterLabel;
    var ressext culture actext nightlife;
run;

/* Graphs of RSQ, SPRSQ and CCC against the number of clusters.
   The graphs may not all be needed; the CCC plot (the last one) is the
   one to use to support the choice of 3 clusters. */
ods graphics on;

/* Plots one clustering criterion from CRITERIA for up to 30 clusters. */
%macro plot_criterion(yvar);
    proc sgplot data=criteria(where=(NumberOfClusters LE 30));
        series x=NumberOfClusters y=&yvar / markers
               markerattrs=(symbol=CircleFilled color=red);
    run;
%mend plot_criterion;

%plot_criterion(RSquared);
%plot_criterion(SemipartialRSq);
%plot_criterion(CubicClusCrit);

/* ---------------------------- Question b ---------------------------- */

/* Multinomial (generalized) logit of cluster membership on age and sex.
   - The model is fitted on LABELED_CLUSTERS, the data set that actually
     contains the CLUSTER variable together with age and sex.
   - The response does not go in the CLASS statement; its reference
     level is set on the MODEL statement.
   - LINK=GLOGIT treats the three clusters as unordered categories.
   NOTE(review): sex is assumed to be categorical, hence the CLASS
   statement; remove it if sex should enter as a continuous covariate. */
proc logistic data=labeled_clusters outmodel=model_out;
    class sex / param=ref;
    model cluster(ref='1') = age sex / link=glogit;
run;

/* Classify future visitors = make predictions from the stored model.
   NOTE(review): assumes the new-visitor data set lives in the econ324
   library next to ps2q2 and contains age and sex; adjust the libref if
   ps2q2d is stored elsewhere. */
proc logistic inmodel=model_out;
    score data=econ324.ps2q2d out=cluster_predictions;
run;

/* Pick the most likely cluster for each scored visitor.
   The SCORE statement names the predicted probabilities P_<level>, so
   with clusters 1-3 the variables are P_1, P_2 and P_3. (SCORE also
   writes the predicted level as the character variable I_cluster.)
   When every probability is missing (e.g. missing age or sex), no
   cluster is assigned. */
data cluster_assignments;
    set cluster_predictions;
    array probs{3} P_1 P_2 P_3;
    max_prob = max(of probs{*});
    cluster_assignment = .;
    if not missing(max_prob) then do i = 1 to dim(probs);
        if probs{i} = max_prob then cluster_assignment = i;
    end;
    drop i;
run;