Hierarchical cluster analysis
library(tidyverse)
Prepare the data for analysis: datasets::USArrests
library(datasets)
df <- datasets::USArrests
head(df)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
# check if any 'NA' values are present in the data
any(is.na(df))
## [1] FALSE
# remove rows with 'NA' values if necessary
df <- na.omit(df)
# standardize: center each variable and scale to unit variance
df <- scale(df)
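As a quick sanity check, after scaling every column should have mean (approximately) zero and standard deviation one:
# verify the standardization
round(colMeans(df), 2)
apply(df, 2, sd)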
Choosing the optimal number of clusters
Elbow method
Plot wss ~ k, where k is the number of clusters and wss is the total within-cluster sum of squares.
wss <- (nrow(df)-1)*sum(apply(df, 2, var))  # total within-cluster SS for k = 1
# kmeans uses random starting centers, so fix the seed for reproducibility
set.seed(123)
for (i in 2:15) wss[i] <- sum(kmeans(df, centers=i)$withinss)
plot(1:15, wss, type="b",
     xlab="Number of Clusters",
     ylab="Within groups sum of squares")
The plot shows an elbow at around four clusters, which suggests that 4 is the optimal number for this dataset.
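As an optional cross-check, the average silhouette method should suggest a similar number of clusters. A minimal sketch, assuming the factoextra package (not used elsewhere in this post) is installed:
# cross-check the elbow result with the average silhouette method
library(factoextra)
fviz_nbclust(df, hcut, method = "silhouette")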
Choosing the best clustering model
# agglomerative clustering with complete linkage
hc2 <- cluster::agnes(df, method='complete')
# agglomerative coefficient
hc2$ac
## [1] 0.8531583
This is the agglomerative coefficient, which measures the strength of the clustering structure found; values closer to 1 indicate stronger structure.
Using this metric, we can try several linkage methods and choose the best one.
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
# function to compute coefficient
ac <- function(x) { cluster::agnes(df, method = x)$ac }
purrr::map_dbl(m, ac)
## average single complete ward
## 0.7379371 0.6276128 0.8531583 0.9346210
As we can see, the 'ward' method gives the strongest clustering structure.
Performing the hierarchical cluster analysis
Let’s first split the data into clusters using the stats::hclust
function.
# calculate distances
d <- dist(df, method='euclidean')
# hierarchical cluster analysis
# 'ward.D2' method is equivalent of agnes 'ward'
hc1 <- hclust(d, method='ward.D2')
# Plot the obtained dendrogram
plot(hc1, hang = -1, cex = 0.6)
# highlight the 4 clusters on the dendrogram
rect.hclust(hc1, k=4, border="blue")
# group the data by cluster membership (here the tree is cut into 3 groups)
groups <- cutree(hc1, k=3)
names(groups[groups == 1])
## [1] "Alabama" "Alaska" "Arizona" "California"
## [5] "Colorado" "Florida" "Georgia" "Illinois"
## [9] "Louisiana" "Maryland" "Michigan" "Mississippi"
## [13] "Nevada" "New Mexico" "New York" "North Carolina"
## [17] "South Carolina" "Tennessee" "Texas"
# inspect the components stored in the hclust object
names(hc1)
## [1] "merge" "height" "order" "labels" "method"
## [6] "call" "dist.method"
Now we can split the data into 4 groups using the cluster::agnes
function.
# use 'agnes' for hierarchical clustering with ward linkage
hc3 <- cluster::agnes(df, method='ward')
# plot the dendrogram
cluster::pltree(hc3, hang = -1, cex = 0.6)
# split into groups
groups <- cutree(as.hclust(hc3), k = 4)
groups[groups==1]
## Alabama Georgia Louisiana Mississippi North Carolina
## 1 1 1 1 1
## South Carolina Tennessee
## 1 1
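Finally, we can verify the earlier claim that hclust's 'ward.D2' is equivalent to agnes' 'ward' by cross-tabulating the two 4-cluster partitions; if the trees agree, every state lands in the same group under both cuts and each row of the table has a single non-zero count:
# cross-tabulate the hclust ('ward.D2') and agnes ('ward') partitions
hclust_groups <- cutree(hc1, k = 4)
agnes_groups <- cutree(as.hclust(hc3), k = 4)
table(hclust_groups, agnes_groups)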