factoextra::fviz_gap_stat() 与 factoextra::fviz_nbclust(df, method = "gap_stat")
factoextra::fviz_gap_stat() versus factoextra::fviz_nbclust(df, method = "gap_stat")
我想弄清楚:为什么 factoextra 包中的这两个函数在参数看似相同(例如 kmeans、gap_stat、k.max 和 B)的情况下,却产生了不同的结果。
library(cluster)
library(cluster.datasets)
library(tidyverse)
library(factoextra)
# load data and scale it
data("all.mammals.milk.1956")
mammals <- all.mammals.milk.1956 %>% select(-name)
mammals_scaled <- scale(mammals)
第一种方法使用 cluster::clusGap() 和 factoextra::fviz_gap_stat():
gap_stat <- clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
第二种方法使用 factoextra::fviz_nbclust():
fviz_nbclust(mammals_scaled, kmeans, method = "gap_stat", k.max = 24, nboot = 50) + theme_minimal() + ggtitle("fviz_nbClust_gap_stat: Gap Statistic")
我原以为差异可能来自 clusGap() 的 nstart 选项,但是当我使用 jimhester/lookup(代码如下)阅读 fviz_nbclust() 的源代码时,却找不到问题所在:
devtools::install_github("jimhester/lookup")
lookup::lookup(fviz_nbclust)
function (x, FUNcluster = NULL, method = c("silhouette", "wss",
"gap_stat"), diss = NULL, k.max = 10, nboot = 100, verbose = interactive(),
barfill = "steelblue", barcolor = "steelblue", linecolor = "steelblue",
print.summary = TRUE, ...)
{
set.seed(123)
if (k.max < 2)
stop("k.max must bet > = 2")
method = match.arg(method)
if (!inherits(x, c("data.frame", "matrix")) & !("Best.nc" %in%
names(x)))
stop("x should be an object of class matrix/data.frame or ",
"an object created by the function NbClust() [NbClust package].")
if (inherits(x, "list") & "Best.nc" %in% names(x)) {
best_nc <- x$Best.nc
if (class(best_nc) == "numeric")
print(best_nc)
else if (class(best_nc) == "matrix")
.viz_NbClust(x, print.summary, barfill, barcolor)
}
else if (is.null(FUNcluster))
stop("The argument FUNcluster is required. ", "Possible values are kmeans, pam, hcut, clara, ...")
else if (method %in% c("silhouette", "wss")) {
if (is.data.frame(x))
x <- as.matrix(x)
if (is.null(diss))
diss <- stats::dist(x)
v <- rep(0, k.max)
if (method == "silhouette") {
for (i in 2:k.max) {
clust <- FUNcluster(x, i, ...)
v[i] <- .get_ave_sil_width(diss, clust$cluster)
}
}
else if (method == "wss") {
for (i in 1:k.max) {
clust <- FUNcluster(x, i, ...)
v[i] <- .get_withinSS(diss, clust$cluster)
}
}
df <- data.frame(clusters = as.factor(1:k.max), y = v)
ylab <- "Total Within Sum of Square"
if (method == "silhouette")
ylab <- "Average silhouette width"
p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1,
color = linecolor, ylab = ylab, xlab = "Number of clusters k",
main = "Optimal number of clusters")
if (method == "silhouette")
p <- p + geom_vline(xintercept = which.max(v), linetype = 2,
color = linecolor)
return(p)
}
else if (method == "gap_stat") {
extra_args <- list(...)
gap_stat <- cluster::clusGap(x, FUNcluster, K.max = k.max,
B = nboot, verbose = verbose, ...)
if (!is.null(extra_args$maxSE))
maxSE <- extra_args$maxSE
else maxSE <- list(method = "firstSEmax", SE.factor = 1)
p <- fviz_gap_stat(gap_stat, linecolor = linecolor,
maxSE = maxSE)
return(p)
}
}
区别就在 fviz_nbclust 函数的开头。在第 6 行中设置了随机种子:
set.seed(123)
因为 kmeans 算法使用随机的初始聚类中心,所以重复运行时结果可能不同。例如,我用您的数据分别配合两个不同的随机种子运行,得到的结果略有不同。
set.seed(123)
gap_stat <- cluster::clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
seed 123 gap stat
set.seed(42)
gap_stat <- cluster::clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
seed 42 gap stat
我不完全确定为什么使用种子 123 时结果仍然不一样,但我认为这与以下事实有关:在我的代码中,set.seed() 紧接在 clusGap() 调用之前执行;而在 fviz_nbclust 中,set.seed() 与 clusGap() 之间还执行了其他几条命令。
我想弄清楚:为什么 factoextra 包中的这两个函数在参数看似相同(例如 kmeans、gap_stat、k.max 和 B)的情况下,却产生了不同的结果。
library(cluster)
library(cluster.datasets)
library(tidyverse)
library(factoextra)
# load data and scale it
data("all.mammals.milk.1956")
mammals <- all.mammals.milk.1956 %>% select(-name)
mammals_scaled <- scale(mammals)
第一种方法使用 cluster::clusGap() 和 factoextra::fviz_gap_stat():
gap_stat <- clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
第二种方法使用 factoextra::fviz_nbclust():
fviz_nbclust(mammals_scaled, kmeans, method = "gap_stat", k.max = 24, nboot = 50) + theme_minimal() + ggtitle("fviz_nbClust_gap_stat: Gap Statistic")
我原以为差异可能来自 clusGap() 的 nstart 选项,但是当我使用 jimhester/lookup(代码如下)阅读 fviz_nbclust() 的源代码时,却找不到问题所在:
devtools::install_github("jimhester/lookup")
lookup::lookup(fviz_nbclust)
function (x, FUNcluster = NULL, method = c("silhouette", "wss",
"gap_stat"), diss = NULL, k.max = 10, nboot = 100, verbose = interactive(),
barfill = "steelblue", barcolor = "steelblue", linecolor = "steelblue",
print.summary = TRUE, ...)
{
set.seed(123)
if (k.max < 2)
stop("k.max must bet > = 2")
method = match.arg(method)
if (!inherits(x, c("data.frame", "matrix")) & !("Best.nc" %in%
names(x)))
stop("x should be an object of class matrix/data.frame or ",
"an object created by the function NbClust() [NbClust package].")
if (inherits(x, "list") & "Best.nc" %in% names(x)) {
best_nc <- x$Best.nc
if (class(best_nc) == "numeric")
print(best_nc)
else if (class(best_nc) == "matrix")
.viz_NbClust(x, print.summary, barfill, barcolor)
}
else if (is.null(FUNcluster))
stop("The argument FUNcluster is required. ", "Possible values are kmeans, pam, hcut, clara, ...")
else if (method %in% c("silhouette", "wss")) {
if (is.data.frame(x))
x <- as.matrix(x)
if (is.null(diss))
diss <- stats::dist(x)
v <- rep(0, k.max)
if (method == "silhouette") {
for (i in 2:k.max) {
clust <- FUNcluster(x, i, ...)
v[i] <- .get_ave_sil_width(diss, clust$cluster)
}
}
else if (method == "wss") {
for (i in 1:k.max) {
clust <- FUNcluster(x, i, ...)
v[i] <- .get_withinSS(diss, clust$cluster)
}
}
df <- data.frame(clusters = as.factor(1:k.max), y = v)
ylab <- "Total Within Sum of Square"
if (method == "silhouette")
ylab <- "Average silhouette width"
p <- ggpubr::ggline(df, x = "clusters", y = "y", group = 1,
color = linecolor, ylab = ylab, xlab = "Number of clusters k",
main = "Optimal number of clusters")
if (method == "silhouette")
p <- p + geom_vline(xintercept = which.max(v), linetype = 2,
color = linecolor)
return(p)
}
else if (method == "gap_stat") {
extra_args <- list(...)
gap_stat <- cluster::clusGap(x, FUNcluster, K.max = k.max,
B = nboot, verbose = verbose, ...)
if (!is.null(extra_args$maxSE))
maxSE <- extra_args$maxSE
else maxSE <- list(method = "firstSEmax", SE.factor = 1)
p <- fviz_gap_stat(gap_stat, linecolor = linecolor,
maxSE = maxSE)
return(p)
}
}
区别就在 fviz_nbclust 函数的开头。在第 6 行中设置了随机种子:
set.seed(123)
因为 kmeans 算法使用随机的初始聚类中心,所以重复运行时结果可能不同。例如,我用您的数据分别配合两个不同的随机种子运行,得到的结果略有不同。
set.seed(123)
gap_stat <- cluster::clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
seed 123 gap stat
set.seed(42)
gap_stat <- cluster::clusGap(mammals_scaled, FUN = kmeans, K.max = 24, B = 50)
fviz_gap_stat(gap_stat) + theme_minimal() + ggtitle("fviz_gap_stat: Gap Statistic")
seed 42 gap stat
我不完全确定为什么使用种子 123 时结果仍然不一样,但我认为这与以下事实有关:在我的代码中,set.seed() 紧接在 clusGap() 调用之前执行;而在 fviz_nbclust 中,set.seed() 与 clusGap() 之间还执行了其他几条命令。