如何并行运行 Elbow(肘部)方法以找到合适的聚类数 k
How to run parallel Elbow method to find appropriate k-clusters
数据框 "data.clustering" 的大小:943x2
> head(data.clustering)
age gender
2 2 1
3 6 2
4 2 1
5 2 1
6 6 2
7 6 1
当我使用 Elbow 方法寻找 k 值时:
# Choose a number of clusters with the elbow criterion.
#
# The pairwise distances of `mydata` are computed with dist() and clustered
# with hclust(); both objects are passed to css.hclust(), whose result goes to
# elbow.batch(). css.hclust() and elbow.batch() are not defined in this file
# (presumably they come from the GMD package -- confirm it is attached).
#
# mydata: data accepted by dist(), e.g. a numeric data frame or matrix.
#
# Returns the `k` component of the object produced by elbow.batch().
elbow.k <- function(mydata) {
  pairwise_dist <- dist(mydata)
  tree <- hclust(pairwise_dist)
  css_by_k <- css.hclust(pairwise_dist, tree)
  elbow_fit <- elbow.batch(css_by_k)
  elbow_fit$k
}
# find k value
# Time one serial run of elbow.k() on the full data set.
start.time <- Sys.time()
k.clusters <- elbow.k(data.clustering)
end.time <- Sys.time()
# `end.time - start.time` chooses its own units (secs, mins, ...), so ask for
# seconds explicitly to keep the number consistent with the printed label.
cat("Time to find k using Elbow method is",
    as.numeric(difftime(end.time, start.time, units = "secs")),
    "seconds with k value:", k.clusters)
The running time is very long:
Time to find k using Elbow method is 24.01472 seconds with k value: 10
谁能帮我在 R 中使用 parallel 包来缩短 Elbow 方法的运行时间?非常感谢。
您可以在 R 中使用 parallel 包。但是您必须使用 clusterEvalQ() 和 clusterExport() 将所需的包和变量导入各个工作进程的环境。
我认为您的代码如下所示:
library(parallel)
#
# Choose a number of clusters with the elbow criterion.
#
# The pairwise distances of `mydata` are computed with dist() and clustered
# with hclust(); both objects are passed to css.hclust(), whose result goes to
# elbow.batch(). css.hclust() and elbow.batch() are not defined in this file
# (presumably they come from the GMD package -- confirm it is attached).
#
# mydata: data accepted by dist(), e.g. a numeric data frame or matrix.
#
# Returns the `k` component of the object produced by elbow.batch().
elbow.k <- function(mydata) {
  pairwise_dist <- dist(mydata)
  tree <- hclust(pairwise_dist)
  css_by_k <- css.hclust(pairwise_dist, tree)
  elbow_fit <- elbow.batch(css_by_k)
  elbow_fit$k
}
# find k value
# Run elbow.k() on a socket cluster and report the elapsed time.
library(parallel)

no_cores <- detectCores()
if (is.na(no_cores)) {
  no_cores <- 1L  # detectCores() may return NA when the count is unknown
}
cl <- makeCluster(no_cores)

# Attach GMD on every worker so css.hclust()/elbow.batch() can be found there.
clusterEvalQ(cl, library(GMD))
# Only the anonymous function below is shipped to the workers; the objects it
# looks up by name (elbow.k and data.clustering) must be exported explicitly.
clusterExport(cl, c("elbow.k", "data.clustering"))

start.time <- Sys.time()
# NOTE(review): the input has a single element, so only one task is created
# and only one worker does any work.
# parLapply() is the alternative that returns a list.
k.clusters <- parSapply(cl, 1, function(x) elbow.k(data.clustering))
end.time <- Sys.time()

# Request seconds explicitly: a bare difftime picks its own units.
cat("Time to find k using Elbow method is",
    as.numeric(difftime(end.time, start.time, units = "secs")),
    "seconds with k value:", k.clusters)
stopCluster(cl)
这是一个使用 k-means 创建肘部图的共享内存并行示例。
library(parallel)
# Within-cluster sum of squares (WSS) for a set of candidate cluster counts.
#
# min_max: vector of cluster counts to evaluate with kmeans().
# frame:   data handed to apply() and kmeans().
#
# Returns a numeric vector indexed by cluster count. Element 1 starts as
# (nrow(frame) - 1) * sum of the per-column variances; for every k in min_max,
# element k is the summed `withinss` of kmeans(frame, centers = k), so a 1 in
# min_max overwrites element 1. Positions up to max(min_max) that are not in
# min_max stay NA. The RNG seed is reset to 42 on every call.
elbow <- function(min_max, frame) {
  set.seed(42)

  column_var <- apply(frame, 2, var)
  wss <- sum(column_var) * (nrow(frame) - 1)

  wss_for_k <- function(k) {
    fit <- kmeans(frame, centers = k, algorithm = "MacQueen")
    sum(fit$withinss)
  }

  for (k in min_max) {
    wss[k] <- wss_for_k(k)
  }
  wss
}
# Compute an elbow curve in parallel and draw it as a scree plot.
#
# The cluster counts 2:kmax are split into chunks of `cut_point` values and
# each chunk is evaluated by elbow() through mclapply().
#
# kmax:         largest number of clusters to evaluate (must be >= 2).
# frame_choice: data forwarded to elbow() as its `frame` argument.
#
# Returns (invisibly) the plotted vector: the value elbow() stores at index 1,
# followed by the value for each cluster count 2, ..., kmax.
parallel_elbow <- function(kmax, frame_choice) {
  stopifnot(is.numeric(kmax), length(kmax) == 1L, !is.na(kmax), kmax >= 2)

  # create separate kmin:kmax vectors
  cut_point <- 3
  centers_vec <- 2:kmax
  chunks <- split(centers_vec, ceiling(seq_along(centers_vec) / cut_point))

  # use shared-memory parallelism on function of choice
  results <- mclapply(chunks, FUN = elbow, frame = frame_choice)

  # mclapply() can hand back a "try-error" element for a failed job instead of
  # raising; stop here rather than plotting a corrupted curve.
  failed <- vapply(results, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop("elbow() failed for chunk(s): ",
         paste(names(results)[failed], collapse = ", "), call. = FALSE)
  }

  # gather the results of each parallel run
  # Every chunk result is indexed by cluster count, so pick the entries by
  # position. (Filtering the repeated first element by value, as setdiff()
  # does, would also drop any genuinely repeated WSS values.)
  first_value <- results[[1]][1]
  chunk_wss <- Map(function(wss, ks) wss[ks], results, chunks)
  final_vec <- c(first_value, unlist(chunk_wss, use.names = FALSE))

  # create scree plot of all wss values
  plot(seq_along(final_vec), final_vec, type = "b",
       xlab = "Number of Clusters", ylab = "Within groups sum of squares",
       pch = 16, main = "Elbow Plot", col = "steelblue")
  invisible(final_vec)
}
这样使用:
# Elbow plot for the iris data without its fifth column, up to 100 clusters.
parallel_elbow(kmax = 100, frame_choice = iris[, -5])
在包含 2176 个文档的文档-词项矩阵(document-term matrix)上比较运行时间:
system.time(elbow(1:10, dtm))
user system elapsed
83.130 1.450 84.843
system.time(parallel_elbow(10, dtm))
user system elapsed
21.097 0.653 48.132
橙色为普通(串行)版本,蓝色为并行版本。
数据框 "data.clustering" 的大小:943x2
> head(data.clustering)
age gender
2 2 1
3 6 2
4 2 1
5 2 1
6 6 2
7 6 1
当我使用 Elbow 方法寻找 k 值时:
# Choose a number of clusters with the elbow criterion.
#
# The pairwise distances of `mydata` are computed with dist() and clustered
# with hclust(); both objects are passed to css.hclust(), whose result goes to
# elbow.batch(). css.hclust() and elbow.batch() are not defined in this file
# (presumably they come from the GMD package -- confirm it is attached).
#
# mydata: data accepted by dist(), e.g. a numeric data frame or matrix.
#
# Returns the `k` component of the object produced by elbow.batch().
elbow.k <- function(mydata) {
  pairwise_dist <- dist(mydata)
  tree <- hclust(pairwise_dist)
  css_by_k <- css.hclust(pairwise_dist, tree)
  elbow_fit <- elbow.batch(css_by_k)
  elbow_fit$k
}
# find k value
# Time one serial run of elbow.k() on the full data set.
start.time <- Sys.time()
k.clusters <- elbow.k(data.clustering)
end.time <- Sys.time()
# `end.time - start.time` chooses its own units (secs, mins, ...), so ask for
# seconds explicitly to keep the number consistent with the printed label.
cat("Time to find k using Elbow method is",
    as.numeric(difftime(end.time, start.time, units = "secs")),
    "seconds with k value:", k.clusters)
The running time is very long:
Time to find k using Elbow method is 24.01472 seconds with k value: 10
谁能帮我在 R 中使用 parallel 包来缩短 Elbow 方法的运行时间?非常感谢。
您可以在 R 中使用 parallel 包。但是您必须使用 clusterEvalQ() 和 clusterExport() 将所需的包和变量导入各个工作进程的环境。
我认为您的代码如下所示:
library(parallel)
#
# Choose a number of clusters with the elbow criterion.
#
# The pairwise distances of `mydata` are computed with dist() and clustered
# with hclust(); both objects are passed to css.hclust(), whose result goes to
# elbow.batch(). css.hclust() and elbow.batch() are not defined in this file
# (presumably they come from the GMD package -- confirm it is attached).
#
# mydata: data accepted by dist(), e.g. a numeric data frame or matrix.
#
# Returns the `k` component of the object produced by elbow.batch().
elbow.k <- function(mydata) {
  pairwise_dist <- dist(mydata)
  tree <- hclust(pairwise_dist)
  css_by_k <- css.hclust(pairwise_dist, tree)
  elbow_fit <- elbow.batch(css_by_k)
  elbow_fit$k
}
# find k value
# Run elbow.k() on a socket cluster and report the elapsed time.
library(parallel)

no_cores <- detectCores()
if (is.na(no_cores)) {
  no_cores <- 1L  # detectCores() may return NA when the count is unknown
}
cl <- makeCluster(no_cores)

# Attach GMD on every worker so css.hclust()/elbow.batch() can be found there.
clusterEvalQ(cl, library(GMD))
# Only the anonymous function below is shipped to the workers; the objects it
# looks up by name (elbow.k and data.clustering) must be exported explicitly.
clusterExport(cl, c("elbow.k", "data.clustering"))

start.time <- Sys.time()
# NOTE(review): the input has a single element, so only one task is created
# and only one worker does any work.
# parLapply() is the alternative that returns a list.
k.clusters <- parSapply(cl, 1, function(x) elbow.k(data.clustering))
end.time <- Sys.time()

# Request seconds explicitly: a bare difftime picks its own units.
cat("Time to find k using Elbow method is",
    as.numeric(difftime(end.time, start.time, units = "secs")),
    "seconds with k value:", k.clusters)
stopCluster(cl)
这是一个使用 k-means 创建肘部图的共享内存并行示例。
library(parallel)
# Within-cluster sum of squares (WSS) for a set of candidate cluster counts.
#
# min_max: vector of cluster counts to evaluate with kmeans().
# frame:   data handed to apply() and kmeans().
#
# Returns a numeric vector indexed by cluster count. Element 1 starts as
# (nrow(frame) - 1) * sum of the per-column variances; for every k in min_max,
# element k is the summed `withinss` of kmeans(frame, centers = k), so a 1 in
# min_max overwrites element 1. Positions up to max(min_max) that are not in
# min_max stay NA. The RNG seed is reset to 42 on every call.
elbow <- function(min_max, frame) {
  set.seed(42)

  column_var <- apply(frame, 2, var)
  wss <- sum(column_var) * (nrow(frame) - 1)

  wss_for_k <- function(k) {
    fit <- kmeans(frame, centers = k, algorithm = "MacQueen")
    sum(fit$withinss)
  }

  for (k in min_max) {
    wss[k] <- wss_for_k(k)
  }
  wss
}
# Compute an elbow curve in parallel and draw it as a scree plot.
#
# The cluster counts 2:kmax are split into chunks of `cut_point` values and
# each chunk is evaluated by elbow() through mclapply().
#
# kmax:         largest number of clusters to evaluate (must be >= 2).
# frame_choice: data forwarded to elbow() as its `frame` argument.
#
# Returns (invisibly) the plotted vector: the value elbow() stores at index 1,
# followed by the value for each cluster count 2, ..., kmax.
parallel_elbow <- function(kmax, frame_choice) {
  stopifnot(is.numeric(kmax), length(kmax) == 1L, !is.na(kmax), kmax >= 2)

  # create separate kmin:kmax vectors
  cut_point <- 3
  centers_vec <- 2:kmax
  chunks <- split(centers_vec, ceiling(seq_along(centers_vec) / cut_point))

  # use shared-memory parallelism on function of choice
  results <- mclapply(chunks, FUN = elbow, frame = frame_choice)

  # mclapply() can hand back a "try-error" element for a failed job instead of
  # raising; stop here rather than plotting a corrupted curve.
  failed <- vapply(results, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop("elbow() failed for chunk(s): ",
         paste(names(results)[failed], collapse = ", "), call. = FALSE)
  }

  # gather the results of each parallel run
  # Every chunk result is indexed by cluster count, so pick the entries by
  # position. (Filtering the repeated first element by value, as setdiff()
  # does, would also drop any genuinely repeated WSS values.)
  first_value <- results[[1]][1]
  chunk_wss <- Map(function(wss, ks) wss[ks], results, chunks)
  final_vec <- c(first_value, unlist(chunk_wss, use.names = FALSE))

  # create scree plot of all wss values
  plot(seq_along(final_vec), final_vec, type = "b",
       xlab = "Number of Clusters", ylab = "Within groups sum of squares",
       pch = 16, main = "Elbow Plot", col = "steelblue")
  invisible(final_vec)
}
这样使用:
# Elbow plot for the iris data without its fifth column, up to 100 clusters.
parallel_elbow(kmax = 100, frame_choice = iris[, -5])
在包含 2176 个文档的文档-词项矩阵(document-term matrix)上比较运行时间:
system.time(elbow(1:10, dtm))
user system elapsed
83.130 1.450 84.843
system.time(parallel_elbow(10, dtm))
user system elapsed
21.097 0.653 48.132
橙色为普通(串行)版本,蓝色为并行版本。