R中列表中元素的快速组合
Fast combination of elements in a list in R
我正在尝试找到一种有效的方法来提取存储在列表中的所有向量成对组合。此代码的目的是在具有 100000 次迭代的重采样过程中计算数据向量所有可能成对组合的平均相关性。
# Data simulation: 33 subjects, each with 640 uniform observations
set.seed(90)
dummy_data <- matrix(runif(21120), 33, 640)
# Split the matrix into a list of row vectors, one element per subject
dummy_list <- lapply(seq_len(nrow(dummy_data)), function(row_idx) {
  dummy_data[row_idx, ]
})
目前我找到的解决方案是:
n_iter <- 100000
# Preallocate one mean correlation per iteration
cor_out <- vector("numeric", length = n_iter)

# combn() is deterministic and the number of subjects never changes,
# so the pairwise index combinations are loop-invariant: compute once.
n_subj <- length(dummy_list)
combs <- combn(n_subj, 2)
n_pairs <- ncol(combs)

# For each resampling iteration
for (z in seq_len(n_iter)) {
  # Randomly permute each subject's 640 observations
  # (sample without replacement over the whole vector)
  dummy_list_resample <- lapply(dummy_list, FUN = function(x) {
    sample(x, size = 640)
  })
  # Correlation for each pairwise combination, computed directly from
  # the two vectors — no intermediate cbind matrices. vapply pins the
  # result to numeric(1) per pair (safer than sapply).
  cor_iter <- vapply(
    seq_len(n_pairs),
    function(i) {
      cor(dummy_list_resample[[combs[1, i]]],
          dummy_list_resample[[combs[2, i]]])
    },
    numeric(1)
  )
  # Store the average pairwise correlation for this iteration
  cor_out[z] <- mean(cor_iter)
}
我发现它效率很低,因为它真的很慢(~12 小时的计算)
有没有办法避免循环?我知道 Rcpp 是加速迭代的方法,但不幸的是我不熟悉 C++。任何提示或示例将不胜感激。
您可以将所有向量连接成一个矩阵;cor 函数可以直接接受一个矩阵,并一次性计算所有列之间的成对相关性。
您原来的做法:
# Original (list-based) approach: after resampling, compute cor()
# separately for every pair of subjects.
#
# seed: RNG seed so repeated calls are reproducible and comparable
#       with mat_cor().
# Returns the mean of all pairwise correlations (numeric scalar).
# Note: reads `dummy_list` from the enclosing environment.
list_cor <- function(seed=1) {
  set.seed(seed)
  # Permute each subject's 640 observations
  dummy_list_resample <- lapply(dummy_list, FUN = function(x) {
    sample(x, size = 640)
  })
  # All pairwise index combinations (2 x n_pairs matrix)
  combs <- combn(length(dummy_list_resample), 2)
  # One correlation per pair; vapply guarantees a numeric vector
  # (sapply's return type silently depends on its input), and the
  # vectors are correlated directly without building cbind matrices.
  cor_iter <- vapply(
    seq_len(ncol(combs)),
    function(i) {
      cor(dummy_list_resample[[combs[1, i]]],
          dummy_list_resample[[combs[2, i]]])
    },
    numeric(1)
  )
  # Average pairwise correlation
  mean(cor_iter)
}
矩阵方法:
# Matrix approach: bind all resampled subjects into one 640 x 33 matrix
# and let cor() compute every pairwise column correlation in one call.
#
# seed: RNG seed, so results are reproducible and match list_cor(seed).
# Returns the mean of all pairwise correlations (numeric scalar).
# Note: reads `dummy_list` from the enclosing environment.
mat_cor <- function(seed=1) {
  set.seed(seed)
  # Permute each subject's observations, exactly as in the list approach
  resampled <- lapply(dummy_list, function(subject) sample(subject, size = 640))
  # One column per subject; cor() on a matrix yields the full 33 x 33
  # correlation matrix
  cmat <- cor(do.call(cbind, resampled))
  # Each pair appears exactly once in the lower triangle (diagonal excluded)
  mean(cmat[lower.tri(cmat)])
}
速度测试:
# Time 10 resampling iterations per expression; microbenchmark repeats
# each expression times=10 and reports the timing distribution below.
library(microbenchmark)
microbenchmark(sapply(1:10, mat_cor), sapply(1:10, list_cor), times=10)
Unit: milliseconds
expr min lq mean median uq max neval cld
sapply(1:10, mat_cor) 17.7916 19.00319 20.43652 20.68327 21.89248 22.72629 10 a
sapply(1:10, list_cor) 609.1673 622.57560 631.03171 628.26800 633.77480 673.58373 10 b
提速约 31.5 倍。
您可以检查结果是否相同(由于浮点精度,存在非常小的无关差异):
> mat_cor(1)
[1] 3.210217e-05
> list_cor(1)
[1] 3.210217e-05
如果在此之后您仍然需要更快的速度,我建议接下来查看 parallel 包(R 自带的并行计算包)。
我正在尝试找到一种有效的方法来提取存储在列表中的所有向量成对组合。此代码的目的是在具有 100000 次迭代的重采样过程中计算数据向量所有可能成对组合的平均相关性。
# Data simulation: 33 subjects, each with 640 uniform observations
set.seed(90)
dummy_data <- matrix(runif(21120), 33, 640)
# Split the matrix into a list of row vectors, one element per subject
dummy_list <- lapply(seq_len(nrow(dummy_data)), function(row_idx) {
  dummy_data[row_idx, ]
})
目前我找到的解决方案是:
n_iter <- 100000
# Preallocate one mean correlation per iteration
cor_out <- vector("numeric", length = n_iter)

# combn() is deterministic and the number of subjects never changes,
# so the pairwise index combinations are loop-invariant: compute once.
n_subj <- length(dummy_list)
combs <- combn(n_subj, 2)
n_pairs <- ncol(combs)

# For each resampling iteration
for (z in seq_len(n_iter)) {
  # Randomly permute each subject's 640 observations
  # (sample without replacement over the whole vector)
  dummy_list_resample <- lapply(dummy_list, FUN = function(x) {
    sample(x, size = 640)
  })
  # Correlation for each pairwise combination, computed directly from
  # the two vectors — no intermediate cbind matrices. vapply pins the
  # result to numeric(1) per pair (safer than sapply).
  cor_iter <- vapply(
    seq_len(n_pairs),
    function(i) {
      cor(dummy_list_resample[[combs[1, i]]],
          dummy_list_resample[[combs[2, i]]])
    },
    numeric(1)
  )
  # Store the average pairwise correlation for this iteration
  cor_out[z] <- mean(cor_iter)
}
我发现它效率很低,因为它真的很慢(~12 小时的计算)
有没有办法避免循环?我知道 Rcpp 是加速迭代的方法,但不幸的是我不熟悉 C++。任何提示或示例将不胜感激。
您可以将所有向量连接成一个矩阵;cor 函数可以直接接受一个矩阵,并一次性计算所有列之间的成对相关性。
您原来的做法:
# Original (list-based) approach: after resampling, compute cor()
# separately for every pair of subjects.
#
# seed: RNG seed so repeated calls are reproducible and comparable
#       with mat_cor().
# Returns the mean of all pairwise correlations (numeric scalar).
# Note: reads `dummy_list` from the enclosing environment.
list_cor <- function(seed=1) {
  set.seed(seed)
  # Permute each subject's 640 observations
  dummy_list_resample <- lapply(dummy_list, FUN = function(x) {
    sample(x, size = 640)
  })
  # All pairwise index combinations (2 x n_pairs matrix)
  combs <- combn(length(dummy_list_resample), 2)
  # One correlation per pair; vapply guarantees a numeric vector
  # (sapply's return type silently depends on its input), and the
  # vectors are correlated directly without building cbind matrices.
  cor_iter <- vapply(
    seq_len(ncol(combs)),
    function(i) {
      cor(dummy_list_resample[[combs[1, i]]],
          dummy_list_resample[[combs[2, i]]])
    },
    numeric(1)
  )
  # Average pairwise correlation
  mean(cor_iter)
}
矩阵方法:
# Matrix approach: bind all resampled subjects into one 640 x 33 matrix
# and let cor() compute every pairwise column correlation in one call.
#
# seed: RNG seed, so results are reproducible and match list_cor(seed).
# Returns the mean of all pairwise correlations (numeric scalar).
# Note: reads `dummy_list` from the enclosing environment.
mat_cor <- function(seed=1) {
  set.seed(seed)
  # Permute each subject's observations, exactly as in the list approach
  resampled <- lapply(dummy_list, function(subject) sample(subject, size = 640))
  # One column per subject; cor() on a matrix yields the full 33 x 33
  # correlation matrix
  cmat <- cor(do.call(cbind, resampled))
  # Each pair appears exactly once in the lower triangle (diagonal excluded)
  mean(cmat[lower.tri(cmat)])
}
速度测试:
# Time 10 resampling iterations per expression; microbenchmark repeats
# each expression times=10 and reports the timing distribution below.
library(microbenchmark)
microbenchmark(sapply(1:10, mat_cor), sapply(1:10, list_cor), times=10)
Unit: milliseconds
expr min lq mean median uq max neval cld
sapply(1:10, mat_cor) 17.7916 19.00319 20.43652 20.68327 21.89248 22.72629 10 a
sapply(1:10, list_cor) 609.1673 622.57560 631.03171 628.26800 633.77480 673.58373 10 b
提速约 31.5 倍。
您可以检查结果是否相同(由于浮点精度,存在非常小的无关差异):
> mat_cor(1)
[1] 3.210217e-05
> list_cor(1)
[1] 3.210217e-05
如果在此之后您仍然需要更快的速度,我建议接下来查看 parallel 包(R 自带的并行计算包)。