在 R 中有效地应用 sample()
Efficiently apply sample() in R
我需要在给定具有逐行结果概率的矩阵的情况下对结果变量进行采样。
set.seed(1010) #reproducibility
#create a matrix of probabilities
#three possible outcomes, 10.000 cases
probabilities <- matrix(runif(10000*3),nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
我能想出的最快方法是结合使用 apply() 和 sample()。
#row-wise sampling using these probabilities
classification <- apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))
但是,在我所做的事情中,这是计算瓶颈。您知道如何加速此代码/如何更有效地采样吗?
谢谢!
如果你愿意把probabilities
放在list
里,purrr::map
或lapply
似乎快一点:
probabilities <- matrix(runif(10000*3),nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
probabilities_list <- split(probabilities, seq(nrow(probabilities)))
library(purrr)
set.seed(1010)
classification_list <- map(probabilities_list, function(x) sample(1:3, 1, prob = x))
set.seed(1010)
classification_list <- lapply(probabilities_list, function(x) sample(1:3, 1, prob = x))
基准测试:
microbenchmark::microbenchmark(
apply = {classification = apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))},
map = {classification = map(probabilities_list, function(x) sample(1:3, 1, prob = x))},
lapply = {classification = lapply(probabilities_list, function(x) sample(1:3, 1, prob = x))},
times = 100
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# apply 39.92883 42.59249 48.39247 45.03080 47.86648 94.39828 100
# map 35.54077 37.13866 42.19719 39.95046 41.56323 66.05167 100
#lapply 34.54861 36.48664 42.69512 39.20139 52.31494 59.29200 100
100.000
例
# Unit: milliseconds
# expr min lq mean median uq max neval
# apply 457.5310 520.4926 572.5974 552.1674 611.5640 957.3997 100
# map 391.4751 457.7326 488.3286 482.1459 512.2054 899.1380 100
#lapply 386.2698 443.6732 491.9957 475.4160 507.3677 868.6725 100
RLave's comment that Rcpp
could be the way to go is spot on (you also need RcppArmadillo
对于 sample()
);我使用以下 C++ 代码创建了这样一个函数:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadilloExtensions/sample.h>
using namespace Rcpp;
// [[Rcpp::export]]
IntegerVector sample_matrix(NumericMatrix x, IntegerVector choice_set) {
int n = x.nrow();
IntegerVector result(n);
for ( int i = 0; i < n; ++i ) {
result[i] = RcppArmadillo::sample(choice_set, 1, false, x(i, _))[0];
}
return result;
}
然后我通过
在我的 R 会话中提供了该功能
Rcpp::sourceCpp("sample_matrix.cpp")
现在我们可以根据您的初始方法以及使用 purrr::map()
和 lapply()
的其他建议在 R 中对其进行测试:
set.seed(1010) #reproducibility
#create a matrix of probabilities
#three possible outcomes, 10.000 cases
probabilities <- matrix(runif(10000*3),nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
probabilities_list <- split(probabilities, seq(nrow(probabilities)))
library(purrr)
library(microbenchmark)
microbenchmark(
apply = apply(probabilities, 1, function(x) sample(1:3, 1, prob = x)),
map = map(probabilities_list, function(x) sample(1:3, 1, prob = x)),
lapply = lapply(probabilities_list, function(x) sample(1:3, 1, prob = x)),
rcpp = sample_matrix(probabilities, 1:3),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
apply 307.44702 321.30051 339.85403 342.36421 350.86090 434.56007 100
map 254.69721 265.10187 282.85592 286.21680 295.48886 363.95898 100
lapply 249.68224 259.70178 280.63066 279.87273 287.10062 691.21359 100
rcpp 12.16787 12.55429 13.47837 13.81601 14.25198 16.84859 100
cld
c
b
b
a
节省的时间可观。
可以考虑
vapply
和
- 并行化:
parallel::parApply
用你的 probabilities
矩阵:
set.seed(1010) #reproducibility
#create a matrix of probabilities
#three possible outcomes, 10.000 cases
probabilities <- matrix(runif(10000*3), nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
classification <- apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))
申请
通过为 FUN.VALUE
指定 class,您也许可以加快速度。
classification2 <- vapply(split(probabilities, 1:nrow(probabilities)),
function(x) sample(1:3, 1, prob = x),
FUN.VALUE = integer(1), USE.NAMES = FALSE)
head(classification2)
#> [1] 1 3 3 1 2 3
平行套餐
benchmarkme::get_cpu()
#> $vendor_id
#> [1] "GenuineIntel"
#>
#> $model_name
#> [1] "Intel(R) Core(TM) i5-4288U CPU @ 2.60GHz"
#>
#> $no_of_cores
#> [1] 4
在上面的环境中,
cl <- parallel::makeCluster(4)
doParallel::registerDoParallel(cl, cores = 4)
parApply()
可以做 apply()
做的事。
classification3 <- parallel::parApply(cl, probabilities, 1, function(x) sample(1:3, 1, prob = x))
head(classification3)
#> [1] 2 2 2 2 3 3
三者比较,包括apply()
解,
microbenchmark::microbenchmark(
question = { # yours
apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))
},
vapp = {
vapply(split(probabilities, 1:nrow(probabilities)), function(x) sample(1:3, 1, prob = x), FUN.VALUE = integer(1), USE.NAMES = FALSE)
},
parr = {
parallel::parApply(cl, probabilities, 1, function(x) sample(1:3, 1, prob = x))
}
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> question 49.93853 58.39965 65.05360 62.98119 68.28044 182.03267 100
#> vapp 44.19828 54.84294 59.47109 58.56739 62.05269 146.14792 100
#> parr 43.33227 48.16840 53.26599 50.87995 54.17286 98.67692 100
parallel::stopCluster(cl)
我需要在给定具有逐行结果概率的矩阵的情况下对结果变量进行采样。
set.seed(1010) #reproducibility
#create a matrix of probabilities
#three possible outcomes, 10.000 cases
probabilities <- matrix(runif(10000*3),nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
我能想出的最快方法是结合使用 apply() 和 sample()。
#row-wise sampling using these probabilities
classification <- apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))
但是,在我所做的事情中,这是计算瓶颈。您知道如何加速此代码/如何更有效地采样吗?
谢谢!
如果你愿意把probabilities
放在list
里,purrr::map
或lapply
似乎快一点:
probabilities <- matrix(runif(10000*3),nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
probabilities_list <- split(probabilities, seq(nrow(probabilities)))
library(purrr)
set.seed(1010)
classification_list <- map(probabilities_list, function(x) sample(1:3, 1, prob = x))
set.seed(1010)
classification_list <- lapply(probabilities_list, function(x) sample(1:3, 1, prob = x))
基准测试:
microbenchmark::microbenchmark(
apply = {classification = apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))},
map = {classification = map(probabilities_list, function(x) sample(1:3, 1, prob = x))},
lapply = {classification = lapply(probabilities_list, function(x) sample(1:3, 1, prob = x))},
times = 100
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# apply 39.92883 42.59249 48.39247 45.03080 47.86648 94.39828 100
# map 35.54077 37.13866 42.19719 39.95046 41.56323 66.05167 100
#lapply 34.54861 36.48664 42.69512 39.20139 52.31494 59.29200 100
100.000
例
# Unit: milliseconds
# expr min lq mean median uq max neval
# apply 457.5310 520.4926 572.5974 552.1674 611.5640 957.3997 100
# map 391.4751 457.7326 488.3286 482.1459 512.2054 899.1380 100
#lapply 386.2698 443.6732 491.9957 475.4160 507.3677 868.6725 100
RLave's comment that Rcpp
could be the way to go is spot on (you also need RcppArmadillo
对于 sample()
);我使用以下 C++ 代码创建了这样一个函数:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadilloExtensions/sample.h>
using namespace Rcpp;
// [[Rcpp::export]]
IntegerVector sample_matrix(NumericMatrix x, IntegerVector choice_set) {
int n = x.nrow();
IntegerVector result(n);
for ( int i = 0; i < n; ++i ) {
result[i] = RcppArmadillo::sample(choice_set, 1, false, x(i, _))[0];
}
return result;
}
然后我通过
在我的 R 会话中提供了该功能Rcpp::sourceCpp("sample_matrix.cpp")
现在我们可以根据您的初始方法以及使用 purrr::map()
和 lapply()
的其他建议在 R 中对其进行测试:
set.seed(1010) #reproducibility
#create a matrix of probabilities
#three possible outcomes, 10.000 cases
probabilities <- matrix(runif(10000*3),nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
probabilities_list <- split(probabilities, seq(nrow(probabilities)))
library(purrr)
library(microbenchmark)
microbenchmark(
apply = apply(probabilities, 1, function(x) sample(1:3, 1, prob = x)),
map = map(probabilities_list, function(x) sample(1:3, 1, prob = x)),
lapply = lapply(probabilities_list, function(x) sample(1:3, 1, prob = x)),
rcpp = sample_matrix(probabilities, 1:3),
times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
apply 307.44702 321.30051 339.85403 342.36421 350.86090 434.56007 100
map 254.69721 265.10187 282.85592 286.21680 295.48886 363.95898 100
lapply 249.68224 259.70178 280.63066 279.87273 287.10062 691.21359 100
rcpp 12.16787 12.55429 13.47837 13.81601 14.25198 16.84859 100
cld
c
b
b
a
节省的时间可观。
可以考虑
vapply
和- 并行化:
parallel::parApply
用你的 probabilities
矩阵:
set.seed(1010) #reproducibility
#create a matrix of probabilities
#three possible outcomes, 10.000 cases
probabilities <- matrix(runif(10000*3), nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
classification <- apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))
申请
通过为 FUN.VALUE
指定 class,您也许可以加快速度。
classification2 <- vapply(split(probabilities, 1:nrow(probabilities)),
function(x) sample(1:3, 1, prob = x),
FUN.VALUE = integer(1), USE.NAMES = FALSE)
head(classification2)
#> [1] 1 3 3 1 2 3
平行套餐
benchmarkme::get_cpu()
#> $vendor_id
#> [1] "GenuineIntel"
#>
#> $model_name
#> [1] "Intel(R) Core(TM) i5-4288U CPU @ 2.60GHz"
#>
#> $no_of_cores
#> [1] 4
在上面的环境中,
cl <- parallel::makeCluster(4)
doParallel::registerDoParallel(cl, cores = 4)
parApply()
可以做 apply()
做的事。
classification3 <- parallel::parApply(cl, probabilities, 1, function(x) sample(1:3, 1, prob = x))
head(classification3)
#> [1] 2 2 2 2 3 3
三者比较,包括apply()
解,
microbenchmark::microbenchmark(
question = { # yours
apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))
},
vapp = {
vapply(split(probabilities, 1:nrow(probabilities)), function(x) sample(1:3, 1, prob = x), FUN.VALUE = integer(1), USE.NAMES = FALSE)
},
parr = {
parallel::parApply(cl, probabilities, 1, function(x) sample(1:3, 1, prob = x))
}
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> question 49.93853 58.39965 65.05360 62.98119 68.28044 182.03267 100
#> vapp 44.19828 54.84294 59.47109 58.56739 62.05269 146.14792 100
#> parr 43.33227 48.16840 53.26599 50.87995 54.17286 98.67692 100
parallel::stopCluster(cl)