为什么在这种情况下管道这么慢?
why is the pipe so slow in this case?
我在 R 中有一个模型 运行 非常慢,我发现这纯粹是由于使用管道的代码行非常少解算器。有谁知道为什么这会减慢它的速度?有更快的方法吗?
示例代码:
IDS <- c(1,1,2)
output <- runif(3, 0, 100)
ID_df1 <- data.frame(IDS, output)
ID_df <- ID_df1 %>%
group_by(IDS) %>%
summarise(totals = sum(output))
portions <- ID_df1 %>%
left_join(ID_df, by = "IDS") %>%
mutate(portion = output/totals) %>%
select(IDS, portion)
建议使用 data.table。下面是一些时序比较:
library(data.table)
library(dplyr)
library(microbenchmark)
set.seed(30L)
N <- 1e6
IDS <- sample(LETTERS, N, replace=TRUE)
output <- runif(N, 0, 100)
ID_df1 <- data.frame(IDS, output)
microbenchmark::microbenchmark(
#as suggested by @Tino
mtd_dplyr=ID_df1 %>% group_by(IDS) %>% mutate(portion = output / sum(output)),
mtd_data.table=setDT(ID_df1)[, .(portion = output / sum(output)), by=.(IDS)],
mtd_base=do.call(rbind, by(ID_df1, ID_df1$IDS, function(x) data.frame(x$IDS, portion=x$output/sum(x$output)))),
times=10L
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# mtd_dplyr 96.9382 99.0871 117.64655 114.47100 133.0421 144.6009 10
# mtd_data.table 15.7899 21.1913 30.93596 21.37835 25.5830 81.5951 10
# mtd_base 1191.5829 1245.0176 1392.00927 1369.00735 1450.2118 1859.3708 10
base R 的 tapply
会 运行 更快。
ID_df1$portion <- unlist(tapply(ID_df1$output, ID_df1$IDS, function(x) x / sum(x)))
为了比较(我使用的是 10,000 行和 100 个不同的数据框 IDS
)
set.seed(pi)
IDS <- sample(1:100, size = 10000, replace = TRUE)
output <- runif(3, 0, 10000)
ID_df1 <- data.frame(IDS, output)
library(data.table)
library(dplyr)
library(microbenchmark)
microbenchmark(
orig = {
ID_df <- ID_df1 %>%
group_by(IDS) %>%
summarise(totals = sum(output))
portions <- ID_df1 %>%
left_join(ID_df, by = "IDS") %>%
mutate(portion = output/totals) %>%
select(IDS, portion)
},
tino = {
ID_df1 %>% group_by(IDS) %>% mutate(portion = output / sum(output))
},
data.table = {
setDT(ID_df1)[, .(portion = output / sum(output)), by=.(IDS)]
},
base = {
ID_df1$portion <- unlist(tapply(ID_df1$output, ID_df1$IDS, function(x) x / sum(x)))
}
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# orig 11936.111 12101.798 12705.9227 12310.2980 12914.0975 22949.662 100 c
# tino 5224.230 5370.854 5636.8930 5558.3875 5734.9235 8466.684 100 b
# data.table 569.490 594.856 768.6615 724.4725 777.2565 3279.110 100 a
# base 497.937 524.623 606.8760 602.9200 650.2800 1933.098 100 a
我在 R 中有一个模型 运行 非常慢,我发现这纯粹是由于使用管道的代码行非常少解算器。有谁知道为什么这会减慢它的速度?有更快的方法吗?
示例代码:
IDS <- c(1,1,2)
output <- runif(3, 0, 100)
ID_df1 <- data.frame(IDS, output)
ID_df <- ID_df1 %>%
group_by(IDS) %>%
summarise(totals = sum(output))
portions <- ID_df1 %>%
left_join(ID_df, by = "IDS") %>%
mutate(portion = output/totals) %>%
select(IDS, portion)
建议使用 data.table。下面是一些时序比较:
library(data.table)
library(dplyr)
library(microbenchmark)
set.seed(30L)
N <- 1e6
IDS <- sample(LETTERS, N, replace=TRUE)
output <- runif(N, 0, 100)
ID_df1 <- data.frame(IDS, output)
microbenchmark::microbenchmark(
#as suggested by @Tino
mtd_dplyr=ID_df1 %>% group_by(IDS) %>% mutate(portion = output / sum(output)),
mtd_data.table=setDT(ID_df1)[, .(portion = output / sum(output)), by=.(IDS)],
mtd_base=do.call(rbind, by(ID_df1, ID_df1$IDS, function(x) data.frame(x$IDS, portion=x$output/sum(x$output)))),
times=10L
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# mtd_dplyr 96.9382 99.0871 117.64655 114.47100 133.0421 144.6009 10
# mtd_data.table 15.7899 21.1913 30.93596 21.37835 25.5830 81.5951 10
# mtd_base 1191.5829 1245.0176 1392.00927 1369.00735 1450.2118 1859.3708 10
base R 的 tapply
会 运行 更快。
ID_df1$portion <- unlist(tapply(ID_df1$output, ID_df1$IDS, function(x) x / sum(x)))
为了比较(我使用的是 10,000 行和 100 个不同的数据框 IDS
)
set.seed(pi)
IDS <- sample(1:100, size = 10000, replace = TRUE)
output <- runif(3, 0, 10000)
ID_df1 <- data.frame(IDS, output)
library(data.table)
library(dplyr)
library(microbenchmark)
microbenchmark(
orig = {
ID_df <- ID_df1 %>%
group_by(IDS) %>%
summarise(totals = sum(output))
portions <- ID_df1 %>%
left_join(ID_df, by = "IDS") %>%
mutate(portion = output/totals) %>%
select(IDS, portion)
},
tino = {
ID_df1 %>% group_by(IDS) %>% mutate(portion = output / sum(output))
},
data.table = {
setDT(ID_df1)[, .(portion = output / sum(output)), by=.(IDS)]
},
base = {
ID_df1$portion <- unlist(tapply(ID_df1$output, ID_df1$IDS, function(x) x / sum(x)))
}
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# orig 11936.111 12101.798 12705.9227 12310.2980 12914.0975 22949.662 100 c
# tino 5224.230 5370.854 5636.8930 5558.3875 5734.9235 8466.684 100 b
# data.table 569.490 594.856 768.6615 724.4725 777.2565 3279.110 100 a
# base 497.937 524.623 606.8760 602.9200 650.2800 1933.098 100 a