高效过滤掉所有列连续为零的地方
Efficiently filter out where all columns are zeros consecutively
我有一个像下面这样的数据集(实际数据集有 500 万行,没有间隙)。我试图过滤掉这样的行:该行本身及其前后相邻行在所有数字列上的总和都等于零。
N.B.
Time
是实际数据中的dttm
列。
- 连续零的数量可以超过 3 行,在这种情况下,将过滤掉多行。
# A tibble: 13 x 4
group Time Val1 Val2
<chr> <int> <dbl> <dbl>
1 A 1 0 0
2 B 1 0.1 0
3 A 3 0 0
4 B 3 0 0
5 A 2 0 0
6 B 2 0.2 0.2
7 B 4 0 0
8 A 4 0 0.1
9 A 5 0 0
10 A 6 0 0
11 B 6 0.1 0.5
12 B 5 0.1 0.2
13 A 7 0 0
请参阅下面的示例了解所需内容:
# A tibble: 13 x 8
group Time Val1 Val2 rowsum leadsum lagsum sum
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 0 0 0 0 NA NA
2 A 2 0 0 0 0 0 0 This will get filtered out!
3 A 3 0 0 0 0.1 0 0.1
4 A 4 0 0.1 0.1 0 0 0.1
5 A 5 0 0 0 0 0.1 0.1
6 A 6 0 0 0 0 0 0 This will get filtered out!
7 A 7 0 0 0 NA 0 NA
8 B 1 0.1 0 0.1 0.4 NA NA
9 B 2 0.2 0.2 0.4 0 0.1 0.5
10 B 3 0 0 0 0 0.4 0.4
11 B 4 0 0 0 0.3 0 0.3
12 B 5 0.1 0.2 0.3 0.6 0 0.9
13 B 6 0.1 0.5 0.6 NA 0.3 NA
到目前为止,我只是尝试使用 dplyr::lag()
和 dplyr::lead()
来做到这一点;但这是非常低效的,并且会引发实际数据集的内存分配错误:
> Error in Sys.getenv("TESTTHAT") :
> could not allocate memory (0 Mb) in C function 'R_AllocStringBuffer'
这是我目前所拥有的;我可以先得到 Val1
和 Val2
的总和,然后执行 lead
和 lag
但这不会解决问题。
# OP's attempt: per group, sum Val1/Val2 over the current, previous and next
# rows via lag()/lead(). Correct on the toy data, but six shifted vectors are
# materialised per mutate(), which is what blows up memory at 5M rows.
df0 %>%
##arrange by group is not necessary since we're grouping by that var
arrange(group, Time) %>%
group_by(group) %>%
mutate(sum = Val1 + Val2 + lag(Val1) + lag(Val2) + lead(Val1) + lead(Val2)) # %>%
# filter(is.na(sum) | sum != 0)
## commenting out filter to show the full results
# > # A tibble: 13 x 5
# > # Groups: group [2]
# > group Time Val1 Val2 sum
# > <chr> <int> <dbl> <dbl> <dbl>
# > 1 A 1 0 0 NA
# ! - A 2 0 0 0
# > 2 A 3 0 0 0.1
# > 3 A 4 0 0.1 0.1
# > 4 A 5 0 0 0.1
# ! - A 6 0 0 0
# > 5 A 7 0 0 NA
# > 6 B 1 0.1 0 NA
# > 7 B 2 0.2 0.2 0.5
# > 8 B 3 0 0 0.4
# > 9 B 4 0 0 0.3
# > 10 B 5 0.1 0.2 0.9
# > 11 B 6 0.1 0.5 NA
玩具数据集:
# Toy dataset: 13 rows, two groups (A/B), deliberately out of Time order.
# Val1/Val2 are the double columns whose 3-row rolling sum decides removal.
df0 <- structure(list(group = c("A", "B", "A", "B", "A", "B",
"B", "A", "A", "A", "B", "B", "A"),
Time = c(1L, 1L, 3L, 3L, 2L, 2L, 4L, 4L, 5L, 6L, 6L, 5L, 7L),
Val1 = c(0, 0.1, 0, 0, 0, 0.2, 0, 0, 0, 0, 0.1, 0.1, 0),
Val2 = c(0, 0, 0, 0, 0, 0.2, 0, 0.1, 0, 0, 0.5, 0.2, 0)),
row.names = c(NA, -13L),
class = c("tbl_df", "tbl", "data.frame"))
library(tidyverse)
# Vectorised row sum instead of rowwise()/c_across(): identical result, but
# avoids one R-level sum() call per row, which matters at 5M rows.
df0 %>%
  arrange(group, Time) %>% # EDIT to arrange by time (and group for clarity)
  mutate(sum = rowSums(across(Val1:Val2))) %>%
  group_by(group) %>%
  # Keep a row unless it AND both in-group neighbours sum to zero;
  # default = 1 makes group-edge rows survive (a missing neighbour is nonzero).
  filter(!(sum == 0 & lag(sum, default = 1) == 0 & lead(sum, default = 1) == 0)) %>%
  ungroup()
# A tibble: 11 x 5
group Time Val1 Val2 sum
<chr> <int> <dbl> <dbl> <dbl>
1 A 1 0 0 0
2 A 3 0 0 0
3 A 4 0 0.1 0.1
4 A 5 0 0 0
5 A 7 0 0 0
6 B 1 0.1 0 0.1
7 B 2 0.2 0.2 0.4
8 B 3 0 0 0
9 B 4 0 0 0
10 B 5 0.1 0.2 0.3
11 B 6 0.1 0.5 0.6
我们可以使用基础 R 的 rle,或者它更快的实现——purler 包中的 rlenc。
library(tidyverse)
library(purler)
subsetter <- function(df){
  # Return the row indices to DROP: the interior rows of every run of >= 3
  # consecutive all-zero rows (the first and last row of each run are kept).
  # NOTE(review): runs are computed over the whole frame, so the input must
  # already be arranged by group -- a zero run spanning two adjacent groups
  # would be merged; confirm callers sort by group first.
  df %>%
    select(where(is.double)) %>%
    rowSums() %>%
    purler::rlenc() %>%
    # Tolerance comparison instead of exact `== 0L`: rowSums() returns
    # doubles, and exact float equality is fragile (matches the 1e-9
    # tolerance used by the data.table answers).
    filter(lengths >= 3L & abs(values) < 1e-9) %>%
    transmute(ids = map2(start, start + lengths, ~ (.x + 1) : (.y - 2))) %>%
    unlist(use.names = FALSE)
}
# Reproduce the data as shown in the example: Time as character,
# sorted by group then Time.
df0 <- df0 %>%
  mutate(Time = as.character(Time)) %>%
  arrange(group, Time)

# Edge case: a group made entirely of zero rows (run longer than 3).
edge_cases <- tribble(
  ~group, ~Time, ~Val1, ~Val2,
  "C", "1", 0, 0,
  "C", "2", 0, 0,
  "C", "3", 0, 0,
  "C", "4", 0, 0,
)
df1 <- rbind(df0, edge_cases)

# Drop the interior all-zero rows identified by subsetter().
drop_idx <- subsetter(df1)
df1[-drop_idx, ]
# A tibble: 13 x 4
group Time Val1 Val2
<chr> <chr> <dbl> <dbl>
1 A 1 0 0
2 A 3 0 0
3 A 4 0 0.1
4 A 5 0 0
5 A 7 0 0
6 B 1 0.1 0
7 B 2 0.2 0.2
8 B 3 0 0
9 B 4 0 0
10 B 5 0.1 0.2
11 B 6 0.1 0.5
12 C 1 0 0
13 C 4 0 0
# Timing of the rlenc-based approach: median time, memory, iteration count.
bench::mark(df1 %>% `[`(-subsetter(.),))[,c(3,5,7)]
# A tibble: 1 x 3
median mem_alloc n_itr
<bch:tm> <bch:byt> <int>
1 3.91ms 9.38KB 93
由于您标记了 data.table,这里有一个 data.table
原生解决方案:
library(data.table)
dt0 <- as.data.table(df0)
setorder(dt0, Time) # add 'group' if you want
# Columns to row-sum: numeric but not integer, i.e. the double value columns.
isnum <- names(which(sapply(dt0, function(z) is.numeric(z) & !is.integer(z))))
isnum
# [1] "Val1" "Val2"
# sum0: TRUE when the row's numeric sum is zero within tolerance. A row is
# dropped only when the 3-term window sum (lag + self + lead of sum0, within
# the group) reaches 3; c(0, x[-.N]) / c(x[-1], 0) are the shifts with 0 fill,
# done by direct indexing rather than shift()/lag()/lead() calls.
dt0[, sum0 := abs(rowSums(.SD)) < 1e-9, .SDcols = isnum
][, .SD[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3,], by = .(group)
][, sum0 := NULL ][]
# group Time Val1 Val2
# <char> <int> <num> <num>
# 1: A 1 0.0 0.0
# 2: A 3 0.0 0.0
# 3: A 4 0.0 0.1
# 4: A 5 0.0 0.0
# 5: A 7 0.0 0.0
# 6: B 1 0.1 0.0
# 7: B 2 0.2 0.2
# 8: B 3 0.0 0.0
# 9: B 4 0.0 0.0
# 10: B 5 0.1 0.2
# 11: B 6 0.1 0.5
根据您的评论,A-2 和 A-6 均已删除。
效率:
rowSums
快速高效;
- 我们使用直接索引进行转换,默认值为
0
;在 data.table
中,这是非常有效的处理,并且不会产生 lead
/lag
/shift
调用的(公认的小)开销;
- 在对一行求和后,我们只对这个值进行行移位,而不是每行四行移位。
编辑,以获得轻微的性能改进 (15-20%):
# Same logic, but the inner grouped query collects keep-row numbers with .I
# and the outer [ subsets dt0 once -- avoids building .SD per group
# (~15-20% faster). sum0 is added to dt0 by reference, hence the cleanup.
dt0[
dt0[, sum0 := abs(rowSums(.SD)) < 1e-9, .SDcols = isnum
][, .I[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3], by=group ]$V1
][, sum0 := NULL][]
诚然,这可能有点难以遵循,但它在大约 82% 的时间内产生相同的结果(使用 this 数据集)。感谢@Henrik 帮助我更多地了解 .I
及其好处。
您可以尝试以下data.table
选项
# rleid(rs) groups consecutive equal row sums; inside an all-zero run longer
# than 2 rows, keep only the run's first and last row (rowid(rs) is the
# position within the run).
# NOTE(review): runs are taken over the whole (group, Time)-ordered table, so
# a zero run crossing a group boundary would merge -- relies on the setorder
# by group; confirm this is acceptable for the real data.
setorder(setDT(df0), group, Time)[
,
rs := rowSums(Filter(is.double, .SD))
][, .SD[!(rs == 0 & .N > 2 & (!rowid(rs) %in% c(1, .N)))], rleid(rs)][
,
rleid := NULL
][]
这给出了
group Time Val1 Val2
1: A 1 0.0 0.0
2: A 3 0.0 0.0
3: A 4 0.0 0.1
4: A 5 0.0 0.0
5: A 7 0.0 0.0
6: B 1 0.1 0.0
7: B 2 0.2 0.2
8: B 3 0.0 0.0
9: B 4 0.0 0.0
10: B 5 0.1 0.2
11: B 6 0.1 0.5
这个解决方案主要是受@r2evans 的启发。它使用 Reduce
、+
和 shift
而不是 @r2evans 基于 rowSums
和 c
函数的解决方案。
我认为此解决方案的改进来自于直接在 data.frame/data.table 上使用 Reduce(`+`, .SD) 而不是 rowSums(.SD)(以及在使用 data.table 语法时避免 [, .SD[...], ...]);它更快(至少在我的 PC 上)并且内存效率更高(无需转换为矩阵)。警告:rowSums(.SD, na.rm=TRUE) 没有直接的等价物。
# Benchmark setup: resample the toy data up to 1e7 rows, sorted by group/Time.
n = 1e7
dt0 = setDT(df0[sample(nrow(df0), n, replace=TRUE), ])
setorder(dt0, group, Time)
# Named logical vector: TRUE for double (non-integer) numeric columns.
isnum = sapply(dt0, function(x) is.numeric(x) && !is.integer(x))
# Tolerance for treating a floating-point row sum as zero.
eps = sqrt(.Machine$double.eps)
# New solution
# Reduce(`+`, .SD) adds the value columns pairwise, avoiding the matrix
# conversion rowSums() performs; shift(sum0, -1:1, fill=0) yields the
# lead/current/lag triple, whose sum reaches 3 exactly when the row and both
# in-group neighbours are numerically zero (those rows are dropped).
f1 = function() {
ans = dt0[, is0 := {sum0 = abs(Reduce(`+`, .SD)) < eps; Reduce(`+`, shift(sum0, -1:1, fill=0)) < 3},
by=group, .SDcols=isnum][(is0), !"is0"]
dt0[, is0 := NULL] # remove is0 from the initial dataset
ans
}
# similar to f1: easily adaptable to rowSums(.SD, na.rm=TRUE).
# Same window logic as f1 but with rowSums() for the per-row total, kept for
# cases where its na.rm argument is needed.
f2 = function() {
# here I replace Reduce(`+`, .SD) with rowSums(.SD) just in case its na.rm argument is needed.
ans = dt0[, is0 := {sum0 = abs(rowSums(.SD)) < eps; Reduce(`+`, shift(sum0, -1:1, fill=0)) < 3},
by=group, .SDcols=isnum][(is0), !"is0"]
dt0[, is0:=NULL] # remove is0 from the initial dataset
ans
}
# r2evans first solution
# Index-based: the inner grouped query returns keep-row numbers via .I, then
# the outer [ subsets dt0 once. sum0 is added to dt0 by reference as a side
# effect, hence the cleanup before returning.
f3 = function() {
ans = dt0[
dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
][, .I[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3], by=group ]$V1
][, sum0 := NULL][]
dt0[, sum0 := NULL] # remove sum0 from the initial dataset
ans
}
# r2evans second solution
# Same window test as f3, but materialises .SD per group -- the slowest and
# most memory-hungry variant in the benchmark below.
f4 = function() {
ans = dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
][, .SD[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3,], by = .(group)
][, sum0 := NULL ][]
dt0[, sum0:=NULL] # remove sum0 from the initial dataset
ans
}
# modified version of r2evans second solution: similar to f4 but avoid [, .SD[...], by=group]
# The keep/drop flag is computed in place per group, then applied once as a
# whole-table filter.
f5 = function() {
ans = dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
][, sum0 := (c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3, by = .(group)
][(sum0), !"sum0"][]
dt0[, sum0:=NULL] # remove sum0 from the initial dataset
ans
}
基准
# Run each candidate 5 times; check=FALSE so outputs are not compared across
# implementations.
bench::mark(
f1(),
f2(),
f3(),
f4(),
f5(),
iterations=5L, check=FALSE
)
# A tibble: 5 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 f1() 347ms 406ms 2.49 698.47MB 5.48 5 11 2.01s
2 f2() 529ms 578ms 1.69 851.02MB 4.06 5 12 2.96s
3 f3() 717ms 821ms 1.22 1.25GB 3.40 5 14 4.12s
4 f4() 889ms 956ms 1.04 1.57GB 5.01 5 24 4.79s
5 f5() 642ms 677ms 1.40 1.07GB 3.37 5 12 3.56s
根据这个结果,第一个解决方案 (f1) 比 f3 和 f4 快 2 倍以上,而且内存效率更高。
我正在使用 data.table 的开发版本 (data.table 1.14.3
)
我有一个像下面这样的数据集(实际数据集有 500 万行,没有间隙)。我试图过滤掉这样的行:该行本身及其前后相邻行在所有数字列上的总和都等于零。
N.B.
Time
是实际数据中的dttm
列。- 连续零的数量可以超过 3 行,在这种情况下,将过滤掉多行。
# A tibble: 13 x 4
group Time Val1 Val2
<chr> <int> <dbl> <dbl>
1 A 1 0 0
2 B 1 0.1 0
3 A 3 0 0
4 B 3 0 0
5 A 2 0 0
6 B 2 0.2 0.2
7 B 4 0 0
8 A 4 0 0.1
9 A 5 0 0
10 A 6 0 0
11 B 6 0.1 0.5
12 B 5 0.1 0.2
13 A 7 0 0
请参阅下面的示例了解所需内容:
# A tibble: 13 x 8
group Time Val1 Val2 rowsum leadsum lagsum sum
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 0 0 0 0 NA NA
2 A 2 0 0 0 0 0 0 This will get filtered out!
3 A 3 0 0 0 0.1 0 0.1
4 A 4 0 0.1 0.1 0 0 0.1
5 A 5 0 0 0 0 0.1 0.1
6 A 6 0 0 0 0 0 0 This will get filtered out!
7 A 7 0 0 0 NA 0 NA
8 B 1 0.1 0 0.1 0.4 NA NA
9 B 2 0.2 0.2 0.4 0 0.1 0.5
10 B 3 0 0 0 0 0.4 0.4
11 B 4 0 0 0 0.3 0 0.3
12 B 5 0.1 0.2 0.3 0.6 0 0.9
13 B 6 0.1 0.5 0.6 NA 0.3 NA
到目前为止,我只是尝试使用 dplyr::lag()
和 dplyr::lead()
来做到这一点;但这是非常低效的,并且会引发实际数据集的内存分配错误:
> Error in Sys.getenv("TESTTHAT") : > could not allocate memory (0 Mb) in C function 'R_AllocStringBuffer'
这是我目前所拥有的;我可以先得到 Val1
和 Val2
的总和,然后执行 lead
和 lag
但这不会解决问题。
# OP's attempt: per group, sum Val1/Val2 over the current, previous and next
# rows via lag()/lead(). Correct on the toy data, but six shifted vectors are
# materialised per mutate(), which is what blows up memory at 5M rows.
df0 %>%
##arrange by group is not necessary since we're grouping by that var
arrange(group, Time) %>%
group_by(group) %>%
mutate(sum = Val1 + Val2 + lag(Val1) + lag(Val2) + lead(Val1) + lead(Val2)) # %>%
# filter(is.na(sum) | sum != 0)
## commenting out filter to show the full results
# > # A tibble: 13 x 5
# > # Groups: group [2]
# > group Time Val1 Val2 sum
# > <chr> <int> <dbl> <dbl> <dbl>
# > 1 A 1 0 0 NA
# ! - A 2 0 0 0
# > 2 A 3 0 0 0.1
# > 3 A 4 0 0.1 0.1
# > 4 A 5 0 0 0.1
# ! - A 6 0 0 0
# > 5 A 7 0 0 NA
# > 6 B 1 0.1 0 NA
# > 7 B 2 0.2 0.2 0.5
# > 8 B 3 0 0 0.4
# > 9 B 4 0 0 0.3
# > 10 B 5 0.1 0.2 0.9
# > 11 B 6 0.1 0.5 NA
玩具数据集:
# Toy dataset: 13 rows, two groups (A/B), deliberately out of Time order.
# Val1/Val2 are the double columns whose 3-row rolling sum decides removal.
df0 <- structure(list(group = c("A", "B", "A", "B", "A", "B",
"B", "A", "A", "A", "B", "B", "A"),
Time = c(1L, 1L, 3L, 3L, 2L, 2L, 4L, 4L, 5L, 6L, 6L, 5L, 7L),
Val1 = c(0, 0.1, 0, 0, 0, 0.2, 0, 0, 0, 0, 0.1, 0.1, 0),
Val2 = c(0, 0, 0, 0, 0, 0.2, 0, 0.1, 0, 0, 0.5, 0.2, 0)),
row.names = c(NA, -13L),
class = c("tbl_df", "tbl", "data.frame"))
library(tidyverse)
# Vectorised row sum instead of rowwise()/c_across(): identical result, but
# avoids one R-level sum() call per row, which matters at 5M rows.
df0 %>%
  arrange(group, Time) %>% # EDIT to arrange by time (and group for clarity)
  mutate(sum = rowSums(across(Val1:Val2))) %>%
  group_by(group) %>%
  # Keep a row unless it AND both in-group neighbours sum to zero;
  # default = 1 makes group-edge rows survive (a missing neighbour is nonzero).
  filter(!(sum == 0 & lag(sum, default = 1) == 0 & lead(sum, default = 1) == 0)) %>%
  ungroup()
# A tibble: 11 x 5
group Time Val1 Val2 sum
<chr> <int> <dbl> <dbl> <dbl>
1 A 1 0 0 0
2 A 3 0 0 0
3 A 4 0 0.1 0.1
4 A 5 0 0 0
5 A 7 0 0 0
6 B 1 0.1 0 0.1
7 B 2 0.2 0.2 0.4
8 B 3 0 0 0
9 B 4 0 0 0
10 B 5 0.1 0.2 0.3
11 B 6 0.1 0.5 0.6
我们可以使用基础 R 的 rle,或者它更快的实现——purler 包中的 rlenc。
library(tidyverse)
library(purler)
subsetter <- function(df){
  # Return the row indices to DROP: the interior rows of every run of >= 3
  # consecutive all-zero rows (the first and last row of each run are kept).
  # NOTE(review): runs are computed over the whole frame, so the input must
  # already be arranged by group -- a zero run spanning two adjacent groups
  # would be merged; confirm callers sort by group first.
  df %>%
    select(where(is.double)) %>%
    rowSums() %>%
    purler::rlenc() %>%
    # Tolerance comparison instead of exact `== 0L`: rowSums() returns
    # doubles, and exact float equality is fragile (matches the 1e-9
    # tolerance used by the data.table answers).
    filter(lengths >= 3L & abs(values) < 1e-9) %>%
    transmute(ids = map2(start, start + lengths, ~ (.x + 1) : (.y - 2))) %>%
    unlist(use.names = FALSE)
}
# Reproduce the data as shown in the example: Time as character,
# sorted by group then Time.
df0 <- df0 %>%
  mutate(Time = as.character(Time)) %>%
  arrange(group, Time)

# Edge case: a group made entirely of zero rows (run longer than 3).
edge_cases <- tribble(
  ~group, ~Time, ~Val1, ~Val2,
  "C", "1", 0, 0,
  "C", "2", 0, 0,
  "C", "3", 0, 0,
  "C", "4", 0, 0,
)
df1 <- rbind(df0, edge_cases)

# Drop the interior all-zero rows identified by subsetter().
drop_idx <- subsetter(df1)
df1[-drop_idx, ]
# A tibble: 13 x 4
group Time Val1 Val2
<chr> <chr> <dbl> <dbl>
1 A 1 0 0
2 A 3 0 0
3 A 4 0 0.1
4 A 5 0 0
5 A 7 0 0
6 B 1 0.1 0
7 B 2 0.2 0.2
8 B 3 0 0
9 B 4 0 0
10 B 5 0.1 0.2
11 B 6 0.1 0.5
12 C 1 0 0
13 C 4 0 0
# Timing of the rlenc-based approach: median time, memory, iteration count.
bench::mark(df1 %>% `[`(-subsetter(.),))[,c(3,5,7)]
# A tibble: 1 x 3
median mem_alloc n_itr
<bch:tm> <bch:byt> <int>
1 3.91ms 9.38KB 93
由于您标记了 data.table,这里有一个 data.table
原生解决方案:
library(data.table)
dt0 <- as.data.table(df0)
setorder(dt0, Time) # add 'group' if you want
# Columns to row-sum: numeric but not integer, i.e. the double value columns.
isnum <- names(which(sapply(dt0, function(z) is.numeric(z) & !is.integer(z))))
isnum
# [1] "Val1" "Val2"
# sum0: TRUE when the row's numeric sum is zero within tolerance. A row is
# dropped only when the 3-term window sum (lag + self + lead of sum0, within
# the group) reaches 3; c(0, x[-.N]) / c(x[-1], 0) are the shifts with 0 fill,
# done by direct indexing rather than shift()/lag()/lead() calls.
dt0[, sum0 := abs(rowSums(.SD)) < 1e-9, .SDcols = isnum
][, .SD[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3,], by = .(group)
][, sum0 := NULL ][]
# group Time Val1 Val2
# <char> <int> <num> <num>
# 1: A 1 0.0 0.0
# 2: A 3 0.0 0.0
# 3: A 4 0.0 0.1
# 4: A 5 0.0 0.0
# 5: A 7 0.0 0.0
# 6: B 1 0.1 0.0
# 7: B 2 0.2 0.2
# 8: B 3 0.0 0.0
# 9: B 4 0.0 0.0
# 10: B 5 0.1 0.2
# 11: B 6 0.1 0.5
根据您的评论,A-2 和 A-6 均已删除。
效率:
rowSums
快速高效;- 我们使用直接索引进行转换,默认值为
0
;在data.table
中,这是非常有效的处理,并且不会产生lead
/lag
/shift
调用的(公认的小)开销; - 在对一行求和后,我们只对这个值进行行移位,而不是每行四行移位。
编辑,以获得轻微的性能改进 (15-20%):
# Same logic, but the inner grouped query collects keep-row numbers with .I
# and the outer [ subsets dt0 once -- avoids building .SD per group
# (~15-20% faster). sum0 is added to dt0 by reference, hence the cleanup.
dt0[
dt0[, sum0 := abs(rowSums(.SD)) < 1e-9, .SDcols = isnum
][, .I[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3], by=group ]$V1
][, sum0 := NULL][]
诚然,这可能有点难以遵循,但它在大约 82% 的时间内产生相同的结果(使用 this 数据集)。感谢@Henrik 帮助我更多地了解 .I
及其好处。
您可以尝试以下data.table
选项
# rleid(rs) groups consecutive equal row sums; inside an all-zero run longer
# than 2 rows, keep only the run's first and last row (rowid(rs) is the
# position within the run).
# NOTE(review): runs are taken over the whole (group, Time)-ordered table, so
# a zero run crossing a group boundary would merge -- relies on the setorder
# by group; confirm this is acceptable for the real data.
setorder(setDT(df0), group, Time)[
,
rs := rowSums(Filter(is.double, .SD))
][, .SD[!(rs == 0 & .N > 2 & (!rowid(rs) %in% c(1, .N)))], rleid(rs)][
,
rleid := NULL
][]
这给出了
group Time Val1 Val2
1: A 1 0.0 0.0
2: A 3 0.0 0.0
3: A 4 0.0 0.1
4: A 5 0.0 0.0
5: A 7 0.0 0.0
6: B 1 0.1 0.0
7: B 2 0.2 0.2
8: B 3 0.0 0.0
9: B 4 0.0 0.0
10: B 5 0.1 0.2
11: B 6 0.1 0.5
这个解决方案主要是受@r2evans 的启发。它使用 Reduce
、+
和 shift
而不是 @r2evans 基于 rowSums
和 c
函数的解决方案。
我认为此解决方案的改进来自于直接在 data.frame/data.table 上使用 Reduce(`+`, .SD) 而不是 rowSums(.SD)(以及在使用 data.table 语法时避免 [, .SD[...], ...]);它更快(至少在我的 PC 上)并且内存效率更高(无需转换为矩阵)。警告:rowSums(.SD, na.rm=TRUE) 没有直接的等价物。
# Benchmark setup: resample the toy data up to 1e7 rows, sorted by group/Time.
n = 1e7
dt0 = setDT(df0[sample(nrow(df0), n, replace=TRUE), ])
setorder(dt0, group, Time)
# Named logical vector: TRUE for double (non-integer) numeric columns.
isnum = sapply(dt0, function(x) is.numeric(x) && !is.integer(x))
# Tolerance for treating a floating-point row sum as zero.
eps = sqrt(.Machine$double.eps)
# New solution
# Reduce(`+`, .SD) adds the value columns pairwise, avoiding the matrix
# conversion rowSums() performs; shift(sum0, -1:1, fill=0) yields the
# lead/current/lag triple, whose sum reaches 3 exactly when the row and both
# in-group neighbours are numerically zero (those rows are dropped).
f1 = function() {
ans = dt0[, is0 := {sum0 = abs(Reduce(`+`, .SD)) < eps; Reduce(`+`, shift(sum0, -1:1, fill=0)) < 3},
by=group, .SDcols=isnum][(is0), !"is0"]
dt0[, is0 := NULL] # remove is0 from the initial dataset
ans
}
# similar to f1: easily adaptable to rowSums(.SD, na.rm=TRUE).
# Same window logic as f1 but with rowSums() for the per-row total, kept for
# cases where its na.rm argument is needed.
f2 = function() {
# here I replace Reduce(`+`, .SD) with rowSums(.SD) just in case its na.rm argument is needed.
ans = dt0[, is0 := {sum0 = abs(rowSums(.SD)) < eps; Reduce(`+`, shift(sum0, -1:1, fill=0)) < 3},
by=group, .SDcols=isnum][(is0), !"is0"]
dt0[, is0:=NULL] # remove is0 from the initial dataset
ans
}
# r2evans first solution
# Index-based: the inner grouped query returns keep-row numbers via .I, then
# the outer [ subsets dt0 once. sum0 is added to dt0 by reference as a side
# effect, hence the cleanup before returning.
f3 = function() {
ans = dt0[
dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
][, .I[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3], by=group ]$V1
][, sum0 := NULL][]
dt0[, sum0 := NULL] # remove sum0 from the initial dataset
ans
}
# r2evans second solution
# Same window test as f3, but materialises .SD per group -- the slowest and
# most memory-hungry variant in the benchmark below.
f4 = function() {
ans = dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
][, .SD[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3,], by = .(group)
][, sum0 := NULL ][]
dt0[, sum0:=NULL] # remove sum0 from the initial dataset
ans
}
# modified version of r2evans second solution: similar to f4 but avoid [, .SD[...], by=group]
# The keep/drop flag is computed in place per group, then applied once as a
# whole-table filter.
f5 = function() {
ans = dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
][, sum0 := (c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3, by = .(group)
][(sum0), !"sum0"][]
dt0[, sum0:=NULL] # remove sum0 from the initial dataset
ans
}
基准
# Run each candidate 5 times; check=FALSE so outputs are not compared across
# implementations.
bench::mark(
f1(),
f2(),
f3(),
f4(),
f5(),
iterations=5L, check=FALSE
)
# A tibble: 5 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 f1() 347ms 406ms 2.49 698.47MB 5.48 5 11 2.01s
2 f2() 529ms 578ms 1.69 851.02MB 4.06 5 12 2.96s
3 f3() 717ms 821ms 1.22 1.25GB 3.40 5 14 4.12s
4 f4() 889ms 956ms 1.04 1.57GB 5.01 5 24 4.79s
5 f5() 642ms 677ms 1.40 1.07GB 3.37 5 12 3.56s
根据这个结果,第一个解决方案 (f1) 比 f3 和 f4 快 2 倍以上,而且内存效率更高。
我正在使用 data.table 的开发版本 (data.table 1.14.3
)