Efficiently filter out where all columns are zeros consecutively

I have a dataset like the one below (the real dataset has 5 million rows with no gaps). I'm trying to filter out every row where the sum of all numeric columns, taken over the row itself plus the rows immediately before and after it, equals zero.

N.B. The rows in the toy dataset are not sorted by Time.

# A tibble: 13 x 4
   group  Time  Val1  Val2
   <chr> <int> <dbl> <dbl>
 1 A         1   0     0  
 2 B         1   0.1   0  
 3 A         3   0     0  
 4 B         3   0     0  
 5 A         2   0     0  
 6 B         2   0.2   0.2
 7 B         4   0     0  
 8 A         4   0     0.1
 9 A         5   0     0  
10 A         6   0     0  
11 B         6   0.1   0.5
12 B         5   0.1   0.2
13 A         7   0     0  

See the example below for the desired result:

# A tibble: 13 x 8
   group  Time  Val1  Val2 rowsum leadsum lagsum   sum
   <chr> <int> <dbl> <dbl>  <dbl>   <dbl>  <dbl> <dbl>
 1 A         1   0     0      0       0     NA    NA  
 2 A         2   0     0      0       0      0     0     This will get filtered out! 
 3 A         3   0     0      0       0.1    0     0.1
 4 A         4   0     0.1    0.1     0      0     0.1
 5 A         5   0     0      0       0      0.1   0.1
 6 A         6   0     0      0       0      0     0     This will get filtered out!
 7 A         7   0     0      0      NA      0    NA  
 8 B         1   0.1   0      0.1     0.4   NA    NA  
 9 B         2   0.2   0.2    0.4     0      0.1   0.5
10 B         3   0     0      0       0      0.4   0.4
11 B         4   0     0      0       0.3    0     0.3
12 B         5   0.1   0.2    0.3     0.6    0     0.9
13 B         6   0.1   0.5    0.6    NA      0.3  NA  

So far I have only tried doing this with dplyr::lag() and dplyr::lead(); but it is very inefficient and throws a memory allocation error on the real dataset:

>     Error in Sys.getenv("TESTTHAT") : 
>       could not allocate memory (0 Mb) in C function 'R_AllocStringBuffer'

Here is what I have so far; I could first compute the sum of Val1 and Val2 and then take the lead and lag of that, but it wouldn't solve the problem.

df0 %>% 
  ##arrange by group is not necessary since we're grouping by that var
  arrange(group, Time) %>% 
  group_by(group) %>% 
  mutate(sum = Val1 + Val2 + lag(Val1) + lag(Val2) + lead(Val1) + lead(Val2)) # %>% 
  # filter(is.na(sum) | sum != 0)
  ## commenting out filter to show the full results
# >  # A tibble: 13 x 5
# >  # Groups:   group [2]
# >  group  Time  Val1  Val2   sum
# >  <chr> <int> <dbl> <dbl> <dbl>
# >  1  A   1     0     0      NA  
# !  -  A   2     0     0      0  
# >  2  A   3     0     0      0.1
# >  3  A   4     0     0.1    0.1
# >  4  A   5     0     0      0.1
# !  -  A   6     0     0      0  
# >  5  A   7     0     0      NA  
# >  6  B   1     0.1   0      NA  
# >  7  B   2     0.2   0.2    0.5
# >  8  B   3     0     0      0.4
# >  9  B   4     0     0      0.3
# >  10 B   5     0.1   0.2    0.9
# >  11 B   6     0.1   0.5    NA  

Toy dataset:

df0 <- structure(list(group = c("A", "B", "A", "B", "A", "B", 
                                "B", "A", "A", "A", "B", "B", "A"),
                      Time = c(1L, 1L, 3L, 3L, 2L, 2L, 4L, 4L, 5L, 6L, 6L, 5L, 7L), 
                      Val1 = c(0, 0.1, 0, 0, 0, 0.2, 0, 0, 0, 0, 0.1, 0.1, 0), 
                      Val2 = c(0, 0, 0, 0, 0, 0.2, 0, 0.1, 0, 0, 0.5, 0.2, 0)), 
                 row.names = c(NA, -13L), 
                 class = c("tbl_df", "tbl", "data.frame"))

library(tidyverse)
df0 %>%
  arrange(group, Time) %>%  # EDIT to arrange by time (and group for clarity)
  rowwise() %>%
  mutate(sum = sum(c_across(Val1:Val2))) %>%
  group_by(group) %>%
  filter( !(sum == 0 & lag(sum, default = 1) == 0 & lead(sum, default = 1) == 0)) %>%
  ungroup()

# A tibble: 11 x 5
   group  Time  Val1  Val2   sum
   <chr> <int> <dbl> <dbl> <dbl>
 1 A         1   0     0     0  
 2 A         3   0     0     0  
 3 A         4   0     0.1   0.1
 4 A         5   0     0     0  
 5 A         7   0     0     0  
 6 B         1   0.1   0     0.1
 7 B         2   0.2   0.2   0.4
 8 B         3   0     0     0  
 9 B         4   0     0     0  
10 B         5   0.1   0.2   0.3
11 B         6   0.1   0.5   0.6

We can use base rle, or its faster implementation rlenc from the purler package (a base-R sketch using rle() is shown after the benchmark below).

library(tidyverse)
library(purler)
subsetter <- function(df){
  df %>%
    select(where(is.double)) %>%
    rowSums() %>%
    purler::rlenc() %>%
    filter(lengths >= 3L & values == 0L) %>%
    transmute(ids = map2(start, start + lengths, ~ (.x + 1) : (.y - 2))) %>%
    unlist(use.names = F)
}
# to get data as shown in example
df0 <- df0 %>%
  mutate(Time = as.character(Time)) %>%
  arrange(group, Time)

edge_cases <- tribble(
  ~group, ~Time, ~Val1, ~Val2,
  "C", "1", 0, 0,
  "C", "2", 0, 0,
  "C", "3", 0, 0,
  "C", "4", 0, 0,
)

df1 <- rbind(df0, edge_cases)
df1 %>%
  `[`(-subsetter(.),)

# A tibble: 13 x 4
   group Time   Val1  Val2
   <chr> <chr> <dbl> <dbl>
 1 A     1       0     0  
 2 A     3       0     0  
 3 A     4       0     0.1
 4 A     5       0     0  
 5 A     7       0     0  
 6 B     1       0.1   0  
 7 B     2       0.2   0.2
 8 B     3       0     0  
 9 B     4       0     0  
10 B     5       0.1   0.2
11 B     6       0.1   0.5
12 C     1       0     0  
13 C     4       0     0  
bench::mark(df1 %>% `[`(-subsetter(.),))[,c(3,5,7)]
# A tibble: 1 x 3
    median mem_alloc n_itr
  <bch:tm> <bch:byt> <int>
1   3.91ms    9.38KB    93
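
For reference, here is a minimal base-R sketch of the same idea built on rle() instead of purler::rlenc(); subsetter_base is hypothetical and not part of the benchmarked answer:

# hypothetical base-R variant of subsetter(), using rle() on the row sums
subsetter_base <- function(df){
  rs <- rowSums(df[sapply(df, is.double)])      # per-row sum of the double columns
  r  <- rle(rs == 0)                            # runs of all-zero rows
  starts <- cumsum(c(1L, head(r$lengths, -1L))) # first row index of each run
  keep   <- r$values & r$lengths >= 3L          # zero-runs of length >= 3
  # within each such run, drop everything except its first and last row
  unlist(Map(function(s, l) (s + 1L):(s + l - 2L), starts[keep], r$lengths[keep]),
         use.names = FALSE)
}

# used the same way as above, e.g. df1 %>% `[`(-subsetter_base(.), )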

Since you tagged data.table, here is a data.table-native solution:

library(data.table)
dt0 <- as.data.table(df0)
setorder(dt0, Time) # add 'group' if you want

isnum <- names(which(sapply(dt0, function(z) is.numeric(z) & !is.integer(z))))
isnum
# [1] "Val1" "Val2"

dt0[, sum0 := abs(rowSums(.SD)) < 1e-9, .SDcols = isnum
  ][, .SD[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3,], by = .(group)
  ][, sum0 := NULL ][]
#      group  Time  Val1  Val2
#     <char> <int> <num> <num>
#  1:      A     1   0.0   0.0
#  2:      A     3   0.0   0.0
#  3:      A     4   0.0   0.1
#  4:      A     5   0.0   0.0
#  5:      A     7   0.0   0.0
#  6:      B     1   0.1   0.0
#  7:      B     2   0.2   0.2
#  8:      B     3   0.0   0.0
#  9:      B     4   0.0   0.0
# 10:      B     5   0.1   0.2
# 11:      B     6   0.1   0.5

As per your comment, both A-2 and A-6 are removed.

Efficiency:

  • rowSums is fast and efficient;
  • we use direct indexing for the shifts, with a default of 0; in data.table this is handled very efficiently and avoids the (admittedly small) overhead of lead/lag/shift calls (see the quick check after this list);
  • after summing each row once, we only row-shift that single value, instead of doing four row-shifts per row.
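
As a quick standalone check that the direct-indexing lag/lead with a default of 0, as used above, agrees with data.table::shift() (the sum0 vector here is made up for illustration):

library(data.table)

sum0 <- c(TRUE, TRUE, FALSE, TRUE)   # hypothetical per-row "all zeros" flags
n <- length(sum0)

# lag via direct indexing vs shift(); both pad the first position with 0/FALSE
identical(as.numeric(c(0, sum0[-n])), as.numeric(shift(sum0, 1, fill = 0)))
# [1] TRUE
# lead via direct indexing vs shift(); both pad the last position with 0/FALSE
identical(as.numeric(c(sum0[-1], 0)), as.numeric(shift(sum0, -1, fill = 0)))
# [1] TRUE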

Edit, for a slight performance improvement (15-20%):

dt0[
  dt0[, sum0 := abs(rowSums(.SD)) < 1e-9, .SDcols = isnum
    ][, .I[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3], by=group ]$V1
][, sum0 := NULL][]

Admittedly, this may be a little harder to follow, but it produces the same results in about 82% of the time (using this dataset). Thanks to @Henrik for helping me learn more about .I and its benefits.
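
For intuition, here is a small illustration (not part of the original answer; dt_demo is just a throwaway copy) of what the inner .I expression returns: the original row numbers per group, collected in data.table's default V1 column, so dt0 only needs to be indexed once:

library(data.table)
dt_demo <- as.data.table(df0)

# .I holds the original row numbers of the rows within each group,
# so returning a subset of .I per group yields the indices of the rows to keep
keep_idx <- dt_demo[, .I[Val1 + Val2 > 0], by = group]$V1
dt_demo[keep_idx]   # one plain subset instead of [, .SD[...], by = group]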

You can try the following data.table option:

setorder(setDT(df0), group, Time)[
  ,
  rs := rowSums(Filter(is.double, .SD))
][, .SD[!(rs == 0 & .N > 2 & (!rowid(rs) %in% c(1, .N)))], rleid(rs)][
  ,
  rleid := NULL
][]

which gives

    group Time Val1 Val2
 1:     A    1  0.0  0.0
 2:     A    3  0.0  0.0
 3:     A    4  0.0  0.1
 4:     A    5  0.0  0.0
 5:     A    7  0.0  0.0
 6:     B    1  0.1  0.0
 7:     B    2  0.2  0.2
 8:     B    3  0.0  0.0
 9:     B    4  0.0  0.0
10:     B    5  0.1  0.2
11:     B    6  0.1  0.5

This solution is mainly inspired by @r2evans. It uses Reduce + shift instead of @r2evans' solutions based on rowSums and the c() function. I think the improvement comes from using Reduce(`+`, .SD) instead of rowSums(.SD) on the data.frame/data.table (and from avoiding [, .SD[...], ...] with data.table syntax); it is faster (at least on my PC) and more memory-efficient (no conversion to a matrix is needed). Caveat: there is no direct equivalent of rowSums(.SD, na.rm=TRUE).
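
As a tiny standalone check (the toy columns here are made up) that Reduce(`+`, .SD) produces the same row sums as rowSums(.SD) when there are no NAs:

library(data.table)
dt_check <- data.table(Val1 = c(0, 0.1, 0), Val2 = c(0, 0.2, 0))

# Reduce adds the columns pairwise as plain vectors, while rowSums
# first converts .SD to a matrix
dt_check[, .(reduce_sum = Reduce(`+`, .SD), row_sum = rowSums(.SD)),
         .SDcols = c("Val1", "Val2")]
#    reduce_sum row_sum
# 1:        0.0     0.0
# 2:        0.3     0.3
# 3:        0.0     0.0

Setup for the benchmark, drawing a 10-million-row sample from the toy data: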
n = 1e7
dt0 = setDT(df0[sample(nrow(df0), n, replace=TRUE), ])
setorder(dt0, group, Time)
isnum = sapply(dt0, function(x) is.numeric(x) && !is.integer(x))
eps = sqrt(.Machine$double.eps)

# New solution
f1 = function() {
  ans = dt0[, is0 := {sum0 = abs(Reduce(`+`, .SD)) < eps; Reduce(`+`, shift(sum0, -1:1, fill=0)) < 3}, 
            by=group, .SDcols=isnum][(is0), !"is0"]
  
  dt0[, is0 := NULL] # remove is0 from the initial dataset
  ans
}

# similar to f1: easily adaptable to rowSums(.SD, na.rm=TRUE).
f2 = function() {
  # here I replace Reduce(`+`, .SD) with rowSums(.SD) just in case its na.rm argument is needed.
  ans = dt0[, is0 := {sum0 = abs(rowSums(.SD)) < eps; Reduce(`+`, shift(sum0, -1:1, fill=0)) < 3}, 
            by=group, .SDcols=isnum][(is0), !"is0"]
  
  dt0[, is0:=NULL] # remove is0 from the initial dataset
  ans
}

# r2evans first solution
f3 = function() {
  ans = dt0[
    dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
    ][, .I[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3], by=group ]$V1
  ][, sum0 := NULL][]
  
  dt0[, sum0 := NULL] # remove sum0 from the initial dataset
  ans
}

# r2evans second solution
f4 = function() {
  ans = dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
  ][, .SD[(c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3,], by = .(group)
  ][, sum0 := NULL ][]
  
  dt0[, sum0:=NULL] # remove sum0 from the initial dataset
  ans
}

# modified version of r2evans second solution: similar to f4 but avoid [, .SD[...], by=group]
f5 = function() {
  ans = dt0[, sum0 := abs(rowSums(.SD)) < eps, .SDcols = isnum
  ][, sum0 := (c(0,sum0[-.N]) + sum0 + c(sum0[-1],0)) < 3, by = .(group)
  ][(sum0), !"sum0"][]
  
  dt0[, sum0:=NULL] # remove sum0 from the initial dataset
  ans
}

Benchmark

bench::mark(
  f1(),
  f2(),
  f3(),
  f4(),
  f5(),
  iterations=5L, check=FALSE
)

# A tibble: 5 x 13
  expression    min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
  <bch:expr> <bch:> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 f1()        347ms  406ms      2.49  698.47MB     5.48     5    11      2.01s
2 f2()        529ms  578ms      1.69  851.02MB     4.06     5    12      2.96s
3 f3()        717ms  821ms      1.22    1.25GB     3.40     5    14      4.12s
4 f4()        889ms  956ms      1.04    1.57GB     5.01     5    24      4.79s
5 f5()        642ms  677ms      1.40    1.07GB     3.37     5    12      3.56s
 

Based on these results, the first solution is more than 2x faster than f3 and f4, and also more memory-efficient.

I'm using the development version of data.table (1.14.3).