在列中首次出现 0 后删除组的后续行

Remove subsequent rows of a group after first occurence of 0 in a column

我有 data.table 如下:-

data <- data.table(k = c("a", "a", "a", "a", "b", "b", "c", "c", "c", "d"),
               year = c(2011, 2012, 2013, 2014, 2012, 2013, 2014, 2015, 2016, 2001),
               grow_bool = c(1, 1, 0, 1, 0, 1, 1, 0, 1, 0))

#    k year grow_bool
# 1: a 2011         1
# 2: a 2012         1
# 3: a 2013         0
# 4: a 2014         1
# 5: b 2012         0
# 6: b 2013         1
# 7: c 2014         1
# 8: c 2015         0
# 9: c 2016         1
#10: d 2001         0

现在,我想删除在列 grow_bool 中具有 0 的行以及每个 k 之后的后续行。例如:- 2013 中的 agrow_bool 中有一个 0,因此,应删除所有这一行以及该行之后 a 的所有行. b 的所有行都将被删除,因为 b 的第一行包含 0d 只有一行,它将被删除,因为它有 0.

结果data.table应该是下面的形式:-

data_2 <- data.table(k = c(a, a, c),
                     year = c(2011, 2012, 2014),
                     grow_bool = c(1, 1, 1))

#    k year grow_bool
# 1: a 2011         1
# 2: a 2012         1
# 3: c 2014         1

提前致谢。

data[!data[, cumsum(grow_bool == 0) > 0, by = .(k)]$V1, ]
#    k year grow_bool
# 1: a 2011         1
# 2: a 2012         1
# 3: c 2014         1
library(data.table)
library(magrittr)
data <-
  data.table(
    k = c("a", "a", "a", "a", "b", "b", "c", "c", "c", "d"),
    year = c(2011, 2012, 2013, 2014, 2012, 2013, 2014, 2015, 2016, 2001),
    grow_bool = c(1, 1, 0, 1, 0, 1, 1, 0, 1, 0)
  )

data[, fltr := cumsum(grow_bool == 0), by = k] %>% 
  .[fltr == 0] %>% 
  .[, fltr := NULL] %>% 
  .[]
#>    k year grow_bool
#> 1: a 2011         1
#> 2: a 2012         1
#> 3: c 2014         1

reprex package (v2.0.0)

于 2021-07-23 创建

这是另一个使用 cummin 的选项:

DT[DT[, .I[cummin(value) >= 1L], ID]$V1]

DT[DT[, cummin(value) >= 1L, ID]$V1]

时间取决于数据集中的组数。以下是有大量组时的一些时间安排:

library(data.table)
set.seed(0L)
nr <- 1e7L
ng <- 5e5L
DT <- data.table(ID=sample(ng, nr, TRUE), value=sample(0:1, nr, TRUE))
setorder(DT, ID)

microbenchmark::microbenchmark(times=3L, 
    check=function(values) {
        all(sapply(values[-1L], function(x) fsetequal(values[[1L]], x)))
    },

    DT[!DT[, cumsum(value==0L) > 0L, ID]$V1],
    DT[-DT[, .I[cumsum(value==0L)>0L], ID]$V1],
    DT[DT[, .I[cumsum(value==0L)==0L], ID]$V1],
    DT[DT[, .I[cummin(value) >= 1L], ID]$V1],
    DT[DT[, cummin(value) >= 1L, ID]$V1])

时间:

Unit: milliseconds
                                           expr       min        lq      mean    median        uq       max neval
     DT[!DT[, cumsum(value == 0L) > 0L, ID]$V1]  766.2710  774.0762  796.6203  781.8814  811.7950  841.7086     3
 DT[-DT[, .I[cumsum(value == 0L) > 0L], ID]$V1] 1040.6440 1086.4132 1105.8104 1132.1824 1138.3936 1144.6047     3
 DT[DT[, .I[cumsum(value == 0L) == 0L], ID]$V1]  774.5525  828.6320  851.2925  882.7115  889.6626  896.6136     3
       DT[DT[, .I[cummin(value) >= 1L], ID]$V1]  723.6663  727.0806  731.0786  730.4950  734.7848  739.0745     3
           DT[DT[, cummin(value) >= 1L, ID]$V1]  657.1164  661.7835  664.8258  666.4506  668.6805  670.9103     3

检查:

a0 <- DT[!DT[, cumsum(value==0L) > 0L, ID]$V1]
a1 <- DT[DT[, .I[cumsum(value==0L)==0L], ID]$V1]
a2 <- DT[-DT[, .I[cumsum(value==0L)>0L], ID]$V1]
a3 <- DT[DT[, .I[cummin(value) >= 1L], ID]$V1]
a4 <- DT[DT[, cummin(value) >= 1L, ID]$V1]
fsetequal(a0, a1)
#[1] TRUE
fsetequal(a0, a2)
#[1] TRUE
fsetequal(a0, a3)
#[1] TRUE
fsetequal(a0, a4)
#[1] TRUE