根据两列条件的行序列过滤数据框

Filter dataframe on sequence of rows conditional on two columns

我有这种类型的数据,其中 Sequ 列中的数值定义了一个行序列,Q 中的字符值命名了序列的类型:

# Example transcript. `Sequ` numbers the rows that belong to one sequence
# (NA = row outside any sequence); `Q` carries the sequence type on the first
# row of a sequence and is "" on the remaining rows of that sequence.
df <- data.frame(
  Line = seq_len(12L),
  Speaker = c(
    NA, "ID01.A", NA, "ID01.B",
    "ID07.A", NA,
    "ID33.B", "ID33.A", "ID33.C",
    NA, "ID77.A", "ID77.C"
  ),
  Utterance = c(
    NA, "Who did it?", "(1.99)", "Peter did.",
    "Hello!", NA,
    "So you're coming?", "erm", "Yes, sure.",
    "(0.22)", "Good night?", "Yeah, sleep well"
  ),
  Sequ = c(
    NA, 1, 1, 1,
    NA, NA,
    2, 2, 2,
    NA, 3, 3
  ),
  Q = c(
    NA, "q_wh", "", "",
    NA, NA,
    "q_decl", "", "",
    NA, "q_wh", ""
  )
)

我想根据 Sequ 的值(非 NA)以及 Q == "q_wh" 这两个条件对数据框取子集。我可以先使用 na_if、再使用 fill 来完成这个任务:

# mutate(), na_if() and the data-frame filter() come from dplyr; tidyr only
# supplies fill() (and re-exports %>%), so dplyr has to be attached as well.
library(dplyr)
library(tidyr)

# Turn the blank Q entries into NA so that fill() can carry each sequence's
# label down onto the following rows, then keep the rows that belong to a
# sequence (non-NA Sequ) whose label is "q_wh".
df %>%
  mutate(Q = na_if(Q, "")) %>%
  fill(Q, .direction = "down") %>%
  filter(!is.na(Sequ) & Q == "q_wh")
  Line Speaker        Utterance Sequ    Q
1    2  ID01.A      Who did it?    1 q_wh
2    3    <NA>           (1.99)    1 q_wh
3    4  ID01.B       Peter did.    1 q_wh
4   11  ID77.A      Good night?    3 q_wh
5   12  ID77.C Yeah, sleep well    3 q_wh

但是,有没有另一种更直接的方法,不必绕道使用 na_if 和 fill 就能过滤 df?

编辑:

我找到了一个不需要 fill 和 na_if 的解决方案:

library(dplyr)

# Label every row of a sequence with that sequence's first non-empty Q value,
# then keep the sequences whose label is "q_wh".
df %>%
  # Drop rows outside any sequence first, so they never form a group.
  filter(!is.na(Sequ)) %>%
  group_by(Sequ) %>%
  # first() yields exactly one value per group (NA when the group has no
  # non-empty Q), so the assignment cannot fail on a length mismatch the way
  # a bare Q[Q != ""] does when a group has zero or several labels.
  mutate(Q = first(Q[!is.na(Q) & Q != ""])) %>%
  # Return an ungrouped result so later steps are not silently done per group.
  ungroup() %>%
  filter(Q == "q_wh")

使用 ave 按 "Sequ" 分组,把每组的 Q 替换为该组的第一个 Q 值,最后再用 subset 过滤:

# Base R only: keep the rows that belong to a sequence, overwrite Q with the
# first Q value of each Sequ group, then keep the "q_wh" sequences.
# Sequ has no NA left after the first step, so the last step tests Q alone.
subset(df, !is.na(Sequ)) |>
  transform(Q = ave(Q, Sequ, FUN = function(q) q[1L])) |>
  subset(Q == "q_wh")
#    Line Speaker        Utterance Sequ    Q
# 2     2  ID01.A      Who did it?    1 q_wh
# 3     3    <NA>           (1.99)    1 q_wh
# 4     4  ID01.B       Peter did.    1 q_wh
# 11   11  ID77.A      Good night?    3 q_wh
# 12   12  ID77.C Yeah, sleep well    3 q_wh

注:以上代码在 R version 4.1.2 (2021-11-01) 下运行;原生管道 |> 需要 R 4.1 或更高版本。

或者使用 tidyverse 方言。

library(magrittr)

# Same approach with dplyr verbs: keep the rows that belong to a sequence,
# overwrite Q with the first Q value of each Sequ group, then keep the "q_wh"
# sequences. Sequ has no NA left after the first step, so the last step tests
# Q alone.
df %>%
  dplyr::filter(!is.na(Sequ)) %>%
  dplyr::mutate(Q = ave(Q, Sequ, FUN = function(q) q[1L])) %>%
  dplyr::filter(Q == "q_wh")
#   Line Speaker        Utterance Sequ    Q
# 1    2  ID01.A      Who did it?    1 q_wh
# 2    3    <NA>           (1.99)    1 q_wh
# 3    4  ID01.B       Peter did.    1 q_wh
# 4   11  ID77.A      Good night?    3 q_wh
# 5   12  ID77.C Yeah, sleep well    3 q_wh

数据:

# Reproducible copy of the example data, spelled out as an explicit
# structure() call: a list of five columns plus the data.frame class and
# compact automatic row names for 12 rows.
df <- structure(
  list(
    Line = 1:12,
    Speaker = c(
      NA, "ID01.A", NA, "ID01.B", "ID07.A", NA,
      "ID33.B", "ID33.A", "ID33.C", NA, "ID77.A", "ID77.C"
    ),
    Utterance = c(
      NA, "Who did it?", "(1.99)", "Peter did.", "Hello!", NA,
      "So you're coming?", "erm", "Yes, sure.", "(0.22)", "Good night?",
      "Yeah, sleep well"
    ),
    Sequ = c(NA, 1, 1, 1, NA, NA, 2, 2, 2, NA, 3, 3),
    Q = c(NA, "q_wh", "", "", NA, NA, "q_decl", "", "", NA, "q_wh", "")
  ),
  class = "data.frame",
  row.names = c(NA, -12L)
)