根据两列条件的行序列过滤数据框
Filter dataframe on sequence of rows conditional on two columns
我有这种类型的数据,其中 Sequ
列中的数值定义了一个行序列,Q
中的字符值命名了序列的类型:
# Example transcript: `Sequ` numbers the rows that belong to a sequence (NA for
# rows outside any sequence); `Q` carries the sequence type on its first row.
df <- data.frame(
  Line = seq_len(12),
  Speaker = c(
    NA, "ID01.A", NA, "ID01.B", "ID07.A", NA,
    "ID33.B", "ID33.A", "ID33.C", NA, "ID77.A", "ID77.C"
  ),
  Utterance = c(
    NA, "Who did it?", "(1.99)", "Peter did.", "Hello!", NA,
    "So you're coming?", "erm", "Yes, sure.", "(0.22)",
    "Good night?", "Yeah, sleep well"
  ),
  Sequ = c(NA, rep(1, 3), NA, NA, rep(2, 3), NA, rep(3, 2)),
  Q = c(NA, "q_wh", rep("", 2), NA, NA, "q_decl", rep("", 2), NA, "q_wh", "")
)
我想筛选出 Sequ 不为 NA、并且所属序列的 Q 为 "q_wh" 的那些行。
我可以先用 na_if 把空字符串转换为 NA,再用 fill 向下填充,来完成这个任务:
# Forward-fill the sequence label, then keep the rows of "q_wh" sequences.
# dplyr supplies mutate(), na_if() and filter(); tidyr supplies fill().
library(dplyr)
library(tidyr)
df %>%
  mutate(Q = na_if(Q, "")) %>%        # blank labels become NA so fill() can replace them
  fill(Q, .direction = "down") %>%    # carry each label down to the rows that follow it
  filter(!is.na(Sequ) & Q == "q_wh")  # rows inside a sequence whose label is "q_wh"
Line Speaker Utterance Sequ Q
1 2 ID01.A Who did it? 1 q_wh
2 3 <NA> (1.99) 1 q_wh
3 4 ID01.B Peter did. 1 q_wh
4 11 ID77.A Good night? 3 q_wh
5 12 ID77.C Yeah, sleep well 3 q_wh
但是有没有另一种更直接的方法,不用绕到 na_if
和 fill
来过滤 df
?
编辑:
我找到了一个没有 fill
和 na_if
的解决方案:
# Label every row of a sequence with that sequence's first non-blank Q value,
# then keep the rows of "q_wh" sequences.
library(dplyr)
df %>%
  group_by(Sequ) %>%
  # Pick the first non-blank label explicitly: Q[!Q == ""] only works when a
  # group has exactly one such value (or one per row) and errors otherwise.
  mutate(Q = Q[!is.na(Q) & Q != ""][1]) %>%
  ungroup() %>%  # drop the grouping so later steps see an ungrouped table
  filter(!is.na(Sequ) & Q == "q_wh")
使用 ave 按 Sequ 分组,把每组的 Q 替换为该组的第一个 Q 值,最后用 subset 进行筛选。
# Base-R pipeline: restrict to rows inside a sequence, copy each sequence's
# first Q label onto all of its rows, then keep the "q_wh" sequences.
df |>
  subset(!is.na(Sequ)) |>
  transform(Q = ave(Q, Sequ, FUN = function(q) q[1L])) |>
  subset(!is.na(Sequ) & Q == "q_wh")
# Line Speaker Utterance Sequ Q
# 2 2 ID01.A Who did it? 1 q_wh
# 3 3 <NA> (1.99) 1 q_wh
# 4 4 ID01.B Peter did. 1 q_wh
# 11 11 ID77.A Good night? 3 q_wh
# 12 12 ID77.C Yeah, sleep well 3 q_wh
注:以上代码使用了原生管道 |>,需要 R 4.1.0 及以上版本(此处使用的是 R version 4.1.2 (2021-11-01))。
或者使用 tidyverse 方言。
# Same approach with the magrittr pipe and dplyr verbs: keep rows inside a
# sequence, copy each sequence's first Q label to all of its rows, then filter.
library(magrittr)
df[!is.na(df$Sequ), ] %>%
  dplyr::mutate(
    Q = ave(Q, Sequ, FUN = function(q) q[1L])
  ) %>%
  dplyr::filter(
    !is.na(Sequ) & Q == "q_wh"
  )
# Line Speaker Utterance Sequ Q
# 1 2 ID01.A Who did it? 1 q_wh
# 2 3 <NA> (1.99) 1 q_wh
# 3 4 ID01.B Peter did. 1 q_wh
# 4 11 ID77.A Good night? 3 q_wh
# 5 12 ID77.C Yeah, sleep well 3 q_wh
数据:
# Example data written out as a literal data.frame structure, one column per
# entry, so it can be pasted and run as-is.
df <- structure(
  list(
    Line = 1:12,
    Speaker = c(
      NA, "ID01.A", NA, "ID01.B", "ID07.A", NA,
      "ID33.B", "ID33.A", "ID33.C", NA, "ID77.A", "ID77.C"
    ),
    Utterance = c(
      NA, "Who did it?", "(1.99)", "Peter did.", "Hello!", NA,
      "So you're coming?", "erm", "Yes, sure.", "(0.22)",
      "Good night?", "Yeah, sleep well"
    ),
    Sequ = c(NA, 1, 1, 1, NA, NA, 2, 2, 2, NA, 3, 3),
    Q = c(NA, "q_wh", "", "", NA, NA, "q_decl", "", "", NA, "q_wh", "")
  ),
  class = "data.frame",
  row.names = c(NA, -12L)
)
我有这种类型的数据,其中 Sequ
列中的数值定义了一个行序列,Q
中的字符值命名了序列的类型:
# Example transcript: `Sequ` numbers the rows that belong to a sequence (NA for
# rows outside any sequence); `Q` carries the sequence type on its first row.
df <- data.frame(
  Line = seq_len(12),
  Speaker = c(
    NA, "ID01.A", NA, "ID01.B", "ID07.A", NA,
    "ID33.B", "ID33.A", "ID33.C", NA, "ID77.A", "ID77.C"
  ),
  Utterance = c(
    NA, "Who did it?", "(1.99)", "Peter did.", "Hello!", NA,
    "So you're coming?", "erm", "Yes, sure.", "(0.22)",
    "Good night?", "Yeah, sleep well"
  ),
  Sequ = c(NA, rep(1, 3), NA, NA, rep(2, 3), NA, rep(3, 2)),
  Q = c(NA, "q_wh", rep("", 2), NA, NA, "q_decl", rep("", 2), NA, "q_wh", "")
)
我想筛选出 Sequ 不为 NA、并且所属序列的 Q 为 "q_wh" 的那些行。
我可以先用 na_if 把空字符串转换为 NA,再用 fill 向下填充,来完成这个任务:
# Forward-fill the sequence label, then keep the rows of "q_wh" sequences.
# dplyr supplies mutate(), na_if() and filter(); tidyr supplies fill().
library(dplyr)
library(tidyr)
df %>%
  mutate(Q = na_if(Q, "")) %>%        # blank labels become NA so fill() can replace them
  fill(Q, .direction = "down") %>%    # carry each label down to the rows that follow it
  filter(!is.na(Sequ) & Q == "q_wh")  # rows inside a sequence whose label is "q_wh"
Line Speaker Utterance Sequ Q
1 2 ID01.A Who did it? 1 q_wh
2 3 <NA> (1.99) 1 q_wh
3 4 ID01.B Peter did. 1 q_wh
4 11 ID77.A Good night? 3 q_wh
5 12 ID77.C Yeah, sleep well 3 q_wh
但是有没有另一种更直接的方法,不用绕到 na_if
和 fill
来过滤 df
?
编辑:
我找到了一个没有 fill
和 na_if
的解决方案:
# Label every row of a sequence with that sequence's first non-blank Q value,
# then keep the rows of "q_wh" sequences.
library(dplyr)
df %>%
  group_by(Sequ) %>%
  # Pick the first non-blank label explicitly: Q[!Q == ""] only works when a
  # group has exactly one such value (or one per row) and errors otherwise.
  mutate(Q = Q[!is.na(Q) & Q != ""][1]) %>%
  ungroup() %>%  # drop the grouping so later steps see an ungrouped table
  filter(!is.na(Sequ) & Q == "q_wh")
使用 ave 按 Sequ 分组,把每组的 Q 替换为该组的第一个 Q 值,最后用 subset 进行筛选。
# Base-R pipeline: restrict to rows inside a sequence, copy each sequence's
# first Q label onto all of its rows, then keep the "q_wh" sequences.
df |>
  subset(!is.na(Sequ)) |>
  transform(Q = ave(Q, Sequ, FUN = function(q) q[1L])) |>
  subset(!is.na(Sequ) & Q == "q_wh")
# Line Speaker Utterance Sequ Q
# 2 2 ID01.A Who did it? 1 q_wh
# 3 3 <NA> (1.99) 1 q_wh
# 4 4 ID01.B Peter did. 1 q_wh
# 11 11 ID77.A Good night? 3 q_wh
# 12 12 ID77.C Yeah, sleep well 3 q_wh
注:以上代码使用了原生管道 |>,需要 R 4.1.0 及以上版本(此处使用的是 R version 4.1.2 (2021-11-01))。
或者使用 tidyverse 方言。
# Same approach with the magrittr pipe and dplyr verbs: keep rows inside a
# sequence, copy each sequence's first Q label to all of its rows, then filter.
library(magrittr)
df[!is.na(df$Sequ), ] %>%
  dplyr::mutate(
    Q = ave(Q, Sequ, FUN = function(q) q[1L])
  ) %>%
  dplyr::filter(
    !is.na(Sequ) & Q == "q_wh"
  )
# Line Speaker Utterance Sequ Q
# 1 2 ID01.A Who did it? 1 q_wh
# 2 3 <NA> (1.99) 1 q_wh
# 3 4 ID01.B Peter did. 1 q_wh
# 4 11 ID77.A Good night? 3 q_wh
# 5 12 ID77.C Yeah, sleep well 3 q_wh
数据:
# Example data written out as a literal data.frame structure, one column per
# entry, so it can be pasted and run as-is.
df <- structure(
  list(
    Line = 1:12,
    Speaker = c(
      NA, "ID01.A", NA, "ID01.B", "ID07.A", NA,
      "ID33.B", "ID33.A", "ID33.C", NA, "ID77.A", "ID77.C"
    ),
    Utterance = c(
      NA, "Who did it?", "(1.99)", "Peter did.", "Hello!", NA,
      "So you're coming?", "erm", "Yes, sure.", "(0.22)",
      "Good night?", "Yeah, sleep well"
    ),
    Sequ = c(NA, 1, 1, 1, NA, NA, 2, 2, 2, NA, 3, 3),
    Q = c(NA, "q_wh", "", "", NA, NA, "q_decl", "", "", NA, "q_wh", "")
  ),
  class = "data.frame",
  row.names = c(NA, -12L)
)