如何滚动过滤?
How do I filter on a rolling basis?
如何在滚动的基础上有条件地filter/select相关观察?
组 1 到 52 是基线。
- 然后在第53组中,我想过滤掉所有出现在第1到52组中的ID
- 然后对于第54组,我想过滤掉所有出现在第2到53组的ID
- 然后对于第 55 组,我想过滤掉第 3 到 54 组中出现的所有 ID
- 等等等等。基本上数据集有组和一个 ID,我正在尝试 select 相关的 ID。
下面的代码手动创建了一个示例数据集,其中 final_example_data
是起始数据(输入),expected_output
是预期输出。
# Build the example input directly in base R, so the snippet runs without
# dplyr being attached. Each data frame lists the Groups in which one ID occurs
# (ascending, integer), which is the same result as filtering a 1:55 frame.

# Negated %in% helper (kept from the original; not used further below).
`%!in%` <- Negate(`%in%`)

# ID 1 occurs only inside the baseline (Groups 1 to 52).
example_data <- data.frame(
  Groups = c(1L, 4L, 7L, 10L, 11L, 15L, 44L, 52L),
  ID = 1
)

# ID 2 occurs inside the baseline and again in Groups 55.
example_data2 <- data.frame(
  Groups = c(1L, 3L, 5L, 7L, 8L, 11L, 15L, 33L, 41L, 44L, 55L),
  ID = 2
)

# ID 7 occurs only in Groups 53.
example_data3 <- data.frame(Groups = 53L, ID = 7)

# ID 4 occurs only in Groups 54.
example_data4 <- data.frame(Groups = 54L, ID = 4)

# ID 0 occurs in Groups 53, 54 and 55.
example_data5 <- data.frame(Groups = 53:55, ID = 0)

final_example_data <- rbind(
  example_data,
  example_data2,
  example_data3,
  example_data4,
  example_data5
)
# IDs 1 and 2 both occur somewhere in Groups 1 to 52, whereas IDs 7 and 0 do
# not, so 7 and 0 are the rows to keep for Groups 53.
no_present_in_1_52 <- final_example_data %>%
  filter(ID %in% c(7, 0), Groups <= 53)

# ID 4 is absent from Groups 2 to 53 but present in Groups 54.
not_present_in_Groups_2_53 <- final_example_data %>%
  filter(ID == 4)

# The Groups 55 rows: their IDs are visibly present in Groups 3 to 54, so they
# are left out and Groups 55 contributes nothing to the final output.
not_present_in_Groups3_to_54 <- final_example_data %>%
  filter(Groups > 54)

expected_output <- rbind(not_present_in_Groups_2_53, no_present_in_1_52)
编辑:
# Extra case: ID 88 occurs in Groups 1 and then again in Groups 54.
example_data6 <- data.frame(Groups = 1, ID = 88)
example_data7 <- data.frame(Groups = 54, ID = 88)
final_example_data <- rbind(final_example_data, example_data6, example_data7)
# Groups 54 paired with ID 88 is expected in the results, because ID 88 is
# absent from Groups 2 to 53.
为了清楚起见,我将 final_example_data
重命名为 fed
:
data.table
library(data.table)
# For every post-baseline group g, keep the rows whose ID does not occur in the
# 52 preceding groups, i.e. groups (g - 52) through (g - 1), both inclusive.
# The original used exclusive bounds on (g - 52, g), which dropped group g - 52
# from the window (e.g. group 1 was ignored when evaluating group 53).
setDT(fed)[
  i = Groups > 52,
  j = .SD[!ID %in% fed[between(Groups, .BY$Groups - 52, .BY$Groups - 1), ID]],
  by = Groups
]
Groups ID
1: 53 7
2: 53 0
3: 54 4
或者使用基础 R(base R):
- 确定超出基线的组值
# Distinct group values that lie beyond the 52-group baseline.
target_groups <- unique(with(fed, Groups[Groups > 52]))
- 遍历这些组,每次检查该组的 ID 是否出现在它之前的滚动窗口内各组的 ID 中;然后把得到的 data.frame
列表按行合并(rbind)
# For each target group x, keep the IDs of group x that do not occur in the
# 52 preceding groups (x - 52 through x - 1, inclusive), then row-bind the
# per-group data frames. Groups with no surviving ID contribute NULL, which
# rbind() drops.
do.call(rbind, lapply(target_groups, function(x) {
  # Lower bound is inclusive: the window for group 53 must contain group 1.
  in_window <- fed$Groups >= (x - 52) & fed$Groups < x
  id <- fed$ID[fed$Groups == x]
  id <- id[!id %in% fed$ID[in_window]]
  if (length(id) > 0) {
    data.frame(Group = x, ID = id)
  } else {
    NULL
  }
}))
输出:
Group ID
1 53 7
2 53 0
3 54 4
你可以试试这个tidyverse
方法-
library(dplyr)
library(purrr)
# Number of groups in the baseline, which is also the rolling window length.
baseline <- 52
# For every group g after the baseline, keep the rows of group g whose ID does
# not occur in the rolling window of groups (g - baseline) through (g - 1).
# The original compared against every earlier group (Groups < g), so an ID
# last seen before the window (e.g. ID 88, seen only in group 1) was wrongly
# dropped from group 54. With ID 88 added to the data, row (54, 88) is now
# returned as well.
map_df((baseline + 1):max(final_example_data$Groups), ~final_example_data %>%
  filter(Groups == .x, !ID %in% ID[Groups >= .x - baseline & Groups < .x]))
# Groups ID
#1 53 7
#2 53 0
#3 54 4
其中:
(baseline + 1):max(final_example_data$Groups) # the group numbers iterated over; returns
#[1] 53 54 55
如何在滚动的基础上有条件地filter/select相关观察?
组 1 到 52 是基线。
- 然后在第53组中,我想过滤掉所有出现在第1到52组中的ID
- 然后对于第54组,我想过滤掉所有出现在第2到53组的ID
- 然后对于第 55 组,我想过滤掉第 3 到 54 组中出现的所有 ID
- 等等等等。基本上数据集有组和一个 ID,我正在尝试 select 相关的 ID。
下面的代码手动创建了一个示例数据集,其中 final_example_data
是起始数据(输入),expected_output
是预期输出。
# Build the example input directly in base R, so the snippet runs without
# dplyr being attached. Each data frame lists the Groups in which one ID occurs
# (ascending, integer), which is the same result as filtering a 1:55 frame.

# Negated %in% helper (kept from the original; not used further below).
`%!in%` <- Negate(`%in%`)

# ID 1 occurs only inside the baseline (Groups 1 to 52).
example_data <- data.frame(
  Groups = c(1L, 4L, 7L, 10L, 11L, 15L, 44L, 52L),
  ID = 1
)

# ID 2 occurs inside the baseline and again in Groups 55.
example_data2 <- data.frame(
  Groups = c(1L, 3L, 5L, 7L, 8L, 11L, 15L, 33L, 41L, 44L, 55L),
  ID = 2
)

# ID 7 occurs only in Groups 53.
example_data3 <- data.frame(Groups = 53L, ID = 7)

# ID 4 occurs only in Groups 54.
example_data4 <- data.frame(Groups = 54L, ID = 4)

# ID 0 occurs in Groups 53, 54 and 55.
example_data5 <- data.frame(Groups = 53:55, ID = 0)

final_example_data <- rbind(
  example_data,
  example_data2,
  example_data3,
  example_data4,
  example_data5
)
# IDs 1 and 2 both occur somewhere in Groups 1 to 52, whereas IDs 7 and 0 do
# not, so 7 and 0 are the rows to keep for Groups 53.
no_present_in_1_52 <- final_example_data %>%
  filter(ID %in% c(7, 0), Groups <= 53)

# ID 4 is absent from Groups 2 to 53 but present in Groups 54.
not_present_in_Groups_2_53 <- final_example_data %>%
  filter(ID == 4)

# The Groups 55 rows: their IDs are visibly present in Groups 3 to 54, so they
# are left out and Groups 55 contributes nothing to the final output.
not_present_in_Groups3_to_54 <- final_example_data %>%
  filter(Groups > 54)

expected_output <- rbind(not_present_in_Groups_2_53, no_present_in_1_52)
编辑:
# Extra case: ID 88 occurs in Groups 1 and then again in Groups 54.
example_data6 <- data.frame(Groups = 1, ID = 88)
example_data7 <- data.frame(Groups = 54, ID = 88)
final_example_data <- rbind(final_example_data, example_data6, example_data7)
# Groups 54 paired with ID 88 is expected in the results, because ID 88 is
# absent from Groups 2 to 53.
为了清楚起见,我将 final_example_data
重命名为 fed
:
data.table
library(data.table)
# For every post-baseline group g, keep the rows whose ID does not occur in the
# 52 preceding groups, i.e. groups (g - 52) through (g - 1), both inclusive.
# The original used exclusive bounds on (g - 52, g), which dropped group g - 52
# from the window (e.g. group 1 was ignored when evaluating group 53).
setDT(fed)[
  i = Groups > 52,
  j = .SD[!ID %in% fed[between(Groups, .BY$Groups - 52, .BY$Groups - 1), ID]],
  by = Groups
]
Groups ID
1: 53 7
2: 53 0
3: 54 4
或者使用基础 R(base R):
- 确定超出基线的组值
# Distinct group values that lie beyond the 52-group baseline.
target_groups <- unique(with(fed, Groups[Groups > 52]))
- 遍历这些组,每次检查该组的 ID 是否出现在它之前的滚动窗口内各组的 ID 中;然后把得到的 data.frame 列表按行合并(rbind)
# For each target group x, keep the IDs of group x that do not occur in the
# 52 preceding groups (x - 52 through x - 1, inclusive), then row-bind the
# per-group data frames. Groups with no surviving ID contribute NULL, which
# rbind() drops.
do.call(rbind, lapply(target_groups, function(x) {
  # Lower bound is inclusive: the window for group 53 must contain group 1.
  in_window <- fed$Groups >= (x - 52) & fed$Groups < x
  id <- fed$ID[fed$Groups == x]
  id <- id[!id %in% fed$ID[in_window]]
  if (length(id) > 0) {
    data.frame(Group = x, ID = id)
  } else {
    NULL
  }
}))
输出:
Group ID
1 53 7
2 53 0
3 54 4
你可以试试这个tidyverse
方法-
library(dplyr)
library(purrr)
# Number of groups in the baseline, which is also the rolling window length.
baseline <- 52
# For every group g after the baseline, keep the rows of group g whose ID does
# not occur in the rolling window of groups (g - baseline) through (g - 1).
# The original compared against every earlier group (Groups < g), so an ID
# last seen before the window (e.g. ID 88, seen only in group 1) was wrongly
# dropped from group 54. With ID 88 added to the data, row (54, 88) is now
# returned as well.
map_df((baseline + 1):max(final_example_data$Groups), ~final_example_data %>%
  filter(Groups == .x, !ID %in% ID[Groups >= .x - baseline & Groups < .x]))
# Groups ID
#1 53 7
#2 53 0
#3 54 4
其中:
(baseline + 1):max(final_example_data$Groups) # the group numbers iterated over; returns
#[1] 53 54 55