如何滚动过滤?

How do I filter on a rolling basis?

如何在滚动的基础上有条件地filter/select相关观察?

组 1 到 52 是基线。

下面的代码手动创建了一个示例数据集,其中 final_example_data 是起始数据(输入),expected_output 是预期输出。

 
# Reproducible example: one small data frame per ID, each restricted to the
# Groups in which that ID is observed, stacked into `final_example_data`.
# dplyr supplies %>% and filter(); in a fresh session the snippet fails
# without it because %>% is not defined.
library(dplyr)

# Negated %in%. Kept for convenience; it is not used in the construction below.
`%!in%` <- Negate(`%in%`)

# ID 1: observed only inside the baseline (Groups 1 to 52).
example_data <- data.frame(Groups = 1:55, ID = 1) %>%
  filter(Groups %in% c(1, 4, 7, 10, 11, 15, 44, 52))

# ID 2: observed inside the baseline and again in Group 55.
example_data2 <- data.frame(Groups = 1:55, ID = 2) %>%
  filter(Groups %in% c(1, 3, 5, 7, 8, 11, 15, 44, 33, 55, 41))

# ID 7: observed only in Group 53.
example_data3 <- data.frame(Groups = 1:55, ID = 7) %>%
  filter(Groups %in% c(53))

# ID 4: observed only in Group 54.
example_data4 <- data.frame(Groups = 1:55, ID = 4) %>%
  filter(Groups == 54)

# ID 0: observed in Groups 53, 54 and 55.
example_data5 <- data.frame(Groups = c(1:55), ID = 0) %>%
  filter(Groups %in% c(53, 54, 55))

# Starting data for the question: all IDs stacked row-wise.
final_example_data <- rbind(
  example_data,
  example_data2,
  example_data3,
  example_data4,
  example_data5
)
 
# Expected result, assembled by hand. An ID is reported for a group when it
# did not occur in the 52 groups before it.

# Group 53 (window: Groups 1 to 52). IDs 1 and 2 occur in the baseline;
# IDs 7 and 0 do not, so their Group 53 rows are kept.
no_present_in_1_52 <- final_example_data %>%
  filter(ID %in% c(7, 0), Groups <= 53)

# Group 54 (window: Groups 2 to 53). ID 4 is absent from the window but
# present in Group 54.
not_present_in_Groups_2_53 <- final_example_data %>%
  filter(ID == 4)

# Group 55 (window: Groups 3 to 54). The IDs seen in Group 55 also occur inside
# the window, so nothing from Group 55 goes into the final output. This object
# is not part of expected_output.
not_present_in_Groups3_to_54 <- final_example_data %>%
  filter(Groups > 54)

expected_output <- rbind(not_present_in_Groups_2_53, no_present_in_1_52)
 

编辑:

# Edit: add ID 88, observed in Group 1 and in Group 54.
example_data6 <- data.frame(Groups = 1, ID = 88)
example_data7 <- data.frame(Groups = 54, ID = 88)

final_example_data <- rbind(final_example_data, example_data6, example_data7)

# ID 88 is not present in Groups 2 to 53, so its Group 54 row is expected
# to appear in the results.

为了清楚起见,我将 final_example_data 重命名为 fed

data.table

library(data.table)

# Rolling filter: for every group past the 52-group baseline, keep the rows of
# that group whose ID does not occur in any of the 52 groups just before it.
# The window for group g is [g - 52, g - 1], e.g. Groups 1 to 52 for g = 53 and
# Groups 2 to 53 for g = 54. The lower bound is inclusive.
setDT(fed)[
  i = Groups > 52,
  j = .SD[!ID %in% fed[Groups >= .BY$Groups - 52 & Groups < .BY$Groups, ID]],
  by = Groups
]

   Groups ID
1:     53  7
2:     53  0
3:     54  4

或者使用 base R(基础 R)

  1. 确定超出基线的组值
# Distinct group values that lie beyond the 52-group baseline.
target_groups <- with(fed, unique(Groups[Groups > 52]))
  2. 遍历它们,每次检查该组的 ID 是否出现在编号小于该组(且位于滚动窗口内)的任何组的 ID 中;然后将得到的 data.frame 列表按行绑定(rbind)
# For each target group, keep the IDs that do not occur in any of the 52 groups
# just before it, then row-bind the per-group data frames.
do.call(rbind, lapply(target_groups, function(x) {
  id <- fed$ID[fed$Groups == x]
  # Window is [x - 52, x - 1], e.g. Groups 1 to 52 for x = 53. The lower bound
  # is inclusive so the full 52-group window is checked.
  in_window <- fed$Groups >= x - 52 & fed$Groups < x
  id <- id[!id %in% fed$ID[in_window]]
  # A group with no new IDs yields NULL, which rbind() drops.
  if (length(id) > 0) data.frame(Group = x, ID = id)
}))

输出:

  Group ID
1    53  7
2    53  0
3    54  4

你可以试试这个tidyverse方法-

library(dplyr)
library(purrr)

# Number of groups in the look-back window (Groups 1 to 52 form the baseline).
baseline <- 52

# For every group past the baseline, keep the rows of that group whose ID does
# not occur in any of the `baseline` groups just before it.
map_df(
  (baseline + 1):max(final_example_data$Groups),
  function(grp) {
    final_example_data %>%
      filter(
        # Only rows of the group being tested are candidates.
        Groups == grp,
        # Window is [grp - baseline, grp - 1], e.g. Groups 2 to 53 for grp = 54.
        # An ID seen only before the window (such as ID 88 in Group 1) still
        # counts as new.
        !ID %in% ID[Groups >= grp - baseline & Groups < grp]
      )
  }
)

# Result when final_example_data includes the ID 88 rows from the edit:
#  Groups ID
#1     53  7
#2     53  0
#3     54  4
#4     54 88

其中

# The groups that get tested: every group number after the baseline, up to the
# largest group in the data.
(baseline + 1):max(final_example_data$Groups) # returns
#[1] 53 54 55