根据条件获取每组行的平均值 - R / Dplyr

Get average of rows per group based on a condition - R / Dplyr

我有一个data table如下图:

我需要在 Test_Date > Job_End_Date 的条件下,根据用户提供的 'count'(数据点个数)计算每个 Entity 的 Value 平均值。

例如,如果计数=2,则使用 Test_Date > Job_End_Date 之后的两个数据点来计算 average

我正在使用下面的表达式,但没有得到正确的结果:

# Asker's attempt: for each Entity, average the first `n` non-NA Values taken
# from the rows selected by the mask inside summarise().
# NOTE(review): `n` is not defined anywhere in this snippet; presumably it holds
# the user-supplied count -- confirm.
WT <- dt%>%
  # Order rows by entity, then chronologically by test date.
  arrange(Entity,Test_Date)%>%
  group_by(Entity) %>% 
  select(Entity,Job_End_Date,Value,Test_Date)%>%
  # jobenddate: first Job_End_Date of the group.
  # Ave: both timestamps are compared as dates via as.Date(). The mask
  # lag(cumsum(Job_End_Date <= Test_Date), default = 0) > 0 is TRUE only for
  # rows AFTER the first row whose Test_Date is on or after Job_End_Date, so
  # that first matching row is itself left out of the average.
  summarise(jobenddate = first(Job_End_Date),Ave = mean(head(na.omit(Value[lag(cumsum(as.Date(Job_End_Date) <= as.Date(Test_Date)),
                                                                                                                          default = 0) > 0]), n), na.rm = TRUE))%>%
  ungroup()

如果上面的例子中n=2,

平均 A = 15.02(自 Job_End_Date 以来只有一个值)

平均 B = (29.71+19.41)/2 = 24.56

dput(dt):

# dput() of the sample data: a 13-row tibble with a character Entity column
# ("A" x 7, "B" x 6), POSIXct Job_End_Date and Test_Date columns (tzone "UTC",
# stored as seconds since the epoch) and a numeric Value column.
# NOTE(review): every Test_Date for Entity A (largest is 1585094400) is earlier
# than A's Job_End_Date (1616198400), so no A row satisfies
# Test_Date > Job_End_Date in this sample; the expected average quoted for A
# presumably comes from different data -- confirm.
structure(list(Entity = c("A", "A", "A", "A", "A", "A", "A", 
"B", "B", "B", "B", "B", "B"), Job_End_Date = structure(c(1616198400, 
1616198400, 1616198400, 1616198400, 1616198400, 1616198400, 1616198400, 
1614988800, 1614988800, 1614988800, 1614988800, 1614988800, 1614988800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Test_Date = structure(c(1581379200, 
1582502400, 1582934400, 1583452800, 1584057600, 1584576000, 1585094400, 
1578528000, 1579910400, 1596499200, 1615334400, 1618272000, 1586476800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Value = c(13.8, 
14.3, 18.97, 14, 14.97, 14, 15.02, 31.16, 35.95, 19.41, 29.71, 
19.41, 29.96)), row.names = c(NA, -13L), class = c("tbl_df", 
"tbl", "data.frame"))

先用 arrange 排序并按 'Entity' 分组,再用 filter 保留 'Test_Date' 大于 'Job_End_Date' 的行,最后对每组 'Value' 的前 n 个值(用 head 取出,n 即 'count')求 mean:

library(dplyr)

# Number of readings after the job end date to average for each entity.
count <- 2

# Per-entity mean of the first `count` non-missing Values measured strictly
# after Job_End_Date, taken in chronological (Test_Date) order.
# Note: `.drop = FALSE` only retains empty groups for factor grouping columns,
# so with a character Entity any entity without qualifying rows is absent from
# the result.
dt %>%
  arrange(Entity, Test_Date, Job_End_Date) %>%
  group_by(Entity, .drop = FALSE) %>%
  filter(!is.na(Value), Test_Date > Job_End_Date) %>%
  # Missing Values were already removed by filter(), so mean() needs no na.rm.
  summarise(Avg = mean(head(Value, n = count)))

-输出

# A tibble: 2 x 2
#   Entity   Avg    
#   <chr>  <dbl>
#1 A       15.0
#2 B       24.6

另一种选择是直接在 summarise 中使用逻辑表达式对 'Value' 进行筛选:
# Alternative: keep every row and do the subsetting inside summarise(), so an
# entity with no qualifying readings still gets a row (its Avg is NaN).
# `count` is the number of readings to average, defined earlier in the script.
dt %>%
  arrange(Entity, Test_Date, Job_End_Date) %>%
  group_by(Entity) %>%
  summarise(
    # Drop missing Values BEFORE head(), so that NAs never occupy one of the
    # `count` slots; which() also ignores rows whose date comparison is NA.
    Avg = mean(head(
      Value[which(Test_Date > Job_End_Date & !is.na(Value))],
      count
    ))
  )