根据条件获取每组行的平均值 - R / Dplyr
Get average of rows per group based on a condition - R / Dplyr
我有一个data table
如下图:
我需要在 Test_Date > Job_End_Date 时,根据用户提供的 'count',计算每个 'Entity' 的 'Value' 平均值。
例如,如果 count = 2,则使用 Test_Date > Job_End_Date 之后的前两个数据点来计算平均值(average)。
我正在使用下面的表达式,但没有得到正确的结果:
# Asker's attempt, quoted in the post as NOT producing the expected result.
# NOTE(review): `n` is not defined in this snippet; presumably it is the
# user-supplied count -- confirm.
WT <- dt%>%
# Sort rows by Entity, then by Test_Date.
arrange(Entity,Test_Date)%>%
group_by(Entity) %>%
select(Entity,Job_End_Date,Value,Test_Date)%>%
# Per Entity: keep the first Job_End_Date and average at most `n` non-NA Values.
# cumsum() counts the rows seen so far with Job_End_Date <= Test_Date (compared
# as Dates); lag(..., default = 0) shifts that count down by one row, so the
# first row meeting the condition is itself excluded from the subset.
summarise(jobenddate = first(Job_End_Date),Ave = mean(head(na.omit(Value[lag(cumsum(as.Date(Job_End_Date) <= as.Date(Test_Date)),
default = 0) > 0]), n), na.rm = TRUE))%>%
ungroup()
如果上面的例子中n=2,
平均 A = 15.02(自 Job_End_Date 以来只有一个值)
平均 B = (29.71+19.41)/2 = 24.56
dput(dt):
# dput() representation of the sample tibble: 13 rows (Entity "A" x 7,
# "B" x 6) with POSIXct (UTC) columns Job_End_Date and Test_Date and a
# numeric Value column.
structure(list(Entity = c("A", "A", "A", "A", "A", "A", "A", 
"B", "B", "B", "B", "B", "B"), Job_End_Date = structure(c(1616198400, 
1616198400, 1616198400, 1616198400, 1616198400, 1616198400, 1616198400, 
1614988800, 1614988800, 1614988800, 1614988800, 1614988800, 1614988800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Test_Date = structure(c(1581379200, 
1582502400, 1582934400, 1583452800, 1584057600, 1584576000, 1585094400, 
1578528000, 1579910400, 1596499200, 1615334400, 1618272000, 1586476800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Value = c(13.8, 
14.3, 18.97, 14, 14.97, 14, 15.02, 31.16, 35.95, 19.41, 29.71, 
19.41, 29.96)), row.names = c(NA, -13L), class = c("tbl_df", 
"tbl", "data.frame"))
先用 arrange 排序,再用 filter 筛选出 'Test_Date' 大于 'Job_End_Date' 的行,然后按 'Entity' 分组,对 'Value' 的 head(其中 n 指定为 'count')求 mean:
library(dplyr)

# Number of post-job tests to average per Entity (supplied by the user).
count <- 2

# Keep only tests dated after the job end that have a non-missing Value,
# order them chronologically within each Entity, then average the first
# `count` Values of every Entity.
dt %>%
  filter(Test_Date > Job_End_Date, !is.na(Value)) %>%
  arrange(Entity, Test_Date, Job_End_Date) %>%
  group_by(Entity, .drop = FALSE) %>%
  summarise(Avg = mean(head(Value, n = count), na.rm = TRUE))
-输出
# A tibble: 2 x 2
# Entity Avg
# <chr> <dbl>
#1 A 15.0
#2 B 24.6
或者,另一种选择是在 summarise 中使用逻辑表达式进行筛选:
# Alternative without a separate filter() step: subset Value inside
# summarise(). Rows are sorted first so head() takes the earliest qualifying
# tests of each Entity.
dt %>%
  arrange(Entity, Test_Date, Job_End_Date) %>%
  group_by(Entity) %>%
  summarise(
    # Missing Values are excluded BEFORE head(), so the average uses the first
    # `count` non-missing values instead of silently shrinking the sample.
    # which() also drops rows where the date comparison is NA, which a bare
    # logical index would turn into NA entries.
    Avg = mean(
      head(Value[which(Test_Date > Job_End_Date & !is.na(Value))], count),
      na.rm = TRUE
    )
  )
我有一个data table
如下图:
我需要在 Test_Date > Job_End_Date 时,根据用户提供的 'count',计算每个 'Entity' 的 'Value' 平均值。
例如,如果 count = 2,则使用 Test_Date > Job_End_Date 之后的前两个数据点来计算平均值(average)。
我正在使用下面的表达式,但没有得到正确的结果:
# Asker's attempt, quoted in the post as NOT producing the expected result.
# NOTE(review): `n` is not defined in this snippet; presumably it is the
# user-supplied count -- confirm.
WT <- dt%>%
# Sort rows by Entity, then by Test_Date.
arrange(Entity,Test_Date)%>%
group_by(Entity) %>%
select(Entity,Job_End_Date,Value,Test_Date)%>%
# Per Entity: keep the first Job_End_Date and average at most `n` non-NA Values.
# cumsum() counts the rows seen so far with Job_End_Date <= Test_Date (compared
# as Dates); lag(..., default = 0) shifts that count down by one row, so the
# first row meeting the condition is itself excluded from the subset.
summarise(jobenddate = first(Job_End_Date),Ave = mean(head(na.omit(Value[lag(cumsum(as.Date(Job_End_Date) <= as.Date(Test_Date)),
default = 0) > 0]), n), na.rm = TRUE))%>%
ungroup()
如果上面的例子中n=2,
平均 A = 15.02(自 Job_End_Date 以来只有一个值)
平均 B = (29.71+19.41)/2 = 24.56
dput(dt):
# dput() representation of the sample tibble: 13 rows (Entity "A" x 7,
# "B" x 6) with POSIXct (UTC) columns Job_End_Date and Test_Date and a
# numeric Value column.
structure(list(Entity = c("A", "A", "A", "A", "A", "A", "A", 
"B", "B", "B", "B", "B", "B"), Job_End_Date = structure(c(1616198400, 
1616198400, 1616198400, 1616198400, 1616198400, 1616198400, 1616198400, 
1614988800, 1614988800, 1614988800, 1614988800, 1614988800, 1614988800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Test_Date = structure(c(1581379200, 
1582502400, 1582934400, 1583452800, 1584057600, 1584576000, 1585094400, 
1578528000, 1579910400, 1596499200, 1615334400, 1618272000, 1586476800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Value = c(13.8, 
14.3, 18.97, 14, 14.97, 14, 15.02, 31.16, 35.95, 19.41, 29.71, 
19.41, 29.96)), row.names = c(NA, -13L), class = c("tbl_df", 
"tbl", "data.frame"))
先用 arrange 排序,再用 filter 筛选出 'Test_Date' 大于 'Job_End_Date' 的行,然后按 'Entity' 分组,对 'Value' 的 head(其中 n 指定为 'count')求 mean:
library(dplyr)

# Number of post-job tests to average per Entity (supplied by the user).
count <- 2

# Keep only tests dated after the job end that have a non-missing Value,
# order them chronologically within each Entity, then average the first
# `count` Values of every Entity.
dt %>%
  filter(Test_Date > Job_End_Date, !is.na(Value)) %>%
  arrange(Entity, Test_Date, Job_End_Date) %>%
  group_by(Entity, .drop = FALSE) %>%
  summarise(Avg = mean(head(Value, n = count), na.rm = TRUE))
-输出
# A tibble: 2 x 2
# Entity Avg
# <chr> <dbl>
#1 A 15.0
#2 B 24.6
或者,另一种选择是在 summarise 中使用逻辑表达式进行筛选:
# Alternative without a separate filter() step: subset Value inside
# summarise(). Rows are sorted first so head() takes the earliest qualifying
# tests of each Entity.
dt %>%
  arrange(Entity, Test_Date, Job_End_Date) %>%
  group_by(Entity) %>%
  summarise(
    # Missing Values are excluded BEFORE head(), so the average uses the first
    # `count` non-missing values instead of silently shrinking the sample.
    # which() also drops rows where the date comparison is NA, which a bare
    # logical index would turn into NA entries.
    Avg = mean(
      head(Value[which(Test_Date > Job_End_Date & !is.na(Value))], count),
      na.rm = TRUE
    )
  )