dplyr 根据订单条件和 if 语句进行汇总
dplyr summarise based on order condition with if statement
按组 (group_by(id)
),我试图根据 types
的选择对变量求和。但是,这些 types
有一个优先顺序。示例:
library(tidyverse)
# Example data: several rows per id; `types` is the category that the
# preference lists refer to, `x` is the value to be summed.
df <- data.frame(
  id = rep(c(1, 2, 3, 4, 5), times = c(6, 3, 4, 1, 1)),
  types = c(
    "1a", "1a", "2a", "3b", "4c", "7d", # id 1
    "4c", "7d", "7d",                   # id 2
    "4c", "5d", "6d", "6d",             # id 3
    "5d",                               # id 4
    "7d"                                # id 5
  ),
  x = c(
    10, 15, 20, 15, 30, 40,
    10, 10, 15,
    10, 10, 10, 10,
    10,
    10
  ),
  y = seq_len(15),
  z = seq_len(15)
)
df
# id types x y z
# 1 1 1a 10 1 1
# 2 1 1a 15 2 2
# 3 1 2a 20 3 3
# 4 1 3b 15 4 4
# 5 1 4c 30 5 5
# 6 1 7d 40 6 6
# 7 2 4c 10 7 7
# 8 2 7d 10 8 8
# 9 2 7d 15 9 9
# 10 3 4c 10 10 10
# 11 3 5d 10 11 11
# 12 3 6d 10 12 12
# 13 3 6d 10 13 13
# 14 4 5d 10 14 14
# 15 5 7d 10 15 15
我想 sum(x)
根据 types
偏好按此顺序:
# Type groups in descending priority: for each id, only the rows of the
# highest-priority group present in that id are meant to be summed.
preference_1st <- c("1a", "2a", "3b")
preference_2nd <- c("7d")
preference_3rd <- c("4c", "5d", "6d")
所以这意味着如果 id
包含 preference_1st
中的任何类型,我们只对这些类型求和并忽略其他类型;如果不包含任何 preference_1st
中的类型,则对所有属于 preference_2nd
的行求和并忽略其余部分。最后,如果只剩下属于 preference_3rd
的 types
,我们就对它们求和。所以对于 id=1
,我们要忽略类型 4c
和 7d
。 (我还希望对其他变量进行更直接的计算,本例中为 z
和 y
)。
期望输出:
desired
id sumtest ymean zmean
1 1 60 3.5 3.5
2 2 25 8.0 8.0
3 3 40 11.5 11.5
4 4 10 14.0 14.0
5 5 10 15.0 15.0
我认为一个可能的选择是使用 mutate
和 case_when
来创建某种顺序变量,但我认为使用 if
语句应该有更好的选择?以下尝试已经很接近,但没有正确区分各个优先级:
# First attempt, kept as posted: a branch is chosen from the types present in
# the group, but every branch calls sum(x) on the whole group, so the chosen
# branch never changes the result (see the printed output below).
df %>%
group_by(id) %>%
summarise(sumtest = if (any(types %in% preference_1st)) {
sum(x)  # sums every row of the id, not only the preference_1st rows
} else if (any(!types %in% preference_1st) & any(types %in% preference_2nd)) {
sum(x)  # same unrestricted sum
} else {
sum(x)  # same unrestricted sum
},
ymean = mean(y),
zmean = mean(z))
# id sumtest ymean zmean
# <dbl> <dbl> <dbl> <dbl>
# 1 1 130 3.5 3.5
# 2 2 35 8 8
# 3 3 40 11.5 11.5
# 4 4 10 14 14
# 5 5 10 15 15
我也乐于接受其他方法。有什么建议吗?
谢谢
这是一个 dplyr 解决方案:
# Rank every row by the preference list its type belongs to (1 = highest
# priority). Per id, sum x over the rows that hold the smallest rank found in
# that id; y and z are averaged over all rows of the id.
# Note: a type that appears in none of the lists gets rank 0 and would
# therefore be treated as the top priority.
df %>%
  mutate(
    pref = 1 * (types %in% preference_1st) +
      2 * (types %in% preference_2nd) +
      3 * (types %in% preference_3rd)
  ) %>%
  group_by(id) %>%
  summarise(
    sumtest = sum(x[pref == min(pref)]),
    ymean = mean(y),
    zmean = mean(z)
  )
#> # A tibble: 5 x 4
#> id sumtest ymean zmean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 60 3.5 3.5
#> 2 2 25 8 8
#> 3 3 40 11.5 11.5
#> 4 4 10 14 14
#> 5 5 10 15 15
使用reduce
和anti_join
迭代过滤数据。
# Preference groups, highest priority first.
pref <- list(c("1a", "2a", "3b"), c("7d"), c("4c", "5d", "6d"))
pref %>%
  # One data frame per preference group, holding only that group's rows.
  map(~ df %>% filter(types %in% .x)) %>%
  # Fold left to right: keep what has been accumulated (.x) and add the rows
  # of the next group (.y) only for ids not yet in the accumulated result.
  reduce(~ anti_join(.y, .x, by = "id") %>% bind_rows(.x, .)) %>%
  group_by(id) %>%
  summarise(sumtest = sum(x)) %>%
  # The means come from the unfiltered data. The join key is spelled out so
  # the join does not rely on (or report) an automatically guessed key.
  left_join(
    df %>% group_by(id) %>% summarise(ymean = mean(y), zmean = mean(z)),
    by = "id"
  )
# # A tibble: 5 x 4
# id sumtest ymean zmean
# <dbl> <dbl> <dbl> <dbl>
# 1 1 60 3.5 3.5
# 2 2 25 8 8
# 3 3 40 11.5 11.5
# 4 4 10 14 14
# 5 5 10 15 15
虽然我更喜欢上述解决方案,但我最初使用 if
语句的尝试其实只是忘记了对 sum(x)
取子集:
# Per id, sum x over the rows of the highest-priority preference group that
# is present in the id; y and z are averaged over all rows of the id.
df %>%
  group_by(id) %>%
  summarise(
    sumtest = if (any(types %in% preference_1st)) {
      sum(x[types %in% preference_1st])
    } else if (any(types %in% preference_2nd)) {
      # Reaching this branch already means no type is in preference_1st, so
      # that check is not repeated and the scalar condition needs no `&`.
      sum(x[types %in% preference_2nd])
    } else {
      sum(x[types %in% preference_3rd])
    },
    ymean = mean(y),
    zmean = mean(z)
  )
# id sumtest ymean zmean
# <dbl> <dbl> <dbl> <dbl>
# 1 1 60 3.5 3.5
# 2 2 25 8 8
# 3 3 40 11.5 11.5
# 4 4 10 14 14
# 5 5 10 15 15
按组 (group_by(id)
),我试图根据 types
的选择对变量求和。但是,这些 types
有一个优先顺序。示例:
library(tidyverse)
# Example data: several rows per id; `types` is the category that the
# preference lists refer to, `x` is the value to be summed.
df <- data.frame(
  id = rep(c(1, 2, 3, 4, 5), times = c(6, 3, 4, 1, 1)),
  types = c(
    "1a", "1a", "2a", "3b", "4c", "7d", # id 1
    "4c", "7d", "7d",                   # id 2
    "4c", "5d", "6d", "6d",             # id 3
    "5d",                               # id 4
    "7d"                                # id 5
  ),
  x = c(
    10, 15, 20, 15, 30, 40,
    10, 10, 15,
    10, 10, 10, 10,
    10,
    10
  ),
  y = seq_len(15),
  z = seq_len(15)
)
df
# id types x y z
# 1 1 1a 10 1 1
# 2 1 1a 15 2 2
# 3 1 2a 20 3 3
# 4 1 3b 15 4 4
# 5 1 4c 30 5 5
# 6 1 7d 40 6 6
# 7 2 4c 10 7 7
# 8 2 7d 10 8 8
# 9 2 7d 15 9 9
# 10 3 4c 10 10 10
# 11 3 5d 10 11 11
# 12 3 6d 10 12 12
# 13 3 6d 10 13 13
# 14 4 5d 10 14 14
# 15 5 7d 10 15 15
我想 sum(x)
根据 types
偏好按此顺序:
# Type groups in descending priority: for each id, only the rows of the
# highest-priority group present in that id are meant to be summed.
preference_1st <- c("1a", "2a", "3b")
preference_2nd <- c("7d")
preference_3rd <- c("4c", "5d", "6d")
所以这意味着如果 id
包含 preference_1st
中的任何类型,我们只对这些类型求和并忽略其他类型;如果不包含任何 preference_1st
中的类型,则对所有属于 preference_2nd
的行求和并忽略其余部分。最后,如果只剩下属于 preference_3rd
的 types
,我们就对它们求和。所以对于 id=1
,我们要忽略类型 4c
和 7d
。 (我还希望对其他变量进行更直接的计算,本例中为 z
和 y
)。
期望输出:
desired
id sumtest ymean zmean
1 1 60 3.5 3.5
2 2 25 8.0 8.0
3 3 40 11.5 11.5
4 4 10 14.0 14.0
5 5 10 15.0 15.0
我认为一个可能的选择是使用 mutate
和 case_when
来创建某种顺序变量,但我认为使用 if
语句应该有更好的选择?以下尝试已经很接近,但没有正确区分各个优先级:
# First attempt, kept as posted: a branch is chosen from the types present in
# the group, but every branch calls sum(x) on the whole group, so the chosen
# branch never changes the result (see the printed output below).
df %>%
group_by(id) %>%
summarise(sumtest = if (any(types %in% preference_1st)) {
sum(x)  # sums every row of the id, not only the preference_1st rows
} else if (any(!types %in% preference_1st) & any(types %in% preference_2nd)) {
sum(x)  # same unrestricted sum
} else {
sum(x)  # same unrestricted sum
},
ymean = mean(y),
zmean = mean(z))
# id sumtest ymean zmean
# <dbl> <dbl> <dbl> <dbl>
# 1 1 130 3.5 3.5
# 2 2 35 8 8
# 3 3 40 11.5 11.5
# 4 4 10 14 14
# 5 5 10 15 15
我也乐于接受其他方法。有什么建议吗?
谢谢
这是一个 dplyr 解决方案:
# Rank every row by the preference list its type belongs to (1 = highest
# priority). Per id, sum x over the rows that hold the smallest rank found in
# that id; y and z are averaged over all rows of the id.
# Note: a type that appears in none of the lists gets rank 0 and would
# therefore be treated as the top priority.
df %>%
  mutate(
    pref = 1 * (types %in% preference_1st) +
      2 * (types %in% preference_2nd) +
      3 * (types %in% preference_3rd)
  ) %>%
  group_by(id) %>%
  summarise(
    sumtest = sum(x[pref == min(pref)]),
    ymean = mean(y),
    zmean = mean(z)
  )
#> # A tibble: 5 x 4
#> id sumtest ymean zmean
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 60 3.5 3.5
#> 2 2 25 8 8
#> 3 3 40 11.5 11.5
#> 4 4 10 14 14
#> 5 5 10 15 15
使用reduce
和anti_join
迭代过滤数据。
# Preference groups, highest priority first.
pref <- list(c("1a", "2a", "3b"), c("7d"), c("4c", "5d", "6d"))
pref %>%
  # One data frame per preference group, holding only that group's rows.
  map(~ df %>% filter(types %in% .x)) %>%
  # Fold left to right: keep what has been accumulated (.x) and add the rows
  # of the next group (.y) only for ids not yet in the accumulated result.
  reduce(~ anti_join(.y, .x, by = "id") %>% bind_rows(.x, .)) %>%
  group_by(id) %>%
  summarise(sumtest = sum(x)) %>%
  # The means come from the unfiltered data. The join key is spelled out so
  # the join does not rely on (or report) an automatically guessed key.
  left_join(
    df %>% group_by(id) %>% summarise(ymean = mean(y), zmean = mean(z)),
    by = "id"
  )
# # A tibble: 5 x 4
# id sumtest ymean zmean
# <dbl> <dbl> <dbl> <dbl>
# 1 1 60 3.5 3.5
# 2 2 25 8 8
# 3 3 40 11.5 11.5
# 4 4 10 14 14
# 5 5 10 15 15
虽然我更喜欢上述解决方案,但我最初使用 if
语句的尝试其实只是忘记了对 sum(x)
取子集:
# Per id, sum x over the rows of the highest-priority preference group that
# is present in the id; y and z are averaged over all rows of the id.
df %>%
  group_by(id) %>%
  summarise(
    sumtest = if (any(types %in% preference_1st)) {
      sum(x[types %in% preference_1st])
    } else if (any(types %in% preference_2nd)) {
      # Reaching this branch already means no type is in preference_1st, so
      # that check is not repeated and the scalar condition needs no `&`.
      sum(x[types %in% preference_2nd])
    } else {
      sum(x[types %in% preference_3rd])
    },
    ymean = mean(y),
    zmean = mean(z)
  )
# id sumtest ymean zmean
# <dbl> <dbl> <dbl> <dbl>
# 1 1 60 3.5 3.5
# 2 2 25 8 8
# 3 3 40 11.5 11.5
# 4 4 10 14 14
# 5 5 10 15 15