计算按用户 ID 分组的多个变量之间的增量
Calculating the delta between multiple variables grouped by user ids
如何计算 "long" 数据框中按用户 ID 分组的多个变量之间的差值?
数据格式:
# Example data in "long" format: two rows per user id, one for product "A"
# and one for product "B". `purchased` flags which of the two products the
# user bought. Literal TRUE/FALSE is used instead of T/F, because T and F
# are ordinary variables that can be reassigned.
d1 <- data.frame(
  id = rep(c(1, 2, 3, 4, 5), each = 2),
  purchased = c(rep(c(TRUE, FALSE), 3), FALSE, TRUE, TRUE, FALSE),
  product = rep(c("A", "B"), 5),
  grade = c(1, 2, 1, 2, 2, 3, 7, 5, 1, 2),
  rate = c(10, 12, 10, 12, 12, 14, 22, 18, 10, 12),
  fee = rep(c(1, 2), 5)
)
这是我的迂回解决方案:
# Split the long table into one data frame per product.
dA <- d1 %>%
  filter(product == "A")
dB <- d1 %>%
  filter(product == "B")

# Pair each id's product-A row with its product-B row. Non-key columns that
# exist on both sides are disambiguated with the ".A" / ".B" suffixes.
d2 <- inner_join(dA, dB, by = "id", suffix = c(".A", ".B"))

# Label which product was purchased and compute the B - A deltas.
d3 <- d2 %>%
  mutate(
    # `purchased.A` is already logical, so it is used directly as the
    # condition instead of being compared against TRUE.
    purchased = if_else(purchased.A, "A", "B"),
    dGrade = grade.B - grade.A,
    dRate = rate.B - rate.A,
    dFee = fee.B - fee.A
  ) %>%
  # Keep the id plus the newly created columns (purchased through dFee).
  select(id, purchased:dFee)
所有这些看起来都非常低效和复杂。tidyr::spread
或其他 dplyr/tidyr 函数是否更适合这里?(我没能让其他方法奏效)...
我们可以用 gather/spread
做到这一点。先用 gather
将数据从 'wide' 重塑为 'long' 格式,按 'id'、'Var' 分组,根据逻辑列 'purchased' 取出对应的 'product',再计算 'product' 为 'B' 与 'A' 时 'Val' 的差值,最后用 spread
把数据从 'long' 转回 'wide' 格式。
library(dplyr)
library(tidyr)
# Compute the B - A difference of grade, rate and fee for every id.
# pivot_longer()/pivot_wider() are the current tidyr replacements for the
# superseded gather()/spread() pair and follow the same
# wide -> long -> wide steps.
d1 %>%
  # Stack the three measure columns into name/value pairs (Var, Val).
  pivot_longer(cols = grade:fee, names_to = "Var", values_to = "Val") %>%
  group_by(id, Var) %>%
  summarise(
    # Product whose `purchased` flag is TRUE within this id.
    purchased = product[purchased],
    # Difference between the product-B value and the product-A value.
    Val = Val[product == "B"] - Val[product == "A"],
    # Return an ungrouped result instead of leaving it grouped by id.
    .groups = "drop"
  ) %>%
  # Back to one row per id with one column per measure.
  pivot_wider(names_from = Var, values_from = Val)
# id purchased fee grade rate
# <dbl> <fctr> <dbl> <dbl> <dbl>
#1 1 A 1 1 2
#2 2 A 1 1 2
#3 3 A 1 1 2
#4 4 B 1 -2 -4
#5 5 A 1 1 2
OP 的输出 ('d3') 是
d3
# id purchased dGrade dRate dFee
#1 1 A 1 2 1
#2 2 A 1 2 1
#3 3 A 1 2 1
#4 4 B -2 -4 1
#5 5 A 1 2 1
如何计算 "long" 数据框中按用户 ID 分组的多个变量之间的差值?
数据格式:
# Example data in "long" format: two rows per user id, one for product "A"
# and one for product "B". `purchased` flags which of the two products the
# user bought. Literal TRUE/FALSE is used instead of T/F, because T and F
# are ordinary variables that can be reassigned.
d1 <- data.frame(
  id = rep(c(1, 2, 3, 4, 5), each = 2),
  purchased = c(rep(c(TRUE, FALSE), 3), FALSE, TRUE, TRUE, FALSE),
  product = rep(c("A", "B"), 5),
  grade = c(1, 2, 1, 2, 2, 3, 7, 5, 1, 2),
  rate = c(10, 12, 10, 12, 12, 14, 22, 18, 10, 12),
  fee = rep(c(1, 2), 5)
)
这是我的迂回解决方案:
# Split the long table into one data frame per product.
dA <- d1 %>%
  filter(product == "A")
dB <- d1 %>%
  filter(product == "B")

# Pair each id's product-A row with its product-B row. Non-key columns that
# exist on both sides are disambiguated with the ".A" / ".B" suffixes.
d2 <- inner_join(dA, dB, by = "id", suffix = c(".A", ".B"))

# Label which product was purchased and compute the B - A deltas.
d3 <- d2 %>%
  mutate(
    # `purchased.A` is already logical, so it is used directly as the
    # condition instead of being compared against TRUE.
    purchased = if_else(purchased.A, "A", "B"),
    dGrade = grade.B - grade.A,
    dRate = rate.B - rate.A,
    dFee = fee.B - fee.A
  ) %>%
  # Keep the id plus the newly created columns (purchased through dFee).
  select(id, purchased:dFee)
所有这些看起来都非常低效和复杂。tidyr::spread
或其他 dplyr/tidyr 函数是否更适合这里?(我没能让其他方法奏效)...
我们可以用 gather/spread
做到这一点。先用 gather
将数据从 'wide' 重塑为 'long' 格式,按 'id'、'Var' 分组,根据逻辑列 'purchased' 取出对应的 'product',再计算 'product' 为 'B' 与 'A' 时 'Val' 的差值,最后用 spread
把数据从 'long' 转回 'wide' 格式。
library(dplyr)
library(tidyr)
# Compute the B - A difference of grade, rate and fee for every id.
# pivot_longer()/pivot_wider() are the current tidyr replacements for the
# superseded gather()/spread() pair and follow the same
# wide -> long -> wide steps.
d1 %>%
  # Stack the three measure columns into name/value pairs (Var, Val).
  pivot_longer(cols = grade:fee, names_to = "Var", values_to = "Val") %>%
  group_by(id, Var) %>%
  summarise(
    # Product whose `purchased` flag is TRUE within this id.
    purchased = product[purchased],
    # Difference between the product-B value and the product-A value.
    Val = Val[product == "B"] - Val[product == "A"],
    # Return an ungrouped result instead of leaving it grouped by id.
    .groups = "drop"
  ) %>%
  # Back to one row per id with one column per measure.
  pivot_wider(names_from = Var, values_from = Val)
# id purchased fee grade rate
# <dbl> <fctr> <dbl> <dbl> <dbl>
#1 1 A 1 1 2
#2 2 A 1 1 2
#3 3 A 1 1 2
#4 4 B 1 -2 -4
#5 5 A 1 1 2
OP 的输出 ('d3') 是
d3
# id purchased dGrade dRate dFee
#1 1 A 1 2 1
#2 2 A 1 2 1
#3 3 A 1 2 1
#4 4 B -2 -4 1
#5 5 A 1 2 1