如何在 dplyr 的 summarise() 中进行子集化
How to subset inside summarise dplyr
我想在 summarise() 内部进行子集化。像下面这样使用 subset() 是否可行?
# Example data: 14 intervals, each tagged with a category and an ID.
# Built with data.frame()/factor() rather than a raw structure() dump; the
# resulting object has the same columns, types, factor levels and row names.
df <- data.frame(
  category = factor(
    c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L),
    levels = 1:2,
    labels = c("category MB", "category LR")
  ),
  start = c(
    111, 222, 333, 444, 555,
    111, 222, 333, 444,
    111, 111, 222, 333, 444
  ),
  stop = c(
    666, 777, 888, 999, 1000,
    666, 777, 888, 999,
    666, 666, 777, 888, 999
  ),
  # Five rows belong to ID 101, the remaining nine to ID 102.
  ID = rep(c(101, 102), times = c(5L, 9L))
)
library(dplyr)
# Attempt to compute per-ID totals plus per-category counts and durations in a
# single summarise() call. The two duration*Category* lines are the ones being
# asked about and do not give the intended per-group result.
df %>%
group_by(ID) %>%
summarise(
# Number of rows in the current group.
countAll = n(),
# Total duration over all rows of the current group.
durationAll = sum(stop - start),
# TRUE counts as 1 in sum(), so this is the number of "category MB" rows.
countCategoryMB = sum(category == "category MB"),
# NOTE(review): `.` is the magrittr placeholder, i.e. the whole data frame piped
# into summarise(), not just the rows of the current group; subset(..., select =)
# also returns a data frame rather than a vector. Confirm against magrittr docs.
durationCategoryMB = sum( subset(., category == "category MB", select = stop) - subset(., category == "category MB", select = start) ), # line in question, currently wrong
# TRUE counts as 1 in sum(), so this is the number of "category LR" rows.
countCategoryLR = sum(category == "category LR"),
# Same problem as durationCategoryMB above: `.` is not restricted to the group.
durationCategoryLR = sum( subset(., category == "category LR", select = stop) - subset(., category == "category LR", select = start) ) # line in question, currently wrong
)
预期的结果(图片在帖子末尾)可以用 left_join() 实现,但我希望能像上面的代码那样,在一次 summarise() 调用中得到所需的输出。
# Expected result, built by joining three per-ID summaries with left_join().
# NOTE: an ID with no rows of a given category gets NA (not 0) in that
# category's columns, because left_join() finds no matching row for it.
df %>%
  group_by(ID) %>%
  summarise(
    countAll = n(),
    # Spelled durationAll (not durationALL) so the column name matches the
    # single-call summarise() snippets in this post.
    durationAll = sum(stop - start)
  ) %>%
  left_join(
    .,
    # Per-ID count and total duration of the "category MB" rows only.
    df %>%
      filter(category == "category MB") %>%
      group_by(ID) %>%
      summarise(
        countCategoryMB = n(),
        durationCategoryMB = sum(stop - start)
      ),
    by = "ID"
  ) %>%
  left_join(
    .,
    # Per-ID count and total duration of the "category LR" rows only.
    df %>%
      filter(category == "category LR") %>%
      group_by(ID) %>%
      summarise(
        countCategoryLR = n(),
        durationCategoryLR = sum(stop - start)
      ),
    by = "ID"
  )
感谢您的宝贵时间!
在下面的解决方案中,(category == "category MB") 为真时等于 1,否则为 0。将它与 start、stop 相乘后,不属于该类别的行的值就变成 0。因此,这实际上只对类别等于 "category MB"(或 "category LR")的那些行的 start 和 stop 值求和,符合要求。
# Single-call version: each logical comparison acts as a 0/1 mask, so
# multiplying by it zeroes out the rows of the other category before summing.
df %>%
  group_by(ID) %>%
  summarise(
    countAll = n(),
    durationAll = sum(stop - start),
    # TRUE counts as 1 in sum(), giving the number of matching rows.
    countCategoryMB = sum(category == "category MB"),
    durationCategoryMB = sum(
      (stop * (category == "category MB")) -
        (start * (category == "category MB"))
    ),
    countCategoryLR = sum(category == "category LR"),
    durationCategoryLR = sum(
      (stop * (category == "category LR")) -
        (start * (category == "category LR"))
    )
  )
我想在 summarise() 内部进行子集化。像下面这样使用 subset() 是否可行?
# Example data: 14 intervals, each tagged with a category and an ID.
# Built with data.frame()/factor() rather than a raw structure() dump; the
# resulting object has the same columns, types, factor levels and row names.
df <- data.frame(
  category = factor(
    c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L),
    levels = 1:2,
    labels = c("category MB", "category LR")
  ),
  start = c(
    111, 222, 333, 444, 555,
    111, 222, 333, 444,
    111, 111, 222, 333, 444
  ),
  stop = c(
    666, 777, 888, 999, 1000,
    666, 777, 888, 999,
    666, 666, 777, 888, 999
  ),
  # Five rows belong to ID 101, the remaining nine to ID 102.
  ID = rep(c(101, 102), times = c(5L, 9L))
)
library(dplyr)
# Attempt to compute per-ID totals plus per-category counts and durations in a
# single summarise() call. The two duration*Category* lines are the ones being
# asked about and do not give the intended per-group result.
df %>%
group_by(ID) %>%
summarise(
# Number of rows in the current group.
countAll = n(),
# Total duration over all rows of the current group.
durationAll = sum(stop - start),
# TRUE counts as 1 in sum(), so this is the number of "category MB" rows.
countCategoryMB = sum(category == "category MB"),
# NOTE(review): `.` is the magrittr placeholder, i.e. the whole data frame piped
# into summarise(), not just the rows of the current group; subset(..., select =)
# also returns a data frame rather than a vector. Confirm against magrittr docs.
durationCategoryMB = sum( subset(., category == "category MB", select = stop) - subset(., category == "category MB", select = start) ), # line in question, currently wrong
# TRUE counts as 1 in sum(), so this is the number of "category LR" rows.
countCategoryLR = sum(category == "category LR"),
# Same problem as durationCategoryMB above: `.` is not restricted to the group.
durationCategoryLR = sum( subset(., category == "category LR", select = stop) - subset(., category == "category LR", select = start) ) # line in question, currently wrong
)
预期的结果(图片在帖子末尾)可以用 left_join() 实现,但我希望能像上面的代码那样,在一次 summarise() 调用中得到所需的输出。
# Expected result, built by joining three per-ID summaries with left_join().
# NOTE: an ID with no rows of a given category gets NA (not 0) in that
# category's columns, because left_join() finds no matching row for it.
df %>%
  group_by(ID) %>%
  summarise(
    countAll = n(),
    # Spelled durationAll (not durationALL) so the column name matches the
    # single-call summarise() snippets in this post.
    durationAll = sum(stop - start)
  ) %>%
  left_join(
    .,
    # Per-ID count and total duration of the "category MB" rows only.
    df %>%
      filter(category == "category MB") %>%
      group_by(ID) %>%
      summarise(
        countCategoryMB = n(),
        durationCategoryMB = sum(stop - start)
      ),
    by = "ID"
  ) %>%
  left_join(
    .,
    # Per-ID count and total duration of the "category LR" rows only.
    df %>%
      filter(category == "category LR") %>%
      group_by(ID) %>%
      summarise(
        countCategoryLR = n(),
        durationCategoryLR = sum(stop - start)
      ),
    by = "ID"
  )
感谢您的宝贵时间!
在下面的解决方案中,(category == "category MB") 为真时等于 1,否则为 0。将它与 start、stop 相乘后,不属于该类别的行的值就变成 0。因此,这实际上只对类别等于 "category MB"(或 "category LR")的那些行的 start 和 stop 值求和,符合要求。
# Single-call version: each logical comparison acts as a 0/1 mask, so
# multiplying by it zeroes out the rows of the other category before summing.
df %>%
  group_by(ID) %>%
  summarise(
    countAll = n(),
    durationAll = sum(stop - start),
    # TRUE counts as 1 in sum(), giving the number of matching rows.
    countCategoryMB = sum(category == "category MB"),
    durationCategoryMB = sum(
      (stop * (category == "category MB")) -
        (start * (category == "category MB"))
    ),
    countCategoryLR = sum(category == "category LR"),
    durationCategoryLR = sum(
      (stop * (category == "category LR")) -
        (start * (category == "category LR"))
    )
  )