如何在R中分别计算组的中位数

Question

数据的小例子

    # Sample data in dput() form: a data.frame with 78 rows and the columns
    # Dt, ItemRelation, stuff, num, year and action.
    # Dt is a factor with 39 daily timestamp levels (2018-02-20 to 2018-03-30);
    # the same 39 days appear once for ItemRelation 158043 and once for 234.
    df=structure(list(Dt = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 
22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 
35L, 36L, 37L, 38L, 39L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 
36L, 37L, 38L, 39L), .Label = c("2018-02-20 00:00:00.000", "2018-02-21 00:00:00.000", 
"2018-02-22 00:00:00.000", "2018-02-23 00:00:00.000", "2018-02-24 00:00:00.000", 
"2018-02-25 00:00:00.000", "2018-02-26 00:00:00.000", "2018-02-27 00:00:00.000", 
"2018-02-28 00:00:00.000", "2018-03-01 00:00:00.000", "2018-03-02 00:00:00.000", 
"2018-03-03 00:00:00.000", "2018-03-04 00:00:00.000", "2018-03-05 00:00:00.000", 
"2018-03-06 00:00:00.000", "2018-03-07 00:00:00.000", "2018-03-08 00:00:00.000", 
"2018-03-09 00:00:00.000", "2018-03-10 00:00:00.000", "2018-03-11 00:00:00.000", 
"2018-03-12 00:00:00.000", "2018-03-13 00:00:00.000", "2018-03-14 00:00:00.000", 
"2018-03-15 00:00:00.000", "2018-03-16 00:00:00.000", "2018-03-17 00:00:00.000", 
"2018-03-18 00:00:00.000", "2018-03-19 00:00:00.000", "2018-03-20 00:00:00.000", 
"2018-03-21 00:00:00.000", "2018-03-22 00:00:00.000", "2018-03-23 00:00:00.000", 
"2018-03-24 00:00:00.000", "2018-03-25 00:00:00.000", "2018-03-26 00:00:00.000", 
"2018-03-27 00:00:00.000", "2018-03-28 00:00:00.000", "2018-03-29 00:00:00.000", 
"2018-03-30 00:00:00.000"), class = "factor"), ItemRelation = c(158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 234L, 234L, 234L, 234L, 234L, 234L, 
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L
), stuff = c(200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3600L, 
0L, 0L, 0L, 0L, 700L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1000L, 2600L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 400L, 700L, 
200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3600L, 0L, 0L, 0L, 
0L, 700L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1000L, 
2600L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 400L, 700L), num = c(1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L), year = c(2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L), action = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 1L, 1L, 1L, 1L)), .Names = c("Dt", "ItemRelation", 
"stuff", "num", "year", "action"), class = "data.frame", row.names = c(NA, 
-78L))

现在我必须按 ItemRelation + num + year 的每个分组分别计算中位数。如果我使用下面这个解决方案：

# Non-zero `stuff` values observed while action == 0.
# which() drops NA comparisons, so rows with a missing action/stuff are skipped.
v <- df$stuff[which(df$action == 0 & df$stuff > 0)]

# Non-zero `stuff` values observed while action == 1.
w <- df$stuff[which(df$action == 1 & df$stuff > 0)]


# Median of the last 5 qualifying action-0 observations.
# tail() simply returns all of `v` when it holds fewer than five values;
# indexing with v[(l - 4):l] mixes negative and positive subscripts and
# fails when only one to three values qualify.
l <- length(v)  # number of qualifying action-0 observations
m0 <- median(tail(v, 5))
# Final difference: median for action 1 minus median for action 0.
m <- median(w) - m0

这样会把所有组合在一起、一次性计算中位数，但我需要对每个组分别计算。我该怎么做？

此处预期输出

ItemRelation    num year    value
158043       1459   2018    45
234          1459   2018    67

帖子已编辑。请注意，上面的数值并不是真实结果，实际的中位数会是其他数值，我只是想展示我期望的输出格式。

编辑

动作（action）列只有 0 和 1 两个值。我必须先计算 1 类动作的中位数，再计算 0 类动作的中位数；后者只使用 1 类动作之前的最后五个整数值（即大于 0 的值）。也就是说，不是对 0 类的所有值求中位数，而是只取 0 类动作中最后 5 个观察值，并且只取整数值。在我们的例子中是 200、3600、700、1000、2600。

然后用 1 类别的中位数减去 0 类别的中位数。

0 类动作的观察次数可以从 0 到 10 不等。如果 0 类别有 10 个整数值，我们取最后五个。如果只有 1、2、3、4 或 5 个整数值，就减去这些实际存在的整数值的中位数。如果只有 0 而没有任何整数值，就减去 0。

不过，代码在计算 0 类别的中位数时，必须使用位于 1 类别之前的最后 5 个观测值。

请注意，动作（action）为 0 的类别中可能包含 0 以外的其他值。

Answer 1

最简单的方法是使用 dplyr 包中的 group_by 和 summarize：

library(dplyr)

# Median of every `stuff` value per ItemRelation/num/year group.
# as.numeric() keeps `med` a double in every group: median() of an integer
# vector is integer for an odd count and double for an even count.
medians <- df %>%
  group_by(ItemRelation, num, year) %>%
  summarize(med = as.numeric(median(stuff, na.rm = TRUE))) %>%
  ungroup()

# Median of the non-zero `stuff` values per group, computed separately for
# action 0 and action 1 (this replaces the overall medians above).
# Keeping `action` as a grouping key gives up to two medians per group, which
# is what the subtraction below needs.
medians <- df %>%
  filter(stuff > 0) %>%
  group_by(ItemRelation, num, year, action) %>%
  summarize(med = as.numeric(median(stuff, na.rm = TRUE))) %>%
  ungroup()


# Difference between the first and the second element of `x`
# (NA when `x` has fewer than two elements).
subtract <- function(x) {
  x[1] - x[2]
}

# Per ItemRelation/num/year group: median for action 1 minus median for
# action 0. Rows are ordered with action 1 first so that subtract() sees
# c(median_action_1, median_action_0). A group that lacks one of the two
# actions gets NA.
# Note: the action-0 median here uses all non-zero action-0 rows, not only
# the last five.
median_diffs <- medians %>%
  arrange(desc(action)) %>%
  group_by(ItemRelation, num, year) %>%
  mutate(med_diff = subtract(med)) %>%
  ungroup()

Answer 2

可以使用 dplyr 按以下步骤实现。具体做法请参见下面代码中的注释。

注：看来OP的样例数据意义不大。

library(dplyr)

# For every ItemRelation/num/year group compute
#   value = median(non-zero stuff with action == 1)
#         - median(last five non-zero stuff with action == 0)
# and return exactly one row per group.
# Rows are assumed to already be in chronological order within each group,
# because tail() picks the "last five" by row position.
df %>%
  filter(stuff > 0) %>%  # zeros never take part in either median
  group_by(ItemRelation, num, year) %>%
  summarise(
    # as.numeric() keeps the column type identical across groups
    # (median() of integers is integer for odd counts, double for even ones).
    m = as.numeric(median(stuff[action == 1])),
    # No non-zero action-0 rows in the group: subtract 0, as the question asks.
    m0 = if (any(action == 0)) {
      as.numeric(median(tail(stuff[action == 0], 5)))
    } else {
      0
    }
  ) %>%
  ungroup() %>%
  mutate(value = m - m0) %>%
  select(-m, -m0)

# # A tibble: 2 x 4
#   ItemRelation   num  year value
#          <int> <int> <int> <dbl>
# 1          234  1459  2018  -450
# 2       158043  1459  2018  -450

如何在R中分别计算组的中位数

how to calculate the median for groups separately in R

r

plyr

lapply

dplyr

编辑