在 R 中按条件分别乘以组的中位数

Question

我有这个数据集

df=structure(list(Dt = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 
22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 
35L, 36L, 37L, 38L, 39L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 
36L, 37L, 38L, 39L), .Label = c("2018-02-20 00:00:00.000", "2018-02-21 00:00:00.000", 
"2018-02-22 00:00:00.000", "2018-02-23 00:00:00.000", "2018-02-24 00:00:00.000", 
"2018-02-25 00:00:00.000", "2018-02-26 00:00:00.000", "2018-02-27 00:00:00.000", 
"2018-02-28 00:00:00.000", "2018-03-01 00:00:00.000", "2018-03-02 00:00:00.000", 
"2018-03-03 00:00:00.000", "2018-03-04 00:00:00.000", "2018-03-05 00:00:00.000", 
"2018-03-06 00:00:00.000", "2018-03-07 00:00:00.000", "2018-03-08 00:00:00.000", 
"2018-03-09 00:00:00.000", "2018-03-10 00:00:00.000", "2018-03-11 00:00:00.000", 
"2018-03-12 00:00:00.000", "2018-03-13 00:00:00.000", "2018-03-14 00:00:00.000", 
"2018-03-15 00:00:00.000", "2018-03-16 00:00:00.000", "2018-03-17 00:00:00.000", 
"2018-03-18 00:00:00.000", "2018-03-19 00:00:00.000", "2018-03-20 00:00:00.000", 
"2018-03-21 00:00:00.000", "2018-03-22 00:00:00.000", "2018-03-23 00:00:00.000", 
"2018-03-24 00:00:00.000", "2018-03-25 00:00:00.000", "2018-03-26 00:00:00.000", 
"2018-03-27 00:00:00.000", "2018-03-28 00:00:00.000", "2018-03-29 00:00:00.000", 
"2018-03-30 00:00:00.000"), class = "factor"), ItemRelation = c(158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 
158043L, 158043L, 158043L, 234L, 234L, 234L, 234L, 234L, 234L, 
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L
), stuff = c(200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3600L, 
0L, 0L, 0L, 0L, 700L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1000L, 2600L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 400L, 700L, 
200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3600L, 0L, 0L, 0L, 
0L, 700L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1000L, 
2600L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 400L, 700L), num = c(1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 
1459L, 1459L, 1459L, 1459L, 1459L), year = c(2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L), action = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 1L, 1L, 1L, 1L)), .Names = c("Dt", "ItemRelation", 
"stuff", "num", "year", "action"), class = "data.frame", row.names = c(NA, 
-78L))

下一个操作是对这个数据进行的。 1. 计算第一类动作和零类动作的中位数的操作（最后五个非零观察值）。 2.then 第一个类别的中位数减去零类别的中位数。 MKR的解决方案非常准确。

library(dplyr)

df %>% filter(stuff > 0) %>%  #First filter out for stuff > 0 which of our interest
  group_by(ItemRelation, num, year) %>%
    mutate(m = median(stuff[action==1]),
           m0 = median(tail(stuff[action==0], 5))) %>%  # Calculate m and m0 for all rows
  filter(action == 1) %>%  # Now keep only rows with action == 1
  mutate(m = m-m0) %>%
  select(-Dt,-m0,-action

如何将每个组的计算结果乘以动作的个数，但仅适用于那些大于零的那些。例如，对于 stratum

ItemRelation    num     year
158043          1459    2018

我们有 4 个在运行，只有两个大于零所以计算结果（m）我们乘以二。

Answer 1

dplyr - chain 中 stuff>0 的数据已过滤。 n() 表示每组的计数，其中 stuff>0 和 action ==1。因此，可以将 m 的最终值乘以 n()。最后，distinct 将确保删除重复的行。

library(dplyr)

df %>% filter(stuff > 0) %>%  #First filter out for stuff > 0 which of our interest
  group_by(ItemRelation, num, year) %>%
  mutate(m = median(stuff[action==1]),
         m0 = median(tail(stuff[action==0], 5))) %>%  # Calculate m and m0 for all rows
  filter(action == 1) %>%  # Now keep only rows with action == 1
  mutate(m = (m-m0)*n()) %>%
  select(-Dt,-m0,-action, - stuff) %>% distinct()

# # A tibble: 2 x 4
# # Groups: ItemRelation, num, year [2]
#   ItemRelation   num  year     m
#          <int> <int> <int> <dbl>
# 1       158043  1459  2018  -900
# 2          234  1459  2018  -900

在 R 中按条件分别乘以组的中位数

multiply median for groups separately in R by condition

r

plyr

lapply

dplyr