基于附加条件构建变量滞后

Constructing variable lags based on additional condition

我想根据以下附加条件和操作创建滞后变量:

假设我们观察一名游戏玩家十天。 day_active 表示他当天是否活跃,n_wins 表示他赢了多少场比赛。

Example dataset:
    # Ten consecutive days of observations for a single player.
    da <- data.frame(
      # day index, 1 through 10 (kept as double, not integer)
      day = as.numeric(1:10),
      # 1 when the player was active on that day, 0 otherwise
      day_active = c(1, 1, 0, 0, 1, 1, 0, 0, 1, 1),
      # number of games the player won on that day
      n_wins = c(2, 3, 0, 0, 1, 0, 0, 0, 0, 1)
    )

da
   day day_active n_wins
1    1          1      2
2    2          1      3
3    3          0      0
4    4          0      0
5    5          1      1
6    6          1      0
7    7          0      0
8    8          0      0
9    9          1      0
10  10          1      1

转换后应该是这样的:lag_n_wins 取前一天的 n_wins;从示例可以看出,如果前一天不活跃,则沿用最近一个活跃日的 n_wins(第一天没有前一天,因此为 NA):

# Expected result: the example data plus the desired lag_n_wins column.
da2 <- data.frame(
  day = as.numeric(1:10),
  day_active = c(1, 1, 0, 0, 1, 1, 0, 0, 1, 1),
  n_wins = c(2, 3, 0, 0, 1, 0, 0, 0, 0, 1),
  # day 1 has no preceding day, hence NA
  lag_n_wins = c(NA, 2, 3, 3, 3, 1, 0, 0, 0, 0)
)
da2
   day day_active n_wins lag_n_wins
1    1          1      2         NA
2    2          1      3          2
3    3          0      0          3
4    4          0      0          3
5    5          1      1          3
6    6          1      0          1
7    7          0      0          0
8    8          0      0          0
9    9          1      0          0
10  10          1      1          0

我们可以对逻辑向量 day_active == 1 求累积和(cumsum),创建一个分组列 grp:每个活跃日都会开启一个新的分组。在每个分组内,如果 n_wins 全部为 0,则保持不变;否则用 na_if 把 0 替换为 NA,再用 zoo 包的 na.locf0 把 NA 替换为它前面最近的非 NA 值。最后取消分组(ungroup),并对新建的列取滞后(lag)。

library(dplyr)    
# Build lag_n_wins: the previous row's win count, where zeros that follow a
# non-zero value inside the same group are first replaced by that value.
da %>%
  # the running count of active days labels the groups: each active day
  # starts a new group that also contains the rows up to the next active day
  group_by(grp = cumsum(day_active == 1)) %>%
  mutate(
    # a group made up only of zeros is kept as is; otherwise its zeros are
    # turned into NA so that they can be filled in the next step
    lag_n_wins = if (any(n_wins != 0)) na_if(n_wins, 0) else n_wins,
    # fill each NA with the closest preceding non-NA value of the group
    lag_n_wins = zoo::na.locf0(lag_n_wins)
  ) %>%
  ungroup() %>%
  # shift down by one row over the whole table; the first row becomes NA
  mutate(lag_n_wins = lag(lag_n_wins)) %>%
  # the grouping helper is not part of the result
  select(-grp)
# A tibble: 10 x 4
#     day day_active n_wins lag_n_wins
#   <dbl>      <dbl>  <dbl>      <dbl>
# 1     1          1      2         NA
# 2     2          1      3          2
# 3     3          0      0          3
# 4     4          0      0          3
# 5     5          1      1          3
# 6     6          1      0          1
# 7     7          0      0          0
# 8     8          0      0          0
# 9     9          1      0          0
#10    10          1      1          0