当列更改值时对数据框进行子集化

Subset a dataframe when a column changes value

我有一个包含曲棍球比赛实况数据的数据集。我的目标是创建玩家不会将冰球翻过来的子集。

数据集包含一个标记为“营业额”的二进制列,如果比赛导致营业额为 1,如果比赛成功则为 0。我的 objective 是为营业额列中具有连续 0 的行的每个子集创建一个唯一的 ID。

# Example data:
set.seed(33)
x <- runif(20)
y <- runif(20)
df <-as.data.frame(cbind(x,y)) 
df$turnover = ifelse(x<0.5,1,0)

#desired output:
play_id <- c(1,2,3,4,4,4,5,6,7,8,9,10,11,12,13,13,13,14,15,16)
df$play_id <- play_id
df

这是 data.table 的一种方法:

library(data.table)
setDT(df)[, tmp := if(first(turnover) == 0) 
                   replace(turnover, 1, 1) else turnover, rleid(turnover)]
df[, result := cumsum(tmp)]
df[, tmp := NULL]
df
#             x          y turnover play_id result
# 1: 0.44594048 0.56645266        1       1      1
# 2: 0.39465031 0.04273416        1       2      2
# 3: 0.48372887 0.48831925        1       3      3
# 4: 0.91887596 0.35122322        0       4      4
# 5: 0.84388144 0.96966171        0       4      4
# 6: 0.51734962 0.78957889        0       4      4
# 7: 0.43712500 0.59663026        1       5      5
# 8: 0.34319822 0.35667053        1       6      6
# 9: 0.01551696 0.75870834        1       7      7
#10: 0.11799116 0.26105077        1       8      8
#11: 0.69098590 0.49398192        0       9      9
#12: 0.26048568 0.79289819        1      10     10
#13: 0.22505121 0.44299896        1      11     11
#14: 0.34238622 0.33326556        1      12     12
#15: 0.78188794 0.62576846        0      13     13
#16: 0.84324669 0.13619319        0      13     13
#17: 0.77474887 0.54868278        0      13     13
#18: 0.38719298 0.92183168        1      14     14
#19: 0.13576507 0.27603364        1      15     15
#20: 0.90035758 0.91741393        0      16     16

对于 turnover 值为 0 的组,我们将该组中的第一个值更改为 1 并对其求和。

虽然有点长但是只有dplyr。 (尽管 data.table::rleid() 也是我最喜欢的功能)

df %>% group_by(turnover, play_id = cumsum(turnover)) %>%
  mutate(play_id = ifelse(turnover == 0 & as.numeric(row_number()) == 1, 1, turnover)) %>%
  ungroup() %>% mutate(play_id = cumsum(play_id))

# A tibble: 20 x 4
        x      y turnover play_id
    <dbl>  <dbl>    <dbl>   <dbl>
 1 0.446  0.566         1       1
 2 0.395  0.0427        1       2
 3 0.484  0.488         1       3
 4 0.919  0.351         0       4
 5 0.844  0.970         0       4
 6 0.517  0.790         0       4
 7 0.437  0.597         1       5
 8 0.343  0.357         1       6
 9 0.0155 0.759         1       7
10 0.118  0.261         1       8
11 0.691  0.494         0       9
12 0.260  0.793         1      10
13 0.225  0.443         1      11
14 0.342  0.333         1      12
15 0.782  0.626         0      13
16 0.843  0.136         0      13
17 0.775  0.549         0      13
18 0.387  0.922         1      14
19 0.136  0.276         1      15
20 0.900  0.917         0      16

仅使用 baseR

df$play_id <- cumsum(replace(df$turnover, with(rle(df$turnover != 0), 1+cumsum(lengths)[values]), 1))

            x          y turnover play_id
1  0.44594048 0.56645266        1       1
2  0.39465031 0.04273416        1       2
3  0.48372887 0.48831925        1       3
4  0.91887596 0.35122322        0       4
5  0.84388144 0.96966171        0       4
6  0.51734962 0.78957889        0       4
7  0.43712500 0.59663026        1       5
8  0.34319822 0.35667053        1       6
9  0.01551696 0.75870834        1       7
10 0.11799116 0.26105077        1       8
11 0.69098590 0.49398192        0       9
12 0.26048568 0.79289819        1      10
13 0.22505121 0.44299896        1      11
14 0.34238622 0.33326556        1      12
15 0.78188794 0.62576846        0      13
16 0.84324669 0.13619319        0      13
17 0.77474887 0.54868278        0      13
18 0.38719298 0.92183168        1      14
19 0.13576507 0.27603364        1      15
20 0.90035758 0.91741393        0      16