计算行数直到达到标准,然后重新开始

Count rows until criterion is reached, then start again

我的数据是这样的:

id = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4)
time=c(20,30,1100,40,31,32,33,1005,22,23,1001,24,12,13,14,1002)
test <- data.frame(id,time)

我现在正在尝试计算行数,直到 time > 1000id 分组。到目前为止我得到了

library(dplyr) 
test %>%    
group_by(id, idx = cumsum(time >= 1000))
%>%    
mutate(trip_count = row_number()) %>%    
ungroup %>%   
select(-idx)

到目前为止,这是有效的,但是当 time > 1000 时,我希望计数更进一步,并在下一列再次从 1 开始,而不是 1。这有可能吗?

您可以使用 lag:

library(dplyr)

test %>%    
  group_by(id, idx = cumsum(lag(time, default = 0) >= 1000)) %>%    
  mutate(trip_count = row_number()) %>%    
  ungroup %>%   
  select(-idx)

输出:

# A tibble: 16 x 3
      id  time trip_count
   <dbl> <dbl>      <int>
 1     1    20          1
 2     1    30          2
 3     1    40          3
 4     1  1100          4
 5     2    31          1
 6     2    32          2
 7     2    33          3
 8     2  1005          4
 9     3    22          1
10     3    23          2
11     3    24          3
12     3  1001          4
13     4    12          1
14     4    13          2
15     4    14          3
16     4  1002          4

由于每个组在您的数据中有 4 行,我们可以使用:

> test %>% left_join(test %>% filter(time < 1000) %>% group_by(id) %>% mutate(trip_count = row_number())) %>% group_by(id) %>% 
+                                                       mutate(trip_count = replace_na(trip_count, 4))
Joining, by = c("id", "time")
# A tibble: 16 x 3
# Groups:   id [4]
      id  time trip_count
   <dbl> <dbl>      <dbl>
 1     1    20          1
 2     1    30          2
 3     1    40          3
 4     1  1100          4
 5     2    31          1
 6     2    32          2
 7     2    33          3
 8     2  1005          4
 9     3    22          1
10     3    23          2
11     3    24          3
12     3  1001          4
13     4    12          1
14     4    13          2
15     4    14          3
16     4  1002          4
> 

如果你的数据不是每组 4 行,可以使用这个:

> id = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,4)
> time=c(20,30,40,1100,31,32,33,1005,22,23,24,1001,12,13,14,15,1002)
> test <- data.frame(id,time)
> test %>% left_join(test %>% filter(time < 1000) %>% group_by(id) %>% mutate(trip_count = row_number())) %>% group_by(id) %>% 
+               mutate(across(trip_count, ~ replace(., is.na(.), n())))
Joining, by = c("id", "time")
# A tibble: 17 x 3
# Groups:   id [4]
      id  time trip_count
   <dbl> <dbl>      <int>
 1     1    20          1
 2     1    30          2
 3     1    40          3
 4     1  1100          4
 5     2    31          1
 6     2    32          2
 7     2    33          3
 8     2  1005          4
 9     3    22          1
10     3    23          2
11     3    24          3
12     3  1001          4
13     4    12          1
14     4    13          2
15     4    14          3
16     4    15          4
17     4  1002          5
> 

我在第 4 组中添加了额外的行。

基于 OP 共享的新数据:

> test %>% 
+   left_join(test %>% group_by(id) %>% filter(row_number() < which(time >= 1000)) %>% 
+   mutate(trip_count = row_number())) %>% 
+   left_join(test %>% group_by(id) %>% filter(row_number() > which(time >= 1000)) %>% mutate(trip_count1 = row_number())) %>% 
+                     mutate(trip_count = coalesce(trip_count, trip_count1)) %>% select(-trip_count1) %>% group_by(id) %>% 
+                       mutate(rowid = row_number()) %>% rowwise() %>%  mutate(trip_count = replace_na(trip_count, rowid)) %>% select(-rowid)
Joining, by = c("id", "time")
Joining, by = c("id", "time")
# A tibble: 16 x 3
# Rowwise:  id
      id  time trip_count
   <dbl> <dbl>      <int>
 1     1    20          1
 2     1    30          2
 3     1  1100          3
 4     1    40          1
 5     2    31          1
 6     2    32          2
 7     2    33          3
 8     2  1005          4
 9     3    22          1
10     3    23          2
11     3  1001          3
12     3    24          1
13     4    12          1
14     4    13          2
15     4    14          3
16     4  1002          4
>