使用 tidyverse 或其他 R 命令,按分组变量查找最长的一段连续数据行

finding the longest stretch of rows of data entries by grouping variable using tidyverse/ other R command

我不确定我是否用正确的标题描述了我的问题,但我的想法是:

我想在使用 group_by() 之后,找出每个组中最长的一段连续数据行,并且结果取决于当前的行顺序。换句话说,同一个组内可能存在一处(或多处)不连续(例如在按其他某些列 arrange() 之后)。我想得到一个新列(例如通过 mutate()),用于标记每组中位于最长连续区段内的行。下面是一个例子:

# Example input. `group` is the grouping variable and `order` records the
# current row order. The frame is bound to `df` because every later snippet
# reads `df`; the trailing `df` still prints it.
df <- data.frame(
  group = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 1, 1, 3, 1, 2, 2, 2),
  order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)
)
df

我希望由此得到如下数据框:

# Desired result: `longest` is TRUE for the rows that belong to the longest
# run of consecutive rows of their group. The logical values are spelled
# TRUE/FALSE because T and F are ordinary variables that can be reassigned.
data.frame(
  group = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 1, 1, 3, 1, 2, 2, 2),
  order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
  longest = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
              FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE)
)

我们可以先为 group 列中连续相同的值创建一个分段(游程)编号,并计算每个分段的行数;然后按 group 分组,对每个组中行数最多的分段所在的行返回 TRUE。

library(tidyverse)
  
# Flag, for every value of `group`, the rows that sit in its longest run of
# consecutive rows. Runs tied for the maximum length are all flagged.
df %>%
  # A new run starts wherever `group` differs from the previous row.
  mutate(run_id = cumsum(c(1, diff(group) != 0))) %>%
  group_by(run_id) %>%
  # Temporarily hold each run's row count in `longest`.
  mutate(longest = n()) %>%
  group_by(group) %>%
  # Within a group, keep TRUE only where the run length equals the maximum.
  mutate(longest = longest == max(longest)) %>%
  ungroup() %>%
  # The run id was only a helper column.
  select(-run_id)

输出

   group order longest
   <dbl> <dbl> <lgl>  
 1     1     1 TRUE   
 2     1     2 TRUE   
 3     1     3 TRUE   
 4     2     4 FALSE  
 5     2     5 FALSE  
 6     3     6 TRUE   
 7     3     7 TRUE   
 8     3     8 TRUE   
 9     3     9 TRUE   
10     3    10 TRUE   
11     1    11 FALSE  
12     1    12 FALSE  
13     3    13 FALSE  
14     1    14 FALSE  
15     2    15 TRUE   
16     2    16 TRUE   
17     2    17 TRUE  

如果同一组内有多个连续分段并列最长,而您只想把其中第一个分段标记为 TRUE,那么可以这样做:

# Same idea as flagging the longest run per group, but when several runs of a
# group tie for the maximum length only the earliest of them is flagged.
df2 %>%
  # A new run starts wherever `group` differs from the previous row.
  mutate(run_id = cumsum(c(1, diff(group) != 0))) %>%
  group_by(run_id) %>%
  # Temporarily hold each run's row count in `longest`.
  mutate(longest = n()) %>%
  group_by(group) %>%
  mutate(longest = longest == max(longest)) %>%
  # Inside each (group, longest) cell, record the earliest run id.
  group_by(longest, .add = TRUE) %>%
  mutate(first_run = min(run_id)) %>%
  ungroup(longest) %>%
  # Keep the flag only for rows of that earliest maximal run.
  mutate(longest = longest & run_id == first_run & !is.na(first_run)) %>%
  ungroup() %>%
  # Drop the helper columns.
  dplyr::select(-c(run_id, first_run))

输出

   group order longest
   <dbl> <dbl> <lgl>  
 1     1     1 TRUE   
 2     1     2 TRUE   
 3     1     3 TRUE   
 4     2     4 FALSE  
 5     2     5 FALSE  
 6     3     6 TRUE   
 7     3     7 TRUE   
 8     3     8 TRUE   
 9     3     9 TRUE   
10     3    10 TRUE   
11     1    11 FALSE  
12     1    12 FALSE  
13     3    13 FALSE  
14     1    14 FALSE  
15     2    15 TRUE   
16     2    16 TRUE   
17     2    17 TRUE   
18     1    18 FALSE  
19     1    19 FALSE  
20     1    20 FALSE  

数据

# Example data for the tie case: group 1 has two runs of length 3
# (rows 1-3 and rows 18-20).
df2 <- data.frame(
  group = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 1, 1, 3, 1, 2, 2, 2, 1, 1, 1),
  order = as.numeric(1:20)
)

在 Base R 中:

# Base R version: run-length encode `group`, mark the runs whose length equals
# the maximum run length for their value, then expand back to one flag per row.
df$longest <- local({
  runs <- rle(df$group)
  is_max <- runs$lengths == ave(runs$lengths, runs$values, FUN = max)
  rep(is_max, times = runs$lengths)
})

df
   group order longest
1      1     1    TRUE
2      1     2    TRUE
3      1     3    TRUE
4      2     4   FALSE
5      2     5   FALSE
6      3     6    TRUE
7      3     7    TRUE
8      3     8    TRUE
9      3     9    TRUE
10     3    10    TRUE
11     1    11   FALSE
12     1    12   FALSE
13     3    13   FALSE
14     1    14   FALSE
15     2    15    TRUE
16     2    16    TRUE
17     2    17    TRUE

另一种 Base R 写法:

# Run-length encode `group`, then overwrite each run's value with a flag that
# says whether the run is as long as the longest run of that value.
a <- rle(df$group)
a$values <- with(a, lengths == ave(lengths, values, FUN = max))

# Decoding the modified encoding repeats each flag once per row of its run.
df$longest <- inverse.rle(a)

在data.table中:

library(data.table)
# Convert `df` to a data.table by reference.
setDT(df)
# N: number of rows in each run of consecutive equal `group` values.
df[, N := .N, by = rleid(group)]
# longest: TRUE where the run length equals the maximum for that group.
df[, longest := N == max(N), by = group]
# Print the updated table (`:=` returns it invisibly).
df[]

   group order N longest
 1:     1     1 3    TRUE
 2:     1     2 3    TRUE
 3:     1     3 3    TRUE
 4:     2     4 2   FALSE
 5:     2     5 2   FALSE
 6:     3     6 5    TRUE
 7:     3     7 5    TRUE
 8:     3     8 5    TRUE
 9:     3     9 5    TRUE
10:     3    10 5    TRUE
11:     1    11 2   FALSE
12:     1    12 2   FALSE
13:     3    13 1   FALSE
14:     1    14 1   FALSE
15:     2    15 3    TRUE
16:     2    16 3    TRUE
17:     2    17 3    TRUE