使用 tidyverse 或其他 R 命令,按分组变量查找最长的一段连续数据行
finding the longest stretch of rows of data entries by grouping variable using tidyverse/ other R command
我不确定我是否用正确的标题描述了我的问题,但我的想法是:
我想在使用 group_by() 之后,找出每个组中最长的一段连续数据行,并且结果取决于当前的行顺序。换句话说,同一个组的行之间可能存在一处(或多处)中断(例如在按其他某些列 arrange() 之后)。我想得到一个新列(例如通过 mutate()),用来标记位于每组最长连续段内的行。下面是一个例子:
# Example input: `group` is the grouping variable, `order` records the current
# row order. Bound to `df` because the snippets that follow operate on `df`;
# without the assignment `df` would resolve to the stats::df function.
df <- data.frame(
  group = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 1, 1, 3, 1, 2, 2, 2),
  order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)
)
其中,我想得到如下数据框:
# Desired result: `longest` is TRUE for the rows that lie inside the longest
# consecutive run of their group. Written with TRUE/FALSE because T and F are
# ordinary variables that can be reassigned.
expected <- data.frame(
  group = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 1, 1, 3, 1, 2, 2, 2),
  order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
  longest = c(
    TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
    FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE
  )
)
我们可以先为 group 列中连续相同的值创建一个分组标识,然后统计每个连续段的行数,最后再按 group 分组,对每个组中行数最多的连续段里的行返回 TRUE。
library(tidyverse)
# Flag, for every `group`, the rows that belong to its longest consecutive run.
df %>%
  # A new run starts wherever `group` differs from the previous row.
  group_by(run_id = cumsum(c(1, diff(group) != 0))) %>%
  mutate(run_len = n()) %>%
  # Compare each run's length with the longest run of the same `group`.
  group_by(group) %>%
  mutate(longest = run_len == max(run_len)) %>%
  ungroup() %>%
  # Drop the helper columns so only the original columns and the flag remain.
  select(-run_id, -run_len)
输出
group order longest
<dbl> <dbl> <lgl>
1 1 1 TRUE
2 1 2 TRUE
3 1 3 TRUE
4 2 4 FALSE
5 2 5 FALSE
6 3 6 TRUE
7 3 7 TRUE
8 3 8 TRUE
9 3 9 TRUE
10 3 10 TRUE
11 1 11 FALSE
12 1 12 FALSE
13 3 13 FALSE
14 1 14 FALSE
15 2 15 TRUE
16 2 16 TRUE
17 2 17 TRUE
如果有多个连续段并列最长,而您只想把其中第一个连续段标记为 TRUE,那么可以这样做:
# Same idea, but when several runs of a group tie for the longest length only
# the earliest of them is flagged.
df2 %>%
  # A new run starts wherever `group` differs from the previous row.
  group_by(run_id = cumsum(c(1, diff(group) != 0))) %>%
  mutate(run_len = n()) %>%
  group_by(group) %>%
  mutate(longest = run_len == max(run_len)) %>%
  # Within each (group, longest) combination, find the earliest run id.
  group_by(longest, .add = TRUE) %>%
  mutate(first_run = min(run_id)) %>%
  ungroup(longest) %>%
  # Keep the flag only on the earliest of the longest runs.
  mutate(longest = longest & run_id == first_run & !is.na(first_run)) %>%
  ungroup() %>%
  dplyr::select(-c(run_id, run_len, first_run))
输出
group order longest
<dbl> <dbl> <lgl>
1 1 1 TRUE
2 1 2 TRUE
3 1 3 TRUE
4 2 4 FALSE
5 2 5 FALSE
6 3 6 TRUE
7 3 7 TRUE
8 3 8 TRUE
9 3 9 TRUE
10 3 10 TRUE
11 1 11 FALSE
12 1 12 FALSE
13 3 13 FALSE
14 1 14 FALSE
15 2 15 TRUE
16 2 16 TRUE
17 2 17 TRUE
18 1 18 FALSE
19 1 19 FALSE
20 1 20 FALSE
数据
# Example data with a tie: group 1 has two runs of length 3
# (rows 1-3 and rows 18-20). Both columns are stored as doubles.
df2 <- data.frame(
  group = rep(c(1, 2, 3, 1, 3, 1, 2, 1), times = c(3, 2, 5, 2, 1, 1, 3, 3)),
  order = as.numeric(1:20)
)
在 Base R 中:
# Base R: run-length encode `group`, flag the runs whose length equals the
# longest run of the same value, then expand back to one flag per row.
df$longest <- local({
  runs <- rle(df$group)
  longest_len <- ave(runs$lengths, runs$values, FUN = max)
  rep(longest_len == runs$lengths, times = runs$lengths)
})
df
group order longest
1 1 1 TRUE
2 1 2 TRUE
3 1 3 TRUE
4 2 4 FALSE
5 2 5 FALSE
6 3 6 TRUE
7 3 7 TRUE
8 3 8 TRUE
9 3 9 TRUE
10 3 10 TRUE
11 1 11 FALSE
12 1 12 FALSE
13 3 13 FALSE
14 1 14 FALSE
15 2 15 TRUE
16 2 16 TRUE
17 2 17 TRUE
另一种 Base R 写法:
# Base R alternative: overwrite the run values with the per-run flag and let
# inverse.rle() expand the flags back to one value per row.
runs <- rle(df$group)
runs$values <- runs$lengths == ave(runs$lengths, runs$values, FUN = max)
df$longest <- inverse.rle(runs)
在data.table中:
library(data.table)
# data.table: convert `df` by reference, then add the columns in place.
setDT(df)
# N = length of the consecutive run each row belongs to.
df[, N := .N, by = rleid(group)]
# A row is flagged when its run is as long as the longest run of its group.
df[, longest := N == max(N), by = group]
# Print the updated table.
df[]
group order N longest
1: 1 1 3 TRUE
2: 1 2 3 TRUE
3: 1 3 3 TRUE
4: 2 4 2 FALSE
5: 2 5 2 FALSE
6: 3 6 5 TRUE
7: 3 7 5 TRUE
8: 3 8 5 TRUE
9: 3 9 5 TRUE
10: 3 10 5 TRUE
11: 1 11 2 FALSE
12: 1 12 2 FALSE
13: 3 13 1 FALSE
14: 1 14 1 FALSE
15: 2 15 3 TRUE
16: 2 16 3 TRUE
17: 2 17 3 TRUE
我不确定我是否用正确的标题描述了我的问题,但我的想法是:
我想在使用 group_by() 之后,找出每个组中最长的一段连续数据行,并且结果取决于当前的行顺序。换句话说,同一个组的行之间可能存在一处(或多处)中断(例如在按其他某些列 arrange() 之后)。我想得到一个新列(例如通过 mutate()),用来标记位于每组最长连续段内的行。下面是一个例子:
# Example input: `group` is the grouping variable, `order` records the current
# row order. Bound to `df` because the snippets that follow operate on `df`;
# without the assignment `df` would resolve to the stats::df function.
df <- data.frame(
  group = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 1, 1, 3, 1, 2, 2, 2),
  order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)
)
其中,我想得到如下数据框:
# Desired result: `longest` is TRUE for the rows that lie inside the longest
# consecutive run of their group. Written with TRUE/FALSE because T and F are
# ordinary variables that can be reassigned.
expected <- data.frame(
  group = c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 1, 1, 3, 1, 2, 2, 2),
  order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17),
  longest = c(
    TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
    FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE
  )
)
我们可以先为 group 列中连续相同的值创建一个分组标识,然后统计每个连续段的行数,最后再按 group 分组,对每个组中行数最多的连续段里的行返回 TRUE。
library(tidyverse)
# Flag, for every `group`, the rows that belong to its longest consecutive run.
df %>%
  # A new run starts wherever `group` differs from the previous row.
  group_by(run_id = cumsum(c(1, diff(group) != 0))) %>%
  mutate(run_len = n()) %>%
  # Compare each run's length with the longest run of the same `group`.
  group_by(group) %>%
  mutate(longest = run_len == max(run_len)) %>%
  ungroup() %>%
  # Drop the helper columns so only the original columns and the flag remain.
  select(-run_id, -run_len)
输出
group order longest
<dbl> <dbl> <lgl>
1 1 1 TRUE
2 1 2 TRUE
3 1 3 TRUE
4 2 4 FALSE
5 2 5 FALSE
6 3 6 TRUE
7 3 7 TRUE
8 3 8 TRUE
9 3 9 TRUE
10 3 10 TRUE
11 1 11 FALSE
12 1 12 FALSE
13 3 13 FALSE
14 1 14 FALSE
15 2 15 TRUE
16 2 16 TRUE
17 2 17 TRUE
如果有多个连续段并列最长,而您只想把其中第一个连续段标记为 TRUE,那么可以这样做:
# Same idea, but when several runs of a group tie for the longest length only
# the earliest of them is flagged.
df2 %>%
  # A new run starts wherever `group` differs from the previous row.
  group_by(run_id = cumsum(c(1, diff(group) != 0))) %>%
  mutate(run_len = n()) %>%
  group_by(group) %>%
  mutate(longest = run_len == max(run_len)) %>%
  # Within each (group, longest) combination, find the earliest run id.
  group_by(longest, .add = TRUE) %>%
  mutate(first_run = min(run_id)) %>%
  ungroup(longest) %>%
  # Keep the flag only on the earliest of the longest runs.
  mutate(longest = longest & run_id == first_run & !is.na(first_run)) %>%
  ungroup() %>%
  dplyr::select(-c(run_id, run_len, first_run))
输出
group order longest
<dbl> <dbl> <lgl>
1 1 1 TRUE
2 1 2 TRUE
3 1 3 TRUE
4 2 4 FALSE
5 2 5 FALSE
6 3 6 TRUE
7 3 7 TRUE
8 3 8 TRUE
9 3 9 TRUE
10 3 10 TRUE
11 1 11 FALSE
12 1 12 FALSE
13 3 13 FALSE
14 1 14 FALSE
15 2 15 TRUE
16 2 16 TRUE
17 2 17 TRUE
18 1 18 FALSE
19 1 19 FALSE
20 1 20 FALSE
数据
# Example data with a tie: group 1 has two runs of length 3
# (rows 1-3 and rows 18-20). Both columns are stored as doubles.
df2 <- data.frame(
  group = rep(c(1, 2, 3, 1, 3, 1, 2, 1), times = c(3, 2, 5, 2, 1, 1, 3, 3)),
  order = as.numeric(1:20)
)
在 Base R 中:
# Base R: run-length encode `group`, flag the runs whose length equals the
# longest run of the same value, then expand back to one flag per row.
df$longest <- local({
  runs <- rle(df$group)
  longest_len <- ave(runs$lengths, runs$values, FUN = max)
  rep(longest_len == runs$lengths, times = runs$lengths)
})
df
group order longest
1 1 1 TRUE
2 1 2 TRUE
3 1 3 TRUE
4 2 4 FALSE
5 2 5 FALSE
6 3 6 TRUE
7 3 7 TRUE
8 3 8 TRUE
9 3 9 TRUE
10 3 10 TRUE
11 1 11 FALSE
12 1 12 FALSE
13 3 13 FALSE
14 1 14 FALSE
15 2 15 TRUE
16 2 16 TRUE
17 2 17 TRUE
另一种 Base R 写法:
# Base R alternative: overwrite the run values with the per-run flag and let
# inverse.rle() expand the flags back to one value per row.
runs <- rle(df$group)
runs$values <- runs$lengths == ave(runs$lengths, runs$values, FUN = max)
df$longest <- inverse.rle(runs)
在data.table中:
library(data.table)
# data.table: convert `df` by reference, then add the columns in place.
setDT(df)
# N = length of the consecutive run each row belongs to.
df[, N := .N, by = rleid(group)]
# A row is flagged when its run is as long as the longest run of its group.
df[, longest := N == max(N), by = group]
# Print the updated table.
df[]
group order N longest
1: 1 1 3 TRUE
2: 1 2 3 TRUE
3: 1 3 3 TRUE
4: 2 4 2 FALSE
5: 2 5 2 FALSE
6: 3 6 5 TRUE
7: 3 7 5 TRUE
8: 3 8 5 TRUE
9: 3 9 5 TRUE
10: 3 10 5 TRUE
11: 1 11 2 FALSE
12: 1 12 2 FALSE
13: 3 13 1 FALSE
14: 1 14 1 FALSE
15: 2 15 3 TRUE
16: 2 16 3 TRUE
17: 2 17 3 TRUE