在 r 中保持组元素从第一个非缺失开始直到最后一个
in r keep group elements starting from first non-missing until last
我想为每个数据框组保留第一个非缺失元素,直到最后一个元素(即使最后一个元素缺失)。
例如:
df <- structure(list(Ind = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), result = c(81,
90, NA, NA, 84, 86, 101, 134, NA, 78, NA, 103, NA, 112, NA, 111,
NA, 110, 256, 123, NA, NA, 194, 301, 98.5, 151, NA, 105, NA,
158, NA, 190, NA, 167, NA, 190, NA, NA, 113, 157, NA, 197, 155,
NA, 178, 146, NA, 153, 153, 147, NA, 170, NA, 157)), row.names = c(NA,
-54L), class = c("tbl_df", "tbl", "data.frame"))
在此示例中,第 3 组应从第 3 个观察开始,第 1 组和第 2 组应保持不变。
dplyr
您可以使用 dplyr::cumany
,它将设置为 TRUE
不是 NA 的第一个和后续值 (!is.na(result)
)。
df %>%
group_by(Ind) %>%
filter(cumany(!is.na(result)))
基础 R
使用Reduce
和accumulate = T
模仿dplyr::cumany(!is.na(result))
和ave
分组。
df[as.logical(ave(df$result, df$Ind, FUN = function(x) Reduce(any, !is.na(x), accumulate = TRUE))),]
输出
# A tibble: 52 x 2
# Groups: Ind [3]
Ind result
<int> <dbl>
1 1 81
2 1 90
3 1 NA
4 1 NA
5 1 84
6 1 86
7 1 101
8 1 134
9 1 NA
10 1 78
11 1 NA
12 1 103
13 1 NA
14 1 112
15 1 NA
16 1 111
17 1 NA
18 1 110
19 2 256
20 2 123
21 2 NA
22 2 NA
23 2 194
24 2 301
25 2 98.5
26 2 151
27 2 NA
28 2 105
29 2 NA
30 2 158
31 2 NA
32 2 190
33 2 NA
34 2 167
35 2 NA
36 2 190
37 3 113
38 3 157
39 3 NA
40 3 197
41 3 155
42 3 NA
43 3 178
44 3 146
45 3 NA
46 3 153
47 3 153
48 3 147
49 3 NA
50 3 170
51 3 NA
52 3 157
另一个可能的解决方案:
library(tidyverse)
df <- structure(list(Ind = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), result = c(81,
90, NA, NA, 84, 86, 101, 134, NA, 78, NA, 103, NA, 112, NA, 111,
NA, 110, 256, 123, NA, NA, 194, 301, 98.5, 151, NA, 105, NA,
158, NA, 190, NA, 167, NA, 190, NA, NA, 113, 157, NA, 197, 155,
NA, 178, 146, NA, 153, 153, 147, NA, 170, NA, 157)), row.names = c(NA,
-54L), class = c("tbl_df", "tbl", "data.frame"))
df %>%
group_by(Ind) %>%
mutate(aux = which.min(is.na(result))) %>%
slice(unique(aux):n()) %>% select(-aux) %>%
ungroup
#> Ind result
#> 1 1 81.0
#> 2 1 90.0
#> 3 1 NA
#> 4 1 NA
#> 5 1 84.0
#> 6 1 86.0
#> 7 1 101.0
#> 8 1 134.0
#> 9 1 NA
#> 10 1 78.0
#> 11 1 NA
#> 12 1 103.0
#> 13 1 NA
#> 14 1 112.0
#> 15 1 NA
#> 16 1 111.0
#> 17 1 NA
#> 18 1 110.0
#> 19 2 256.0
#> 20 2 123.0
#> 21 2 NA
#> 22 2 NA
#> 23 2 194.0
#> 24 2 301.0
#> 25 2 98.5
#> 26 2 151.0
#> 27 2 NA
#> 28 2 105.0
#> 29 2 NA
#> 30 2 158.0
#> 31 2 NA
#> 32 2 190.0
#> 33 2 NA
#> 34 2 167.0
#> 35 2 NA
#> 36 2 190.0
#> 37 3 113.0
#> 38 3 157.0
#> 39 3 NA
#> 40 3 197.0
#> 41 3 155.0
#> 42 3 NA
#> 43 3 178.0
#> 44 3 146.0
#> 45 3 NA
#> 46 3 153.0
#> 47 3 153.0
#> 48 3 147.0
#> 49 3 NA
#> 50 3 170.0
#> 51 3 NA
#> 52 3 157.0
我想为每个数据框组保留第一个非缺失元素,直到最后一个元素(即使最后一个元素缺失)。 例如:
df <- structure(list(Ind = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), result = c(81,
90, NA, NA, 84, 86, 101, 134, NA, 78, NA, 103, NA, 112, NA, 111,
NA, 110, 256, 123, NA, NA, 194, 301, 98.5, 151, NA, 105, NA,
158, NA, 190, NA, 167, NA, 190, NA, NA, 113, 157, NA, 197, 155,
NA, 178, 146, NA, 153, 153, 147, NA, 170, NA, 157)), row.names = c(NA,
-54L), class = c("tbl_df", "tbl", "data.frame"))
在此示例中,第 3 组应从第 3 个观察开始,第 1 组和第 2 组应保持不变。
dplyr
您可以使用 dplyr::cumany
,它将设置为 TRUE
不是 NA 的第一个和后续值 (!is.na(result)
)。
df %>%
group_by(Ind) %>%
filter(cumany(!is.na(result)))
基础 R
使用Reduce
和accumulate = T
模仿dplyr::cumany(!is.na(result))
和ave
分组。
df[as.logical(ave(df$result, df$Ind, FUN = function(x) Reduce(any, !is.na(x), accumulate = TRUE))),]
输出
# A tibble: 52 x 2
# Groups: Ind [3]
Ind result
<int> <dbl>
1 1 81
2 1 90
3 1 NA
4 1 NA
5 1 84
6 1 86
7 1 101
8 1 134
9 1 NA
10 1 78
11 1 NA
12 1 103
13 1 NA
14 1 112
15 1 NA
16 1 111
17 1 NA
18 1 110
19 2 256
20 2 123
21 2 NA
22 2 NA
23 2 194
24 2 301
25 2 98.5
26 2 151
27 2 NA
28 2 105
29 2 NA
30 2 158
31 2 NA
32 2 190
33 2 NA
34 2 167
35 2 NA
36 2 190
37 3 113
38 3 157
39 3 NA
40 3 197
41 3 155
42 3 NA
43 3 178
44 3 146
45 3 NA
46 3 153
47 3 153
48 3 147
49 3 NA
50 3 170
51 3 NA
52 3 157
另一个可能的解决方案:
library(tidyverse)
df <- structure(list(Ind = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), result = c(81,
90, NA, NA, 84, 86, 101, 134, NA, 78, NA, 103, NA, 112, NA, 111,
NA, 110, 256, 123, NA, NA, 194, 301, 98.5, 151, NA, 105, NA,
158, NA, 190, NA, 167, NA, 190, NA, NA, 113, 157, NA, 197, 155,
NA, 178, 146, NA, 153, 153, 147, NA, 170, NA, 157)), row.names = c(NA,
-54L), class = c("tbl_df", "tbl", "data.frame"))
df %>%
group_by(Ind) %>%
mutate(aux = which.min(is.na(result))) %>%
slice(unique(aux):n()) %>% select(-aux) %>%
ungroup
#> Ind result
#> 1 1 81.0
#> 2 1 90.0
#> 3 1 NA
#> 4 1 NA
#> 5 1 84.0
#> 6 1 86.0
#> 7 1 101.0
#> 8 1 134.0
#> 9 1 NA
#> 10 1 78.0
#> 11 1 NA
#> 12 1 103.0
#> 13 1 NA
#> 14 1 112.0
#> 15 1 NA
#> 16 1 111.0
#> 17 1 NA
#> 18 1 110.0
#> 19 2 256.0
#> 20 2 123.0
#> 21 2 NA
#> 22 2 NA
#> 23 2 194.0
#> 24 2 301.0
#> 25 2 98.5
#> 26 2 151.0
#> 27 2 NA
#> 28 2 105.0
#> 29 2 NA
#> 30 2 158.0
#> 31 2 NA
#> 32 2 190.0
#> 33 2 NA
#> 34 2 167.0
#> 35 2 NA
#> 36 2 190.0
#> 37 3 113.0
#> 38 3 157.0
#> 39 3 NA
#> 40 3 197.0
#> 41 3 155.0
#> 42 3 NA
#> 43 3 178.0
#> 44 3 146.0
#> 45 3 NA
#> 46 3 153.0
#> 47 3 153.0
#> 48 3 147.0
#> 49 3 NA
#> 50 3 170.0
#> 51 3 NA
#> 52 3 157.0