按组删除每列中的 NA

Remove NAs in each column by group

我有一个数据框,其中的行按年份分组。变量不一定每年都有观测值,但当它们出现时,那一年有 3 个观测值,但出现在不同的行中。

> na_data
   Year Peter Paul John
1  2011     1   NA   NA
2  2011     2   NA   NA
3  2011     3   NA   NA
4  2011    NA    1   NA
5  2011    NA    2   NA
6  2011    NA    3   NA
7  2012     1   NA   NA
8  2012    NA    3   NA
9  2012     2   NA   NA
10 2012    NA    2   NA
11 2012     3   NA   NA
12 2012    NA    1   NA
13 2013    NA    1    4
14 2013    NA    2    5
15 2013    NA    3    6
16 2013     1   NA   NA
17 2013     2   NA   NA
18 2013     3   NA   NA

我想按组删除每列中的 NA。这样输出看起来像这样:

final_data
      Year Peter Paul John
 [1,] 2011     1    1   NA
 [2,] 2011     2    2   NA
 [3,] 2011     3    3   NA
 [4,] 2012     1    3   NA
 [5,] 2012     2    2   NA
 [6,] 2012     3    1   NA
 [7,] 2013     1    1    4
 [8,] 2013     2    2    5
 [9,] 2013     3    3    6

到目前为止,我已经使用了一个循环,但我正在寻找一个更简洁的解决方案,如果有人能提供帮助,那就太好了。我的解决方案:

cleaned_list <- vector("list", length(unique(full_data$Year)))

names(cleaned_list) <- unique(full_data$Year)

for(yr in unique(na_data$Year)) {
  
  temp <- matrix(NA, nrow = 3, ncol = ncol(na_data),
                 dimnames = list(NULL, colnames(na_data)))
  
  for(name in colnames(na_data)[-1]){
    no_nas <- as.vector(na.omit(na_data[Year==yr, name]))
    if (length(no_nas)!=0) temp[,name] <- no_nas
  }
  temp[,1] <- yr
  cleaned_list[[as.character(yr)]] <- temp
  
}

final_data <- do.call("rbind", cleaned_list) 

数据:

na_data <- data.frame(
  Year = rep(c(2011,2012,2013), each = 6),
  Peter = c(1:3, rep(NA, 3), 1,NA,2,NA,3,NA, rep(NA, 3),1:3),
  Paul = c(rep(NA,3), 1:3, NA,3,NA,2,NA, 1, 1:3, rep(NA,3)),
  John = c(rep(NA, 12), 4:6, rep(NA, 3))
)

desired <- data.frame(
  Year = rep(c(2011,2012,2013), each = 3),
  Peter = c(1:3, 1:3, 1:3),
  Paul = c( 1:3, 3:1, 1:3),
  John = c(rep(NA, 6), 4:6)
) # same as final_data but a dataframe

这是使用 data.table 包的一种可能的解决方案:

library(data.table)

setDT(na_data)[, lapply(.SD, function(x) if(length(y<-na.omit(x))) y else first(x)), by=Year]

#     Year Peter  Paul  John
# 1:  2011     1     1    NA
# 2:  2011     2     2    NA
# 3:  2011     3     3    NA
# 4:  2012     1     3    NA
# 5:  2012     2     2    NA
# 6:  2012     3     1    NA
# 7:  2013     1     1     4
# 8:  2013     2     2     5
# 9:  2013     3     3     6

dplyr 等效:

library(dplyr)

na_data |> 
  group_by(Year) |> 
  summarise(across(.fns = ~ if(length(y<-na.omit(.x))) y else first(.x)))

# # A tibble: 9 x 4
# # Groups:   Year [3]
#    Year Peter  Paul  John
#   <dbl> <dbl> <dbl> <int>
# 1  2011     1     1    NA
# 2  2011     2     2    NA
# 3  2011     3     3    NA
# 4  2012     1     3    NA
# 5  2012     2     2    NA
# 6  2012     3     1    NA
# 7  2013     1     1     4
# 8  2013     2     2     5
# 9  2013     3     3     6

转换为长格式,删除 NA,添加序列号 n,转换回并删除 n。

library(dplyr)
library(tidyr)

na_data %>%
  pivot_longer(-Year) %>%
  drop_na %>%
  group_by(Year, name) %>%
  mutate(n = 1:n()) %>%
  ungroup %>%
  pivot_wider %>%
  select(-n)

给予:

# A tibble: 9 x 4
   Year  Paul Peter  John
  <dbl> <dbl> <dbl> <dbl>
1  2011     1     1    NA
2  2011     2     2    NA
3  2011     3     3    NA
4  2012     1     1    NA
5  2012     2     2    NA
6  2012     3     3    NA
7  2013     1     1     4
8  2013     2     2     5
9  2013     3     3     6