通过分组将缺失值替换为多列的先前值
Replace missing values with previous value for multiple columns by grouping
我有一个包含 6 个变量的数据框。对于每一列,同一组内的数据都相同,但存在一些缺失值。我想用同一组内的值来填充每个变量的缺失值。如果某个组的所有值都缺失,则应使用上一组的值来填充。最终我希望得到的结果是 df_complete。
这是我尝试过的方法,但是当某个组的第一个观测值缺失时,它会失败。我弄不清楚问题出在哪里。
# Build the example data: a complete table (`df_complete`) and a copy with
# randomly placed missing values (`df`).
set.seed(123)
# 20 base rows with 5 random value columns, one group letter per row.
df <- data.frame(matrix(rnorm(100), ncol = 5))
df$Group <- letters[1:20]
# Repeat every row between 1 and 10 times, so all rows of a group are
# identical copies of that group's base row.
df <- df[rep(seq_len(nrow(df)), sample(1:10, 20, replace = TRUE)), ]
# Keep the intact table as the expected result before values are removed.
df_complete <- df
# Blank out a fixed number of randomly chosen cells in each value column.
df$X1[sample(seq_len(nrow(df)), 15)] <- NA
df$X2[sample(seq_len(nrow(df)), 10)] <- NA
df$X3[sample(seq_len(nrow(df)), 25)] <- NA
df$X4[sample(seq_len(nrow(df)), 10)] <- NA
df$X5[sample(seq_len(nrow(df)), 15)] <- NA
#' Fill missing values with the last observed value
#'
#' Every `NA` is replaced by the nearest non-missing value before it. Leading
#' `NA`s, which have nothing before them, take the first non-missing value.
#'
#' @param x A vector, possibly containing `NA`.
#' @return `x` with the same length and its `NA`s filled. A vector that is
#'   empty or entirely `NA` is returned unchanged.
lvcf <- function(x) {
  observed <- !is.na(x)
  # Nothing to copy from (empty or all-NA input). The loop-based version
  # reached `x[1] <- x[0]` here and stopped with "replacement has length zero".
  if (!any(observed)) {
    return(x)
  }

  # For each position, the index of the most recent observed element
  # (0 while still inside a run of leading NAs).
  source_idx <- cummax(ifelse(observed, seq_along(x), 0L))
  # Leading NAs borrow the first observed value instead.
  source_idx[source_idx == 0L] <- which(observed)[1L]

  # Assign in place so names and other attributes of `x` are kept.
  to_fill <- !observed
  x[to_fill] <- x[source_idx[to_fill]]
  x
}
# Attempted fill: the grouped table is handed to sapply(), which calls lvcf
# once per column and simplifies the collected results.
# NOTE(review): sapply() walks whole columns of the table, so group_by()
# presumably has no effect on where lvcf starts and stops - confirm.
df_complete <- sapply(group_by(df, Group), lvcf)
程序包 zoo 提供了处理此类问题的函数 na.locf(last observation carried forward,即用最近一次的观测值向后填充)。
# %>% and group_by() come from dplyr, so it has to be attached as well;
# na.locf() comes from zoo.
library(dplyr)
library(zoo)

# Carry the last observation forward. na.rm = FALSE keeps leading NAs in
# place instead of dropping them, so the number of rows does not change.
# NOTE(review): na.locf() is a zoo function, not a dplyr verb - presumably it
# does not restart at group boundaries; verify before relying on the grouping.
df_complete <- df %>%
  group_by(Group) %>%
  na.locf(., na.rm = FALSE)
head(df_complete)
## A tibble: 6 x 6
## Groups: Group [2]
# X1 X2 X3 X4 X5 Group
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 -0.56047565 -1.06782371 -0.69470698 <NA> 0.005764186 a
#2 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#3 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#4 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
#5 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
#6 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
注意 X4 列中的 <NA>。
编辑。
根据下面 OP 的评论和 G. Grothendieck 的回答,以下代码可以去除所有 NA 值:只需再调用一次 na.locf,并加上参数 fromLast = TRUE。
# Two passes over the same table: the inner call carries values forward and
# keeps leading NAs (na.rm = FALSE); the outer call runs from the end
# (fromLast = TRUE) to fill what the first pass left behind.
df_complete <- na.locf(
  na.locf(group_by(df, Group), na.rm = FALSE),
  fromLast = TRUE
)
head(df_complete)
## A tibble: 6 x 6
## Groups: Group [2]
# X1 X2 X3 X4 X5 Group
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#2 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#3 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#4 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
#5 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
#6 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
编辑 2
根据 OP 发现的错误,这是一个仅使用 base R 的解决方案。我将创建一个新的数据框 df2,其中除第一组(即组 a)之外,每个组都以 NA 值开头。
set.seed(123)
df2 <- data.frame(
  X1 = rnorm(20),
  X2 = rnorm(20),
  Group = rep(letters[1:4], each = 5)
)
# Blank out the first row of groups b, c and d (group a keeps its first row).
df2[c(6, 11, 16), 1:2] <- NA

# Carry the last observed value forward using base R only. Leading NAs have
# nothing before them and are left as NA; the vector keeps its length and type.
fill_forward <- function(v) {
  seen <- !is.na(v)
  # Index of the most recent observed element, 0 before the first one.
  src <- cummax(ifelse(seen, seq_along(v), 0L))
  fillable <- !seen & src > 0L
  v[fillable] <- v[src[fillable]]
  v
}

# Fill each group separately: forward first, then backward (a forward fill on
# the reversed vector) to cover NAs at the start of a group. A column that is
# entirely NA within a group stays NA rather than raising an error.
df2_complete <- lapply(split(df2, df2$Group), function(x) {
  value_cols <- names(x) != "Group"
  # lapply() always returns one vector per column, whatever the group size.
  x[value_cols] <- lapply(x[value_cols], function(v) {
    rev(fill_forward(rev(fill_forward(v))))
  })
  x
})
df2_complete <- do.call(rbind, df2_complete)
row.names(df2_complete) <- NULL
df2_complete
我有一个包含 6 个变量的数据框。对于每一列,同一组内的数据都相同,但存在一些缺失值。我想用同一组内的值来填充每个变量的缺失值。如果某个组的所有值都缺失,则应使用上一组的值来填充。最终我希望得到的结果是 df_complete。
这是我尝试过的方法,但是当某个组的第一个观测值缺失时,它会失败。我弄不清楚问题出在哪里。
# Build the example data: a complete table (`df_complete`) and a copy with
# randomly placed missing values (`df`).
set.seed(123)
# 20 base rows with 5 random value columns, one group letter per row.
df <- data.frame(matrix(rnorm(100), ncol = 5))
df$Group <- letters[1:20]
# Repeat every row between 1 and 10 times, so all rows of a group are
# identical copies of that group's base row.
df <- df[rep(seq_len(nrow(df)), sample(1:10, 20, replace = TRUE)), ]
# Keep the intact table as the expected result before values are removed.
df_complete <- df
# Blank out a fixed number of randomly chosen cells in each value column.
df$X1[sample(seq_len(nrow(df)), 15)] <- NA
df$X2[sample(seq_len(nrow(df)), 10)] <- NA
df$X3[sample(seq_len(nrow(df)), 25)] <- NA
df$X4[sample(seq_len(nrow(df)), 10)] <- NA
df$X5[sample(seq_len(nrow(df)), 15)] <- NA
#' Fill missing values with the last observed value
#'
#' Every `NA` is replaced by the nearest non-missing value before it. Leading
#' `NA`s, which have nothing before them, take the first non-missing value.
#'
#' @param x A vector, possibly containing `NA`.
#' @return `x` with the same length and its `NA`s filled. A vector that is
#'   empty or entirely `NA` is returned unchanged.
lvcf <- function(x) {
  observed <- !is.na(x)
  # Nothing to copy from (empty or all-NA input). The loop-based version
  # reached `x[1] <- x[0]` here and stopped with "replacement has length zero".
  if (!any(observed)) {
    return(x)
  }

  # For each position, the index of the most recent observed element
  # (0 while still inside a run of leading NAs).
  source_idx <- cummax(ifelse(observed, seq_along(x), 0L))
  # Leading NAs borrow the first observed value instead.
  source_idx[source_idx == 0L] <- which(observed)[1L]

  # Assign in place so names and other attributes of `x` are kept.
  to_fill <- !observed
  x[to_fill] <- x[source_idx[to_fill]]
  x
}
# Attempted fill: the grouped table is handed to sapply(), which calls lvcf
# once per column and simplifies the collected results.
# NOTE(review): sapply() walks whole columns of the table, so group_by()
# presumably has no effect on where lvcf starts and stops - confirm.
df_complete <- sapply(group_by(df, Group), lvcf)
程序包 zoo 提供了处理此类问题的函数 na.locf(last observation carried forward,即用最近一次的观测值向后填充)。
# %>% and group_by() come from dplyr, so it has to be attached as well;
# na.locf() comes from zoo.
library(dplyr)
library(zoo)

# Carry the last observation forward. na.rm = FALSE keeps leading NAs in
# place instead of dropping them, so the number of rows does not change.
# NOTE(review): na.locf() is a zoo function, not a dplyr verb - presumably it
# does not restart at group boundaries; verify before relying on the grouping.
df_complete <- df %>%
  group_by(Group) %>%
  na.locf(., na.rm = FALSE)
head(df_complete)
## A tibble: 6 x 6
## Groups: Group [2]
# X1 X2 X3 X4 X5 Group
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 -0.56047565 -1.06782371 -0.69470698 <NA> 0.005764186 a
#2 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#3 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#4 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
#5 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
#6 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
注意 X4 列中的 <NA>。
编辑。
根据下面 OP 的评论和 G. Grothendieck 的回答,以下代码可以去除所有 NA 值:只需再调用一次 na.locf,并加上参数 fromLast = TRUE。
# Two passes over the same table: the inner call carries values forward and
# keeps leading NAs (na.rm = FALSE); the outer call runs from the end
# (fromLast = TRUE) to fill what the first pass left behind.
df_complete <- na.locf(
  na.locf(group_by(df, Group), na.rm = FALSE),
  fromLast = TRUE
)
head(df_complete)
## A tibble: 6 x 6
## Groups: Group [2]
# X1 X2 X3 X4 X5 Group
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#2 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#3 -0.56047565 -1.06782371 -0.69470698 0.37963948 0.005764186 a
#4 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
#5 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
#6 -0.23017749 -0.21797491 -0.20791728 -0.50232345 0.385280401 b
编辑 2
根据 OP 发现的错误,这是一个仅使用 base R 的解决方案。我将创建一个新的数据框 df2,其中除第一组(即组 a)之外,每个组都以 NA 值开头。
set.seed(123)
df2 <- data.frame(
  X1 = rnorm(20),
  X2 = rnorm(20),
  Group = rep(letters[1:4], each = 5)
)
# Blank out the first row of groups b, c and d (group a keeps its first row).
df2[c(6, 11, 16), 1:2] <- NA

# Carry the last observed value forward using base R only. Leading NAs have
# nothing before them and are left as NA; the vector keeps its length and type.
fill_forward <- function(v) {
  seen <- !is.na(v)
  # Index of the most recent observed element, 0 before the first one.
  src <- cummax(ifelse(seen, seq_along(v), 0L))
  fillable <- !seen & src > 0L
  v[fillable] <- v[src[fillable]]
  v
}

# Fill each group separately: forward first, then backward (a forward fill on
# the reversed vector) to cover NAs at the start of a group. A column that is
# entirely NA within a group stays NA rather than raising an error.
df2_complete <- lapply(split(df2, df2$Group), function(x) {
  value_cols <- names(x) != "Group"
  # lapply() always returns one vector per column, whatever the group size.
  x[value_cols] <- lapply(x[value_cols], function(v) {
    rev(fill_forward(rev(fill_forward(v))))
  })
  x
})
df2_complete <- do.call(rbind, df2_complete)
row.names(df2_complete) <- NULL
df2_complete