构建缺失值计数/百分比的汇总表
Building a table of missing values count/percentage
df 示例:
a b c d Y
0 NA NA 8 3 1
1 NA 2 5 0 1
2 1 0 7 NA 0
3 NA NA 7 1 0
4 6 NA 2 NA 1
我正在尝试创建一个数据框,用来汇总各个变量的缺失值情况(Y 是二元变量):
# Per-column missing-value summary of `df`, overall and within each level of
# the binary outcome Y. The result keeps the default column names x1..x6.
Variable <- colnames(df)
# Rows are selected with which(): plain logical indexing would turn every NA
# in Y into an all-NA row and inflate the per-group counts and shares.
x1 <- apply(df, 2, function(x) sum(is.na(x)) / NROW(x)) # NA share over all rows
x2 <- apply(df, 2, function(x) sum(is.na(x))) # NA count over all rows
x3 <- apply(df[which(df$Y == 1), ], 2, function(x) sum(is.na(x)) / NROW(x)) # NA share if Y=1
x4 <- apply(df[which(df$Y == 1), ], 2, function(x) sum(is.na(x))) # NA count if Y=1
x5 <- apply(df[which(df$Y == 0), ], 2, function(x) sum(is.na(x)) / NROW(x)) # NA share if Y=0
x6 <- apply(df[which(df$Y == 0), ], 2, function(x) sum(is.na(x))) # NA count if Y=0
df_nan <- data.frame(Variable, x1, x2, x3, x4, x5, x6)
但是,有没有更简洁的方法来做到这一点?因为每个列的名称都还是 x1、x2 等。我想找到一种方法,在创建数据框的同时就改好这些名称(而不是事后再改,因为目的是让代码更清晰)。
使用gather和summarize可以达到你想要的效果。我没有完全重现你的输出,因为我觉得它有点太复杂了。
library(tidyverse)

## Reproducing the data
df <- tibble(
  a = c(NA, NA, 1, NA, 6),
  b = c(NA, 2, 0, NA, NA),
  c = c(8, 5, 7, 7, 2),
  d = c(3, 0, NA, 1, NA),
  Y = c(1, 1, 0, 0, 1)
)

## Creating table
# Stack every column except Y into (column, value) pairs, Y kept alongside.
long_df <- select(gather(df, column, value, -Y), column, value, Y)

# One row per column / Y combination: NA count, group size and NA share.
my_result <- long_df %>%
  group_by(column, Y) %>%
  summarise(
    total_na = sum(is.na(value)),
    total_obs = n()
  ) %>%
  mutate(percent_na = total_na / total_obs)

my_result
这是使用 gather 的一种方法:我们先把 'Y' 复制成一个新列 new,再在 summarise 中用它计算 'x3' 到 'x6'。
library(tidyverse)

# Copy Y into `new` so it can act as the id column while every original
# column, Y included, is stacked into Variable / val pairs.
stacked <- gather(mutate(df, new = Y), Variable, val, -new)

# Per variable: NA share and NA count over all rows (x1, x2), over rows with
# new == 1 (x3, x4) and over rows with new == 0 (x5, x6).
summarise(
  group_by(stacked, Variable),
  x1 = mean(is.na(val)),
  x2 = sum(is.na(val)),
  x3 = mean(is.na(val[new == 1])),
  x4 = sum(is.na(val[new == 1])),
  x5 = mean(is.na(val[new == 0])),
  x6 = sum(is.na(val[new == 0]))
)
# A tibble: 5 x 7
#   Variable    x1    x2    x3    x4    x5    x6
#   <chr>    <dbl> <int> <dbl> <int> <dbl> <int>
#1 a          0.6      3 0.667     2   0.5     1
#2 b          0.6      3 0.667     2   0.5     1
#3 c          0        0 0         0   0       0
#4 d          0.4      2 0.333     1   0.5     1
#5 Y          0        0 0         0   0       0
或者使用 data.table 的 melt 和 dcast:
library(data.table)

# Copy Y into `new`, stack every column against it and flag missing cells in
# `value1`. Note: setDT() and `:=` modify `df` in place (it becomes a
# data.table with an extra `new` column).
dM <- melt(setDT(df)[, new := Y], id.vars = "new")[, value1 := is.na(value)]

# x1 = NA share and x2 = NA count per variable, matching the meaning of x1/x2
# in the question; joined to the per-`new` mean and sum computed by dcast().
dM[, .(x1 = mean(value1), x2 = sum(value1)), by = variable][
  dcast(dM, variable ~ new, value.var = "value1",
        fun.aggregate = list(mean, sum)),
  on = .(variable)
]
数据
# Example data: integer columns a-d containing NAs, an integer 0/1 column Y,
# and character row names "0" to "4".
df <- data.frame(
  a = c(NA, NA, 1L, NA, 6L),
  b = c(NA, 2L, 0L, NA, NA),
  c = c(8L, 5L, 7L, 7L, 2L),
  d = c(3L, 0L, NA, 1L, NA),
  Y = c(1L, 1L, 0L, 0L, 1L),
  row.names = c("0", "1", "2", "3", "4")
)
我终于解决了(这段代码不像我问题中的代码那样创建那么多变量)。虽然我认为应该还有更高效的方法,但我觉得这个方案已经相当可行:
# Missing-value summary with descriptive column names assigned at creation:
# one row per column of `df`; percentages are rounded to two decimals.
df_nan <- data.frame(
  Variable = colnames(df))
df_nan["%NA"] <- apply(df, 2, function(x) round(sum(is.na(x)) / NROW(x) * 100, 2)) # percentage over all rows
df_nan["#NA"] <- apply(df, 2, function(x) sum(is.na(x))) # NA count over all rows
# Rows are selected with which(): plain logical indexing would turn every NA
# in Y into an all-NA row and inflate the per-group figures.
df_nan["%NA Y=1"] <- apply(df[which(df$Y == 1), ], 2, function(x) round(sum(is.na(x)) / NROW(x) * 100, 2)) # percentage if Y=1
df_nan["%NA Y=0"] <- apply(df[which(df$Y == 0), ], 2, function(x) round(sum(is.na(x)) / NROW(x) * 100, 2)) # percentage if Y=0
df_nan["#NA Y=1"] <- apply(df[which(df$Y == 1), ], 2, function(x) sum(is.na(x))) # NA count if Y=1
df_nan["#NA Y=0"] <- apply(df[which(df$Y == 0), ], 2, function(x) sum(is.na(x))) # NA count if Y=0
df 示例:
a b c d Y
0 NA NA 8 3 1
1 NA 2 5 0 1
2 1 0 7 NA 0
3 NA NA 7 1 0
4 6 NA 2 NA 1
我正在尝试创建一个数据框,用来汇总各个变量的缺失值情况(Y 是二元变量):
# Per-column missing-value summary of `df`, overall and within each level of
# the binary outcome Y. The result keeps the default column names x1..x6.
Variable <- colnames(df)
# Rows are selected with which(): plain logical indexing would turn every NA
# in Y into an all-NA row and inflate the per-group counts and shares.
x1 <- apply(df, 2, function(x) sum(is.na(x)) / NROW(x)) # NA share over all rows
x2 <- apply(df, 2, function(x) sum(is.na(x))) # NA count over all rows
x3 <- apply(df[which(df$Y == 1), ], 2, function(x) sum(is.na(x)) / NROW(x)) # NA share if Y=1
x4 <- apply(df[which(df$Y == 1), ], 2, function(x) sum(is.na(x))) # NA count if Y=1
x5 <- apply(df[which(df$Y == 0), ], 2, function(x) sum(is.na(x)) / NROW(x)) # NA share if Y=0
x6 <- apply(df[which(df$Y == 0), ], 2, function(x) sum(is.na(x))) # NA count if Y=0
df_nan <- data.frame(Variable, x1, x2, x3, x4, x5, x6)
但是,有没有更简洁的方法来做到这一点?因为每个列的名称都还是 x1、x2 等。我想找到一种方法,在创建数据框的同时就改好这些名称(而不是事后再改,因为目的是让代码更清晰)。
使用gather和summarize可以达到你想要的效果。我没有完全重现你的输出,因为我觉得它有点太复杂了。
library(tidyverse)

## Reproducing the data
df <- tibble(
  a = c(NA, NA, 1, NA, 6),
  b = c(NA, 2, 0, NA, NA),
  c = c(8, 5, 7, 7, 2),
  d = c(3, 0, NA, 1, NA),
  Y = c(1, 1, 0, 0, 1)
)

## Creating table
# Stack every column except Y into (column, value) pairs, Y kept alongside.
long_df <- select(gather(df, column, value, -Y), column, value, Y)

# One row per column / Y combination: NA count, group size and NA share.
my_result <- long_df %>%
  group_by(column, Y) %>%
  summarise(
    total_na = sum(is.na(value)),
    total_obs = n()
  ) %>%
  mutate(percent_na = total_na / total_obs)

my_result
这是使用 gather 的一种方法:我们先把 'Y' 复制成一个新列 new,再在 summarise 中用它计算 'x3' 到 'x6'。
library(tidyverse)

# Copy Y into `new` so it can act as the id column while every original
# column, Y included, is stacked into Variable / val pairs.
stacked <- gather(mutate(df, new = Y), Variable, val, -new)

# Per variable: NA share and NA count over all rows (x1, x2), over rows with
# new == 1 (x3, x4) and over rows with new == 0 (x5, x6).
summarise(
  group_by(stacked, Variable),
  x1 = mean(is.na(val)),
  x2 = sum(is.na(val)),
  x3 = mean(is.na(val[new == 1])),
  x4 = sum(is.na(val[new == 1])),
  x5 = mean(is.na(val[new == 0])),
  x6 = sum(is.na(val[new == 0]))
)
# A tibble: 5 x 7
#   Variable    x1    x2    x3    x4    x5    x6
#   <chr>    <dbl> <int> <dbl> <int> <dbl> <int>
#1 a          0.6      3 0.667     2   0.5     1
#2 b          0.6      3 0.667     2   0.5     1
#3 c          0        0 0         0   0       0
#4 d          0.4      2 0.333     1   0.5     1
#5 Y          0        0 0         0   0       0
或者使用 data.table 的 melt 和 dcast:
library(data.table)

# Copy Y into `new`, stack every column against it and flag missing cells in
# `value1`. Note: setDT() and `:=` modify `df` in place (it becomes a
# data.table with an extra `new` column).
dM <- melt(setDT(df)[, new := Y], id.vars = "new")[, value1 := is.na(value)]

# x1 = NA share and x2 = NA count per variable, matching the meaning of x1/x2
# in the question; joined to the per-`new` mean and sum computed by dcast().
dM[, .(x1 = mean(value1), x2 = sum(value1)), by = variable][
  dcast(dM, variable ~ new, value.var = "value1",
        fun.aggregate = list(mean, sum)),
  on = .(variable)
]
数据
# Example data: integer columns a-d containing NAs, an integer 0/1 column Y,
# and character row names "0" to "4".
df <- data.frame(
  a = c(NA, NA, 1L, NA, 6L),
  b = c(NA, 2L, 0L, NA, NA),
  c = c(8L, 5L, 7L, 7L, 2L),
  d = c(3L, 0L, NA, 1L, NA),
  Y = c(1L, 1L, 0L, 0L, 1L),
  row.names = c("0", "1", "2", "3", "4")
)
我终于解决了(这段代码不像我问题中的代码那样创建那么多变量)。虽然我认为应该还有更高效的方法,但我觉得这个方案已经相当可行:
# Missing-value summary with descriptive column names assigned at creation:
# one row per column of `df`; percentages are rounded to two decimals.
df_nan <- data.frame(
  Variable = colnames(df))
df_nan["%NA"] <- apply(df, 2, function(x) round(sum(is.na(x)) / NROW(x) * 100, 2)) # percentage over all rows
df_nan["#NA"] <- apply(df, 2, function(x) sum(is.na(x))) # NA count over all rows
# Rows are selected with which(): plain logical indexing would turn every NA
# in Y into an all-NA row and inflate the per-group figures.
df_nan["%NA Y=1"] <- apply(df[which(df$Y == 1), ], 2, function(x) round(sum(is.na(x)) / NROW(x) * 100, 2)) # percentage if Y=1
df_nan["%NA Y=0"] <- apply(df[which(df$Y == 0), ], 2, function(x) round(sum(is.na(x)) / NROW(x) * 100, 2)) # percentage if Y=0
df_nan["#NA Y=1"] <- apply(df[which(df$Y == 1), ], 2, function(x) sum(is.na(x))) # NA count if Y=1
df_nan["#NA Y=0"] <- apply(df[which(df$Y == 0), ], 2, function(x) sum(is.na(x))) # NA count if Y=0