构建缺失值计数/百分比汇总表

Question

df 示例：

    a    b    c   d   Y
0   NA   NA   8   3   1
1   NA   2    5   0   1
2   1    0    7   NA  0
3   NA   NA   7   1   0
4   6    NA   2   NA  1

我想按变量统计缺失值，并把结果整理成一个数据框（Y 是二元变量）：

# Per-column missing-value summary of `df`, overall and split by the binary
# outcome column `Y`. The result `df_nan` has one row per column of `df`.

# Fraction (0-1) of missing entries in a vector; NaN for an empty vector.
na_share <- function(x) sum(is.na(x)) / NROW(x)
# Number of missing entries in a vector.
na_count <- function(x) sum(is.na(x))

# which() keeps only the rows where the comparison is TRUE. With plain logical
# indexing a missing Y would add an all-NA row and inflate every NA count.
df_y1 <- df[which(df$Y == 1), , drop = FALSE]
df_y0 <- df[which(df$Y == 0), , drop = FALSE]

# vapply() walks the columns directly (apply() would first coerce the data
# frame to a matrix) and still returns one value per column when a subset
# has zero rows.
Variable <- colnames(df)
x1 <- vapply(df, na_share, numeric(1))    # fraction of NA over all rows
x2 <- vapply(df, na_count, integer(1))    # NA count
x3 <- vapply(df_y1, na_share, numeric(1)) # fraction of NA among rows with Y = 1
x4 <- vapply(df_y1, na_count, integer(1)) # NA count among rows with Y = 1
x5 <- vapply(df_y0, na_share, numeric(1)) # fraction of NA among rows with Y = 0
x6 <- vapply(df_y0, na_count, integer(1)) # NA count among rows with Y = 0
df_nan <- data.frame(Variable, x1, x2, x3, x4, x5, x6)

不过，有没有更简洁的写法？现在每一列的名称都还是 x1、x2 等。我想找到一种在创建数据框的同时（而不是事后！）就指定这些列名的方法，因为目的就是让代码更清晰。

Answer 1

使用gather和summarize可以达到你想要的效果。我没有完全重现你的输出，因为我觉得它有点太复杂了。

library(tidyverse)

## Example data from the question
df <- tibble(
  a = c(NA, NA, 1, NA, 6),
  b = c(NA, 2, 0, NA, NA),
  c = c(8, 5, 7, 7, 2),
  d = c(3, 0, NA, 1, NA),
  Y = c(1, 1, 0, 0, 1)
)

## One row per (column, Y) pair: NA count, number of rows and NA share.
## gather() stacks every column except Y into column/value pairs; the
## summary columns are then computed in a single summarise() call.
my_result <- df %>%
  gather(column, value, -Y) %>%
  group_by(column, Y) %>%
  summarise(
    total_na = sum(is.na(value)),
    total_obs = n(),
    percent_na = total_na / total_obs
  )


my_result

Answer 2

这是使用 gather 的一种方法：先把 'Y' 复制为新列 'new'，这样在 summarise 中就可以按 'new' 的取值计算 'x3' 到 'x6'

library(tidyverse)
df %>%
     # Keep a copy of Y under another name: the copy stays out of gather(), so
     # the outcome is still available per row while Y itself is summarised too.
     mutate(new = Y) %>%
     gather(Variable, val, -new) %>%
     group_by(Variable) %>%
     # x1/x2: share and count of NA over all rows; x3/x4: same for new == 1;
     # x5/x6: same for new == 0.
     # %in% never returns NA, so rows whose outcome is missing are left out of
     # the per-class figures instead of being counted as missing values.
     summarise(x1 = mean(is.na(val)),
               x2 = sum(is.na(val)),
               x3 = mean(is.na(val[new %in% 1])),
               x4 = sum(is.na(val[new %in% 1])),
               x5 = mean(is.na(val[new %in% 0])),
               x6 = sum(is.na(val[new %in% 0])))
# A tibble: 5 x 7
#  Variable    x1    x2    x3    x4    x5    x6
#  <chr>    <dbl> <int> <dbl> <int> <dbl> <int>
#1 a          0.6     3 0.667     2   0.5     1
#2 b          0.6     3 0.667     2   0.5     1
#3 c          0       0 0         0   0       0
#4 d          0.4     2 0.333     1   0.5     1
#5 Y          0       0 0         0   0       0

或者使用 data.table 的 melt 和 dcast

library(data.table)
# Long format: one row per (variable, original row), with the outcome kept in
# `new` and `value1` flagging missing entries.
# as.data.table() works on a copy, so the caller's `df` is neither converted
# in place nor given an extra `new` column (setDT() followed by := does both).
dM <- melt(as.data.table(df)[, new := Y], id.vars = "new")[, value1 := is.na(value)]
# x1 = share of NA, x2 = NA count, matching the meaning of x1/x2 in the other
# solutions. The overall figures are joined with the per-outcome mean and sum
# that dcast() spreads into one column per value of `new`.
dM[, .(x1 = mean(value1), x2 = sum(value1)), by = variable][
  dcast(dM, variable ~ new, value.var = "value1",
        fun.aggregate = list(mean, sum)),
  on = .(variable)]

数据

# Example data from the question as a plain data.frame with integer columns.
# The row names "0".."4" mirror the zero-based index of the printed table.
df <- data.frame(
  a = c(NA, NA, 1L, NA, 6L),
  b = c(NA, 2L, 0L, NA, NA),
  c = c(8L, 5L, 7L, 7L, 2L),
  d = c(3L, 0L, NA, 1L, NA),
  Y = c(1L, 1L, 0L, 0L, 1L),
  row.names = c("0", "1", "2", "3", "4")
)

Answer 3

我终于解决了（这段代码不像我问题里的代码那样创建那么多变量）。虽然我觉得应该还有更高效的做法，但这个方案已经相当管用了：

# Missing-value report for `df`: one row per column, with descriptive column
# labels assigned while the table is being built.

# Percentage (0-100, rounded to 2 decimals) of missing entries in a vector;
# NaN for an empty vector.
na_percent <- function(x) round(sum(is.na(x)) / NROW(x) * 100, 2)
# Number of missing entries in a vector.
na_count <- function(x) sum(is.na(x))

# which() keeps only the rows where the comparison is TRUE. With plain logical
# indexing a missing Y would add an all-NA row and inflate every NA count.
df_y1 <- df[which(df$Y == 1), , drop = FALSE]
df_y0 <- df[which(df$Y == 0), , drop = FALSE]

# vapply() walks the columns directly (apply() would first coerce the data
# frame to a matrix) and still returns one value per column when a subset
# has zero rows, so the assignments below cannot fail on an empty class.
df_nan <- data.frame(
  Variable = colnames(df))
df_nan["%NA"] <- vapply(df, na_percent, numeric(1))        # percentage over all rows
df_nan["#NA"] <- vapply(df, na_count, integer(1))          # NA count
df_nan["%NA Y=1"] <- vapply(df_y1, na_percent, numeric(1)) # percentage among rows with Y = 1
df_nan["%NA Y=0"] <- vapply(df_y0, na_percent, numeric(1)) # percentage among rows with Y = 0
df_nan["#NA Y=1"] <- vapply(df_y1, na_count, integer(1))   # NA count among rows with Y = 1
df_nan["#NA Y=0"] <- vapply(df_y0, na_count, integer(1))   # NA count among rows with Y = 0

构建缺失值计数/百分比汇总表

Building a table of missing values count/percentage

r

nan

missing-data

dataframe

数据