有效清除R中数据框中的缺失值

Efficient cleaning of missing value in dataframe in R

# Load data.table for fread(). library() stops with an error when the package
# is not installed, whereas require() only warns and returns FALSE.
library(data.table)
# Read the UCI primary-tumor data, turning text columns into factors.
dat1 <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/primary-tumor/primary-tumor.data', stringsAsFactors = TRUE)

我想用每列最常见的值替换 ? 和缺失值,并把各列转换为 factor(用于 RandomForest)。我试图从 dat1$V4 中去掉 ?:

> dat2=subset(dat1, dat1$V4!='?')
Error in `[.data.table`(x, r, vars, with = FALSE) : 
  i evaluates to a logical vector length 339 but there are 184 rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.

然后,如果上一步成功,再把数据框的所有列转换为 factor:

dat1 <- data.frame(lapply(dat1, as.factor))

这里是 dat1 的 header:

> head (dat1)
   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
1:  1  1  1  ?  3  2  2  1  2   2   2   2   2   2   2   2   2   2
2:  1  1  1  ?  3  2  2  2  2   2   1   2   2   2   1   2   1   2
3:  1  1  2  2  3  1  2  2  2   2   2   2   2   2   2   2   1   2
4:  1  1  2  ?  3  1  2  1  1   2   2   2   2   2   2   2   1   2
5:  1  1  2  ?  3  1  2  1  1   2   2   2   2   2   2   2   1   2
6:  1  1  2  ?  3  1  2  2  2   2   2   1   2   2   1   1   1   2

这里是 str(dat1):

> str (dat1)
Classes ‘data.table’ and 'data.frame':  339 obs. of  18 variables:
 $ V1 : int  1 1 1 1 1 1 1 1 1 1 ...
 $ V2 : int  1 1 1 1 1 1 2 2 2 2 ...
 $ V3 : Factor w/ 3 levels "1","2","?": 1 1 2 2 2 2 1 1 1 1 ...
 $ V4 : Factor w/ 4 levels "1","2","3","?": 4 4 2 4 4 4 1 1 1 1 ...
 $ V5 : Factor w/ 4 levels "1","2","3","?": 3 3 3 3 3 3 1 1 1 2 ...
 $ V6 : int  2 2 1 1 1 1 1 1 2 1 ...
 $ V7 : int  2 2 2 2 2 2 2 2 2 2 ...
 $ V8 : int  1 2 2 1 1 2 2 2 2 2 ...
 $ V9 : int  2 2 2 1 1 2 2 2 2 2 ...
 $ V10: int  2 2 2 2 2 2 2 2 2 2 ...
 $ V11: int  2 1 2 2 2 2 2 2 2 2 ...
 $ V12: int  2 2 2 2 2 1 2 2 2 2 ...
 $ V13: Factor w/ 3 levels "1","2","?": 2 2 2 2 2 2 1 2 2 3 ...
 $ V14: int  2 2 2 2 2 2 1 2 1 1 ...
 $ V15: int  2 1 2 2 2 1 1 2 2 1 ...
 $ V16: Factor w/ 3 levels "1","2","?": 2 2 2 2 2 1 2 2 2 2 ...
 $ V17: int  2 1 1 1 1 1 2 2 2 2 ...
 $ V18: int  2 2 2 2 2 2 2 2 2 2 ...
 - attr(*, ".internal.selfref")=<externalptr>

以下函数将所有 NA 和 '?' 值替换为该列中最常见的值。然后只需用 lapply 将它应用到 data.frame 的每一列。

# Replace the missing entries of a vector with its most frequent known value.
#
# x:  a vector or factor, typically one column of a data frame.
# na: value(s) treated as missing in addition to NA; defaults to '?'.
#
# Returns x with every missing entry replaced by the most frequent of the
# remaining entries (on ties, the first one in table() order). For factors,
# levels that are no longer used are dropped. The replacement is taken from
# names(table(...)), i.e. it is a character string, so the function is meant
# for character or factor columns.
mostFreq <- function(x, na = '?'){
  i <- is.na(x) | x %in% na
  # With no known entry there is no most frequent value to impute: leave x
  # as it is instead of failing on a zero-length replacement (or, for a
  # factor, silently filling in an arbitrary level that never occurred).
  if (!all(i)) {
    tbl <- table(x[!i])
    x[i] <- names(tbl)[which.max(tbl)]
  }
  if (is.factor(x)) x <- droplevels(x)
  x
}

# Demonstration of mostFreq on the first 20 rows of the first three columns.
# NOTE(review): the commented output below shows character columns holding
# both "?" and NA, so it was presumably produced with the synthetic test data
# built further down in this file rather than with the downloaded data set;
# confirm before relying on the exact values.

# Before: raw values, "?" and NA still present
as.list(dat1[1:20, 1:3])
#$V1
# [1] "1" "?" "2" "?" "2" NA  "?" "?" "2" "?" "?" "?" NA  NA 
#[15] NA  NA  "?" "2" "2" "2"
#
#$V2
# [1] "1" "3" "2" "3" "1" "2" "1" "2" "3" "1" "2" "1" "?" NA 
#[15] "?" "3" "1" NA  "?" "1"
#
#$V3
# [1] "?" "1" "?" "3" "1" NA  NA  "3" "1" "1" "1" "2" NA  NA 
#[15] NA  NA  "?" "?" NA  "2"

# After: every "?" and NA replaced by the most frequent value of that column
# (computed on these 20 rows only, because the subset is what is passed in)
lapply(dat1[1:20, 1:3], mostFreq)
#$V1
# [1] "1" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
#[15] "2" "2" "2" "2" "2" "2"
#
#$V2
# [1] "1" "3" "2" "3" "1" "2" "1" "2" "3" "1" "2" "1" "1" "1"
#[15] "1" "3" "1" "1" "1" "1"
#
#$V3
# [1] "1" "1" "1" "3" "1" "1" "1" "3" "1" "1" "1" "2" "1" "1"
#[15] "1" "1" "1" "1" "1" "2"

并更改整个数据框。

dat1[] <- lapply(dat1, mostFreq)

并强制 class factor:

dat1[] <- lapply(dat1, factor)

编辑。

如果在读取数据时一开始就设置 na.strings = '?',上述函数可以进一步简化。

dat1 <- fread(<URI>, na.strings = '?', <other args>)

然后用下面的函数代替原来的 mostFreq。

# Replace the NA entries of a vector with its most frequent non-NA value.
#
# Simplified variant for data in which every missing marker has already been
# turned into NA (e.g. by reading with na.strings = '?').
#
# x: a vector or factor, typically one column of a data frame.
#
# Returns x with each NA replaced by the most frequent non-NA value (on ties,
# the first one in table() order). The replacement comes from
# names(table(...)), i.e. it is a character string.
mostFreq2 <- function(x){
  missing <- is.na(x)
  # An all-NA vector has no most frequent value; return it untouched rather
  # than failing on a zero-length replacement.
  if (!all(missing)) {
    tbl <- table(x, useNA = "no")
    x[missing] <- names(tbl)[which.max(tbl)]
  }
  x
}

测试数据。

由于您尚未发布示例数据集,我将创建一个类似于问题描述的数据集。

# Synthetic stand-in for the real data: 12 character columns of n draws each,
# six drawn from {NA, '?', 1, 2} and six from {NA, '?', 1, 2, 3}.
set.seed(1234)    # fixed seed so that every run draws the same values
n <- 300
x <- vapply(
  seq_len(6),
  function(k) sample(c(NA, '?', 1:2), size = n, replace = TRUE),
  character(n)
)
y <- vapply(
  seq_len(6),
  function(k) sample(c(NA, '?', 1:3), size = n, replace = TRUE),
  character(n)
)
dat1 <- cbind.data.frame(x, y, stringsAsFactors = FALSE)
# Shuffle the column order so the two kinds of column are interleaved.
dat1 <- dat1[, sample(ncol(dat1))]
names(dat1) <- paste0('V', seq_along(dat1))
str(dat1)

尽管这个方法相当 "hacky",但应该可以帮到您。我在您的 data.frame 中没有看到任何 NA。

library(dplyr)
library(stringr)

# Read the comma-separated UCI primary-tumor data. The URL has to stay on a
# single line: a line break inside the quotes becomes part of the string.
dat1 <- read.table(
  'https://archive.ics.uci.edu/ml/machine-learning-databases/primary-tumor/primary-tumor.data',
  stringsAsFactors = TRUE, sep = ","
)

# Convert every column to text so that "?" can be compared and replaced;
# vapply() always yields a character matrix with one column per variable.
dat1 <- vapply(dat1, as.character, character(nrow(dat1)))
temp <- vector("list", ncol(dat1))
names(temp) <- colnames(dat1)

for (i in seq_len(ncol(dat1))) {
  col <- dat1[, i]
  # Count the known values only. If "?" took part in the count it could be
  # the most frequent value itself, and the replacement would change nothing.
  counts <- sort(table(col[!col %in% "?"]), decreasing = TRUE)
  if (length(counts) > 0) {
    col <- str_replace(col, "[?]", names(counts)[1])
  }
  temp[[i]] <- col
}
# temp is named after the original columns, so dat2 inherits those names.
dat2 <- bind_cols(temp)