有效清除R中数据框中的缺失值
Efficient cleaning of missing values in a data frame in R
require (data.table)
dat1 <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/primary-tumor/primary-tumor.data',stringsAsFactors=T)
我想用每列中最常见的值替换 ? 和缺失值,并把各列转换为 factor(供 RandomForest 使用)。
我试图从 dat1$V4 中去掉 ?:
> dat2=subset(dat1, dat1$V4!='?')
Error in `[.data.table`(x, r, vars, with = FALSE) :
i evaluates to a logical vector length 339 but there are 184 rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.
如果上一步成功,再把数据框的所有列转换为 factor:
dat1 <- data.frame(lapply(dat1, as.factor))
下面是 dat1 的前几行:
> head (dat1)
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
1: 1 1 1 ? 3 2 2 1 2 2 2 2 2 2 2 2 2 2
2: 1 1 1 ? 3 2 2 2 2 2 1 2 2 2 1 2 1 2
3: 1 1 2 2 3 1 2 2 2 2 2 2 2 2 2 2 1 2
4: 1 1 2 ? 3 1 2 1 1 2 2 2 2 2 2 2 1 2
5: 1 1 2 ? 3 1 2 1 1 2 2 2 2 2 2 2 1 2
6: 1 1 2 ? 3 1 2 2 2 2 2 1 2 2 1 1 1 2
下面是 str(dat1) 的输出:
> str (dat1)
Classes ‘data.table’ and 'data.frame': 339 obs. of 18 variables:
$ V1 : int 1 1 1 1 1 1 1 1 1 1 ...
$ V2 : int 1 1 1 1 1 1 2 2 2 2 ...
$ V3 : Factor w/ 3 levels "1","2","?": 1 1 2 2 2 2 1 1 1 1 ...
$ V4 : Factor w/ 4 levels "1","2","3","?": 4 4 2 4 4 4 1 1 1 1 ...
$ V5 : Factor w/ 4 levels "1","2","3","?": 3 3 3 3 3 3 1 1 1 2 ...
$ V6 : int 2 2 1 1 1 1 1 1 2 1 ...
$ V7 : int 2 2 2 2 2 2 2 2 2 2 ...
$ V8 : int 1 2 2 1 1 2 2 2 2 2 ...
$ V9 : int 2 2 2 1 1 2 2 2 2 2 ...
$ V10: int 2 2 2 2 2 2 2 2 2 2 ...
$ V11: int 2 1 2 2 2 2 2 2 2 2 ...
$ V12: int 2 2 2 2 2 1 2 2 2 2 ...
$ V13: Factor w/ 3 levels "1","2","?": 2 2 2 2 2 2 1 2 2 3 ...
$ V14: int 2 2 2 2 2 2 1 2 1 1 ...
$ V15: int 2 1 2 2 2 1 1 2 2 1 ...
$ V16: Factor w/ 3 levels "1","2","?": 2 2 2 2 2 1 2 2 2 2 ...
$ V17: int 2 1 1 1 1 1 2 2 2 2 ...
$ V18: int 2 2 2 2 2 2 2 2 2 2 ...
- attr(*, ".internal.selfref")=<externalptr>
以下函数把所有 NA 和 '?' 值替换为该列中最常见的值。之后只需用 lapply 把它应用到 data.frame 的每一列。
# Replace the missing entries of a vector with its most frequent observed value.
#
# x  : an atomic vector or factor (typically one column of a data frame).
# na : value(s) treated as missing in addition to NA; defaults to '?'.
#
# Returns x with every NA / `na` entry replaced by the most common of the
# remaining entries (on ties the first one in table() order wins). Factors
# keep their class and lose any level that is no longer used. For other
# vectors the replacement is a character string, so assigning it coerces the
# result to character. If x has no observed (non-missing) entry there is
# nothing to impute from, and x is returned unchanged.
mostFreq <- function(x, na = '?'){
  i <- is.na(x) | x %in% na
  # Without this guard an all-missing character/integer vector fails with
  # "replacement has length zero", and an all-missing factor is silently
  # filled with its first level even though that level never occurs.
  if(all(i)) return(x)
  tbl <- table(x[!i])
  x[i] <- names(tbl)[which.max(tbl)]
  # Drop the now-unused missing-marker level (e.g. "?") from factors.
  if(is.factor(x)) x <- droplevels(x)
  x
}
# Before
as.list(dat1[1:20, 1:3])
#$V1
# [1] "1" "?" "2" "?" "2" NA "?" "?" "2" "?" "?" "?" NA NA
#[15] NA NA "?" "2" "2" "2"
#
#$V2
# [1] "1" "3" "2" "3" "1" "2" "1" "2" "3" "1" "2" "1" "?" NA
#[15] "?" "3" "1" NA "?" "1"
#
#$V3
# [1] "?" "1" "?" "3" "1" NA NA "3" "1" "1" "1" "2" NA NA
#[15] NA NA "?" "?" NA "2"
# After
lapply(dat1[1:20, 1:3], mostFreq)
#$V1
# [1] "1" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
#[15] "2" "2" "2" "2" "2" "2"
#
#$V2
# [1] "1" "3" "2" "3" "1" "2" "1" "2" "3" "1" "2" "1" "1" "1"
#[15] "1" "3" "1" "1" "1" "1"
#
#$V3
# [1] "1" "1" "1" "3" "1" "1" "1" "3" "1" "1" "1" "2" "1" "1"
#[15] "1" "1" "1" "1" "1" "2"
并更改整个数据框。
dat1[] <- lapply(dat1, mostFreq)
并把各列强制转换为 factor 类:
dat1[] <- lapply(dat1, factor)
编辑。
如果在读取数据时就设置 na.strings = '?',上面的函数可以简化。
dat1 <- fread(<URI>, na.strings = '?', <other args>)
然后用下面的函数代替原来的 mostFreq。
# Replace the NA entries of a vector with its most frequent non-NA value.
#
# x : an atomic vector or factor in which missing values are already NA
#     (for example after reading the data with na.strings = '?').
#
# Returns x with every NA replaced by the most common non-NA value (on ties
# the first one in table() order wins). The replacement is a character
# string, so a non-factor x is coerced to character by the assignment.
# If x contains no non-NA entry there is nothing to impute from, and x is
# returned unchanged.
mostFreq2 <- function(x){
  miss <- is.na(x)
  # Without this guard an all-NA character/integer vector fails with
  # "replacement has length zero", and an all-NA factor is silently filled
  # with its first level.
  if(all(miss)) return(x)
  tbl <- table(x, useNA = "no")
  x[miss] <- names(tbl)[which.max(tbl)]
  x
}
测试数据。
由于您尚未发布示例数据集,我将创建一个类似于问题描述的数据集。
# Build a reproducible toy data set: 12 character columns of 300 values each,
# six drawn from {NA, "?", 1, 2} and six drawn from {NA, "?", 1, 2, 3}.
set.seed(1234)
n <- 300
x <- replicate(6, sample(c(NA, "?", 1:2), size = n, replace = TRUE))
y <- replicate(6, sample(c(NA, "?", 1:3), size = n, replace = TRUE))
dat1 <- cbind.data.frame(x, y, stringsAsFactors = FALSE)
# Shuffle the column order, then give the columns the names V1 ... V12.
dat1 <- dat1[sample(ncol(dat1))]
names(dat1) <- paste0("V", seq_len(12))
str(dat1)
尽管这个办法相当 "hacky",但应该能帮到您。我在您的 data.frame 中没有看到任何 NA。
library(dplyr)
library(stringr)
# Read the UCI primary-tumor data. The URL is assembled with paste0() so it
# stays one unbroken string: a line break inside the quoted literal would
# become part of the address.
dat1 <- read.table(
  paste0(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/",
    "primary-tumor/primary-tumor.data"
  ),
  stringsAsFactors = TRUE, sep = ","
)
# Convert every column to character; sapply() returns a character matrix.
dat1 <- sapply(dat1, as.character)
temp <- vector("list", ncol(dat1))
for (i in seq_len(ncol(dat1))) {
  col_i <- dat1[, i]
  # Count only the observed values: if "?" itself were counted it could be
  # the most frequent entry and the column would be "imputed" with "?".
  counts <- table(col_i[!col_i %in% "?"])
  if (length(counts) > 0L) {
    col_i <- str_replace(col_i, "[?]", names(counts)[which.max(counts)])
  }
  temp[[i]] <- col_i
}
# Reassemble the columns and restore the original column names.
dat2 <- bind_cols(temp)
colnames(dat2) <- colnames(dat1)
require (data.table)
dat1 <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/primary-tumor/primary-tumor.data',stringsAsFactors=T)
我想用每列中最常见的值替换 ? 和缺失值,并把各列转换为 factor(供 RandomForest 使用)。
我试图从 dat1$V4 中去掉 ?:
> dat2=subset(dat1, dat1$V4!='?')
Error in `[.data.table`(x, r, vars, with = FALSE) :
i evaluates to a logical vector length 339 but there are 184 rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.
如果上一步成功,再把数据框的所有列转换为 factor:
dat1 <- data.frame(lapply(dat1, as.factor))
下面是 dat1 的前几行:
> head (dat1)
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
1: 1 1 1 ? 3 2 2 1 2 2 2 2 2 2 2 2 2 2
2: 1 1 1 ? 3 2 2 2 2 2 1 2 2 2 1 2 1 2
3: 1 1 2 2 3 1 2 2 2 2 2 2 2 2 2 2 1 2
4: 1 1 2 ? 3 1 2 1 1 2 2 2 2 2 2 2 1 2
5: 1 1 2 ? 3 1 2 1 1 2 2 2 2 2 2 2 1 2
6: 1 1 2 ? 3 1 2 2 2 2 2 1 2 2 1 1 1 2
下面是 str(dat1) 的输出:
> str (dat1)
Classes ‘data.table’ and 'data.frame': 339 obs. of 18 variables:
$ V1 : int 1 1 1 1 1 1 1 1 1 1 ...
$ V2 : int 1 1 1 1 1 1 2 2 2 2 ...
$ V3 : Factor w/ 3 levels "1","2","?": 1 1 2 2 2 2 1 1 1 1 ...
$ V4 : Factor w/ 4 levels "1","2","3","?": 4 4 2 4 4 4 1 1 1 1 ...
$ V5 : Factor w/ 4 levels "1","2","3","?": 3 3 3 3 3 3 1 1 1 2 ...
$ V6 : int 2 2 1 1 1 1 1 1 2 1 ...
$ V7 : int 2 2 2 2 2 2 2 2 2 2 ...
$ V8 : int 1 2 2 1 1 2 2 2 2 2 ...
$ V9 : int 2 2 2 1 1 2 2 2 2 2 ...
$ V10: int 2 2 2 2 2 2 2 2 2 2 ...
$ V11: int 2 1 2 2 2 2 2 2 2 2 ...
$ V12: int 2 2 2 2 2 1 2 2 2 2 ...
$ V13: Factor w/ 3 levels "1","2","?": 2 2 2 2 2 2 1 2 2 3 ...
$ V14: int 2 2 2 2 2 2 1 2 1 1 ...
$ V15: int 2 1 2 2 2 1 1 2 2 1 ...
$ V16: Factor w/ 3 levels "1","2","?": 2 2 2 2 2 1 2 2 2 2 ...
$ V17: int 2 1 1 1 1 1 2 2 2 2 ...
$ V18: int 2 2 2 2 2 2 2 2 2 2 ...
- attr(*, ".internal.selfref")=<externalptr>
以下函数把所有 NA 和 '?' 值替换为该列中最常见的值。之后只需用 lapply 把它应用到 data.frame 的每一列。
# Replace the missing entries of a vector with its most frequent observed value.
#
# x  : an atomic vector or factor (typically one column of a data frame).
# na : value(s) treated as missing in addition to NA; defaults to '?'.
#
# Returns x with every NA / `na` entry replaced by the most common of the
# remaining entries (on ties the first one in table() order wins). Factors
# keep their class and lose any level that is no longer used. For other
# vectors the replacement is a character string, so assigning it coerces the
# result to character. If x has no observed (non-missing) entry there is
# nothing to impute from, and x is returned unchanged.
mostFreq <- function(x, na = '?'){
  i <- is.na(x) | x %in% na
  # Without this guard an all-missing character/integer vector fails with
  # "replacement has length zero", and an all-missing factor is silently
  # filled with its first level even though that level never occurs.
  if(all(i)) return(x)
  tbl <- table(x[!i])
  x[i] <- names(tbl)[which.max(tbl)]
  # Drop the now-unused missing-marker level (e.g. "?") from factors.
  if(is.factor(x)) x <- droplevels(x)
  x
}
# Before
as.list(dat1[1:20, 1:3])
#$V1
# [1] "1" "?" "2" "?" "2" NA "?" "?" "2" "?" "?" "?" NA NA
#[15] NA NA "?" "2" "2" "2"
#
#$V2
# [1] "1" "3" "2" "3" "1" "2" "1" "2" "3" "1" "2" "1" "?" NA
#[15] "?" "3" "1" NA "?" "1"
#
#$V3
# [1] "?" "1" "?" "3" "1" NA NA "3" "1" "1" "1" "2" NA NA
#[15] NA NA "?" "?" NA "2"
# After
lapply(dat1[1:20, 1:3], mostFreq)
#$V1
# [1] "1" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
#[15] "2" "2" "2" "2" "2" "2"
#
#$V2
# [1] "1" "3" "2" "3" "1" "2" "1" "2" "3" "1" "2" "1" "1" "1"
#[15] "1" "3" "1" "1" "1" "1"
#
#$V3
# [1] "1" "1" "1" "3" "1" "1" "1" "3" "1" "1" "1" "2" "1" "1"
#[15] "1" "1" "1" "1" "1" "2"
并更改整个数据框。
dat1[] <- lapply(dat1, mostFreq)
并把各列强制转换为 factor 类:
dat1[] <- lapply(dat1, factor)
编辑。
如果在读取数据时就设置 na.strings = '?',上面的函数可以简化。
dat1 <- fread(<URI>, na.strings = '?', <other args>)
然后用下面的函数代替原来的 mostFreq。
# Replace the NA entries of a vector with its most frequent non-NA value.
#
# x : an atomic vector or factor in which missing values are already NA
#     (for example after reading the data with na.strings = '?').
#
# Returns x with every NA replaced by the most common non-NA value (on ties
# the first one in table() order wins). The replacement is a character
# string, so a non-factor x is coerced to character by the assignment.
# If x contains no non-NA entry there is nothing to impute from, and x is
# returned unchanged.
mostFreq2 <- function(x){
  miss <- is.na(x)
  # Without this guard an all-NA character/integer vector fails with
  # "replacement has length zero", and an all-NA factor is silently filled
  # with its first level.
  if(all(miss)) return(x)
  tbl <- table(x, useNA = "no")
  x[miss] <- names(tbl)[which.max(tbl)]
  x
}
测试数据。
由于您尚未发布示例数据集,我将创建一个类似于问题描述的数据集。
# Build a reproducible toy data set: 12 character columns of 300 values each,
# six drawn from {NA, "?", 1, 2} and six drawn from {NA, "?", 1, 2, 3}.
set.seed(1234)
n <- 300
x <- replicate(6, sample(c(NA, "?", 1:2), size = n, replace = TRUE))
y <- replicate(6, sample(c(NA, "?", 1:3), size = n, replace = TRUE))
dat1 <- cbind.data.frame(x, y, stringsAsFactors = FALSE)
# Shuffle the column order, then give the columns the names V1 ... V12.
dat1 <- dat1[sample(ncol(dat1))]
names(dat1) <- paste0("V", seq_len(12))
str(dat1)
尽管这个办法相当 "hacky",但应该能帮到您。我在您的 data.frame 中没有看到任何 NA。
library(dplyr)
library(stringr)
# Read the UCI primary-tumor data. The URL is assembled with paste0() so it
# stays one unbroken string: a line break inside the quoted literal would
# become part of the address.
dat1 <- read.table(
  paste0(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/",
    "primary-tumor/primary-tumor.data"
  ),
  stringsAsFactors = TRUE, sep = ","
)
# Convert every column to character; sapply() returns a character matrix.
dat1 <- sapply(dat1, as.character)
temp <- vector("list", ncol(dat1))
for (i in seq_len(ncol(dat1))) {
  col_i <- dat1[, i]
  # Count only the observed values: if "?" itself were counted it could be
  # the most frequent entry and the column would be "imputed" with "?".
  counts <- table(col_i[!col_i %in% "?"])
  if (length(counts) > 0L) {
    col_i <- str_replace(col_i, "[?]", names(counts)[which.max(counts)])
  }
  temp[[i]] <- col_i
}
# Reassemble the columns and restore the original column names.
dat2 <- bind_cols(temp)
colnames(dat2) <- colnames(dat1)