如何用 IQR 内的值替换多个变量和个体的异常值
How to replace outliers with values inside the IQR for multiple variables and individuals
我有一个类似于 iris
的数据集,需要编写一个函数来按以下方式处理异常值:对于每个物种 setosa
、versicolor
和 virginica
,在每个变量 iris$Sepal.Length
、iris$Sepal.Width
、iris$Petal.Length
和 Petal.Width
中,将超出 1.5*IQR 的值替换为 IQR +/- 1.5 的值*IQR(取决于它是高于还是低于 IQR)。我一直在使用下面的代码来实现这一点,但是它非常重复,耗时且容易出错。此外,这样做会改变原始对象中的值。将参数合并到一个函数中会很好,它不仅可以实现这一点,而且可以告诉我更改了哪些值并将所有输出保存到一个新的数据框中,而不是更改原始数据集中的值。
data(iris)
#create separate objects containing the data for each species
setosa <-
iris%>%
filter(Species == "setosa")
versicolor <-
iris%>%
filter(Species == "versicolor")
virginica <-
iris%>%
filter(Species == "virginica")
#for each variable within each species, do the following:
#create an object (qnt) that contains the 25th and 75th percentile
#create an object (H) containing the value of 1.5 times the interquartile range(IQR)
#replace any number less than the 25th percentile minus H with the value of the
#25th percentile minus H
#replace any number greater than the 75th percentile plus H with the value of the
#75th percentile plus H
qnt <- quantile(setosa$Sepal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(setosa$Sepal.Length, na.rm = T)
setosa$Sepal.Length[setosa$Sepal.Length < (qnt[1] - H)] <- qnt[1]-H
setosa$Sepal.Length[setosa$Sepal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(setosa$Sepal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(setosa$Sepal.Width, na.rm = T)
setosa$Sepal.Width[setosa$Sepal.Width < (qnt[1] - H)] <- qnt[1]-H
setosa$Sepal.Width[setosa$Sepal.Width > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(setosa$Petal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(setosa$Petal.Length, na.rm = T)
setosa$Petal.Length[setosa$Petal.Length < (qnt[1] - H)] <- qnt[1]-H
setosa$Petal.Length[setosa$Petal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(setosa$Petal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(setosa$Petal.Width, na.rm = T)
setosa$Sepal.Width[setosa$Petal.Width < (qnt[1] - H)] <- qnt[1]-H
setosa$Sepal.Width[setosa$Petal.Width > (qnt[2] + H)] <- qnt[2]+H
#now do versicolor
qnt <- quantile(versicolor$Sepal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(versicolor$Sepal.Length, na.rm = T)
versicolor$Sepal.Length[versicolor$Sepal.Length < (qnt[1] - H)] <- qnt[1]-H
versicolor$Sepal.Length[versicolor$Sepal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(versicolor$Sepal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(versicolor$Sepal.Width, na.rm = T)
versicolor$Sepal.Width[versicolor$Sepal.Width < (qnt[1] - H)] <- qnt[1]-H
versicolor$Sepal.Width[versicolor$Sepal.Width > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(versicolor$Petal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(versicolor$Petal.Length, na.rm = T)
versicolor$Petal.Length[versicolor$Petal.Length < (qnt[1] - H)] <- qnt[1]-H
versicolor$Petal.Length[versicolor$Petal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(versicolor$Petal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(versicolor$Petal.Width, na.rm = T)
versicolor$Sepal.Width[versicolor$Petal.Width < (qnt[1] - H)] <- qnt[1]-H
versicolor$Sepal.Width[versicolor$Petal.Width > (qnt[2] + H)] <- qnt[2]+H
#now do virginica
qnt <- quantile(virginica$Sepal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(virginica$Sepal.Length, na.rm = T)
virginica$Sepal.Length[virginica$Sepal.Length < (qnt[1] - H)] <- qnt[1]-H
virginica$Sepal.Length[virginica$Sepal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(virginica$Sepal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(virginica$Sepal.Width, na.rm = T)
virginica$Sepal.Width[virginica$Sepal.Width < (qnt[1] - H)] <- qnt[1]-H
virginica$Sepal.Width[virginica$Sepal.Width > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(virginica$Petal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(virginica$Petal.Length, na.rm = T)
virginica$Petal.Length[virginica$Petal.Length < (qnt[1] - H)] <- qnt[1]-H
virginica$Petal.Length[virginica$Petal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(virginica$Petal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(virginica$Petal.Width, na.rm = T)
virginica$Sepal.Width[virginica$Petal.Width < (qnt[1] - H)] <- qnt[1]-H
virginica$Sepal.Width[virginica$Petal.Width > (qnt[2] + H)] <- qnt[2]+H
创建一个函数,然后在按 'Species' 分组后应用到列上,并将其分配给新对象会更容易。使用 dplyr
,原始数据集不会即时更改,除非我们使用来自 magrittr
的特殊运算符(%<>%
而不是 %>%
)
f1 <- function(x) {
qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
H <- 1.5*IQR(x, na.rm = TRUE)
x[x< (qnt[1] - H)] <- qnt[1]-H
x[x> (qnt[2] + H)] <- qnt[2]+H
x
}
library(dplyr)
iris1 <- iris %>%
group_by(Species) %>%
mutate_at(vars(-group_cols()), f1)
另外,如果我们只需要应用数字列(如果数据集有其他不同类型的列)
iris1 <- iris %>%
group_by(Species) %>%
mutate_if(is.numeric, f1)
我有一个类似于 iris
的数据集,需要编写一个函数来按以下方式处理异常值:对于每个物种 setosa
、versicolor
和 virginica
,在每个变量 iris$Sepal.Length
、iris$Sepal.Width
、iris$Petal.Length
和 Petal.Width
中,将超出 1.5*IQR 的值替换为 IQR +/- 1.5 的值*IQR(取决于它是高于还是低于 IQR)。我一直在使用下面的代码来实现这一点,但是它非常重复,耗时且容易出错。此外,这样做会改变原始对象中的值。将参数合并到一个函数中会很好,它不仅可以实现这一点,而且可以告诉我更改了哪些值并将所有输出保存到一个新的数据框中,而不是更改原始数据集中的值。
data(iris)
#create separate objects containing the data for each species
setosa <-
iris%>%
filter(Species == "setosa")
versicolor <-
iris%>%
filter(Species == "versicolor")
virginica <-
iris%>%
filter(Species == "virginica")
#for each variable within each species, do the following:
#create an object (qnt) that contains the 25th and 75th percentile
#create an object (H) containing the value of 1.5 times the interquartile range(IQR)
#replace any number less than the 25th percentile minus H with the value of the
#25th percentile minus H
#replace any number greater than the 75th percentile plus H with the value of the
#75th percentile plus H
qnt <- quantile(setosa$Sepal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(setosa$Sepal.Length, na.rm = T)
setosa$Sepal.Length[setosa$Sepal.Length < (qnt[1] - H)] <- qnt[1]-H
setosa$Sepal.Length[setosa$Sepal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(setosa$Sepal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(setosa$Sepal.Width, na.rm = T)
setosa$Sepal.Width[setosa$Sepal.Width < (qnt[1] - H)] <- qnt[1]-H
setosa$Sepal.Width[setosa$Sepal.Width > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(setosa$Petal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(setosa$Petal.Length, na.rm = T)
setosa$Petal.Length[setosa$Petal.Length < (qnt[1] - H)] <- qnt[1]-H
setosa$Petal.Length[setosa$Petal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(setosa$Petal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(setosa$Petal.Width, na.rm = T)
setosa$Sepal.Width[setosa$Petal.Width < (qnt[1] - H)] <- qnt[1]-H
setosa$Sepal.Width[setosa$Petal.Width > (qnt[2] + H)] <- qnt[2]+H
#now do versicolor
qnt <- quantile(versicolor$Sepal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(versicolor$Sepal.Length, na.rm = T)
versicolor$Sepal.Length[versicolor$Sepal.Length < (qnt[1] - H)] <- qnt[1]-H
versicolor$Sepal.Length[versicolor$Sepal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(versicolor$Sepal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(versicolor$Sepal.Width, na.rm = T)
versicolor$Sepal.Width[versicolor$Sepal.Width < (qnt[1] - H)] <- qnt[1]-H
versicolor$Sepal.Width[versicolor$Sepal.Width > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(versicolor$Petal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(versicolor$Petal.Length, na.rm = T)
versicolor$Petal.Length[versicolor$Petal.Length < (qnt[1] - H)] <- qnt[1]-H
versicolor$Petal.Length[versicolor$Petal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(versicolor$Petal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(versicolor$Petal.Width, na.rm = T)
versicolor$Sepal.Width[versicolor$Petal.Width < (qnt[1] - H)] <- qnt[1]-H
versicolor$Sepal.Width[versicolor$Petal.Width > (qnt[2] + H)] <- qnt[2]+H
#now do virginica
qnt <- quantile(virginica$Sepal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(virginica$Sepal.Length, na.rm = T)
virginica$Sepal.Length[virginica$Sepal.Length < (qnt[1] - H)] <- qnt[1]-H
virginica$Sepal.Length[virginica$Sepal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(virginica$Sepal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(virginica$Sepal.Width, na.rm = T)
virginica$Sepal.Width[virginica$Sepal.Width < (qnt[1] - H)] <- qnt[1]-H
virginica$Sepal.Width[virginica$Sepal.Width > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(virginica$Petal.Length, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(virginica$Petal.Length, na.rm = T)
virginica$Petal.Length[virginica$Petal.Length < (qnt[1] - H)] <- qnt[1]-H
virginica$Petal.Length[virginica$Petal.Length > (qnt[2] + H)] <- qnt[2]+H
qnt <- quantile(virginica$Petal.Width, probs = c(.25, .75), na.rm = T)
H <- 1.5*IQR(virginica$Petal.Width, na.rm = T)
virginica$Sepal.Width[virginica$Petal.Width < (qnt[1] - H)] <- qnt[1]-H
virginica$Sepal.Width[virginica$Petal.Width > (qnt[2] + H)] <- qnt[2]+H
创建一个函数,然后在按 'Species' 分组后应用到列上,并将其分配给新对象会更容易。使用 dplyr
,原始数据集不会即时更改,除非我们使用来自 magrittr
的特殊运算符(%<>%
而不是 %>%
)
f1 <- function(x) {
qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
H <- 1.5*IQR(x, na.rm = TRUE)
x[x< (qnt[1] - H)] <- qnt[1]-H
x[x> (qnt[2] + H)] <- qnt[2]+H
x
}
library(dplyr)
iris1 <- iris %>%
group_by(Species) %>%
mutate_at(vars(-group_cols()), f1)
另外,如果我们只需要应用数字列(如果数据集有其他不同类型的列)
iris1 <- iris %>%
group_by(Species) %>%
mutate_if(is.numeric, f1)