R:在 "levels" 函数中添加 "NA" 因子水平
R: Adding "NA" factors to the "levels" function
我正在使用 R 编程语言。在这个例子中,我有以下数据:
library("dplyr")
# Simulated example data: 100 rows with three normally distributed numeric
# columns (b, d, c) and two factor columns (a, f) drawn from the labels
# "a".."e" with unequal probabilities.
df <- data.frame(
  b = rnorm(100, mean = 5, sd = 5),
  d = rnorm(100, mean = 2, sd = 2),
  c = rnorm(100, mean = 10, sd = 10)
)

# First grouping factor.
a <- as.factor(sample(
  c("a", "b", "c", "d", "e"),
  size = 100,
  replace = TRUE,
  prob = c(0.3, 0.2, 0.3, 0.1, 0.1)
))
df$a <- a

# Second grouping factor, drawn independently with the same probabilities.
f <- as.factor(sample(
  c("a", "b", "c", "d", "e"),
  size = 100,
  replace = TRUE,
  prob = c(0.3, 0.2, 0.3, 0.1, 0.1)
))
df$f <- f

head(df)
b d c a f
1 6.896434 2.037835 2.867707 e a
2 -3.314758 2.681726 20.038918 d d
3 2.018130 2.229342 -8.341578 c a
4 9.738082 1.127069 18.337212 c c
5 2.442182 3.475735 27.875924 c c
6 5.061937 1.098709 6.166077 a e
然后我有以下函数 ("my_subset_mean") 计算 df$c 对于 "a,b,d,f" 的不同子集的"平均"值:
# Mean of df$c over the rows of the global `df` that pass all four filters.
#   r1: values of `a` to keep
#   r2: exclusive lower bound on `b`
#   r3: exclusive upper bound on `d`
#   r4: values of `f` to keep
# Yields NaN when no row matches (mean of an empty vector).
my_subset_mean <- function(r1, r2, r3, r4) {
  matched <- filter(df, a %in% r1, f %in% r4, b > r2, d < r3)
  mean(matched$c)
}
在上一个问题中,我学习了如何编写一个循环来计算函数“my_subset_mean”在“a,b,d,f”的随机子集上的值:
# Draw one random filter specification from the global `df` and evaluate
# my_subset_mean() on it.
# Returns a one-row data.frame with columns r1 and r4 (comma-separated level
# subsets), r2 and r3 (numeric thresholds) and my_subset_mean (the result).
create_output <- function() {
  # Random subsets of the factor levels of `a` and `f`.
  # NOTE(review): sample(length(x)) returns a whole permutation that is then
  # passed as `size`; presumably only its first element is used -- confirm.
  a_levels <- levels(df$a)
  r1 <- sample(a_levels, sample(length(a_levels)))
  f_levels <- levels(df$f)
  r4 <- sample(f_levels, sample(length(f_levels)))

  # Thresholds drawn uniformly from the observed ranges of `b` and `d`.
  b_range <- range(df$b)
  d_range <- range(df$d)
  r2 <- runif(1, min = b_range[1], max = b_range[2])
  r3 <- runif(1, min = d_range[1], max = d_range[2])

  subset_mean <- my_subset_mean(r1, r2, r3, r4)
  data.frame(
    r1 = toString(r1),
    r4 = toString(r4),
    r2,
    r3,
    my_subset_mean = subset_mean
  )
}
# Call create_output() 100 times and stack the one-row results into one
# data frame.
out <- do.call(rbind, lapply(seq_len(100), function(i) create_output()))
head(out)
r1 r4 r2 r3 my_subset_mean
1 a, c, b, e, d d 14.560821 3.4251138 NaN
2 d, e e, d, b, c 9.027482 -1.7108754 NaN
3 d e, b, a, d 1.447395 0.4279652 18.019990
4 a, e, b, c, d e -6.807861 2.6301878 7.424415
5 a, d d 8.307980 -1.8923647 NaN
6 a b, c, a 7.180056 -0.4022791 NaN
问题:是否可以这样编写循环(create_output),使得“r1, r2, r3, r4”中的某些条件有时不被考虑(即被忽略)?例如:
r1 r4 r2 r3 my_subset_mean
1 NA d 14.56 3.4251138 5
2 d, e, d, b, NA NA -1.7108754 3.1
3 e, b, d 1.447 NA 18.019990
我在想,也许可以在获取因子水平(levels)的语句中指定这一点:
# Levels of the factor column df$a; r1 is then a random draw from them.
uv <- levels(df$a)
r1 <- sample(uv, sample(length(uv)))
在这里,我们可以看到“uv”的值:
# Print the levels stored in uv.
uv
[1] "a" "b" "c" "d" "e"
有什么办法可以让函数“my_subset_mean”有时忽略“a、b、d、f”的某些子集条件吗?例如,“均值”仅根据“a、d”的子集条件来计算?
谢谢
您可以修改 my_subset_mean 函数,使其参数(r1、r2、r3、r4)可以取 NA 值;参数为 NA 时表示忽略对应的筛选条件。
library(dplyr)
# Mean of df$c over the rows of the global `df` matching the given filters.
# An argument left as NA disables its filter:
#   r1 - values of `a` to keep (all NA -> every value present in df$a)
#   r2 - exclusive lower bound on `b` (NA -> -Inf)
#   r3 - exclusive upper bound on `d` (NA -> Inf)
#   r4 - values of `f` to keep (all NA -> every value present in df$f)
my_subset_mean <- function(r1 = NA, r2 = NA, r3 = NA, r4 = NA) {
  if (all(is.na(r1))) {
    r1 <- unique(df$a)
  }
  if (all(is.na(r4))) {
    r4 <- unique(df$f)
  }
  r2 <- if (is.na(r2)) -Inf else r2
  r3 <- if (is.na(r3)) Inf else r3

  kept <- df %>% filter(a %in% r1, f %in% r4, b > r2, d < r3)
  mean(kept$c)
}
然后将 create_output 函数改成:
# Draw one random filter specification from the global `df`, where each of the
# four filters is replaced by NA (i.e. ignored) with equal probability, and
# evaluate my_subset_mean() on it.
# Returns a one-row data.frame with columns r1, r4, r2, r3 and my_subset_mean.
create_output <- function() {
  # Level subset for `a`, or NA.
  a_levels <- levels(df$a)
  a_draw <- sample(a_levels, sample(length(a_levels)))
  r1 <- sample(list(a_draw, NA), 1)[[1]]

  # Level subset for `f`, or NA.
  f_levels <- levels(df$f)
  f_draw <- sample(f_levels, sample(length(f_levels)))
  r4 <- sample(list(f_draw, NA), 1)[[1]]

  # Thresholds drawn uniformly from the observed ranges of `b` and `d`, or NA.
  b_range <- range(df$b)
  d_range <- range(df$d)
  b_draw <- runif(1, min = b_range[1], max = b_range[2])
  r2 <- sample(c(b_draw, NA), 1)
  d_draw <- runif(1, min = d_range[1], max = d_range[2])
  r3 <- sample(c(d_draw, NA), 1)

  subset_mean <- my_subset_mean(r1, r2, r3, r4)
  data.frame(
    r1 = toString(r1),
    r4 = toString(r4),
    r2,
    r3,
    my_subset_mean = subset_mean
  )
}
# Fix the RNG seed, then call create_output() 100 times and stack the one-row
# results into one data frame.
set.seed(123)
out <- do.call(rbind, lapply(seq_len(100), function(i) create_output()))
head(out)
# r1 r4 r2 r3 my_subset_mean
#1 NA c NA 4.2164973 12.095431
#2 a, b, c, d, e b, a, c NA 0.4394423 7.130999
#3 NA a, c, e, b 9.285701 NA 8.236054
#4 NA NA 14.060829 3.8960888 10.562523
#5 c, b, a, d NA NA NA 9.015613
#6 NA a, c, d 2.251218 NA 10.070425
请注意,目前我没有为 NA 的出现指定任何概率,因此每个参数取 NA 的概率为 50%。如果您想更改这一概率,可以在 sample 中通过 prob 参数为各个选项指定概率。
我正在使用 R 编程语言。在这个例子中,我有以下数据:
library("dplyr")
# Simulated example data: 100 rows with three normally distributed numeric
# columns (b, d, c) and two factor columns (a, f) drawn from the labels
# "a".."e" with unequal probabilities.
df <- data.frame(
  b = rnorm(100, mean = 5, sd = 5),
  d = rnorm(100, mean = 2, sd = 2),
  c = rnorm(100, mean = 10, sd = 10)
)

# First grouping factor.
a <- as.factor(sample(
  c("a", "b", "c", "d", "e"),
  size = 100,
  replace = TRUE,
  prob = c(0.3, 0.2, 0.3, 0.1, 0.1)
))
df$a <- a

# Second grouping factor, drawn independently with the same probabilities.
f <- as.factor(sample(
  c("a", "b", "c", "d", "e"),
  size = 100,
  replace = TRUE,
  prob = c(0.3, 0.2, 0.3, 0.1, 0.1)
))
df$f <- f

head(df)
b d c a f
1 6.896434 2.037835 2.867707 e a
2 -3.314758 2.681726 20.038918 d d
3 2.018130 2.229342 -8.341578 c a
4 9.738082 1.127069 18.337212 c c
5 2.442182 3.475735 27.875924 c c
6 5.061937 1.098709 6.166077 a e
然后我有以下函数 ("my_subset_mean") 计算 df$c 对于 "a,b,d,f" 的不同子集的"平均"值:
# Mean of df$c over the rows of the global `df` that pass all four filters.
#   r1: values of `a` to keep
#   r2: exclusive lower bound on `b`
#   r3: exclusive upper bound on `d`
#   r4: values of `f` to keep
# Yields NaN when no row matches (mean of an empty vector).
my_subset_mean <- function(r1, r2, r3, r4) {
  matched <- filter(df, a %in% r1, f %in% r4, b > r2, d < r3)
  mean(matched$c)
}
在上一个问题中,我学习了如何编写一个循环来计算函数“my_subset_mean”在“a,b,d,f”的随机子集上的值:
# Draw one random filter specification from the global `df` and evaluate
# my_subset_mean() on it.
# Returns a one-row data.frame with columns r1 and r4 (comma-separated level
# subsets), r2 and r3 (numeric thresholds) and my_subset_mean (the result).
create_output <- function() {
  # Random subsets of the factor levels of `a` and `f`.
  # NOTE(review): sample(length(x)) returns a whole permutation that is then
  # passed as `size`; presumably only its first element is used -- confirm.
  a_levels <- levels(df$a)
  r1 <- sample(a_levels, sample(length(a_levels)))
  f_levels <- levels(df$f)
  r4 <- sample(f_levels, sample(length(f_levels)))

  # Thresholds drawn uniformly from the observed ranges of `b` and `d`.
  b_range <- range(df$b)
  d_range <- range(df$d)
  r2 <- runif(1, min = b_range[1], max = b_range[2])
  r3 <- runif(1, min = d_range[1], max = d_range[2])

  subset_mean <- my_subset_mean(r1, r2, r3, r4)
  data.frame(
    r1 = toString(r1),
    r4 = toString(r4),
    r2,
    r3,
    my_subset_mean = subset_mean
  )
}
# Call create_output() 100 times and stack the one-row results into one
# data frame.
out <- do.call(rbind, lapply(seq_len(100), function(i) create_output()))
head(out)
r1 r4 r2 r3 my_subset_mean
1 a, c, b, e, d d 14.560821 3.4251138 NaN
2 d, e e, d, b, c 9.027482 -1.7108754 NaN
3 d e, b, a, d 1.447395 0.4279652 18.019990
4 a, e, b, c, d e -6.807861 2.6301878 7.424415
5 a, d d 8.307980 -1.8923647 NaN
6 a b, c, a 7.180056 -0.4022791 NaN
问题:是否可以这样编写循环(create_output),使得“r1, r2, r3, r4”中的某些条件有时不被考虑(即被忽略)?例如:
r1 r4 r2 r3 my_subset_mean
1 NA d 14.56 3.4251138 5
2 d, e, d, b, NA NA -1.7108754 3.1
3 e, b, d 1.447 NA 18.019990
我在想,也许可以在获取因子水平(levels)的语句中指定这一点:
# Levels of the factor column df$a; r1 is then a random draw from them.
uv <- levels(df$a)
r1 <- sample(uv, sample(length(uv)))
在这里,我们可以看到“uv”的值:
# Print the levels stored in uv.
uv
[1] "a" "b" "c" "d" "e"
有什么办法可以让函数“my_subset_mean”有时忽略“a、b、d、f”的某些子集条件吗?例如,“均值”仅根据“a、d”的子集条件来计算?
谢谢
您可以修改 my_subset_mean 函数,使其参数(r1、r2、r3、r4)可以取 NA 值;参数为 NA 时表示忽略对应的筛选条件。
library(dplyr)
# Mean of df$c over the rows of the global `df` matching the given filters.
# An argument left as NA disables its filter:
#   r1 - values of `a` to keep (all NA -> every value present in df$a)
#   r2 - exclusive lower bound on `b` (NA -> -Inf)
#   r3 - exclusive upper bound on `d` (NA -> Inf)
#   r4 - values of `f` to keep (all NA -> every value present in df$f)
my_subset_mean <- function(r1 = NA, r2 = NA, r3 = NA, r4 = NA) {
  if (all(is.na(r1))) {
    r1 <- unique(df$a)
  }
  if (all(is.na(r4))) {
    r4 <- unique(df$f)
  }
  r2 <- if (is.na(r2)) -Inf else r2
  r3 <- if (is.na(r3)) Inf else r3

  kept <- df %>% filter(a %in% r1, f %in% r4, b > r2, d < r3)
  mean(kept$c)
}
然后将 create_output 函数改成:
# Draw one random filter specification from the global `df`, where each of the
# four filters is replaced by NA (i.e. ignored) with equal probability, and
# evaluate my_subset_mean() on it.
# Returns a one-row data.frame with columns r1, r4, r2, r3 and my_subset_mean.
create_output <- function() {
  # Level subset for `a`, or NA.
  a_levels <- levels(df$a)
  a_draw <- sample(a_levels, sample(length(a_levels)))
  r1 <- sample(list(a_draw, NA), 1)[[1]]

  # Level subset for `f`, or NA.
  f_levels <- levels(df$f)
  f_draw <- sample(f_levels, sample(length(f_levels)))
  r4 <- sample(list(f_draw, NA), 1)[[1]]

  # Thresholds drawn uniformly from the observed ranges of `b` and `d`, or NA.
  b_range <- range(df$b)
  d_range <- range(df$d)
  b_draw <- runif(1, min = b_range[1], max = b_range[2])
  r2 <- sample(c(b_draw, NA), 1)
  d_draw <- runif(1, min = d_range[1], max = d_range[2])
  r3 <- sample(c(d_draw, NA), 1)

  subset_mean <- my_subset_mean(r1, r2, r3, r4)
  data.frame(
    r1 = toString(r1),
    r4 = toString(r4),
    r2,
    r3,
    my_subset_mean = subset_mean
  )
}
# Fix the RNG seed, then call create_output() 100 times and stack the one-row
# results into one data frame.
set.seed(123)
out <- do.call(rbind, lapply(seq_len(100), function(i) create_output()))
head(out)
# r1 r4 r2 r3 my_subset_mean
#1 NA c NA 4.2164973 12.095431
#2 a, b, c, d, e b, a, c NA 0.4394423 7.130999
#3 NA a, c, e, b 9.285701 NA 8.236054
#4 NA NA 14.060829 3.8960888 10.562523
#5 c, b, a, d NA NA NA 9.015613
#6 NA a, c, d 2.251218 NA 10.070425
请注意,目前我没有为 NA 的出现指定任何概率,因此每个参数取 NA 的概率为 50%。如果您想更改这一概率,可以在 sample 中通过 prob 参数为各个选项指定概率。