R中的条件概率实验
Conditional probability experiment in R
这是我的代码
library(dplyr)
# Simulate 1,000,000 rain indicators: 0 with probability 0.2, 1 with probability 0.8.
rain_vector <- sample(c(0,1), 1000000, replace = T, prob= c(0.2,0.8))
# Repeat the experiment 10 times; the loop variable `el` is not used in the body.
for(el in 1:10){
df <- data.frame(rain = rain_vector )
# NOTE(review): each sample() call below asks for a single draw (size = 1), so
# if_else() receives two length-1 values and recycles them across every row.
# A is therefore NOT drawn independently per row; within one iteration every
# rain == 1 row shares one value and every rain == 0 row shares another.
df <- df %>% mutate(A= if_else(rain == 1, sample(c(0,1), 1, replace = T, prob= c(1/3,2/3)),
sample(c(0,1), 1, replace = T, prob= c(2/3,1/3))))
# Number of rows with A == 1.
print(NROW(df[df$A==1,]))
# Number of rows with A == 1 and rain == 1.
print(NROW(df[df$A == 1 & df$rain == 1, ]))
# Number of rows with rain == 1 (identical in every iteration, since
# rain_vector is generated once, before the loop).
print(NROW(df[df$rain == 1,]))
# Separator between iterations.
print("______________")
}
这是输出:
[1] 0
[1] 0
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 1000000
[1] 800325
[1] 800325
[1] "______________"
[1] 1000000
[1] 800325
[1] 800325
[1] "______________"
[1] 1000000
[1] 800325
[1] 800325
[1] "______________"
这些结果对我来说都没有意义。我们来看最后一个:A = 1 的情况
总是发生,而它本应以 1/3 或 2/3 的概率发生,具体取决于是否下雨。是 dplyr
包有什么问题吗?有什么建议吗?
问题在于 sample(c(0,1), 1, replace = T, prob= c(1/3,2/3))
返回的结果长度为 1,因此这同一个值会被重复用于每一行。
解决办法是在调用 mutate 之前先使用 rowwise()
,这样就明确指定了每一行都应单独调用一次 sample()
。
# Draw A independently for every row: rowwise() makes mutate() evaluate the
# expression once per row, so each row gets its own sample() draws instead of
# a single length-1 value recycled across the whole column.
# P(A = 1 | rain == 1) = 2/3 and P(A = 1 | rain == 0) = 1/3.
df <- df %>%
  rowwise() %>%
  mutate(
    A = if_else(
      rain == 1,
      sample(c(0, 1), 1, replace = TRUE, prob = c(1 / 3, 2 / 3)),
      sample(c(0, 1), 1, replace = TRUE, prob = c(2 / 3, 1 / 3))
    )
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup()
另一个更快的做法是使用 base R:针对 rain 的两个取值,
分别一次性抽取一个长度合适的样本,从而将 sample()
的调用次数从 1000000 次减少到 2 次:
# Simulate 1,000,000 rain indicators: 0 with probability 0.2, 1 with 0.8.
rain_vector <- sample(c(0, 1), 1000000, replace = TRUE, prob = c(0.2, 0.8))

# The row masks and their sizes depend only on rain_vector, so compute them
# once instead of on every iteration.
is_rain <- rain_vector == 1
is_dry <- rain_vector == 0
n_rain <- sum(is_rain)
n_dry <- sum(is_dry)

# Repeat the experiment 10 times; the loop variable itself is not used.
for (el in seq_len(10)) {
  df <- data.frame(rain = rain_vector, A = numeric(length(rain_vector)))

  # One vectorised draw per rain value (two sample() calls per iteration):
  # P(A = 1 | rain == 1) = 2/3 and P(A = 1 | rain == 0) = 1/3.
  df[is_rain, "A"] <- sample(
    c(0, 1), n_rain,
    replace = TRUE, prob = c(1 / 3, 2 / 3)
  )
  df[is_dry, "A"] <- sample(
    c(0, 1), n_dry,
    replace = TRUE, prob = c(2 / 3, 1 / 3)
  )

  # Count matching rows by summing logical masks; this avoids copying a
  # data-frame subset just to count its rows.
  print(sum(df$A == 1))                # rows with A == 1
  print(sum(df$A == 1 & df$rain == 1)) # rows with A == 1 and rain == 1
  print(sum(df$rain == 1))             # rows with rain == 1
  print("______________")
}
这是我的代码
library(dplyr)
# Simulate 1,000,000 rain indicators: 0 with probability 0.2, 1 with probability 0.8.
rain_vector <- sample(c(0,1), 1000000, replace = T, prob= c(0.2,0.8))
# Repeat the experiment 10 times; the loop variable `el` is not used in the body.
for(el in 1:10){
df <- data.frame(rain = rain_vector )
# NOTE(review): each sample() call below asks for a single draw (size = 1), so
# if_else() receives two length-1 values and recycles them across every row.
# A is therefore NOT drawn independently per row; within one iteration every
# rain == 1 row shares one value and every rain == 0 row shares another.
df <- df %>% mutate(A= if_else(rain == 1, sample(c(0,1), 1, replace = T, prob= c(1/3,2/3)),
sample(c(0,1), 1, replace = T, prob= c(2/3,1/3))))
# Number of rows with A == 1.
print(NROW(df[df$A==1,]))
# Number of rows with A == 1 and rain == 1.
print(NROW(df[df$A == 1 & df$rain == 1, ]))
# Number of rows with rain == 1 (identical in every iteration, since
# rain_vector is generated once, before the loop).
print(NROW(df[df$rain == 1,]))
# Separator between iterations.
print("______________")
}
这是输出:
[1] 0
[1] 0
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 800325
[1] 800325
[1] 800325
[1] "______________"
[1] 1000000
[1] 800325
[1] 800325
[1] "______________"
[1] 1000000
[1] 800325
[1] 800325
[1] "______________"
[1] 1000000
[1] 800325
[1] 800325
[1] "______________"
这些结果对我来说都没有意义。我们来看最后一个:A = 1 的情况
总是发生,而它本应以 1/3 或 2/3 的概率发生,具体取决于是否下雨。是 dplyr
包有什么问题吗?有什么建议吗?
问题在于 sample(c(0,1), 1, replace = T, prob= c(1/3,2/3))
返回的结果长度为 1,因此这同一个值会被重复用于每一行。
解决办法是在调用 mutate 之前先使用 rowwise()
,这样就明确指定了每一行都应单独调用一次 sample()
。
# Draw A independently for every row: rowwise() makes mutate() evaluate the
# expression once per row, so each row gets its own sample() draws instead of
# a single length-1 value recycled across the whole column.
# P(A = 1 | rain == 1) = 2/3 and P(A = 1 | rain == 0) = 1/3.
df <- df %>%
  rowwise() %>%
  mutate(
    A = if_else(
      rain == 1,
      sample(c(0, 1), 1, replace = TRUE, prob = c(1 / 3, 2 / 3)),
      sample(c(0, 1), 1, replace = TRUE, prob = c(2 / 3, 1 / 3))
    )
  ) %>%
  # Drop the row-wise grouping so later verbs work on whole columns again.
  ungroup()
另一个更快的做法是使用 base R:针对 rain 的两个取值,
分别一次性抽取一个长度合适的样本,从而将 sample()
的调用次数从 1000000 次减少到 2 次:
# Simulate 1,000,000 rain indicators: 0 with probability 0.2, 1 with 0.8.
rain_vector <- sample(c(0, 1), 1000000, replace = TRUE, prob = c(0.2, 0.8))

# The row masks and their sizes depend only on rain_vector, so compute them
# once instead of on every iteration.
is_rain <- rain_vector == 1
is_dry <- rain_vector == 0
n_rain <- sum(is_rain)
n_dry <- sum(is_dry)

# Repeat the experiment 10 times; the loop variable itself is not used.
for (el in seq_len(10)) {
  df <- data.frame(rain = rain_vector, A = numeric(length(rain_vector)))

  # One vectorised draw per rain value (two sample() calls per iteration):
  # P(A = 1 | rain == 1) = 2/3 and P(A = 1 | rain == 0) = 1/3.
  df[is_rain, "A"] <- sample(
    c(0, 1), n_rain,
    replace = TRUE, prob = c(1 / 3, 2 / 3)
  )
  df[is_dry, "A"] <- sample(
    c(0, 1), n_dry,
    replace = TRUE, prob = c(2 / 3, 1 / 3)
  )

  # Count matching rows by summing logical masks; this avoids copying a
  # data-frame subset just to count its rows.
  print(sum(df$A == 1))                # rows with A == 1
  print(sum(df$A == 1 & df$rain == 1)) # rows with A == 1 and rain == 1
  print(sum(df$rain == 1))             # rows with rain == 1
  print("______________")
}