大量存储 rnorm 样本的所有值(向量),以便稍后进行 Tukey 测试
Storing all values of a rnorm sample in large quantities (vector) for a Tukey test later
所以我想我快到了!我想做的是创建一个脚本,在其中填写两个总体的“预期均值 (mu)”和“标准差 (sd)”,作为输出,您会得到一个 table,其中显示显着差异的分数根据样本量在这两个群体之间发现(由方差分析和 Tukey 检验确定)。当然,期望样本量越大,重要结果的比例就越高。现在发生了一些奇怪的事情,当我将两个均值(200 和 260)都与 sd=30 进行比较时,似乎显着 p 值的分数随机分布在样本大小上(下面我使用样本大小 1、2、3。 ..9).我希望发现的是,随着样本量的增加,ANOVA 和 Tukey 检测到的显着差异的比例也会由于 ANOVA 的工作原理而增加。更大的样本量将导致更可靠的模型,从而产生更可靠的 p 值。我所期望的是
像这样:
n fraction significant
1 0.1
2 0.3
3 0.4
4 0.6
5 0.8
6 0.9
这不会发生,我得到这样的结果:
n fraction significant
1 0.1
2 0.0
3 0.2
4 0.4
5 0.0
6 0.1
我认为这是因为最初当我 运行 我的循环和 rnorm 时,它只存储一个值作为每个初始样本大小的“样本”:例如对于初始样本大小 n=1 和“exp”=1(这些是我的迭代),我得到 1 个随机生成的样本值,这是合乎逻辑的。但是对于 n=9 和 "exp=1" 我也只得到一个值而不是 9 个值。在这种情况下,我应该将 9 个值放入方差分析和 Tukey 中,可能会产生更多的“exp”迭代,从而导致显着差异。我认为如果我设法实现这样的目标应该可行,但我不知道如何管理,我尝试了 3 天但我被卡住了:
n sample pop 1 sample pop 2
1 200.4 270.1 compare by ANOVA and Tukey -> p value
1 190.3 234.7 compare by ANOVA and Tukey -> p value
2. [200.2 184.6] [260.2 265.3] compare by ANOVA and Tukey -> p value
2. [230.1 222.7] [280.1 204.6] compare by ANOVA and Tukey -> p value
3. [270.3 230.1 201.3] [232.2 222.1] compare by ANOVA and Tukey -> p value
#etc...
#(Right now 10 times per n-value, but ultimately I want to do 10.000 iterations)
下面是我做的脚本,有经验的请见谅;)
{
library(tidyverse)
library(dplyr)
library(reshape2)
library(multcomp)
library(stringr)
}
rm(list=ls())
{
mu <- 200
sd <- 30
n_array_1 <- c()
exp_array_1 <- c()
pop_sample_array_1 <- c()
ID_array_1 <- "population1"
for (n_1 in c(1:9)) {
for (exp_1 in c(1:10)) {
n_array_1 <- c(n_array_1, n_1)
exp_array_1 <- c(exp_array_1, exp_1)
population_sample_1 <- rnorm(n_1, mu, sd)
pop_sample_array_1 <- c(pop_sample_array_1, population_sample_1)
}
}
mu_2 <- 260
sd_2 <- 30
n_array_2 <- c()
exp_array_2 <- c()
pop_sample_array_2 <- c()
ID_array_2 <- "population2"
for (n_2 in c(1:9)) {
for (exp_2 in c(1:10)) {
n_array_2 <- c(n_array_2, n_2)
exp_array_2 <- c(exp_array_2, exp_2)
population_sample_2 <- rnorm(n_2, mu_2, sd_2)
pop_sample_array_2 <- c(pop_sample_array_2, population_sample_2)
}
}
df1 <- data.frame(n=n_array_1, exp=exp_array_1, sample=pop_sample_array_1, ID=ID_array_1)
df2 <- data.frame(n=n_array_2, exp=exp_array_2, sample=pop_sample_array_2, ID=ID_array_2)
}
combineddf <- rbind(df1, df2)
view(combineddf)
#
## Running an ANOVA comparing the 2 groups specified
{
combineddf$IDcombined <- as.factor(paste(combineddf$IDcombined, combineddf$ID, combineddf$n, combineddf$exp, sep = "_"))
combineddf$IDcombined <- as.factor(combineddf$IDcombined)
combineddf
resaov <- aov(sample ~ IDcombined, data=combineddf)
test <- TukeyHSD(resaov)
TK <- test
TK_data<-as.data.frame(TK[1:1])
TK_data$IDcombined <- rownames(TK_data)
}
{
LC1 <- separate(TK_data, col=IDcombined, into = c("A", "B"), sep = "-")
LC2 <- separate(LC1, col = A, into=c("ignore", "comparison_factor_1", "sample_size", "iteration"), sep="_")
LC3 <- separate(LC2, col=B, into=c("ignore", "comparison_factor_2", "sample_size_2", "iteration_2"), sep="_")
}
{
results <- dplyr::select(LC3, comparison_factor_1, comparison_factor_2, iteration, iteration_2, sample_size, sample_size_2, IDcombined.p.adj)
results <- dplyr::filter(results, comparison_factor_1=="population1" & comparison_factor_2=="population2" | comparison_factor_1=="population2" & comparison_factor_2=="population1")
results <- dplyr::filter(results, sample_size == sample_size_2 & iteration == iteration_2)
}
#
## calculations of fractions significant p-values of a tukey test comparing the 2 populations based on the different sample sizes
results$significant <- results$IDcombined.p.adj < 0.05
for (sample_size in unique(results$sample_size)) {
subset_results <- results[results$sample_size == sample_size, ]
n_significant <- sum(subset_results$significant)
n_total <- nrow(subset_results)
fraction_significant <- n_significant / n_total
cat(sample_size, " ", fraction_significant, "\n")
}
添加另一个“for”循环(带有“replicates”)解决了我的问题。现在我只需要将“cat”函数的输出存储在 table 或数据框中,我可以扩展脚本(添加更多组进行比较)脚本如下:
{
library(tidyverse)
library(dplyr)
library(reshape2)
library(multcomp)
library(stringr)
}
rm(list=ls())
#
## Syncom C - (t=10)
{
{
mu_1 <- 628
sd_1 <- 130
n_array_1 <- c()
exp_array_1 <- c()
pop_sample_array_1 <- c()
ID_array_1 <- "population1"
for (n_1 in c(1:20)) {
for (exp_1 in c(1:30)) {
population_sample_1 <- rnorm(n_1, mu_1, sd_1)
for (replicate_1 in population_sample_1) {
n_array_1 <- c(n_array_1, n_1)
exp_array_1 <- c(exp_array_1, exp_1)
pop_sample_array_1 <- c(pop_sample_array_1, replicate_1)
}
}
}
}
}
#
## Control - (t=10)
{
{
mu_2 <- 423
sd_2 <- 96
n_array_2 <- c()
exp_array_2 <- c()
pop_sample_array_2 <- c()
ID_array_2 <- "population2"
for (n_2 in c(1:20)) {
for (exp_2 in c(1:30)) {
population_sample_2 <- rnorm(n_2, mu_2, sd_2)
for (replicate_2 in population_sample_2) {
n_array_2 <- c(n_array_2, n_2)
exp_array_2 <- c(exp_array_2, exp_2)
pop_sample_array_2 <- c(pop_sample_array_2, replicate_2)
}
}
}
}
}
{
df1 <- data.frame(n=n_array_1, exp=exp_array_1, sample=pop_sample_array_1, ID=ID_array_1)
df2 <- data.frame(n=n_array_2, exp=exp_array_2, sample=pop_sample_array_2, ID=ID_array_2)
}
combineddf <- rbind(df1, df2)
#view(combineddf)
#
## Running an ANOVA comparing the 2 groups specified
{
combineddf$IDcombined <- as.factor(paste(combineddf$IDcombined, combineddf$ID, combineddf$n, combineddf$exp, sep = "_"))
combineddf$IDcombined <- as.factor(combineddf$IDcombined)
combineddf
resaov <- aov(sample ~ IDcombined, data=combineddf)
test <- TukeyHSD(resaov)
TK <- test
TK_data<-as.data.frame(TK[1:1])
TK_data$IDcombined <- rownames(TK_data)
}
{
LC1 <- separate(TK_data, col=IDcombined, into = c("A", "B"), sep = "-")
LC2 <- separate(LC1, col = A, into=c("ignore", "comparison_factor_1", "sample_size", "iteration"), sep="_")
LC3 <- separate(LC2, col=B, into=c("ignore", "comparison_factor_2", "sample_size_2", "iteration_2"), sep="_")
}
{
results <- dplyr::select(LC3, comparison_factor_1, comparison_factor_2, iteration, iteration_2, sample_size, sample_size_2, IDcombined.p.adj)
results <- dplyr::filter(results, comparison_factor_1=="population1" & comparison_factor_2=="population2" | comparison_factor_1=="population2" & comparison_factor_2=="population1")
results <- dplyr::filter(results, sample_size == sample_size_2 & iteration == iteration_2)
}
#
## calculations of fractions significant p-values of a tukey test comparing the 2 populations based on the different sample sizes
results$significant <- results$IDcombined.p.adj < 0.05
for (sample_size in unique(results$sample_size)) {
subset_results <- results[results$sample_size == sample_size, ]
n_significant <- sum(subset_results$significant)
n_total <- nrow(subset_results)
fraction_significant <- n_significant / n_total
cat(sample_size, " ", fraction_significant, "\n")
}
所以我想我快到了!我想做的是创建一个脚本,在其中填写两个总体的“预期均值 (mu)”和“标准差 (sd)”,作为输出,您会得到一个 table,其中显示显着差异的分数根据样本量在这两个群体之间发现(由方差分析和 Tukey 检验确定)。当然,期望样本量越大,重要结果的比例就越高。现在发生了一些奇怪的事情,当我将两个均值(200 和 260)都与 sd=30 进行比较时,似乎显着 p 值的分数随机分布在样本大小上(下面我使用样本大小 1、2、3。 ..9).我希望发现的是,随着样本量的增加,ANOVA 和 Tukey 检测到的显着差异的比例也会由于 ANOVA 的工作原理而增加。更大的样本量将导致更可靠的模型,从而产生更可靠的 p 值。我所期望的是 像这样:
n fraction significant
1 0.1
2 0.3
3 0.4
4 0.6
5 0.8
6 0.9
这不会发生,我得到这样的结果:
n fraction significant
1 0.1
2 0.0
3 0.2
4 0.4
5 0.0
6 0.1
我认为这是因为最初当我 运行 我的循环和 rnorm 时,它只存储一个值作为每个初始样本大小的“样本”:例如对于初始样本大小 n=1 和“exp”=1(这些是我的迭代),我得到 1 个随机生成的样本值,这是合乎逻辑的。但是对于 n=9 和 "exp=1" 我也只得到一个值而不是 9 个值。在这种情况下,我应该将 9 个值放入方差分析和 Tukey 中,可能会产生更多的“exp”迭代,从而导致显着差异。我认为如果我设法实现这样的目标应该可行,但我不知道如何管理,我尝试了 3 天但我被卡住了:
n sample pop 1 sample pop 2
1 200.4 270.1 compare by ANOVA and Tukey -> p value
1 190.3 234.7 compare by ANOVA and Tukey -> p value
2. [200.2 184.6] [260.2 265.3] compare by ANOVA and Tukey -> p value
2. [230.1 222.7] [280.1 204.6] compare by ANOVA and Tukey -> p value
3. [270.3 230.1 201.3] [232.2 222.1] compare by ANOVA and Tukey -> p value
#etc...
#(Right now 10 times per n-value, but ultimately I want to do 10.000 iterations)
下面是我做的脚本,有经验的请见谅;)
{
library(tidyverse)
library(dplyr)
library(reshape2)
library(multcomp)
library(stringr)
}
rm(list=ls())
{
mu <- 200
sd <- 30
n_array_1 <- c()
exp_array_1 <- c()
pop_sample_array_1 <- c()
ID_array_1 <- "population1"
for (n_1 in c(1:9)) {
for (exp_1 in c(1:10)) {
n_array_1 <- c(n_array_1, n_1)
exp_array_1 <- c(exp_array_1, exp_1)
population_sample_1 <- rnorm(n_1, mu, sd)
pop_sample_array_1 <- c(pop_sample_array_1, population_sample_1)
}
}
mu_2 <- 260
sd_2 <- 30
n_array_2 <- c()
exp_array_2 <- c()
pop_sample_array_2 <- c()
ID_array_2 <- "population2"
for (n_2 in c(1:9)) {
for (exp_2 in c(1:10)) {
n_array_2 <- c(n_array_2, n_2)
exp_array_2 <- c(exp_array_2, exp_2)
population_sample_2 <- rnorm(n_2, mu_2, sd_2)
pop_sample_array_2 <- c(pop_sample_array_2, population_sample_2)
}
}
df1 <- data.frame(n=n_array_1, exp=exp_array_1, sample=pop_sample_array_1, ID=ID_array_1)
df2 <- data.frame(n=n_array_2, exp=exp_array_2, sample=pop_sample_array_2, ID=ID_array_2)
}
combineddf <- rbind(df1, df2)
view(combineddf)
#
## Running an ANOVA comparing the 2 groups specified
{
combineddf$IDcombined <- as.factor(paste(combineddf$IDcombined, combineddf$ID, combineddf$n, combineddf$exp, sep = "_"))
combineddf$IDcombined <- as.factor(combineddf$IDcombined)
combineddf
resaov <- aov(sample ~ IDcombined, data=combineddf)
test <- TukeyHSD(resaov)
TK <- test
TK_data<-as.data.frame(TK[1:1])
TK_data$IDcombined <- rownames(TK_data)
}
{
LC1 <- separate(TK_data, col=IDcombined, into = c("A", "B"), sep = "-")
LC2 <- separate(LC1, col = A, into=c("ignore", "comparison_factor_1", "sample_size", "iteration"), sep="_")
LC3 <- separate(LC2, col=B, into=c("ignore", "comparison_factor_2", "sample_size_2", "iteration_2"), sep="_")
}
{
results <- dplyr::select(LC3, comparison_factor_1, comparison_factor_2, iteration, iteration_2, sample_size, sample_size_2, IDcombined.p.adj)
results <- dplyr::filter(results, comparison_factor_1=="population1" & comparison_factor_2=="population2" | comparison_factor_1=="population2" & comparison_factor_2=="population1")
results <- dplyr::filter(results, sample_size == sample_size_2 & iteration == iteration_2)
}
#
## calculations of fractions significant p-values of a tukey test comparing the 2 populations based on the different sample sizes
results$significant <- results$IDcombined.p.adj < 0.05
for (sample_size in unique(results$sample_size)) {
subset_results <- results[results$sample_size == sample_size, ]
n_significant <- sum(subset_results$significant)
n_total <- nrow(subset_results)
fraction_significant <- n_significant / n_total
cat(sample_size, " ", fraction_significant, "\n")
}
添加另一个“for”循环(带有“replicates”)解决了我的问题。现在我只需要将“cat”函数的输出存储在 table 或数据框中,我可以扩展脚本(添加更多组进行比较)脚本如下:
{
library(tidyverse)
library(dplyr)
library(reshape2)
library(multcomp)
library(stringr)
}
rm(list=ls())
#
## Syncom C - (t=10)
{
{
mu_1 <- 628
sd_1 <- 130
n_array_1 <- c()
exp_array_1 <- c()
pop_sample_array_1 <- c()
ID_array_1 <- "population1"
for (n_1 in c(1:20)) {
for (exp_1 in c(1:30)) {
population_sample_1 <- rnorm(n_1, mu_1, sd_1)
for (replicate_1 in population_sample_1) {
n_array_1 <- c(n_array_1, n_1)
exp_array_1 <- c(exp_array_1, exp_1)
pop_sample_array_1 <- c(pop_sample_array_1, replicate_1)
}
}
}
}
}
#
## Control - (t=10)
{
{
mu_2 <- 423
sd_2 <- 96
n_array_2 <- c()
exp_array_2 <- c()
pop_sample_array_2 <- c()
ID_array_2 <- "population2"
for (n_2 in c(1:20)) {
for (exp_2 in c(1:30)) {
population_sample_2 <- rnorm(n_2, mu_2, sd_2)
for (replicate_2 in population_sample_2) {
n_array_2 <- c(n_array_2, n_2)
exp_array_2 <- c(exp_array_2, exp_2)
pop_sample_array_2 <- c(pop_sample_array_2, replicate_2)
}
}
}
}
}
{
df1 <- data.frame(n=n_array_1, exp=exp_array_1, sample=pop_sample_array_1, ID=ID_array_1)
df2 <- data.frame(n=n_array_2, exp=exp_array_2, sample=pop_sample_array_2, ID=ID_array_2)
}
combineddf <- rbind(df1, df2)
#view(combineddf)
#
## Running an ANOVA comparing the 2 groups specified
{
combineddf$IDcombined <- as.factor(paste(combineddf$IDcombined, combineddf$ID, combineddf$n, combineddf$exp, sep = "_"))
combineddf$IDcombined <- as.factor(combineddf$IDcombined)
combineddf
resaov <- aov(sample ~ IDcombined, data=combineddf)
test <- TukeyHSD(resaov)
TK <- test
TK_data<-as.data.frame(TK[1:1])
TK_data$IDcombined <- rownames(TK_data)
}
{
LC1 <- separate(TK_data, col=IDcombined, into = c("A", "B"), sep = "-")
LC2 <- separate(LC1, col = A, into=c("ignore", "comparison_factor_1", "sample_size", "iteration"), sep="_")
LC3 <- separate(LC2, col=B, into=c("ignore", "comparison_factor_2", "sample_size_2", "iteration_2"), sep="_")
}
{
results <- dplyr::select(LC3, comparison_factor_1, comparison_factor_2, iteration, iteration_2, sample_size, sample_size_2, IDcombined.p.adj)
results <- dplyr::filter(results, comparison_factor_1=="population1" & comparison_factor_2=="population2" | comparison_factor_1=="population2" & comparison_factor_2=="population1")
results <- dplyr::filter(results, sample_size == sample_size_2 & iteration == iteration_2)
}
#
## calculations of fractions significant p-values of a tukey test comparing the 2 populations based on the different sample sizes
results$significant <- results$IDcombined.p.adj < 0.05
for (sample_size in unique(results$sample_size)) {
subset_results <- results[results$sample_size == sample_size, ]
n_significant <- sum(subset_results$significant)
n_total <- nrow(subset_results)
fraction_significant <- n_significant / n_total
cat(sample_size, " ", fraction_significant, "\n")
}