大量存储 rnorm 样本的所有值(向量),以便稍后进行 Tukey 测试

Storing all values of a rnorm sample in large quantities (vector) for a Tukey test later

所以我想我快到了!我想做的是创建一个脚本,在其中填写两个总体的“预期均值 (mu)”和“标准差 (sd)”,作为输出,您会得到一个 table,其中显示显着差异的分数根据样本量在这两个群体之间发现(由方差分析和 Tukey 检验确定)。当然,期望样本量越大,重要结果的比例就越高。现在发生了一些奇怪的事情,当我将两个均值(200 和 260)都与 sd=30 进行比较时,似乎显着 p 值的分数随机分布在样本大小上(下面我使用样本大小 1、2、3。 ..9).我希望发现的是,随着样本量的增加,ANOVA 和 Tukey 检测到的显着差异的比例也会由于 ANOVA 的工作原理而增加。更大的样本量将导致更可靠的模型,从而产生更可靠的 p 值。我所期望的是 像这样:

n   fraction significant
1        0.1
2        0.3
3        0.4
4        0.6
5        0.8
6        0.9

这不会发生,我得到这样的结果:


n   fraction significant
1        0.1
2        0.0
3        0.2
4        0.4
5        0.0
6        0.1

我认为这是因为最初当我 运行 我的循环和 rnorm 时,它只存储一个值作为每个初始样本大小的“样本”:例如对于初始样本大小 n=1 和“exp”=1(这些是我的迭代),我得到 1 个随机生成的样本值,这是合乎逻辑的。但是对于 n=9 和 "exp=1" 我也只得到一个值而不是 9 个值。在这种情况下,我应该将 9 个值放入方差分析和 Tukey 中,可能会产生更多的“exp”迭代,从而导致显着差异。我认为如果我设法实现这样的目标应该可行,但我不知道如何管理,我尝试了 3 天但我被卡住了:

n    sample pop 1         sample pop 2
1    200.4                270.1           compare by ANOVA and Tukey -> p value
1    190.3                234.7           compare by ANOVA and Tukey -> p value
2.   [200.2 184.6]        [260.2 265.3]   compare by ANOVA and Tukey -> p value
2.   [230.1 222.7]        [280.1 204.6]   compare by ANOVA and Tukey -> p value
3.   [270.3 230.1 201.3]  [232.2 222.1]   compare by ANOVA and Tukey -> p value
#etc... 
#(Right now 10 times per n-value, but ultimately I want to do 10.000 iterations)

下面是我做的脚本,有经验的请见谅;)

{
  library(tidyverse)
  library(dplyr)
  library(reshape2)
  library(multcomp)
  library(stringr)
}
rm(list=ls())
{
  mu <- 200
  sd <- 30
  n_array_1 <- c()
  exp_array_1 <- c()
  pop_sample_array_1 <- c()
  ID_array_1 <- "population1"
  for (n_1 in c(1:9)) {
    for (exp_1 in c(1:10)) {
      n_array_1 <- c(n_array_1, n_1)
      exp_array_1 <- c(exp_array_1, exp_1)
      population_sample_1 <- rnorm(n_1, mu, sd)
      pop_sample_array_1 <- c(pop_sample_array_1, population_sample_1)
      
    }
}

    mu_2 <- 260
    sd_2 <- 30
    n_array_2 <- c()
    exp_array_2 <- c()
    pop_sample_array_2 <- c()
    ID_array_2 <- "population2"
   
    for (n_2 in c(1:9)) {
      for (exp_2 in c(1:10)) {
        n_array_2 <- c(n_array_2, n_2)
        exp_array_2 <- c(exp_array_2, exp_2)
        population_sample_2 <- rnorm(n_2, mu_2, sd_2)
        pop_sample_array_2 <- c(pop_sample_array_2, population_sample_2)
      
        
      }
}
  
  df1 <- data.frame(n=n_array_1, exp=exp_array_1, sample=pop_sample_array_1, ID=ID_array_1)
  df2 <- data.frame(n=n_array_2, exp=exp_array_2, sample=pop_sample_array_2, ID=ID_array_2)
}

combineddf <- rbind(df1, df2)
view(combineddf)
#
## Running an ANOVA comparing the 2 groups specified
{
  combineddf$IDcombined <- as.factor(paste(combineddf$IDcombined, combineddf$ID, combineddf$n, combineddf$exp,  sep = "_"))
  combineddf$IDcombined <- as.factor(combineddf$IDcombined)
  combineddf
  resaov <- aov(sample ~ IDcombined, data=combineddf)
  test <- TukeyHSD(resaov)
  TK <- test
  TK_data<-as.data.frame(TK[1:1])
  TK_data$IDcombined <- rownames(TK_data)
}
{
LC1 <- separate(TK_data, col=IDcombined, into = c("A", "B"), sep = "-")
LC2 <- separate(LC1, col = A, into=c("ignore", "comparison_factor_1", "sample_size", "iteration"), sep="_")
LC3 <- separate(LC2, col=B, into=c("ignore", "comparison_factor_2", "sample_size_2", "iteration_2"), sep="_")
  }
{
  results <- dplyr::select(LC3, comparison_factor_1, comparison_factor_2, iteration, iteration_2, sample_size, sample_size_2, IDcombined.p.adj)
results <- dplyr::filter(results, comparison_factor_1=="population1" & comparison_factor_2=="population2" |  comparison_factor_1=="population2" & comparison_factor_2=="population1")
results <- dplyr::filter(results, sample_size == sample_size_2 & iteration == iteration_2)
}

#
## calculations of fractions significant p-values of a tukey test comparing the 2 populations based on the different sample sizes

results$significant <- results$IDcombined.p.adj < 0.05
for (sample_size in unique(results$sample_size)) {
  subset_results <- results[results$sample_size == sample_size, ]
  n_significant <- sum(subset_results$significant)
  n_total <- nrow(subset_results)
  fraction_significant <- n_significant / n_total
  cat(sample_size, " ", fraction_significant, "\n")
}

添加另一个“for”循环(带有“replicates”)解决了我的问题。现在我只需要将“cat”函数的输出存储在 table 或数据框中,我可以扩展脚本(添加更多组进行比较)脚本如下:

{
  library(tidyverse)
  library(dplyr)
  library(reshape2)
  library(multcomp)
  library(stringr)
}
rm(list=ls())
#
## Syncom C - (t=10)
{
  {
    mu_1 <- 628
    sd_1 <- 130
    n_array_1 <- c()
    exp_array_1 <- c()
    pop_sample_array_1 <- c()
    ID_array_1 <- "population1"
    for (n_1 in c(1:20)) {
      for (exp_1 in c(1:30)) {
        population_sample_1 <- rnorm(n_1, mu_1, sd_1)
        for (replicate_1 in population_sample_1) {
          n_array_1 <- c(n_array_1, n_1)
          exp_array_1 <- c(exp_array_1, exp_1)
          pop_sample_array_1 <- c(pop_sample_array_1, replicate_1)
        }
      }
    }
  }
}
#
## Control - (t=10)
{
  {
    mu_2 <- 423
    sd_2 <- 96
    n_array_2 <- c()
    exp_array_2 <- c()
    pop_sample_array_2 <- c()
    ID_array_2 <- "population2"
    for (n_2 in c(1:20)) {
      for (exp_2 in c(1:30)) {
        population_sample_2 <- rnorm(n_2, mu_2, sd_2)
        for (replicate_2 in population_sample_2) {
          n_array_2 <- c(n_array_2, n_2)
          exp_array_2 <- c(exp_array_2, exp_2)
          pop_sample_array_2 <- c(pop_sample_array_2, replicate_2)
        }
      }
    }
  }
}
{  
  df1 <- data.frame(n=n_array_1, exp=exp_array_1, sample=pop_sample_array_1, ID=ID_array_1)
  df2 <- data.frame(n=n_array_2, exp=exp_array_2, sample=pop_sample_array_2, ID=ID_array_2)
}

combineddf <- rbind(df1, df2)
#view(combineddf)
#
## Running an ANOVA comparing the 2 groups specified
{
  combineddf$IDcombined <- as.factor(paste(combineddf$IDcombined, combineddf$ID, combineddf$n, combineddf$exp,  sep = "_"))
  combineddf$IDcombined <- as.factor(combineddf$IDcombined)
  combineddf
  resaov <- aov(sample ~ IDcombined, data=combineddf)
  test <- TukeyHSD(resaov)
  TK <- test
  TK_data<-as.data.frame(TK[1:1])
  TK_data$IDcombined <- rownames(TK_data)
}
{
LC1 <- separate(TK_data, col=IDcombined, into = c("A", "B"), sep = "-")
LC2 <- separate(LC1, col = A, into=c("ignore", "comparison_factor_1", "sample_size", "iteration"), sep="_")
LC3 <- separate(LC2, col=B, into=c("ignore", "comparison_factor_2", "sample_size_2", "iteration_2"), sep="_")
  }
{
  results <- dplyr::select(LC3, comparison_factor_1, comparison_factor_2, iteration, iteration_2, sample_size, sample_size_2, IDcombined.p.adj)
results <- dplyr::filter(results, comparison_factor_1=="population1" & comparison_factor_2=="population2" |  comparison_factor_1=="population2" & comparison_factor_2=="population1")
results <- dplyr::filter(results, sample_size == sample_size_2 & iteration == iteration_2)
}

#
## calculations of fractions significant p-values of a tukey test comparing the 2 populations based on the different sample sizes

results$significant <- results$IDcombined.p.adj < 0.05
for (sample_size in unique(results$sample_size)) {
  subset_results <- results[results$sample_size == sample_size, ]
  n_significant <- sum(subset_results$significant)
  n_total <- nrow(subset_results)
  fraction_significant <- n_significant / n_total
  cat(sample_size, " ", fraction_significant, "\n")
}