使用列表 运行 一组代码进行多项统计测试

Using lists to run multiple statistics test with one set of code

我有兴趣使用列表 运行 通过一组代码进行多项统计测试。

例如,我想 运行 glm() 测试根据数据框/列表中的行在 DV、IV、数据和系列方面有所不同。我可以长期执行此操作,并且可以使用 lapply() 以“中等方式”执行此操作,以便我可以更改测试中使用的 DV。但我想知道是否有一种方法{最好使用 lapply()} 以更少的代码和更 automated/iterative 的方式完成此任务。

对于示例数据,我使用 ggplot2::diamonds 数据和以下代码创建了 2 个数据集:

### for dataset with top 300 rows
# ---- NOTE: selects only the top 300 rows of the dataset
diamonds_top300 <- data.frame(dplyr::top_n(diamonds, 300, table))
# ---- NOTE: gives dataset info
head(diamonds_top300)
str(diamonds_top300)
colnames(diamonds_top300)
nrow(diamonds_top300)
# ---- NOTE: gives unique values of Fixed and Random effects, and dvs
unique(diamonds_top300$price)
unique(diamonds_top300$y)
unique(diamonds_top300$cut)
unique(diamonds_top300$color)
unique(diamonds_top300$carat)
unique(diamonds_top300$clarity)
unique(diamonds_top300$depth)
unique(diamonds_top300$table)

### for dataset with bottom 300 rows
# ---- NOTE: selects only the bottom 300 rows of the dataset
diamonds_bottom300 <- data.frame(dplyr::top_n(diamonds, -300, table))
# ---- NOTE: gives dataset info
head(diamonds_bottom300)
str(diamonds_bottom300)
colnames(diamonds_bottom300)
nrow(diamonds_bottom300)
# ---- NOTE: gives unique values of Fixed and Random effects, and dvs
unique(diamonds_bottom300$price)
unique(diamonds_bottom300$y)
unique(diamonds_bottom300$cut)
unique(diamonds_bottom300$color)
unique(diamonds_bottom300$carat)
unique(diamonds_bottom300$clarity)
unique(diamonds_bottom300$depth)
unique(diamonds_bottom300$table)

然后我使用这些数据创建了一个包含列表信息的数据框,并得到了这些结果:

## creates df with variable info
model_variable_df <-
  data.frame(
    cbind(
      DV_name = c("carat", "depth", "price"),
      DV_label = c("carat size", "depth size", "diamond price"),
      dataset_name = c("diamonds_bottom300", "diamonds_bottom300", "diamonds_top300"),
      IV_name = c("x + y + color", "x + y + clarity", "x + z + color"),
      family = c("poisson", "poisson", "gaussian")
    )
  )

> model_variable_df
  DV_name      DV_label       dataset_name         IV_name   family
1   carat    carat size diamonds_bottom300   x + y + color  poisson
2   depth    depth size diamonds_bottom300 x + y + clarity  poisson
3   price diamond price    diamonds_top300   x + z + color gaussian

我可以使用 long 方法完成我的任务:

## long form of 3 models

### creates first model
freq_glm_poisson_carat <- 
  (
  glm(
    carat ~ x + y + color,
             data = diamonds_bottom300, 
             family = poisson()
           )
  )

### creates 2nd model
freq_glm_binomial_depth <- 
  glm(
    depth ~ x + y + clarity,
    data = diamonds_bottom300, 
    family= poisson()
  )

### creates 3rd model
freq_glm_gaussian_price <- 
  glm(
    price ~ x + z + color,
    data = diamonds_top300, 
    family= gaussian()
  )

我也可以使用媒介方法来完成更具体和有限的 DV_name 任务。

## model that uses lapply, and just values DV
# ---- NOTE: DV_name is the only thing that changes
# ---- NOTE: IVs / effects (fixed and random) are the same as x + y + color, or model 1
# ---- NOTE: data = diamonds_top300 only
# ---- NOTE: family = poisson() only
# ---- NOTE: creates list object
freq_checking_mlm_poisson_Effects_x_y_color_1z_model <- 
  lapply(model_variable_df$DV_name,
         function(DV_list) wrapr::let(
           c(DV_col = DV_list, 
             dataset_obj = "diamonds_top300"),
           glm(
             DV_col ~ x + y + color, 
             data = dataset_obj,
             family = poisson()
             )
         )
  )
# ---- NOTE: changes list object name
freq_checking_mlm_poisson_Effects_x_y_color_1z_model <- 
  setNames(freq_checking_mlm_poisson_Effects_x_y_color_1z_model, paste("freq_checking_mlm_poisson_Effects_x_y_color_1z_model_contracts_filter", 
                                                                       model_variable_df$DV_name,
                                                                       sep = "__")
  )
# ---- NOTE: creates unique objects for each part list object
list2env(freq_checking_mlm_poisson_Effects_x_y_color_1z_model, .GlobalEnv)
# ---- NOTE: gathers objects with prefix
apropos("freq_checking_mlm_poisson_Effects_x_y_color_1z_model_contracts_filter")

有什么方法可以让我使用 (1) 更少的代码和 (2) 更多的代码来完成这项任务 iteration/automation?非常感谢任何帮助。

仅供参考,我在 2013 Intel Macbook Pro 上使用 RStudio。

谢谢。



练习代码:

# sets up data

## Loads packages
# ---- NOTE: making plots and diamonds dataset
if(!require(ggplot2)){install.packages("ggplot2")}
# ---- NOTE: run mixed effects models
if(!require(lme4)){install.packages("lme4")}
# ---- NOTE: for data wrangling
if(!require(dplyr)){install.packages("dplyr")}
# ---- NOTE: for iteration
if(!require(wrapr)){install.packages("wrapr")}

## dataset creation

### for dataset with top 300 rows
# ---- NOTE: selects only the top 300 rows of the dataset
diamonds_top300 <- data.frame(dplyr::top_n(diamonds, 300, table))
# ---- NOTE: gives dataset info
head(diamonds_top300)
str(diamonds_top300)
colnames(diamonds_top300)
nrow(diamonds_top300)
# ---- NOTE: gives unique values of Fixed and Random effects, and dvs
unique(diamonds_top300$price)
unique(diamonds_top300$y)
unique(diamonds_top300$cut)
unique(diamonds_top300$color)
unique(diamonds_top300$carat)
unique(diamonds_top300$clarity)
unique(diamonds_top300$depth)
unique(diamonds_top300$table)

### for dataset with bottom 300 rows
# ---- NOTE: selects only the bottom 300 rows of the dataset
diamonds_bottom300 <- data.frame(dplyr::top_n(diamonds, -300, table))
# ---- NOTE: gives dataset info
head(diamonds_bottom300)
str(diamonds_bottom300)
colnames(diamonds_bottom300)
nrow(diamonds_bottom300)
# ---- NOTE: gives unique values of Fixed and Random effects, and dvs
unique(diamonds_bottom300$price)
unique(diamonds_bottom300$y)
unique(diamonds_bottom300$cut)
unique(diamonds_bottom300$color)
unique(diamonds_bottom300$carat)
unique(diamonds_bottom300$clarity)
unique(diamonds_bottom300$depth)
unique(diamonds_bottom300$table)

## creates df with variable info
model_variable_df <-
  data.frame(
    cbind(
      DV_name = c("carat", "depth", "price"),
      DV_label = c("carat size", "depth size", "diamond price"),
      dataset_name = c("diamonds_bottom300", "diamonds_bottom300", "diamonds_top300"),
      IV_name = c("x + y + color", "x + y + clarity", "x + z + color"),
      family = c("poisson", "poisson", "gaussian")
    )
  )

## long for of 3 models

### creates first model
freq_glm_poisson_carat <- 
  (
  glm(
    carat ~ x + y + color,
             data = diamonds_bottom300, 
             family = poisson()
           )
  )

### creates 2nd model
freq_glm_binomial_depth <- 
  glm(
    depth ~ x + y + clarity,
    data = diamonds_bottom300, 
    family= poisson()
  )

### creates 3rd model
freq_glm_gaussian_price <- 
  glm(
    price ~ x + z + color,
    data = diamonds_top300, 
    family= gaussian()
  )

## model that uses lapply, and just values DV
# ---- NOTE: DV_name is the only thing that changes
# ---- NOTE: IVs / effects (fixed and random) are the same as x + y + color, or model 1
# ---- NOTE: data = diamonds_top300 only
# ---- NOTE: family = poisson() only
# ---- NOTE: creates list object
freq_checking_mlm_poisson_Effects_x_y_color_1z_model <- 
  lapply(model_variable_df$DV_name,
         function(DV_list) wrapr::let(
           c(DV_col = DV_list, 
             dataset_obj = "diamonds_top300"),
           glm(
             DV_col ~ x + y + color, 
             data = dataset_obj,
             family = poisson()
             )
         )
  )
# ---- NOTE: changes list object name
freq_checking_mlm_poisson_Effects_x_y_color_1z_model <- 
  setNames(freq_checking_mlm_poisson_Effects_x_y_color_1z_model, paste("freq_checking_mlm_poisson_Effects_x_y_color_1z_model_contracts_filter", 
                                                                       model_variable_df$DV_name,
                                                                       sep = "__")
  )
# ---- NOTE: creates unique objects for each part list object
list2env(freq_checking_mlm_poisson_Effects_x_y_color_1z_model, .GlobalEnv)
# ---- NOTE: gathers objects with prefix
apropos("freq_checking_mlm_poisson_Effects_x_y_color_1z_model_contracts_filter")

您可以使用 lapply 执行此操作:

lapply(seq(nrow(model_variable_df)), function(i) {
  val <- model_variable_df[i, ]
  glm(as.formula(paste(val$DV_name, val$IV_name, sep = '~')), 
      data = get(val$dataset_name), family = val$family)
}) -> model_list

as.formula用于将字符串转换为公式,get用于从字符串值中获取数据集。