在 glm 的摘要输出中提取 NA 值

Question

我是运行对大型数据框的许多不同子集进行逻辑回归。为此，我使用了以下代码（使用 dplyr 和 purrr）：

# define model to be run
mod_fun <- function(df) {
  glm(presence ~ transect, data = df, family = "binomial")
}

# nest data and run model
mod.glm <- dat %>%
  nest(-c(region, fYear, species, road)) %>%
  mutate(model = map(data, mod_fun))

# define functions to extract model coefficients
b_fun <- function(mod) {
  coef(summary(mod))[2]
}
p_fun <- function(mod) {
  coef(summary(mod))[8]
}

# extract coefficients
slope<-mod.glm %>% group_by(species, region, fYear, road) %>%
  transmute(beta = map_dbl(model, b_fun),
            p_val = map_dbl(model, p_fun))

如您所见，我只想提取斜率的估计值和 p 值（称为 transect）。为此，我使用索引 coef(summary(mod))[2] 等。问题是我的数据框中也有一些子集导致过度确定的系统，其中一些系数将设置为 NA。使用 coef(summary(mod))[2] 提取 coef() 输出的第二个值，并且由于在 coef() 中忽略了 NA，这将不再是我要提取的 transect 的估计值。到目前为止，我尝试取消 coef(summary(mod_2), complete = TRUE)（--> 没有任何变化，NA 仍然没有显示）并直接对值进行寻址 coef(summary(mod_2), complete = TRUE)["transect","Estimate"]（--> 抛出错误）。有谁知道我该如何解决这个问题？

到目前为止我尝试了什么：

# two example models; mod_2 will result in NAs
mod_1 <- glm(presence ~ transect, data = dat[dat$fYear == 1&  dat$species=="Plantago lanceolata",], family = "binomial")
mod_2 <- glm(presence ~ transect, data = dat[dat$fYear == 2&  dat$species=="Plantago lanceolata",], family = "binomial")

coef(summary(mod_1))[2] # works fine
coef(summary(mod_2))[2] # not the value I want

coef(summary(mod_1), complete = TRUE)["transect","Estimate"] # works fine
coef(summary(mod_2), complete = TRUE)["transect","Estimate"] # error

coef(summary(mod_2), complete = TRUE) # NAs for transect are still not displayed

summary(mod_2)$coefficients["transect","Estimate"] # is not working either

数据：

dput(dat)
structure(list(region = c("HWI", "HWI", "HWI", "HWI", "HWI", 
"HWI", "HWI", "HWI", "HWI", "HWI", "HWI", "HWI", "HWI", "HWI", 
"HWI", "HWI", "HWI", "HWI", "HWI", "HWI", "HWI", "HWI", "HWI", 
"HWI", "HWI", "HWI", "HWI", "HWI", "HWI"), road = c("MK", "MK", 
"MK", "MK", "MK", "MK", "MK", "MK", "MK", "MK", "MK", "MK", "MK", 
"MK", "MK", "MK", "MK", "MK", "MK", "MK", "MK", "MK", "MK", "MK", 
"MK", "MK", "MK", "MK", "MK"), transect = c(1L, 1L, 2L, 2L, 3L, 
3L, 4L, 4L, 4L, 4L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 
11L, 11L, 12L, 12L, 13L, 13L, 15L, 15L, 1L), fYear = c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), species = c("Plantago lanceolata", 
"Poa pratensis", "Plantago lanceolata", "Poa pratensis", "Plantago lanceolata", 
"Poa pratensis", "Plantago lanceolata", "Poa pratensis", "Plantago lanceolata", 
"Poa pratensis", "Plantago lanceolata", "Poa pratensis", "Plantago lanceolata", 
"Poa pratensis", "Plantago lanceolata", "Poa pratensis", "Plantago lanceolata", 
"Poa pratensis", "Plantago lanceolata", "Poa pratensis", "Plantago lanceolata", 
"Poa pratensis", "Plantago lanceolata", "Poa pratensis", "Plantago lanceolata", 
"Poa pratensis", "Plantago lanceolata", "Poa pratensis", "Plantago lanceolata"
), presence = c(1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -29L), groups = structure(list(
    fYear = c(1L, 1L, 2L), road = c("MK", "MK", "MK"), species = c("Plantago lanceolata", 
    "Poa pratensis", "Plantago lanceolata"), .rows = list(c(1L, 
    3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L
    ), c(2L, 4L, 6L, 8L, 10L, 12L, 14L, 16L, 18L, 20L, 22L, 24L, 
    26L, 28L), 29L)), row.names = c(NA, -3L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE))

感谢您的帮助！

Answer 1

我不知道如何从系数 table 中提取 NA 行。相反，答案可能是在提取所需元素后添加 NA 行。这可以用 complete() 来完成。

在这个例子中，我使用 broom::tidy() 因为它可以很容易地获得系数、不确定性的度量和测试结果。这意味着我必须在事后过滤掉拦截，但是如果您在 complete() on.

上添加一列，您当然可以对您的函数执行类似的操作

library(purrr)
library(tidyr)
library(dplyr)

mod.glm %>%
     group_by(species, region, fYear, road) %>%
     transmute(results = map(model, broom::tidy) ) %>%
     unnest(results) %>%
     complete(term = "transect") %>%
     filter(term != "(Intercept)") %>%
     ungroup()

# A tibble: 3 x 9
  species          region fYear road  term    estimate std.error statistic p.value
  <chr>            <chr>  <int> <chr> <chr>      <dbl>     <dbl>     <dbl>   <dbl>
1 Plantago lanceo~ HWI        1 MK    transe~  -42.7   31127.     -0.00137   0.999
2 Plantago lanceo~ HWI        2 MK    transe~   NA        NA      NA        NA    
3 Poa pratensis    HWI        1 MK    transe~   -0.206     0.188  -1.10      0.272

Answer 2

走一条完全不同的路线，您可以将提取函数更改为 return NA 当它们出错时。这是像 tryCatch() 这样的函数的工作，但我发现 purrr 中的 possibly() 对于这类任务非常方便。

possibly() 环绕一个函数。 otherwise 参数表示如果在使用该函数时发生错误，return 的值。

这是你的两个函数，封装在 possibly() 中。我已将它们更改为专门使用系数摘要的 "transect" 行，因此如果不存在，这将出错。

b_fun <- possibly(
     function(mod) {
          coef(summary(mod))["transect", 1]
          }, otherwise = NA)

p_fun <- possibly(
     function(mod) {
          coef(summary(mod))["transect", 4]
          }, otherwise = NA)

# extract coefficients
mod.glm %>%
     group_by(species, region, fYear, road) %>%
     transmute(beta = map_dbl(model, b_fun),
               p_val = map_dbl(model, p_fun) ) %>%
     ungroup() 

# A tibble: 3 x 6
  species             region fYear road     beta  p_val
  <chr>               <chr>  <int> <chr>   <dbl>  <dbl>
1 Plantago lanceolata HWI        1 MK    -42.7    0.999
2 Poa pratensis       HWI        1 MK     -0.206  0.272
3 Plantago lanceolata HWI        2 MK     NA     NA

在 glm 的摘要输出中提取 NA 值

Extract NA Values in Summary Output of glm

r

glm

na

purrr