Purrr 映射到拆分的数据帧上以获得每个组的 AUROC

Purrr map over splited dataframe to get AUROC for each group

我已经在这里问过这个问题 (Purrr map over splitted training-dataframe to get auroc for each model),但是数据真的很糟糕,这个问题可能有点令人困惑。另外我找到了解决这个问题的方法,但感觉不是一个好方法。

所以问题如下。我根据 geology 列拆分数据框。然后,我想使用训练数据框中的其他两列来预测 trigger 列。我想为每个地质学(列表的每个元素 - 数据框)执行此操作。 然后我想在新的测试数据帧上预测这个模型并计算每个地质学的 AUROC。

这里是训练数据:

# the training data
structure(list(p3 = c(5, 0, 0, 2, 0, 0, 0, 2.8999999165535, 0, 
0, 0, 0, 17.5999999046326, 0, 0, 0.899999976158142, 10.0000000149012, 
1.79999995231628, 0, 8.80000019073486, 0, 0, 11.8999999761581, 
0, 9.60000026226044, 5.19999980926514, 11.7000002861023, 20.0999999046326, 
34.6000008583069, 0, 8.70000028610229, 33.8000000119209, 2.40000009536743, 
17, 27, 36.7999992370605, 0, 15.8999997973442, 0.300000011920929, 
7.69999980926514, 0, 0.899999976158142, 1.5, 0.300000011920929, 
3.50000002980232, 51.3999991416931, 6.09999990463257, 0.400000005960464, 
22, 65.3000020980835), p15 = c(34.2999999374151, 10.4999997392297, 
8.30000010877848, 69.4999992623925, 1.30000001192093, 0, 62.3999992161989, 
71.8999995738268, 32.6999994888902, 4, 0, 0.400000005960464, 
35.699999935925, 24.1000001206994, 0, 53.8999998271465, 41.8999992236495, 
37.8999994322658, 24.0999998524785, 65.5999999046326, 0, 0, 20.5000002905726, 
75.4000002145767, 68.1999989748001, 45.9000007808208, 180.999998480082, 
70.5000009462237, 112.099999666214, 11.3000001907349, 88.0999987274408, 
103.499998867512, 100.399998664856, 59.9999995827675, 200.699998855591, 
21.2999993562698, 47.1999997496605, 42.1999989748001, 58.6000000834465, 
161.299998879433, 43.3999999314547, 110.899999141693, 73.9000004529953, 
46.7999998703599, 25.7999995350838, 21.1000004559755, 86.100000500679, 
15.8999998569489, 5.3999999538064, 143.399998903275), trigger = c(FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), 
    geology = c("Sedimentary rocks", "Crystalline basement", 
    "Sedimentary rocks", "Sedimentary rocks", "Porphyry", "Porphyry", 
    "Porphyry", "Sedimentary rocks", "Sedimentary rocks", "Crystalline basement", 
    "Calcschists with ophiolites", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Porphyry", "Crystalline basement", 
    "Crystalline basement", "Sedimentary rocks", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Sedimentary rocks", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Porphyry", 
    "Calcschists with ophiolites", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Crystalline basement", 
    "Porphyry", "Crystalline basement", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Calcschists with ophiolites", 
    "Plutonite", "Crystalline basement", "Crystalline basement", 
    "Porphyry", "Sedimentary rocks")), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))

这里是测试数据


structure(list(p3 = c(6.40000009536743, 0, 0, 16.3000003397465, 
0, 0, 0, 0, 1, 1.5, 29.1000003814697, 7.49999982118607, 3.5, 
18.3999996185303, 2.69999990612268, 11.3000001907349, 0, 2, 9.5, 
0, 10.1999998092651, 0, 3.60000005364418, 0, 5.29999995231628, 
112.599998474121, 118.099997758865, 54.8999996185303, 72.8000011444092, 
79.9000015258789, 88.7000015377998, 0, 54.6000022888184, 144.599998474121, 
111.200000762939, 7.10000009834766, 32.0999999046326, 0.5, 5.3999999165535, 
0.300000011920929, 0, 36.7999982833862, 101.599998474121, 121.699998855591, 
31.0999994277954, 66.8000020980835, 139.200000762939, 9.50000011920929, 
135.300003051758, 110.900001525879), p15 = c(12.3999996185303, 
63.8000009655952, 20.7000007629395, 121.299998179078, 10.4000001549721, 
27.1999999880791, 49.5000003874302, 13.3000001907349, 31.3999998569489, 
15.4000002890825, 64.3999997377396, 25.1000001430511, 43.6999994516373, 
50.799999833107, 35.1999998092651, 35.1999998837709, 67.1000003442168, 
19.400000333786, 49.300000667572, 21.3999996706843, 75.600000411272, 
38.700000859797, 30.2999994754791, 14.9000003933907, 53.2000011727214, 
137.900000333786, 0.100000001490116, 119.300001859665, 139.700000107288, 
147.799997329712, 45.3000004068017, 56.5000000670552, 47.7999995946884, 
2.90000009536743, 139.499999403954, 26.6999999284744, 6.5, 149.700001835823, 
210.299998342991, 114.499999642372, 3.60000002384186, 60.099999524653, 
97.5999984890223, 153.100000120699, 245.299996376038, 123.49999922514, 
3.70000004768372, 90.5999985486269, 49.1000001132488, 138.599999785423
), trigger = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE)), row.names = c(NA, -50L), groups = structure(list(
    trigger = c(FALSE, TRUE), .rows = structure(list(1:25, 26:50), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

我现在为每个地质学拟合逻辑回归模型 trigger ~ p3 + 15 并为每个 class 获取 auroc 所做的工作如下:


res = train %>% split(., .$geology) %>%
  map( ~ glm(trigger ~ p3 + p15, data = .x, family = "binomial")) %>%
  map( ~ predict(.x, newdata = test, type = "response")) %>%
  map(function(x) {
    df = data.frame(ref = test$trigger)
    df[["pred"]] = x
    df
  }) %>% map_dfr(function(x) {
   auc = as.numeric(roc(ref ~ pred, data = x)$auc)
  }) %>% pivot_longer(cols = everything(), names_to = "geology", values_to="auc")

但有些部分(function(x){...})我想用更简洁的 purrr 风格的语法替换。我在思考 .x. 以及何时使用 {} 来防止结果传递到小标题时遇到了一些麻烦(在某些时候可能是必要的)。 那么我怎样才能获得相同的结果,但是省略了 function(x) 语法?

library(purrr)
library(tidyr)
library(pROC)

res <- train %>% 
  split(., .$geology) %>%
  map( ~ glm(trigger ~ p3 + p15, data = .x, family = "binomial")) %>%
  map( ~ predict(.x, newdata = test, type = "response")) %>%
  map( ~ data.frame(ref = test$trigger, pred = .x)) %>% 
  map_dfc( ~ as.numeric(roc(ref ~ pred, data = .x)$auc)) %>% 
  pivot_longer(cols = everything(), names_to = "geology", values_to = "auc")

.是从管道传来的数据,以便在函数中使用。 purrr 函数改为提供 .x 参数作为传递给函数的数据。

因为您使用 pivot_longer,每个结果需要一列,所以我使用了 map_dfc。要从 function(x) 样式转换,我认为最好的方法是考虑你想要什么 return,在你的情况下是 data.frame 和一个值,所以你也可以将它写在~ 风格。

有很多方法可以实现这一点,请参阅下面我的方法。但是请注意,在像这样的映射链中使用函数可能是一项非常有用的技术,原则上不应避免。

library(purrr)

train %>%
  split(~ geology) %>%
  map(~ glm(trigger ~ p3 + p15, data = .x, family = "binomial")) %>%
  map(~ predict(.x, newdata = test, type = "response")) %>%
  map(~ data.frame(ref = test$trigger, pred = .x)) %>%
  map(~ pROC::roc(ref ~ pred, data = .x)$auc) %>%
  unlist() %>%
  tibble::tibble(geology = names(.), auc = .)

Returns:

# A tibble: 5 x 2
  geology                       auc
  <chr>                       <dbl>
1 Calcschists with ophiolites 0.794
2 Crystalline basement        0.84 
3 Plutonite                   0.5  
4 Porphyry                    0.864
5 Sedimentary rocks           0.912

您可以使用 group_bysummarise 而不是 split()。起初列表列使用起来有点棘手,但一旦你习惯了它们,它们在很多情况下都非常有用。我会尝试这样的事情:

library(tidyverse)

train %>% 
  group_by(geology) %>% 
  summarise(
    model = list(glm(trigger ~ p3 + p15, data = cur_data(), family = "binomial")),
    yhat = map(model, ~predict(.x, newdata = test, type = "response")),
    auc = map_dbl(yhat, ~pROC::roc(test$trigger, .x)$auc)
  ) %>% 
  select(geology, auc)

## A tibble: 5 x 2
#  geology                       auc
#  <chr>                       <dbl>
# 1 Calcschists with ophiolites 0.794
# 2 Crystalline basement        0.84 
# 3 Plutonite                   0.5  
# 4 Porphyry                    0.864
# 5 Sedimentary rocks           0.912

或者不创建临时列

train %>% 
  group_by(geology) %>% 
  summarise(
    auc = glm(trigger ~ p3 + p15, data = cur_data(), family = "binomial") %>% 
      predict(newdata = test, type = "response") %>% 
      pROC::roc(test$trigger, .) %>% `$`("auc") %>% as.numeric()
  )

reprex package (v1.0.0)

于 2021-06-28 创建