如何将列表元素提取到 r 中的多个 tibble 列中?

How to extract list elements into multiple tibble columns in r?

我有一个非常大的 tibble 形式的数据集。我想使用 return 列出的一些函数来总结数据。我对列表中的几个组件感兴趣,我想 return 我需要的每个组件都放入新的 tibble 列中。

这是一个例子

library(tibble)
library(dplyr)

# Create a data set of 1,000 random values in 100 subgroups with sample size 10
contrived_data <- tibble(subgroup = rep(1:100, each = 10),
                         value    = rnorm(1000, mean = 5, sd = 1))


# Run the KS test vs. normal distribution on each sample of size 10. Return the KS statistic and p-value
# into new tibble columns
contrived_data %>% group_by(subgroup) %>%
  summarize(avg     = mean(value),
            std_dev = sd(value),
            ks_stat = ks.test(value, "pnorm", mean = 5, sd = 1)$statistic,
            ks_pval = ks.test(value, "pnorm", mean = 5, sd = 1)$p.value)

运行 这样可以得到我想要的结果,但是效率不高。调用 ks.test 函数两次意味着执行时间(几乎)加倍。似乎必须有一种更有效的方法来通过单个函数调用提取这两个列表组件,但我不知道该怎么做。

测试可以是运行一次并包裹在list中,然后使用map(来自purrr)提取值

library(purrr)
library(dplyr)
library(tidyr)
contrived_data %>% 
      group_by(subgroup) %>%
      summarize(avg     = mean(value),
                std_dev = sd(value), 
            test = list(ks.test(value, "pnorm", mean = 5, sd = 1))) %>%
      mutate(out = map(test, ~  tibble(ks_stat = .x$statistic,
                      ks_pval = .x$p.value))) %>%
      unnest_wider(c(out)) %>%
      select(-test)
# A tibble: 100 x 5
#   subgroup   avg std_dev ks_stat ks_pval
#      <int> <dbl>   <dbl>   <dbl>   <dbl>
# 1        1  4.52   0.675   0.375  0.0907
# 2        2  5.17   1.02    0.342  0.152 
# 3        3  5.02   0.909   0.141  0.972 
# 4        4  5.08   0.846   0.313  0.227 
# 5        5  4.82   0.819   0.225  0.614 
# 6        6  5.07   0.866   0.159  0.928 
# 7        7  4.94   0.914   0.145  0.966 
# 8        8  5.52   1.01    0.290  0.306 
# 9        9  5.17   0.787   0.258  0.443 
#10       10  4.61   1.15    0.476  0.0132
# … with 90 more rows

另一种选择是 tidy 输出(使用 broom)并一次提取所有组件

library(broom)
contrived_data %>% 
       group_by(subgroup) %>%
       summarize(avg     = mean(value),
                 std_dev = sd(value), 
                 out = list(tidy(ks.test(value, "pnorm", mean = 5, sd = 1)))) %>%
       unnest_wider(c(out))

使用 rowwise 命令的 dplyr 解决方案,它执行与 map 相同的任务。

contrived_data %>%
      group_by(subgroup) %>%
      summarise(
        avg = mean(value),
        std_dev = sd(value),
        ks_test = list(ks.test(value,"pnorm",mean=5,sd=1))
      ) %>%
      ungroup() %>%
      rowwise() %>%
      mutate(
        ks_stat = ks_test$statistic,
        ks_pval = ks_test$p.value
      ) %>%
      ungroup() %>%
      select(-ks_test)

# A tibble: 100 x 5
#   subgroup   avg std_dev ks_stat ks_pval
#      <int> <dbl>   <dbl>   <dbl>   <dbl>
# 1        1  5.10   1.24    0.186  0.819 
# 2        2  4.86   0.805   0.231  0.584 
# 3        3  5.24   0.729   0.258  0.445 
# 4        4  5.16   0.642   0.307  0.247 
# 5        5  4.63   0.752   0.393  0.0664

# Benchmark using rbenchmark:
#      test replications elapsed relative user.self sys.self user.child sys.child
#2   nested         1000   10.58    1.000     10.58        0         NA        NA
#1 original         1000   16.75    1.583     16.73        0         NA        NA

您可以定义函数并使用来自 purrr 的地图:

library(tibble)
library(dplyr)
library(purrr)

func = function(DA){
kstest = ks.test(DA$value, "pnorm", mean = 5, sd = 1)
data.frame(
subgroup = unique(DA$subgroup),
avg=mean(DA$value),
std_dev = sd(DA$value),
ks_stat = kstest$statistic,
ks_pval = kstest$p.value)
}

contrived_data %>% 
split(.$subgroup) %>%
map_dfr(func)

您可以使用group_modify

library(tidyverse)

contrived_data %>% 
  group_by(subgroup) %>% 
  group_modify(~{
      ks <- ks.test(.$value, "pnorm", mean = 5, sd = 1)
      tibble(
        avg = mean(.$value), 
        std_dev = sd(.$value),
        ks_stat = ks$statistic,
        ks_pval = ks$p.value) 
  })

或 data.table

library(data.table)
setDT(contrived_data)

contrived_data[, {
  ks <- ks.test(value, "pnorm", mean = 5, sd = 1)
  .(avg = mean(value), 
    std_dev = sd(value),
    ks_stat = ks$statistic,
    ks_pval = ks$p.value) 
}, by = subgroup]