如何将列表元素提取到 r 中的多个 tibble 列中?
How to extract list elements into multiple tibble columns in r?
我有一个非常大的 tibble 形式的数据集。我想使用 return 列出的一些函数来总结数据。我对列表中的几个组件感兴趣,我想 return 我需要的每个组件都放入新的 tibble 列中。
这是一个例子
library(tibble)
library(dplyr)
# Create a data set of 1,000 random values in 100 subgroups with sample size 10
contrived_data <- tibble(subgroup = rep(1:100, each = 10),
value = rnorm(1000, mean = 5, sd = 1))
# Run the KS test vs. normal distribution on each sample of size 10. Return the KS statistic and p-value
# into new tibble columns
contrived_data %>% group_by(subgroup) %>%
summarize(avg = mean(value),
std_dev = sd(value),
ks_stat = ks.test(value, "pnorm", mean = 5, sd = 1)$statistic,
ks_pval = ks.test(value, "pnorm", mean = 5, sd = 1)$p.value)
运行 这样可以得到我想要的结果,但是效率不高。调用 ks.test
函数两次意味着执行时间(几乎)加倍。似乎必须有一种更有效的方法来通过单个函数调用提取这两个列表组件,但我不知道该怎么做。
测试可以是运行一次并包裹在list
中,然后使用map
(来自purrr
)提取值
library(purrr)
library(dplyr)
library(tidyr)
contrived_data %>%
group_by(subgroup) %>%
summarize(avg = mean(value),
std_dev = sd(value),
test = list(ks.test(value, "pnorm", mean = 5, sd = 1))) %>%
mutate(out = map(test, ~ tibble(ks_stat = .x$statistic,
ks_pval = .x$p.value))) %>%
unnest_wider(c(out)) %>%
select(-test)
# A tibble: 100 x 5
# subgroup avg std_dev ks_stat ks_pval
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 4.52 0.675 0.375 0.0907
# 2 2 5.17 1.02 0.342 0.152
# 3 3 5.02 0.909 0.141 0.972
# 4 4 5.08 0.846 0.313 0.227
# 5 5 4.82 0.819 0.225 0.614
# 6 6 5.07 0.866 0.159 0.928
# 7 7 4.94 0.914 0.145 0.966
# 8 8 5.52 1.01 0.290 0.306
# 9 9 5.17 0.787 0.258 0.443
#10 10 4.61 1.15 0.476 0.0132
# … with 90 more rows
另一种选择是 tidy
输出(使用 broom
)并一次提取所有组件
library(broom)
contrived_data %>%
group_by(subgroup) %>%
summarize(avg = mean(value),
std_dev = sd(value),
out = list(tidy(ks.test(value, "pnorm", mean = 5, sd = 1)))) %>%
unnest_wider(c(out))
使用 rowwise
命令的 dplyr
解决方案,它执行与 map
相同的任务。
contrived_data %>%
group_by(subgroup) %>%
summarise(
avg = mean(value),
std_dev = sd(value),
ks_test = list(ks.test(value,"pnorm",mean=5,sd=1))
) %>%
ungroup() %>%
rowwise() %>%
mutate(
ks_stat = ks_test$statistic,
ks_pval = ks_test$p.value
) %>%
ungroup() %>%
select(-ks_test)
# A tibble: 100 x 5
# subgroup avg std_dev ks_stat ks_pval
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 5.10 1.24 0.186 0.819
# 2 2 4.86 0.805 0.231 0.584
# 3 3 5.24 0.729 0.258 0.445
# 4 4 5.16 0.642 0.307 0.247
# 5 5 4.63 0.752 0.393 0.0664
# Benchmark using rbenchmark:
# test replications elapsed relative user.self sys.self user.child sys.child
#2 nested 1000 10.58 1.000 10.58 0 NA NA
#1 original 1000 16.75 1.583 16.73 0 NA NA
您可以定义函数并使用来自 purrr 的地图:
library(tibble)
library(dplyr)
library(purrr)
func = function(DA){
kstest = ks.test(DA$value, "pnorm", mean = 5, sd = 1)
data.frame(
subgroup = unique(DA$subgroup),
avg=mean(DA$value),
std_dev = sd(DA$value),
ks_stat = kstest$statistic,
ks_pval = kstest$p.value)
}
contrived_data %>%
split(.$subgroup) %>%
map_dfr(func)
您可以使用group_modify
library(tidyverse)
contrived_data %>%
group_by(subgroup) %>%
group_modify(~{
ks <- ks.test(.$value, "pnorm", mean = 5, sd = 1)
tibble(
avg = mean(.$value),
std_dev = sd(.$value),
ks_stat = ks$statistic,
ks_pval = ks$p.value)
})
或 data.table
library(data.table)
setDT(contrived_data)
contrived_data[, {
ks <- ks.test(value, "pnorm", mean = 5, sd = 1)
.(avg = mean(value),
std_dev = sd(value),
ks_stat = ks$statistic,
ks_pval = ks$p.value)
}, by = subgroup]
我有一个非常大的 tibble 形式的数据集。我想使用 return 列出的一些函数来总结数据。我对列表中的几个组件感兴趣,我想 return 我需要的每个组件都放入新的 tibble 列中。
这是一个例子
library(tibble)
library(dplyr)
# Create a data set of 1,000 random values in 100 subgroups with sample size 10
contrived_data <- tibble(subgroup = rep(1:100, each = 10),
value = rnorm(1000, mean = 5, sd = 1))
# Run the KS test vs. normal distribution on each sample of size 10. Return the KS statistic and p-value
# into new tibble columns
contrived_data %>% group_by(subgroup) %>%
summarize(avg = mean(value),
std_dev = sd(value),
ks_stat = ks.test(value, "pnorm", mean = 5, sd = 1)$statistic,
ks_pval = ks.test(value, "pnorm", mean = 5, sd = 1)$p.value)
运行 这样可以得到我想要的结果,但是效率不高。调用 ks.test
函数两次意味着执行时间(几乎)加倍。似乎必须有一种更有效的方法来通过单个函数调用提取这两个列表组件,但我不知道该怎么做。
测试可以是运行一次并包裹在list
中,然后使用map
(来自purrr
)提取值
library(purrr)
library(dplyr)
library(tidyr)
contrived_data %>%
group_by(subgroup) %>%
summarize(avg = mean(value),
std_dev = sd(value),
test = list(ks.test(value, "pnorm", mean = 5, sd = 1))) %>%
mutate(out = map(test, ~ tibble(ks_stat = .x$statistic,
ks_pval = .x$p.value))) %>%
unnest_wider(c(out)) %>%
select(-test)
# A tibble: 100 x 5
# subgroup avg std_dev ks_stat ks_pval
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 4.52 0.675 0.375 0.0907
# 2 2 5.17 1.02 0.342 0.152
# 3 3 5.02 0.909 0.141 0.972
# 4 4 5.08 0.846 0.313 0.227
# 5 5 4.82 0.819 0.225 0.614
# 6 6 5.07 0.866 0.159 0.928
# 7 7 4.94 0.914 0.145 0.966
# 8 8 5.52 1.01 0.290 0.306
# 9 9 5.17 0.787 0.258 0.443
#10 10 4.61 1.15 0.476 0.0132
# … with 90 more rows
另一种选择是 tidy
输出(使用 broom
)并一次提取所有组件
library(broom)
contrived_data %>%
group_by(subgroup) %>%
summarize(avg = mean(value),
std_dev = sd(value),
out = list(tidy(ks.test(value, "pnorm", mean = 5, sd = 1)))) %>%
unnest_wider(c(out))
使用 rowwise
命令的 dplyr
解决方案,它执行与 map
相同的任务。
contrived_data %>%
group_by(subgroup) %>%
summarise(
avg = mean(value),
std_dev = sd(value),
ks_test = list(ks.test(value,"pnorm",mean=5,sd=1))
) %>%
ungroup() %>%
rowwise() %>%
mutate(
ks_stat = ks_test$statistic,
ks_pval = ks_test$p.value
) %>%
ungroup() %>%
select(-ks_test)
# A tibble: 100 x 5
# subgroup avg std_dev ks_stat ks_pval
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 5.10 1.24 0.186 0.819
# 2 2 4.86 0.805 0.231 0.584
# 3 3 5.24 0.729 0.258 0.445
# 4 4 5.16 0.642 0.307 0.247
# 5 5 4.63 0.752 0.393 0.0664
# Benchmark using rbenchmark:
# test replications elapsed relative user.self sys.self user.child sys.child
#2 nested 1000 10.58 1.000 10.58 0 NA NA
#1 original 1000 16.75 1.583 16.73 0 NA NA
您可以定义函数并使用来自 purrr 的地图:
library(tibble)
library(dplyr)
library(purrr)
func = function(DA){
kstest = ks.test(DA$value, "pnorm", mean = 5, sd = 1)
data.frame(
subgroup = unique(DA$subgroup),
avg=mean(DA$value),
std_dev = sd(DA$value),
ks_stat = kstest$statistic,
ks_pval = kstest$p.value)
}
contrived_data %>%
split(.$subgroup) %>%
map_dfr(func)
您可以使用group_modify
library(tidyverse)
contrived_data %>%
group_by(subgroup) %>%
group_modify(~{
ks <- ks.test(.$value, "pnorm", mean = 5, sd = 1)
tibble(
avg = mean(.$value),
std_dev = sd(.$value),
ks_stat = ks$statistic,
ks_pval = ks$p.value)
})
或 data.table
library(data.table)
setDT(contrived_data)
contrived_data[, {
ks <- ks.test(value, "pnorm", mean = 5, sd = 1)
.(avg = mean(value),
std_dev = sd(value),
ks_stat = ks$statistic,
ks_pval = ks$p.value)
}, by = subgroup]