一次按多列拆分或 group_split 数据框
split or group_split dataframe by multiple columns at once
我确定有人问过这个问题,但找不到我要找的东西。同时但分开地 split
(或 group_split
)多个列的最佳方法是什么。
给定:
library(tidyverse)
# Example data: two grouping columns (`group`, `dd`), a counter
# (`test_value`) and a random numeric column (`ff`).
# The seed is fixed so the `ff` values are reproducible.
set.seed(1)
df <- data.frame(
  group = rep(c("A", "B"), times = c(3, 6)),
  test_value = c(0, 1, 2, 0, 1, 2, 3, 4, 5),
  ff = rnorm(n = 9),
  dd = rep(c("C", "D"), times = c(5, 4))
)
df
# group test_value ff dd
# 1 A 0 -0.6264538 C
# 2 A 1 0.1836433 C
# 3 A 2 -0.8356286 C
# 4 B 0 1.5952808 C
# 5 B 1 0.3295078 C
# 6 B 2 -0.8204684 D
# 7 B 3 0.4874291 D
# 8 B 4 0.7383247 D
# 9 B 5 0.5757814 D
我想,例如 summarise
通过 group
然后通过 dd
使用 group_split
:
# Split `df` into one data frame per level of `group`, then reduce each piece
# to a single row holding the mean and median of `ff` (NAs ignored).
# The result is an unnamed list with one 1 x 2 tibble per group.
df %>%
  group_split(group) %>%
  map(function(piece) {
    # summarise() with named expressions replaces the superseded
    # summarise_at() and yields the same `mean` / `median` columns.
    summarise(
      piece,
      mean = mean(ff, na.rm = TRUE),
      median = median(ff, na.rm = TRUE)
    )
  })
# OR split
# df %>%
#   split(.$group) %>%
#   map(function(piece) {
#     summarise(
#       piece,
#       mean = mean(ff, na.rm = TRUE),
#       median = median(ff, na.rm = TRUE)
#     )
#   })
# [[1]]
# # A tibble: 1 x 2
# mean median
# <dbl> <dbl>
# 1 -0.426 -0.626
# [[2]]
# # A tibble: 1 x 2
# mean median
# <dbl> <dbl>
# 1 0.484 0.532
# Then by dd: same summary as above, but the pieces are the levels of `dd`.
# The result is an unnamed list with one 1 x 2 tibble per level.
df %>%
  group_split(dd) %>%
  map(function(piece) {
    # summarise() with named expressions replaces the superseded
    # summarise_at() and yields the same `mean` / `median` columns.
    summarise(
      piece,
      mean = mean(ff, na.rm = TRUE),
      median = median(ff, na.rm = TRUE)
    )
  })
# [[1]]
# # A tibble: 1 x 2
# mean median
# <dbl> <dbl>
# 1 0.129 0.184
# [[2]]
# # A tibble: 1 x 2
# mean median
# <dbl> <dbl>
# 1 0.245 0.532
但我想要一种灵活的方式,在一次调用中完成这项工作。将两个变量都添加到 split
或 group_split
中,将 group
和 dd
的组合分组,这不是我想要的。
# Passing both columns to split() partitions `df` by every combination of
# `group` and `dd` (not by each column separately), then summarises `ff`
# within each piece.
df %>%
  split(list(.$group, .$dd)) %>%
  map(function(piece) {
    # summarise() with named expressions replaces the superseded
    # summarise_at() and yields the same `mean` / `median` columns.
    summarise(
      piece,
      mean = mean(ff, na.rm = TRUE),
      median = median(ff, na.rm = TRUE)
    )
  })
# OR: group_split() with both columns also splits on their combination.
df %>%
  group_split(group, dd) %>%
  map(function(piece) {
    summarise(
      piece,
      mean = mean(ff, na.rm = TRUE),
      median = median(ff, na.rm = TRUE)
    )
  })
有什么建议吗?如果还能在每个输出中显示分组变量就更好了!
谢谢
如果想对每个分组列分别执行此操作,可以遍历列名
library(dplyr)
library(purrr)
# For each grouping column name, split `df` on that column alone (not on the
# combination of columns) and summarise `ff` within every piece.
# Returns an unnamed outer list with one element per column name; each
# element is a list named by the values of that column.
map(c("group", "dd"), function(col) {
  df %>%
    # `.` is the piped data frame; `.[col]` selects the splitting column.
    split(.[col]) %>%
    map(function(piece) {
      # summarise() with named expressions replaces the superseded
      # summarise_at() and yields the same `mean` / `median` columns.
      summarise(
        piece,
        mean = mean(ff, na.rm = TRUE),
        median = median(ff, na.rm = TRUE)
      )
    })
})
-输出
[[1]]
[[1]]$A
mean median
1 -0.4261464 -0.6264538
[[1]]$B
mean median
1 0.4843092 0.5316052
[[2]]
[[2]]$C
mean median
1 0.1292699 0.1836433
[[2]]$D
mean median
1 0.2452667 0.5316052
或使用group_split
# For each grouping column name, group_split() `df` on that column alone and
# summarise `ff` within every piece.
# `out` is a list named "group" and "dd"; each element is an unnamed list of
# 1 x 2 tibbles, one per level of that column.
out <- c("group", "dd") %>%
  # Name the vector by its own values so map() carries the names through to
  # the result and the column names are written only once.
  set_names() %>%
  map(function(col) {
    df %>%
      group_split(across(all_of(col))) %>%
      map(function(piece) {
        # summarise() with named expressions replaces the superseded
        # summarise_at() and yields the same `mean` / `median` columns.
        summarise(
          piece,
          mean = mean(ff, na.rm = TRUE),
          median = median(ff, na.rm = TRUE)
        )
      })
  })
-输出
> out
$group
$group[[1]]
# A tibble: 1 x 2
mean median
<dbl> <dbl>
1 -0.426 -0.626
$group[[2]]
# A tibble: 1 x 2
mean median
<dbl> <dbl>
1 0.484 0.532
$dd
$dd[[1]]
# A tibble: 1 x 2
mean median
<dbl> <dbl>
1 0.129 0.184
$dd[[2]]
# A tibble: 1 x 2
mean median
<dbl> <dbl>
1 0.245 0.532
我确定有人问过这个问题,但找不到我要找的东西。同时但分开地 split
(或 group_split
)多个列的最佳方法是什么。
给定:
library(tidyverse)
# Example data: two grouping columns (`group`, `dd`), a counter
# (`test_value`) and a random numeric column (`ff`).
# The seed is fixed so the `ff` values are reproducible.
set.seed(1)
df <- data.frame(
  group = rep(c("A", "B"), times = c(3, 6)),
  test_value = c(0, 1, 2, 0, 1, 2, 3, 4, 5),
  ff = rnorm(n = 9),
  dd = rep(c("C", "D"), times = c(5, 4))
)
df
# group test_value ff dd
# 1 A 0 -0.6264538 C
# 2 A 1 0.1836433 C
# 3 A 2 -0.8356286 C
# 4 B 0 1.5952808 C
# 5 B 1 0.3295078 C
# 6 B 2 -0.8204684 D
# 7 B 3 0.4874291 D
# 8 B 4 0.7383247 D
# 9 B 5 0.5757814 D
我想,例如 summarise
通过 group
然后通过 dd
使用 group_split
:
# Split `df` into one data frame per level of `group`, then reduce each piece
# to a single row holding the mean and median of `ff` (NAs ignored).
# The result is an unnamed list with one 1 x 2 tibble per group.
df %>%
  group_split(group) %>%
  map(function(piece) {
    # summarise() with named expressions replaces the superseded
    # summarise_at() and yields the same `mean` / `median` columns.
    summarise(
      piece,
      mean = mean(ff, na.rm = TRUE),
      median = median(ff, na.rm = TRUE)
    )
  })
# OR split
# df %>%
#   split(.$group) %>%
#   map(function(piece) {
#     summarise(
#       piece,
#       mean = mean(ff, na.rm = TRUE),
#       median = median(ff, na.rm = TRUE)
#     )
#   })
# [[1]]
# # A tibble: 1 x 2
# mean median
# <dbl> <dbl>
# 1 -0.426 -0.626
# [[2]]
# # A tibble: 1 x 2
# mean median
# <dbl> <dbl>
# 1 0.484 0.532
# Then by dd: same summary as above, but the pieces are the levels of `dd`.
# The result is an unnamed list with one 1 x 2 tibble per level.
df %>%
  group_split(dd) %>%
  map(function(piece) {
    # summarise() with named expressions replaces the superseded
    # summarise_at() and yields the same `mean` / `median` columns.
    summarise(
      piece,
      mean = mean(ff, na.rm = TRUE),
      median = median(ff, na.rm = TRUE)
    )
  })
# [[1]]
# # A tibble: 1 x 2
# mean median
# <dbl> <dbl>
# 1 0.129 0.184
# [[2]]
# # A tibble: 1 x 2
# mean median
# <dbl> <dbl>
# 1 0.245 0.532
但我想要一种灵活的方式,在一次调用中完成这项工作。将两个变量都添加到 split
或 group_split
中,将 group
和 dd
的组合分组,这不是我想要的。
# Passing both columns to split() partitions `df` by every combination of
# `group` and `dd` (not by each column separately), then summarises `ff`
# within each piece.
df %>%
  split(list(.$group, .$dd)) %>%
  map(function(piece) {
    # summarise() with named expressions replaces the superseded
    # summarise_at() and yields the same `mean` / `median` columns.
    summarise(
      piece,
      mean = mean(ff, na.rm = TRUE),
      median = median(ff, na.rm = TRUE)
    )
  })
# OR: group_split() with both columns also splits on their combination.
df %>%
  group_split(group, dd) %>%
  map(function(piece) {
    summarise(
      piece,
      mean = mean(ff, na.rm = TRUE),
      median = median(ff, na.rm = TRUE)
    )
  })
有什么建议吗?如果还能在每个输出中显示分组变量就更好了!
谢谢
如果想对每个分组列分别执行此操作,可以遍历列名
library(dplyr)
library(purrr)
# For each grouping column name, split `df` on that column alone (not on the
# combination of columns) and summarise `ff` within every piece.
# Returns an unnamed outer list with one element per column name; each
# element is a list named by the values of that column.
map(c("group", "dd"), function(col) {
  df %>%
    # `.` is the piped data frame; `.[col]` selects the splitting column.
    split(.[col]) %>%
    map(function(piece) {
      # summarise() with named expressions replaces the superseded
      # summarise_at() and yields the same `mean` / `median` columns.
      summarise(
        piece,
        mean = mean(ff, na.rm = TRUE),
        median = median(ff, na.rm = TRUE)
      )
    })
})
-输出
[[1]]
[[1]]$A
mean median
1 -0.4261464 -0.6264538
[[1]]$B
mean median
1 0.4843092 0.5316052
[[2]]
[[2]]$C
mean median
1 0.1292699 0.1836433
[[2]]$D
mean median
1 0.2452667 0.5316052
或使用group_split
# For each grouping column name, group_split() `df` on that column alone and
# summarise `ff` within every piece.
# `out` is a list named "group" and "dd"; each element is an unnamed list of
# 1 x 2 tibbles, one per level of that column.
out <- c("group", "dd") %>%
  # Name the vector by its own values so map() carries the names through to
  # the result and the column names are written only once.
  set_names() %>%
  map(function(col) {
    df %>%
      group_split(across(all_of(col))) %>%
      map(function(piece) {
        # summarise() with named expressions replaces the superseded
        # summarise_at() and yields the same `mean` / `median` columns.
        summarise(
          piece,
          mean = mean(ff, na.rm = TRUE),
          median = median(ff, na.rm = TRUE)
        )
      })
  })
-输出
> out
$group
$group[[1]]
# A tibble: 1 x 2
mean median
<dbl> <dbl>
1 -0.426 -0.626
$group[[2]]
# A tibble: 1 x 2
mean median
<dbl> <dbl>
1 0.484 0.532
$dd
$dd[[1]]
# A tibble: 1 x 2
mean median
<dbl> <dbl>
1 0.129 0.184
$dd[[2]]
# A tibble: 1 x 2
mean median
<dbl> <dbl>
1 0.245 0.532