如何使用 dplyr::count() 和 purrr::map() 遍历分组数据框
How to loop over a grouped dataframe using dplyr::count() and purrr::map()
背景
我想遍历一组因子变量的数据框来计算
使用 dplyr
中的 count
函数在变量中出现每个值,
我认为 purrr::map
函数最合适。
但是,我无法让它工作。
我尝试使用 来满足我的需要,但这也不起作用。
我还尝试将一个基于
this post,但无法使其与分组变量一起使用。
问题
是否可以按照我想要的方式遍历分组数据框?如果是这样,
怎么样?
提前感谢您的考虑。
可重现的例子
library(tidyverse)
vars_df <-
structure(list(c = structure(c(2L, 3L, 3L, 2L, 3L, 3L, 2L, 2L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 3L, 1L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 3L, 2L, 1L,
1L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L), .Label = c("1", "2",
"3"), class = "factor"), pastpsyc = structure(c(2L, 1L, NA, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, NA, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), pastmed = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxsuicide = structure(c(2L, 1L, NA,
2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, NA, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxdsh = structure(c(2L, 1L, NA, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxtrauma = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 1L, 1L, NA, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
map_dfr(setNames(c('1', '2', '3'),
c('1', '2', '3')), ~
vars_df %>%
group_by(c) %>%
summarise(across(everything(), function(x)
sum(x == .x, na.rm = TRUE)), .groups = 'drop'), .id = 'var')
#> # A tibble: 9 x 7
#> var c pastpsyc pastmed hxsuicide hxdsh hxtrauma
#> <chr> <fct> <int> <int> <int> <int> <int>
#> 1 1 1 3 2 2 5 1
#> 2 1 2 16 9 18 16 10
#> 3 1 3 12 3 8 11 9
#> 4 2 1 0 0 0 0 0
#> 5 2 2 0 0 0 0 0
#> 6 2 3 0 0 0 0 0
#> 7 3 1 0 0 0 0 0
#> 8 3 2 0 0 0 0 0
#> 9 3 3 0 0 0 0 0
vars_df %>%
group_by(c) %>%
count(pastpsyc)
#> # A tibble: 7 x 3
#> # Groups: c [3]
#> c pastpsyc n
#> <fct> <fct> <int>
#> 1 1 0 4
#> 2 1 1 3
#> 3 2 0 8
#> 4 2 1 16
#> 5 3 0 5
#> 6 3 1 12
#> 7 3 <NA> 2
vars_df %>%
group_by(c) %>%
map(~ count(.))
#> Error in UseMethod("count"): no applicable method for 'count' applied to an object of class "factor"
.get_count <-
function(mygroup) {
quo_var <- enquo(mygroup)
vars_df %>%
group_by(!! quo_var) %>%
count() %>%
ungroup()
}
vars <-
vars_df %>%
colnames()
vars %>%
syms() %>%
map(function(var) .get_count(!!var))
#> [[1]]
#> # A tibble: 3 x 2
#> c n
#> <fct> <int>
#> 1 1 7
#> 2 2 24
#> 3 3 19
#>
#> [[2]]
#> # A tibble: 3 x 2
#> pastpsyc n
#> <fct> <int>
#> 1 0 17
#> 2 1 31
#> 3 <NA> 2
#>
#> [[3]]
#> # A tibble: 3 x 2
#> pastmed n
#> <fct> <int>
#> 1 0 33
#> 2 1 14
#> 3 <NA> 3
#>
#> [[4]]
#> # A tibble: 3 x 2
#> hxsuicide n
#> <fct> <int>
#> 1 0 20
#> 2 1 28
#> 3 <NA> 2
#>
#> [[5]]
#> # A tibble: 3 x 2
#> hxdsh n
#> <fct> <int>
#> 1 0 16
#> 2 1 32
#> 3 <NA> 2
#>
#> [[6]]
#> # A tibble: 3 x 2
#> hxtrauma n
#> <fct> <int>
#> 1 0 26
#> 2 1 20
#> 3 <NA> 4
vars %>%
syms() %>%
group_by(c) %>%
map(function(var) .get_count(!!var))
#> Error in UseMethod("group_by"): no applicable method for 'group_by' applied to an object of class "list"
# Created on 2021-05-26 by the reprex package (v2.0.0)
您可以将 map
用作 -
library(tidyverse)
vars %>% map(~vars_df %>% count(c, .data[[.x]]))
#[[1]]
# A tibble: 3 x 2
# c n
# <fct> <int>
#1 1 7
#2 2 24
3 3 19
#[[2]]
# A tibble: 7 x 3
# c pastpsyc n
# <fct> <fct> <int>
#1 1 0 4
#2 1 1 3
#3 2 0 8
#4 2 1 16
#5 3 0 5
#6 3 1 12
#7 3 NA 2
#...
#...
以长格式显示输出的不同方式 -
vars_df %>% pivot_longer(cols = -c) %>% count(c, name, value)
# c name value n
# <fct> <chr> <fct> <int>
# 1 1 hxdsh 0 2
# 2 1 hxdsh 1 5
# 3 1 hxsuicide 0 5
# 4 1 hxsuicide 1 2
# 5 1 hxtrauma 0 5
# 6 1 hxtrauma 1 1
# 7 1 hxtrauma NA 1
# 8 1 pastmed 0 4
# 9 1 pastmed 1 2
#10 1 pastmed NA 1
# … with 28 more rows
背景
我想遍历一组因子变量的数据框来计算
使用 dplyr
中的 count
函数在变量中出现每个值,
我认为 purrr::map
函数最合适。
但是,我无法让它工作。
我尝试使用
问题
是否可以按照我想要的方式遍历分组数据框?如果是这样, 怎么样?
提前感谢您的考虑。
可重现的例子
library(tidyverse)
vars_df <-
structure(list(c = structure(c(2L, 3L, 3L, 2L, 3L, 3L, 2L, 2L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 3L, 1L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 3L, 2L, 1L,
1L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L), .Label = c("1", "2",
"3"), class = "factor"), pastpsyc = structure(c(2L, 1L, NA, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, NA, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), pastmed = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxsuicide = structure(c(2L, 1L, NA,
2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, NA, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxdsh = structure(c(2L, 1L, NA, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxtrauma = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 1L, 1L, NA, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
map_dfr(setNames(c('1', '2', '3'),
c('1', '2', '3')), ~
vars_df %>%
group_by(c) %>%
summarise(across(everything(), function(x)
sum(x == .x, na.rm = TRUE)), .groups = 'drop'), .id = 'var')
#> # A tibble: 9 x 7
#> var c pastpsyc pastmed hxsuicide hxdsh hxtrauma
#> <chr> <fct> <int> <int> <int> <int> <int>
#> 1 1 1 3 2 2 5 1
#> 2 1 2 16 9 18 16 10
#> 3 1 3 12 3 8 11 9
#> 4 2 1 0 0 0 0 0
#> 5 2 2 0 0 0 0 0
#> 6 2 3 0 0 0 0 0
#> 7 3 1 0 0 0 0 0
#> 8 3 2 0 0 0 0 0
#> 9 3 3 0 0 0 0 0
vars_df %>%
group_by(c) %>%
count(pastpsyc)
#> # A tibble: 7 x 3
#> # Groups: c [3]
#> c pastpsyc n
#> <fct> <fct> <int>
#> 1 1 0 4
#> 2 1 1 3
#> 3 2 0 8
#> 4 2 1 16
#> 5 3 0 5
#> 6 3 1 12
#> 7 3 <NA> 2
vars_df %>%
group_by(c) %>%
map(~ count(.))
#> Error in UseMethod("count"): no applicable method for 'count' applied to an object of class "factor"
.get_count <-
function(mygroup) {
quo_var <- enquo(mygroup)
vars_df %>%
group_by(!! quo_var) %>%
count() %>%
ungroup()
}
vars <-
vars_df %>%
colnames()
vars %>%
syms() %>%
map(function(var) .get_count(!!var))
#> [[1]]
#> # A tibble: 3 x 2
#> c n
#> <fct> <int>
#> 1 1 7
#> 2 2 24
#> 3 3 19
#>
#> [[2]]
#> # A tibble: 3 x 2
#> pastpsyc n
#> <fct> <int>
#> 1 0 17
#> 2 1 31
#> 3 <NA> 2
#>
#> [[3]]
#> # A tibble: 3 x 2
#> pastmed n
#> <fct> <int>
#> 1 0 33
#> 2 1 14
#> 3 <NA> 3
#>
#> [[4]]
#> # A tibble: 3 x 2
#> hxsuicide n
#> <fct> <int>
#> 1 0 20
#> 2 1 28
#> 3 <NA> 2
#>
#> [[5]]
#> # A tibble: 3 x 2
#> hxdsh n
#> <fct> <int>
#> 1 0 16
#> 2 1 32
#> 3 <NA> 2
#>
#> [[6]]
#> # A tibble: 3 x 2
#> hxtrauma n
#> <fct> <int>
#> 1 0 26
#> 2 1 20
#> 3 <NA> 4
vars %>%
syms() %>%
group_by(c) %>%
map(function(var) .get_count(!!var))
#> Error in UseMethod("group_by"): no applicable method for 'group_by' applied to an object of class "list"
# Created on 2021-05-26 by the reprex package (v2.0.0)
您可以将 map
用作 -
library(tidyverse)
vars %>% map(~vars_df %>% count(c, .data[[.x]]))
#[[1]]
# A tibble: 3 x 2
# c n
# <fct> <int>
#1 1 7
#2 2 24
3 3 19
#[[2]]
# A tibble: 7 x 3
# c pastpsyc n
# <fct> <fct> <int>
#1 1 0 4
#2 1 1 3
#3 2 0 8
#4 2 1 16
#5 3 0 5
#6 3 1 12
#7 3 NA 2
#...
#...
以长格式显示输出的不同方式 -
vars_df %>% pivot_longer(cols = -c) %>% count(c, name, value)
# c name value n
# <fct> <chr> <fct> <int>
# 1 1 hxdsh 0 2
# 2 1 hxdsh 1 5
# 3 1 hxsuicide 0 5
# 4 1 hxsuicide 1 2
# 5 1 hxtrauma 0 5
# 6 1 hxtrauma 1 1
# 7 1 hxtrauma NA 1
# 8 1 pastmed 0 4
# 9 1 pastmed 1 2
#10 1 pastmed NA 1
# … with 28 more rows