使用带有值列表的列来指定从哪些列创建另一个值列表
Using a column, with lists of values, to specify from which columns to create another list of values
我有如下数据:
上一个解决方案
library(dplyr)
# Example input: one row per type (`rn`), one integer count column per
# candidate bin, the row total (`Sum_bin`), and a list column `strata`
# holding the numeric break points given for each row.
dat_in <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25)` = c(5L, 0L),
    `[25,50)` = c(0L, 0L),
    `[25,100)` = c(38L, 3L),
    `[50,100)` = c(0L, 0L),
    `[100,250)` = c(43L, 5L),
    `[100,500)` = c(0L, 0L),
    `[250,500)` = c(27L, 12L),
    `[500,1000)` = c(44L, 0L),
    `[1000,1500)` = c(0L, 0L),
    `[1500,3000)` = c(0L, 0L),
    `[500,1000000]` = c(0L, 53L),
    `[1000,1000000]` = c(20L, 0L),
    `[3000,1000000]` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
我之前使用 c_across() 来获取频率高于零的列。
# Previous attempt: for each row, gather every column except `rn` and
# `strata` into a single vector, then keep only the strictly positive
# entries. Bins whose frequency is genuinely zero are dropped as well.
out <- dat_in %>%
  rowwise() %>%
  summarise(
    rn = rn,
    freq = list(c_across(!c(rn, strata)))
  ) %>%
  # summarise() returns an ungrouped result, so re-enter row-wise mode
  # before filtering each row's vector.
  rowwise() %>%
  mutate(
    freq = list(freq[which(freq > 0)])
  )
新问题
遗憾的是,此解决方案对于这些数据而言不够稳健(例如,Type_B 在 [0,25) 层的频率实际上为零)。
我想尝试的是根据 strata
列在 freq
列中创建一个值列表。
[[1]]
[1] 0 25 100 250 500 1000 1000000
[[2]]
[1] 0 25 100 250 500 1000000
因此对于第一行,应该收集 [0,25)、[25,100)、[100,250)、[250,500)、[500,1000) 和 [1000,1000000] 中的值。
但是下一行有不同的层值
对于第二行,应收集 [0,25)、[25,100)、[100,250)、[250,500) 和 [500,1000000] 中的值。
我很难想出一种方法来做到这一点。任何人都可以提出一个好的方法吗?
期望的输出:
# Desired result: a row-wise tibble with one list element per `rn`,
# holding the counts of the bins selected for that row followed by the
# row total.
dat_out <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    freq = list(
      c(5, 38, 43, 27, 44, 20, 177),
      c(0, 3, 5, 12, 53, 73)
    )
  ),
  class = c("rowwise_df", "tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -2L),
  # Grouping metadata as emitted by dput() for a rowwise_df.
  groups = structure(
    list(
      .rows = structure(
        list(1L, 2L),
        ptype = integer(0),
        class = c("vctrs_list_of", "vctrs_vctr", "list")
      )
    ),
    row.names = c(NA, -2L),
    class = c("tbl_df", "tbl", "data.frame")
  )
)
也许是这个?
library(tidyverse)
# Keep only columns with at least one non-zero entry, drop the total and
# the strata columns, then nest everything except `rn` into a list column
# `freq`.
dat_in %>%
  as_tibble() %>%
  dplyr::select(
    where(function(col) any(unlist(col) != 0))
  ) %>%
  select(!c(Sum_bin, strata)) %>%
  nest(freq = !rn)
给出:
rn freq
<chr> <list>
1 Type_A <tibble [1 x 7]>
2 Type_B <tibble [1 x 7]>
或者这样:
library(tidyverse)
# Same column filtering as the nest-only variant, but paste the remaining
# count columns into one comma-separated string column (`new_var`) before
# nesting it under `freq`.
dat_in %>%
  as_tibble() %>%
  dplyr::select(
    where(function(col) any(unlist(col) != 0))
  ) %>%
  select(!c(Sum_bin, strata)) %>%
  unite(new_var, !rn, sep = ", ", remove = TRUE) %>%
  nest(freq = !rn)
这给出了这个:
rn freq
<chr> <list>
1 Type_A <tibble [1 x 1]>
2 Type_B <tibble [1 x 1]>
# Collect, per `rn`, the counts of the bins whose bounds both appear in
# that row's `strata`, and append the sum of those counts.
dat_in %>%
  # Long format: one row per (rn, column) with the column label in `name`
  # and the count in `value`.
  pivot_longer(-c(rn, strata)) %>%
  # Split the bin label into its two bounds. The backslashes must be
  # doubled: '\d' is not a valid escape in an R string literal and fails
  # at parse time. Labels that do not match (e.g. "Sum_bin") become NA.
  extract(name, c('lower', 'upper'), '(\\d+),(\\d+)', convert = TRUE) %>%
  group_by(rn) %>%
  # Keep bins whose lower and upper bounds are both listed in the strata
  # of this `rn`; NA bounds never match and are dropped here.
  filter(lower%in%strata[[1]] & upper %in% strata[[1]]) %>%
  group_by(upper,.add = TRUE) %>%
  # One count per upper bound; 'drop_last' leaves the data grouped by `rn`.
  summarise(freq = sum(value), .groups = 'drop_last') %>%
  # Append the total of the selected bins as a final row per `rn`.
  group_modify(~add_row(.,freq = sum(.$freq))) %>%
  summarise(freq = list(freq))
# A tibble: 2 x 2
rn freq
<chr> <list>
1 Type_A <dbl [7]>
2 Type_B <dbl [6]>
使用频率列:
[[1]]
[1] 5 38 43 27 44 20 177
[[2]]
[1] 0 3 5 12 53 73
我有如下数据:
上一个解决方案
library(dplyr)
# Example input: one row per type (`rn`), one integer count column per
# candidate bin, the row total (`Sum_bin`), and a list column `strata`
# holding the numeric break points given for each row.
dat_in <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    `[0,25)` = c(5L, 0L),
    `[25,50)` = c(0L, 0L),
    `[25,100)` = c(38L, 3L),
    `[50,100)` = c(0L, 0L),
    `[100,250)` = c(43L, 5L),
    `[100,500)` = c(0L, 0L),
    `[250,500)` = c(27L, 12L),
    `[500,1000)` = c(44L, 0L),
    `[1000,1500)` = c(0L, 0L),
    `[1500,3000)` = c(0L, 0L),
    `[500,1000000]` = c(0L, 53L),
    `[1000,1000000]` = c(20L, 0L),
    `[3000,1000000]` = c(0L, 0L),
    Sum_bin = c(177, 73),
    strata = list(
      c(0, 25, 100, 250, 500, 1000, 1e+06),
      c(0, 25, 100, 250, 500, 1e+06)
    )
  ),
  row.names = c(NA, -2L),
  class = c("data.table", "data.frame")
)
我之前使用 c_across() 来获取频率高于零的列。
# Previous attempt: for each row, gather every column except `rn` and
# `strata` into a single vector, then keep only the strictly positive
# entries. Bins whose frequency is genuinely zero are dropped as well.
out <- dat_in %>%
  rowwise() %>%
  summarise(
    rn = rn,
    freq = list(c_across(!c(rn, strata)))
  ) %>%
  # summarise() returns an ungrouped result, so re-enter row-wise mode
  # before filtering each row's vector.
  rowwise() %>%
  mutate(
    freq = list(freq[which(freq > 0)])
  )
新问题
遗憾的是,此解决方案对于这些数据而言不够稳健(例如,Type_B 在 [0,25) 层的频率实际上为零)。
我想尝试的是根据 strata
列在 freq
列中创建一个值列表。
[[1]]
[1] 0 25 100 250 500 1000 1000000
[[2]]
[1] 0 25 100 250 500 1000000
因此对于第一行,应该收集 [0,25)、[25,100)、[100,250)、[250,500)、[500,1000) 和 [1000,1000000] 中的值。
但是下一行有不同的层值
对于第二行,应收集 [0,25)、[25,100)、[100,250)、[250,500) 和 [500,1000000] 中的值。
我很难想出一种方法来做到这一点。任何人都可以提出一个好的方法吗?
期望的输出:
# Desired result: a row-wise tibble with one list element per `rn`,
# holding the counts of the bins selected for that row followed by the
# row total.
dat_out <- structure(
  list(
    rn = c("Type_A", "Type_B"),
    freq = list(
      c(5, 38, 43, 27, 44, 20, 177),
      c(0, 3, 5, 12, 53, 73)
    )
  ),
  class = c("rowwise_df", "tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -2L),
  # Grouping metadata as emitted by dput() for a rowwise_df.
  groups = structure(
    list(
      .rows = structure(
        list(1L, 2L),
        ptype = integer(0),
        class = c("vctrs_list_of", "vctrs_vctr", "list")
      )
    ),
    row.names = c(NA, -2L),
    class = c("tbl_df", "tbl", "data.frame")
  )
)
也许是这个?
library(tidyverse)
# Keep only columns with at least one non-zero entry, drop the total and
# the strata columns, then nest everything except `rn` into a list column
# `freq`.
dat_in %>%
  as_tibble() %>%
  dplyr::select(
    where(function(col) any(unlist(col) != 0))
  ) %>%
  select(!c(Sum_bin, strata)) %>%
  nest(freq = !rn)
给出:
rn freq
<chr> <list>
1 Type_A <tibble [1 x 7]>
2 Type_B <tibble [1 x 7]>
或者这样:
library(tidyverse)
# Same column filtering as the nest-only variant, but paste the remaining
# count columns into one comma-separated string column (`new_var`) before
# nesting it under `freq`.
dat_in %>%
  as_tibble() %>%
  dplyr::select(
    where(function(col) any(unlist(col) != 0))
  ) %>%
  select(!c(Sum_bin, strata)) %>%
  unite(new_var, !rn, sep = ", ", remove = TRUE) %>%
  nest(freq = !rn)
这给出了这个:
rn freq
<chr> <list>
1 Type_A <tibble [1 x 1]>
2 Type_B <tibble [1 x 1]>
# Collect, per `rn`, the counts of the bins whose bounds both appear in
# that row's `strata`, and append the sum of those counts.
dat_in %>%
  # Long format: one row per (rn, column) with the column label in `name`
  # and the count in `value`.
  pivot_longer(-c(rn, strata)) %>%
  # Split the bin label into its two bounds. The backslashes must be
  # doubled: '\d' is not a valid escape in an R string literal and fails
  # at parse time. Labels that do not match (e.g. "Sum_bin") become NA.
  extract(name, c('lower', 'upper'), '(\\d+),(\\d+)', convert = TRUE) %>%
  group_by(rn) %>%
  # Keep bins whose lower and upper bounds are both listed in the strata
  # of this `rn`; NA bounds never match and are dropped here.
  filter(lower%in%strata[[1]] & upper %in% strata[[1]]) %>%
  group_by(upper,.add = TRUE) %>%
  # One count per upper bound; 'drop_last' leaves the data grouped by `rn`.
  summarise(freq = sum(value), .groups = 'drop_last') %>%
  # Append the total of the selected bins as a final row per `rn`.
  group_modify(~add_row(.,freq = sum(.$freq))) %>%
  summarise(freq = list(freq))
# A tibble: 2 x 2
rn freq
<chr> <list>
1 Type_A <dbl [7]>
2 Type_B <dbl [6]>
使用频率列:
[[1]]
[1] 5 38 43 27 44 20 177
[[2]]
[1] 0 3 5 12 53 73