使用 R/dplyr 中的 dictionary/list 协调数据集*列类型*(格式)
Reconcile dataset *column types* (formats) using a dictionary/list in R/dplyr
在 #67453183 之后,我想使用字典对 formats 做同样的事情,因为它不会将不同类型的列组合在一起。
I have a series of data sets and a dictionary to bring these together. But I'm struggling to figure out how to automate this. > Suppose this data and dictionary (actual one is much longer, thus I want to automate):
mtcarsA <- mtcars[1:2,1:3] %>% rename(mpgA = mpg, cyl_A = cyl) %>% as_tibble()
mtcarsB <- mtcars[3:4,1:3] %>% rename(mpg_B = mpg, B_cyl = cyl) %>% as_tibble()
mtcarsB$B_cyl <- as.factor(mtcarsB$B_cyl)
dic <- tibble(true_name = c("mpg_true", "cyl_true"),
nameA = c("mpgA", "cyl_A"),
nameB = c("mpg_B", "B_cyl"),
true_format = c("factor", "numeric")
)
I want these datasets (from years A and B) appended to one another, and then to have the names changed or coalesced to the 'true_name' values.... I want to automate 'coalesce all columns with duplicate names'.
要将它们放在一起,类型也必须相同。我在这里给出了整个问题,因为也许有人对 'using a data dictionary'.
也有更好的解决方案
@ronakShah 在上一个查询中提出
pmap(dic, ~setNames(..1, paste0(c(..2, ..3), collapse = '|'))) %>%
flatten_chr() -> val
mtcars_all <- list(mtcarsA,mtcarsB) %>%
map_df(function(x) x %>% rename_with(~str_replace_all(.x, val)))
在前面的示例中效果很好,但如果格式不同,不会。这里抛出错误:
Error: Can't combine
..1$cyl_true<double> and
..2$cyl_true <factor<51fac>>.
到 #56773354 提供了一个相关的解决方案,如果一个人有一个 完整的 类型列表,但不是像我那样按列名列出类型。
期望输出:
mtcars_all
# A tibble: 4 x 3
mpg_true cyl_true disp
<factor> <numeric> <dbl>
1 21 6 160
2 21 6 160
3 22.8 4 108
4 21.4 6 258
我采用了不同于 Ronak 的另一种方法来阅读字典。它更冗长,但我发现它更具可读性。一个基准测试会很有趣,看看哪个更快 ;-)
不幸的是,您似乎不能盲目地将变量转换为因子,所以我改用字符。在实践中,它的行为应该完全像一个因素,如果这对你很重要,你可以在结束对象上调用 as_factor()
。另一种可能性是在字典中存储转换函数名称(例如 as_factor()
),使用 get()
检索它并使用它而不是 as()
.
library(tidyverse)
mtcarsA <- mtcars[1:2,1:3] %>% rename(mpgA = mpg, cyl_A = cyl) %>% as_tibble()
mtcarsB <- mtcars[3:4,1:3] %>% rename(mpg_B = mpg, B_cyl = cyl) %>% as_tibble()
mtcarsB$B_cyl <- as.factor(mtcarsB$B_cyl)
dic <- tibble(true_name = c("mpg_true", "cyl_true"),
nameA = c("mpgA", "cyl_A"),
nameB = c("mpg_B", "B_cyl"),
true_format = c("numeric", "character") #instead of factor
)
dic2 = dic %>%
pivot_longer(-c(true_name, true_format), names_to=NULL)
read_dic = function(key, dict=dic2){
x = dict[dict$value==key,][["true_name"]]
if(length(x)!=1) x=key
x
}
rename_from_dic = function(df, dict=dic2){
rename_with(df, ~{
map_chr(.x, ~read_dic(.x, dict))
})
}
cast_from_dic = function(df, dict=dic){
mutate(df, across(everything(), ~{
cl=dict[dict$true_name==cur_column(),][["true_format"]]
if(length(cl)!=1) cl=class(.x)
as(.x, cl, strict=FALSE)
}))
}
list(mtcarsA,mtcarsB) %>%
map(rename_from_dic) %>%
map_df(cast_from_dic)
#> # A tibble: 4 x 3
#> mpg_true cyl_true disp
#> <dbl> <chr> <dbl>
#> 1 21 6 160
#> 2 21 6 160
#> 3 22.8 4 108
#> 4 21.4 6 258
由 reprex package (v2.0.0)
于 2021-05-09 创建
更简单的东西:
library(magrittr) # %<>% is cool
library(dplyr)
# The renaming is easy:
renameA <- dic$nameA
renameB <- dic$nameB
names(renameA) <- dic$true_name
names(renameB) <- dic$true_name
mtcarsA %<>% rename(all_of(renameA))
mtcarsB %<>% rename(all_of(renameB))
# Formatting is a little harder:
formats <- dic$true_format
names(formats) <- dic$true_name
lapply(names(formats), function (x) {
# there's no nice programmatic way to do this, I think
coercer <- switch(formats[[x]],
factor = as.factor,
numeric = as.numeric,
warning("Unrecognized format")
)
mtcarsA[[x]] <<- coercer(mtcarsA[[x]])
mtcarsB[[x]] <<- coercer(mtcarsB[[x]])
})
mtcars_all <- bind_rows(mtcarsA, mtcarsB)
在后台,您应该知道在 4.1.0 之前 base R 是如何处理连接因子的,以及这将如何改变。在这里它可能并不重要,因为 bind_rows
将使用 vctrs
包。
在
I have a series of data sets and a dictionary to bring these together. But I'm struggling to figure out how to automate this. > Suppose this data and dictionary (actual one is much longer, thus I want to automate):
mtcarsA <- mtcars[1:2,1:3] %>% rename(mpgA = mpg, cyl_A = cyl) %>% as_tibble()
mtcarsB <- mtcars[3:4,1:3] %>% rename(mpg_B = mpg, B_cyl = cyl) %>% as_tibble()
mtcarsB$B_cyl <- as.factor(mtcarsB$B_cyl)
dic <- tibble(true_name = c("mpg_true", "cyl_true"),
nameA = c("mpgA", "cyl_A"),
nameB = c("mpg_B", "B_cyl"),
true_format = c("factor", "numeric")
)
I want these datasets (from years A and B) appended to one another, and then to have the names changed or coalesced to the 'true_name' values.... I want to automate 'coalesce all columns with duplicate names'.
要将它们放在一起,类型也必须相同。我在这里给出了整个问题,因为也许有人对 'using a data dictionary'.
也有更好的解决方案@ronakShah 在上一个查询中提出
pmap(dic, ~setNames(..1, paste0(c(..2, ..3), collapse = '|'))) %>%
flatten_chr() -> val
mtcars_all <- list(mtcarsA,mtcarsB) %>%
map_df(function(x) x %>% rename_with(~str_replace_all(.x, val)))
在前面的示例中效果很好,但如果格式不同,不会。这里抛出错误:
Error: Can't combine
..1$cyl_true<double> and
..2$cyl_true <factor<51fac>>.
期望输出:
mtcars_all
# A tibble: 4 x 3
mpg_true cyl_true disp
<factor> <numeric> <dbl>
1 21 6 160
2 21 6 160
3 22.8 4 108
4 21.4 6 258
我采用了不同于 Ronak 的另一种方法来阅读字典。它更冗长,但我发现它更具可读性。一个基准测试会很有趣,看看哪个更快 ;-)
不幸的是,您似乎不能盲目地将变量转换为因子,所以我改用字符。在实践中,它的行为应该完全像一个因素,如果这对你很重要,你可以在结束对象上调用 as_factor()
。另一种可能性是在字典中存储转换函数名称(例如 as_factor()
),使用 get()
检索它并使用它而不是 as()
.
library(tidyverse)
mtcarsA <- mtcars[1:2,1:3] %>% rename(mpgA = mpg, cyl_A = cyl) %>% as_tibble()
mtcarsB <- mtcars[3:4,1:3] %>% rename(mpg_B = mpg, B_cyl = cyl) %>% as_tibble()
mtcarsB$B_cyl <- as.factor(mtcarsB$B_cyl)
dic <- tibble(true_name = c("mpg_true", "cyl_true"),
nameA = c("mpgA", "cyl_A"),
nameB = c("mpg_B", "B_cyl"),
true_format = c("numeric", "character") #instead of factor
)
dic2 = dic %>%
pivot_longer(-c(true_name, true_format), names_to=NULL)
read_dic = function(key, dict=dic2){
x = dict[dict$value==key,][["true_name"]]
if(length(x)!=1) x=key
x
}
rename_from_dic = function(df, dict=dic2){
rename_with(df, ~{
map_chr(.x, ~read_dic(.x, dict))
})
}
cast_from_dic = function(df, dict=dic){
mutate(df, across(everything(), ~{
cl=dict[dict$true_name==cur_column(),][["true_format"]]
if(length(cl)!=1) cl=class(.x)
as(.x, cl, strict=FALSE)
}))
}
list(mtcarsA,mtcarsB) %>%
map(rename_from_dic) %>%
map_df(cast_from_dic)
#> # A tibble: 4 x 3
#> mpg_true cyl_true disp
#> <dbl> <chr> <dbl>
#> 1 21 6 160
#> 2 21 6 160
#> 3 22.8 4 108
#> 4 21.4 6 258
由 reprex package (v2.0.0)
于 2021-05-09 创建更简单的东西:
library(magrittr) # %<>% is cool
library(dplyr)
# The renaming is easy:
renameA <- dic$nameA
renameB <- dic$nameB
names(renameA) <- dic$true_name
names(renameB) <- dic$true_name
mtcarsA %<>% rename(all_of(renameA))
mtcarsB %<>% rename(all_of(renameB))
# Formatting is a little harder:
formats <- dic$true_format
names(formats) <- dic$true_name
lapply(names(formats), function (x) {
# there's no nice programmatic way to do this, I think
coercer <- switch(formats[[x]],
factor = as.factor,
numeric = as.numeric,
warning("Unrecognized format")
)
mtcarsA[[x]] <<- coercer(mtcarsA[[x]])
mtcarsB[[x]] <<- coercer(mtcarsB[[x]])
})
mtcars_all <- bind_rows(mtcarsA, mtcarsB)
在后台,您应该知道在 4.1.0 之前 base R 是如何处理连接因子的,以及这将如何改变。在这里它可能并不重要,因为 bind_rows
将使用 vctrs
包。