如何使用purrr将因子变量转换为二分变量
How to use purrr to convert factor variables to dichotomous variables
我想使用 purrr
将许多因子变量转换为二分变量。这是我正在尝试完成的示例,使用玩具数据集和改编自 :
的函数
library(dplyr)
library(forcats)
library(tidyr)
library(purrr)
df <- tibble(a = c(1,2,3),
b = c(1,1,2),
c = as_factor(c("Rose","Pink","Red")),
d = c(2,3,4),
e = as_factor(c("Paris", "London", "Paris"))
)
fac_to_d <- function(.data, col) {
.data %>%
mutate(value = 1) %>%
pivot_wider(names_from = {{col}},
values_from = value,
values_fill = 0)
}
函数有效:
df %>%
fac_to_d("c") %>%
fac_to_d("e")
#> # A tibble: 3 × 8
#> a b d Rose Pink Red Paris London
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 1 0 0 1 0
#> 2 2 1 3 0 1 0 0 1
#> 3 3 2 4 0 0 1 1 0
但我不知道如何让它与 purrr
一起工作。例如:
cols <- c("c", "e")
df %>% map_dfr(.f = fac_to_d, col = cols)
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "c('double', 'numeric')"
df %>% map(.f = fac_to_d, col = cols)
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "c('double', 'numeric')"
如何让这个函数与 purrr
一起使用? (如果有更简洁的方法将许多因子变量转换为二分变量,我也有兴趣了解它!)
我建议在处理特征时使用 tidymodels
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(forcats)
library(tidyr)
library(purrr)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
data_example <- tibble(a = c(1,2,3),
b = c(1,1,2),
c = as_factor(c("Rose","Pink","Red")),
d = c(2,3,4),
e = as_factor(c("Paris", "London", "Paris"))
)
fac_to_d <- function(.data, col) {
.data %>%
mutate(value = 1) %>%
pivot_wider(names_from = {{col}},
values_from = value,
values_fill = 0)
}
recipe_hot_encode <- recipe(x = data_example)
recipe_hot_encode |>
step_dummy(c(c,e),one_hot = TRUE) |>
prep() |>
juice()
#> # A tibble: 3 x 8
#> a b d c_Rose c_Pink c_Red e_Paris e_London
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 1 0 0 1 0
#> 2 2 1 3 0 1 0 0 1
#> 3 3 2 4 0 0 1 1 0
# if you want the old names
dummy_names2 <- function (var, lvl, ordinal = FALSE, sep = "_")
{
args <- vctrs::vec_recycle_common(var, lvl)
var <- args[[1]]
lvl <- args[[2]]
if (!ordinal)
nms <- paste(make.names(lvl), sep = sep)
else nms <- paste0(names0(length(lvl), sep))
nms
}
recipe_hot_encode |>
step_dummy(c(c,e),one_hot = TRUE,naming = dummy_names2) |>
prep() |>
juice()
#> # A tibble: 3 x 8
#> a b d Rose Pink Red Paris London
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 1 0 0 1 0
#> 2 2 1 3 0 1 0 0 1
#> 3 3 2 4 0 0 1 1 0
由 reprex package (v2.0.1)
于 2021-10-21 创建
试试这个:
library(purrr)
library(dplyr)
library(tidyr)
df %>%
mutate(c_one_hot = map(c, ~ set_names(levels(c) == .x, levels(c)))) %>%
unnest_wider(c_one_hot) %>%
mutate(e_one_hot = map(e, ~ set_names(levels(e) == .x, levels(e)))) %>%
unnest_wider(e_one_hot) %>%
mutate(across(everything(), ~.*1)) %>%
select(-c, -e)
输出:
a b d Rose Pink Red Paris London
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2 1 0 0 1 0
2 2 1 3 0 1 0 0 1
3 3 2 4 0 0 1 1 0
这是一种可能的方法,但它涉及遍历每个分类变量并分别制作虚拟变量,然后在最后将它们绑定回数字变量。
我用 model.matrix()
来制作虚拟变量,用 reformulate()
来构造公式。注意 reformulate()
有有用的 intercept
参数来抑制截距以避免处理对比。
这是一个工作函数,将数据集和列名作为字符串。我把它变成了 data.frame 所以它可以与 map_df()
函数一起使用:
to_dummy = function(data, col) {
col %>%
reformulate(intercept = FALSE) %>%
model.matrix(data = data) %>%
as.data.frame()
}
to_dummy(data = df, col = "e")
#> eParis eLondon
#> 1 1 0
#> 2 0 1
#> 3 1 0
然后使用 map_dfc()
(用于列绑定)循环遍历字符串向量中的分类列。最后一步是将数字列绑定回数据集的其余部分,我有点笨拙地使用嵌套 select()
.
cols %>%
map_dfc(.f = to_dummy, data = df) %>%
cbind(select(df, where(is.numeric)), .)
#> a b d cRose cPink cRed eParis eLondon
#> 1 1 1 2 1 0 0 1 0
#> 2 2 1 3 0 1 0 0 1
#> 3 3 2 4 0 0 1 1 0
由 reprex package (v2.0.0)
于 2021-10-21 创建
缺点是列名基于原始变量加上因子水平,而不仅仅是因子水平。
我想使用 purrr
将许多因子变量转换为二分变量。这是我正在尝试完成的示例,使用玩具数据集和改编自
library(dplyr)
library(forcats)
library(tidyr)
library(purrr)
df <- tibble(a = c(1,2,3),
b = c(1,1,2),
c = as_factor(c("Rose","Pink","Red")),
d = c(2,3,4),
e = as_factor(c("Paris", "London", "Paris"))
)
fac_to_d <- function(.data, col) {
.data %>%
mutate(value = 1) %>%
pivot_wider(names_from = {{col}},
values_from = value,
values_fill = 0)
}
函数有效:
df %>%
fac_to_d("c") %>%
fac_to_d("e")
#> # A tibble: 3 × 8
#> a b d Rose Pink Red Paris London
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 1 0 0 1 0
#> 2 2 1 3 0 1 0 0 1
#> 3 3 2 4 0 0 1 1 0
但我不知道如何让它与 purrr
一起工作。例如:
cols <- c("c", "e")
df %>% map_dfr(.f = fac_to_d, col = cols)
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "c('double', 'numeric')"
df %>% map(.f = fac_to_d, col = cols)
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "c('double', 'numeric')"
如何让这个函数与 purrr
一起使用? (如果有更简洁的方法将许多因子变量转换为二分变量,我也有兴趣了解它!)
我建议在处理特征时使用 tidymodels
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(forcats)
library(tidyr)
library(purrr)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
data_example <- tibble(a = c(1,2,3),
b = c(1,1,2),
c = as_factor(c("Rose","Pink","Red")),
d = c(2,3,4),
e = as_factor(c("Paris", "London", "Paris"))
)
fac_to_d <- function(.data, col) {
.data %>%
mutate(value = 1) %>%
pivot_wider(names_from = {{col}},
values_from = value,
values_fill = 0)
}
recipe_hot_encode <- recipe(x = data_example)
recipe_hot_encode |>
step_dummy(c(c,e),one_hot = TRUE) |>
prep() |>
juice()
#> # A tibble: 3 x 8
#> a b d c_Rose c_Pink c_Red e_Paris e_London
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 1 0 0 1 0
#> 2 2 1 3 0 1 0 0 1
#> 3 3 2 4 0 0 1 1 0
# if you want the old names
dummy_names2 <- function (var, lvl, ordinal = FALSE, sep = "_")
{
args <- vctrs::vec_recycle_common(var, lvl)
var <- args[[1]]
lvl <- args[[2]]
if (!ordinal)
nms <- paste(make.names(lvl), sep = sep)
else nms <- paste0(names0(length(lvl), sep))
nms
}
recipe_hot_encode |>
step_dummy(c(c,e),one_hot = TRUE,naming = dummy_names2) |>
prep() |>
juice()
#> # A tibble: 3 x 8
#> a b d Rose Pink Red Paris London
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 2 1 0 0 1 0
#> 2 2 1 3 0 1 0 0 1
#> 3 3 2 4 0 0 1 1 0
由 reprex package (v2.0.1)
于 2021-10-21 创建试试这个:
library(purrr)
library(dplyr)
library(tidyr)
df %>%
mutate(c_one_hot = map(c, ~ set_names(levels(c) == .x, levels(c)))) %>%
unnest_wider(c_one_hot) %>%
mutate(e_one_hot = map(e, ~ set_names(levels(e) == .x, levels(e)))) %>%
unnest_wider(e_one_hot) %>%
mutate(across(everything(), ~.*1)) %>%
select(-c, -e)
输出:
a b d Rose Pink Red Paris London
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 2 1 0 0 1 0
2 2 1 3 0 1 0 0 1
3 3 2 4 0 0 1 1 0
这是一种可能的方法,但它涉及遍历每个分类变量并分别制作虚拟变量,然后在最后将它们绑定回数字变量。
我用 model.matrix()
来制作虚拟变量,用 reformulate()
来构造公式。注意 reformulate()
有有用的 intercept
参数来抑制截距以避免处理对比。
这是一个工作函数,将数据集和列名作为字符串。我把它变成了 data.frame 所以它可以与 map_df()
函数一起使用:
to_dummy = function(data, col) {
col %>%
reformulate(intercept = FALSE) %>%
model.matrix(data = data) %>%
as.data.frame()
}
to_dummy(data = df, col = "e")
#> eParis eLondon
#> 1 1 0
#> 2 0 1
#> 3 1 0
然后使用 map_dfc()
(用于列绑定)循环遍历字符串向量中的分类列。最后一步是将数字列绑定回数据集的其余部分,我有点笨拙地使用嵌套 select()
.
cols %>%
map_dfc(.f = to_dummy, data = df) %>%
cbind(select(df, where(is.numeric)), .)
#> a b d cRose cPink cRed eParis eLondon
#> 1 1 1 2 1 0 0 1 0
#> 2 2 1 3 0 1 0 0 1
#> 3 3 2 4 0 0 1 1 0
由 reprex package (v2.0.0)
于 2021-10-21 创建缺点是列名基于原始变量加上因子水平,而不仅仅是因子水平。