虚拟变量的 R rowise 操作
R rowise operations on dummy variables
我有一个 ATM ID 数据集,这些 ID 用代表每个日期的开放和关闭的虚拟变量进行编码。目标是生成新列,告诉我们 ATM 首次出现在数据中的时间、出现在数据中后第一次关闭的时间以及它出现在数据中的最后日期。这是我期望的数据和输出:
data <- tribble(
~atm_id, ~nov_2019, ~feb_2020, ~may_2020, ~aug_2020, ~first_appeared, ~closed, ~always_open, ~last_appeared,
"xx1", 1, 1, 0, 0, "nov_2019", "may_2020", 0, "feb_2020",
"xx2", 1, 1, 1, 1, "nov_2019", NA, 1, "aug_2020",
"xx3", 0, 0, 0, 1, "aug_2020", NA, 0, "aug_2020",
"xx4", 1, 0, 0, 1, "nov_2019", "feb_2020", 0, "aug_2020"
)
请注意,atm_id
xx3
是一种新的 ATM,仅在 aug_2020
中首次出现,因此当询问它是否关闭时,它会得到一个 NA
。
最后四列将使用虚拟变量列生成。我目前只表示四个日期,但此数据是在每年的 11 月、2 月、5 月和 8 月生成的。例如,这是用于生成日期列的字符向量。
column_names <- c("nov_2019", "feb_2020", "may_2020", "aug_2020", "nov_2020", "feb_2021", "may_2021", "aug_2021", "nov_2021")
有没有办法使用 dplyr 或 tidyverse 包来做到这一点?
library(tidyverse)
data <- tribble(
~atm_id, ~nov_2019, ~feb_2020, ~may_2020, ~aug_2020, ~first_appeared, ~closed, ~always_open, ~last_appeared,
"xx1", 1, 1, 0, 0, "nov_2019", "may_2020", 0, "feb_2020",
"xx2", 1, 1, 1, 1, "nov_2019", NA, 1, "aug_2020",
"xx3", 0, 0, 0, 1, "aug_2020", NA, 0, "aug_2020",
"xx4", 1, 0, 0, 1, "nov_2019", "feb_2020", 0, "aug_2020"
)
date_levels <- c("nov_2019", "feb_2020", "may_2020", "aug_2020", "nov_2020", "feb_2021", "may_2021", "aug_2021", "nov_2021")
raw_data <-
data %>%
select(1:5)
raw_data_long <-
raw_data %>%
pivot_longer(-atm_id, names_to = "date", values_to = "open") %>%
mutate(date = date %>% factor(date_levels)) %>%
group_by(atm_id)
raw_data_long
#> # A tibble: 16 x 3
#> # Groups: atm_id [4]
#> atm_id date open
#> <chr> <fct> <dbl>
#> 1 xx1 nov_2019 1
#> 2 xx1 feb_2020 1
#> 3 xx1 may_2020 0
#> 4 xx1 aug_2020 0
#> 5 xx2 nov_2019 1
#> 6 xx2 feb_2020 1
#> 7 xx2 may_2020 1
#> 8 xx2 aug_2020 1
#> 9 xx3 nov_2019 0
#> 10 xx3 feb_2020 0
#> 11 xx3 may_2020 0
#> 12 xx3 aug_2020 1
#> 13 xx4 nov_2019 1
#> 14 xx4 feb_2020 0
#> 15 xx4 may_2020 0
#> 16 xx4 aug_2020 1
appeared <-
raw_data_long %>%
filter(open == 1) %>%
arrange(date) %>%
summarise(
first_appeared = first(date),
last_appeared = last(date)
)
appeared
#> # A tibble: 4 x 3
#> atm_id first_appeared last_appeared
#> <chr> <fct> <fct>
#> 1 xx1 nov_2019 feb_2020
#> 2 xx2 nov_2019 aug_2020
#> 3 xx3 aug_2020 aug_2020
#> 4 xx4 nov_2019 aug_2020
always_open <-
raw_data_long %>%
mutate(n_open = open %>% keep(~ .x == 1) %>% length()) %>%
summarise(always_open = as.numeric(n_open == n())) %>%
distinct(always_open)
#> `summarise()` has grouped output by 'atm_id'. You can override using the `.groups` argument.
always_open
#> # A tibble: 4 x 2
#> # Groups: atm_id [4]
#> atm_id always_open
#> <chr> <dbl>
#> 1 xx1 0
#> 2 xx2 1
#> 3 xx3 0
#> 4 xx4 0
closed <-
raw_data_long %>%
filter(open == 0) %>%
arrange(date) %>%
summarise(closed = first(date)) %>%
anti_join(
# must start with open date
raw_data_long %>% filter(first(open) == 0)
)
#> Joining, by = "atm_id"
closed
#> # A tibble: 2 x 2
#> atm_id closed
#> <chr> <fct>
#> 1 xx1 may_2020
#> 2 xx4 feb_2020
raw_data %>%
left_join(appeared) %>%
left_join(closed) %>%
left_join(always_open)
#> Joining, by = "atm_id"
#> Joining, by = "atm_id"
#> Joining, by = "atm_id"
#> # A tibble: 4 x 9
#> atm_id nov_2019 feb_2020 may_2020 aug_2020 first_appeared last_appeared closed
#> <chr> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
#> 1 xx1 1 1 0 0 nov_2019 feb_2020 may_2…
#> 2 xx2 1 1 1 1 nov_2019 aug_2020 <NA>
#> 3 xx3 0 0 0 1 aug_2020 aug_2020 <NA>
#> 4 xx4 1 0 0 1 nov_2019 aug_2020 feb_2…
#> # … with 1 more variable: always_open <dbl>
我有一个 ATM ID 数据集,这些 ID 用代表每个日期的开放和关闭的虚拟变量进行编码。目标是生成新列,告诉我们 ATM 首次出现在数据中的时间、出现在数据中后第一次关闭的时间以及它出现在数据中的最后日期。这是我期望的数据和输出:
data <- tribble(
~atm_id, ~nov_2019, ~feb_2020, ~may_2020, ~aug_2020, ~first_appeared, ~closed, ~always_open, ~last_appeared,
"xx1", 1, 1, 0, 0, "nov_2019", "may_2020", 0, "feb_2020",
"xx2", 1, 1, 1, 1, "nov_2019", NA, 1, "aug_2020",
"xx3", 0, 0, 0, 1, "aug_2020", NA, 0, "aug_2020",
"xx4", 1, 0, 0, 1, "nov_2019", "feb_2020", 0, "aug_2020"
)
请注意,atm_id
xx3
是一种新的 ATM,仅在 aug_2020
中首次出现,因此当询问它是否关闭时,它会得到一个 NA
。
最后四列将使用虚拟变量列生成。我目前只表示四个日期,但此数据是在每年的 11 月、2 月、5 月和 8 月生成的。例如,这是用于生成日期列的字符向量。
column_names <- c("nov_2019", "feb_2020", "may_2020", "aug_2020", "nov_2020", "feb_2021", "may_2021", "aug_2021", "nov_2021")
有没有办法使用 dplyr 或 tidyverse 包来做到这一点?
library(tidyverse)
data <- tribble(
~atm_id, ~nov_2019, ~feb_2020, ~may_2020, ~aug_2020, ~first_appeared, ~closed, ~always_open, ~last_appeared,
"xx1", 1, 1, 0, 0, "nov_2019", "may_2020", 0, "feb_2020",
"xx2", 1, 1, 1, 1, "nov_2019", NA, 1, "aug_2020",
"xx3", 0, 0, 0, 1, "aug_2020", NA, 0, "aug_2020",
"xx4", 1, 0, 0, 1, "nov_2019", "feb_2020", 0, "aug_2020"
)
date_levels <- c("nov_2019", "feb_2020", "may_2020", "aug_2020", "nov_2020", "feb_2021", "may_2021", "aug_2021", "nov_2021")
raw_data <-
data %>%
select(1:5)
raw_data_long <-
raw_data %>%
pivot_longer(-atm_id, names_to = "date", values_to = "open") %>%
mutate(date = date %>% factor(date_levels)) %>%
group_by(atm_id)
raw_data_long
#> # A tibble: 16 x 3
#> # Groups: atm_id [4]
#> atm_id date open
#> <chr> <fct> <dbl>
#> 1 xx1 nov_2019 1
#> 2 xx1 feb_2020 1
#> 3 xx1 may_2020 0
#> 4 xx1 aug_2020 0
#> 5 xx2 nov_2019 1
#> 6 xx2 feb_2020 1
#> 7 xx2 may_2020 1
#> 8 xx2 aug_2020 1
#> 9 xx3 nov_2019 0
#> 10 xx3 feb_2020 0
#> 11 xx3 may_2020 0
#> 12 xx3 aug_2020 1
#> 13 xx4 nov_2019 1
#> 14 xx4 feb_2020 0
#> 15 xx4 may_2020 0
#> 16 xx4 aug_2020 1
appeared <-
raw_data_long %>%
filter(open == 1) %>%
arrange(date) %>%
summarise(
first_appeared = first(date),
last_appeared = last(date)
)
appeared
#> # A tibble: 4 x 3
#> atm_id first_appeared last_appeared
#> <chr> <fct> <fct>
#> 1 xx1 nov_2019 feb_2020
#> 2 xx2 nov_2019 aug_2020
#> 3 xx3 aug_2020 aug_2020
#> 4 xx4 nov_2019 aug_2020
always_open <-
raw_data_long %>%
mutate(n_open = open %>% keep(~ .x == 1) %>% length()) %>%
summarise(always_open = as.numeric(n_open == n())) %>%
distinct(always_open)
#> `summarise()` has grouped output by 'atm_id'. You can override using the `.groups` argument.
always_open
#> # A tibble: 4 x 2
#> # Groups: atm_id [4]
#> atm_id always_open
#> <chr> <dbl>
#> 1 xx1 0
#> 2 xx2 1
#> 3 xx3 0
#> 4 xx4 0
closed <-
raw_data_long %>%
filter(open == 0) %>%
arrange(date) %>%
summarise(closed = first(date)) %>%
anti_join(
# must start with open date
raw_data_long %>% filter(first(open) == 0)
)
#> Joining, by = "atm_id"
closed
#> # A tibble: 2 x 2
#> atm_id closed
#> <chr> <fct>
#> 1 xx1 may_2020
#> 2 xx4 feb_2020
raw_data %>%
left_join(appeared) %>%
left_join(closed) %>%
left_join(always_open)
#> Joining, by = "atm_id"
#> Joining, by = "atm_id"
#> Joining, by = "atm_id"
#> # A tibble: 4 x 9
#> atm_id nov_2019 feb_2020 may_2020 aug_2020 first_appeared last_appeared closed
#> <chr> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
#> 1 xx1 1 1 0 0 nov_2019 feb_2020 may_2…
#> 2 xx2 1 1 1 1 nov_2019 aug_2020 <NA>
#> 3 xx3 0 0 0 1 aug_2020 aug_2020 <NA>
#> 4 xx4 1 0 0 1 nov_2019 aug_2020 feb_2…
#> # … with 1 more variable: always_open <dbl>