More performant way of conditionally filling in values in one data frame using second data frame
Background
I am working with two tibbles. dta_miss_dates has roughly 200K rows and consists of an integer vector and a character vector. The character vector is derived from dates using format.Date(x, "%Y%m") and has roughly 20% missing values.
Task
The task is to fill in the missing values using values available in the dta_all_dates tibble, which has roughly 7 million rows. The filling algorithm works as follows (a minimal sketch follows the list):
- For each ID with a missing date (var_id_miss), the corresponding ID is matched against the table with all dates (var_id_all).
- A function summarising the matched values is then applied. Most commonly this will be max, but the solution has to be agnostic enough to accommodate other functions, such as min or median.
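In other words, for a single ID the intended fill looks roughly like this. This is only a hedged sketch using the column names defined in the example data below (fill_one_id and summary_fun are made-up names); it mirrors the f_match() helper shown later.
# Sketch only: summarise the all-dates rows for one ID and format as "%Y%m".
# summary_fun is a placeholder for max / min / median etc.
fill_one_id <- function(id, summary_fun = max) {
  dta_all_dates %>%
    filter(var_id_all == id) %>%
    summarise(val = summary_fun(var_dts_all)) %>%
    pull(val) %>%
    format.Date(format = "%Y%m")
}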
Problem
The solution outlined below uses map_chr from the purrr package to apply the summary function to the subset corresponding to a given ID. This provides the required flexibility, but it is far too slow when run on the actual data.
Example
Data
For the example data to resemble the actual data, reduce_example_date <- TRUE should be set to FALSE.
# Settings ----------------------------------------------------------------
# Libraries
library("tidyverse")
library("stringi")
library("progress")
set.seed(123)
# Tibble sizes
# Reduce sample sizes for faster development
reduce_example_date <- TRUE # FALSE reflects actual experiment settings
nrow_missing_dates <- 2e5
nrow_all_dates <- 7e6
if (reduce_example_date) {
  nrow_missing_dates <- nrow_missing_dates / 100
  nrow_all_dates <- nrow_all_dates / 100
}
# Sample data with missing dates
dta_miss_dates <- tibble(
  var_id_miss = sample(1e6:9e6, nrow_missing_dates, replace = FALSE),
  var_dts_miss = sample(c(
    seq.Date(
      from = Sys.Date() - 2 * 365,
      to = Sys.Date(),
      by = "day"
    ),
    rep.int(NA, 100)
  ), nrow_missing_dates, replace = TRUE)
) %>%
  mutate(var_dts_miss = format.Date(var_dts_miss, "%Y%m"))
# Data with all dates
dta_all_dates <- tibble(
  var_id_all = sample(dta_miss_dates$var_id_miss, nrow_all_dates, TRUE),
  var_grp_sth = stri_rand_strings(
    n = nrow_all_dates,
    length = 3,
    pattern = "[A-D]"
  ),
  var_dts_all = sample(
    seq.Date(
      from = Sys.Date() - 50,
      to = Sys.Date(),
      by = "day"
    ),
    nrow_all_dates,
    replace = TRUE
  )
)
Matching
# Matching Functions ------------------------------------------------------
match_via_purr <- function(id_col,
                           dta_dates,
                           search_fun,
                           date_coll,
                           verbose) {
  # Iterates over IDs and where date is missing conducts a search
  f_match <- function(id_obs) {
    filter(dta_dates, var_id_all == id_obs) %>%
      summarise(across(.cols = {{date_coll}}, .fns = {{search_fun}})) %>%
      pull({{date_coll}}) %>%
      format.Date(format = "%Y%m")
  }
  pb <- progress_bar$new(
    total = length({{id_col}}),
    format = "[:bar] :current / :total (:percent) ETA: :eta"
  )
  map_chr(.x = {{id_col}}, .f = ~ {pb$tick(); f_match(id_obs = .x)})
}
Test
dta_miss_dates %>%
  mutate(var_dts_miss = if_else(
    is.na(var_dts_miss),
    match_via_purr(
      id_col = var_id_miss,
      dta_dates = dta_all_dates,
      search_fun = max,
      date_coll = var_dts_all
    ),
    var_dts_miss
  ))
Answer
Here is a solution using base R merge. I think you should prepare the summary lookup table ahead of time instead of repeatedly computing the summary inside a vectorised loop. {dplyr} is fairly fast, but it has some known performance issues, and it is relatively easy to write something that does more work than necessary.
The reprex below "fills" your dataset in about 30 seconds on my machine, whereas the {purrr}-based approach you are using had an ETA of around 5 hours.
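For orientation, the same pre-aggregate-then-join idea can also be written directly in dplyr. This is only a hedged sketch using the column names from your example (lut_sketch, dta_filled and var_dts_fill are made-up names); the full base-R reprex follows below.
# Sketch: build the per-ID summary once, then join it back and coalesce the
# missing values (uses max as the summary function, as in your test).
lut_sketch <- dta_all_dates %>%
  group_by(var_id_all) %>%
  summarise(var_dts_fill = format.Date(max(var_dts_all), "%Y%m"))

dta_filled <- dta_miss_dates %>%
  left_join(lut_sketch, by = c("var_id_miss" = "var_id_all")) %>%
  mutate(var_dts_miss = coalesce(var_dts_miss, var_dts_fill)) %>%
  select(-var_dts_fill)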
# Settings ----------------------------------------------------------------
# Libraries
library("tidyverse")
library("stringi")
library("progress")
set.seed(123)
# Tibble sizes
# Reduce sample sizes for faster development
reduce_example_date <- FALSE # FALSE reflects actual experiment settings
nrow_missing_dates <- 2e5
nrow_all_dates <- 7e6
if (reduce_example_date) {
  nrow_missing_dates <- nrow_missing_dates / 100
  nrow_all_dates <- nrow_all_dates / 100
}
# Sample data with missing dates
dta_miss_dates <- tibble(
  var_id_miss = sample(1e6:9e6, nrow_missing_dates, replace = FALSE),
  var_dts_miss = sample(c(
    seq.Date(
      from = Sys.Date() - 2 * 365,
      to = Sys.Date(),
      by = "day"
    ),
    rep.int(NA, 100)
  ), nrow_missing_dates, replace = TRUE)
) %>%
  mutate(var_dts_miss = format.Date(var_dts_miss, "%Y%m"))
# Data with all dates
dta_all_dates <- tibble(
  var_id_all = sample(dta_miss_dates$var_id_miss, nrow_all_dates, TRUE),
  var_grp_sth = stri_rand_strings(
    n = nrow_all_dates,
    length = 3,
    pattern = "[A-D]"
  ),
  var_dts_all = sample(
    seq.Date(
      from = Sys.Date() - 50,
      to = Sys.Date(),
      by = "day"
    ),
    nrow_all_dates,
    replace = TRUE
  )
)
# pre-calculate ID summaries based on search_fun
prepare_data <- function(dat, id_col, date_coll, search_fun) {
  dat %>%
    group_by({{id_col}}) %>%
    summarise(across(.cols = {{date_coll}}, .fns = {{search_fun}})) %>%
    mutate(across(.cols = {{date_coll}}, format.Date, format = "%Y%m"))
}
# prepare a lookup table, using desired summary function
system.time( {
  lut <- prepare_data(dta_all_dates, var_id_all, var_dts_all, max)
  # identify missing indices
  na_idx <- which(is.na(dta_miss_dates$var_dts_miss))
  # fill missing indices, merge on ID
  dta_miss_dates[na_idx, "var_dts_miss"] <- merge(
    dta_miss_dates[na_idx, ], lut,
    by.x = "var_id_miss",
    by.y = "var_id_all",
    all.x = TRUE, sort = FALSE
  )$var_dts_all
} )
#> `summarise()` ungrouping output (override with `.groups` argument)
#> user system elapsed
#> 31.721 0.176 31.935
any(is.na(dta_miss_dates$var_dts_miss))
#> [1] FALSE
Created on 2020-12-06 by the reprex package (v0.3.0)
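The lookup-table step stays agnostic to the summary function: anything that collapses a Date vector to a single value can be plugged into prepare_data() unchanged. A quick, untimed sketch (lut_min and lut_median are made-up names):
# Same lookup, different summary functions (sketch only, not benchmarked)
lut_min    <- prepare_data(dta_all_dates, var_id_all, var_dts_all, min)
lut_median <- prepare_data(dta_all_dates, var_id_all, var_dts_all, median)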
You can use {data.table} to crunch through your large table, which makes the data preparation even faster. Something like:
library(data.table)
prepare_data2 <- function(dat, id_col, date_coll, search_fun) {
  data.table(dat)[, .(var_dts_all = search_fun(.SD[[date_coll]])),
                  by = c(eval(id_col)), .SDcols = c(eval(date_coll))]
}
system.time(lut2 <- prepare_data2(dta_all_dates, "var_id_all", "var_dts_all", max))
# user system elapsed
# 7.248 0.095 6.991
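To plug lut2 into the fill step, one hedged option is an explicit match() on the IDs of the missing rows; this sketch assumes na_idx and dta_miss_dates from the reprex above, and note that prepare_data2() returns unformatted dates, so the "%Y%m" formatting happens here (fill_vals is a made-up name):
# Sketch: look up each missing ID's summarised date in lut2 and format it
fill_vals <- format.Date(lut2$var_dts_all, "%Y%m")
dta_miss_dates$var_dts_miss[na_idx] <-
  fill_vals[match(dta_miss_dates$var_id_miss[na_idx], lut2$var_id_all)]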