More performant way of conditionally filling in values in one data frame using second data frame

Background

I am working with two data frames. dta_miss_dates has approximately 200K rows and consists of an integer vector and a character vector. The character vector is derived from dates using format.Date(x, "%Y%m") and has roughly 20% missing values.
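
For illustration only (hypothetical values, not taken from the actual data), this is what such a "%Y%m" character vector looks like, including a missing entry:

# Hypothetical dates, just to show the "%Y%m" derivation described above
format.Date(as.Date(c("2020-11-15", NA, "2020-03-02")), "%Y%m")
#> [1] "202011" NA       "202003"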

Task

The task is to fill in the missing values using the values available in dta_all_dates. That tibble has approximately 7 million rows. The filling algorithm works as follows:

  1. For each ID with a missing date (var_id_miss), the corresponding ID in the table with all dates (var_id_all) is matched.
  2. A function summarising the matched dates is then applied. Most commonly this is max, but the solution has to be agnostic enough to accommodate other functions, such as min or median (see the sketch after this list).
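
A minimal sketch of those two steps for a single ID (the column names come from the example data defined below; id_with_missing_date and var_dts_fill are hypothetical names introduced only for this sketch, and max stands in for the summary function):

# Take one ID whose date is missing (hypothetical helper object, only for this sketch)
id_with_missing_date <- dta_miss_dates$var_id_miss[is.na(dta_miss_dates$var_dts_miss)][1]

# Match that ID in dta_all_dates and summarise its dates
dta_all_dates %>%
  filter(var_id_all == id_with_missing_date) %>%
  summarise(var_dts_fill = max(var_dts_all)) %>%
  pull(var_dts_fill) %>%
  format.Date(format = "%Y%m")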

Problem

The solution outlined below uses map_chr from the {purrr} package to apply the summary function to the subset corresponding to a given ID. This provides the required flexibility, but it is far too slow when deployed on the actual data.

Example

Data

To make the example data resemble the actual data, reduce_example_date <- TRUE should be set to FALSE.

# Settings ----------------------------------------------------------------

# Libraries
library("tidyverse")
library("stringi")
library("progress")

set.seed(123)

# Tibble sizes
# Reduce sample sizes for faster development
reduce_example_date <- TRUE # FALSE reflects actual experiment settings

nrow_missing_dates <- 2e5
nrow_all_dates <- 7e6

if (reduce_example_date) {
  nrow_missing_dates <- nrow_missing_dates / 100
  nrow_all_dates <- nrow_all_dates / 100
}


# Sample data with missing dates
dta_miss_dates <- tibble(
  var_id_miss = sample(1e6:9e6, nrow_missing_dates, replace = FALSE),
  var_dts_miss = sample(c(
    seq.Date(
      from = Sys.Date() - 2 * 365,
      to = Sys.Date(),
      by = "day"
    ),
    rep.int(NA, 100)
  ), nrow_missing_dates, replace = TRUE)
) %>%
  mutate(var_dts_miss = format.Date(var_dts_miss, "%Y%m"))

# Data with all dates
dta_all_dates <- tibble(
  var_id_all = sample(dta_miss_dates$var_id_miss, nrow_all_dates, TRUE),
  var_grp_sth = stri_rand_strings(
    n = nrow_all_dates,
    length = 3,
    pattern = "[A-D]"
  ),
  var_dts_all = sample(
    seq.Date(
      from = Sys.Date() - 50,
      to = Sys.Date(),
      by = "day"
    ),
    nrow_all_dates,
    replace = TRUE
  )
) 

Matching

# Matching Functions ------------------------------------------------------

match_via_purr <-
  function(id_col,
           dta_dates,
           search_fun,
           date_coll,
           verbose) {

    # Iterates over IDs and where date is missing conducts a search
    f_match <- function(id_obs) {

      filter(dta_dates, var_id_all == id_obs) %>%
        summarise(across(.cols = {{date_coll}}, .fns = {{search_fun}})) %>%
        pull({{date_coll}}) %>%
        format.Date(format = "%Y%m")

    }

    pb <- progress_bar$new(total = length({{id_col}}),
                           format = "[:bar] :current / :total (:percent) ETA: :eta")

    map_chr(.x = {{id_col}}, .f = ~ {pb$tick(); f_match(id_obs = .x)})
  }

Test

dta_miss_dates %>%
  mutate(var_dts_miss = if_else(
    is.na(var_dts_miss),
    match_via_purr(
      id_col = var_id_miss,
      dta_dates = dta_all_dates,
      search_fun = max,
      date_coll = var_dts_all
    ),
    var_dts_miss
  ))

Answer

Here is a solution using base R merge. I think you should prepare the summary lookup table in advance rather than calling the summary repeatedly inside a vectorised loop. {dplyr} is fairly fast, but it has some known performance issues, and it is relatively easy to write something that does more work than necessary.

The reprex below "fills in" your dataset in about 30 seconds on my machine, whereas the ETA for your {purrr}-based approach was around 5 hours.

# Settings ----------------------------------------------------------------

# Libraries
library("tidyverse")
library("stringi")
library("progress")

set.seed(123)

# Tibble sizes
# Reduce sample sizes for faster development
reduce_example_date <- FALSE # FALSE reflects actual experiment settings

nrow_missing_dates <- 2e5
nrow_all_dates <- 7e6

if (reduce_example_date) {
  nrow_missing_dates <- nrow_missing_dates / 100
  nrow_all_dates <- nrow_all_dates / 100
}

# Sample data with missing dates
dta_miss_dates <- tibble(
  var_id_miss = sample(1e6:9e6, nrow_missing_dates, replace = FALSE),
  var_dts_miss = sample(c(
    seq.Date(
      from = Sys.Date() - 2 * 365,
      to = Sys.Date(),
      by = "day"
    ),
    rep.int(NA, 100)
  ), nrow_missing_dates, replace = TRUE)
) %>%
  mutate(var_dts_miss = format.Date(var_dts_miss, "%Y%m"))

# Data with all dates
dta_all_dates <- tibble(
  var_id_all = sample(dta_miss_dates$var_id_miss, nrow_all_dates, TRUE),
  var_grp_sth = stri_rand_strings(
    n = nrow_all_dates,
    length = 3,
    pattern = "[A-D]"
  ),
  var_dts_all = sample(
    seq.Date(
      from = Sys.Date() - 50,
      to = Sys.Date(),
      by = "day"
    ),
    nrow_all_dates,
    replace = TRUE
  )
) 

# pre-calculate ID summaries based on search_fun

prepare_data <- function(dat, id_col, date_coll, search_fun) {
  dat %>%
    group_by({{id_col}}) %>%
    summarise(across(.cols = {{date_coll}}, .fns = {{search_fun}})) %>%
    mutate(across(.cols = {{date_coll}}, format.Date, format = "%Y%m"))
}

# prepare a lookup table, using desired summary function
system.time( {
  lut <- prepare_data(dta_all_dates, var_id_all, var_dts_all, max)

  # identify missing indices
  na_idx <- which(is.na(dta_miss_dates$var_dts_miss))
  
  # fill missing indices, merge on ID
  dta_miss_dates[na_idx, 'var_dts_miss'] <- merge(dta_miss_dates[na_idx,], lut, 
                                                  by.x = "var_id_miss", 
                                                  by.y = "var_id_all", 
                                                  all.x = TRUE, sort=FALSE)$var_dts_all
} )
#> `summarise()` ungrouping output (override with `.groups` argument)
#>    user  system elapsed 
#>  31.721   0.176  31.935

any(is.na(dta_miss_dates$var_dts_miss))
#> [1] FALSE

Created on 2020-12-06 by the reprex package (v0.3.0)

You could make the data preparation even faster by crunching your large table with {data.table}. Something like:

library(data.table)

prepare_data2 <- function(dat, id_col, date_coll, search_fun) {
  data.table(dat)[, .(var_dts_all = search_fun(.SD[[date_coll]])),
                  by = c(eval(id_col)),
                  .SDcols = c(eval(date_coll))]
}
system.time(lut2 <- prepare_data2(dta_all_dates, "var_id_all", "var_dts_all", max))
#   user  system elapsed 
#  7.248   0.095   6.991
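
For completeness, a minimal sketch (not part of the original answer) of applying lut2 with a {data.table} update join instead of merge(); var_dts_lut is a temporary helper column introduced only for this sketch:

# Sketch only: fill the gaps from lut2 by reference instead of using merge()
dta_miss_dt <- as.data.table(dta_miss_dates)

# Bring the summarised date over by ID; i.var_dts_all refers to lut2's column
dta_miss_dt[lut2, on = c(var_id_miss = "var_id_all"),
            var_dts_lut := format.Date(i.var_dts_all, "%Y%m")]

# Overwrite only the rows where the original value is missing, then drop the helper
dta_miss_dt[is.na(var_dts_miss), var_dts_miss := var_dts_lut]
dta_miss_dt[, var_dts_lut := NULL]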