将 purrr::map2() 与 dbplyr 一起使用
Using purrr::map2() with dbplyr
我想从一张表("positions")中选出特定列("position")的值落在另一张表("my_ranges")所定义范围内的行,然后为这些行添加来自 "my_ranges" 表的分组标签。
我可以用 tibble 加上几次 purrr::map2 调用做到这一点,但同样的方法不适用于 dbplyr 的数据库 tibble。这是预期的行为吗?如果是,我应该换用什么方法,才能用 dbplyr 完成此类任务?
这是我的例子:
library("tidyverse")

# Fix the RNG seed so the simulated data below is reproducible.
set.seed(42)

# Closed intervals [start, end], each tagged with a group label.
my_ranges <- tibble(
  group_id = c("a", "b", "c", "d"),
  start = c(1, 7, 2, 25),
  end = c(5, 23, 7, 29)
)

# 100 integer positions (runif() draws truncated by as.integer()), each paired
# with a random 10-character annotation string.
positions <- tibble(
  position = as.integer(runif(n = 100, min = 0, max = 30)),
  annotation = stringi::stri_rand_strings(n = 100, length = 10)
)

# note: this works as I expect and returns a tibble with 106 obs of 3 variables:
# For every range, keep the positions that fall inside [start, end], tag them
# with that range's group_id, then stack the per-range pieces. The ranges
# overlap, so a single position can show up under more than one group.
result <- pmap(
  list(my_ranges$start, my_ranges$end, my_ranges$group_id),
  function(lower, upper, label) {
    in_range <- between(positions$position, lower, upper)
    positions %>%
      filter(in_range) %>%
      mutate(group_id = label)
  }
) %>%
  bind_rows()
# next, make an in-memory db for testing:
# RSQLite's dbConnect() takes the database location via `dbname`;
# ":memory:" requests a temporary in-memory database.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
# copy data to db
copy_to(con, my_ranges, "my_ranges", temporary = FALSE)
copy_to(con, positions, "positions", temporary = FALSE)
# get db-backed tibbles:
my_ranges_db <- tbl(con, "my_ranges")
positions_db <- tbl(con, "positions")
# note: this does not work as I expect, and instead returns a tibble with 0 observations of 0 variables:
# database range-based query:
# NOTE(review): a db-backed tbl describes a query rather than holding data, so
# `my_ranges_db$start` etc. presumably do not yield column vectors (likely
# NULL). map2() would then iterate over nothing, which matches the empty
# result described above -- confirm against the dbplyr documentation.
db_result <- map2(.x = my_ranges_db$start, .y = my_ranges_db$end,
                  .f = function(x, y) {
                    between(positions_db$position, x, y)
                  }) %>%
  map2(.y = my_ranges_db$group_id,
       .f = function(x, y){
         positions_db %>%
           filter(x) %>%
           mutate(group_id = y)}
  ) %>% bind_rows()
dbplyr 会把 R 代码翻译成 SQL。SQL 中不存在列表,而 map 返回的正是列表,因此 map 无法被翻译成 SQL。
目前得到翻译的主要是 dplyr 函数和部分 base 函数;据我了解,tidyr 函数的翻译也在进行中。使用 dbplyr 时,请尽量按 SQL 的逻辑来设计你的做法,否则代码很容易出错。
只要每次迭代生成的表具有相同的维度(列结构一致),就可能有一种巧妙的办法把整个操作推送到数据库中执行。思路是结合使用 purrr 中的 map() 和 reduce()。每个 tbl_sql() 操作都是惰性的,所以我们可以逐个遍历它们,而不必担心向数据库发送一大堆查询;然后用 union() 借助数据库的 UNION 子句,把每次迭代生成的 SQL 依次拼接到下一次迭代的 SQL 上。下面是一个例子:
library(dbplyr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(purrr, warn.conflicts = FALSE)
library(DBI, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)

# RSQLite's dbConnect() takes the database location via `dbname`;
# ":memory:" requests a temporary in-memory database.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
db_mtcars <- copy_to(con, mtcars)

cyls <- c(4, 6, 8)

# Build one lazy query per cylinder count, then fold the queries together with
# union() so the database receives a single statement (see show_query() below).
# SQL UNION drops duplicate rows; switch to union_all() if duplicates must be
# kept.
all <- cyls %>%
  map(function(cyl_value) {
    db_mtcars %>%
      # `!!` inlines the local R value into the generated SQL.
      filter(cyl == !!cyl_value) %>%
      summarise(mpg = mean(mpg, na.rm = TRUE))
  }) %>%
  reduce(union)

all
#> # Source: lazy query [?? x 1]
#> # Database: sqlite 3.22.0 [:memory:]
#> mpg
#> <dbl>
#> 1 15.1
#> 2 19.7
#> 3 26.7

show_query(all)
#> <SQL>
#> SELECT AVG(`mpg`) AS `mpg`
#> FROM (SELECT *
#> FROM (SELECT *
#> FROM `mtcars`)
#> WHERE (`cyl` = 4.0))
#> UNION
#> SELECT AVG(`mpg`) AS `mpg`
#> FROM (SELECT *
#> FROM (SELECT *
#> FROM `mtcars`)
#> WHERE (`cyl` = 6.0))
#> UNION
#> SELECT AVG(`mpg`) AS `mpg`
#> FROM (SELECT *
#> FROM (SELECT *
#> FROM `mtcars`)
#> WHERE (`cyl` = 8.0))

# Release the database connection once the results have been inspected.
dbDisconnect(con)
我想从一张表("positions")中选出特定列("position")的值落在另一张表("my_ranges")所定义范围内的行,然后为这些行添加来自 "my_ranges" 表的分组标签。
我可以用 tibble 加上几次 purrr::map2 调用做到这一点,但同样的方法不适用于 dbplyr 的数据库 tibble。这是预期的行为吗?如果是,我应该换用什么方法,才能用 dbplyr 完成此类任务?
这是我的例子:
library("tidyverse")

# Fix the RNG seed so the simulated data below is reproducible.
set.seed(42)

# Closed intervals [start, end], each tagged with a group label.
my_ranges <- tibble(
  group_id = c("a", "b", "c", "d"),
  start = c(1, 7, 2, 25),
  end = c(5, 23, 7, 29)
)

# 100 integer positions (runif() draws truncated by as.integer()), each paired
# with a random 10-character annotation string.
positions <- tibble(
  position = as.integer(runif(n = 100, min = 0, max = 30)),
  annotation = stringi::stri_rand_strings(n = 100, length = 10)
)

# note: this works as I expect and returns a tibble with 106 obs of 3 variables:
# For every range, keep the positions that fall inside [start, end], tag them
# with that range's group_id, then stack the per-range pieces. The ranges
# overlap, so a single position can show up under more than one group.
result <- pmap(
  list(my_ranges$start, my_ranges$end, my_ranges$group_id),
  function(lower, upper, label) {
    in_range <- between(positions$position, lower, upper)
    positions %>%
      filter(in_range) %>%
      mutate(group_id = label)
  }
) %>%
  bind_rows()
# next, make an in-memory db for testing:
# RSQLite's dbConnect() takes the database location via `dbname`;
# ":memory:" requests a temporary in-memory database.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
# copy data to db
copy_to(con, my_ranges, "my_ranges", temporary = FALSE)
copy_to(con, positions, "positions", temporary = FALSE)
# get db-backed tibbles:
my_ranges_db <- tbl(con, "my_ranges")
positions_db <- tbl(con, "positions")
# note: this does not work as I expect, and instead returns a tibble with 0 observations of 0 variables:
# database range-based query:
# NOTE(review): a db-backed tbl describes a query rather than holding data, so
# `my_ranges_db$start` etc. presumably do not yield column vectors (likely
# NULL). map2() would then iterate over nothing, which matches the empty
# result described above -- confirm against the dbplyr documentation.
db_result <- map2(.x = my_ranges_db$start, .y = my_ranges_db$end,
                  .f = function(x, y) {
                    between(positions_db$position, x, y)
                  }) %>%
  map2(.y = my_ranges_db$group_id,
       .f = function(x, y){
         positions_db %>%
           filter(x) %>%
           mutate(group_id = y)}
  ) %>% bind_rows()
dbplyr 会把 R 代码翻译成 SQL。SQL 中不存在列表,而 map 返回的正是列表,因此 map 无法被翻译成 SQL。
目前得到翻译的主要是 dplyr 函数和部分 base 函数;据我了解,tidyr 函数的翻译也在进行中。使用 dbplyr 时,请尽量按 SQL 的逻辑来设计你的做法,否则代码很容易出错。
只要每次迭代生成的表具有相同的维度(列结构一致),就可能有一种巧妙的办法把整个操作推送到数据库中执行。思路是结合使用 purrr 中的 map() 和 reduce()。每个 tbl_sql() 操作都是惰性的,所以我们可以逐个遍历它们,而不必担心向数据库发送一大堆查询;然后用 union() 借助数据库的 UNION 子句,把每次迭代生成的 SQL 依次拼接到下一次迭代的 SQL 上。下面是一个例子:
library(dbplyr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(purrr, warn.conflicts = FALSE)
library(DBI, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)

# RSQLite's dbConnect() takes the database location via `dbname`;
# ":memory:" requests a temporary in-memory database.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")
db_mtcars <- copy_to(con, mtcars)

cyls <- c(4, 6, 8)

# Build one lazy query per cylinder count, then fold the queries together with
# union() so the database receives a single statement (see show_query() below).
# SQL UNION drops duplicate rows; switch to union_all() if duplicates must be
# kept.
all <- cyls %>%
  map(function(cyl_value) {
    db_mtcars %>%
      # `!!` inlines the local R value into the generated SQL.
      filter(cyl == !!cyl_value) %>%
      summarise(mpg = mean(mpg, na.rm = TRUE))
  }) %>%
  reduce(union)

all
#> # Source: lazy query [?? x 1]
#> # Database: sqlite 3.22.0 [:memory:]
#> mpg
#> <dbl>
#> 1 15.1
#> 2 19.7
#> 3 26.7

show_query(all)
#> <SQL>
#> SELECT AVG(`mpg`) AS `mpg`
#> FROM (SELECT *
#> FROM (SELECT *
#> FROM `mtcars`)
#> WHERE (`cyl` = 4.0))
#> UNION
#> SELECT AVG(`mpg`) AS `mpg`
#> FROM (SELECT *
#> FROM (SELECT *
#> FROM `mtcars`)
#> WHERE (`cyl` = 6.0))
#> UNION
#> SELECT AVG(`mpg`) AS `mpg`
#> FROM (SELECT *
#> FROM (SELECT *
#> FROM `mtcars`)
#> WHERE (`cyl` = 8.0))

# Release the database connection once the results have been inspected.
dbDisconnect(con)