根据 dbplyr 中的列值创建数组
Create array based on column values in dbplyr
我希望能够使用 dbplyr
.
做一些与此等效的事情
library(magrittr)
library(tidyverse)
library(bigrquery)
library(dbplyr)
#>
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#>
#> ident, sql
bq_deauth()
bq_auth()
bq_conn = dbConnect(
bigquery(),
project = "eacri-genomics"
)
df = tibble(
chr = c(1,1,1,2,2,3),
start = c(0, 10, 12, 0, 5, 1),
end = c(2, 11, 15, 1, 8, 3)
)
df
#> # A tibble: 6 x 3
#> chr start end
#> <dbl> <dbl> <dbl>
#> 1 1 0 2
#> 2 1 10 11
#> 3 1 12 15
#> 4 2 0 1
#> 5 2 5 8
#> 6 3 1 3
df %>%
rowwise() %>% mutate(range = list(seq(start, end)))
#> # A tibble: 6 x 4
#> # Rowwise:
#> chr start end range
#> <dbl> <dbl> <dbl> <list>
#> 1 1 0 2 <int [3]>
#> 2 1 10 11 <int [2]>
#> 3 1 12 15 <int [4]>
#> 4 2 0 1 <int [2]>
#> 5 2 5 8 <int [4]>
#> 6 3 1 3 <int [3]>
df %>%
rowwise() %>% mutate(range = list(seq(start, end))) %>%
unnest(range)
#> # A tibble: 18 x 4
#> chr start end range
#> <dbl> <dbl> <dbl> <int>
#> 1 1 0 2 0
#> 2 1 0 2 1
#> 3 1 0 2 2
#> 4 1 10 11 10
#> 5 1 10 11 11
#> 6 1 12 15 12
#> 7 1 12 15 13
#> 8 1 12 15 14
#> 9 1 12 15 15
#> 10 2 0 1 0
#> 11 2 0 1 1
#> 12 2 5 8 5
#> 13 2 5 8 6
#> 14 2 5 8 7
#> 15 2 5 8 8
#> 16 3 1 3 1
#> 17 3 1 3 2
#> 18 3 1 3 3
dbWriteTable(
bq_conn,
name = "balter.range_test",
value = df,
overwrite = T
)
df_bq = tbl(bq_conn, "balter.range_test")
df_bq %>%
rowwise() %>% mutate(range = list(seq(start, end)))
#> Error in UseMethod("rowwise"): no applicable method for 'rowwise' applied to an object of class "c('tbl_BigQueryConnection', 'tbl_dbi', 'tbl_sql', 'tbl_lazy', 'tbl')"
df_bq %>%
mutate(range = generate_series(start, end))
#> Error: Job 'eacri-genomics.job_dnmwBmRtY9j0heYY1AtmwEtgblGp.US' failed
#> x Function not found: generate_series at [1:31] [invalidQuery]
由 reprex package (v1.0.0)
于 2021-02-18 创建
在尝试使用 dbplyr 执行此操作之前,值得首先考虑您使用的数据库是否支持具有 list/array 类型的列。这是您的 range <list>
列所必需的。
我怀疑 (1) 许多数据库 common/widely 不支持此功能,并且 (2) dbplyr 目前不提供直接翻译。 (比如看这两个问题: and )。
但是由于您的序列只是一个数字范围,您可以通过连接完成同样的事情:
df = tibble(
chr = c(1,1,1,2,2,3),
start = c(0, 10, 12, 0, 5, 1),
end = c(2, 11, 15, 1, 8, 3)
)
whole_range = tibble(range = -100:100)
# need start < min(df$start) and end > max(df$end)
dbWriteTable(conn, name = "df", value = df, overwrite = T)
dbWriteTable(conn, name = "whole_range", value = whole_range, overwrite = T)
remote_df = tbl(conn, "df")
remote_whole_range = tbl(conn, "whole_range")
# create dummy columns to join
remote_df = remote_df %>% mutate(ones = 1)
remote_whole_range = remote_whole_range %>% mutate(ones = 1)
# join and filter
remote_output = inner_join(remote_df, remote_whole_range, by = "ones") %>%
filter(start <= range,
range <= end)
我发现了一个非规范的 postgresql 函数,但在 BigQuery 中有效:generate_array
。
library(magrittr)
library(tidyverse)
library(dbplyr)
#>
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#>
#> ident, sql
library(bigrquery)
library(DBI)
library(RPostgres)
library(RPostgreSQL)
bq_deauth()
bq_auth(email="ariel.balter@gmail.com")
bq_conn = dbConnect(
bigquery(),
project = "elite-magpie-257717",
dataset = "test_dataset"
)
df = tibble(
chr = c(1,1,1,2,2,3),
start = c(0, 10, 12, 0, 5, 1),
end = c(2, 11, 15, 1, 8, 3)
)
df %>%
rowwise() %>% mutate(range = list(seq(start, end)))
#> # A tibble: 6 x 4
#> # Rowwise:
#> chr start end range
#> <dbl> <dbl> <dbl> <list>
#> 1 1 0 2 <int [3]>
#> 2 1 10 11 <int [2]>
#> 3 1 12 15 <int [4]>
#> 4 2 0 1 <int [2]>
#> 5 2 5 8 <int [4]>
#> 6 3 1 3 <int [3]>
df %>%
rowwise() %>% mutate(range = list(seq(start, end))) %>%
unnest(range)
#> # A tibble: 18 x 4
#> chr start end range
#> <dbl> <dbl> <dbl> <int>
#> 1 1 0 2 0
#> 2 1 0 2 1
#> 3 1 0 2 2
#> 4 1 10 11 10
#> 5 1 10 11 11
#> 6 1 12 15 12
#> 7 1 12 15 13
#> 8 1 12 15 14
#> 9 1 12 15 15
#> 10 2 0 1 0
#> 11 2 0 1 1
#> 12 2 5 8 5
#> 13 2 5 8 6
#> 14 2 5 8 7
#> 15 2 5 8 8
#> 16 3 1 3 1
#> 17 3 1 3 2
#> 18 3 1 3 3
dbWriteTable(
bq_conn,
name = "test_dataset.range_test",
value = df,
overwrite = T
)
#> Auto-refreshing stale OAuth token.
df_bq = tbl(bq_conn, "test_dataset.range_test")
df_bq %>%
mutate(range = generate_array(start, end, 1))
#> # Source: lazy query [?? x 4]
#> # Database: BigQueryConnection
#> end start chr range
#> <int> <int> <int> <list>
#> 1 2 0 1 <dbl [3]>
#> 2 11 10 1 <dbl [2]>
#> 3 15 12 1 <dbl [4]>
#> 4 1 0 2 <dbl [2]>
#> 5 8 5 2 <dbl [4]>
#> 6 3 1 3 <dbl [3]>
由 reprex package (v1.0.0)
于 2021 年 2 月 19 日创建
我希望能够使用 dbplyr
.
library(magrittr)
library(tidyverse)
library(bigrquery)
library(dbplyr)
#>
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#>
#> ident, sql
bq_deauth()
bq_auth()
bq_conn = dbConnect(
bigquery(),
project = "eacri-genomics"
)
df = tibble(
chr = c(1,1,1,2,2,3),
start = c(0, 10, 12, 0, 5, 1),
end = c(2, 11, 15, 1, 8, 3)
)
df
#> # A tibble: 6 x 3
#> chr start end
#> <dbl> <dbl> <dbl>
#> 1 1 0 2
#> 2 1 10 11
#> 3 1 12 15
#> 4 2 0 1
#> 5 2 5 8
#> 6 3 1 3
df %>%
rowwise() %>% mutate(range = list(seq(start, end)))
#> # A tibble: 6 x 4
#> # Rowwise:
#> chr start end range
#> <dbl> <dbl> <dbl> <list>
#> 1 1 0 2 <int [3]>
#> 2 1 10 11 <int [2]>
#> 3 1 12 15 <int [4]>
#> 4 2 0 1 <int [2]>
#> 5 2 5 8 <int [4]>
#> 6 3 1 3 <int [3]>
df %>%
rowwise() %>% mutate(range = list(seq(start, end))) %>%
unnest(range)
#> # A tibble: 18 x 4
#> chr start end range
#> <dbl> <dbl> <dbl> <int>
#> 1 1 0 2 0
#> 2 1 0 2 1
#> 3 1 0 2 2
#> 4 1 10 11 10
#> 5 1 10 11 11
#> 6 1 12 15 12
#> 7 1 12 15 13
#> 8 1 12 15 14
#> 9 1 12 15 15
#> 10 2 0 1 0
#> 11 2 0 1 1
#> 12 2 5 8 5
#> 13 2 5 8 6
#> 14 2 5 8 7
#> 15 2 5 8 8
#> 16 3 1 3 1
#> 17 3 1 3 2
#> 18 3 1 3 3
dbWriteTable(
bq_conn,
name = "balter.range_test",
value = df,
overwrite = T
)
df_bq = tbl(bq_conn, "balter.range_test")
df_bq %>%
rowwise() %>% mutate(range = list(seq(start, end)))
#> Error in UseMethod("rowwise"): no applicable method for 'rowwise' applied to an object of class "c('tbl_BigQueryConnection', 'tbl_dbi', 'tbl_sql', 'tbl_lazy', 'tbl')"
df_bq %>%
mutate(range = generate_series(start, end))
#> Error: Job 'eacri-genomics.job_dnmwBmRtY9j0heYY1AtmwEtgblGp.US' failed
#> x Function not found: generate_series at [1:31] [invalidQuery]
由 reprex package (v1.0.0)
于 2021-02-18 创建在尝试使用 dbplyr 执行此操作之前,值得首先考虑您使用的数据库是否支持具有 list/array 类型的列。这是您的 range <list>
列所必需的。
我怀疑 (1) 许多数据库 common/widely 不支持此功能,并且 (2) dbplyr 目前不提供直接翻译。 (比如看这两个问题:
但是由于您的序列只是一个数字范围,您可以通过连接完成同样的事情:
df = tibble(
chr = c(1,1,1,2,2,3),
start = c(0, 10, 12, 0, 5, 1),
end = c(2, 11, 15, 1, 8, 3)
)
whole_range = tibble(range = -100:100)
# need start < min(df$start) and end > max(df$end)
dbWriteTable(conn, name = "df", value = df, overwrite = T)
dbWriteTable(conn, name = "whole_range", value = whole_range, overwrite = T)
remote_df = tbl(conn, "df")
remote_whole_range = tbl(conn, "whole_range")
# create dummy columns to join
remote_df = remote_df %>% mutate(ones = 1)
remote_whole_range = remote_whole_range %>% mutate(ones = 1)
# join and filter
remote_output = inner_join(remote_df, remote_whole_range, by = "ones") %>%
filter(start <= range,
range <= end)
我发现了一个非规范的 postgresql 函数,但在 BigQuery 中有效:generate_array
。
library(magrittr)
library(tidyverse)
library(dbplyr)
#>
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#>
#> ident, sql
library(bigrquery)
library(DBI)
library(RPostgres)
library(RPostgreSQL)
bq_deauth()
bq_auth(email="ariel.balter@gmail.com")
bq_conn = dbConnect(
bigquery(),
project = "elite-magpie-257717",
dataset = "test_dataset"
)
df = tibble(
chr = c(1,1,1,2,2,3),
start = c(0, 10, 12, 0, 5, 1),
end = c(2, 11, 15, 1, 8, 3)
)
df %>%
rowwise() %>% mutate(range = list(seq(start, end)))
#> # A tibble: 6 x 4
#> # Rowwise:
#> chr start end range
#> <dbl> <dbl> <dbl> <list>
#> 1 1 0 2 <int [3]>
#> 2 1 10 11 <int [2]>
#> 3 1 12 15 <int [4]>
#> 4 2 0 1 <int [2]>
#> 5 2 5 8 <int [4]>
#> 6 3 1 3 <int [3]>
df %>%
rowwise() %>% mutate(range = list(seq(start, end))) %>%
unnest(range)
#> # A tibble: 18 x 4
#> chr start end range
#> <dbl> <dbl> <dbl> <int>
#> 1 1 0 2 0
#> 2 1 0 2 1
#> 3 1 0 2 2
#> 4 1 10 11 10
#> 5 1 10 11 11
#> 6 1 12 15 12
#> 7 1 12 15 13
#> 8 1 12 15 14
#> 9 1 12 15 15
#> 10 2 0 1 0
#> 11 2 0 1 1
#> 12 2 5 8 5
#> 13 2 5 8 6
#> 14 2 5 8 7
#> 15 2 5 8 8
#> 16 3 1 3 1
#> 17 3 1 3 2
#> 18 3 1 3 3
dbWriteTable(
bq_conn,
name = "test_dataset.range_test",
value = df,
overwrite = T
)
#> Auto-refreshing stale OAuth token.
df_bq = tbl(bq_conn, "test_dataset.range_test")
df_bq %>%
mutate(range = generate_array(start, end, 1))
#> # Source: lazy query [?? x 4]
#> # Database: BigQueryConnection
#> end start chr range
#> <int> <int> <int> <list>
#> 1 2 0 1 <dbl [3]>
#> 2 11 10 1 <dbl [2]>
#> 3 15 12 1 <dbl [4]>
#> 4 1 0 2 <dbl [2]>
#> 5 8 5 2 <dbl [4]>
#> 6 3 1 3 <dbl [3]>
由 reprex package (v1.0.0)
于 2021 年 2 月 19 日创建