使用 R 中的 dbplyr 统计 SQL 表每一行中 TRUE/FALSE 值的个数
Count TRUE/FALSE values per row of a SQL table using dbplyr from R
我使用 dbplyr 远程连接到一张 SQL 表。
其中一张表由一个 ID 列和若干存储 0/1 值的列组成(SQL bit 类型,
在 R 端被解释为布尔值 TRUE/FALSE),我只想得到每一行中 1 的总数。
在 R 中使用通常的 table 很简单,例如 rowSums()
不幸的是,它不能通过 dbplyr 工作(没有 SQL 等价物)。
出于明显的原因,由于底层 table 的大小,我不想 collect()
数据。
如何在这样的背景下实现这一目标?
library(dplyr)
# Local case: DF is an ordinary in-memory tibble, so rowSums() is evaluated by R.
DF <- tibble(ID = LETTERS[1:3], col1 = c(1,1,1), col2 = c(1,1,0), col3 = c(1,0,0))
# select(., -1) drops the first column (ID), leaving only the 0/1 columns to add up per row.
DF %>%
summarise(sum = rowSums(select(., -1)))
# Printed result (row number, then the row sum):
# sum
# 1 3
# 2 2
# 3 1
# If DF is a remote SQL table instead, the same call fails with the following error message:
# Error: nanodbc/nanodbc.cpp:1655: 42000: [Microsoft][ODBC SQL Server Driver][SQL Server]'rowSums' is not a recognized built-in function name. [Microsoft][ODBC SQL Server Driver][SQL Server]
编辑 - 添加最小的可重现示例
根据 @Simon.S.A. 的回复,下面给出一个最小可重现示例(MRE):
# Table creation
DF <- tibble(ID = LETTERS[1:3], col1 = c(1,1,1), col2 = c(1, 1,0), col3 = c(1,0,0))
# Rename all four columns (ID included, which becomes "col 1") so every name contains a space.
colnames(DF) <- c("col 1", "col 2", "col 3", "col 4")
# SQL simulation: copy DF into an in-memory SQLite database.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
con %>% tbl("DF") # just checking
# Preparing formula: one "`name` = sum(`name`)" string per column except the first.
cols <- colnames(DF)[-1]
all_equations <- paste0("`", cols, "` = sum(`", cols,"`)")
# Actual query: parse the strings into expressions and splice them into summarise().
con %>%
tbl("DF") %>%
summarise(!!!rlang::parse_exprs(all_equations))
# Error: near "=": syntax error
# NOTE(review): presumably each string parses to an unnamed `=` call, so the "=" ends up
# inside the generated SQL - confirm with show_query().
# %>% show_query() shows a strange query, but I am no SQL expert as you understood.
# also tried:
# all_equations <- paste(cols ,"= sum(",cols,")")
# all_equations <- paste0("`[", cols, "]` = sum(`[", cols,"]`)")
这里的部分挑战在于:dbplyr 会把 dplyr 命令翻译成 SQL,但只为部分 R 函数定义了翻译。由于标准的 dplyr 命令都有对应的翻译,我们可以使用 summarise。
要进行汇总,可以这样做:
library(dplyr)
library(rlang)
# Build one sum() expression per value column and splice them into summarise().
# The first column is the row identifier, so it is left out of the sums.
cols <- colnames(DF)[-1]
# Backticks keep non-syntactic column names (e.g. names containing spaces) parseable.
all_equations <- paste0("sum(`", cols, "`)")
# Name each parsed expression after its column instead of embedding "col = ..."
# in the text: a spliced "col = sum(col)" string becomes an unnamed `=` call,
# which ends up as a stray "=" in the SQL generated for a remote table.
sum_exprs <- setNames(parse_exprs(all_equations), cols)
DF %>%
  summarise(!!!sum_exprs)
想法是构建每个和的文本字符串,然后使用!!!parse_exprs(.)
将此文本转换为 R 代码。
编辑 - 相同的方法,但行总和
# Example table; all four columns are then renamed so that every name contains
# a space (the first column still holds the row identifiers).
DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)
colnames(DF) <- paste("col", 1:4)

# Simulate the remote SQL table with an in-memory SQLite database.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
tbl(con, "DF") # just checking

# Text of the row-sum expression: every column but the first, wrapped in
# backticks and joined with " + ".
cols <- colnames(DF)[-1]
eq <- paste(paste0("`", cols, "`"), collapse = " + ")

# Parse the text into a single R expression and use it as the new column.
tbl(con, "DF") %>%
  mutate(new = !!parse_expr(eq))
不过这仍然依赖 dbplyr 的翻译,因此可能无法正确处理反引号。
我发现一种可行的变通办法是直接写出实际的 SQL 查询,例如使用 DBI 包。但我仍然希望找到一种使用 dbplyr 的更优雅的方式。
# Workaround: write the SQL statement by hand and run it through DBI.
DF <- tibble(ID = LETTERS[1:3], col1 = c(1, 1, 1), col2 = c(1, 1, 0), col3 = c(1, 0, 0))
# Spaces in the column names make identifier quoting mandatory.
colnames(DF) <- c("col 1", "col 2", "col 3", "col 4")

# SQL simulation
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
con %>% tbl("DF") # just checking

cols <- colnames(DF)[-1] # columns to add up (everything but the first)
col2select <- colnames(DF) # columns to return alongside the sum

# dbQuoteIdentifier() quotes names the way the connected backend expects,
# rather than hard-coding one bracket style.
query <- paste0(
  "SELECT ",
  paste0(DBI::dbQuoteIdentifier(con, col2select), collapse = ", "),
  ", ",
  # Alternative kept from the original, casting each column before adding
  # (NOTE(review): presumably needed where bit columns cannot be summed - confirm):
  # paste0("CAST(", DBI::dbQuoteIdentifier(con, cols), " AS INT)", collapse = " + "),
  paste0(DBI::dbQuoteIdentifier(con, cols), collapse = " + "),
  " AS sum FROM DF"
)

# dbGetQuery() sends the statement, fetches the rows and clears the result set
# itself, so no result handle is left open if fetching fails.
DBI::dbGetQuery(con, query)
DBI::dbDisconnect(con)
使用 tidyr 可以写出可读性很好的 dplyr 代码,但它在大表上的性能如何尚不清楚。
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
# Example data: one identifier column plus three 0/1 columns.
DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)

# SQL simulation: from here on DF refers to the table copied into SQLite.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DF <- copy_to(con, DF, overwrite = TRUE)

# Stack every non-ID column into name/value pairs, then add up the values per ID.
result <- DF %>%
  pivot_longer(cols = -ID) %>%
  group_by(ID) %>%
  summarise(sum = sum(value, na.rm = TRUE))

result
#> # Source: lazy query [?? x 2]
#> # Database: sqlite 3.35.5 [:memory:]
#> ID sum
#> <chr> <dbl>
#> 1 A 3
#> 2 B 2
#> 3 C 1

# Inspect the SQL generated for the pipeline above.
show_query(result)
#> <SQL>
#> SELECT `ID`, SUM(`value`) AS `sum`
#> FROM (SELECT `ID`, 'col1' AS `name`, `col1` AS `value`
#> FROM `DF`
#> UNION ALL
#> SELECT `ID`, 'col2' AS `name`, `col2` AS `value`
#> FROM `DF`
#> UNION ALL
#> SELECT `ID`, 'col3' AS `name`, `col3` AS `value`
#> FROM `DF`)
#> GROUP BY `ID`
由 reprex package (v2.0.0)
于 2021 年 6 月 18 日创建
library(dplyr, warn.conflicts = FALSE)
library(DBI)
# Benchmark data: n rows with a repeating letter ID and 100 random 0/1 columns
# named col1 ... col100.
n <- 26e3
df <- tibble(ID = rep(LETTERS, times = n / 26))
for (i in seq_len(100)) {
  df[[paste0("col", i)]] <- rbinom(n = n, size = 1, prob = 0.5)
}

# SQL simulation: copy the table into an in-memory SQLite database.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
df_sql <- copy_to(con, df, overwrite = TRUE)
#' Add a per-row sum of the value columns to a database table.
#'
#' Builds one SELECT statement that returns every column of `df` plus a `sum`
#' column holding the sum of all columns not listed in `id_cols`, and wraps
#' that statement in a lazy table.
#'
#' @param df Table whose column names are read with `colnames()`.
#' @param con DBI connection used to quote identifiers and to run the query.
#' @param table Name of the table on `con` to select from. Defaults to `"df"`,
#'   the name that was previously hard-coded.
#' @param id_cols Names of the columns excluded from the sum. Defaults to `"ID"`.
#' @return The result of `tbl(con, sql(query))`.
row_sum_1 <- function(df, con, table = "df", id_cols = "ID") {
  all_cols <- colnames(df)
  sum_cols <- setdiff(all_cols, id_cols)
  # Without value columns the statement would read "SELECT ...,  AS sum",
  # which is not valid SQL; fail early with a readable message instead.
  if (length(sum_cols) == 0L) {
    stop("`df` has no columns left to sum after excluding `id_cols`.",
         call. = FALSE)
  }
  select_sql <- paste(DBI::dbQuoteIdentifier(con, all_cols), collapse = ", ")
  sum_sql <- paste(DBI::dbQuoteIdentifier(con, sum_cols), collapse = " + ")
  query <- paste0(
    "SELECT ", select_sql, ", ", sum_sql, " AS sum FROM ",
    DBI::dbQuoteIdentifier(con, table)
  )
  tbl(con, sql(query))
}
# Preview the result, keeping only the identifier and the computed row sum.
row_sum_1(df_sql, con) %>% select(ID, sum)
#> # Source: lazy query [?? x 2]
#> # Database: sqlite 3.35.5 [:memory:]
#> ID sum
#> <chr> <int>
#> 1 A 49
#> 2 B 53
#> 3 C 54
#> 4 D 49
#> 5 E 51
#> 6 F 46
#> 7 G 55
#> 8 H 48
#> 9 I 44
#> 10 J 50
#> # … with more rows
# Time compute() applied to the row-sum query.
system.time(compute(row_sum_1(df_sql, con)))
#> user system elapsed
#> 0.307 0.007 0.315
由 reprex package (v2.0.0)
于 2021-06-21 创建
我使用 dbplyr 远程连接到一张 SQL 表。
其中一张表由一个 ID 列和若干存储 0/1 值的列组成(SQL bit 类型,
在 R 端被解释为布尔值 TRUE/FALSE),我只想得到每一行中 1 的总数。
在 R 中使用通常的 table 很简单,例如 rowSums()
不幸的是,它不能通过 dbplyr 工作(没有 SQL 等价物)。
出于明显的原因,由于底层 table 的大小,我不想 collect()
数据。
如何在这样的背景下实现这一目标?
library(dplyr)
# Local case: DF is an ordinary in-memory tibble, so rowSums() is evaluated by R.
DF <- tibble(ID = LETTERS[1:3], col1 = c(1,1,1), col2 = c(1,1,0), col3 = c(1,0,0))
# select(., -1) drops the first column (ID), leaving only the 0/1 columns to add up per row.
DF %>%
summarise(sum = rowSums(select(., -1)))
# Printed result (row number, then the row sum):
# sum
# 1 3
# 2 2
# 3 1
# If DF is a remote SQL table instead, the same call fails with the following error message:
# Error: nanodbc/nanodbc.cpp:1655: 42000: [Microsoft][ODBC SQL Server Driver][SQL Server]'rowSums' is not a recognized built-in function name. [Microsoft][ODBC SQL Server Driver][SQL Server]
编辑 - 添加最小的可重现示例
根据 @Simon.S.A. 的回复,下面给出一个最小可重现示例(MRE):
# Table creation
DF <- tibble(ID = LETTERS[1:3], col1 = c(1,1,1), col2 = c(1, 1,0), col3 = c(1,0,0))
# Rename all four columns (ID included, which becomes "col 1") so every name contains a space.
colnames(DF) <- c("col 1", "col 2", "col 3", "col 4")
# SQL simulation: copy DF into an in-memory SQLite database.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
con %>% tbl("DF") # just checking
# Preparing formula: one "`name` = sum(`name`)" string per column except the first.
cols <- colnames(DF)[-1]
all_equations <- paste0("`", cols, "` = sum(`", cols,"`)")
# Actual query: parse the strings into expressions and splice them into summarise().
con %>%
tbl("DF") %>%
summarise(!!!rlang::parse_exprs(all_equations))
# Error: near "=": syntax error
# NOTE(review): presumably each string parses to an unnamed `=` call, so the "=" ends up
# inside the generated SQL - confirm with show_query().
# %>% show_query() shows a strange query, but I am no SQL expert as you understood.
# also tried:
# all_equations <- paste(cols ,"= sum(",cols,")")
# all_equations <- paste0("`[", cols, "]` = sum(`[", cols,"]`)")
这里的部分挑战在于:dbplyr 会把 dplyr 命令翻译成 SQL,但只为部分 R 函数定义了翻译。由于标准的 dplyr 命令都有对应的翻译,我们可以使用 summarise。
要进行汇总,可以这样做:
library(dplyr)
library(rlang)
# Build one sum() expression per value column and splice them into summarise().
# The first column is the row identifier, so it is left out of the sums.
cols <- colnames(DF)[-1]
# Backticks keep non-syntactic column names (e.g. names containing spaces) parseable.
all_equations <- paste0("sum(`", cols, "`)")
# Name each parsed expression after its column instead of embedding "col = ..."
# in the text: a spliced "col = sum(col)" string becomes an unnamed `=` call,
# which ends up as a stray "=" in the SQL generated for a remote table.
sum_exprs <- setNames(parse_exprs(all_equations), cols)
DF %>%
  summarise(!!!sum_exprs)
想法是构建每个和的文本字符串,然后使用!!!parse_exprs(.)
将此文本转换为 R 代码。
编辑 - 相同的方法,但行总和
# Example table; all four columns are then renamed so that every name contains
# a space (the first column still holds the row identifiers).
DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)
colnames(DF) <- paste("col", 1:4)

# Simulate the remote SQL table with an in-memory SQLite database.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
tbl(con, "DF") # just checking

# Text of the row-sum expression: every column but the first, wrapped in
# backticks and joined with " + ".
cols <- colnames(DF)[-1]
eq <- paste(paste0("`", cols, "`"), collapse = " + ")

# Parse the text into a single R expression and use it as the new column.
tbl(con, "DF") %>%
  mutate(new = !!parse_expr(eq))
不过这仍然依赖 dbplyr 的翻译,因此可能无法正确处理反引号。
我发现一种可行的变通办法是直接写出实际的 SQL 查询,例如使用 DBI 包。但我仍然希望找到一种使用 dbplyr 的更优雅的方式。
# Workaround: write the SQL statement by hand and run it through DBI.
DF <- tibble(ID = LETTERS[1:3], col1 = c(1, 1, 1), col2 = c(1, 1, 0), col3 = c(1, 0, 0))
# Spaces in the column names make identifier quoting mandatory.
colnames(DF) <- c("col 1", "col 2", "col 3", "col 4")

# SQL simulation
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
con %>% tbl("DF") # just checking

cols <- colnames(DF)[-1] # columns to add up (everything but the first)
col2select <- colnames(DF) # columns to return alongside the sum

# dbQuoteIdentifier() quotes names the way the connected backend expects,
# rather than hard-coding one bracket style.
query <- paste0(
  "SELECT ",
  paste0(DBI::dbQuoteIdentifier(con, col2select), collapse = ", "),
  ", ",
  # Alternative kept from the original, casting each column before adding
  # (NOTE(review): presumably needed where bit columns cannot be summed - confirm):
  # paste0("CAST(", DBI::dbQuoteIdentifier(con, cols), " AS INT)", collapse = " + "),
  paste0(DBI::dbQuoteIdentifier(con, cols), collapse = " + "),
  " AS sum FROM DF"
)

# dbGetQuery() sends the statement, fetches the rows and clears the result set
# itself, so no result handle is left open if fetching fails.
DBI::dbGetQuery(con, query)
DBI::dbDisconnect(con)
使用 tidyr 可以写出可读性很好的 dplyr 代码,但它在大表上的性能如何尚不清楚。
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
# Example data: one identifier column plus three 0/1 columns.
DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)

# SQL simulation: from here on DF refers to the table copied into SQLite.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DF <- copy_to(con, DF, overwrite = TRUE)

# Stack every non-ID column into name/value pairs, then add up the values per ID.
result <- DF %>%
  pivot_longer(cols = -ID) %>%
  group_by(ID) %>%
  summarise(sum = sum(value, na.rm = TRUE))

result
#> # Source: lazy query [?? x 2]
#> # Database: sqlite 3.35.5 [:memory:]
#> ID sum
#> <chr> <dbl>
#> 1 A 3
#> 2 B 2
#> 3 C 1

# Inspect the SQL generated for the pipeline above.
show_query(result)
#> <SQL>
#> SELECT `ID`, SUM(`value`) AS `sum`
#> FROM (SELECT `ID`, 'col1' AS `name`, `col1` AS `value`
#> FROM `DF`
#> UNION ALL
#> SELECT `ID`, 'col2' AS `name`, `col2` AS `value`
#> FROM `DF`
#> UNION ALL
#> SELECT `ID`, 'col3' AS `name`, `col3` AS `value`
#> FROM `DF`)
#> GROUP BY `ID`
由 reprex package (v2.0.0)
于 2021 年 6 月 18 日创建
library(dplyr, warn.conflicts = FALSE)
library(DBI)
# Benchmark data: n rows with a repeating letter ID and 100 random 0/1 columns
# named col1 ... col100.
n <- 26e3
df <- tibble(ID = rep(LETTERS, times = n / 26))
for (i in seq_len(100)) {
  df[[paste0("col", i)]] <- rbinom(n = n, size = 1, prob = 0.5)
}

# SQL simulation: copy the table into an in-memory SQLite database.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
df_sql <- copy_to(con, df, overwrite = TRUE)
#' Add a per-row sum of the value columns to a database table.
#'
#' Builds one SELECT statement that returns every column of `df` plus a `sum`
#' column holding the sum of all columns not listed in `id_cols`, and wraps
#' that statement in a lazy table.
#'
#' @param df Table whose column names are read with `colnames()`.
#' @param con DBI connection used to quote identifiers and to run the query.
#' @param table Name of the table on `con` to select from. Defaults to `"df"`,
#'   the name that was previously hard-coded.
#' @param id_cols Names of the columns excluded from the sum. Defaults to `"ID"`.
#' @return The result of `tbl(con, sql(query))`.
row_sum_1 <- function(df, con, table = "df", id_cols = "ID") {
  all_cols <- colnames(df)
  sum_cols <- setdiff(all_cols, id_cols)
  # Without value columns the statement would read "SELECT ...,  AS sum",
  # which is not valid SQL; fail early with a readable message instead.
  if (length(sum_cols) == 0L) {
    stop("`df` has no columns left to sum after excluding `id_cols`.",
         call. = FALSE)
  }
  select_sql <- paste(DBI::dbQuoteIdentifier(con, all_cols), collapse = ", ")
  sum_sql <- paste(DBI::dbQuoteIdentifier(con, sum_cols), collapse = " + ")
  query <- paste0(
    "SELECT ", select_sql, ", ", sum_sql, " AS sum FROM ",
    DBI::dbQuoteIdentifier(con, table)
  )
  tbl(con, sql(query))
}
# Preview the result, keeping only the identifier and the computed row sum.
row_sum_1(df_sql, con) %>% select(ID, sum)
#> # Source: lazy query [?? x 2]
#> # Database: sqlite 3.35.5 [:memory:]
#> ID sum
#> <chr> <int>
#> 1 A 49
#> 2 B 53
#> 3 C 54
#> 4 D 49
#> 5 E 51
#> 6 F 46
#> 7 G 55
#> 8 H 48
#> 9 I 44
#> 10 J 50
#> # … with more rows
# Time compute() applied to the row-sum query.
system.time(compute(row_sum_1(df_sql, con)))
#> user system elapsed
#> 0.307 0.007 0.315
由 reprex package (v2.0.0)
于 2021-06-21 创建