使用胶水构造查询但使其不是字符串
Construct a query using glue but make it not a string
我在 rstudio 中使用 r 脚本工作,该脚本使用胶水读取 sql 脚本。我的 r 脚本:
library(tidyverse)
library(dbplyr)
library(DBI)
library(glue)
con <- dbConnect(odbc(), "Athena")
day_from <- 7
day_to <- 30
game_name <- 'fungame'
query <- read_lines("example.sql") %>% glue_collapse(sep = "\n") %>% glue_sql(.con = con)
setosa <- dbGetQuery(con, query)
setosa %>% glimpse()
我的 sql 脚本 query.sql:
/*a comment describing the query*/
with
/*
Athena installs data. After deducting the training period, get the preceding 90 days of installs for training.
*/
installs as (
select s,
install_dt,
split(game_name, '_')[2] as platform,
case when country = 'United States' then 1 else 0 end as usa
from device_metrics.game_install
where year || '-' || month || '-' || day >= date_format(date_add('day', -({day_to} + 91), current_date), '%Y-%m-%d')
and year || '-' || month || '-' || day <= date_format(date_add('day', -({day_to} + 1), current_date), '%Y-%m-%d')
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
),
/*
Get marketing data from adx.
Full quarter of training data with at least one full day_m day cycle i.e. last full quarter + day_m days
*/
adx_min as (
select
adx_id,
publisher_name,
row_number() over(partition by adx_id order by time_stamp asc) rn -- some dups, get first instance of an install
from blah_ui_dev.adxdata_match_v2
where lower(game_name) = {game_name}
and concat(yy,'-',mm,'-',dd) >= date_format(date_add('day', -({day_to} + 91), current_date), '%Y-%m-%d')
and concat(yy,'-',mm,'-',dd) <= date_format(date_add('day', -({day_to} + 1), current_date), '%Y-%m-%d')
),
/*
Dedupped installs based on earliest timestamp
*/
adx as (
select
adx_id,
publisher_name
from adx_min
where rn = 1
),
/*
installs and marketing dta where exists
use min/max to dedup, some cases with a single s assoociated with multiple platforms
*/
installs_base as (
select
i.s,
i.usa,
min(i.install_dt) as install_dt,
min(i.platform) as platform,
min(a.publisher_name) as publisher_name
from installs i
left join adx a on upper(if(i.s like 'IDFV%', substr(i.s,6), i.s)) = a.adx_id
group by i.s, i.usa
),
/*
day n sessions count
*/
sessions_day_from as (
select i.s,
count(1) as sessions_day_from,
sum(session_length) / 1000 as sum_session_time_day_from
from installs_base i
join device_metrics.user_game_session sess on sess.s = i.s
where regexp_like(lower(sess.game_name), '^(?!.*QA).*' || {game_name} || '.*')
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(sess.activity_date, '%Y-%m-%d')) <= {day_from}
group by i.s
),
/*
day n utility
*/
utility_day_from as (
select
u.s,
sum(u.utility) as utility_day_from
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = {game_name}
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= {day_from}
group by u.s
),
/*
recent utility. Are users continuing to play the game more recently or did they drop off.
*/
utility_recent as (
select
u.s,
sum(u.utility) as recent_utility_sum
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = {game_name}
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= floor({day_from} / 2)
group by u.s
),
/*
day n spend
*/
spend_day_from as (
select
i.s,
sum(dr.amt) as spend_day_from
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= {day_from}
group by 1
),
/*
day m target spend (target)
*/
spend_day_to as (
select
i.s,
sum(dr.amt) as spend_day_to
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= {day_to}
group by 1
)
select
i.s,
i.install_dt,
i.platform,
i.usa,
i.publisher_name,
coalesce(sn.sessions_day_from, 0) as {glue('sessions_day_{day_from}')},
coalesce(sn.sum_session_time_day_from, 0) as {glue('sum_session_time_day_{day_from}')},
coalesce(un.utility_day_from, 0) as {glue('utility_day_{day_from}')},
coalesce(spn.spend_day_from, 0) as {glue('spend_day_{day_from}')},
round(coalesce(ru.recent_utility_sum / un.utility_day_from, 0), 2) as recent_utility_ratio,
coalesce(spm.spend_day_to, 0) as {glue('spend_day_{day_to}')}
from installs_base i
left join sessions_day_from sn on sn.s = i.s
left join utility_day_from un on un.s = i.s
left join spend_day_from spn on spn.s = i.s
left join utility_recent ru on ru.s = i.s
left join spend_day_to spm on spm.s = i.s
我可以通过输入变量名查询来查看 sql 在控制台中的样子:
query
<SQL> -- !preview conn=con
/*DDA day day_from to day day_to training data query*/
with
/*
Athena installs data. After deducting the training period, get the preceeding 90 days of installs for training.
*/
installs as (
select s,
install_dt,
split(game_name, '_')[2] as platform,
case when country = 'United States' then 1 else 0 end as usa
from device_metrics.game_install
where year || '-' || month || '-' || day >= date_format(date_add('day', -(30 + 91), current_date), '%Y-%m-%d')
and year || '-' || month || '-' || day <= date_format(date_add('day', -(30 + 1), current_date), '%Y-%m-%d')
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
),
/*
Get marketing data from adx.
Full quarter of training data with at least one full day_m day cycle i.e. last full quarter + day_m days
*/
adx_min as (
select
adx_id,
publisher_name,
row_number() over(partition by adx_id order by time_stamp asc) rn -- some dups, get first instance of an install
from glu_ui_dev.adxdata_match_v2
where lower(game_name) = 'fungame'
and concat(yy,'-',mm,'-',dd) >= date_format(date_add('day', -(30 + 91), current_date), '%Y-%m-%d')
and concat(yy,'-',mm,'-',dd) <= date_format(date_add('day', -(30 + 1), current_date), '%Y-%m-%d')
),
/*
Dedupped installs based on earliest timestamp
*/
adx as (
select
adx_id,
publisher_name
from adx_min
where rn = 1
),
/*
installs and marketing dta where exists
use min/max to dedup, some cases with a single s assoociated with multiple platforms
*/
installs_base as (
select
i.s,
i.usa,
min(i.install_dt) as install_dt,
min(i.platform) as platform,
min(a.publisher_name) as publisher_name
from installs i
left join adx a on upper(if(i.s like 'IDFV%', substr(i.s,6), i.s)) = a.adx_id
group by i.s, i.usa
),
/*
day n sessions count
*/
sessions_day_from as (
select i.s,
count(1) as sessions_day_from,
sum(session_length) / 1000 as sum_session_time_day_from
from installs_base i
join device_metrics.user_game_session sess on sess.s = i.s
where regexp_like(lower(sess.game_name), '^(?!.*QA).*' || 'fungame' || '.*')
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(sess.activity_date, '%Y-%m-%d')) <= 7
group by i.s
),
/*
day n utility
*/
utility_day_from as (
select
u.s,
sum(u.utility) as utility_day_from
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = 'fungame'
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= 7
group by u.s
),
/*
recent utility. Are users continuing to play the game more recently or did they drop off.
*/
utility_recent as (
select
u.s,
sum(u.utility) as recent_utility_sum
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = 'fungame'
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= floor(7 / 2)
group by u.s
),
/*
day n spend
*/
spend_day_from as (
select
i.s,
sum(dr.amt) as spend_day_from
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= 7
group by 1
),
/*
day m target spend (target)
*/
spend_day_to as (
select
i.s,
sum(dr.amt) as spend_day_to
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= 30
group by 1
)
select
i.s,
i.install_dt,
i.platform,
i.usa,
i.publisher_name,
coalesce(sn.sessions_day_from, 0) as 'sessions_day_7',
coalesce(sn.sum_session_time_day_from, 0) as 'sum_session_time_day_7',
coalesce(un.utility_day_from, 0) as 'utility_day_7',
coalesce(spn.spend_day_from, 0) as 'spend_day_7',
round(coalesce(ru.recent_utility_sum / un.utility_day_from, 0), 2) as recent_utility_ratio,
coalesce(spm.spend_day_to, 0) as 'spend_day_30'
from installs_base i
left join sessions_day_from sn on sn.s = i.s
left join utility_day_from un on un.s = i.s
left join spend_day_from spn on spn.s = i.s
left join utility_recent ru on ru.s = i.s
left join spend_day_to spm on spm.s = i.s
我发布的问题具体是在最底部的 select 字段名称部分。在 'as' 之后,我的字段名称是字符串引用。
将'as'之后的部分粘贴returns为字符串。在这种情况下,这似乎不会引起问题,但是,在我的实际脚本中,我将 odbc 与 Athena 一起使用,并且它不喜欢查询的形式 select field_name as 'string'
而不是常规的select field_name as string
// 字符串没有被引用为字符串
有没有办法让胶水在 'as' 之后用字符串引号组成这部分?所需查询:
<SQL>
select
Species as Species-setosa
from iris
where species = 'setosa'
一个选项是通过将特定位置的引号与正则表达式环视相匹配来替换 '
。在这里,我们可以使用str_replace_all
library(magrittr)
library(stringr)
query %>%
str_replace_all("(?<=as )'|(?<![%])'(?=(\nfrom|,))", "") %>%
glue_sql(.con = con)
#<SQL> select
# Species as Species-setosa
#from iris
#where species = 'setosa'
我们匹配单词 as
之后的 '
和 space 或 (|
) '
之前的逗号 (,
) 或下一行,来自 (\nfrom
)
我在 rstudio 中使用 r 脚本工作,该脚本使用胶水读取 sql 脚本。我的 r 脚本:
library(tidyverse)
library(dbplyr)
library(DBI)
library(glue)
con <- dbConnect(odbc(), "Athena")
day_from <- 7
day_to <- 30
game_name <- 'fungame'
query <- read_lines("example.sql") %>% glue_collapse(sep = "\n") %>% glue_sql(.con = con)
setosa <- dbGetQuery(con, query)
setosa %>% glimpse()
我的 sql 脚本 query.sql:
/*a comment describing the query*/
with
/*
Athena installs data. After deducting the training period, get the preceding 90 days of installs for training.
*/
installs as (
select s,
install_dt,
split(game_name, '_')[2] as platform,
case when country = 'United States' then 1 else 0 end as usa
from device_metrics.game_install
where year || '-' || month || '-' || day >= date_format(date_add('day', -({day_to} + 91), current_date), '%Y-%m-%d')
and year || '-' || month || '-' || day <= date_format(date_add('day', -({day_to} + 1), current_date), '%Y-%m-%d')
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
),
/*
Get marketing data from adx.
Full quarter of training data with at least one full day_m day cycle i.e. last full quarter + day_m days
*/
adx_min as (
select
adx_id,
publisher_name,
row_number() over(partition by adx_id order by time_stamp asc) rn -- some dups, get first instance of an install
from blah_ui_dev.adxdata_match_v2
where lower(game_name) = {game_name}
and concat(yy,'-',mm,'-',dd) >= date_format(date_add('day', -({day_to} + 91), current_date), '%Y-%m-%d')
and concat(yy,'-',mm,'-',dd) <= date_format(date_add('day', -({day_to} + 1), current_date), '%Y-%m-%d')
),
/*
Dedupped installs based on earliest timestamp
*/
adx as (
select
adx_id,
publisher_name
from adx_min
where rn = 1
),
/*
installs and marketing dta where exists
use min/max to dedup, some cases with a single s assoociated with multiple platforms
*/
installs_base as (
select
i.s,
i.usa,
min(i.install_dt) as install_dt,
min(i.platform) as platform,
min(a.publisher_name) as publisher_name
from installs i
left join adx a on upper(if(i.s like 'IDFV%', substr(i.s,6), i.s)) = a.adx_id
group by i.s, i.usa
),
/*
day n sessions count
*/
sessions_day_from as (
select i.s,
count(1) as sessions_day_from,
sum(session_length) / 1000 as sum_session_time_day_from
from installs_base i
join device_metrics.user_game_session sess on sess.s = i.s
where regexp_like(lower(sess.game_name), '^(?!.*QA).*' || {game_name} || '.*')
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(sess.activity_date, '%Y-%m-%d')) <= {day_from}
group by i.s
),
/*
day n utility
*/
utility_day_from as (
select
u.s,
sum(u.utility) as utility_day_from
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = {game_name}
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= {day_from}
group by u.s
),
/*
recent utility. Are users continuing to play the game more recently or did they drop off.
*/
utility_recent as (
select
u.s,
sum(u.utility) as recent_utility_sum
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = {game_name}
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= floor({day_from} / 2)
group by u.s
),
/*
day n spend
*/
spend_day_from as (
select
i.s,
sum(dr.amt) as spend_day_from
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= {day_from}
group by 1
),
/*
day m target spend (target)
*/
spend_day_to as (
select
i.s,
sum(dr.amt) as spend_day_to
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || {game_name} || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= {day_to}
group by 1
)
select
i.s,
i.install_dt,
i.platform,
i.usa,
i.publisher_name,
coalesce(sn.sessions_day_from, 0) as {glue('sessions_day_{day_from}')},
coalesce(sn.sum_session_time_day_from, 0) as {glue('sum_session_time_day_{day_from}')},
coalesce(un.utility_day_from, 0) as {glue('utility_day_{day_from}')},
coalesce(spn.spend_day_from, 0) as {glue('spend_day_{day_from}')},
round(coalesce(ru.recent_utility_sum / un.utility_day_from, 0), 2) as recent_utility_ratio,
coalesce(spm.spend_day_to, 0) as {glue('spend_day_{day_to}')}
from installs_base i
left join sessions_day_from sn on sn.s = i.s
left join utility_day_from un on un.s = i.s
left join spend_day_from spn on spn.s = i.s
left join utility_recent ru on ru.s = i.s
left join spend_day_to spm on spm.s = i.s
我可以通过输入变量名查询来查看 sql 在控制台中的样子:
query
<SQL> -- !preview conn=con
/*DDA day day_from to day day_to training data query*/
with
/*
Athena installs data. After deducting the training period, get the preceeding 90 days of installs for training.
*/
installs as (
select s,
install_dt,
split(game_name, '_')[2] as platform,
case when country = 'United States' then 1 else 0 end as usa
from device_metrics.game_install
where year || '-' || month || '-' || day >= date_format(date_add('day', -(30 + 91), current_date), '%Y-%m-%d')
and year || '-' || month || '-' || day <= date_format(date_add('day', -(30 + 1), current_date), '%Y-%m-%d')
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
),
/*
Get marketing data from adx.
Full quarter of training data with at least one full day_m day cycle i.e. last full quarter + day_m days
*/
adx_min as (
select
adx_id,
publisher_name,
row_number() over(partition by adx_id order by time_stamp asc) rn -- some dups, get first instance of an install
from glu_ui_dev.adxdata_match_v2
where lower(game_name) = 'fungame'
and concat(yy,'-',mm,'-',dd) >= date_format(date_add('day', -(30 + 91), current_date), '%Y-%m-%d')
and concat(yy,'-',mm,'-',dd) <= date_format(date_add('day', -(30 + 1), current_date), '%Y-%m-%d')
),
/*
Dedupped installs based on earliest timestamp
*/
adx as (
select
adx_id,
publisher_name
from adx_min
where rn = 1
),
/*
installs and marketing dta where exists
use min/max to dedup, some cases with a single s assoociated with multiple platforms
*/
installs_base as (
select
i.s,
i.usa,
min(i.install_dt) as install_dt,
min(i.platform) as platform,
min(a.publisher_name) as publisher_name
from installs i
left join adx a on upper(if(i.s like 'IDFV%', substr(i.s,6), i.s)) = a.adx_id
group by i.s, i.usa
),
/*
day n sessions count
*/
sessions_day_from as (
select i.s,
count(1) as sessions_day_from,
sum(session_length) / 1000 as sum_session_time_day_from
from installs_base i
join device_metrics.user_game_session sess on sess.s = i.s
where regexp_like(lower(sess.game_name), '^(?!.*QA).*' || 'fungame' || '.*')
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(sess.activity_date, '%Y-%m-%d')) <= 7
group by i.s
),
/*
day n utility
*/
utility_day_from as (
select
u.s,
sum(u.utility) as utility_day_from
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = 'fungame'
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= 7
group by u.s
),
/*
recent utility. Are users continuing to play the game more recently or did they drop off.
*/
utility_recent as (
select
u.s,
sum(u.utility) as recent_utility_sum
from installs_base i
join adhoc.device_sessions_daily u on u.s = i.s
where lower(u.game_base) = 'fungame'
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), u.activity_date) <= floor(7 / 2)
group by u.s
),
/*
day n spend
*/
spend_day_from as (
select
i.s,
sum(dr.amt) as spend_day_from
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= 7
group by 1
),
/*
day m target spend (target)
*/
spend_day_to as (
select
i.s,
sum(dr.amt) as spend_day_to
from device_metrics.daily_revenue dr
join installs_base i on i.s = dr.s
where coalesce(channel,'IAP') = 'IAP'
and regexp_like(lower(game_name), ('^(?!.*QA).*' || 'fungame' || '.*')) -- excludes 'QA' devices
and date_diff('day', date_parse(i.install_dt, '%Y-%m-%d'), date_parse(dr.activity_date, '%Y-%m-%d')) <= 30
group by 1
)
select
i.s,
i.install_dt,
i.platform,
i.usa,
i.publisher_name,
coalesce(sn.sessions_day_from, 0) as 'sessions_day_7',
coalesce(sn.sum_session_time_day_from, 0) as 'sum_session_time_day_7',
coalesce(un.utility_day_from, 0) as 'utility_day_7',
coalesce(spn.spend_day_from, 0) as 'spend_day_7',
round(coalesce(ru.recent_utility_sum / un.utility_day_from, 0), 2) as recent_utility_ratio,
coalesce(spm.spend_day_to, 0) as 'spend_day_30'
from installs_base i
left join sessions_day_from sn on sn.s = i.s
left join utility_day_from un on un.s = i.s
left join spend_day_from spn on spn.s = i.s
left join utility_recent ru on ru.s = i.s
left join spend_day_to spm on spm.s = i.s
我发布的问题具体是在最底部的 select 字段名称部分。在 'as' 之后,我的字段名称是字符串引用。
将'as'之后的部分粘贴returns为字符串。在这种情况下,这似乎不会引起问题,但是,在我的实际脚本中,我将 odbc 与 Athena 一起使用,并且它不喜欢查询的形式 select field_name as 'string'
而不是常规的select field_name as string
// 字符串没有被引用为字符串
有没有办法让胶水在 'as' 之后用字符串引号组成这部分?所需查询:
<SQL>
select
Species as Species-setosa
from iris
where species = 'setosa'
一个选项是通过将特定位置的引号与正则表达式环视相匹配来替换 '
。在这里,我们可以使用str_replace_all
library(magrittr)
library(stringr)
query %>%
str_replace_all("(?<=as )'|(?<![%])'(?=(\nfrom|,))", "") %>%
glue_sql(.con = con)
#<SQL> select
# Species as Species-setosa
#from iris
#where species = 'setosa'
我们匹配单词 as
之后的 '
和 space 或 (|
) '
之前的逗号 (,
) 或下一行,来自 (\nfrom
)