根据数据点的重要性过滤 BigQuery 中的数据 sql
filter data in BigQuery based on importance of data points sql
我正在 BigQuery 中合并两个表并根据几个条件过滤它们。代码如下所示:
-- Join each action/url row to its duration row on id, keeping only rows whose
-- url starts with the site prefix, whose duration equals '15000', and whose
-- action is one of the four listed progress milestones.
SELECT
    d.id,
    d.duration,
    c.action,
    c.url
FROM `table_action_url` AS c
INNER JOIN `table_duration` AS d
    ON d.id = c.id
WHERE c.url LIKE "https://www.mywebpage%"
    -- duration is compared against a string literal, as in the original query
    AND d.duration = '15000'
    AND c.action IN ('First quartile', 'Midpoint', 'Third quartile', 'Complete')
输出为:
id duration action url
1 15000 Midpoint https://www.mywebpage_fashion
1 15000 Complete https://www.mywebpage_fashion
2 15000 First quartile https://www.mywebpage_home
2 15000 Midpoint https://www.mywebpage_home
我需要添加一个逻辑,使每组只保留一个 action 值。优先级从高到低依次为 Complete、Third quartile 等。因此代码需要按 id 和 url 进行比较:对于相同的 id 和 url,如果优先级最高的值是 Complete,就取这一行。
期望的输出是:
id duration action url
1 15000 Complete https://www.mywebpage_fashion
2 15000 Midpoint https://www.mywebpage_home
您可以使用窗口函数和 CASE 表达式:
-- Keep, for each (id, url) pair, only the row with the highest-priority action.
SELECT * EXCEPT (rn)
FROM (
    SELECT
        d.id,
        d.duration,
        c.action,
        c.url,
        -- rn = 1 marks the highest-priority action within each (id, url) pair;
        -- a smaller CASE value means a higher priority.
        ROW_NUMBER() OVER (
            PARTITION BY d.id, c.url
            ORDER BY
                CASE c.action
                    WHEN 'Complete' THEN 1
                    WHEN 'Third quartile' THEN 2
                    WHEN 'Midpoint' THEN 3
                    WHEN 'First quartile' THEN 4
                END
        ) AS rn
    FROM `table_action_url` AS c
    INNER JOIN `table_duration` AS d
        ON d.id = c.id
    WHERE c.url LIKE "https://www.mywebpage%"
        AND d.duration = '15000'
        AND c.action IN ('First quartile', 'Midpoint', 'Third quartile', 'Complete')
) AS t
WHERE rn = 1
在 BigQuery 中,您可以使用聚合来做到这一点:
-- Return one row per (id, duration, url) carrying the highest-priority action.
SELECT
    d.id,
    d.duration,
    -- ord = 1 is the highest priority, so sort ascending and take the first element.
    (ARRAY_AGG(c.action ORDER BY ao.ord LIMIT 1))[ORDINAL(1)] AS action,
    c.url
FROM `table_action_url` AS c
INNER JOIN `table_duration` AS d
    ON d.id = c.id
-- Inline lookup table mapping each action to its priority rank; the inner join
-- also drops rows whose action is not one of the four listed values.
INNER JOIN (
    SELECT 'Complete' AS action, 1 AS ord UNION ALL
    SELECT 'Third quartile' AS action, 2 AS ord UNION ALL
    SELECT 'Midpoint' AS action, 3 AS ord UNION ALL
    SELECT 'First quartile' AS action, 4 AS ord
) AS ao
    ON c.action = ao.action
WHERE c.url LIKE 'https://www.mywebpage%'
    AND d.duration = '15000'
GROUP BY d.id, d.duration, c.url;
我在这里看到的最简单和通用的方法就是用下面的代码包装您现有的查询
#standardSQL
-- Wrap the existing query and, for each (id, url) group, return the single
-- whole row whose action has the highest priority.
SELECT AS VALUE
    ARRAY_AGG(
        current_query_result
        -- A smaller CASE value means a higher priority; LIMIT 1 keeps only the top row.
        ORDER BY
            CASE action
                WHEN 'Complete' THEN 1
                WHEN 'Third quartile' THEN 2
                WHEN 'Midpoint' THEN 3
                WHEN 'First quartile' THEN 4
            END
        LIMIT 1
    )[OFFSET(0)]
FROM (
    -- The existing, unchanged filtering query.
    SELECT
        d.id,
        d.duration,
        c.action,
        c.url
    FROM `table_action_url` AS c
    INNER JOIN `table_duration` AS d
        USING (id)
    WHERE c.url LIKE "https://www.mywebpage%"
        AND d.duration = '15000'
        AND c.action IN ('First quartile', 'Midpoint', 'Third quartile', 'Complete')
) AS current_query_result
GROUP BY id, url
有输出
Row id duration action url
1 1 15000 Complete https://www.mywebpage_fashion
2 2 15000 Midpoint https://www.mywebpage_home
如您所见,候选行的排序和选取是通过以下片段实现的:
-- Map each action to a priority rank; ascending order puts 'Complete' (1) first.
ORDER BY CASE action
WHEN 'Complete' THEN 1
WHEN 'Third quartile' THEN 2
WHEN 'Midpoint' THEN 3
WHEN 'First quartile' THEN 4
END
-- Keep only the first element after sorting.
LIMIT 1
还有另一种方法,可以用更简洁、更易于维护并且可能更高效(这尚未得到证实,只是我的感觉)的代码来实现相同的目的:
-- Sort by the position at which the action name occurs in the priority-ordered
-- list, so names appearing earlier in the string sort first.
-- NOTE(review): STRPOS returns 0 when the value is not found, which would sort
-- ahead of every listed action; presumably the surrounding query already
-- restricts action to these four values. Confirm before reuse.
ORDER BY STRPOS('Complete, Third quartile, Midpoint, First quartile', action)
LIMIT 1
我正在 BigQuery 中合并两个表并根据几个条件过滤它们。代码如下所示:
-- Join each action/url row to its duration row on id, keeping only rows whose
-- url starts with the site prefix, whose duration equals '15000', and whose
-- action is one of the four listed progress milestones.
SELECT
    d.id,
    d.duration,
    c.action,
    c.url
FROM `table_action_url` AS c
INNER JOIN `table_duration` AS d
    ON d.id = c.id
WHERE c.url LIKE "https://www.mywebpage%"
    -- duration is compared against a string literal, as in the original query
    AND d.duration = '15000'
    AND c.action IN ('First quartile', 'Midpoint', 'Third quartile', 'Complete')
输出为:
id duration action url
1 15000 Midpoint https://www.mywebpage_fashion
1 15000 Complete https://www.mywebpage_fashion
2 15000 First quartile https://www.mywebpage_home
2 15000 Midpoint https://www.mywebpage_home
我需要添加一个逻辑,使每组只保留一个 action 值。优先级从高到低依次为 Complete、Third quartile 等。因此代码需要按 id 和 url 进行比较:对于相同的 id 和 url,如果优先级最高的值是 Complete,就取这一行。
期望的输出是:
id duration action url
1 15000 Complete https://www.mywebpage_fashion
2 15000 Midpoint https://www.mywebpage_home
您可以使用窗口函数和 CASE 表达式:
-- Keep, for each (id, url) pair, only the row with the highest-priority action.
SELECT * EXCEPT (rn)
FROM (
    SELECT
        d.id,
        d.duration,
        c.action,
        c.url,
        -- rn = 1 marks the highest-priority action within each (id, url) pair;
        -- a smaller CASE value means a higher priority.
        ROW_NUMBER() OVER (
            PARTITION BY d.id, c.url
            ORDER BY
                CASE c.action
                    WHEN 'Complete' THEN 1
                    WHEN 'Third quartile' THEN 2
                    WHEN 'Midpoint' THEN 3
                    WHEN 'First quartile' THEN 4
                END
        ) AS rn
    FROM `table_action_url` AS c
    INNER JOIN `table_duration` AS d
        ON d.id = c.id
    WHERE c.url LIKE "https://www.mywebpage%"
        AND d.duration = '15000'
        AND c.action IN ('First quartile', 'Midpoint', 'Third quartile', 'Complete')
) AS t
WHERE rn = 1
在 BigQuery 中,您可以使用聚合来做到这一点:
-- Return one row per (id, duration, url) carrying the highest-priority action.
SELECT
    d.id,
    d.duration,
    -- ord = 1 is the highest priority, so sort ascending and take the first element.
    (ARRAY_AGG(c.action ORDER BY ao.ord LIMIT 1))[ORDINAL(1)] AS action,
    c.url
FROM `table_action_url` AS c
INNER JOIN `table_duration` AS d
    ON d.id = c.id
-- Inline lookup table mapping each action to its priority rank; the inner join
-- also drops rows whose action is not one of the four listed values.
INNER JOIN (
    SELECT 'Complete' AS action, 1 AS ord UNION ALL
    SELECT 'Third quartile' AS action, 2 AS ord UNION ALL
    SELECT 'Midpoint' AS action, 3 AS ord UNION ALL
    SELECT 'First quartile' AS action, 4 AS ord
) AS ao
    ON c.action = ao.action
WHERE c.url LIKE 'https://www.mywebpage%'
    AND d.duration = '15000'
GROUP BY d.id, d.duration, c.url;
我在这里看到的最简单和通用的方法就是用下面的代码包装您现有的查询
#standardSQL
-- Wrap the existing query and, for each (id, url) group, return the single
-- whole row whose action has the highest priority.
SELECT AS VALUE
    ARRAY_AGG(
        current_query_result
        -- A smaller CASE value means a higher priority; LIMIT 1 keeps only the top row.
        ORDER BY
            CASE action
                WHEN 'Complete' THEN 1
                WHEN 'Third quartile' THEN 2
                WHEN 'Midpoint' THEN 3
                WHEN 'First quartile' THEN 4
            END
        LIMIT 1
    )[OFFSET(0)]
FROM (
    -- The existing, unchanged filtering query.
    SELECT
        d.id,
        d.duration,
        c.action,
        c.url
    FROM `table_action_url` AS c
    INNER JOIN `table_duration` AS d
        USING (id)
    WHERE c.url LIKE "https://www.mywebpage%"
        AND d.duration = '15000'
        AND c.action IN ('First quartile', 'Midpoint', 'Third quartile', 'Complete')
) AS current_query_result
GROUP BY id, url
有输出
Row id duration action url
1 1 15000 Complete https://www.mywebpage_fashion
2 2 15000 Midpoint https://www.mywebpage_home
如您所见,候选行的排序和选取是通过以下片段实现的:
-- Map each action to a priority rank; ascending order puts 'Complete' (1) first.
ORDER BY CASE action
WHEN 'Complete' THEN 1
WHEN 'Third quartile' THEN 2
WHEN 'Midpoint' THEN 3
WHEN 'First quartile' THEN 4
END
-- Keep only the first element after sorting.
LIMIT 1
还有另一种方法,可以用更简洁、更易于维护并且可能更高效(这尚未得到证实,只是我的感觉)的代码来实现相同的目的:
-- Sort by the position at which the action name occurs in the priority-ordered
-- list, so names appearing earlier in the string sort first.
-- NOTE(review): STRPOS returns 0 when the value is not found, which would sort
-- ahead of every listed action; presumably the surrounding query already
-- restricts action to these four values. Confirm before reuse.
ORDER BY STRPOS('Complete, Third quartile, Midpoint, First quartile', action)
LIMIT 1