BigQuery:如何获取表中相关性最高的前 20 个结果?
Big Query: How to get top 20 correlations in a table?
我有 100,000 个时间序列文件,每个文件有 2 列:日期和值。我打算在 Google BigQuery 中创建一张表,并把所有时间序列追加到这张表中,追加时每个文件扩展为 3 列:time_series_name、日期、值。最终这张表将有 3 列和数百万行。给定一个 time_series_name,要找出与它相关性最高的前 20 个时间序列,应该使用什么代码?我想我需要先按 time_series_name 做 GROUP BY,然后计算该 time_series_name 与其他所有序列的相关性,再按降序对结果排序。这样对吗?用什么查询可以实现?
试试下面的查询。
它假定您的表名为 all_time_series,字段为 time_series_name、dt 和 value,并且是按照您在问题中的描述构建的。
#standardSQL
-- Returns the 20 pairs of time series with the highest Pearson correlation.
-- Reads all_time_series(time_series_name, dt, value).
-- Output columns: pair_name ('<name1> - <name2>') and correlation.
-- To rank only the partners of one given series, filter the `pairs` CTE on
-- time_series_1 / time_series_2.
WITH series AS (
  -- One row per distinct series name.
  SELECT DISTINCT time_series_name
  FROM all_time_series
),
pairs AS (
  -- Each unordered pair of distinct series exactly once: the strict `<`
  -- excludes self-pairs and the mirrored (b, a) duplicate of (a, b).
  SELECT
    series1.time_series_name AS time_series_1,
    series2.time_series_name AS time_series_2,
    CONCAT(series1.time_series_name, ' - ', series2.time_series_name) AS pair_name
  FROM series AS series1
  INNER JOIN series AS series2
    ON series1.time_series_name < series2.time_series_name
),
aligned AS (
  -- Values of both series of a pair, matched on equal dt; a dt present in
  -- only one of the two series is dropped by the inner join.
  SELECT
    p.time_series_1,
    p.time_series_2,
    p.pair_name,
    a1.value AS value1,
    a2.value AS value2
  FROM pairs AS p
  INNER JOIN all_time_series AS a1
    ON p.time_series_1 = a1.time_series_name
  INNER JOIN all_time_series AS a2
    ON p.time_series_2 = a2.time_series_name
    AND a1.dt = a2.dt
)
SELECT
  pair_name,
  CORR(value1, value2) AS correlation
FROM aligned
-- Group on the two names, not just the label: names that contain ' - ' can
-- produce the same pair_name for different pairs, which would mix their rows.
GROUP BY time_series_1, time_series_2, pair_name
-- pair_name breaks ties so equal correlations come back in a stable order.
ORDER BY correlation DESC, pair_name
LIMIT 20
您可以用下面的虚拟数据来测试上面的查询:
#standardSQL
-- Self-contained demo of the top-correlated-pairs query: all_time_series is
-- replaced by an inline CTE with four series (a, b, c, d) of three dates each.
-- Output columns: pair_name ('<name1> - <name2>') and correlation; top 2 rows.
WITH all_time_series AS (
  -- Dummy data; dt is a 'YYYY-MM-DD' string here and is only compared for equality.
  SELECT 'a' AS time_series_name, '2016-01-01' AS dt, 1 AS value UNION ALL
  SELECT 'a', '2016-01-02', 2 UNION ALL
  SELECT 'a', '2016-01-03', 3 UNION ALL
  SELECT 'b', '2016-01-01', 1 UNION ALL
  SELECT 'b', '2016-01-02', 2 UNION ALL
  SELECT 'b', '2016-01-03', 3 UNION ALL
  SELECT 'c', '2016-01-01', 5 UNION ALL
  SELECT 'c', '2016-01-02', 6 UNION ALL
  SELECT 'c', '2016-01-03', 7 UNION ALL
  SELECT 'd', '2016-01-01', 6 UNION ALL
  SELECT 'd', '2016-01-02', 2 UNION ALL
  SELECT 'd', '2016-01-03', 3
),
series AS (
  -- One row per distinct series name.
  SELECT DISTINCT time_series_name
  FROM all_time_series
),
pairs AS (
  -- Each unordered pair of distinct series exactly once: the strict `<`
  -- excludes self-pairs and the mirrored (b, a) duplicate of (a, b).
  SELECT
    series1.time_series_name AS time_series_1,
    series2.time_series_name AS time_series_2,
    CONCAT(series1.time_series_name, ' - ', series2.time_series_name) AS pair_name
  FROM series AS series1
  INNER JOIN series AS series2
    ON series1.time_series_name < series2.time_series_name
),
aligned AS (
  -- Values of both series of a pair, matched on equal dt; a dt present in
  -- only one of the two series is dropped by the inner join.
  SELECT
    p.time_series_1,
    p.time_series_2,
    p.pair_name,
    a1.value AS value1,
    a2.value AS value2
  FROM pairs AS p
  INNER JOIN all_time_series AS a1
    ON p.time_series_1 = a1.time_series_name
  INNER JOIN all_time_series AS a2
    ON p.time_series_2 = a2.time_series_name
    AND a1.dt = a2.dt
)
SELECT
  pair_name,
  CORR(value1, value2) AS correlation
FROM aligned
-- Group on the two names, not just the label: names that contain ' - ' can
-- produce the same pair_name for different pairs, which would mix their rows.
GROUP BY time_series_1, time_series_2, pair_name
-- a, b and c move in lockstep, so several pairs share the top correlation;
-- pair_name breaks such ties so LIMIT 2 returns a stable pair of rows.
ORDER BY correlation DESC, pair_name
LIMIT 2
我有 100,000 个时间序列文件,每个文件有 2 列:日期和值。我打算在 Google BigQuery 中创建一张表,并把所有时间序列追加到这张表中,追加时每个文件扩展为 3 列:time_series_name、日期、值。最终这张表将有 3 列和数百万行。给定一个 time_series_name,要找出与它相关性最高的前 20 个时间序列,应该使用什么代码?我想我需要先按 time_series_name 做 GROUP BY,然后计算该 time_series_name 与其他所有序列的相关性,再按降序对结果排序。这样对吗?用什么查询可以实现?
试试下面的查询。
它假定您的表名为 all_time_series,字段为 time_series_name、dt 和 value,并且是按照您在问题中的描述构建的。
#standardSQL
-- Returns the 20 pairs of time series with the highest Pearson correlation.
-- Reads all_time_series(time_series_name, dt, value).
-- Output columns: pair_name ('<name1> - <name2>') and correlation.
-- To rank only the partners of one given series, filter the `pairs` CTE on
-- time_series_1 / time_series_2.
WITH series AS (
  -- One row per distinct series name.
  SELECT DISTINCT time_series_name
  FROM all_time_series
),
pairs AS (
  -- Each unordered pair of distinct series exactly once: the strict `<`
  -- excludes self-pairs and the mirrored (b, a) duplicate of (a, b).
  SELECT
    series1.time_series_name AS time_series_1,
    series2.time_series_name AS time_series_2,
    CONCAT(series1.time_series_name, ' - ', series2.time_series_name) AS pair_name
  FROM series AS series1
  INNER JOIN series AS series2
    ON series1.time_series_name < series2.time_series_name
),
aligned AS (
  -- Values of both series of a pair, matched on equal dt; a dt present in
  -- only one of the two series is dropped by the inner join.
  SELECT
    p.time_series_1,
    p.time_series_2,
    p.pair_name,
    a1.value AS value1,
    a2.value AS value2
  FROM pairs AS p
  INNER JOIN all_time_series AS a1
    ON p.time_series_1 = a1.time_series_name
  INNER JOIN all_time_series AS a2
    ON p.time_series_2 = a2.time_series_name
    AND a1.dt = a2.dt
)
SELECT
  pair_name,
  CORR(value1, value2) AS correlation
FROM aligned
-- Group on the two names, not just the label: names that contain ' - ' can
-- produce the same pair_name for different pairs, which would mix their rows.
GROUP BY time_series_1, time_series_2, pair_name
-- pair_name breaks ties so equal correlations come back in a stable order.
ORDER BY correlation DESC, pair_name
LIMIT 20
您可以用下面的虚拟数据来测试上面的查询:
#standardSQL
-- Self-contained demo of the top-correlated-pairs query: all_time_series is
-- replaced by an inline CTE with four series (a, b, c, d) of three dates each.
-- Output columns: pair_name ('<name1> - <name2>') and correlation; top 2 rows.
WITH all_time_series AS (
  -- Dummy data; dt is a 'YYYY-MM-DD' string here and is only compared for equality.
  SELECT 'a' AS time_series_name, '2016-01-01' AS dt, 1 AS value UNION ALL
  SELECT 'a', '2016-01-02', 2 UNION ALL
  SELECT 'a', '2016-01-03', 3 UNION ALL
  SELECT 'b', '2016-01-01', 1 UNION ALL
  SELECT 'b', '2016-01-02', 2 UNION ALL
  SELECT 'b', '2016-01-03', 3 UNION ALL
  SELECT 'c', '2016-01-01', 5 UNION ALL
  SELECT 'c', '2016-01-02', 6 UNION ALL
  SELECT 'c', '2016-01-03', 7 UNION ALL
  SELECT 'd', '2016-01-01', 6 UNION ALL
  SELECT 'd', '2016-01-02', 2 UNION ALL
  SELECT 'd', '2016-01-03', 3
),
series AS (
  -- One row per distinct series name.
  SELECT DISTINCT time_series_name
  FROM all_time_series
),
pairs AS (
  -- Each unordered pair of distinct series exactly once: the strict `<`
  -- excludes self-pairs and the mirrored (b, a) duplicate of (a, b).
  SELECT
    series1.time_series_name AS time_series_1,
    series2.time_series_name AS time_series_2,
    CONCAT(series1.time_series_name, ' - ', series2.time_series_name) AS pair_name
  FROM series AS series1
  INNER JOIN series AS series2
    ON series1.time_series_name < series2.time_series_name
),
aligned AS (
  -- Values of both series of a pair, matched on equal dt; a dt present in
  -- only one of the two series is dropped by the inner join.
  SELECT
    p.time_series_1,
    p.time_series_2,
    p.pair_name,
    a1.value AS value1,
    a2.value AS value2
  FROM pairs AS p
  INNER JOIN all_time_series AS a1
    ON p.time_series_1 = a1.time_series_name
  INNER JOIN all_time_series AS a2
    ON p.time_series_2 = a2.time_series_name
    AND a1.dt = a2.dt
)
SELECT
  pair_name,
  CORR(value1, value2) AS correlation
FROM aligned
-- Group on the two names, not just the label: names that contain ' - ' can
-- produce the same pair_name for different pairs, which would mix their rows.
GROUP BY time_series_1, time_series_2, pair_name
-- a, b and c move in lockstep, so several pairs share the top correlation;
-- pair_name breaks such ties so LIMIT 2 returns a stable pair of rows.
ORDER BY correlation DESC, pair_name
LIMIT 2