BigQuery:根据具体情况对数组中的值进行 UNNEST 展开
bigquery unnest values in array depending on case
我有一张结构比较特殊的数据表,想把它转换成便于可视化的形式(见图)。我知道 CROSS JOIN UNNEST 相当于笛卡尔积,但在这种情况下,我需要按照 * 后面的数字来重复对应的值(例如 5*2 应展开为 5、5)。
下面是一个带有 TEMP function 的 SQL 示例,它演示了如何展平数组
-- Expands run-length style entries of a string array.
-- An element without '*' is copied through unchanged; an element of the form
-- 'value*count' contributes `value` repeated `count` times.
-- Returns the expanded elements in input order.
CREATE TEMP FUNCTION flatten(input ARRAY<STRING>)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
const expanded = [];
for (const entry of input) {
  if (!entry.includes('*')) {
    // Plain element: keep it as-is.
    expanded.push(entry);
    continue;
  }
  // Only the first two '*'-separated parts are used: the value, then the repeat count.
  const parts = entry.split('*');
  const repeated = parts[0];
  // Number() mirrors the numeric coercion of comparing a number with the raw string;
  // a non-numeric count becomes NaN, so nothing is emitted for that entry.
  const times = Number(parts[1]);
  for (let k = 0; k < times; k++) {
    expanded.push(repeated);
  }
}
return expanded;
""";
-- Demo call: run flatten() over one sample array and expose the result as `product`.
WITH sample_input AS (
  SELECT ['5*2', '8', '6'] AS encoded_items
)
SELECT
  flatten(sample_input.encoded_items) AS product
FROM sample_input;
该调用期望得到的输出是:["5", "5", "8", "6"]
这是一个使用 SQL UDF 的解决方案。这应该更快,因为它避免了使用 JavaScript v8 沙箱的延迟:
-- Expands a bracketed, comma-separated list such as '[5*2,8,6]' into an array
-- of strings. An element written as 'value*count' yields `value` repeated
-- `count` times; an element without '*' yields itself once.
-- The output keeps the left-to-right order of the input list.
CREATE TEMP FUNCTION ExpandList(input STRING) AS (
  ARRAY(
    -- Keep only the value before the *
    SELECT SPLIT(elem, '*')[OFFSET(0)]
    -- One row per comma-separated element inside the square brackets;
    -- WITH OFFSET records each element's position in the list
    FROM UNNEST(REGEXP_EXTRACT_ALL(input, r'[^\[\],]+')) AS elem WITH OFFSET AS elem_pos,
      -- Repeated by the value after the *, or once if there is no *
      UNNEST(GENERATE_ARRAY(1, IFNULL(CAST(SPLIT(elem, '*')[SAFE_OFFSET(1)] AS INT64), 1)))
    -- UNNEST does not guarantee row order on its own, so sort explicitly;
    -- rows sharing an elem_pos carry the same value, so no tie-breaker is needed
    ORDER BY elem_pos
  )
);
-- Demo: three sample rows with encoded lists, expanded to one output row per
-- repeated value via ExpandList().
WITH encoded_lists AS (
  SELECT 1 AS id, '[5*2,8,6]' AS `values`
  UNION ALL SELECT 2, '[5*2,0*3]'
  UNION ALL SELECT 3, '[1,2,5,6]'
)
SELECT
  src.id,
  value
FROM encoded_lists AS src
-- Correlated cross join: each row is paired with the elements of its own expanded list.
CROSS JOIN UNNEST(ExpandList(src.`values`)) AS value;
下面是 BigQuery Standard SQL 的另一个简单选项(基于 REPEAT 函数的使用)
#standardSQL
-- Expands each bracketed list in `values` into one row per repeated value.
-- Approach: strip the brackets, split on commas, then build ',v,v,...' with
-- REPEAT and split it again to get one row per repetition.
SELECT
  t.id,
  value
FROM `project.dataset.table` AS t
-- One row per comma-separated item of the list (brackets removed first).
CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(t.`values`, r'\[|]', ''))) AS item
-- Repeat ',<value>' by the count after '*', or once if there is no '*'.
CROSS JOIN UNNEST(SPLIT(REPEAT(
  CONCAT(',', SPLIT(item, '*')[OFFSET(0)]),
  COALESCE(CAST(SPLIT(item, '*')[SAFE_OFFSET(1)] AS INT64), 1)
))) AS value
-- The leading comma produces one empty string per item; drop it.
WHERE value <> ''
您可以使用您问题中的样本数据来测试和使用上面的示例,如下例所示
#standardSQL
-- Self-contained demo of the REPEAT-based expansion using inline sample data.
-- Each bracketed list in `values` becomes one row per repeated value.
WITH `project.dataset.table` AS (
  SELECT 1 AS id, '[5*2,8,6]' AS `values` UNION ALL
  SELECT 2, '[5*2,0*3]' UNION ALL
  SELECT 3, '[1*1,2,5,6]'
)
SELECT
  t.id,
  value
FROM `project.dataset.table` AS t
-- One row per comma-separated item (brackets removed first); item_pos keeps
-- the item's position in the list so the output can be ordered.
CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(t.`values`, r'\[|]', ''))) AS item WITH OFFSET AS item_pos
-- Repeat ',<value>' by the count after '*', or once if there is no '*'.
CROSS JOIN UNNEST(SPLIT(REPEAT(
  CONCAT(',', SPLIT(item, '*')[OFFSET(0)]),
  IFNULL(CAST(SPLIT(item, '*')[SAFE_OFFSET(1)] AS INT64), 1)
))) AS value
-- The leading comma produces one empty string per item; drop it.
WHERE value != ''
-- Without an ORDER BY the row order is not guaranteed. Sort by id and list
-- position; rows from the same item share one value, so no tie-breaker is needed.
ORDER BY t.id, item_pos
结果
Row id value
1 1 5
2 1 5
3 1 8
4 1 6
5 2 5
6 2 5
7 2 0
8 2 0
9 2 0
10 3 1
11 3 2
12 3 5
13 3 6
我有一张结构比较特殊的数据表,想把它转换成便于可视化的形式(见图)。我知道 CROSS JOIN UNNEST 相当于笛卡尔积,但在这种情况下,我需要按照 * 后面的数字来重复对应的值(例如 5*2 应展开为 5、5)。
下面是一个带有 TEMP function 的 SQL 示例,它演示了如何展平数组:
-- Expands run-length style entries of a string array.
-- An element without '*' is copied through unchanged; an element of the form
-- 'value*count' contributes `value` repeated `count` times.
-- Returns the expanded elements in input order.
CREATE TEMP FUNCTION flatten(input ARRAY<STRING>)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
const expanded = [];
for (const entry of input) {
  if (!entry.includes('*')) {
    // Plain element: keep it as-is.
    expanded.push(entry);
    continue;
  }
  // Only the first two '*'-separated parts are used: the value, then the repeat count.
  const parts = entry.split('*');
  const repeated = parts[0];
  // Number() mirrors the numeric coercion of comparing a number with the raw string;
  // a non-numeric count becomes NaN, so nothing is emitted for that entry.
  const times = Number(parts[1]);
  for (let k = 0; k < times; k++) {
    expanded.push(repeated);
  }
}
return expanded;
""";
-- Demo call: run flatten() over one sample array and expose the result as `product`.
WITH sample_input AS (
  SELECT ['5*2', '8', '6'] AS encoded_items
)
SELECT
  flatten(sample_input.encoded_items) AS product
FROM sample_input;
该调用期望得到的输出是:["5", "5", "8", "6"]
这是一个使用 SQL UDF 的解决方案。这应该更快,因为它避免了使用 JavaScript v8 沙箱的延迟:
-- Expands a bracketed, comma-separated list such as '[5*2,8,6]' into an array
-- of strings. An element written as 'value*count' yields `value` repeated
-- `count` times; an element without '*' yields itself once.
-- The output keeps the left-to-right order of the input list.
CREATE TEMP FUNCTION ExpandList(input STRING) AS (
  ARRAY(
    -- Keep only the value before the *
    SELECT SPLIT(elem, '*')[OFFSET(0)]
    -- One row per comma-separated element inside the square brackets;
    -- WITH OFFSET records each element's position in the list
    FROM UNNEST(REGEXP_EXTRACT_ALL(input, r'[^\[\],]+')) AS elem WITH OFFSET AS elem_pos,
      -- Repeated by the value after the *, or once if there is no *
      UNNEST(GENERATE_ARRAY(1, IFNULL(CAST(SPLIT(elem, '*')[SAFE_OFFSET(1)] AS INT64), 1)))
    -- UNNEST does not guarantee row order on its own, so sort explicitly;
    -- rows sharing an elem_pos carry the same value, so no tie-breaker is needed
    ORDER BY elem_pos
  )
);
-- Demo: three sample rows with encoded lists, expanded to one output row per
-- repeated value via ExpandList().
WITH encoded_lists AS (
  SELECT 1 AS id, '[5*2,8,6]' AS `values`
  UNION ALL SELECT 2, '[5*2,0*3]'
  UNION ALL SELECT 3, '[1,2,5,6]'
)
SELECT
  src.id,
  value
FROM encoded_lists AS src
-- Correlated cross join: each row is paired with the elements of its own expanded list.
CROSS JOIN UNNEST(ExpandList(src.`values`)) AS value;
下面是 BigQuery Standard SQL 的另一个简单选项(基于 REPEAT 函数的使用)
#standardSQL
-- Expands each bracketed list in `values` into one row per repeated value.
-- Approach: strip the brackets, split on commas, then build ',v,v,...' with
-- REPEAT and split it again to get one row per repetition.
SELECT
  t.id,
  value
FROM `project.dataset.table` AS t
-- One row per comma-separated item of the list (brackets removed first).
CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(t.`values`, r'\[|]', ''))) AS item
-- Repeat ',<value>' by the count after '*', or once if there is no '*'.
CROSS JOIN UNNEST(SPLIT(REPEAT(
  CONCAT(',', SPLIT(item, '*')[OFFSET(0)]),
  COALESCE(CAST(SPLIT(item, '*')[SAFE_OFFSET(1)] AS INT64), 1)
))) AS value
-- The leading comma produces one empty string per item; drop it.
WHERE value <> ''
您可以使用您问题中的样本数据来测试和使用上面的示例,如下例所示
#standardSQL
-- Self-contained demo of the REPEAT-based expansion using inline sample data.
-- Each bracketed list in `values` becomes one row per repeated value.
WITH `project.dataset.table` AS (
  SELECT 1 AS id, '[5*2,8,6]' AS `values` UNION ALL
  SELECT 2, '[5*2,0*3]' UNION ALL
  SELECT 3, '[1*1,2,5,6]'
)
SELECT
  t.id,
  value
FROM `project.dataset.table` AS t
-- One row per comma-separated item (brackets removed first); item_pos keeps
-- the item's position in the list so the output can be ordered.
CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(t.`values`, r'\[|]', ''))) AS item WITH OFFSET AS item_pos
-- Repeat ',<value>' by the count after '*', or once if there is no '*'.
CROSS JOIN UNNEST(SPLIT(REPEAT(
  CONCAT(',', SPLIT(item, '*')[OFFSET(0)]),
  IFNULL(CAST(SPLIT(item, '*')[SAFE_OFFSET(1)] AS INT64), 1)
))) AS value
-- The leading comma produces one empty string per item; drop it.
WHERE value != ''
-- Without an ORDER BY the row order is not guaranteed. Sort by id and list
-- position; rows from the same item share one value, so no tie-breaker is needed.
ORDER BY t.id, item_pos
结果
Row id value
1 1 5
2 1 5
3 1 8
4 1 6
5 2 5
6 2 5
7 2 0
8 2 0
9 2 0
10 3 1
11 3 2
12 3 5
13 3 6