BigQuery:根据具体情况展开(UNNEST)数组中的值

bigquery unnest values in array depending on case

我有一个非常特殊的数据表,我想把它转换成便于可视化的形式(见图)。我知道 CROSS JOIN UNNEST 相当于笛卡尔积,但在这种情况下,我需要根据 `*` 旁边的数值对元素进行重复。

下面是一个带有 TEMP function 的 SQL 示例,它演示了如何展平数组

-- flatten: expands run-length style items in an array of strings.
--   input   ARRAY<STRING> whose items are either a plain value ('8') or
--           'value*count' ('5*2').
--   returns ARRAY<STRING> in which every 'value*count' item is replaced by
--           `count` copies of `value`; plain items are kept unchanged.
--           A NULL input array yields NULL instead of raising a TypeError.
CREATE TEMP FUNCTION flatten(input ARRAY<STRING>)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
        // Guard: a NULL argument has no .length / iterator to walk.
        if (input === null || input === undefined) {
            return null;
        }

        const expanded = [];
        for (const item of input) {
            if (item.indexOf('*') === -1) {
                // No repeat marker: keep the item as it is.
                expanded.push(item);
                continue;
            }

            // 'value*count' -> the text before the first '*' is the value,
            // the text between the first and second '*' is the repeat count.
            const [value, rawCount] = item.split('*');

            // Number() is the same conversion the `<` operator applies to a
            // string operand, so a non-numeric count (NaN) still results in
            // zero copies and an empty count still means zero.
            const count = Number(rawCount);
            for (let i = 0; i < count; i++) {
                expanded.push(value);
            }
        }
        return expanded;
""";


-- Demo: run flatten() over a single literal array that mixes one
-- 'value*count' item with two plain items.
WITH numbers AS (
  SELECT ['5*2', '8', '6'] AS value
)
SELECT
  flatten(n.value) AS product
FROM numbers AS n;

这个调用期望得到的输出是:['5', '5', '8', '6']

这是一个使用 SQL UDF 的解决方案。这应该更快,因为它避免了使用 JavaScript v8 沙箱的延迟:

-- ExpandList: turns a bracketed list literal such as '[5*2,8,6]' into an
-- ARRAY<STRING> in which every 'value*count' element is replaced by `count`
-- copies of `value`; elements without '*' appear once.
-- The result follows the order of the elements in the input text: UNNEST on
-- its own does not guarantee element order, so the element offset is carried
-- along and used in ORDER BY.
CREATE TEMP FUNCTION ExpandList(input STRING) AS (
  ARRAY(
    -- The value is the text before the '*'
    SELECT SPLIT(elem, '*')[OFFSET(0)]
    -- One row per comma-separated element inside the square brackets
    FROM UNNEST(REGEXP_EXTRACT_ALL(input, r'[^\[\],]+')) AS elem WITH OFFSET AS elem_pos
    -- Repeated by the number after the '*', or once if there is no '*';
    -- GENERATE_ARRAY(1, 0) is empty, so a count of 0 drops the element
    CROSS JOIN UNNEST(
      GENERATE_ARRAY(1, IFNULL(CAST(SPLIT(elem, '*')[SAFE_OFFSET(1)] AS INT64), 1))
    ) AS repetition
    -- All copies of one element are identical, so ordering by the element
    -- position alone fully determines the output
    ORDER BY elem_pos
  )
);

-- Demo: expand three sample list literals with ExpandList() and emit one row
-- per resulting element.
WITH Input AS (
  SELECT 1 AS id, '[5*2,8,6]' AS values UNION ALL
  SELECT 2, '[5*2,0*3]' UNION ALL
  SELECT 3, '[1,2,5,6]'
)
SELECT id, value
FROM Input
-- WITH OFFSET exposes each element's position in the array returned by
-- ExpandList(), because UNNEST alone does not guarantee row order.
CROSS JOIN UNNEST(ExpandList(values)) AS value WITH OFFSET AS value_pos
-- Deterministic output: rows grouped by id, elements in array order.
ORDER BY id, value_pos;

下面是 BigQuery Standard SQL 的另一个简单选项(基于 REPEAT 函数的使用)

#standardSQL
-- Expands each 'value*count' element of the bracketed list stored in the
-- `values` column into `count` rows, using REPEAT instead of a UDF.
SELECT
  id,
  value
FROM `project.dataset.table` AS src
-- Strip the square brackets, then split on the default ',' delimiter:
-- one row per list element.
CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(src.values, r'\[|]', ''))) AS item
-- REPEAT builds ',v,v,...' (count copies; once when there is no '*');
-- splitting that string again yields one row per copy plus a leading ''.
CROSS JOIN UNNEST(
  SPLIT(
    REPEAT(
      CONCAT(',', SPLIT(item, '*')[OFFSET(0)]),
      IFNULL(CAST(SPLIT(item, '*')[SAFE_OFFSET(1)] AS INT64), 1)
    )
  )
) AS value
-- Drop the empty string produced by the leading ',' (this is also all that
-- remains of an element whose count is 0).
WHERE value != '';

您可以使用您问题中的样本数据来测试和使用上面的示例,如下例所示

#standardSQL
-- Self-contained demo of the REPEAT-based expansion: the CTE stands in for
-- `project.dataset.table` so the query runs without a real table.
WITH `project.dataset.table` AS (
  SELECT 1 AS id, '[5*2,8,6]' AS `values` UNION ALL
  SELECT 2, '[5*2,0*3]' UNION ALL
  SELECT 3, '[1*1,2,5,6]'
)
SELECT id, value
FROM `project.dataset.table` AS t
-- Strip the square brackets and split on ','; WITH OFFSET keeps each
-- element's position because UNNEST alone does not guarantee row order.
CROSS JOIN UNNEST(SPLIT(REGEXP_REPLACE(t.values, r'\[|]', ''))) AS x WITH OFFSET AS x_pos
-- REPEAT builds ',v,v,...' (count copies; once when there is no '*');
-- splitting it again yields one row per copy plus a leading ''.
CROSS JOIN UNNEST(SPLIT(REPEAT(
  CONCAT(',', SPLIT(x, '*')[OFFSET(0)]),
  IFNULL(CAST(SPLIT(x, '*')[SAFE_OFFSET(1)] AS INT64), 1)
))) AS value
-- Drop the empty string produced by the leading ','.
WHERE value != ''
-- Deterministic output: rows grouped by id, elements in list order (all
-- copies of one element are identical, so no finer ordering is needed).
ORDER BY id, x_pos;

结果

Row id  value    
1   1   5    
2   1   5    
3   1   8    
4   1   6    
5   2   5    
6   2   5    
7   2   0    
8   2   0    
9   2   0    
10  3   1    
11  3   2    
12  3   5    
13  3   6