基于在 BigQuery 中添加和删除事件行构建数组

Build array based on add and remove event rows in BigQuery

我在 BigQuery 中有一个具有以下结构的 table:

datetime | event  | value
==========================
1        | add    | 1
---------+--------+-------
2        | remove | 1
---------+--------+-------
6        | add    | 2
---------+--------+-------
8        | add    | 3
---------+--------+-------
11       | add    | 4
---------+--------+-------
23       | remove | 3
---------+--------+-------

我正在尝试构建一个视图,它向包含数组当前状态的每一行添加一个 list 列。该数组永远不会包含重复项。这应该是结果:

datetime | event  | value | list
===================================
1        | add    | 1     | [1]
---------+--------+-------+--------
2        | remove | 1     | []
---------+--------+-------+--------
6        | add    | 2     | [2]
---------+--------+-------+--------
8        | add    | 3     | [2,3]
---------+--------+-------+--------
11       | add    | 4     | [2,3,4]
---------+--------+-------+--------
23       | remove | 3     | [2,4]
---------+--------+-------+--------

我尝试使用解析函数,但没有成功。使用数组的 api 非常有限。我想如果我可以使用递归 WITH 子句我会成功,不幸的是这在 BigQuery 中是不可能的。

我正在使用启用了标准 SQL 的 BigQuery。

以下适用于 BigQuery SQL(只是可能的众多选项之一)

#standardSQL
CREATE TEMP FUNCTION CUST_ARRAY_AGG(arr ARRAY<STRUCT<event STRING, value STRING>>)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  var result = [];  
  for (i = 0; i < arr.length; i++) { 
    if (arr[i].event == 'add') {
      result.push(arr[i].value);
    } else {
      var index = result.indexOf(arr[i].value);
      if (index > -1) {
        result.splice(index, 1);
      }
    }
  }
  return result;
""";
WITH `project.dataset.events` AS (
  SELECT 1 dt, 'add' event, '1' value UNION ALL
  SELECT 2, 'remove', '1' UNION ALL
  SELECT 6, 'add', '2' UNION ALL
  SELECT 8, 'add', '3' UNION ALL
  SELECT 11, 'add', '4' UNION ALL
  SELECT 23, 'remove', '3' 
)
SELECT dt, event, value, 
  CUST_ARRAY_AGG(arr) list_as_arr,
  (SELECT CONCAT('[', IFNULL(STRING_AGG(item), ''), ']') FROM UNNEST(CUST_ARRAY_AGG(arr)) item) list_as_string
FROM (
  SELECT dt, event, value,
    ARRAY_AGG(STRUCT<event STRING, value STRING>(event, value)) OVER(ORDER BY dt) arr
  FROM `project.dataset.events`
)

结果如下

Row dt  event   value   list_as_arr list_as_string   
1   1   add     1       1           [1]  
2   2   remove  1                   []   
3   6   add     2       2           [2]  
4   8   add     3       2           [2,3]    
                        3        
5   11  add     4       2           [2,3,4]  
                        3        
                        4        
6   23  remove  3       2           [2,4]    
                        4   

以下版本适用于 BigQuery Standard SQL,仅使用纯 SQL(无 JS UDF)

#standardSQL
WITH `project.dataset.events` AS (
  SELECT 1 dt,'add' event,'1' value UNION ALL
  SELECT 2,   'remove',   '1' UNION ALL
  SELECT 6,   'add',      '2' UNION ALL
  SELECT 8,   'add',      '3' UNION ALL
  SELECT 11,  'add',      '4' UNION ALL
  SELECT 23,  'remove',   '3' 
), cum AS (
  SELECT dt, event, value,
    SUM(IF(event = 'add', 1, -1)) OVER(PARTITION BY value ORDER BY dt) state
  FROM `project.dataset.events`
), pre AS (
  SELECT 
    a.dt, a.event, a.value, a.state, b.value AS b_value,
    ARRAY_AGG(b.state ORDER BY b.dt DESC)[SAFE_OFFSET(0)] b_state, 
    MAX(b.dt) b_dt 
  FROM cum a
  JOIN cum b ON b.dt <= a.dt
  GROUP BY a.dt, a.event, a.value, a.state, b.value
)
SELECT dt, event, value, 
  SPLIT(IFNULL(STRING_AGG(IF(b_state = 1, b_value, NULL) ORDER BY b_dt), '')) list_as_array,
  CONCAT('[', IFNULL(STRING_AGG(IF(b_state = 1, b_value, NULL) ORDER BY b_dt), ''), ']') list_as_string
FROM pre
GROUP BY dt, event, value
ORDER BY dt  

结果是 "surprisingly" :o) 与我之前 answered/posted

的 JS UDF 版本完全相同
Row dt  event   value   list_as_arr list_as_string   
1   1   add     1       1           [1]  
2   2   remove  1                   []   
3   6   add     2       2           [2]  
4   8   add     3       2           [2,3]    
                        3        
5   11  add     4       2           [2,3,4]  
                        3        
                        4        
6   23  remove  3       2           [2,4]    
                        4   

注意:我认为上面的设计可能有点过度 - 但我只是没有时间对其进行潜在的润色/优化 - 应该是可行的 - 将其留给问题的所有者

虽然已经解决了我喜欢这个问题。此解决方案中的想法是首先使用窗口获取完整历史记录,同时考虑所有前面的行 - 然后删除删除项:

#standardSQL
-- stolen test table ;)
WITH test AS (
  SELECT 1 dt,'add' event,'1' value UNION ALL
  SELECT 2,   'remove',   '1' UNION ALL
  SELECT 6,   'add',      '2' UNION ALL
  SELECT 8,   'add',      '3' UNION ALL
  SELECT 11,  'add',      '4' UNION ALL
  SELECT 23,  'remove',   '3' 
)

, windowing as (
SELECT *,
  -- add history using window function
  ARRAY_AGG(STRUCT(event, value)) OVER ( ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as history
FROM test)

SELECT 
  dt,
  event,
  value,
  --history, -- for testing

  -- Get all added items that are not removed items
  -- This sub-select runs within the row only, treating history-array as (sub-)table
  (SELECT ARRAY_AGG(value) value FROM unnest(t.history) l 
    WHERE l.event = 'add' 
    AND l.value NOT IN 
      (SELECT l.value FROM unnest(t.history) l WHERE l.event = 'remove')
  ) AS list
FROM windowing t

我没有比较性能,但会感兴趣!

所有原始答案都很棒(尤其是我的 - 哈哈)并且主要基于基于集合的(处理大数据的最佳方式)处理,在这种情况下对于 sql 用户来说可能变得足够复杂!

幸运的是,对 Scripting and Stored Procedures 的支持现在处于测试阶段(截至 2019 年 10 月)

您可以提交多个用分号分隔的语句,BigQuery 现在可以 运行 它们。与基于集合的方式相比,以过程方式表达所需逻辑(显然至少以性能为代价)要容易得多 - 因此更多用户可以受益

下面的脚本实现了问题中表达的逻辑

DECLARE arr ARRAY<STRUCT<dt INT64, event STRING, value STRING>>;
DECLARE result ARRAY<STRUCT<dt INT64, event STRING, value STRING, list STRING>> DEFAULT [STRUCT(NULL, '', '', '')];
DECLARE list ARRAY<STRING> DEFAULT [];
DECLARE i, m INT64 DEFAULT -1; 

SET arr = (
  WITH t AS (
    SELECT 1 dt,'add' event,'1' value UNION ALL
    SELECT 2,   'remove',   '1' UNION ALL
    SELECT 6,   'add',      '2' UNION ALL
    SELECT 8,   'add',      '3' UNION ALL
    SELECT 11,  'add',      '4' UNION ALL
    SELECT 23,  'remove',   '3' 
  )
  SELECT ARRAY_AGG(t) FROM t
);

SET m = ARRAY_LENGTH(arr);

LOOP
  SET i = i + 1;
  IF i >= m THEN LEAVE;
  ELSE
    IF arr[OFFSET(i)].event = 'add' THEN 
      SET list = (
        SELECT ARRAY_CONCAT(list, [arr[OFFSET(i)].value])
      );    
    ELSE
      SET list = ARRAY(
        SELECT item
        FROM UNNEST(list) item 
        WHERE item != arr[OFFSET(i)].value
      );    
    END IF;

    SET result = (
      SELECT ARRAY_CONCAT(
        result, 
        [(arr[OFFSET(i)].dt, arr[OFFSET(i)].event, arr[OFFSET(i)].value, ARRAY_TO_STRING(list, ','))]
      )
    );
  END IF;
END LOOP;

SELECT * FROM UNNEST(result) WHERE NOT dt IS NULL;

结果