SQL:多列填充缺失值

SQL: Fill missing values multiple columns

使用给定的数据和 sql 命令

# Work-in-progress attempt: build sample rows, pair every distinct ItemNr with
# every distinct PostingDate, then left-join those pairs onto a generated calendar.
# Quantity is only present in TableItem; none of the later steps select it,
# so the final result has no quantity column to fill.
WITH
  TableItem AS (
  # 2018-04-30: Item18-0001 .. Item18-0004
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0004' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 4 AS Quantity UNION ALL
  # 2018-05-01: no rows at all
  # 2018-05-02: Item18-0004 is missing
  # 2018-05-02: Item18-0005 is new
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0005' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 5 AS Quantity UNION ALL
  # 2018-05-03: Item18-0004 and Item18-0005 are missing
  # 2018-05-03: Item18-0006 is new
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0006' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 6 AS Quantity UNION ALL
  # 2018-05-04: only Item18-0002 and Item18-0003 are present, each row appears twice
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 3 AS Quantity UNION ALL
  # 2018-05-05: some items missing, Item18-0007 is new
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0005' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 5 AS Quantity UNION ALL
  SELECT 'Item18-0007' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 7 AS Quantity ),
  # Cross join to get all combinations of ItemNr and PostingDate.
  # Both sides are de-duplicated with GROUP BY, so each (ItemNr, PostingDate) pair appears once.
  TableItemNrPostingDate AS (
  SELECT
    ItemNr,
    PostingDate
  FROM (
    SELECT
      it1.ItemNr
    FROM
      TableItem it1
    GROUP BY
      it1.ItemNr ) t2
  CROSS JOIN (
    SELECT
      it2.PostingDate
    FROM
      TableItem it2
    GROUP BY
      it2.PostingDate ) t3 ),
  # Create a calendar table to expose missing dates: one row per day from 2018-04-30
  # through the last day of the current month (first day of next month minus one day).
  # The LEFT JOIN keeps calendar days that have no posting; ItemNr and PostingDate are NULL there.
  TableCalenderDayItemNrPostingDate AS (
  SELECT
    CalenderDay,
    TableItemNrPostingDate.ItemNr As ItemNr,
    TableItemNrPostingDate.PostingDate as PostingDate
  FROM
    UNNEST( GENERATE_DATE_ARRAY("2018-04-30", DATE_ADD(DATE_ADD(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH), INTERVAL -1 DAY), INTERVAL 1 DAY)) AS CalenderDay
  LEFT JOIN
    TableItemNrPostingDate
  ON
    CalenderDay = DATE(TableItemNrPostingDate.PostingDate)
  # NOTE(review): an ORDER BY inside a CTE presumably does not fix the order of the
  # final result; the outer ORDER BY below is what sorts the output - confirm.
  ORDER BY
    CalenderDay )
SELECT
  CalenderDay,
  # The window is partitioned by the same column it reads, so this returns the row's
  # own ItemNr (NULL on unmatched calendar days) and does not fill any gap.
  FIRST_VALUE(ItemNr) OVER (PARTITION BY ItemNr ORDER BY CalenderDay ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
  ItemNr,
  PostingDate
FROM
  TableCalenderDayItemNrPostingDate
ORDER BY
  CalenderDay, ItemNr

我想从最早的日期开始,获取每个 ItemNr 在每个日历日期的 ItemNr、PostingDate 和 Quantity。缺失值必须用该项目之前最近的值填充。

我的表只会在数量发生变化时按日期写入/更新。这意味着并非所有项目每天都有记录,新项目会在之后才进入表中,而且有些日期完全没有任何变化。

对于给定的数据,我希望得到如下结果(为便于识别,Quantity 与 ItemNr 的编号相同):

Item18-0001   2018-04-30   1
Item18-0002   2018-04-30   2
Item18-0003   2018-04-30   3
Item18-0004   2018-04-30   4
Item18-0005   2018-04-30   0 (or null or empty row)
Item18-0006   2018-04-30   0 (or null or empty row)
Item18-0007   2018-04-30   0 (or null or empty row)

Item18-0001   2018-05-01   1
Item18-0002   2018-05-01   2
Item18-0003   2018-05-01   3
Item18-0004   2018-05-01   4
Item18-0005   2018-05-01   0 (or null or empty row)
Item18-0006   2018-05-01   0 (or null or empty row)
Item18-0007   2018-05-01   0 (or null or empty row)

Item18-0001   2018-05-02   1
Item18-0002   2018-05-02   2
Item18-0003   2018-05-02   3
Item18-0004   2018-05-02   4
Item18-0005   2018-05-02   5
Item18-0006   2018-05-02   0 (or null or empty row)
Item18-0007   2018-05-02   0 (or null or empty row)

Item18-0001   2018-05-03   1
Item18-0002   2018-05-03   2
Item18-0003   2018-05-03   3
Item18-0004   2018-05-03   4
Item18-0005   2018-05-03   5
Item18-0006   2018-05-03   6
Item18-0007   2018-05-03   0 (or null or empty row)

Item18-0001   2018-05-04   1
Item18-0002   2018-05-04   2
Item18-0003   2018-05-04   3
Item18-0004   2018-05-04   4
Item18-0005   2018-05-04   5
Item18-0006   2018-05-04   6
Item18-0007   2018-05-04   0 (or null or empty row)

Item18-0001   2018-05-05   1
Item18-0002   2018-05-05   2
Item18-0003   2018-05-05   3
Item18-0004   2018-05-05   4
Item18-0005   2018-05-05   5
Item18-0006   2018-05-05   6
Item18-0007   2018-05-05   7

我的 SQL 命令尚未完成,而且结果不正确;贴出来只是为了说明我的尝试和意图。

以下适用于 BigQuery StandardSQL

#standardSQL
WITH
  -- Every item number that occurs anywhere in the source table.
  TableItemNr AS (
    SELECT DISTINCT
      ItemNr
    FROM `project.dataset.TableItem`
  ),
  -- Earliest and latest posting day in the source table, as DATE values.
  PostingDateBounds AS (
    SELECT
      DATE(MIN(PostingDate)) AS minPostingDate,
      DATE(MAX(PostingDate)) AS maxPostingDate
    FROM `project.dataset.TableItem`
  ),
  -- One row per calendar day between the two bounds, cast back to TIMESTAMP so it
  -- can be joined to the TIMESTAMP PostingDate column of the source table.
  -- CURRENT_DATE() can be used instead of maxPostingDate, depending on your needs.
  TableDates AS (
    SELECT
      CAST(calendarDay AS TIMESTAMP) AS PostingDate
    FROM PostingDateBounds AS b
    CROSS JOIN UNNEST(GENERATE_DATE_ARRAY(b.minPostingDate, b.maxPostingDate)) AS calendarDay
  )
SELECT
  i.ItemNr,
  d.PostingDate,
  t.Quantity,
  -- 1 when the (item, day) pair exists in the source table, 0 when the row was generated.
  CASE WHEN t.ItemNr IS NULL THEN 0 ELSE 1 END AS original,
  -- Carry the most recent non-NULL quantity of the item forward over the gaps.
  LAST_VALUE(t.Quantity IGNORE NULLS) OVER (
    PARTITION BY i.ItemNr
    ORDER BY d.PostingDate
  ) AS updatedQuantity
FROM TableDates AS d
CROSS JOIN TableItemNr AS i
LEFT JOIN `project.dataset.TableItem` AS t
  USING (ItemNr, PostingDate)
-- Optionally sort the output: ORDER BY PostingDate, ItemNr

您可以使用问题中的示例数据来测试/试用上面的查询:

#standardSQL
WITH
  -- Inline sample data standing in for the real table.
  -- Column names and types are taken from the first SELECT of the UNION ALL chain.
  `project.dataset.TableItem` AS (
    -- 2018-04-30: Item18-0001 .. Item18-0004
    SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 1 AS Quantity UNION ALL
    SELECT 'Item18-0002', TIMESTAMP '2018-04-30', 2 UNION ALL
    SELECT 'Item18-0003', TIMESTAMP '2018-04-30', 3 UNION ALL
    SELECT 'Item18-0004', TIMESTAMP '2018-04-30', 4 UNION ALL
    -- 2018-05-01: no rows at all
    -- 2018-05-02: Item18-0004 is missing, Item18-0005 is new
    SELECT 'Item18-0001', TIMESTAMP '2018-05-02', 1 UNION ALL
    SELECT 'Item18-0002', TIMESTAMP '2018-05-02', 1 UNION ALL
    SELECT 'Item18-0003', TIMESTAMP '2018-05-02', 1 UNION ALL
    SELECT 'Item18-0005', TIMESTAMP '2018-05-02', 5 UNION ALL
    -- 2018-05-03: Item18-0004 and Item18-0005 are missing, Item18-0006 is new
    SELECT 'Item18-0001', TIMESTAMP '2018-05-03', 1 UNION ALL
    SELECT 'Item18-0002', TIMESTAMP '2018-05-03', 2 UNION ALL
    SELECT 'Item18-0003', TIMESTAMP '2018-05-03', 3 UNION ALL
    SELECT 'Item18-0006', TIMESTAMP '2018-05-03', 6 UNION ALL
    -- 2018-05-04: only Item18-0002 and Item18-0003 are present, each row appears twice
    SELECT 'Item18-0002', TIMESTAMP '2018-05-04', 2 UNION ALL
    SELECT 'Item18-0002', TIMESTAMP '2018-05-04', 2 UNION ALL
    SELECT 'Item18-0003', TIMESTAMP '2018-05-04', 3 UNION ALL
    SELECT 'Item18-0003', TIMESTAMP '2018-05-04', 3 UNION ALL
    -- 2018-05-05: some items missing, Item18-0007 is new
    SELECT 'Item18-0001', TIMESTAMP '2018-05-05', 1 UNION ALL
    SELECT 'Item18-0003', TIMESTAMP '2018-05-05', 3 UNION ALL
    SELECT 'Item18-0005', TIMESTAMP '2018-05-05', 5 UNION ALL
    SELECT 'Item18-0007', TIMESTAMP '2018-05-05', 7
  ),
  -- Every item number that occurs anywhere in the sample data.
  TableItemNr AS (
    SELECT DISTINCT
      ItemNr
    FROM `project.dataset.TableItem`
  ),
  -- Earliest and latest posting day in the sample data, as DATE values.
  PostingDateBounds AS (
    SELECT
      DATE(MIN(PostingDate)) AS minPostingDate,
      DATE(MAX(PostingDate)) AS maxPostingDate
    FROM `project.dataset.TableItem`
  ),
  -- One row per calendar day between the two bounds, cast back to TIMESTAMP so it
  -- can be joined to the TIMESTAMP PostingDate column of the sample data.
  TableDates AS (
    SELECT
      CAST(calendarDay AS TIMESTAMP) AS PostingDate
    FROM PostingDateBounds AS b
    CROSS JOIN UNNEST(GENERATE_DATE_ARRAY(b.minPostingDate, b.maxPostingDate)) AS calendarDay
  )
SELECT
  i.ItemNr,
  d.PostingDate,
  t.Quantity,
  -- 1 when the (item, day) pair exists in the sample data, 0 when the row was generated.
  CASE WHEN t.ItemNr IS NULL THEN 0 ELSE 1 END AS original,
  -- Carry the most recent non-NULL quantity of the item forward over the gaps.
  LAST_VALUE(t.Quantity IGNORE NULLS) OVER (
    PARTITION BY i.ItemNr
    ORDER BY d.PostingDate
  ) AS updatedQuantity
FROM TableDates AS d
CROSS JOIN TableItemNr AS i
LEFT JOIN `project.dataset.TableItem` AS t
  USING (ItemNr, PostingDate)
ORDER BY
  PostingDate,
  ItemNr

结果为

Row ItemNr      PostingDate                 Quantity    original    updatedQuantity  
1   Item18-0001 2018-04-30 00:00:00.000 UTC 1       1   1    
2   Item18-0002 2018-04-30 00:00:00.000 UTC 2       1   2    
3   Item18-0003 2018-04-30 00:00:00.000 UTC 3       1   3    
4   Item18-0004 2018-04-30 00:00:00.000 UTC 4       1   4    
5   Item18-0005 2018-04-30 00:00:00.000 UTC null    0   null     
6   Item18-0006 2018-04-30 00:00:00.000 UTC null    0   null     
7   Item18-0007 2018-04-30 00:00:00.000 UTC null    0   null     
8   Item18-0001 2018-05-01 00:00:00.000 UTC null    0   1    
9   Item18-0002 2018-05-01 00:00:00.000 UTC null    0   2    
10  Item18-0003 2018-05-01 00:00:00.000 UTC null    0   3    
11  Item18-0004 2018-05-01 00:00:00.000 UTC null    0   4    
12  Item18-0005 2018-05-01 00:00:00.000 UTC null    0   null     
13  Item18-0006 2018-05-01 00:00:00.000 UTC null    0   null     
14  Item18-0007 2018-05-01 00:00:00.000 UTC null    0   null     
15  Item18-0001 2018-05-02 00:00:00.000 UTC 1       1   1    
16  Item18-0002 2018-05-02 00:00:00.000 UTC 1       1   1    
17  Item18-0003 2018-05-02 00:00:00.000 UTC 1       1   1    
18  Item18-0004 2018-05-02 00:00:00.000 UTC null    0   4    
19  Item18-0005 2018-05-02 00:00:00.000 UTC 5       1   5    
20  Item18-0006 2018-05-02 00:00:00.000 UTC null    0   null     
21  Item18-0007 2018-05-02 00:00:00.000 UTC null    0   null     
22  Item18-0001 2018-05-03 00:00:00.000 UTC 1       1   1    
23  Item18-0002 2018-05-03 00:00:00.000 UTC 2       1   2    
24  Item18-0003 2018-05-03 00:00:00.000 UTC 3       1   3    
25  Item18-0004 2018-05-03 00:00:00.000 UTC null    0   4    
26  Item18-0005 2018-05-03 00:00:00.000 UTC null    0   5    
27  Item18-0006 2018-05-03 00:00:00.000 UTC 6       1   6    
28  Item18-0007 2018-05-03 00:00:00.000 UTC null    0   null     
29  Item18-0001 2018-05-04 00:00:00.000 UTC null    0   1    
30  Item18-0002 2018-05-04 00:00:00.000 UTC 2       1   2    
31  Item18-0002 2018-05-04 00:00:00.000 UTC 2       1   2    
32  Item18-0003 2018-05-04 00:00:00.000 UTC 3       1   3    
33  Item18-0003 2018-05-04 00:00:00.000 UTC 3       1   3    
34  Item18-0004 2018-05-04 00:00:00.000 UTC null    0   4    
35  Item18-0005 2018-05-04 00:00:00.000 UTC null    0   5    
36  Item18-0006 2018-05-04 00:00:00.000 UTC null    0   6    
37  Item18-0007 2018-05-04 00:00:00.000 UTC null    0   null     
38  Item18-0001 2018-05-05 00:00:00.000 UTC 1       1   1    
39  Item18-0002 2018-05-05 00:00:00.000 UTC null    0   2    
40  Item18-0003 2018-05-05 00:00:00.000 UTC 3       1   3    
41  Item18-0004 2018-05-05 00:00:00.000 UTC null    0   4    
42  Item18-0005 2018-05-05 00:00:00.000 UTC 5       1   5    
43  Item18-0006 2018-05-05 00:00:00.000 UTC null    0   6    
44  Item18-0007 2018-05-05 00:00:00.000 UTC 7       1   7        

注意:出于某种原因,您的日期是 TIMESTAMP 数据类型,因此我需要额外做一些 CAST 类型转换。