聚合日志 table 记录以避免冗余
Aggregate log table records to avoid redundancy
我有一个用于产品变更跟踪的 table,如下所示:
-- Product change-tracking log: each row stores a product's Name and Price
-- together with the StartDate/EndDate of the period that row covers.
CREATE TABLE ProductHistory
(
    ProductId INT          NOT NULL,
    Name      NVARCHAR(50) NOT NULL,
    Price     MONEY        NOT NULL,
    StartDate DATETIME     NOT NULL,  -- beginning of the period covered by the row
    EndDate   DATETIME     NOT NULL   -- end of the period covered by the row
);
-- Sample data: three consecutive log rows for each of three products.
-- The trailing comment on each row describes it relative to the previous row
-- of the same product ("current" marks the newest row of the product).
-- The column list is explicit so the statement does not depend on column order.
INSERT INTO ProductHistory (ProductId, Name, Price, StartDate, EndDate) VALUES
(1, 'Phone', 100, '2020-11-20 00:00', '2020-11-20 01:00'), /* initial */
(1, 'Phone', 100, '2020-11-20 01:01', '2020-11-20 02:00'), /* no change */
(1, 'Phone', 200, '2020-11-20 02:01', '2020-11-20 03:00'), /* price changed, current */
(2, 'Apple', 5, '2020-11-20 00:00', '2020-11-20 01:00'), /* initial */
(2, 'Apple', 10, '2020-11-20 01:01', '2020-11-20 02:00'), /* price changed */
(2, 'Pineapple', 10, '2020-11-20 02:01', '2020-11-20 03:00'), /* name changed, current */
(3, 'Orange juice', 100, '2020-11-21 00:00', '2020-11-21 01:00'), /* initial */
(3, 'Orange juice', 100, '2020-11-21 01:01', '2020-11-21 02:00'), /* no change */
(3, 'Orange juice', 100, '2020-11-21 02:01', '2020-11-21 03:00'); /* no change, current */
我希望提出一个查询来获得以下结果。请注意,没有实际更改的记录应该合并在一起,这样就没有冗余。
ProductId Name Price StartDate EndDate
----------- -------------- ------- -------------------------------------- --------------------------------------
1 Phone 100 2020-11-20 00:00:00.000 (first row) 2020-11-20 02:00:00.000 (second row)
1 Phone 200 2020-11-20 02:01:00.000 (third row) 2020-11-20 03:00:00.000 (third row)
2 Apple 5 2020-11-20 00:00:00.000 (first row) 2020-11-20 01:00:00.000 (first row)
2 Apple 10 2020-11-20 01:01:00.000 (second row) 2020-11-20 02:00:00.000 (second row)
2 Pineapple 10 2020-11-20 02:01:00.000 (third row) 2020-11-20 03:00:00.000 (third row)
3 Orange juice 100 2020-11-21 00:00:00.000 (first row) 2020-11-21 03:00:00.000 (third row)
我最接近的是:
-- Attempt from the question: a recursive CTE that starts at the newest row of
-- each product and repeatedly steps back to an earlier row whose Name or Price
-- differs from the row reached so far.
-- NOTE(review): each emitted row carries its own StartDate/EndDate, so runs of
-- unchanged rows are not merged into a single period (compare the output
-- listed after this query).
; WITH history AS (
-- Anchor member: the newest row (latest StartDate) of every product.
SELECT
ProductId,
Name,
Price,
StartDate,
EndDate
FROM (
SELECT
ROW_NUMBER() OVER (PARTITION BY ProductId ORDER BY StartDate DESC) 'RowNumber',
*
FROM ProductHistory
) history
WHERE history.RowNumber = 1 -- select newest row per ProductId
UNION ALL
-- Recursive member: from the rows produced by the previous step ([current]),
-- pick the most recent earlier row of the same product that differs in Name or Price.
SELECT
previous.ProductId,
previous.Name,
previous.Price,
previous.StartDate,
-- Unqualified EndDate: resolves to the derived table "previous", the only source in this FROM.
EndDate
FROM (
SELECT
-- NOTE(review): SQL Server evaluates window functions in a recursive member
-- over the rows of the current recursion step only; confirm that is the intent.
ROW_NUMBER() OVER (PARTITION BY previous.ProductId ORDER BY previous.StartDate DESC) 'RowNumber',
previous.*
FROM history [current]
INNER JOIN ProductHistory previous
ON previous.ProductId = [current].ProductId
-- only rows strictly older than the row reached so far
AND previous.StartDate < [current].StartDate
-- skip rows identical to the current one; keep those where something changed
AND (
previous.Name <> [current].Name
OR previous.Price <> [current].Price
)
) previous
WHERE previous.RowNumber = 1 -- select previous row of each ProductId, recursively
)
-- Return every row collected by the CTE, oldest first within each product.
SELECT *
FROM history
ORDER BY
ProductId,
StartDate
ProductId Name Price StartDate EndDate
----------- -------------- -------- ------------------------- -------------------------
1 Phone 100,00 2020-11-20 01:01:00.000 2020-11-20 02:00:00.000
1 Phone 200,00 2020-11-20 02:01:00.000 2020-11-20 03:00:00.000
2 Apple 5,00 2020-11-20 00:00:00.000 2020-11-20 01:00:00.000
2 Apple 10,00 2020-11-20 01:01:00.000 2020-11-20 02:00:00.000
2 Pineapple 10,00 2020-11-20 02:01:00.000 2020-11-20 03:00:00.000
3 Orange juice 100,00 2020-11-21 02:01:00.000 2020-11-21 03:00:00.000
虽然 Name 和 Price 列的值是正确的,但我不确定如何聚合 StartDate 和 EndDate 列以得到我需要的值。所有代码都放在 fiddle 中,如有需要可以参考。
这是一种间隙与孤岛(gaps-and-islands)问题。最简单的方法可能是利用两个行号之差:
-- Merge consecutive log rows of a product that share the same Name and Price
-- into one row spanning from the earliest StartDate to the latest EndDate.
--
-- Technique (difference of row numbers): seqnum numbers all rows of a product
-- by StartDate; seqnum_2 numbers the rows of each (product, name, price)
-- combination by StartDate. Within an unbroken run of identical rows both
-- counters advance together, so (seqnum - seqnum_2) is constant for the run
-- and, combined with ProductId/Name/Price, identifies it.
--
-- The inner table carries its own alias (src); the alias ph belongs to the
-- derived table and is not visible inside it.
SELECT
    ph.ProductId,
    ph.Name,
    ph.Price,
    MIN(ph.StartDate) AS StartDate,
    MAX(ph.EndDate) AS EndDate
FROM (
    SELECT
        src.ProductId,
        src.Name,
        src.Price,
        src.StartDate,
        src.EndDate,
        ROW_NUMBER() OVER (PARTITION BY src.ProductId ORDER BY src.StartDate) AS seqnum,
        ROW_NUMBER() OVER (PARTITION BY src.ProductId, src.Name, src.Price ORDER BY src.StartDate) AS seqnum_2
    FROM ProductHistory AS src
) AS ph
GROUP BY
    ph.ProductId,
    ph.Name,
    ph.Price,
    (ph.seqnum - ph.seqnum_2)
-- Deterministic output: oldest period first within each product.
ORDER BY
    ph.ProductId,
    MIN(ph.StartDate);
这假设时间范围内没有间隙——这对于此数据模型来说似乎是合理的。
为什么这样可行?解释起来有点绕。但是,如果您查看子查询的结果,就会发现:在 name 和 price 相同的相邻行上,两个行号之差保持不变。例如 ProductId 为 1 的三行,seqnum 依次为 1、2、3,seqnum_2 依次为 1、2、1,差值为 0、0、2,因此前两行被归入同一组。
我有一个用于产品变更跟踪的 table,如下所示:
-- Product change-tracking log: each row stores a product's Name and Price
-- together with the StartDate/EndDate of the period that row covers.
CREATE TABLE ProductHistory
(
    ProductId INT          NOT NULL,
    Name      NVARCHAR(50) NOT NULL,
    Price     MONEY        NOT NULL,
    StartDate DATETIME     NOT NULL,  -- beginning of the period covered by the row
    EndDate   DATETIME     NOT NULL   -- end of the period covered by the row
);
-- Sample data: three consecutive log rows for each of three products.
-- The trailing comment on each row describes it relative to the previous row
-- of the same product ("current" marks the newest row of the product).
-- The column list is explicit so the statement does not depend on column order.
INSERT INTO ProductHistory (ProductId, Name, Price, StartDate, EndDate) VALUES
(1, 'Phone', 100, '2020-11-20 00:00', '2020-11-20 01:00'), /* initial */
(1, 'Phone', 100, '2020-11-20 01:01', '2020-11-20 02:00'), /* no change */
(1, 'Phone', 200, '2020-11-20 02:01', '2020-11-20 03:00'), /* price changed, current */
(2, 'Apple', 5, '2020-11-20 00:00', '2020-11-20 01:00'), /* initial */
(2, 'Apple', 10, '2020-11-20 01:01', '2020-11-20 02:00'), /* price changed */
(2, 'Pineapple', 10, '2020-11-20 02:01', '2020-11-20 03:00'), /* name changed, current */
(3, 'Orange juice', 100, '2020-11-21 00:00', '2020-11-21 01:00'), /* initial */
(3, 'Orange juice', 100, '2020-11-21 01:01', '2020-11-21 02:00'), /* no change */
(3, 'Orange juice', 100, '2020-11-21 02:01', '2020-11-21 03:00'); /* no change, current */
我希望提出一个查询来获得以下结果。请注意,没有实际更改的记录应该合并在一起,这样就没有冗余。
ProductId Name Price StartDate EndDate
----------- -------------- ------- -------------------------------------- --------------------------------------
1 Phone 100 2020-11-20 00:00:00.000 (first row) 2020-11-20 02:00:00.000 (second row)
1 Phone 200 2020-11-20 02:01:00.000 (third row) 2020-11-20 03:00:00.000 (third row)
2 Apple 5 2020-11-20 00:00:00.000 (first row) 2020-11-20 01:00:00.000 (first row)
2 Apple 10 2020-11-20 01:01:00.000 (second row) 2020-11-20 02:00:00.000 (second row)
2 Pineapple 10 2020-11-20 02:01:00.000 (third row) 2020-11-20 03:00:00.000 (third row)
3 Orange juice 100 2020-11-21 00:00:00.000 (first row) 2020-11-21 03:00:00.000 (third row)
我最接近的是:
-- Attempt from the question: a recursive CTE that starts at the newest row of
-- each product and repeatedly steps back to an earlier row whose Name or Price
-- differs from the row reached so far.
-- NOTE(review): each emitted row carries its own StartDate/EndDate, so runs of
-- unchanged rows are not merged into a single period (compare the output
-- listed after this query).
; WITH history AS (
-- Anchor member: the newest row (latest StartDate) of every product.
SELECT
ProductId,
Name,
Price,
StartDate,
EndDate
FROM (
SELECT
ROW_NUMBER() OVER (PARTITION BY ProductId ORDER BY StartDate DESC) 'RowNumber',
*
FROM ProductHistory
) history
WHERE history.RowNumber = 1 -- select newest row per ProductId
UNION ALL
-- Recursive member: from the rows produced by the previous step ([current]),
-- pick the most recent earlier row of the same product that differs in Name or Price.
SELECT
previous.ProductId,
previous.Name,
previous.Price,
previous.StartDate,
-- Unqualified EndDate: resolves to the derived table "previous", the only source in this FROM.
EndDate
FROM (
SELECT
-- NOTE(review): SQL Server evaluates window functions in a recursive member
-- over the rows of the current recursion step only; confirm that is the intent.
ROW_NUMBER() OVER (PARTITION BY previous.ProductId ORDER BY previous.StartDate DESC) 'RowNumber',
previous.*
FROM history [current]
INNER JOIN ProductHistory previous
ON previous.ProductId = [current].ProductId
-- only rows strictly older than the row reached so far
AND previous.StartDate < [current].StartDate
-- skip rows identical to the current one; keep those where something changed
AND (
previous.Name <> [current].Name
OR previous.Price <> [current].Price
)
) previous
WHERE previous.RowNumber = 1 -- select previous row of each ProductId, recursively
)
-- Return every row collected by the CTE, oldest first within each product.
SELECT *
FROM history
ORDER BY
ProductId,
StartDate
ProductId Name Price StartDate EndDate
----------- -------------- -------- ------------------------- -------------------------
1 Phone 100,00 2020-11-20 01:01:00.000 2020-11-20 02:00:00.000
1 Phone 200,00 2020-11-20 02:01:00.000 2020-11-20 03:00:00.000
2 Apple 5,00 2020-11-20 00:00:00.000 2020-11-20 01:00:00.000
2 Apple 10,00 2020-11-20 01:01:00.000 2020-11-20 02:00:00.000
2 Pineapple 10,00 2020-11-20 02:01:00.000 2020-11-20 03:00:00.000
3 Orange juice 100,00 2020-11-21 02:01:00.000 2020-11-21 03:00:00.000
虽然 Name 和 Price 列的值是正确的,但我不确定如何聚合 StartDate 和 EndDate 列以得到我需要的值。所有代码都放在 fiddle 中,如有需要可以参考。
这是一种间隙与孤岛(gaps-and-islands)问题。最简单的方法可能是利用两个行号之差:
-- Merge consecutive log rows of a product that share the same Name and Price
-- into one row spanning from the earliest StartDate to the latest EndDate.
--
-- Technique (difference of row numbers): seqnum numbers all rows of a product
-- by StartDate; seqnum_2 numbers the rows of each (product, name, price)
-- combination by StartDate. Within an unbroken run of identical rows both
-- counters advance together, so (seqnum - seqnum_2) is constant for the run
-- and, combined with ProductId/Name/Price, identifies it.
--
-- The inner table carries its own alias (src); the alias ph belongs to the
-- derived table and is not visible inside it.
SELECT
    ph.ProductId,
    ph.Name,
    ph.Price,
    MIN(ph.StartDate) AS StartDate,
    MAX(ph.EndDate) AS EndDate
FROM (
    SELECT
        src.ProductId,
        src.Name,
        src.Price,
        src.StartDate,
        src.EndDate,
        ROW_NUMBER() OVER (PARTITION BY src.ProductId ORDER BY src.StartDate) AS seqnum,
        ROW_NUMBER() OVER (PARTITION BY src.ProductId, src.Name, src.Price ORDER BY src.StartDate) AS seqnum_2
    FROM ProductHistory AS src
) AS ph
GROUP BY
    ph.ProductId,
    ph.Name,
    ph.Price,
    (ph.seqnum - ph.seqnum_2)
-- Deterministic output: oldest period first within each product.
ORDER BY
    ph.ProductId,
    MIN(ph.StartDate);
这假设时间范围内没有间隙——这对于此数据模型来说似乎是合理的。
为什么这样可行?解释起来有点绕。但是,如果您查看子查询的结果,就会发现:在 name 和 price 相同的相邻行上,两个行号之差保持不变。例如 ProductId 为 1 的三行,seqnum 依次为 1、2、3,seqnum_2 依次为 1、2、1,差值为 0、0、2,因此前两行被归入同一组。