如果日期列在 SQL 中重叠,则合并行

Merge rows if date columns are overlapping in TSQL

我有一张表,格式如下

Id  StartDate   EndDate Type
1   2012-02-18  2012-03-18  1
1   2012-03-17  2012-06-29  1
1   2012-06-27  2012-09-27  1
1   2014-08-23  2014-09-24  3
1   2014-09-23  2014-10-24  3
1   2014-10-23  2014-11-24  3
2   2015-07-04  2015-08-06  1
2   2015-08-04  2015-09-06  1
3   2013-11-01  2013-12-01  0
3   2018-01-09  2018-02-09  0

我在这里发现了类似的问题,但它们并不能帮我解决问题。我想合并 Id 和 Type 都相同、且日期区间重叠的行。

上面这张表的期望结果应该是

Id  StartDate   EndDate Type
1   2012-02-18  2012-09-27  1
1   2014-08-23  2014-11-24  3
2   2015-07-04  2015-09-06  1
3   2013-11-01  2013-12-01  0
3   2018-01-09  2018-02-09  0

在另一台服务器上,我可以通过以下限制和以下查询来完成:


-- Collapse chained rows per Id: a row continues the previous row's group when
-- its StartDate equals the previous row's EndDate; any other row (including
-- the first row of each Id, whose previous end is NULL) opens a new group.
-- NOTE(review): the comparison is assumed to be "previous EndDate = StartDate";
-- the original text named both columns EndDate, so confirm the intent.
SELECT Id
     , MIN(StartDate) AS StartDate
     , MAX(EndDate) AS EndDate
FROM (
    SELECT a.Id
         , a.StartDate
         , a.EndDate
           -- Running count of group starts: rows of one chain share the same value.
         , SUM(CASE WHEN a.PrevEndDate = a.StartDate THEN 0
                    ELSE 1
               END
           ) OVER (ORDER BY a.Id, a.StartDate) AS sm
    FROM (
        SELECT Id
             , StartDate
             , EndDate
               -- Distinct alias: a derived table cannot expose two columns
               -- that are both named EndDate.
             , LAG(EndDate, 1, NULL) OVER (PARTITION BY Id ORDER BY Id, EndDate) AS PrevEndDate
        FROM #temptable
    ) a
) b
GROUP BY Id, sm

有什么建议吗

-- Sample data from the question, loaded into a temp table.
create table #table
(Id int, StartDate date, EndDate date, Type int)

-- Explicit column list and int literals (no implicit varchar -> int conversion).
-- The second row matches the question's data: 2012-03-17 .. 2012-06-29.
insert into #table (Id, StartDate, EndDate, Type)
values
(1,'2012-02-18','2012-03-18',1),(1,'2012-03-17','2012-06-29',1),
(1,'2012-06-27','2012-09-27',1),(1,'2014-08-23','2014-09-24',3),
(1,'2014-09-23','2014-10-24',3),(1,'2014-10-23','2014-11-24',3),
(2,'2015-07-04','2015-08-06',1),(2,'2015-08-04','2015-09-06',1),
(3,'2013-11-01','2013-12-01',0),(3,'2018-01-09','2018-02-09',0)

-- One output row per (Id, Type, start year, end year): earliest start and
-- latest end of the rows in that bucket.
-- NOTE(review): rows are bucketed by calendar year only; the date ranges
-- themselves are never compared for overlap.
select
    ID,
    MIN(startdate) as sd,
    MAX(EndDate) as ed,
    type
from #table
group by
    ID,
    TYPE,
    YEAR(startdate),
    YEAR(EndDate)

使用一些窗口函数和 CTE 可以轻松实现这一点。解决方案如下:

-- Table variable holding the question's sample rows.
DECLARE @table TABLE (
    id        INT,
    StartDate DATE,
    EndDate   DATE,
    [Type]    INT
);

INSERT INTO @table (Id, StartDate, EndDate, [Type])
VALUES
    (1, '2012-02-18', '2012-03-18', 1),
    (1, '2012-03-17', '2012-06-29', 1),
    (1, '2012-06-27', '2012-09-27', 1),
    (1, '2014-08-23', '2014-09-24', 3),
    (1, '2014-09-23', '2014-10-24', 3),
    (1, '2014-10-23', '2014-11-24', 3),
    (2, '2015-07-04', '2015-08-06', 1),
    (2, '2015-08-04', '2015-09-06', 1),
    (3, '2013-11-01', '2013-12-01', 0),
    (3, '2018-01-09', '2018-02-09', 0);

-- Pack overlapping intervals per (Id, [Type]) in three steps.
WITH PrevEnds AS
(
  -- Step 1: highest EndDate among all earlier rows of the same Id/[Type]
  -- (NULL for the first row of each partition, which has no preceding rows).
  SELECT t.*,
    MAX(t.EndDate) OVER(
          PARTITION BY t.Id, t.[Type]
          ORDER BY t.StartDate, t.EndDate
          ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS PrevEnd
  FROM @table AS t
),
Flagged AS
(
  -- Step 2: 1 marks a row that opens a new packed interval; NULL marks a row
  -- that starts on or before the running maximum end, i.e. a continuation.
  SELECT p.*,
    CASE WHEN p.StartDate <= p.PrevEnd THEN NULL ELSE 1 END AS StartFlag
  FROM PrevEnds AS p
),
Grouped AS
(
  -- Step 3: running total of the start flags numbers the packed intervals.
  SELECT f.*,
    SUM(f.StartFlag) OVER(
          PARTITION BY f.Id, f.[Type]
          ORDER BY f.StartDate, f.EndDate
          ROWS UNBOUNDED PRECEDING) AS GroupID
  FROM Flagged AS f
)
SELECT Id, [Type], MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate
FROM Grouped
GROUP BY Id, [Type], GroupID;

此方案兼容 SQL Server 2008。在我看来,CTE 确实是把所有重叠记录串联起来的最佳方式。日期重叠的判断逻辑来自这个帖子:SO Date Overlap

我添加了更复杂的额外数据以确保它按预期工作。

-- Sample rows for the recursive-CTE solution: the question's data plus extra
-- Id 4 cases. Dates use the ISO 'yyyy-mm-dd' form so they parse the same way
-- regardless of the session's DATEFORMAT / language setting.
DECLARE @Data TABLE (Id INT, StartDate DATE, EndDate DATE, Type INT);

INSERT INTO @Data (Id, StartDate, EndDate, Type)
VALUES
    (1, '2012-02-18', '2012-03-18', 1),
    (1, '2012-03-17', '2012-06-29', 1),
    (1, '2012-06-27', '2012-09-27', 1),
    (1, '2014-08-23', '2014-09-24', 3),
    (1, '2014-09-23', '2014-10-24', 3),
    (1, '2014-10-23', '2014-11-24', 3),
    (2, '2015-07-04', '2015-08-06', 1),
    (2, '2015-08-04', '2015-09-06', 1),
    (3, '2013-11-01', '2013-12-01', 0),
    (3, '2018-01-09', '2018-02-09', 0),
    -- many non overlapping dates
    (4, '2018-01-01', '2018-01-02', 0),
    (4, '2018-01-04', '2018-01-05', 0),
    (4, '2018-01-07', '2018-01-09', 0),
    (4, '2018-01-11', '2018-01-13', 0),
    -- many overlapping dates
    (4, '2018-02-07', '2018-02-08', 0),
    (4, '2018-02-08', '2018-02-09', 0),
    (4, '2018-02-09', '2018-02-10', 0),
    (4, '2018-02-10', '2018-02-11', 0),
    (4, '2018-02-11', '2018-02-12', 0),
    (4, '2018-02-12', '2018-02-13', 0),
    -- many overlapping dates, second instance of id 4, type 0
    (4, '2018-03-07', '2018-03-08', 0),
    (4, '2018-03-08', '2018-03-09', 0),
    (4, '2018-03-09', '2018-03-10', 0),
    (4, '2018-03-10', '2018-03-11', 0),
    (4, '2018-03-11', '2018-03-12', 0),
    (4, '2018-03-12', '2018-03-13', 0)



;
-- Recursive CTE: start from the first row of every overlapping run, then keep
-- absorbing later-starting rows that overlap the range accumulated so far.
WITH cdata AS (
    -- Anchor: rows with no earlier-starting row of the same Id/Type that
    -- overlaps them, i.e. the first record of each overlapping range.
    SELECT
        d.Id,
        d.Type,
        d.StartDate,
        d.EndDate,
        d.StartDate AS CurrentStart
    FROM @Data AS d
    WHERE NOT EXISTS (
        SELECT *
        FROM @Data AS x
        WHERE x.Id = d.Id
          AND x.Type = d.Type
          AND x.StartDate < d.StartDate
          AND d.StartDate <= x.EndDate
          AND d.EndDate >= x.StartDate
    )

    UNION ALL

    -- Recursive step: widen the accumulated range with an overlapping row that
    -- starts after the row absorbed last (CurrentStart keeps recursion moving
    -- forward).
    SELECT
        d.Id,
        d.Type,
        CASE WHEN d2.StartDate < d.StartDate THEN d2.StartDate ELSE d.StartDate END AS StartDate,
        CASE WHEN d2.EndDate > d.EndDate THEN d2.EndDate ELSE d.EndDate END AS EndDate,
        d2.StartDate AS CurrentStart
    FROM cdata AS d
    INNER JOIN @Data AS d2
        ON d2.Id = d.Id
       AND d2.Type = d.Type
       AND d.StartDate <= d2.EndDate
       AND d.EndDate >= d2.StartDate
       AND d2.StartDate > d.CurrentStart
)
-- Every accumulated range keeps its anchor's StartDate; the widest EndDate
-- reached for that start is the merged interval.
SELECT
    cdata.Id,
    cdata.Type,
    cdata.StartDate,
    MAX(cdata.EndDate) AS EndDate
FROM cdata
GROUP BY cdata.Id, cdata.Type, cdata.StartDate

此方法使用一个额外的临时表来标识重叠日期所属的分组,然后按分组做一次快速聚合。

-- Seed: give every row a unique UID and start it in a group of its own.
-- UID is computed once and copied into GroupId so the two always agree.
SELECT S.*, S.UID AS GroupId
INTO #G
FROM (
    SELECT *, ROW_NUMBER() OVER (ORDER BY Id, Type) AS UID
    FROM #TempTable
) AS S;

-- Propagate: each pass moves a row to the smallest GroupId found among the
-- rows of the same Id/Type whose range overlaps it. The loop stops when a pass
-- changes nothing (@@ROWCOUNT = 0), i.e. every overlapping pair shares a group.
-- MIN() yields exactly one candidate per UID, so the UPDATE never has several
-- matching source rows to pick from.
WHILE @@ROWCOUNT <> 0
BEGIN
    UPDATE T1
    SET GroupId = M.MinGroupId
    FROM #G AS T1
    INNER JOIN (
        SELECT A.UID, MIN(B.GroupId) AS MinGroupId
        FROM #G AS A
        INNER JOIN #G AS B
            ON A.Id = B.Id
           AND A.Type = B.Type
           AND B.GroupId < A.GroupId
           AND A.StartDate <= B.EndDate
           AND B.StartDate <= A.EndDate
        GROUP BY A.UID
    ) AS M
        ON M.UID = T1.UID;
END;

-- Collapse each group to its overall date range.
SELECT Id, MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate, Type
FROM #G AS G
GROUP BY GroupId, Id, Type;

这会返回期望的结果

Id          StartDate  EndDate    Type
----------- ---------- ---------- -----------
1           2012-02-18 2012-09-27 1
1           2014-08-23 2014-11-24 3
2           2015-07-04 2015-09-06 1
3           2013-11-01 2013-12-01 0
3           2018-01-09 2018-02-09 0

这看起来像是一个区间合并(Packing Intervals)问题。请参阅 Itzik Ben-Gan 的文章,了解所有细节以及他为提高效率而推荐的索引。他给出了一个不使用递归 CTE 的解决方案。

两点说明。

  1. 下面的查询假定区间是左闭右开的 [Start; End),即 StartDate 包含在区间内,而 EndDate 不包含在区间内。这通常是表示此类数据最方便的方式(道理与编程语言中数组下标从 0 而不是从 1 开始通常更方便相同)。

  2. 我添加了一个 RowID 列以进行明确排序。

示例数据

-- Sample data for the packing-intervals query: the question's rows plus a
-- RowID identity column used as a tie-breaker when ordering events.
DECLARE @T TABLE
(
    RowID int IDENTITY,
    id int,
    StartDate date,
    EndDate date,
    tp int
);

-- tp holds the question's Type value; RowID is generated by IDENTITY.
INSERT INTO @T(Id, StartDate, EndDate, tp) VALUES
(1,  '2012-02-18',  '2012-03-18',  1),
(1,  '2012-03-17',  '2012-06-29',  1),
(1,  '2012-06-27',  '2012-09-27',  1),
(1,  '2014-08-23',  '2014-09-24',  3),
(1,  '2014-09-23',  '2014-10-24',  3),
(1,  '2014-10-23',  '2014-11-24',  3),
(2,  '2015-07-04',  '2015-08-06',  1),
(2,  '2015-08-04',  '2015-09-06',  1),
(3,  '2013-11-01',  '2013-12-01',  0),
(3,  '2018-01-09',  '2018-02-09',  0);

-- Convert every interval to half-open form [Start; End): shift EndDate one day
-- forward so it becomes exclusive. Intentionally applied to all rows (no WHERE).
UPDATE @T
SET EndDate = DATEADD(day, 1, EndDate)
;

推荐索引

-- Indexes to support the packing query: one keyed on start events and one on
-- end events, each led by (id, tp) and made unique by the trailing RowID.
-- NOTE(review): these statements reference a table named T, whereas the sample
-- data in this write-up is loaded into the table variable @T; confirm the
-- intended target table before running them.
CREATE UNIQUE INDEX idx_start_id ON T(id, tp, StartDate, RowID);
CREATE UNIQUE INDEX idx_end_id ON T(id, tp, EndDate, RowID);

查询

请阅读 Itzik 的文章以了解其原理,其中有很好的图示。简而言之,每个时间戳(开始或结束)都被视为一个事件,每个事件的类型为 + 或 -。每遇到一个 + 事件(某个区间开始),就把累计计数器加 1;每遇到一个 - 事件(某个区间结束),就把累计计数器减 1。当累计计数器为 0 时,表示一段相互重叠的区间到此结束。

我原样采用了 Itzik 的查询,只是更改了列名以匹配您的列名。

-- Packing-intervals query: turns every row of @T into a start event and an
-- end event, numbers the events, and keeps only those at which a packed
-- (merged) interval begins or ends.
WITH C1 AS
-- let e = end ordinals, let s = start ordinals
-- One start event (+1) and one end event (-1) per row; s numbers the starts and
-- e numbers the ends within each (id, tp), with RowID breaking ties.
(
    SELECT
        RowID, id, tp, StartDate AS ts, +1 AS EventType,
        NULL AS e,
        ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY StartDate, RowID) AS s
    FROM @T

    UNION ALL

    SELECT
        RowID, id, tp, EndDate AS ts, -1 AS EventType,
        ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY EndDate, RowID) AS e,
        NULL AS s
    FROM @T
),
C2 AS
-- let se = start or end ordinal, namely, how many events (start or end) happened so far
-- At equal timestamps EventType DESC orders start events (+1) before end
-- events (-1).
(
    SELECT C1.*,
    ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY ts, EventType DESC, RowID) AS se
    FROM C1
),
C3 AS
-- For start events, the expression s - (se - s) - 1 represents how many sessions were active
-- just before the current (hence - 1)
--
-- For end events, the expression (se - e) - e represents how many sessions are active
-- right after this one
--
-- The above two expressions are 0 exactly when a group of packed intervals
-- either starts or ends, respectively
--
-- After filtering only events when a group of packed intervals either starts or ends,
-- group each pair of adjacent start/end events
(
    SELECT id, tp, ts,
        -- Integer division pairs the surviving events: rows 1-2 -> group 1,
        -- rows 3-4 -> group 2, and so on.
        ((ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY ts) - 1) / 2 + 1)
        AS grpnum
    FROM C2
    -- s is NULL on end events and e is NULL on start events, so COALESCE picks
    -- whichever expression applies to the current event.
    WHERE COALESCE(s - (se - s) - 1, (se - e) - e) = 0
)
-- MIN(ts) is the group's start event and MAX(ts) its end event; one day is
-- subtracted from the end to undo the one-day shift applied when the sample
-- EndDate values were made exclusive.
SELECT id, tp, MIN(ts) AS StartDate, DATEADD(day, -1, MAX(ts)) AS EndDate
FROM C3
GROUP BY id, tp, grpnum
ORDER BY id, tp, StartDate;

结果

+----+----+------------+------------+
| id | tp | StartDate  |  EndDate   |
+----+----+------------+------------+
|  1 |  1 | 2012-02-18 | 2012-09-27 |
|  1 |  3 | 2014-08-23 | 2014-11-24 |
|  2 |  1 | 2015-07-04 | 2015-09-06 |
|  3 |  0 | 2013-11-01 | 2013-12-01 |
|  3 |  0 | 2018-01-09 | 2018-02-09 |
+----+----+------------+------------+