使用 ClickHouse 折叠重叠时间间隔

Collapsing overlapping time intervals using ClickHouse

我阅读了类似的问题,可以通过使用 window 函数使其工作,但是,由于 ClickHouse 似乎不支持它们,我正在寻找替代解决方案。

给定像 (1, 5), (2, 3), (3, 8), (10, 15) 这样的时间间隔,我想 "merge" 将重叠间隔变成单个间隔。在我的示例中,它将是:

(1, 8) 和 (10, 15)。

感谢任何指点!谢谢!

如果此函数使用任意 lambda 运算,arrayReduce 可以轻松解决此任务。虽然这不存在,但请尝试通过可用的方式解决问题。

SELECT
    intervals,

    arraySort(x -> x, intervals) sortedIntervals,

    /* try to merge each interval with precede ones */
    arrayMap((x, index) -> index != 1
        ? (arrayReduce(
            'min', 
            arrayMap(
              i -> sortedIntervals[i + 1].1, 
              /* get indexes of intervals that can be merged with the current one (index is zero-based) */              
              arrayFilter(
                i -> x.1 <= sortedIntervals[i + 1].2 AND x.2 >= sortedIntervals[i + 1].1, 
                range(index)))),
          arrayReduce(
            'max', 
            arrayMap(
              i -> sortedIntervals[i + 1].2,  
              /* get indexes of intervals that can be merged with the current one (index is zero-based) */              
              arrayFilter(
                i -> x.1 <= sortedIntervals[i + 1].2 AND x.2 >= sortedIntervals[i + 1].1, 
                range(index)))))
        : x,
      sortedIntervals, 
      arrayEnumerate(sortedIntervals)) rawResult,

    /* filter out intervals nested to other ones */
    arrayFilter(
      (x, index) -> index == length(rawResult) OR x.1 != rawResult[index + 1].1,
      rawResult, 
      arrayEnumerate(rawResult)) result
FROM
(
    SELECT [(1, 5), (2, 3), (3, 8), (10, 15)] intervals
    UNION ALL
    SELECT [(2, 4), (1, 3), (3, 6), (12, 14), (7, 7), (13, 16), (9, 9), (8, 9), (10, 15)]
    UNION ALL
    SELECT [(20, 22), (18, 18), (16, 21), (1, 8), (2, 9), (3, 5), (10, 12), (11, 13), (14, 15)]
    UNION ALL
    SELECT []
    UNION ALL 
    SELECT [(1, 11)]
)
FORMAT Vertical;

/*
Row 1:
──────
intervals:       [(2,4),(1,3),(3,6),(12,14),(7,7),(13,16),(9,9),(8,9),(10,15)]
sortedIntervals: [(1,3),(2,4),(3,6),(7,7),(8,9),(9,9),(10,15),(12,14),(13,16)]
rawResult:       [(1,3),(1,4),(1,6),(7,7),(8,9),(8,9),(10,15),(10,15),(10,16)]
result:          [(1,6),(7,7),(8,9),(10,16)]

Row 2:
──────
intervals:       [(1,5),(2,3),(3,8),(10,15)]
sortedIntervals: [(1,5),(2,3),(3,8),(10,15)]
rawResult:       [(1,5),(1,5),(1,8),(10,15)]
result:          [(1,8),(10,15)]

Row 3:
──────
intervals:       [(20,22),(18,18),(16,21),(1,8),(2,9),(3,5),(10,12),(11,13),(14,15)]
sortedIntervals: [(1,8),(2,9),(3,5),(10,12),(11,13),(14,15),(16,21),(18,18),(20,22)]
rawResult:       [(1,8),(1,9),(1,9),(10,12),(10,13),(14,15),(16,21),(16,21),(16,22)]
result:          [(1,9),(10,13),(14,15),(16,22)]

Row 4:
──────
intervals:       []
sortedIntervals: []
rawResult:       []
result:          []

Row 5:
──────
intervals:       [(1,11)]
sortedIntervals: [(1,11)]
rawResult:       [(1,11)]
result:          [(1,11)]
*/
SELECT arraySplit((x, y) -> y, sortedDisArray, arrayMap(x -> if(x <= 1, 0, 1), arrayDifference(sortedDisArray))) AS z,
   arrayMap(el -> (arrayReduce('min', el), arrayReduce('max', el)), z) as result
FROM
(
  SELECT [(1, 5), (2, 3), (3, 8), (10, 15)] AS a,
        arrayMap(x -> x.2 -x.1 ,a) as durations,
        arrayMap((i, y) ->
            arrayMap(n -> n + a[y].1 ,
                range(toUInt64(durations[y]+1))) , durations, arrayEnumerate(a)) as disArray,
        arraySort(arrayDistinct(flatten(disArray))) as sortedDisArray
 )

我遇到了同样的问题,不幸的是,这里的两个答案都不适合我。 有错误。只需尝试以下示例:

 [(1, 5), (2, 3), (3, 8), (6, 9), (11, 15)]

它会return你

[(1, 8), (3, 9), (11, 15)] 

是正确的,但它需要将所有区间转换为点列表。在我的例子中,我有一个日期时间间隔,所以我必须创建非常大的数组,其中包含间隔中每一秒的值。会很重。

这就是我试图找到解决方案的原因,我希望我做到了。

SELECT [ (1,5), (3, 7), (5, 10), (6,7), (7,8), (9, 12), (15, 20), (16, 22) ]  AS sortedIntervals,

   -- collect a new array with finishes of all intervals
    arrayMap(i -> i.2, sortedIntervals) as finishes,
    -- create an array with cumulative finishes (merge small internal intervals and increase finish)
    arrayMap( (i, y) -> arrayReduce('max', arraySlice(finishes, 1, y)), finishes, arrayEnumerate(finishes)) AS cumulative_finish,
    
    -- get sequence number of intervals, that will generate a new intervals (which will not be merged)
    arrayFilter ( i -> i > 0 ,
        arrayMap( (i, y) -> 
            -- there is no check for element existing, because array[0] will return 0
            -- if start of current interval is bigger then cumulative finish of previous one - it's a new interval
            ( i.1 > cumulative_finish[y-1] ) 
            ? y
            : 0, sortedIntervals, arrayEnumerate(sortedIntervals))
        ) AS new_interval_indexes,
    
    -- get result's intervals. 
    -- the first start - it's our initial start, finish - it's a value from cumulative finish of previous for new start interval
    arrayMap( (i, n ) -> 
        -- there is no checking on existing element, because array[<unexisting_element>] return 0, array[-1] return last element, that we want
        ( sortedIntervals[i].1, cumulative_finish[new_interval_indexes[n+1]-1] ) ,
        new_interval_indexes, arrayEnumerate(new_interval_indexes)
        ) as result

会return

 [(1, 12), (15, 22)]

再测试一个方案:

select
    id,
    data.1      as period_group,
    min(data.2) as start,
    max(data.3) as end,
    end - start as diff
from
    (   select
            id,
            groupArray((start, end)) as arr,

            arraySort(
                y -> y.1,
                arrayFilter(
                    x ->
                    not(arrayExists(
                            a ->
                                a.1 <= x.1
                                and a.2 > x.2
                                or  a.1 < x.1
                                    and a.2 >= x.2,
                            arr)),
                    arr)) as arr_clear,

            arrayMap(
                x, y -> (x, y),
                arr_clear,
                arrayPushFront(
                    arrayPopBack(arr_clear),
                    arr_clear[1])) as arr_w_prev,

            arrayMap(
                x, y -> (y, x.1, x.2),
                arr_clear,
                arrayCumSum(
                    x -> x.1.1 > x.2.2,
                    arr_w_prev)) as arr_groups
        from
            values (
                'id String, start UInt8, end UInt8',

                ('a', 1, 10),
                ('a', 2, 8),
                ('a', 7, 15),
                ('a', 20, 25),

                ('b', 116, 130),
                ('b', 111, 114),
                ('b', 107, 109),
                ('b', 103, 105),
                ('b', 100, 120),

                ('c', 50, 60),
                ('c', 50, 70),

                ('d', 1, 5),
                ('d', 2, 3),
                ('d', 3, 8),
                ('d', 6, 9),
                ('d', 11, 15),

                ('e', 1, 5),
                ('e', 3, 7),
                ('e', 5, 10),
                ('e', 6, 7),
                ('e', 7, 8),
                ('e', 9, 12),
                ('e', 15, 20),
                ('e', 16, 22),

                ('f', 1, 8),
                ('f', 2, 9),
                ('f', 3, 5),
                ('f', 10, 12),
                ('f', 11, 13),
                ('f', 14, 15),
                ('f', 16, 21),
                ('f', 18, 18),
                ('f', 20, 21)
            )
        group by
            id) as t
    array join arr_groups as data
group by
    id,
    period_group
order by
    id,
    period_group;