SQL 中分组时间序列的最小值和最大值
Min and max of grouped time sequences in SQL
我有一个大型 Postgres 表 test,想从中为每个 mobile_id 提取连续的 no_signal 状态序列;换句话说,就是求出单个移动设备每次停止服务的时间长度。
在真实的表中,记录并未按顺序存放,我认为这意味着除了 window 函数之外,还必须加上 PARTITION OVER (time, mobile_id) 语句。如果有人能建议如何为每个连续序列创建一个分组,然后取每个分组的最小值和最大值,我将不胜感激。
-- Sample rows for table test. The table definition stays commented out, as in the original:
-- CREATE TABLE test (mobile_id int, state varchar, time timestamp, region varchar)
INSERT INTO test
    (mobile_id, state, time, region)
VALUES
    -- mobile_id 1 (region EU)
    (1, 'active',    TIMESTAMP '2018-08-09 15:00:00', 'EU'),
    (1, 'active',    TIMESTAMP '2018-08-09 16:00:00', 'EU'),
    (1, 'no_signal', TIMESTAMP '2018-08-09 17:00:00', 'EU'),
    (1, 'no_signal', TIMESTAMP '2018-08-09 18:00:00', 'EU'),
    (1, 'no_signal', TIMESTAMP '2018-08-09 19:00:00', 'EU'),
    (1, 'active',    TIMESTAMP '2018-08-09 20:00:00', 'EU'),
    (1, 'inactive',  TIMESTAMP '2018-08-09 21:00:00', 'EU'),
    (1, 'active',    TIMESTAMP '2018-08-09 22:00:00', 'EU'),
    (1, 'active',    TIMESTAMP '2018-08-09 23:00:00', 'EU'),
    -- mobile_id 2 (region EU)
    (2, 'active',    TIMESTAMP '2018-08-10 00:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 01:00:00', 'EU'),
    (2, 'active',    TIMESTAMP '2018-08-10 02:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 03:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 04:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 05:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 06:00:00', 'EU'),
    -- mobile_id 3 (region SA)
    (3, 'active',    TIMESTAMP '2018-08-10 07:00:00', 'SA'),
    (3, 'active',    TIMESTAMP '2018-08-10 08:00:00', 'SA'),
    (3, 'no_signal', TIMESTAMP '2018-08-10 09:00:00', 'SA'),
    (3, 'no_signal', TIMESTAMP '2018-08-10 10:00:00', 'SA'),
    (3, 'inactive',  TIMESTAMP '2018-08-10 11:00:00', 'SA'),
    (3, 'inactive',  TIMESTAMP '2018-08-10 12:00:00', 'SA'),
    (3, 'no_signal', TIMESTAMP '2018-08-10 13:00:00', 'SA')
我的目标输出是这样的:
mobile_id start_time end_time diff_time region
1 2018-08-09 17:00:00 2018-08-09 19:00:00 120 EU
2 2018-08-10 01:00:00 2018-08-10 01:00:00 0 EU
2 2018-08-10 03:00:00 2018-08-10 06:00:00 180 EU
3 2018-08-10 09:00:00 2018-08-10 10:00:00 60 SA
3 2018-08-10 13:00:00 2018-08-10 13:00:00 0 SA
由于未正确创建组,以下代码未产生预期结果:
-- The asker's attempt, kept as the example that does NOT produce the desired output.
-- Only the errors that stopped it from running at all are repaired:
--   * the string literal 'no_signal' was missing its closing quote;
--   * the inner query read from "t", while the table in this document is "test".
-- The grouping logic is deliberately left as posted. It still fails because:
--   * grp is a running count of no_signal rows, so it increases on EVERY no_signal row;
--     each no_signal row therefore lands in its own group (together with any
--     non-no_signal rows that follow it) instead of one group per consecutive run;
--   * max(time) - min(time) is never negative, so least(..., 0) always yields 0.
select mobile_id, region,
       least(extract(epoch from max(time) - min(time)), 0) as diff
from (select t.*,
             count(*) filter (where state = 'no_signal') over (partition by mobile_id, region order by time) as grp
      from test t
     ) t
group by mobile_id, region, grp;
-- Returns one row per uninterrupted run ("island") of no_signal rows of a device,
-- with the first and last timestamp of that run.
--
-- Corrections relative to the previous text of this query:
--   * The windows at B were written "PARTITION BY ranked, time [DESC]". DESC is not
--     allowed in PARTITION BY, and partitioning by time leaves one row per partition,
--     so start_time and end_time could never span a run. The ordering now lives in
--     ORDER BY, as the accompanying explanation describes.
--   * Change detection (lag) and the running sum are partitioned by mobile_id, so rows
--     of different devices that interleave in time can neither split nor merge a run.
SELECT DISTINCT
    mobile_id,
    first_value(time) OVER (PARTITION BY mobile_id, ranked ORDER BY time) AS start_time,     -- B
    first_value(time) OVER (PARTITION BY mobile_id, ranked ORDER BY time DESC) AS end_time,
    region
FROM
(
    SELECT
        flagged.mobile_id,
        flagged.state,
        flagged.time,
        flagged.region,
        -- Running count of state changes per device: constant within a run,
        -- so it serves as the run identifier.
        SUM(flagged.is_diff) OVER (PARTITION BY flagged.mobile_id ORDER BY flagged.time) AS ranked   -- A
    FROM
    (
        SELECT
            mobile_id,
            state,
            time,
            region,
            -- 1 when the state differs from the device's previous row. For the first row
            -- of a device lag() is NULL, the comparison is not true, and ELSE yields 1.
            CASE
                WHEN state = lag(state) OVER (PARTITION BY mobile_id ORDER BY time) THEN 0
                ELSE 1
            END AS is_diff
        FROM test
    ) flagged
) runs
WHERE
    state = 'no_signal'
ORDER BY
    mobile_id,
    start_time;
A:问题在于您想按一列排序,却又想按另一列分区。这个问题可以通过这个子查询来解决(该问题在别处已有讨论,原文此处为链接)。我仍在寻找更好的方案,但此子查询是有效的:它会生成一个新列,之后可以用作您所需 window 的分区列。
B:创建 window 后,可以使用 first_value(time) 和 first_value(time) ... ORDER BY time DESC 轻松计算出 start_time 和 end_time。之所以使用 DESC,是因为它让 window 按时间从晚到早排序,这样取第一个值就得到了最晚的时间。
为了更清楚地了解真正的问题,我省略了上面的 diff
计算:要添加 diff
你只需要做一个子查询:
-- Template: wraps the island query and appends the run length in minutes.
-- EXTRACT(epoch ...) gives the interval in seconds; dividing by 60 converts to minutes.
-- Paste the query from above in place of the placeholder (without its trailing semicolon).
WITH islands AS (
    -- <QUERY ABOVE>
)
SELECT
    islands.*,
    EXTRACT(epoch FROM (end_time - start_time)) / 60 AS diff
FROM islands
这是间隙和孤岛(gaps and islands)问题的一个变体。在这里,您要检测的是每个 mobile_id 下状态为 no_signal 的各个记录岛。
这个答案使用“行号差值法”(difference in row number method)。这个技巧是以两种方式对表应用 ROW_NUMBER:第一种为所有记录按时间排序生成序列;第二种只针对状态为 no_signal 的记录,在每个 mobile_id 组内生成序列。同一个岛内,这两个行号的差值保持不变,因此可以用它来标识每个岛。之后只需聚合,取每个岛的最小和最大时间戳,即可得到您想要的结果。
-- Gaps-and-islands via the difference of two row numbers, computed in one pass.
--
--   rn1: position of the row among ALL rows of its device, ordered by time.
--   rn2: position of the row among the rows of its device that share its state.
--
-- Inside an uninterrupted run of one state both counters advance together, so
-- rn1 - rn2 is constant for the run and changes once another state intervenes.
--
-- Corrections relative to the previous text of this query:
--   * rn1 was numbered across the whole table (no PARTITION BY). Rows of other devices
--     falling between two rows of the same device then changed rn1 - rn2 and split the
--     run. rn1 is now numbered per mobile_id.
--   * The self-join on (mobile_id, time) is gone; both row numbers come from one scan,
--     which also avoids row duplication if a device has two rows with the same time.
--   * time is already a timestamp, so the ::timestamp casts were dropped.
WITH numbered AS (
    SELECT
        mobile_id,
        state,
        time,
        region,
        ROW_NUMBER() OVER (PARTITION BY mobile_id ORDER BY time) AS rn1,
        ROW_NUMBER() OVER (PARTITION BY mobile_id, state ORDER BY time) AS rn2
    FROM test
)
SELECT
    mobile_id,
    MIN(time) AS start_time,
    MAX(time) AS end_time,
    -- epoch of the interval is in seconds; divide by 60 for minutes
    EXTRACT(epoch FROM MAX(time) - MIN(time)) / 60 AS diff_time,
    region
FROM numbered
WHERE state = 'no_signal'
GROUP BY
    mobile_id,
    region,
    (rn1 - rn2)
ORDER BY
    mobile_id,
    start_time;
我有一个大型 Postgres 表 test,想从中为每个 mobile_id 提取连续的 no_signal 状态序列;换句话说,就是求出单个移动设备每次停止服务的时间长度。
在真实的表中,记录并未按顺序存放,我认为这意味着除了 window 函数之外,还必须加上 PARTITION OVER (time, mobile_id) 语句。如果有人能建议如何为每个连续序列创建一个分组,然后取每个分组的最小值和最大值,我将不胜感激。
-- Sample rows for table test. The table definition stays commented out, as in the original:
-- CREATE TABLE test (mobile_id int, state varchar, time timestamp, region varchar)
INSERT INTO test
    (mobile_id, state, time, region)
VALUES
    -- mobile_id 1 (region EU)
    (1, 'active',    TIMESTAMP '2018-08-09 15:00:00', 'EU'),
    (1, 'active',    TIMESTAMP '2018-08-09 16:00:00', 'EU'),
    (1, 'no_signal', TIMESTAMP '2018-08-09 17:00:00', 'EU'),
    (1, 'no_signal', TIMESTAMP '2018-08-09 18:00:00', 'EU'),
    (1, 'no_signal', TIMESTAMP '2018-08-09 19:00:00', 'EU'),
    (1, 'active',    TIMESTAMP '2018-08-09 20:00:00', 'EU'),
    (1, 'inactive',  TIMESTAMP '2018-08-09 21:00:00', 'EU'),
    (1, 'active',    TIMESTAMP '2018-08-09 22:00:00', 'EU'),
    (1, 'active',    TIMESTAMP '2018-08-09 23:00:00', 'EU'),
    -- mobile_id 2 (region EU)
    (2, 'active',    TIMESTAMP '2018-08-10 00:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 01:00:00', 'EU'),
    (2, 'active',    TIMESTAMP '2018-08-10 02:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 03:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 04:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 05:00:00', 'EU'),
    (2, 'no_signal', TIMESTAMP '2018-08-10 06:00:00', 'EU'),
    -- mobile_id 3 (region SA)
    (3, 'active',    TIMESTAMP '2018-08-10 07:00:00', 'SA'),
    (3, 'active',    TIMESTAMP '2018-08-10 08:00:00', 'SA'),
    (3, 'no_signal', TIMESTAMP '2018-08-10 09:00:00', 'SA'),
    (3, 'no_signal', TIMESTAMP '2018-08-10 10:00:00', 'SA'),
    (3, 'inactive',  TIMESTAMP '2018-08-10 11:00:00', 'SA'),
    (3, 'inactive',  TIMESTAMP '2018-08-10 12:00:00', 'SA'),
    (3, 'no_signal', TIMESTAMP '2018-08-10 13:00:00', 'SA')
我的目标输出是这样的:
mobile_id start_time end_time diff_time region
1 2018-08-09 17:00:00 2018-08-09 19:00:00 120 EU
2 2018-08-10 01:00:00 2018-08-10 01:00:00 0 EU
2 2018-08-10 03:00:00 2018-08-10 06:00:00 180 EU
3 2018-08-10 09:00:00 2018-08-10 10:00:00 60 SA
3 2018-08-10 13:00:00 2018-08-10 13:00:00 0 SA
由于未正确创建组,以下代码未产生预期结果:
-- The asker's attempt, kept as the example that does NOT produce the desired output.
-- Only the errors that stopped it from running at all are repaired:
--   * the string literal 'no_signal' was missing its closing quote;
--   * the inner query read from "t", while the table in this document is "test".
-- The grouping logic is deliberately left as posted. It still fails because:
--   * grp is a running count of no_signal rows, so it increases on EVERY no_signal row;
--     each no_signal row therefore lands in its own group (together with any
--     non-no_signal rows that follow it) instead of one group per consecutive run;
--   * max(time) - min(time) is never negative, so least(..., 0) always yields 0.
select mobile_id, region,
       least(extract(epoch from max(time) - min(time)), 0) as diff
from (select t.*,
             count(*) filter (where state = 'no_signal') over (partition by mobile_id, region order by time) as grp
      from test t
     ) t
group by mobile_id, region, grp;
-- Returns one row per uninterrupted run ("island") of no_signal rows of a device,
-- with the first and last timestamp of that run.
--
-- Corrections relative to the previous text of this query:
--   * The windows at B were written "PARTITION BY ranked, time [DESC]". DESC is not
--     allowed in PARTITION BY, and partitioning by time leaves one row per partition,
--     so start_time and end_time could never span a run. The ordering now lives in
--     ORDER BY, as the accompanying explanation describes.
--   * Change detection (lag) and the running sum are partitioned by mobile_id, so rows
--     of different devices that interleave in time can neither split nor merge a run.
SELECT DISTINCT
    mobile_id,
    first_value(time) OVER (PARTITION BY mobile_id, ranked ORDER BY time) AS start_time,     -- B
    first_value(time) OVER (PARTITION BY mobile_id, ranked ORDER BY time DESC) AS end_time,
    region
FROM
(
    SELECT
        flagged.mobile_id,
        flagged.state,
        flagged.time,
        flagged.region,
        -- Running count of state changes per device: constant within a run,
        -- so it serves as the run identifier.
        SUM(flagged.is_diff) OVER (PARTITION BY flagged.mobile_id ORDER BY flagged.time) AS ranked   -- A
    FROM
    (
        SELECT
            mobile_id,
            state,
            time,
            region,
            -- 1 when the state differs from the device's previous row. For the first row
            -- of a device lag() is NULL, the comparison is not true, and ELSE yields 1.
            CASE
                WHEN state = lag(state) OVER (PARTITION BY mobile_id ORDER BY time) THEN 0
                ELSE 1
            END AS is_diff
        FROM test
    ) flagged
) runs
WHERE
    state = 'no_signal'
ORDER BY
    mobile_id,
    start_time;
A:问题在于您想按一列排序,却又想按另一列分区。这个问题可以通过这个子查询来解决(该问题在别处已有讨论,原文此处为链接)。我仍在寻找更好的方案,但此子查询是有效的:它会生成一个新列,之后可以用作您所需 window 的分区列。
B:创建 window 后,可以使用 first_value(time) 和 first_value(time) ... ORDER BY time DESC 轻松计算出 start_time 和 end_time。之所以使用 DESC,是因为它让 window 按时间从晚到早排序,这样取第一个值就得到了最晚的时间。
为了更清楚地了解真正的问题,我省略了上面的 diff
计算:要添加 diff
你只需要做一个子查询:
-- Template: wraps the island query and appends the run length in minutes.
-- EXTRACT(epoch ...) gives the interval in seconds; dividing by 60 converts to minutes.
-- Paste the query from above in place of the placeholder (without its trailing semicolon).
WITH islands AS (
    -- <QUERY ABOVE>
)
SELECT
    islands.*,
    EXTRACT(epoch FROM (end_time - start_time)) / 60 AS diff
FROM islands
这是间隙和孤岛(gaps and islands)问题的一个变体。在这里,您要检测的是每个 mobile_id 下状态为 no_signal 的各个记录岛。
这个答案使用“行号差值法”(difference in row number method)。这个技巧是以两种方式对表应用 ROW_NUMBER:第一种为所有记录按时间排序生成序列;第二种只针对状态为 no_signal 的记录,在每个 mobile_id 组内生成序列。同一个岛内,这两个行号的差值保持不变,因此可以用它来标识每个岛。之后只需聚合,取每个岛的最小和最大时间戳,即可得到您想要的结果。
-- Gaps-and-islands via the difference of two row numbers, computed in one pass.
--
--   rn1: position of the row among ALL rows of its device, ordered by time.
--   rn2: position of the row among the rows of its device that share its state.
--
-- Inside an uninterrupted run of one state both counters advance together, so
-- rn1 - rn2 is constant for the run and changes once another state intervenes.
--
-- Corrections relative to the previous text of this query:
--   * rn1 was numbered across the whole table (no PARTITION BY). Rows of other devices
--     falling between two rows of the same device then changed rn1 - rn2 and split the
--     run. rn1 is now numbered per mobile_id.
--   * The self-join on (mobile_id, time) is gone; both row numbers come from one scan,
--     which also avoids row duplication if a device has two rows with the same time.
--   * time is already a timestamp, so the ::timestamp casts were dropped.
WITH numbered AS (
    SELECT
        mobile_id,
        state,
        time,
        region,
        ROW_NUMBER() OVER (PARTITION BY mobile_id ORDER BY time) AS rn1,
        ROW_NUMBER() OVER (PARTITION BY mobile_id, state ORDER BY time) AS rn2
    FROM test
)
SELECT
    mobile_id,
    MIN(time) AS start_time,
    MAX(time) AS end_time,
    -- epoch of the interval is in seconds; divide by 60 for minutes
    EXTRACT(epoch FROM MAX(time) - MIN(time)) / 60 AS diff_time,
    region
FROM numbered
WHERE state = 'no_signal'
GROUP BY
    mobile_id,
    region,
    (rn1 - rn2)
ORDER BY
    mobile_id,
    start_time;