计算布尔列更改之间的间隔
Calculate interval between boolean column change
我有一个记录天气测量数据的表,下面是它的简化版本:
"station_id","measured_at","rainy"
-------------------------------------------------------------------------
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:35:35.27+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:36:33.976+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:37:33.864+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:38:34.767+00",TRUE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:39:36.076+00",TRUE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:40:29.776+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:41:35.579+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:42:34.274+00",TRUE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:43:23.842+00",TRUE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:44:35.08+00",FALSE
我需要计算晴天一共持续了多长时间。我知道需要使用窗口函数,但卡住了,因为我需要在某个时间范围内(例如最近一天)计算这个间隔。目前我已经写出了下面这个查询:
-- Readings after which `rainy` flips, each paired with the time of the next
-- such reading (NOW() for the last one) and the span between the two.
WITH readings AS (
    SELECT
        m.station_id,
        m.measured_at,
        m.rainy,
        -- LEAD() looks at the following reading, so despite its name this column
        -- holds the next rainy value; the final reading falls back to its own.
        COALESCE(LEAD(m.rainy) OVER by_time, m.rainy) AS prev_rainy
    FROM z_measurements AS m
    WHERE m.measured_at >= '2020-01-30T00:00:00.000Z'
    WINDOW by_time AS (ORDER BY m.measured_at ASC)
    ORDER BY m.measured_at ASC
)
SELECT
    r.station_id,
    r.rainy,
    r.measured_at AS started_at,
    COALESCE(LEAD(r.measured_at) OVER by_time, NOW()) AS ended_at,
    COALESCE(LEAD(r.measured_at) OVER by_time, NOW()) - r.measured_at AS diff
FROM readings AS r
-- Keep only readings whose rainy value differs from the one that follows.
WHERE r.rainy IS DISTINCT FROM r.prev_rainy
WINDOW by_time AS (ORDER BY r.measured_at ASC)
ORDER BY r.measured_at ASC;
此查询结果为:
"station_id","rainy","started_at","ended_at","diff"
---------------------------------------------------
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485",FALSE,"2020-01-31 18:37:33.864","2020-01-31 18:39:36.076+00","00:02:02.212"
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485",TRUE,"2020-01-31 18:39:36.076","2020-01-31 18:41:35.579+00","00:01:59.503"
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485",FALSE,"2020-01-31 18:41:35.579","2020-01-31 18:43:23.842+00","00:01:48.263"
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485",TRUE,"2020-01-31 18:43:23.842","2020-01-31 21:18:04.89333+00","02:34:41.05133"
但是,如果结果的第一行是 rainy = FALSE,那么它的 started_at 应该等于查询范围的起点 2020-01-30T00:00:00.000Z(我假设:如果第一条记录是晴天,那么从查询范围起点到第一次测量之间的整段时间都是晴天)。此外,结果中还缺少最后一行:因为最后一次测量的状态是晴天,所以最后一行的 started_at 应该是 2020-01-31 18:44:35.08+00,而 ended_at 应该是 NOW()。
有人可以帮我吗?
我正在使用 postgresql 12.1。
这是一个"间隙与孤岛"(gaps and islands)问题:您希望把一系列相邻且状态相同的记录归为一组。
下面是一种解法,它在两个不同的分区上使用 row_number():两个行号之差标识了每条记录所属的组,您可以据此对结果集进行聚合。
-- Gaps-and-islands: collapse every run of consecutive readings that share the
-- same `rainy` value into a single row per station.
with numbered as (
    select
        t.station_id,
        t.rainy,
        t.measured_at,
        -- position of the reading within its station
        row_number() over (partition by t.station_id order by t.measured_at) as rn1,
        -- position of the reading among the station's readings with the same rainy value
        row_number() over (partition by t.station_id, t.rainy order by t.measured_at) as rn2
    from mytable as t
)
select
    n.station_id,
    n.rainy,
    min(n.measured_at) as started_at,
    max(n.measured_at) as ended_at,
    max(n.measured_at) - min(n.measured_at) as diff
from numbered as n
-- rn1 - rn2 is constant inside a run, so it serves as the run identifier
group by n.station_id, n.rainy, n.rn1 - n.rn2
order by station_id, started_at
-- Per-station weather intervals: one row per run of identical `rainy`
-- readings, from the reading that opened the run to the reading that opened
-- the next run.
--   * The first interval of every station is stretched back to range_start,
--     whatever its rainy value is.
--   * The last interval of every station is still open, so it ends at NOW().
-- The range start is declared once in `params` and reused by both the filter
-- and the started_at fallback, so the two can never disagree.
WITH params AS (
    -- NOTE(review): typed as timestamptz because the sample rows carry a +00
    -- offset and ended_at defaults to NOW(); confirm against the definition
    -- of z_measurements.measured_at.
    SELECT TIMESTAMPTZ '2020-01-30T00:00:00.000Z' AS range_start
),
readings AS (
    SELECT
        m.station_id,
        m.measured_at,
        m.rainy,
        p.range_start,
        -- Defaulting to NOT rainy makes the first reading of each station
        -- always count as a change, so it opens the first interval.
        LAG(m.rainy, 1, NOT m.rainy) OVER (
            PARTITION BY m.station_id
            ORDER BY m.measured_at ASC
        ) AS prev_rainy
    FROM z_measurements AS m
    CROSS JOIN params AS p
    WHERE m.measured_at >= p.range_start
),
changes AS (
    SELECT
        r.station_id,
        r.rainy,
        CASE
            -- No earlier change for this station: the interval starts at the range start.
            WHEN LAG(r.measured_at) OVER measured_at_by_station_id IS NULL THEN r.range_start
            ELSE r.measured_at
        END AS started_at,
        LEAD(r.measured_at, 1, NOW()) OVER measured_at_by_station_id AS ended_at
    FROM readings AS r
    -- Keep only the readings where the rainy value changed.
    WHERE r.rainy IS DISTINCT FROM r.prev_rainy
    WINDOW measured_at_by_station_id AS (PARTITION BY r.station_id ORDER BY r.measured_at ASC)
)
SELECT
    c.station_id,
    c.rainy,
    c.started_at,
    c.ended_at,
    c.ended_at - c.started_at AS diff
FROM changes AS c
ORDER BY c.station_id ASC, c.started_at ASC
我有一个记录天气测量数据的表,下面是它的简化版本:
"station_id","measured_at","rainy"
-------------------------------------------------------------------------
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:35:35.27+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:36:33.976+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:37:33.864+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:38:34.767+00",TRUE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:39:36.076+00",TRUE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:40:29.776+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:41:35.579+00",FALSE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:42:34.274+00",TRUE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:43:23.842+00",TRUE
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485","2020-01-31 18:44:35.08+00",FALSE
我需要计算晴天一共持续了多长时间。我知道需要使用窗口函数,但卡住了,因为我需要在某个时间范围内(例如最近一天)计算这个间隔。目前我已经写出了下面这个查询:
-- Readings after which `rainy` flips, each paired with the time of the next
-- such reading (NOW() for the last one) and the span between the two.
WITH readings AS (
    SELECT
        m.station_id,
        m.measured_at,
        m.rainy,
        -- LEAD() looks at the following reading, so despite its name this column
        -- holds the next rainy value; the final reading falls back to its own.
        COALESCE(LEAD(m.rainy) OVER by_time, m.rainy) AS prev_rainy
    FROM z_measurements AS m
    WHERE m.measured_at >= '2020-01-30T00:00:00.000Z'
    WINDOW by_time AS (ORDER BY m.measured_at ASC)
    ORDER BY m.measured_at ASC
)
SELECT
    r.station_id,
    r.rainy,
    r.measured_at AS started_at,
    COALESCE(LEAD(r.measured_at) OVER by_time, NOW()) AS ended_at,
    COALESCE(LEAD(r.measured_at) OVER by_time, NOW()) - r.measured_at AS diff
FROM readings AS r
-- Keep only readings whose rainy value differs from the one that follows.
WHERE r.rainy IS DISTINCT FROM r.prev_rainy
WINDOW by_time AS (ORDER BY r.measured_at ASC)
ORDER BY r.measured_at ASC;
此查询结果为:
"station_id","rainy","started_at","ended_at","diff"
---------------------------------------------------
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485",FALSE,"2020-01-31 18:37:33.864","2020-01-31 18:39:36.076+00","00:02:02.212"
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485",TRUE,"2020-01-31 18:39:36.076","2020-01-31 18:41:35.579+00","00:01:59.503"
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485",FALSE,"2020-01-31 18:41:35.579","2020-01-31 18:43:23.842+00","00:01:48.263"
"b6b53561-dab5-4b9a-8d28-a7de1e4d1485",TRUE,"2020-01-31 18:43:23.842","2020-01-31 21:18:04.89333+00","02:34:41.05133"
但是,如果结果的第一行是 rainy = FALSE,那么它的 started_at 应该等于查询范围的起点 2020-01-30T00:00:00.000Z(我假设:如果第一条记录是晴天,那么从查询范围起点到第一次测量之间的整段时间都是晴天)。此外,结果中还缺少最后一行:因为最后一次测量的状态是晴天,所以最后一行的 started_at 应该是 2020-01-31 18:44:35.08+00,而 ended_at 应该是 NOW()。
有人可以帮我吗?
我正在使用 postgresql 12.1。
这是一个"间隙与孤岛"(gaps and islands)问题:您希望把一系列相邻且状态相同的记录归为一组。
下面是一种解法,它在两个不同的分区上使用 row_number():两个行号之差标识了每条记录所属的组,您可以据此对结果集进行聚合。
-- Gaps-and-islands: collapse every run of consecutive readings that share the
-- same `rainy` value into a single row per station.
with numbered as (
    select
        t.station_id,
        t.rainy,
        t.measured_at,
        -- position of the reading within its station
        row_number() over (partition by t.station_id order by t.measured_at) as rn1,
        -- position of the reading among the station's readings with the same rainy value
        row_number() over (partition by t.station_id, t.rainy order by t.measured_at) as rn2
    from mytable as t
)
select
    n.station_id,
    n.rainy,
    min(n.measured_at) as started_at,
    max(n.measured_at) as ended_at,
    max(n.measured_at) - min(n.measured_at) as diff
from numbered as n
-- rn1 - rn2 is constant inside a run, so it serves as the run identifier
group by n.station_id, n.rainy, n.rn1 - n.rn2
order by station_id, started_at
-- Per-station weather intervals: one row per run of identical `rainy`
-- readings, from the reading that opened the run to the reading that opened
-- the next run.
--   * The first interval of every station is stretched back to range_start,
--     whatever its rainy value is.
--   * The last interval of every station is still open, so it ends at NOW().
-- The range start is declared once in `params` and reused by both the filter
-- and the started_at fallback, so the two can never disagree.
WITH params AS (
    -- NOTE(review): typed as timestamptz because the sample rows carry a +00
    -- offset and ended_at defaults to NOW(); confirm against the definition
    -- of z_measurements.measured_at.
    SELECT TIMESTAMPTZ '2020-01-30T00:00:00.000Z' AS range_start
),
readings AS (
    SELECT
        m.station_id,
        m.measured_at,
        m.rainy,
        p.range_start,
        -- Defaulting to NOT rainy makes the first reading of each station
        -- always count as a change, so it opens the first interval.
        LAG(m.rainy, 1, NOT m.rainy) OVER (
            PARTITION BY m.station_id
            ORDER BY m.measured_at ASC
        ) AS prev_rainy
    FROM z_measurements AS m
    CROSS JOIN params AS p
    WHERE m.measured_at >= p.range_start
),
changes AS (
    SELECT
        r.station_id,
        r.rainy,
        CASE
            -- No earlier change for this station: the interval starts at the range start.
            WHEN LAG(r.measured_at) OVER measured_at_by_station_id IS NULL THEN r.range_start
            ELSE r.measured_at
        END AS started_at,
        LEAD(r.measured_at, 1, NOW()) OVER measured_at_by_station_id AS ended_at
    FROM readings AS r
    -- Keep only the readings where the rainy value changed.
    WHERE r.rainy IS DISTINCT FROM r.prev_rainy
    WINDOW measured_at_by_station_id AS (PARTITION BY r.station_id ORDER BY r.measured_at ASC)
)
SELECT
    c.station_id,
    c.rainy,
    c.started_at,
    c.ended_at,
    c.ended_at - c.started_at AS diff
FROM changes AS c
ORDER BY c.station_id ASC, c.started_at ASC