PostgreSQL:数据子集中的非空替换
PostgreSQL: Non-null replacements in data subsets
编辑:我的错,之前把时间戳列叫成了 'date'……
我们的数据 table 由时间戳、值和增量列组成。增量是自上次非空读数以来的分钟数。
-- Sample readings, one row per minute. "value" is NULL when no reading was
-- taken; "delta" is the number of minutes since the last non-null reading.
CREATE TABLE table1 (
    ts    timestamptz,
    value numeric,
    delta integer
);

-- Seed data: twelve consecutive minutes with gaps in the readings.
INSERT INTO table1 (ts, value, delta)
VALUES
    ('2019-09-09 12:01:00', 3.5,  NULL),
    ('2019-09-09 12:02:00', 3.2,  1),
    ('2019-09-09 12:03:00', NULL, 1),
    ('2019-09-09 12:04:00', 2.9,  2),
    ('2019-09-09 12:05:00', NULL, 1),
    ('2019-09-09 12:06:00', 3.0,  2),
    ('2019-09-09 12:07:00', NULL, 1),
    ('2019-09-09 12:08:00', NULL, 2),
    ('2019-09-09 12:09:00', NULL, 3),
    ('2019-09-09 12:10:00', NULL, 4),
    ('2019-09-09 12:11:00', 3.2,  5),
    ('2019-09-09 12:12:00', NULL, 1);
-- Full contents of the sample table in chronological order.
-- (The trailing comma after the last column and the reserved word "table"
-- as a relation name were both syntax errors.)
SELECT
    ts,
    value,
    delta
FROM table1
ORDER BY ts;
+---------------------+-------+-------+
| ts | value | delta |
+---------------------+-------+-------+
| 2019-09-09 12:01:00 | 3.5 | |
| 2019-09-09 12:02:00 | 3.2 | 1 |
| 2019-09-09 12:03:00 | | 1 |
| 2019-09-09 12:04:00 | 2.9 | 2 |
| 2019-09-09 12:05:00 | | 1 |
| 2019-09-09 12:06:00 | 3.0 | 2 |
| 2019-09-09 12:07:00 | | 1 |
| 2019-09-09 12:08:00 | | 2 |
| 2019-09-09 12:09:00 | | 3 |
| 2019-09-09 12:10:00 | | 4 |
| 2019-09-09 12:11:00 | 3.2 | 5 |
| 2019-09-09 12:12:00 | | 1 |
+---------------------+-------+-------+
给定数据的一个子集,如果最后一个非空值尚未被选入该子集,我们如何用它来替换子集中的空值:
-- Arbitrary subset of the sample table. The WHERE clause below is a
-- placeholder: substitute the real row filter before running.
SELECT
    ts,
    value,
    delta
FROM table1
WHERE (/* expression giving us an arbitrary distribution of rows */)
ORDER BY ts;
+---------------------+-------+-------+
| ts | value | delta |
+---------------------+-------+-------+
| 2019-09-09 12:01:00 | 3.5 | |
| 2019-09-09 12:03:00 | | 1 |
| 2019-09-09 12:05:00 | | 1 |
| 2019-09-09 12:07:00 | | 1 |
| 2019-09-09 12:09:00 | | 3 |
| 2019-09-09 12:11:00 | 3.2 | 5 |
+---------------------+-------+-------+
我们想要:
+---------------------+-------+-------+
| ts | value | delta |
+---------------------+-------+-------+
| 2019-09-09 12:01:00 | 3.5 | |
| 2019-09-09 12:03:00 | 3.2 | 1 |
| 2019-09-09 12:05:00 | 2.9 | 1 |
| 2019-09-09 12:07:00 | 3.0 | 1 |
| 2019-09-09 12:09:00 | | 3 |<- an actual null
| 2019-09-09 12:11:00 | 3.2 | 5 |
+---------------------+-------+-------+
在这个例子中,被选中的恰好是奇数分钟的行;然而,这种分布是任意的。我们不能依靠时间戳频率中的某种规律来确定何时以及是否使用最后的非空值。
到目前为止我们已经尝试了什么
第一步,结转所有最后的值。
-- Step one: carry the last non-null value forward onto every row.
-- The table's timestamp column is "ts"; the earlier ORDER BY date references
-- pointed at a column that does not exist.
WITH seq AS (
    -- Number the rows chronologically and record the overall row count.
    SELECT
        ts,
        value,
        delta,
        ROW_NUMBER() OVER (ORDER BY ts) AS row_no,
        COUNT(*) OVER () AS total_count
    FROM table1
),
grouped AS (
    -- COUNT(value) ignores NULLs, so this running count only increases on
    -- rows that carry a reading. Each reading and the NULL rows that follow
    -- it therefore share one value_p.
    SELECT
        ts,
        value,
        delta,
        row_no,
        total_count,
        COUNT(value) OVER (ORDER BY ts) AS value_p
    FROM seq
),
val AS (
    -- Within a value_p group the chronologically first row is the reading.
    SELECT
        ts,
        value,
        value_p,
        FIRST_VALUE(value) OVER (PARTITION BY value_p ORDER BY ts) AS first_value,
        delta,
        row_no,
        total_count
    FROM grouped
)
SELECT
    ts,
    delta,
    value,
    COALESCE(value, first_value) AS cf  -- carried-forward value
FROM val
ORDER BY ts;
| ts | delta | value | cf |
|---------------------|--------|--------|-----|
| 2019-09-09 12:01:00 | (null) | 3.5 | 3.5 |
| 2019-09-09 12:02:00 | 1 | 3.2 | 3.2 |
| 2019-09-09 12:03:00 | 1 | (null) | 3.2 |
| 2019-09-09 12:04:00 | 2 | 2.9 | 2.9 |
| 2019-09-09 12:05:00 | 1 | (null) | 2.9 |
| 2019-09-09 12:06:00 | 2 | 3 | 3 |
| 2019-09-09 12:07:00 | 1 | (null) | 3 |
| 2019-09-09 12:08:00 | 2 | (null) | 3 |
| 2019-09-09 12:09:00 | 3 | (null) | 3 |
| 2019-09-09 12:10:00 | 4 | (null) | 3 |
| 2019-09-09 12:11:00 | 5 | 3.2 | 3.2 |
| 2019-09-09 12:12:00 | 1 | (null) | 3.2 |
当我们从数据中选取子集的行时,现在既有结转的值,也知道该值来自多少行之前。我们想不通的是,在通过 WHERE 生成子集时,如何判断应当向前结转还是留空。
如果解决方案不需要预定义的增量列,则奖励积分。
使用 sum(case when value is null then 0 else 1 end) over (order by date) as value_p
的想法很好。这会将值分类到具有相同 value_p 的组中。
从那里开始,如果您将 date
视为实际时间戳,则可以使用 tsrange(min(date), max(date), '[]') 将日期分组在一起。确保范围的末端包含在内,以捕获组的开始和结束时间相同的行。
然后,只需使用"被包含于"运算符(<@)把您的测试日期与这些范围连接起来。
-- For each requested timestamp, return the reading that governs it, but only
-- for the first requested timestamp in each reading's range; later ones in
-- the same range stay NULL. The delta column is not needed.
WITH test_dates (test_date) AS (
    VALUES
        ('2019-09-09 12:01:00'::timestamp),
        ('2019-09-09 12:03:00'),
        ('2019-09-09 12:05:00'),
        ('2019-09-09 12:07:00'),
        ('2019-09-09 12:09:00'),
        ('2019-09-09 12:11:00')
),
grouped AS (
    -- COUNT(value) ignores NULLs, so each reading and the NULL rows that
    -- follow it share one value_p.
    SELECT
        ts,
        value,
        COUNT(value) OVER (ORDER BY ts) AS value_p
    FROM table1
),
value_ranges AS (
    -- One inclusive range per group. '[]' keeps single-row groups, whose
    -- start and end coincide, from becoming empty ranges.
    -- ts is timestamptz; the cast to timestamp uses the session time zone.
    SELECT
        tsrange(MIN(ts)::timestamp, MAX(ts)::timestamp, '[]') AS sample_range,
        MAX(value) AS value,  -- at most one non-null value per group, so MIN works too
        value_p
    FROM grouped
    GROUP BY value_p
)
SELECT
    test_date,
    CASE
        WHEN ROW_NUMBER() OVER (PARTITION BY value_p ORDER BY test_date) = 1 THEN value
        ELSE NULL  -- only the first requested row of a group gets the value
    END AS value
FROM test_dates
INNER JOIN value_ranges
    ON test_date <@ sample_range
ORDER BY test_date;
不需要增量列。
更新:我意识到自己提取的是采样参考点之后的条目,而本应提取参考点之前的条目。已修复。
针对你的表,并假设你需要的是时间戳而不是日期,下面的查询会给出你想要的结果。只需更改第一个表表达式(CTE)中的 minutes_between_intervals
列,即可调整采样间隔。
为了提高可读性,我让 CTE 变得比它们需要的更加冗长。
-- Sample the readings at a fixed interval, carrying the last non-null value
-- recorded since the previous sample point forward onto each sample row.
WITH with_offsets AS (
    -- Whole minutes elapsed since the first row, plus the sampling interval
    -- as a constant column. extract(epoch ...) is used because
    -- extract(minute ...) on an interval returns only the minutes field and
    -- wraps once the data spans an hour or more.
    SELECT
        2 AS minutes_between_intervals, -- This is how often you're sampling
        ts,
        value,
        delta,
        floor(extract(epoch FROM (ts - MIN(ts) OVER ())) / 60)::integer AS minutes_offset
    FROM table1
),
with_groups AS (
    -- Sample rows are the reference points; the rows leading up to a sample
    -- row are rounded up into that sample's group.
    SELECT
        ts,
        value,
        delta,
        CASE
            WHEN minutes_offset % minutes_between_intervals = 0 THEN minutes_offset
            ELSE minutes_offset + (minutes_between_intervals - (minutes_offset % minutes_between_intervals))
        END AS sample_group,
        minutes_offset % minutes_between_intervals = 0 AS is_sample_boundary
    FROM with_offsets
),
with_arrays AS (
    -- Aggregate each group into arrays, ordered by ts so that "last element"
    -- really means "latest row". NULL readings are filtered out of the
    -- readings array; groups that contain a sample row are flagged.
    SELECT
        array_agg(ts ORDER BY ts) AS sample_times,
        array_agg(value ORDER BY ts) FILTER (WHERE value IS NOT NULL) AS readings,
        array_agg(delta ORDER BY ts) AS deltas,
        bool_or(is_sample_boundary) AS has_complete_sample
    FROM with_groups
    GROUP BY sample_group
)
-- The last entry of each array gives the sample timestamp, the last recorded
-- value and the sample row's delta. readings is NULL when the group has no
-- non-null value, which yields a NULL value for that sample.
SELECT
    sample_times[array_upper(sample_times, 1)] AS ts,
    readings[array_upper(readings, 1)] AS value,
    deltas[array_upper(deltas, 1)] AS delta
FROM with_arrays
WHERE has_complete_sample
ORDER BY ts;
编辑:我的错,之前把时间戳列叫成了 'date'……
我们的数据 table 由时间戳、值和增量列组成。增量是自上次非空读数以来的分钟数。
-- Sample readings, one row per minute. "value" is NULL when no reading was
-- taken; "delta" is the number of minutes since the last non-null reading.
CREATE TABLE table1 (
    ts    timestamptz,
    value numeric,
    delta integer
);

-- Seed data: twelve consecutive minutes with gaps in the readings.
INSERT INTO table1 (ts, value, delta)
VALUES
    ('2019-09-09 12:01:00', 3.5,  NULL),
    ('2019-09-09 12:02:00', 3.2,  1),
    ('2019-09-09 12:03:00', NULL, 1),
    ('2019-09-09 12:04:00', 2.9,  2),
    ('2019-09-09 12:05:00', NULL, 1),
    ('2019-09-09 12:06:00', 3.0,  2),
    ('2019-09-09 12:07:00', NULL, 1),
    ('2019-09-09 12:08:00', NULL, 2),
    ('2019-09-09 12:09:00', NULL, 3),
    ('2019-09-09 12:10:00', NULL, 4),
    ('2019-09-09 12:11:00', 3.2,  5),
    ('2019-09-09 12:12:00', NULL, 1);
-- Full contents of the sample table in chronological order.
-- (The trailing comma after the last column and the reserved word "table"
-- as a relation name were both syntax errors.)
SELECT
    ts,
    value,
    delta
FROM table1
ORDER BY ts;
+---------------------+-------+-------+
| ts | value | delta |
+---------------------+-------+-------+
| 2019-09-09 12:01:00 | 3.5 | |
| 2019-09-09 12:02:00 | 3.2 | 1 |
| 2019-09-09 12:03:00 | | 1 |
| 2019-09-09 12:04:00 | 2.9 | 2 |
| 2019-09-09 12:05:00 | | 1 |
| 2019-09-09 12:06:00 | 3.0 | 2 |
| 2019-09-09 12:07:00 | | 1 |
| 2019-09-09 12:08:00 | | 2 |
| 2019-09-09 12:09:00 | | 3 |
| 2019-09-09 12:10:00 | | 4 |
| 2019-09-09 12:11:00 | 3.2 | 5 |
| 2019-09-09 12:12:00 | | 1 |
+---------------------+-------+-------+
给定数据的一个子集,如果最后一个非空值尚未被选入该子集,我们如何用它来替换子集中的空值:
-- Arbitrary subset of the sample table. The WHERE clause below is a
-- placeholder: substitute the real row filter before running.
SELECT
    ts,
    value,
    delta
FROM table1
WHERE (/* expression giving us an arbitrary distribution of rows */)
ORDER BY ts;
+---------------------+-------+-------+
| ts | value | delta |
+---------------------+-------+-------+
| 2019-09-09 12:01:00 | 3.5 | |
| 2019-09-09 12:03:00 | | 1 |
| 2019-09-09 12:05:00 | | 1 |
| 2019-09-09 12:07:00 | | 1 |
| 2019-09-09 12:09:00 | | 3 |
| 2019-09-09 12:11:00 | 3.2 | 5 |
+---------------------+-------+-------+
我们想要:
+---------------------+-------+-------+
| ts | value | delta |
+---------------------+-------+-------+
| 2019-09-09 12:01:00 | 3.5 | |
| 2019-09-09 12:03:00 | 3.2 | 1 |
| 2019-09-09 12:05:00 | 2.9 | 1 |
| 2019-09-09 12:07:00 | 3.0 | 1 |
| 2019-09-09 12:09:00 | | 3 |<- an actual null
| 2019-09-09 12:11:00 | 3.2 | 5 |
+---------------------+-------+-------+
在这个例子中,被选中的恰好是奇数分钟的行;然而,这种分布是任意的。我们不能依靠时间戳频率中的某种规律来确定何时以及是否使用最后的非空值。
到目前为止我们已经尝试了什么
第一步,结转所有最后的值。
-- Step one: carry the last non-null value forward onto every row.
-- The table's timestamp column is "ts"; the earlier ORDER BY date references
-- pointed at a column that does not exist.
WITH seq AS (
    -- Number the rows chronologically and record the overall row count.
    SELECT
        ts,
        value,
        delta,
        ROW_NUMBER() OVER (ORDER BY ts) AS row_no,
        COUNT(*) OVER () AS total_count
    FROM table1
),
grouped AS (
    -- COUNT(value) ignores NULLs, so this running count only increases on
    -- rows that carry a reading. Each reading and the NULL rows that follow
    -- it therefore share one value_p.
    SELECT
        ts,
        value,
        delta,
        row_no,
        total_count,
        COUNT(value) OVER (ORDER BY ts) AS value_p
    FROM seq
),
val AS (
    -- Within a value_p group the chronologically first row is the reading.
    SELECT
        ts,
        value,
        value_p,
        FIRST_VALUE(value) OVER (PARTITION BY value_p ORDER BY ts) AS first_value,
        delta,
        row_no,
        total_count
    FROM grouped
)
SELECT
    ts,
    delta,
    value,
    COALESCE(value, first_value) AS cf  -- carried-forward value
FROM val
ORDER BY ts;
| ts | delta | value | cf |
|---------------------|--------|--------|-----|
| 2019-09-09 12:01:00 | (null) | 3.5 | 3.5 |
| 2019-09-09 12:02:00 | 1 | 3.2 | 3.2 |
| 2019-09-09 12:03:00 | 1 | (null) | 3.2 |
| 2019-09-09 12:04:00 | 2 | 2.9 | 2.9 |
| 2019-09-09 12:05:00 | 1 | (null) | 2.9 |
| 2019-09-09 12:06:00 | 2 | 3 | 3 |
| 2019-09-09 12:07:00 | 1 | (null) | 3 |
| 2019-09-09 12:08:00 | 2 | (null) | 3 |
| 2019-09-09 12:09:00 | 3 | (null) | 3 |
| 2019-09-09 12:10:00 | 4 | (null) | 3 |
| 2019-09-09 12:11:00 | 5 | 3.2 | 3.2 |
| 2019-09-09 12:12:00 | 1 | (null) | 3.2 |
当我们从数据中选取子集的行时,现在既有结转的值,也知道该值来自多少行之前。我们想不通的是,在通过 WHERE 生成子集时,如何判断应当向前结转还是留空。
如果解决方案不需要预定义的增量列,则奖励积分。
使用 sum(case when value is null then 0 else 1 end) over (order by date) as value_p
的想法很好。这会将值分类到具有相同 value_p 的组中。
从那里开始,如果您将 date
视为实际时间戳,则可以使用 tsrange(min(date), max(date), '[]') 将日期分组在一起。确保范围的末端包含在内,以捕获组的开始和结束时间相同的行。
然后,只需使用"被包含于"运算符(<@)把您的测试日期与这些范围连接起来。
-- For each requested timestamp, return the reading that governs it, but only
-- for the first requested timestamp in each reading's range; later ones in
-- the same range stay NULL. The delta column is not needed.
WITH test_dates (test_date) AS (
    VALUES
        ('2019-09-09 12:01:00'::timestamp),
        ('2019-09-09 12:03:00'),
        ('2019-09-09 12:05:00'),
        ('2019-09-09 12:07:00'),
        ('2019-09-09 12:09:00'),
        ('2019-09-09 12:11:00')
),
grouped AS (
    -- COUNT(value) ignores NULLs, so each reading and the NULL rows that
    -- follow it share one value_p.
    SELECT
        ts,
        value,
        COUNT(value) OVER (ORDER BY ts) AS value_p
    FROM table1
),
value_ranges AS (
    -- One inclusive range per group. '[]' keeps single-row groups, whose
    -- start and end coincide, from becoming empty ranges.
    -- ts is timestamptz; the cast to timestamp uses the session time zone.
    SELECT
        tsrange(MIN(ts)::timestamp, MAX(ts)::timestamp, '[]') AS sample_range,
        MAX(value) AS value,  -- at most one non-null value per group, so MIN works too
        value_p
    FROM grouped
    GROUP BY value_p
)
SELECT
    test_date,
    CASE
        WHEN ROW_NUMBER() OVER (PARTITION BY value_p ORDER BY test_date) = 1 THEN value
        ELSE NULL  -- only the first requested row of a group gets the value
    END AS value
FROM test_dates
INNER JOIN value_ranges
    ON test_date <@ sample_range
ORDER BY test_date;
不需要增量列。
更新:我意识到自己提取的是采样参考点之后的条目,而本应提取参考点之前的条目。已修复。
针对你的表,并假设你需要的是时间戳而不是日期,下面的查询会给出你想要的结果。只需更改第一个表表达式(CTE)中的 minutes_between_intervals
列,即可调整采样间隔。
为了提高可读性,我让 CTE 变得比它们需要的更加冗长。
-- Sample the readings at a fixed interval, carrying the last non-null value
-- recorded since the previous sample point forward onto each sample row.
WITH with_offsets AS (
    -- Whole minutes elapsed since the first row, plus the sampling interval
    -- as a constant column. extract(epoch ...) is used because
    -- extract(minute ...) on an interval returns only the minutes field and
    -- wraps once the data spans an hour or more.
    SELECT
        2 AS minutes_between_intervals, -- This is how often you're sampling
        ts,
        value,
        delta,
        floor(extract(epoch FROM (ts - MIN(ts) OVER ())) / 60)::integer AS minutes_offset
    FROM table1
),
with_groups AS (
    -- Sample rows are the reference points; the rows leading up to a sample
    -- row are rounded up into that sample's group.
    SELECT
        ts,
        value,
        delta,
        CASE
            WHEN minutes_offset % minutes_between_intervals = 0 THEN minutes_offset
            ELSE minutes_offset + (minutes_between_intervals - (minutes_offset % minutes_between_intervals))
        END AS sample_group,
        minutes_offset % minutes_between_intervals = 0 AS is_sample_boundary
    FROM with_offsets
),
with_arrays AS (
    -- Aggregate each group into arrays, ordered by ts so that "last element"
    -- really means "latest row". NULL readings are filtered out of the
    -- readings array; groups that contain a sample row are flagged.
    SELECT
        array_agg(ts ORDER BY ts) AS sample_times,
        array_agg(value ORDER BY ts) FILTER (WHERE value IS NOT NULL) AS readings,
        array_agg(delta ORDER BY ts) AS deltas,
        bool_or(is_sample_boundary) AS has_complete_sample
    FROM with_groups
    GROUP BY sample_group
)
-- The last entry of each array gives the sample timestamp, the last recorded
-- value and the sample row's delta. readings is NULL when the group has no
-- non-null value, which yields a NULL value for that sample.
SELECT
    sample_times[array_upper(sample_times, 1)] AS ts,
    readings[array_upper(readings, 1)] AS value,
    deltas[array_upper(deltas, 1)] AS delta
FROM with_arrays
WHERE has_complete_sample
ORDER BY ts;