在 ClickHouse 中连接(JOIN)具有不同取值的时间序列
Join timeseries with distinct values in ClickHouse
我遇到了下面这个自己无法解决的问题。主要目的是在 Grafana 中显示图表。第一个 SQL 查询返回如下结果:
-- Number of journal rows per service for each one-minute bucket whose start
-- lies inside the closed window [2019-06-04 00:00:00, 2019-06-05 00:00:00].
SELECT
    toStartOfMinute(date_time) AS t,
    COUNT(1) AS count,
    service_name
FROM SB_STAT.SBCommonJournal
WHERE t >= toDateTime('2019-06-04 00:00:00')
  AND t <= toDateTime('2019-06-05 00:00:00')
GROUP BY
    t,
    service_name
t;count;service_name
2019-06-04 15:43:00;1;test3
2019-06-04 15:35:00;1;test3
2019-06-04 15:12:00;1;test
2019-06-04 14:57:00;1;test
2019-06-04 15:32:00;1;test3
2019-06-04 16:36:00;1;test3
2019-06-04 15:21:00;1;test
第二个:
-- One row per minute from window_start to window_end (both ends included,
-- hence the "+ 1"), each paired with a constant zero count.
WITH
    toDateTime('2019-06-04 00:00:00') AS window_start,
    toDateTime('2019-06-05 00:00:00') AS window_end
SELECT
    toStartOfMinute(
        addMinutes(
            window_start,
            -- minute offsets 0 .. dateDiff('minute', window_start, window_end)
            arrayJoin(range(toUInt64(dateDiff('minute', window_start, window_end) + 1)))
        )
    ) AS t,
    0 AS count;
t;count
2019-06-04 00:00:00;0
2019-06-04 00:01:00;0
2019-06-04 00:02:00;0
2019-06-04 00:03:00;0
2019-06-04 00:04:00;0
2019-06-04 00:05:00;0
2019-06-04 00:06:00;0
2019-06-04 00:07:00;0
2019-06-04 00:08:00;0
2019-06-04 00:09:00;0
2019-06-04 00:10:00;0
etc..
我该如何连接(JOIN)这两个查询,从而得到每个 service_name 在每一分钟的计数?也就是说,我希望得到类似下面的结果:
t;count;service_name
2019-06-04 15:12:00;1;test
2019-06-04 15:12:00;0;test3
2019-06-04 15:13:00;0;test
2019-06-04 15:13:00;0;test3
etc...
试试这个查询:
/* Zero-filled per-minute counts: every (minute tick, service name) pair of
   the stub grid is returned, carrying the real event count where the source
   data has one and the stub's 0 otherwise. */
SELECT
    stub_data.time_tick AS tick,
    stub_data.service_name AS service_name,
    /* keep the larger of the real count and the stub's zero */
    if(source_data.count > stub_data.count, source_data.count, stub_data.count) AS count
FROM (
    /* Cartesian product: [ticks * service_names] */
    SELECT time_tick, service_name, 0 AS count
    FROM (
        /* one tick per minute from 15:30:00 to 15:43:00 inclusive */
        SELECT addMinutes(
            toDateTime('2019-06-04 15:30:00'),
            arrayJoin(range(toUInt64(dateDiff('minute', toDateTime('2019-06-04 15:30:00'), toDateTime('2019-06-04 15:43:00')) + 1)))
        ) AS time_tick
    )
    CROSS JOIN (
        /* distinct service names present in the test data */
        SELECT DISTINCT test_data.3 AS service_name
        FROM (
            /* test data */
            SELECT arrayJoin([
                (toDateTime('2019-06-04 15:43:01'), 1, 'test3'),
                (toDateTime('2019-06-04 15:43:51'), 1, 'test4'),
                (toDateTime('2019-06-04 15:43:52'), 1, 'test4'),
                (toDateTime('2019-06-04 15:43:53'), 1, 'test4'),
                (toDateTime('2019-06-04 15:35:02'), 1, 'test3'),
                (toDateTime('2019-06-04 15:30:03'), 1, 'test'),
                (toDateTime('2019-06-04 15:31:04'), 1, 'test'),
                (toDateTime('2019-06-04 15:32:05'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:06'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:07'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:46'), 1, 'test4'),
                (toDateTime('2019-06-04 15:38:07'), 1, 'test')
            ]) AS test_data
        )
    )
) AS stub_data
LEFT JOIN (
    /* real event counts per minute and service */
    SELECT
        toStartOfMinute(date_time) AS time_tick,
        COUNT() AS count,
        service_name
    FROM (
        /* test data */
        SELECT
            test_data.1 AS date_time,
            test_data.3 AS service_name,
            test_data.2 AS count
        FROM (
            SELECT arrayJoin([
                (toDateTime('2019-06-04 15:43:01'), 1, 'test3'),
                (toDateTime('2019-06-04 15:43:51'), 1, 'test4'),
                (toDateTime('2019-06-04 15:43:52'), 1, 'test4'),
                (toDateTime('2019-06-04 15:43:53'), 1, 'test4'),
                (toDateTime('2019-06-04 15:35:02'), 1, 'test3'),
                (toDateTime('2019-06-04 15:30:03'), 1, 'test'),
                (toDateTime('2019-06-04 15:31:04'), 1, 'test'),
                (toDateTime('2019-06-04 15:32:05'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:06'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:07'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:46'), 1, 'test4'),
                (toDateTime('2019-06-04 15:38:07'), 1, 'test')
            ]) AS test_data
        )
    )
    WHERE time_tick >= toDateTime('2019-06-04 00:00:00')
      AND time_tick <= toDateTime('2019-06-05 00:00:00')
    GROUP BY
        time_tick,
        service_name
) AS source_data
    ON stub_data.time_tick = source_data.time_tick
   AND stub_data.service_name = source_data.service_name
ORDER BY
    tick,
    service_name;
/* Result:
┌────────────────tick─┬─service_name─┬─count─┐
│ 2019-06-04 15:30:00 │ test │ 1 │
│ 2019-06-04 15:30:00 │ test3 │ 0 │
│ 2019-06-04 15:30:00 │ test4 │ 0 │
│ 2019-06-04 15:31:00 │ test │ 1 │
│ 2019-06-04 15:31:00 │ test3 │ 0 │
│ 2019-06-04 15:31:00 │ test4 │ 0 │
│ 2019-06-04 15:32:00 │ test │ 0 │
│ 2019-06-04 15:32:00 │ test3 │ 1 │
│ 2019-06-04 15:32:00 │ test4 │ 0 │
│ 2019-06-04 15:33:00 │ test │ 0 │
│ 2019-06-04 15:33:00 │ test3 │ 0 │
│ 2019-06-04 15:33:00 │ test4 │ 0 │
│ 2019-06-04 15:34:00 │ test │ 0 │
│ 2019-06-04 15:34:00 │ test3 │ 0 │
│ 2019-06-04 15:34:00 │ test4 │ 0 │
│ 2019-06-04 15:35:00 │ test │ 0 │
│ 2019-06-04 15:35:00 │ test3 │ 1 │
│ 2019-06-04 15:35:00 │ test4 │ 0 │
│ 2019-06-04 15:36:00 │ test │ 0 │
│ 2019-06-04 15:36:00 │ test3 │ 2 │
│ 2019-06-04 15:36:00 │ test4 │ 1 │
│ 2019-06-04 15:37:00 │ test │ 0 │
│ 2019-06-04 15:37:00 │ test3 │ 0 │
│ 2019-06-04 15:37:00 │ test4 │ 0 │
│ 2019-06-04 15:38:00 │ test │ 1 │
│ 2019-06-04 15:38:00 │ test3 │ 0 │
│ 2019-06-04 15:38:00 │ test4 │ 0 │
│ 2019-06-04 15:39:00 │ test │ 0 │
│ 2019-06-04 15:39:00 │ test3 │ 0 │
│ 2019-06-04 15:39:00 │ test4 │ 0 │
│ 2019-06-04 15:40:00 │ test │ 0 │
│ 2019-06-04 15:40:00 │ test3 │ 0 │
│ 2019-06-04 15:40:00 │ test4 │ 0 │
│ 2019-06-04 15:41:00 │ test │ 0 │
│ 2019-06-04 15:41:00 │ test3 │ 0 │
│ 2019-06-04 15:41:00 │ test4 │ 0 │
│ 2019-06-04 15:42:00 │ test │ 0 │
│ 2019-06-04 15:42:00 │ test3 │ 0 │
│ 2019-06-04 15:42:00 │ test4 │ 0 │
│ 2019-06-04 15:43:00 │ test │ 0 │
│ 2019-06-04 15:43:00 │ test3 │ 1 │
│ 2019-06-04 15:43:00 │ test4 │ 3 │
└─────────────────────┴──────────────┴───────┘
*/
Grafana 其实自带零填充(zero fill)选项。对于 ClickHouse,您可能只需要像下面这样,对每个时间戳上的键/值对元组使用 groupArray。Grafana 通常会拆分返回的 JSON 数据,并把元组中的第一个元素用作序列(series)名称。
-- One row per minute bucket; `series` collects the (service_name, cnt)
-- pairs that fall into that bucket.
SELECT
    t,
    groupArray(tuple(service_name, cnt)) AS series
FROM
(
    -- Event count per service and minute inside the closed window.
    WITH
        toDateTime('2019-06-04 00:00:00') AS window_start,
        toDateTime('2019-06-05 00:00:00') AS window_end
    SELECT
        service_name,
        toStartOfMinute(date_time) AS t,
        count() AS cnt
    FROM SBCommonJournal
    WHERE date_time >= window_start
      AND date_time <= window_end
    GROUP BY service_name, t
)
GROUP BY t
ORDER BY t
如果上述方法不行,可以改用 WITH FILL:
-- One row per minute bucket with its (service_name, cnt) pairs in `series`;
-- WITH FILL STEP 60 adds rows for t values missing from the sorted output.
SELECT
    t,
    groupArray(tuple(service_name, cnt)) AS series
FROM
(
    -- Event count per service and minute inside the closed window.
    WITH
        toDateTime('2019-06-04 00:00:00') AS window_start,
        toDateTime('2019-06-05 00:00:00') AS window_end
    SELECT
        service_name,
        toStartOfMinute(date_time) AS t,
        count() AS cnt
    FROM SBCommonJournal
    WHERE date_time >= window_start
      AND date_time <= window_end
    GROUP BY service_name, t
)
GROUP BY t
ORDER BY t ASC WITH FILL STEP 60
如果这样仍然不行,那么下面的方法应该可以(在 Grafana 中请用 $from 和 $to 替换查询里的时间范围)。
使用一些生成的 service_names 和指标创建一些示例数据:
-- Recreate the sample table so the script can be re-run from scratch.
DROP TABLE IF EXISTS SBCommonJournal;
-- Populate it with 1,000,000 generated rows: a random service name and a
-- random second within the day starting at 2019-06-04 00:00:00.
CREATE TEMPORARY TABLE SBCommonJournal AS
WITH
(
    -- Array of 10 names, each made of 16 characters in the range 'A'..'Z'
    -- (char code 65 + value % 26). The argument passed to rand() is ignored
    -- by the function; presumably it is there so the calls are not merged.
    SELECT arrayMap(x -> arrayStringConcat(arrayMap(i -> char(65 + (rand((i + x) + 1000) % 26)), range(16))), range(10))
) AS service_names
SELECT
    -- rand(1) and rand(2) carry different arguments on purpose: two textually
    -- identical rand() calls are subject to common-subexpression elimination
    -- and would yield the same value per row, tying service_name to date_time.
    service_names[1 + (rand(1) % length(service_names))] AS service_name,
    toDateTime('2019-06-04 00:00:00') + toIntervalSecond(rand(2) % 86400) AS date_time
FROM numbers_mt(1000000)
查询:
-- Zero-filled event count per (service_name, minute): a grid of zero-valued
-- rows is UNION ALL'ed with the real per-minute counts, and the outer
-- sum(cnt) collapses each (service_name, t) pair into a single row.
SELECT
service_name,
t,
sum(cnt) AS cnt
FROM
(
-- Branch 1: zero-valued grid. The two arrayJoin calls expand different
-- arrays (service names and timestamps), giving every combination of both.
SELECT
arrayJoin(groupUniqArray(service_name)) AS service_name,
arrayJoin(
(
-- Scalar subquery returning the array of grid timestamps.
SELECT groupArray(d)
FROM
(
-- Only the two window boundaries are supplied as rows; WITH FILL STEP 60
-- inserts the values in between (a step of 60 on a DateTime column).
SELECT arrayJoin([toDateTime('2019-06-04 00:00:00'), toDateTime('2019-06-05 00:00:00')]) AS d
GROUP BY d
ORDER BY d ASC WITH FILL STEP 60
)
)) AS t,
0 AS cnt
FROM SBCommonJournal
WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
UNION ALL
-- Branch 2: actual number of rows per service and minute bucket.
SELECT
service_name,
toStartOfMinute(date_time) AS t,
count() AS cnt
FROM SBCommonJournal
WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
GROUP BY
service_name,
t
)
-- Merge grid rows and real rows that share the same service and minute.
GROUP BY
service_name,
t
ORDER BY
t ASC,
service_name ASC
我遇到了下面这个自己无法解决的问题。主要目的是在 Grafana 中显示图表。第一个 SQL 查询返回如下结果:
-- Number of journal rows per service for each one-minute bucket whose start
-- lies inside the closed window [2019-06-04 00:00:00, 2019-06-05 00:00:00].
SELECT
    toStartOfMinute(date_time) AS t,
    COUNT(1) AS count,
    service_name
FROM SB_STAT.SBCommonJournal
WHERE t >= toDateTime('2019-06-04 00:00:00')
  AND t <= toDateTime('2019-06-05 00:00:00')
GROUP BY
    t,
    service_name
t;count;service_name
2019-06-04 15:43:00;1;test3
2019-06-04 15:35:00;1;test3
2019-06-04 15:12:00;1;test
2019-06-04 14:57:00;1;test
2019-06-04 15:32:00;1;test3
2019-06-04 16:36:00;1;test3
2019-06-04 15:21:00;1;test
第二个:
-- One row per minute from window_start to window_end (both ends included,
-- hence the "+ 1"), each paired with a constant zero count.
WITH
    toDateTime('2019-06-04 00:00:00') AS window_start,
    toDateTime('2019-06-05 00:00:00') AS window_end
SELECT
    toStartOfMinute(
        addMinutes(
            window_start,
            -- minute offsets 0 .. dateDiff('minute', window_start, window_end)
            arrayJoin(range(toUInt64(dateDiff('minute', window_start, window_end) + 1)))
        )
    ) AS t,
    0 AS count;
t;count
2019-06-04 00:00:00;0
2019-06-04 00:01:00;0
2019-06-04 00:02:00;0
2019-06-04 00:03:00;0
2019-06-04 00:04:00;0
2019-06-04 00:05:00;0
2019-06-04 00:06:00;0
2019-06-04 00:07:00;0
2019-06-04 00:08:00;0
2019-06-04 00:09:00;0
2019-06-04 00:10:00;0
etc..
我该如何连接(JOIN)这两个查询,从而得到每个 service_name 在每一分钟的计数?也就是说,我希望得到类似下面的结果:
t;count;service_name
2019-06-04 15:12:00;1;test
2019-06-04 15:12:00;0;test3
2019-06-04 15:13:00;0;test
2019-06-04 15:13:00;0;test3
etc...
试试这个查询:
/* Zero-filled per-minute counts: every (minute tick, service name) pair of
   the stub grid is returned, carrying the real event count where the source
   data has one and the stub's 0 otherwise. */
SELECT
    stub_data.time_tick AS tick,
    stub_data.service_name AS service_name,
    /* keep the larger of the real count and the stub's zero */
    if(source_data.count > stub_data.count, source_data.count, stub_data.count) AS count
FROM (
    /* Cartesian product: [ticks * service_names] */
    SELECT time_tick, service_name, 0 AS count
    FROM (
        /* one tick per minute from 15:30:00 to 15:43:00 inclusive */
        SELECT addMinutes(
            toDateTime('2019-06-04 15:30:00'),
            arrayJoin(range(toUInt64(dateDiff('minute', toDateTime('2019-06-04 15:30:00'), toDateTime('2019-06-04 15:43:00')) + 1)))
        ) AS time_tick
    )
    CROSS JOIN (
        /* distinct service names present in the test data */
        SELECT DISTINCT test_data.3 AS service_name
        FROM (
            /* test data */
            SELECT arrayJoin([
                (toDateTime('2019-06-04 15:43:01'), 1, 'test3'),
                (toDateTime('2019-06-04 15:43:51'), 1, 'test4'),
                (toDateTime('2019-06-04 15:43:52'), 1, 'test4'),
                (toDateTime('2019-06-04 15:43:53'), 1, 'test4'),
                (toDateTime('2019-06-04 15:35:02'), 1, 'test3'),
                (toDateTime('2019-06-04 15:30:03'), 1, 'test'),
                (toDateTime('2019-06-04 15:31:04'), 1, 'test'),
                (toDateTime('2019-06-04 15:32:05'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:06'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:07'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:46'), 1, 'test4'),
                (toDateTime('2019-06-04 15:38:07'), 1, 'test')
            ]) AS test_data
        )
    )
) AS stub_data
LEFT JOIN (
    /* real event counts per minute and service */
    SELECT
        toStartOfMinute(date_time) AS time_tick,
        COUNT() AS count,
        service_name
    FROM (
        /* test data */
        SELECT
            test_data.1 AS date_time,
            test_data.3 AS service_name,
            test_data.2 AS count
        FROM (
            SELECT arrayJoin([
                (toDateTime('2019-06-04 15:43:01'), 1, 'test3'),
                (toDateTime('2019-06-04 15:43:51'), 1, 'test4'),
                (toDateTime('2019-06-04 15:43:52'), 1, 'test4'),
                (toDateTime('2019-06-04 15:43:53'), 1, 'test4'),
                (toDateTime('2019-06-04 15:35:02'), 1, 'test3'),
                (toDateTime('2019-06-04 15:30:03'), 1, 'test'),
                (toDateTime('2019-06-04 15:31:04'), 1, 'test'),
                (toDateTime('2019-06-04 15:32:05'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:06'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:07'), 1, 'test3'),
                (toDateTime('2019-06-04 15:36:46'), 1, 'test4'),
                (toDateTime('2019-06-04 15:38:07'), 1, 'test')
            ]) AS test_data
        )
    )
    WHERE time_tick >= toDateTime('2019-06-04 00:00:00')
      AND time_tick <= toDateTime('2019-06-05 00:00:00')
    GROUP BY
        time_tick,
        service_name
) AS source_data
    ON stub_data.time_tick = source_data.time_tick
   AND stub_data.service_name = source_data.service_name
ORDER BY
    tick,
    service_name;
/* Result:
┌────────────────tick─┬─service_name─┬─count─┐
│ 2019-06-04 15:30:00 │ test │ 1 │
│ 2019-06-04 15:30:00 │ test3 │ 0 │
│ 2019-06-04 15:30:00 │ test4 │ 0 │
│ 2019-06-04 15:31:00 │ test │ 1 │
│ 2019-06-04 15:31:00 │ test3 │ 0 │
│ 2019-06-04 15:31:00 │ test4 │ 0 │
│ 2019-06-04 15:32:00 │ test │ 0 │
│ 2019-06-04 15:32:00 │ test3 │ 1 │
│ 2019-06-04 15:32:00 │ test4 │ 0 │
│ 2019-06-04 15:33:00 │ test │ 0 │
│ 2019-06-04 15:33:00 │ test3 │ 0 │
│ 2019-06-04 15:33:00 │ test4 │ 0 │
│ 2019-06-04 15:34:00 │ test │ 0 │
│ 2019-06-04 15:34:00 │ test3 │ 0 │
│ 2019-06-04 15:34:00 │ test4 │ 0 │
│ 2019-06-04 15:35:00 │ test │ 0 │
│ 2019-06-04 15:35:00 │ test3 │ 1 │
│ 2019-06-04 15:35:00 │ test4 │ 0 │
│ 2019-06-04 15:36:00 │ test │ 0 │
│ 2019-06-04 15:36:00 │ test3 │ 2 │
│ 2019-06-04 15:36:00 │ test4 │ 1 │
│ 2019-06-04 15:37:00 │ test │ 0 │
│ 2019-06-04 15:37:00 │ test3 │ 0 │
│ 2019-06-04 15:37:00 │ test4 │ 0 │
│ 2019-06-04 15:38:00 │ test │ 1 │
│ 2019-06-04 15:38:00 │ test3 │ 0 │
│ 2019-06-04 15:38:00 │ test4 │ 0 │
│ 2019-06-04 15:39:00 │ test │ 0 │
│ 2019-06-04 15:39:00 │ test3 │ 0 │
│ 2019-06-04 15:39:00 │ test4 │ 0 │
│ 2019-06-04 15:40:00 │ test │ 0 │
│ 2019-06-04 15:40:00 │ test3 │ 0 │
│ 2019-06-04 15:40:00 │ test4 │ 0 │
│ 2019-06-04 15:41:00 │ test │ 0 │
│ 2019-06-04 15:41:00 │ test3 │ 0 │
│ 2019-06-04 15:41:00 │ test4 │ 0 │
│ 2019-06-04 15:42:00 │ test │ 0 │
│ 2019-06-04 15:42:00 │ test3 │ 0 │
│ 2019-06-04 15:42:00 │ test4 │ 0 │
│ 2019-06-04 15:43:00 │ test │ 0 │
│ 2019-06-04 15:43:00 │ test3 │ 1 │
│ 2019-06-04 15:43:00 │ test4 │ 3 │
└─────────────────────┴──────────────┴───────┘
*/
Grafana 其实自带零填充(zero fill)选项。对于 ClickHouse,您可能只需要像下面这样,对每个时间戳上的键/值对元组使用 groupArray。Grafana 通常会拆分返回的 JSON 数据,并把元组中的第一个元素用作序列(series)名称。
-- One row per minute bucket; `series` collects the (service_name, cnt)
-- pairs that fall into that bucket.
SELECT
    t,
    groupArray(tuple(service_name, cnt)) AS series
FROM
(
    -- Event count per service and minute inside the closed window.
    WITH
        toDateTime('2019-06-04 00:00:00') AS window_start,
        toDateTime('2019-06-05 00:00:00') AS window_end
    SELECT
        service_name,
        toStartOfMinute(date_time) AS t,
        count() AS cnt
    FROM SBCommonJournal
    WHERE date_time >= window_start
      AND date_time <= window_end
    GROUP BY service_name, t
)
GROUP BY t
ORDER BY t
如果上述方法不行,可以改用 WITH FILL:
-- One row per minute bucket with its (service_name, cnt) pairs in `series`;
-- WITH FILL STEP 60 adds rows for t values missing from the sorted output.
SELECT
    t,
    groupArray(tuple(service_name, cnt)) AS series
FROM
(
    -- Event count per service and minute inside the closed window.
    WITH
        toDateTime('2019-06-04 00:00:00') AS window_start,
        toDateTime('2019-06-05 00:00:00') AS window_end
    SELECT
        service_name,
        toStartOfMinute(date_time) AS t,
        count() AS cnt
    FROM SBCommonJournal
    WHERE date_time >= window_start
      AND date_time <= window_end
    GROUP BY service_name, t
)
GROUP BY t
ORDER BY t ASC WITH FILL STEP 60
如果这样仍然不行,那么下面的方法应该可以(在 Grafana 中请用 $from 和 $to 替换查询里的时间范围)。
使用一些生成的 service_names 和指标创建一些示例数据:
-- Recreate the sample table so the script can be re-run from scratch.
DROP TABLE IF EXISTS SBCommonJournal;
-- Populate it with 1,000,000 generated rows: a random service name and a
-- random second within the day starting at 2019-06-04 00:00:00.
CREATE TEMPORARY TABLE SBCommonJournal AS
WITH
(
    -- Array of 10 names, each made of 16 characters in the range 'A'..'Z'
    -- (char code 65 + value % 26). The argument passed to rand() is ignored
    -- by the function; presumably it is there so the calls are not merged.
    SELECT arrayMap(x -> arrayStringConcat(arrayMap(i -> char(65 + (rand((i + x) + 1000) % 26)), range(16))), range(10))
) AS service_names
SELECT
    -- rand(1) and rand(2) carry different arguments on purpose: two textually
    -- identical rand() calls are subject to common-subexpression elimination
    -- and would yield the same value per row, tying service_name to date_time.
    service_names[1 + (rand(1) % length(service_names))] AS service_name,
    toDateTime('2019-06-04 00:00:00') + toIntervalSecond(rand(2) % 86400) AS date_time
FROM numbers_mt(1000000)
查询:
-- Zero-filled event count per (service_name, minute): a grid of zero-valued
-- rows is UNION ALL'ed with the real per-minute counts, and the outer
-- sum(cnt) collapses each (service_name, t) pair into a single row.
SELECT
service_name,
t,
sum(cnt) AS cnt
FROM
(
-- Branch 1: zero-valued grid. The two arrayJoin calls expand different
-- arrays (service names and timestamps), giving every combination of both.
SELECT
arrayJoin(groupUniqArray(service_name)) AS service_name,
arrayJoin(
(
-- Scalar subquery returning the array of grid timestamps.
SELECT groupArray(d)
FROM
(
-- Only the two window boundaries are supplied as rows; WITH FILL STEP 60
-- inserts the values in between (a step of 60 on a DateTime column).
SELECT arrayJoin([toDateTime('2019-06-04 00:00:00'), toDateTime('2019-06-05 00:00:00')]) AS d
GROUP BY d
ORDER BY d ASC WITH FILL STEP 60
)
)) AS t,
0 AS cnt
FROM SBCommonJournal
WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
UNION ALL
-- Branch 2: actual number of rows per service and minute bucket.
SELECT
service_name,
toStartOfMinute(date_time) AS t,
count() AS cnt
FROM SBCommonJournal
WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
GROUP BY
service_name,
t
)
-- Merge grid rows and real rows that share the same service and minute.
GROUP BY
service_name,
t
ORDER BY
t ASC,
service_name ASC