在 ClickHouse 中加入具有不同值的时间序列

Question

我遇到了以下无法解决的问题。主要目的是在 Grafana 中显示图表。第一个sql请求给我：

SELECT toStartOfMinute(date_time) as t, COUNT(1) as count, service_name
FROM SB_STAT.SBCommonJournal
WHERE t BETWEEN toDateTime('2019-06-04 00:00:00') AND toDateTime('2019-06-05 00:00:00')
GROUP BY t, service_name

t;count;service_name
2019-06-04 15:43:00;1;test3
2019-06-04 15:35:00;1;test3
2019-06-04 15:12:00;1;test
2019-06-04 14:57:00;1;test
2019-06-04 15:32:00;1;test3
2019-06-04 16:36:00;1;test3
2019-06-04 15:21:00;1;test

第二个：

SELECT arrayJoin(
         arrayMap(
           x -> toStartOfMinute(addMinutes(toDateTime('2019-06-04 00:00:00'), x)), 
           range(toUInt64(dateDiff('minute', toDateTime('2019-06-04 00:00:00'), toDateTime('2019-06-05 00:00:00')) + 1)))) AS t,
       0 AS count;

t;count
2019-06-04 00:00:00;0
2019-06-04 00:01:00;0
2019-06-04 00:02:00;0
2019-06-04 00:03:00;0
2019-06-04 00:04:00;0
2019-06-04 00:05:00;0
2019-06-04 00:06:00;0
2019-06-04 00:07:00;0
2019-06-04 00:08:00;0
2019-06-04 00:09:00;0
2019-06-04 00:10:00;0

etc..

我如何加入这两个请求以每分钟为每个 service_name 设置一个计数器？所以我要有这样的东西

t;count;service_name
2019-06-04 15:12:00;1;test
2019-06-04 15:12:00;0;test3
2019-06-04 15:13:00;0;test
2019-06-04 15:13:00;0;test3
etc...

Answer 1

试试这个查询：

SELECT stub_data.time_tick tick, stub_data.service_name service_name, source_data.count > stub_data.count ? source_data.count : stub_data.count AS count
FROM (
  SELECT toStartOfMinute(date_time) as time_tick, COUNT() as count, service_name
  FROM (
    /* test data */
    SELECT test_data.1 date_time, test_data.3 service_name, test_data.2 count
    FROM (
      SELECT arrayJoin([
        (toDateTime('2019-06-04 15:43:01'), 1, 'test3'),
        (toDateTime('2019-06-04 15:43:51'), 1, 'test4'),
        (toDateTime('2019-06-04 15:43:52'), 1, 'test4'),
        (toDateTime('2019-06-04 15:43:53'), 1, 'test4'),          
        (toDateTime('2019-06-04 15:35:02'), 1, 'test3'),
        (toDateTime('2019-06-04 15:30:03'), 1, 'test'),
        (toDateTime('2019-06-04 15:31:04'), 1, 'test'),
        (toDateTime('2019-06-04 15:32:05'), 1, 'test3'),
        (toDateTime('2019-06-04 15:36:06'), 1, 'test3'),
        (toDateTime('2019-06-04 15:36:07'), 1, 'test3'),        
        (toDateTime('2019-06-04 15:36:46'), 1, 'test4'),        
        (toDateTime('2019-06-04 15:38:07'), 1, 'test')
      ]) test_data)
  )
  WHERE time_tick BETWEEN toDateTime('2019-06-04 00:00:00') AND toDateTime('2019-06-05 00:00:00')
  GROUP BY time_tick, service_name) source_data
RIGHT JOIN (
  /* Cartesian product: [ticks * service_names] */
  SELECT time_tick, service_name, 0 as count
  FROM (
    SELECT arrayJoin(
             arrayMap(
               x -> addMinutes(toDateTime('2019-06-04 15:30:00'), x), 
               range(toUInt64(dateDiff('minute', toDateTime('2019-06-04 15:30:00'), toDateTime('2019-06-04 15:43:00')) + 1)))) AS time_tick)
  CROSS JOIN (         
    SELECT arrayJoin(groupUniqArray(test_data.3)) service_name
    FROM (
      /* test data */
      SELECT arrayJoin([
        (toDateTime('2019-06-04 15:43:01'), 1, 'test3'),
        (toDateTime('2019-06-04 15:43:51'), 1, 'test4'),
        (toDateTime('2019-06-04 15:43:52'), 1, 'test4'),
        (toDateTime('2019-06-04 15:43:53'), 1, 'test4'),          
        (toDateTime('2019-06-04 15:35:02'), 1, 'test3'),
        (toDateTime('2019-06-04 15:30:03'), 1, 'test'),
        (toDateTime('2019-06-04 15:31:04'), 1, 'test'),
        (toDateTime('2019-06-04 15:32:05'), 1, 'test3'),
        (toDateTime('2019-06-04 15:36:06'), 1, 'test3'),
        (toDateTime('2019-06-04 15:36:07'), 1, 'test3'),        
        (toDateTime('2019-06-04 15:36:46'), 1, 'test4'),        
        (toDateTime('2019-06-04 15:38:07'), 1, 'test')
      ]) test_data))) stub_data
ON source_data.time_tick = stub_data.time_tick AND source_data.service_name = stub_data.service_name
ORDER BY tick, service_name;

/* Result:
┌────────────────tick─┬─service_name─┬─count─┐
│ 2019-06-04 15:30:00 │ test         │     1 │
│ 2019-06-04 15:30:00 │ test3        │     0 │
│ 2019-06-04 15:30:00 │ test4        │     0 │
│ 2019-06-04 15:31:00 │ test         │     1 │
│ 2019-06-04 15:31:00 │ test3        │     0 │
│ 2019-06-04 15:31:00 │ test4        │     0 │
│ 2019-06-04 15:32:00 │ test         │     0 │
│ 2019-06-04 15:32:00 │ test3        │     1 │
│ 2019-06-04 15:32:00 │ test4        │     0 │
│ 2019-06-04 15:33:00 │ test         │     0 │
│ 2019-06-04 15:33:00 │ test3        │     0 │
│ 2019-06-04 15:33:00 │ test4        │     0 │
│ 2019-06-04 15:34:00 │ test         │     0 │
│ 2019-06-04 15:34:00 │ test3        │     0 │
│ 2019-06-04 15:34:00 │ test4        │     0 │
│ 2019-06-04 15:35:00 │ test         │     0 │
│ 2019-06-04 15:35:00 │ test3        │     1 │
│ 2019-06-04 15:35:00 │ test4        │     0 │
│ 2019-06-04 15:36:00 │ test         │     0 │
│ 2019-06-04 15:36:00 │ test3        │     2 │
│ 2019-06-04 15:36:00 │ test4        │     1 │
│ 2019-06-04 15:37:00 │ test         │     0 │
│ 2019-06-04 15:37:00 │ test3        │     0 │
│ 2019-06-04 15:37:00 │ test4        │     0 │
│ 2019-06-04 15:38:00 │ test         │     1 │
│ 2019-06-04 15:38:00 │ test3        │     0 │
│ 2019-06-04 15:38:00 │ test4        │     0 │
│ 2019-06-04 15:39:00 │ test         │     0 │
│ 2019-06-04 15:39:00 │ test3        │     0 │
│ 2019-06-04 15:39:00 │ test4        │     0 │
│ 2019-06-04 15:40:00 │ test         │     0 │
│ 2019-06-04 15:40:00 │ test3        │     0 │
│ 2019-06-04 15:40:00 │ test4        │     0 │
│ 2019-06-04 15:41:00 │ test         │     0 │
│ 2019-06-04 15:41:00 │ test3        │     0 │
│ 2019-06-04 15:41:00 │ test4        │     0 │
│ 2019-06-04 15:42:00 │ test         │     0 │
│ 2019-06-04 15:42:00 │ test3        │     0 │
│ 2019-06-04 15:42:00 │ test4        │     0 │
│ 2019-06-04 15:43:00 │ test         │     0 │
│ 2019-06-04 15:43:00 │ test3        │     1 │
│ 2019-06-04 15:43:00 │ test4        │     3 │
└─────────────────────┴──────────────┴───────┘
*/

Answer 2

Grafana 实际上有一个零填充选项。对于 ClickHouse，您应该做的唯一一件事可能是在每个时间戳 key/value 对的元组上使用 groupArray。 Grafana 通常会将返回的 JSON 数据分开，并将元组中的第一个元素用作系列名称。

SELECT
    t,
    groupArray((service_name, cnt)) AS series
FROM (
    SELECT 
        service_name, 
        toStartOfMinute(date_time) AS t, 
        count() AS cnt
    FROM SBCommonJournal
    WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
    GROUP BY 
        service_name, 
        t
)
GROUP BY t
ORDER BY t

使用 WITH FILL 失败

SELECT
    t,
    groupArray((service_name, cnt)) AS series
FROM (
    SELECT 
        service_name, 
        toStartOfMinute(date_time) AS t, 
        count() AS cnt
    FROM SBCommonJournal
    WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
    GROUP BY 
        service_name, 
        t
)
GROUP BY t
ORDER BY t WITH FILL STEP 60

如果这对您仍然不起作用，则以下方法应该有效（使用 Grafana $to 和 $from）。

使用一些生成的 service_names 和指标创建一些示例数据：

DROP TABLE IF EXISTS SBCommonJournal;
CREATE TEMPORARY TABLE SBCommonJournal AS
WITH 
    (
        SELECT arrayMap(x -> arrayStringConcat(arrayMap(i -> char(65 + (rand((i + x) + 1000) % 26)), range(16))), range(10))
    ) AS service_names
SELECT 
    service_names[1 + (rand() % length(service_names))] AS service_name, 
    toDateTime('2019-06-04 00:00:00') + toIntervalSecond(rand() % 86400) AS date_time
FROM numbers_mt(1000000)

查询：

SELECT 
    service_name, 
    t, 
    sum(cnt) AS cnt
FROM 
(
    SELECT 
        arrayJoin(groupUniqArray(service_name)) AS service_name, 
        arrayJoin(
        (
            SELECT groupArray(d)
            FROM 
            (
                SELECT arrayJoin([toDateTime('2019-06-04 00:00:00'), toDateTime('2019-06-05 00:00:00')]) AS d
                GROUP BY d
                ORDER BY d ASC WITH FILL STEP 60
            )
        )) AS t, 
        0 AS cnt
    FROM SBCommonJournal
    WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
    UNION ALL
    SELECT 
        service_name, 
        toStartOfMinute(date_time) AS t, 
        count() AS cnt
    FROM SBCommonJournal
    WHERE (date_time >= toDateTime('2019-06-04 00:00:00')) AND (date_time <= toDateTime('2019-06-05 00:00:00'))
    GROUP BY 
        service_name, 
        t
)
GROUP BY 
    service_name, 
    t
ORDER BY 
    t ASC, 
    service_name ASC

在 ClickHouse 中加入具有不同值的时间序列

Join timeseries with distinct values in ClickHouse

sql

clickhouse