按主键排序并带 LIMIT 查询去重后的主键时,不要对整列进行聚合
Don't aggregate the entire column when selecting distinct PKs with ORDER BY pk and LIMIT
我有一个 ClickHouse 表 events,其中一年的数据约有 5000 万行(可能存在重复行):
-- Event log table. Rows sharing the same sorting key (event, event_time, uid)
-- are duplicates that ReplacingMergeTree collapses during background merges.
create table events (
    event LowCardinality(String),
    event_time DateTime64(3),  -- sub-second precision: 3 decimal digits
    uid String
) engine = ReplacingMergeTree()
-- The PARTITION BY keyword was missing before the expression, which made
-- the statement unparsable.
partition by toYYYYMM(event_time)
primary key (event, event_time)
order by (event, event_time, uid)
当我尝试查询前 500 个不重复的行时,ClickHouse 会处理全部 5000 万行:
-- Top 500 distinct (event, event_time) pairs for one event type inside a
-- one-year window (both bounds inclusive), sorted descending.
select distinct
    event,
    event_time
from events
where event = 'some_event'
    and event_time >= '2020-02-24 00:00:00.000'
    and event_time <= '2021-02-24 00:00:00.000'
order by (event, event_time) desc
limit 500
也就是说,它实际上先对 5000 万行按 (event, event_time) 进行分组,然后才应用 LIMIT。能否让 ClickHouse 不必读完全部数据?
-- Test table sorted by (event, event_time, uid) and partitioned by the
-- calendar month of event_time.
CREATE TABLE events
(
    event LowCardinality(String),
    event_time DateTime,
    uid String
)
ENGINE = ReplacingMergeTree()
PARTITION BY toYYYYMM(event_time)
ORDER BY (event, event_time, uid);
-- Fill the table with 30 million synthetic rows: a single event type, one row
-- per second starting at 2020-01-01 00:00:00, and a random 5-character uid.
-- The explicit column list keeps the insert correct if the table's column
-- order ever changes.
INSERT INTO events (event, event_time, uid)
SELECT
    'ev' AS event,
    toDateTime('2020-01-01 00:00:00') + toIntervalSecond(number) AS event_time,
    randomPrintableASCII(5) AS uid
FROM numbers(30000000);
-- Up to 500 distinct event_time values for event 'ev'.
-- The inner query sorts by (event, event_time) descending and keeps one row
-- per event_time via LIMIT 1 BY; the outer LIMIT caps the result at 500 rows.
SELECT event_time
FROM
(
    SELECT event_time
    FROM events
    WHERE event = 'ev'
        AND event_time >= '2020-01-01 00:00:00'
        AND event_time <= '2021-01-01 00:00:00'
    ORDER BY
        event DESC,
        event_time DESC
    LIMIT 1 BY event_time
)
LIMIT 500
.....
Elapsed: 0.008 sec. Processed 1.07 million rows
我有一个 ClickHouse 表 events,其中一年的数据约有 5000 万行(可能存在重复行):
-- Event log table. Rows sharing the same sorting key (event, event_time, uid)
-- are duplicates that ReplacingMergeTree collapses during background merges.
create table events (
    event LowCardinality(String),
    event_time DateTime64(3),  -- sub-second precision: 3 decimal digits
    uid String
) engine = ReplacingMergeTree()
-- The PARTITION BY keyword was missing before the expression, which made
-- the statement unparsable.
partition by toYYYYMM(event_time)
primary key (event, event_time)
order by (event, event_time, uid)
当我尝试查询前 500 个不重复的行时,ClickHouse 会处理全部 5000 万行:
-- Top 500 distinct (event, event_time) pairs for one event type inside a
-- one-year window (both bounds inclusive), sorted descending.
select distinct
    event,
    event_time
from events
where event = 'some_event'
    and event_time >= '2020-02-24 00:00:00.000'
    and event_time <= '2021-02-24 00:00:00.000'
order by (event, event_time) desc
limit 500
也就是说,它实际上先对 5000 万行按 (event, event_time) 进行分组,然后才应用 LIMIT。能否让 ClickHouse 不必读完全部数据?
-- Test table sorted by (event, event_time, uid) and partitioned by the
-- calendar month of event_time.
CREATE TABLE events
(
    event LowCardinality(String),
    event_time DateTime,
    uid String
)
ENGINE = ReplacingMergeTree()
PARTITION BY toYYYYMM(event_time)
ORDER BY (event, event_time, uid);
-- Fill the table with 30 million synthetic rows: a single event type, one row
-- per second starting at 2020-01-01 00:00:00, and a random 5-character uid.
-- The explicit column list keeps the insert correct if the table's column
-- order ever changes.
INSERT INTO events (event, event_time, uid)
SELECT
    'ev' AS event,
    toDateTime('2020-01-01 00:00:00') + toIntervalSecond(number) AS event_time,
    randomPrintableASCII(5) AS uid
FROM numbers(30000000);
-- Up to 500 distinct event_time values for event 'ev'.
-- The inner query sorts by (event, event_time) descending and keeps one row
-- per event_time via LIMIT 1 BY; the outer LIMIT caps the result at 500 rows.
SELECT event_time
FROM
(
    SELECT event_time
    FROM events
    WHERE event = 'ev'
        AND event_time >= '2020-01-01 00:00:00'
        AND event_time <= '2021-01-01 00:00:00'
    ORDER BY
        event DESC,
        event_time DESC
    LIMIT 1 BY event_time
)
LIMIT 500
.....
Elapsed: 0.008 sec. Processed 1.07 million rows