选择前 N 行,使 TEXT 字段的长度总和不超过某个限制
Selecting first N rows, where sum of lengths of TEXT field is up to some limit
我有这样一张表:
-- Cache of phrases to translate: one row per phrase q, keyed by a generated id.
CREATE TABLE cache (
    id         bigserial PRIMARY KEY,           -- surrogate key, generated from a sequence
    source     char(2)   NOT NULL,              -- source language code, e.g. 'ru'
    target     char(2)   NOT NULL,              -- target language code, e.g. 'en'
    q          text      NOT NULL,              -- original phrase
    result     text,                            -- nullable; presumably NULL until a result is stored
    profile    text      NOT NULL DEFAULT '',   -- empty string when no profile is given
    created    timestamp NOT NULL DEFAULT now(),
    api_engine text      NOT NULL,              -- e.g. 'google'
    encoded    text      NOT NULL               -- percent-encoded form of q, ready for a URL
);
我想传递 encoded 字段的列表(也许 OVER ... WINDOW ?)
像这样的东西:
-- Pseudo-code from the question: shows the desired shape (row ids plus the
-- encoded values joined with '&q='). Not runnable as written, because id is
-- neither aggregated nor listed in a GROUP BY.
SELECT id, string_agg(encoded, '&q=') FROM cache
所以我会得到相应 ID 的列表,以及一串串接的字段 encoded: '&q=encoded1&q=encoded2&q=encoded3'
... 总长度不超过某个限制(比如不超过 2000 个字符)。
第二个条件:当 source、target 或 profile 中任何一个字段的值发生变化时,我想切换到下一个窗口。
是否可以在 FOR 循环中用一条 SQL SELECT 来实现这一点?
我知道如何用 plpgsql/plpython/plperl 做到这一点,但我想优化这个请求。
-- PL/pgSQL sketch from the question (fragment: the closing END LOOP is not shown).
-- Intent: each iteration receives one batch of ids and the matching joined url.
-- NOTE(review): treat the query as pseudo-code; it combines aggregates with
-- ORDER BY source, target and has no GROUP BY, so PostgreSQL rejects it as written.
FOR rec IN
SELECT array_agg(id) AS ids, string_agg(encoded, '&q=') AS url FROM cache
WHERE result IS NULL
ORDER BY source, target
LOOP
-- here I call curl with that *url*
示例数据:
-- Two sample rows for cache (statement left open: more rows follow in the real data).
INSERT INTO cache
    (id, source, target, q, result, profile, api_engine, encoded)
VALUES
    (
        1, 'ru', 'en',
        'Длинная фраза по-русски',
        NULL, '', 'google',
        '%D0%94%D0%BB%D0%B8%D0%BD%D0%BD%D0%B0%D1%8F+%D1%84%D1%80%D0%B0%D0%B7%D0%B0+%D0%BF%D0%BE-%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8'
    ),
    (
        2, 'ru', 'es',
        'Ещё одна непонятная фраза по-русски',
        NULL, '', 'google',
        '%D0%95%D1%89%D1%91+%D0%BE%D0%B4%D0%BD%D0%B0+%D0%BD%D0%B5%D0%BF%D0%BE%D0%BD%D1%8F%D1%82%D0%BD%D0%B0%D1%8F+%D1%84%D1%80%D0%B0%D0%B7%D0%B0+%D0%BF%D0%BE-%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8'
    )
-- etc...
等等,共有 100500 行这样的数据。字段 source 和 target 可以是不同的语言代码,并且会重复出现,所以我可能需要做 GROUP BY source, target, profile。
我想要 SELECT 前 N 行,把字段 encoded 用某个分隔符(例如 &q=)连接起来,如下所示:
&q=%D0%94%D0%BB%D0%B8%D0%BD%D0%BD%D0%B0%D1%8F+%D1%84%D1%80%D0%B0%D0%B7%D0%B0+%D0%BF%D0%BE-%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8&q=%D0%95%D1%89%D1%91+%D0%BE%D0%B4%D0%BD%D0%B0+%D0%BD%D0%B5%D0%BF%D0%BE%D0%BD%D1%8F%D1%82%D0%BD%D0%B0%D1%8F+%D1%84%D1%80%D0%B0%D0%B7%D0%B0+%D0%BF%D0%BE-%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8
所以这个连接字符串的长度不超过 (2000) 个字符。所以我将把那个字符串,以及那些行的所有 id,包含在 url 中(以相同的顺序,当然)。
然后我想 select 具有相同条件的下 N 行,依此类推。
您可以使用智能递归 CTE 来完成:
-- Pack the rows of cache into batches whose encoded values, joined with '&q=',
-- stay within 2000 characters. A batch never spans two (source, target, profile)
-- groups, and rows are taken in id order inside each group.
-- Returns one row per batch: ids (row ids in order) and url (the joined string,
-- without a leading '&q=').
-- A single encoded value longer than the limit still forms a batch of its own.
-- NOTE(review): every row of cache is considered; to process only pending rows,
-- add a filter such as WHERE result IS NULL to the first CTE.
WITH RECURSIVE c AS (  -- 1st CTE is not recursive: number the groups and the rows in each group
   SELECT dense_rank() OVER (ORDER BY source, target, profile) AS rnk
        , row_number() OVER w AS rn
        , lead(encoded) OVER w AS next_enc  -- NULL on the last row of a group
        , id, encoded
   FROM   cache
   WINDOW w AS (PARTITION BY source, target, profile ORDER BY id)
   )
 , rcte AS (  -- "recursion" starts here: walk each group one row at a time
   SELECT rnk, rn, ARRAY[id] AS ids, encoded AS url
          -- print marks the row that closes a batch. encoded and next_enc are passed
          -- as separate arguments so the separator is counted in the measured length
          -- (encoded || next_enc would hide its 4 characters and allow an overlong url).
        , CASE WHEN length(concat_ws('&q=', encoded, next_enc)) > 2000  -- max len
                 OR next_enc IS NULL                                   -- last in partition
               THEN TRUE END AS print
   FROM   c
   WHERE  rn = 1

   UNION ALL
   SELECT c.rnk, c.rn
          -- start a fresh batch after a closed one, otherwise extend the open batch
        , CASE WHEN r.print THEN ARRAY[c.id] ELSE r.ids || c.id END AS ids
        , CASE WHEN r.print THEN c.encoded ELSE concat_ws('&q=', r.url, c.encoded) END AS url
          -- look ahead: would appending the next row exceed the limit?
        , CASE WHEN length(
                  CASE WHEN r.print THEN concat_ws('&q=', c.encoded, c.next_enc)
                       ELSE concat_ws('&q=', r.url, c.encoded, c.next_enc) END) > 2000  -- max len
                 OR c.next_enc IS NULL                                 -- last in partition
               THEN TRUE END AS print
   FROM   rcte r
   INNER  JOIN c USING (rnk)
   WHERE  c.rn = r.rn + 1
   )
SELECT ids, url
FROM   rcte
WHERE  print  -- keep only the rows that close a batch
ORDER  BY rnk, rn;
关于包含非递归 CTE 的 rCTE:
- Multiple CTE in single query
但这可能是在 plpgsql 函数中循环实际上更快的罕见情况之一。
请参阅此相关答案以获取更多解释:
我有这样一张表:
-- Cache of phrases to translate: one row per phrase q, keyed by a generated id.
CREATE TABLE cache (
    id         bigserial PRIMARY KEY,           -- surrogate key, generated from a sequence
    source     char(2)   NOT NULL,              -- source language code, e.g. 'ru'
    target     char(2)   NOT NULL,              -- target language code, e.g. 'en'
    q          text      NOT NULL,              -- original phrase
    result     text,                            -- nullable; presumably NULL until a result is stored
    profile    text      NOT NULL DEFAULT '',   -- empty string when no profile is given
    created    timestamp NOT NULL DEFAULT now(),
    api_engine text      NOT NULL,              -- e.g. 'google'
    encoded    text      NOT NULL               -- percent-encoded form of q, ready for a URL
);
我想传递 encoded 字段的列表(也许 OVER ... WINDOW ?) 像这样的东西:
-- Pseudo-code from the question: shows the desired shape (row ids plus the
-- encoded values joined with '&q='). Not runnable as written, because id is
-- neither aggregated nor listed in a GROUP BY.
SELECT id, string_agg(encoded, '&q=') FROM cache
所以我会得到相应 ID 的列表,以及一串串接的字段 encoded: '&q=encoded1&q=encoded2&q=encoded3'
... 总长度不超过某个限制(比如不超过 2000 个字符)。
第二个条件:当 source、target 或 profile 中任何一个字段的值发生变化时,我想切换到下一个窗口。
是否可以在 FOR 循环中用一条 SQL SELECT 来实现这一点?
我知道如何用 plpgsql/plpython/plperl 做到这一点,但我想优化这个请求。
-- PL/pgSQL sketch from the question (fragment: the closing END LOOP is not shown).
-- Intent: each iteration receives one batch of ids and the matching joined url.
-- NOTE(review): treat the query as pseudo-code; it combines aggregates with
-- ORDER BY source, target and has no GROUP BY, so PostgreSQL rejects it as written.
FOR rec IN
SELECT array_agg(id) AS ids, string_agg(encoded, '&q=') AS url FROM cache
WHERE result IS NULL
ORDER BY source, target
LOOP
-- here I call curl with that *url*
示例数据:
-- Two sample rows for cache (statement left open: more rows follow in the real data).
INSERT INTO cache
    (id, source, target, q, result, profile, api_engine, encoded)
VALUES
    (
        1, 'ru', 'en',
        'Длинная фраза по-русски',
        NULL, '', 'google',
        '%D0%94%D0%BB%D0%B8%D0%BD%D0%BD%D0%B0%D1%8F+%D1%84%D1%80%D0%B0%D0%B7%D0%B0+%D0%BF%D0%BE-%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8'
    ),
    (
        2, 'ru', 'es',
        'Ещё одна непонятная фраза по-русски',
        NULL, '', 'google',
        '%D0%95%D1%89%D1%91+%D0%BE%D0%B4%D0%BD%D0%B0+%D0%BD%D0%B5%D0%BF%D0%BE%D0%BD%D1%8F%D1%82%D0%BD%D0%B0%D1%8F+%D1%84%D1%80%D0%B0%D0%B7%D0%B0+%D0%BF%D0%BE-%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8'
    )
-- etc...
等等,共有 100500 行这样的数据。字段 source 和 target 可以是不同的语言代码,并且会重复出现,所以我可能需要做 GROUP BY source, target, profile。
我想要 SELECT 前 N 行,把字段 encoded 用某个分隔符(例如 &q=)连接起来,如下所示:
&q=%D0%94%D0%BB%D0%B8%D0%BD%D0%BD%D0%B0%D1%8F+%D1%84%D1%80%D0%B0%D0%B7%D0%B0+%D0%BF%D0%BE-%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8&q=%D0%95%D1%89%D1%91+%D0%BE%D0%B4%D0%BD%D0%B0+%D0%BD%D0%B5%D0%BF%D0%BE%D0%BD%D1%8F%D1%82%D0%BD%D0%B0%D1%8F+%D1%84%D1%80%D0%B0%D0%B7%D0%B0+%D0%BF%D0%BE-%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%B8
所以这个连接字符串的长度不超过 (2000) 个字符。所以我将把那个字符串,以及那些行的所有 id,包含在 url 中(以相同的顺序,当然)。
然后我想 select 具有相同条件的下 N 行,依此类推。
您可以使用智能递归 CTE 来完成:
-- Pack the rows of cache into batches whose encoded values, joined with '&q=',
-- stay within 2000 characters. A batch never spans two (source, target, profile)
-- groups, and rows are taken in id order inside each group.
-- Returns one row per batch: ids (row ids in order) and url (the joined string,
-- without a leading '&q=').
-- A single encoded value longer than the limit still forms a batch of its own.
-- NOTE(review): every row of cache is considered; to process only pending rows,
-- add a filter such as WHERE result IS NULL to the first CTE.
WITH RECURSIVE c AS (  -- 1st CTE is not recursive: number the groups and the rows in each group
   SELECT dense_rank() OVER (ORDER BY source, target, profile) AS rnk
        , row_number() OVER w AS rn
        , lead(encoded) OVER w AS next_enc  -- NULL on the last row of a group
        , id, encoded
   FROM   cache
   WINDOW w AS (PARTITION BY source, target, profile ORDER BY id)
   )
 , rcte AS (  -- "recursion" starts here: walk each group one row at a time
   SELECT rnk, rn, ARRAY[id] AS ids, encoded AS url
          -- print marks the row that closes a batch. encoded and next_enc are passed
          -- as separate arguments so the separator is counted in the measured length
          -- (encoded || next_enc would hide its 4 characters and allow an overlong url).
        , CASE WHEN length(concat_ws('&q=', encoded, next_enc)) > 2000  -- max len
                 OR next_enc IS NULL                                   -- last in partition
               THEN TRUE END AS print
   FROM   c
   WHERE  rn = 1

   UNION ALL
   SELECT c.rnk, c.rn
          -- start a fresh batch after a closed one, otherwise extend the open batch
        , CASE WHEN r.print THEN ARRAY[c.id] ELSE r.ids || c.id END AS ids
        , CASE WHEN r.print THEN c.encoded ELSE concat_ws('&q=', r.url, c.encoded) END AS url
          -- look ahead: would appending the next row exceed the limit?
        , CASE WHEN length(
                  CASE WHEN r.print THEN concat_ws('&q=', c.encoded, c.next_enc)
                       ELSE concat_ws('&q=', r.url, c.encoded, c.next_enc) END) > 2000  -- max len
                 OR c.next_enc IS NULL                                 -- last in partition
               THEN TRUE END AS print
   FROM   rcte r
   INNER  JOIN c USING (rnk)
   WHERE  c.rn = r.rn + 1
   )
SELECT ids, url
FROM   rcte
WHERE  print  -- keep only the rows that close a batch
ORDER  BY rnk, rn;
关于包含非递归 CTE 的 rCTE:
- Multiple CTE in single query
但这可能是在 plpgsql 函数中循环实际上更快的罕见情况之一。
请参阅此相关答案以获取更多解释: