SQL/PostgreSQL 中带加权过滤器的随机行选择
Random row selection with weighted filters in SQL/PostgreSQL
我有一张题目表(questions table),需要从中抽取 X 道题来组成一份试卷。题目需要根据多个标准(科目、机构、地区等)进行筛选,每个标准具有不同的权重。
过滤器权重是在查询之外动态设置和规范化的。例如:
- 科目 1 — 0.4
- 科目 2 — 0.1
- 科目 3 — 0.5
- 机构 1 — 0.2
- 机构 2 — 0.04
- 机构 3 — 0.76
- 区域 1 — 1
其他几点:
- 目前我有 10 种不同的过滤器(科目、机构、区域等),用户可以按多种方式组合选择(例如:10 个科目、5 个机构、30 个区域等),如上例所示。
- 问题 table 有 ~500k 行;
- 过滤器与问题之间是多对多(N — N)关系;
- 过滤后,我想限制返回的行数;
- 如果某个过滤器不能提供更多的问题,则必须考虑其他的(记住:我想准备一个测试——如果我还有问题,必须使用它们)
- 我非常关心这个查询的性能。
为了说明,如果我不想对过滤器进行加权,我会这样做:
-- Unweighted baseline: every question/subject/institution/area combination
-- that matches the requested filters, shuffled and capped at 200 rows.
-- :subjects, :areas and :institutions are placeholders filled in by the caller.
SELECT *
FROM public.questions AS q
    INNER JOIN public.subjects_questions AS sq
        ON sq.question_id = q.id
    INNER JOIN public.subjects AS s
        ON s.id = sq.subject_id
    INNER JOIN public.institutions_questions AS iq
        ON iq.question_id = q.id
    INNER JOIN public.institutions AS i
        ON i.id = iq.institution_id
    INNER JOIN public.areas_questions AS aq
        ON aq.question_id = q.id
    INNER JOIN public.areas AS a
        ON a.id = aq.area_id
WHERE s.id IN :subjects
    AND a.id IN :areas
    AND i.id IN :institutions
ORDER BY random()
LIMIT 200
期望的输出:
Question — Subject — Institution — Area
我的想法是这样的:
- 用过滤器返回的问题创建一个 CTE;必须考虑到同一个问题可能被多个过滤器返回——我是否需要分别评估每个过滤器,然后用 UNION ALL 合并来解决这个问题?还必须标记每个问题来自哪个过滤器;
- 创建另一个具有权重和关联的相应过滤器的 CTE;
- 加入 CTE,但此时必须对问题进行分组并对权重求和;
- 应用窗口函数并返回结果,限制为 X 行(LIMIT X)。
你会如何编写这样的查询/解决这个问题?
这样的做法如何?这只是为了演示思路,细节留给你来完成。如果您不熟悉这种随机选择方法:随机生成一个介于 0 和 1 之间的数,它有 40% 的概率小于 0.4,因此 rand() <= .4 有 40% 的概率为真。
假设您已经或可以创建一个看起来有点像这样的 "Filters" 实体
-- One row per (field name, field value) pair that rows can be filtered on.
CREATE TABLE Filters (
    FieldName  VARCHAR(100),  -- attribute the filter applies to, e.g. 'Subject'
    FieldValue VARCHAR(100),  -- attribute value the filter matches
    Prob       FLOAT          -- probability of selection based on Name and Value
);
-- Keep a candidate row when its random draw falls at or below the probability
-- of at least one filter it matches. Each row gets a single draw (rnd), which
-- is compared against every matching filter.
SELECT DISTINCT TMP.*   -- The fields you want. DISTINCT removes the duplicates
                        -- produced when a row passes several conditions.
FROM (
    SELECT
        YRSWF.*,
        random() AS rnd  -- PostgreSQL's random number function; RAND() is not available there
    FROM YourResultSetWithoutFilters AS YRSWF  -- placeholder: you can code the details
) AS TMP
INNER JOIN Filters AS F
    -- The probability test is common to all filter kinds, so it is stated once.
    ON TMP.rnd <= F.Prob
    AND (
        (F.FieldName = 'Subject' AND TMP.Subject = F.FieldValue)
        OR (F.FieldName = 'Institution' AND TMP.Institution = F.FieldValue)
        OR (F.FieldName = 'Area' AND TMP.Area = F.FieldValue)
    );
好的,我设法解决了这个问题。基本上,我采用了问题中已经概述的策略,并借助了这篇帖子(here)的一些帮助——我之前就看过那篇帖子,但我当时(现在仍然)想用更优雅的方式来解决,类似这个(something like this),但适用于多行,而且不需要手动创建“边界”(bounds)。
让我们逐步尝试:
由于具有权重的过滤器来自架构外部,让我们创建一个 CTE:
-- The weighted filters come from outside the schema, so they are inlined as a
-- constant row set with columns (type, id, weight).
WITH filters (type, id, weight) AS (
    VALUES
        ('subject',     '148232e0-dece-40d9-81e0-0fa675f040e5'::uuid, 0.5),
        ('subject',     '854431bb-18ee-4efb-803f-185757d25235'::uuid, 0.4),
        ('area',        'e12863fb-afb7-45cf-9198-f9f58ebc80cf'::uuid, 1),
        ('institution', '7f56c89f-705e-45c7-98fb-fee470550edf'::uuid, 0.5),
        ('institution', '0066257b-b2e3-4ee8-8075-517a2aa1379e'::uuid, 0.5)
)
现在,让我们过滤行,忽略权重(暂时),这样以后我们就不需要处理整个 table:
-- Narrow the question set to rows matching the selected filters (weights are
-- ignored at this stage) so later steps do not have to process the whole table.
-- One output row per question/subject/institution/area combination.
WITH filtered_questions AS (
    SELECT
        q.id,
        s.id AS subject_id,
        a.id AS area_id,
        i.id AS institution_id
    FROM public.questions AS q
        INNER JOIN public.subjects_questions AS sq
            ON sq.question_id = q.id
        INNER JOIN public.subjects AS s
            ON s.id = sq.subject_id
        INNER JOIN public.institutions_questions AS iq
            ON iq.question_id = q.id
        INNER JOIN public.institutions AS i
            ON i.id = iq.institution_id
        INNER JOIN public.areas_questions AS aq
            ON aq.question_id = q.id
        INNER JOIN public.areas AS a
            ON a.id = aq.area_id
    -- Output aliases are not visible in WHERE; these are the junction-table columns.
    WHERE sq.subject_id IN (SELECT f.id FROM filters AS f WHERE f.type = 'subject')
        AND iq.institution_id IN (SELECT f.id FROM filters AS f WHERE f.type = 'institution')
        AND aq.area_id IN (SELECT f.id FROM filters AS f WHERE f.type = 'area')
)
同一个问题可以被多个过滤器选中,增加被选中的几率。我们必须更新权重来解决这个问题。
-- Total weight per question: add up the weight of every filter that matches
-- one of the question's rows in filtered_questions.
-- NOTE(review): the SUM runs over every joined row, so a question that appears
-- in several filtered_questions rows accumulates matching weights once per
-- row — confirm this is the intended weighting.
WITH filtered_questions_weights_sum AS (
    SELECT
        fq.id,
        SUM(f.weight) AS weight_sum
    FROM filtered_questions AS fq
        INNER JOIN filters AS f
            ON (f.type = 'subject' AND fq.subject_id = f.id)
            OR (f.type = 'area' AND fq.area_id = f.id)
            OR (f.type = 'institution' AND fq.institution_id = f.id)
    GROUP BY fq.id
)
生成边界,做法如 here 所述。
-- Running total of the per-question weights, taken in id order.
WITH cumulative_prob AS (
    SELECT
        id,
        SUM(weight_sum) OVER by_id AS cum_prob
    FROM filtered_questions_weights_sum
    WINDOW by_id AS (ORDER BY id)
),
-- Turn the running totals into intervals: each question spans from the
-- previous running total (0 for the first row) up to its own.
cumulative_bounds AS (
    SELECT
        id,
        COALESCE(LAG(cum_prob) OVER by_running_total, 0) AS lower_cum_bound,
        cum_prob AS upper_cum_bound
    FROM cumulative_prob
    WINDOW by_running_total AS (ORDER BY cum_prob, id)
)
生成随机序列。由于权重已在上一步中更新,必须重新归一化(random() * (SELECT SUM(weight_sum) ...))。10 是我们想要返回的行数。
-- One random draw per requested row (10 here), scaled to the total weight so
-- that every draw lands somewhere on the cumulative weight axis.
WITH random_series AS (
    SELECT
        draw.generate_series,
        random() * (SELECT SUM(weight_sum) FROM filtered_questions_weights_sum) AS R
    FROM generate_series(1, 10) AS draw (generate_series)
)
最后:
-- Map each random draw onto the question whose (lower, upper] interval
-- contains it. Draws are independent, so one question can match several draws.
SELECT
    cb.id,
    cb.lower_cum_bound,
    cb.upper_cum_bound,
    rs.R
FROM random_series AS rs
    INNER JOIN cumulative_bounds AS cb
        ON numrange(cb.lower_cum_bound::NUMERIC, cb.upper_cum_bound::NUMERIC, '(]') @> rs.R::NUMERIC
我们得到以下分布:
id lower_cum_bound upper_cum_bound r
------------------------------------ --------------- --------------- -------------------
380f46e9-f373-4b89-a863-05f484e6b3b6 0 2.0 0.41090718149207534
42bcb088-fc19-4272-8c49-e77999edd01c 2.0 3.9 3.4483200465794654
46a97f1d-789f-46e7-9d3b-bd881a22a32e 3.9 5.9 5.159445870062337
46a97f1d-789f-46e7-9d3b-bd881a22a32e 3.9 5.9 5.524481557868421
972d0296-acc3-4b44-b67d-928049d5e9c2 5.9 7.8 6.842470594821498
bdcc26f7-ccaf-4f8f-9e0b-81b9a6d29cdb 11.6 13.5 12.207371663767844
bdcc26f7-ccaf-4f8f-9e0b-81b9a6d29cdb 11.6 13.5 12.674184153741226
c935e3de-f1b6-4399-b5eb-ed3a9194eb7b 15.5 17.5 17.16804686235264
e5061aeb-53b7-4247-8404-87508c5ac723 21.4 23.4 22.622627633158118
f8c37700-0c3a-457e-8882-7c65269482ea 25.4 27.3 26.841821723571048
综合起来:
-- End-to-end weighted random pick of questions.
-- Filters and weights are supplied from outside the schema: (type, id, weight).
WITH filters (type, id, weight) AS (
    VALUES
        ('subject',     '148232e0-dece-40d9-81e0-0fa675f040e5'::uuid, 0.5),
        ('subject',     '854431bb-18ee-4efb-803f-185757d25235'::uuid, 0.4),
        ('area',        'e12863fb-afb7-45cf-9198-f9f58ebc80cf'::uuid, 1),
        ('institution', '7f56c89f-705e-45c7-98fb-fee470550edf'::uuid, 0.5),
        ('institution', '0066257b-b2e3-4ee8-8075-517a2aa1379e'::uuid, 0.5)
),
-- Questions matching the filters, with the summed weight of every filter row
-- they join to.
-- NOTE(review): the SUM runs over every joined row, so a question reached
-- through several subject/institution/area combinations accumulates matching
-- weights once per combination — confirm this is the intended weighting.
filtered_questions AS (
    SELECT
        q.id,
        SUM(f.weight) AS weight_sum
    FROM public.questions AS q
        INNER JOIN public.subjects_questions AS sq
            ON sq.question_id = q.id
        INNER JOIN public.subjects AS s
            ON s.id = sq.subject_id
        INNER JOIN public.institutions_questions AS iq
            ON iq.question_id = q.id
        INNER JOIN public.institutions AS i
            ON i.id = iq.institution_id
        INNER JOIN public.activity_areas_questions AS aq
            ON aq.question_id = q.id
        INNER JOIN public.activity_areas AS a
            ON a.id = aq.activity_area_id
        INNER JOIN filters AS f
            ON (f.type = 'subject' AND s.id = f.id)
            OR (f.type = 'area' AND a.id = f.id)
            OR (f.type = 'institution' AND i.id = f.id)
    WHERE s.id IN (SELECT sf.id FROM filters AS sf WHERE sf.type = 'subject')
        AND i.id IN (SELECT inf.id FROM filters AS inf WHERE inf.type = 'institution')
        AND a.id IN (SELECT af.id FROM filters AS af WHERE af.type = 'area')
    GROUP BY q.id
),
-- Running total of the per-question weights, taken in id order.
cumulative_prob AS (
    SELECT
        id,
        SUM(weight_sum) OVER by_id AS cum_prob
    FROM filtered_questions
    WINDOW by_id AS (ORDER BY id)
),
-- Each question spans from the previous running total (0 for the first row)
-- up to its own running total.
cumulative_bounds AS (
    SELECT
        id,
        COALESCE(LAG(cum_prob) OVER by_running_total, 0) AS lower_cum_bound,
        cum_prob AS upper_cum_bound
    FROM cumulative_prob
    WINDOW by_running_total AS (ORDER BY cum_prob, id)
),
-- 14 random draws, each scaled to the total weight.
random_series AS (
    SELECT
        draw.generate_series,
        random() * (SELECT SUM(weight_sum) FROM filtered_questions) AS R
    FROM generate_series(1, 14) AS draw (generate_series)
)
-- Map each draw onto the question whose (lower, upper] interval contains it.
-- Draws are independent, so one question can match several draws.
SELECT
    cb.id,
    cb.lower_cum_bound,
    cb.upper_cum_bound,
    rs.R
FROM random_series AS rs
    INNER JOIN cumulative_bounds AS cb
        ON numrange(cb.lower_cum_bound::NUMERIC, cb.upper_cum_bound::NUMERIC, '(]') @> rs.R::NUMERIC
我有一张题目表(questions table),需要从中抽取 X 道题来组成一份试卷。题目需要根据多个标准(科目、机构、地区等)进行筛选,每个标准具有不同的权重。
过滤器权重是在查询之外动态设置和规范化的。例如:
- 科目 1 — 0.4
- 科目 2 — 0.1
- 科目 3 — 0.5
- 机构 1 — 0.2
- 机构 2 — 0.04
- 机构 3 — 0.76
- 区域 1 — 1
其他几点:
- 目前我有 10 种不同的过滤器(科目、机构、区域等),用户可以按多种方式组合选择(例如:10 个科目、5 个机构、30 个区域等),如上例所示。
- 问题 table 有 ~500k 行;
- 过滤器与问题之间是多对多(N — N)关系;
- 过滤后,我想限制返回的行数;
- 如果某个过滤器不能提供更多的问题,则必须考虑其他的(记住:我想准备一个测试——如果我还有问题,必须使用它们)
- 我非常关心这个查询的性能。
为了说明,如果我不想对过滤器进行加权,我会这样做:
-- Unweighted baseline: every question/subject/institution/area combination
-- that matches the requested filters, shuffled and capped at 200 rows.
-- :subjects, :areas and :institutions are placeholders filled in by the caller.
SELECT *
FROM public.questions AS q
    INNER JOIN public.subjects_questions AS sq
        ON sq.question_id = q.id
    INNER JOIN public.subjects AS s
        ON s.id = sq.subject_id
    INNER JOIN public.institutions_questions AS iq
        ON iq.question_id = q.id
    INNER JOIN public.institutions AS i
        ON i.id = iq.institution_id
    INNER JOIN public.areas_questions AS aq
        ON aq.question_id = q.id
    INNER JOIN public.areas AS a
        ON a.id = aq.area_id
WHERE s.id IN :subjects
    AND a.id IN :areas
    AND i.id IN :institutions
ORDER BY random()
LIMIT 200
期望的输出:
Question — Subject — Institution — Area
我的想法是这样的:
- 用过滤器返回的问题创建一个 CTE;必须考虑到同一个问题可能被多个过滤器返回——我是否需要分别评估每个过滤器,然后用 UNION ALL 合并来解决这个问题?还必须标记每个问题来自哪个过滤器;
- 创建另一个具有权重和关联的相应过滤器的 CTE;
- 加入 CTE,但此时必须对问题进行分组并对权重求和;
- 应用窗口函数并返回结果,限制为 X 行(LIMIT X)。
你会如何编写这样的查询/解决这个问题?
这样的做法如何?这只是为了演示思路,细节留给你来完成。如果您不熟悉这种随机选择方法:随机生成一个介于 0 和 1 之间的数,它有 40% 的概率小于 0.4,因此 rand() <= .4 有 40% 的概率为真。
假设您已经或可以创建一个看起来有点像这样的 "Filters" 实体
-- One row per (field name, field value) pair that rows can be filtered on.
CREATE TABLE Filters (
    FieldName  VARCHAR(100),  -- attribute the filter applies to, e.g. 'Subject'
    FieldValue VARCHAR(100),  -- attribute value the filter matches
    Prob       FLOAT          -- probability of selection based on Name and Value
);
-- Keep a candidate row when its random draw falls at or below the probability
-- of at least one filter it matches. Each row gets a single draw (rnd), which
-- is compared against every matching filter.
SELECT DISTINCT TMP.*   -- The fields you want. DISTINCT removes the duplicates
                        -- produced when a row passes several conditions.
FROM (
    SELECT
        YRSWF.*,
        random() AS rnd  -- PostgreSQL's random number function; RAND() is not available there
    FROM YourResultSetWithoutFilters AS YRSWF  -- placeholder: you can code the details
) AS TMP
INNER JOIN Filters AS F
    -- The probability test is common to all filter kinds, so it is stated once.
    ON TMP.rnd <= F.Prob
    AND (
        (F.FieldName = 'Subject' AND TMP.Subject = F.FieldValue)
        OR (F.FieldName = 'Institution' AND TMP.Institution = F.FieldValue)
        OR (F.FieldName = 'Area' AND TMP.Area = F.FieldValue)
    );
好的,我设法解决了这个问题。基本上,我采用了问题中已经概述的策略,并借助了这篇帖子(here)的一些帮助——我之前就看过那篇帖子,但我当时(现在仍然)想用更优雅的方式来解决,类似这个(something like this),但适用于多行,而且不需要手动创建“边界”(bounds)。
让我们逐步尝试:
由于具有权重的过滤器来自架构外部,让我们创建一个 CTE:
-- The weighted filters come from outside the schema, so they are inlined as a
-- constant row set with columns (type, id, weight).
WITH filters (type, id, weight) AS (
    VALUES
        ('subject',     '148232e0-dece-40d9-81e0-0fa675f040e5'::uuid, 0.5),
        ('subject',     '854431bb-18ee-4efb-803f-185757d25235'::uuid, 0.4),
        ('area',        'e12863fb-afb7-45cf-9198-f9f58ebc80cf'::uuid, 1),
        ('institution', '7f56c89f-705e-45c7-98fb-fee470550edf'::uuid, 0.5),
        ('institution', '0066257b-b2e3-4ee8-8075-517a2aa1379e'::uuid, 0.5)
)
现在,让我们过滤行,忽略权重(暂时),这样以后我们就不需要处理整个 table:
-- Narrow the question set to rows matching the selected filters (weights are
-- ignored at this stage) so later steps do not have to process the whole table.
-- One output row per question/subject/institution/area combination.
WITH filtered_questions AS (
    SELECT
        q.id,
        s.id AS subject_id,
        a.id AS area_id,
        i.id AS institution_id
    FROM public.questions AS q
        INNER JOIN public.subjects_questions AS sq
            ON sq.question_id = q.id
        INNER JOIN public.subjects AS s
            ON s.id = sq.subject_id
        INNER JOIN public.institutions_questions AS iq
            ON iq.question_id = q.id
        INNER JOIN public.institutions AS i
            ON i.id = iq.institution_id
        INNER JOIN public.areas_questions AS aq
            ON aq.question_id = q.id
        INNER JOIN public.areas AS a
            ON a.id = aq.area_id
    -- Output aliases are not visible in WHERE; these are the junction-table columns.
    WHERE sq.subject_id IN (SELECT f.id FROM filters AS f WHERE f.type = 'subject')
        AND iq.institution_id IN (SELECT f.id FROM filters AS f WHERE f.type = 'institution')
        AND aq.area_id IN (SELECT f.id FROM filters AS f WHERE f.type = 'area')
)
同一个问题可以被多个过滤器选中,增加被选中的几率。我们必须更新权重来解决这个问题。
-- Total weight per question: add up the weight of every filter that matches
-- one of the question's rows in filtered_questions.
-- NOTE(review): the SUM runs over every joined row, so a question that appears
-- in several filtered_questions rows accumulates matching weights once per
-- row — confirm this is the intended weighting.
WITH filtered_questions_weights_sum AS (
    SELECT
        fq.id,
        SUM(f.weight) AS weight_sum
    FROM filtered_questions AS fq
        INNER JOIN filters AS f
            ON (f.type = 'subject' AND fq.subject_id = f.id)
            OR (f.type = 'area' AND fq.area_id = f.id)
            OR (f.type = 'institution' AND fq.institution_id = f.id)
    GROUP BY fq.id
)
生成边界,做法如 here 所述。
-- Running total of the per-question weights, taken in id order.
WITH cumulative_prob AS (
    SELECT
        id,
        SUM(weight_sum) OVER by_id AS cum_prob
    FROM filtered_questions_weights_sum
    WINDOW by_id AS (ORDER BY id)
),
-- Turn the running totals into intervals: each question spans from the
-- previous running total (0 for the first row) up to its own.
cumulative_bounds AS (
    SELECT
        id,
        COALESCE(LAG(cum_prob) OVER by_running_total, 0) AS lower_cum_bound,
        cum_prob AS upper_cum_bound
    FROM cumulative_prob
    WINDOW by_running_total AS (ORDER BY cum_prob, id)
)
生成随机序列。由于权重已在上一步中更新,必须重新归一化(random() * (SELECT SUM(weight_sum) ...))。10 是我们想要返回的行数。
-- One random draw per requested row (10 here), scaled to the total weight so
-- that every draw lands somewhere on the cumulative weight axis.
WITH random_series AS (
    SELECT
        draw.generate_series,
        random() * (SELECT SUM(weight_sum) FROM filtered_questions_weights_sum) AS R
    FROM generate_series(1, 10) AS draw (generate_series)
)
最后:
-- Map each random draw onto the question whose (lower, upper] interval
-- contains it. Draws are independent, so one question can match several draws.
SELECT
    cb.id,
    cb.lower_cum_bound,
    cb.upper_cum_bound,
    rs.R
FROM random_series AS rs
    INNER JOIN cumulative_bounds AS cb
        ON numrange(cb.lower_cum_bound::NUMERIC, cb.upper_cum_bound::NUMERIC, '(]') @> rs.R::NUMERIC
我们得到以下分布:
id lower_cum_bound upper_cum_bound r
------------------------------------ --------------- --------------- -------------------
380f46e9-f373-4b89-a863-05f484e6b3b6 0 2.0 0.41090718149207534
42bcb088-fc19-4272-8c49-e77999edd01c 2.0 3.9 3.4483200465794654
46a97f1d-789f-46e7-9d3b-bd881a22a32e 3.9 5.9 5.159445870062337
46a97f1d-789f-46e7-9d3b-bd881a22a32e 3.9 5.9 5.524481557868421
972d0296-acc3-4b44-b67d-928049d5e9c2 5.9 7.8 6.842470594821498
bdcc26f7-ccaf-4f8f-9e0b-81b9a6d29cdb 11.6 13.5 12.207371663767844
bdcc26f7-ccaf-4f8f-9e0b-81b9a6d29cdb 11.6 13.5 12.674184153741226
c935e3de-f1b6-4399-b5eb-ed3a9194eb7b 15.5 17.5 17.16804686235264
e5061aeb-53b7-4247-8404-87508c5ac723 21.4 23.4 22.622627633158118
f8c37700-0c3a-457e-8882-7c65269482ea 25.4 27.3 26.841821723571048
综合起来:
-- End-to-end weighted random pick of questions.
-- Filters and weights are supplied from outside the schema: (type, id, weight).
WITH filters (type, id, weight) AS (
    VALUES
        ('subject',     '148232e0-dece-40d9-81e0-0fa675f040e5'::uuid, 0.5),
        ('subject',     '854431bb-18ee-4efb-803f-185757d25235'::uuid, 0.4),
        ('area',        'e12863fb-afb7-45cf-9198-f9f58ebc80cf'::uuid, 1),
        ('institution', '7f56c89f-705e-45c7-98fb-fee470550edf'::uuid, 0.5),
        ('institution', '0066257b-b2e3-4ee8-8075-517a2aa1379e'::uuid, 0.5)
),
-- Questions matching the filters, with the summed weight of every filter row
-- they join to.
-- NOTE(review): the SUM runs over every joined row, so a question reached
-- through several subject/institution/area combinations accumulates matching
-- weights once per combination — confirm this is the intended weighting.
filtered_questions AS (
    SELECT
        q.id,
        SUM(f.weight) AS weight_sum
    FROM public.questions AS q
        INNER JOIN public.subjects_questions AS sq
            ON sq.question_id = q.id
        INNER JOIN public.subjects AS s
            ON s.id = sq.subject_id
        INNER JOIN public.institutions_questions AS iq
            ON iq.question_id = q.id
        INNER JOIN public.institutions AS i
            ON i.id = iq.institution_id
        INNER JOIN public.activity_areas_questions AS aq
            ON aq.question_id = q.id
        INNER JOIN public.activity_areas AS a
            ON a.id = aq.activity_area_id
        INNER JOIN filters AS f
            ON (f.type = 'subject' AND s.id = f.id)
            OR (f.type = 'area' AND a.id = f.id)
            OR (f.type = 'institution' AND i.id = f.id)
    WHERE s.id IN (SELECT sf.id FROM filters AS sf WHERE sf.type = 'subject')
        AND i.id IN (SELECT inf.id FROM filters AS inf WHERE inf.type = 'institution')
        AND a.id IN (SELECT af.id FROM filters AS af WHERE af.type = 'area')
    GROUP BY q.id
),
-- Running total of the per-question weights, taken in id order.
cumulative_prob AS (
    SELECT
        id,
        SUM(weight_sum) OVER by_id AS cum_prob
    FROM filtered_questions
    WINDOW by_id AS (ORDER BY id)
),
-- Each question spans from the previous running total (0 for the first row)
-- up to its own running total.
cumulative_bounds AS (
    SELECT
        id,
        COALESCE(LAG(cum_prob) OVER by_running_total, 0) AS lower_cum_bound,
        cum_prob AS upper_cum_bound
    FROM cumulative_prob
    WINDOW by_running_total AS (ORDER BY cum_prob, id)
),
-- 14 random draws, each scaled to the total weight.
random_series AS (
    SELECT
        draw.generate_series,
        random() * (SELECT SUM(weight_sum) FROM filtered_questions) AS R
    FROM generate_series(1, 14) AS draw (generate_series)
)
-- Map each draw onto the question whose (lower, upper] interval contains it.
-- Draws are independent, so one question can match several draws.
SELECT
    cb.id,
    cb.lower_cum_bound,
    cb.upper_cum_bound,
    rs.R
FROM random_series AS rs
    INNER JOIN cumulative_bounds AS cb
        ON numrange(cb.lower_cum_bound::NUMERIC, cb.upper_cum_bound::NUMERIC, '(]') @> rs.R::NUMERIC