如何在 postgresql table 中对匹配输入值或匹配任何其他匹配行的值的行进行聚类?
How to cluster rows in a postgresql table that match an input value or match a value from any of the other matching rows?
我的 postgresql 数据库中有一个看起来像这样的 table
如果集群中的每个联系人都与集群中的另一个联系人共享 contact_id_a 或 contact_id_b 值(或两者),我如何恢复联系人集群?
在上面的屏幕截图示例中,第 1-6 行将属于同一集群,而第 8 行将不属于任何集群。
如何使用 SQL 查询或 SQL 查询结合 Java 代码来实现?
对于上下文,此 table 列出了联系人列表中所有可能重复的联系人。我们想向列表所有者显示所有可能重复的联系人,以便用户可以手动管理这些重复项。
这是我的起始代码:
// Fetch one non-ignored duplicate pair of the list to seed the lookup.
// NOTE(review): list_id is concatenated into the SQL text; bind it with a "?" placeholder
// instead if it can ever originate from user input.
DuplicateCandidate firstDuplicate = db.sql("select * from duplicates where list_id = "+list_id+ " and ignore_duplicate is not true").first(DuplicateCandidate.class);
// Collect every non-ignored pair that mentions either contact of the seed pair, in either column.
// The placeholders are bound in order: a, a, b, b.
String sql = "select * from duplicates where list_id = "+list_id+ " and ignore_duplicate is not true "
+ "and (contact_id_a = ? or contact_id_b = ? or contact_id_a = ? or contact_id_b = ?)";
List<DuplicateCandidate> groupOfDuplicates = db.sql(sql, firstDuplicate.contact_id_a,firstDuplicate.contact_id_a, firstDuplicate.contact_id_b, firstDuplicate.contact_id_b).results(DuplicateCandidate.class);
这将返回第一行以及包含 16247096 或 16247097 的任何其他行,但不会返回那些仅与第二个查询结果中的其他 contact_id 相匹配的相关行。
干杯。
您可以使用递归 CTE。它会遍历由匹配关系构成的图,然后为图中的每一行分配最小标识符。请注意,您的数据没有每一行的唯一标识符,因此首先生成一个:
-- Labels every row of duplicates with min_id, the smallest generated row id
-- reachable from it; rows that end up with the same min_id form one cluster.
with recursive d as (
    -- duplicates has no per-row key, so number the rows to obtain one.
    select row_number() over (order by contact_id_a, contact_id_b) as id, d.*
    from duplicates d
),
cte (id, contact_id_a, contact_id_b, min_id, ids, lev) as (
    -- Anchor: every row starts a walk at itself.
    select id, contact_id_a, contact_id_b, id as min_id, array[id] as ids, 1 as lev
    from d
    union all
    -- Step: extend the walk to any row that shares a contact id with the current
    -- row. The comparison covers both columns on both sides, so pairs chained
    -- across columns, e.g. (1, 2) and (2, 3), are linked as well.
    select d.id, d.contact_id_a, d.contact_id_b, least(d.id, cte.min_id), ids || d.id, lev + 1
    from cte join
         d
         on cte.contact_id_a in (d.contact_id_a, d.contact_id_b)
         or cte.contact_id_b in (d.contact_id_a, d.contact_id_b)
    -- Never revisit a row already on this walk; this is what ends the recursion.
    where d.id <> ALL (cte.ids)
)
-- Keep, for each row, the walk that reached it with the smallest min_id.
select distinct on (id) cte.*
from cte
order by id, min_id;
列 min_id
包含您想要的分组。
这里是一个演示该代码的 db<>fiddle。
像这样的聚类是一个具有未知步骤数的迭代过程。我从未找到可以在递归查询中完成的解决方案。
我已经六年多没做 CRM 了,但下面的函数与我们过去生成匹配组的方法大致相同。逐行(row-by-row)处理在我们的工作量下性能不够好,而在宿主语言中使用 Java 的 HashMap()、HashSet() 以及倒排索引之类的结构,又会产生非常混乱的代码。
假设此架构:
\d contact_info
Table "public.contact_info"
Column | Type | Collation | Nullable | Default
------------------+---------+-----------+----------+---------
contact_id_a | bigint | | |
contact_id_b | bigint | | |
ignore_duplicate | boolean | | | false
list_id | integer | | | 496
select * from contact_info ;
contact_id_a | contact_id_b | ignore_duplicate | list_id
--------------+--------------+------------------+---------
16247096 | 16247097 | f | 496
16247096 | 16247098 | f | 496
16247096 | 16247099 | f | 496
16247097 | 16247098 | f | 496
16247097 | 16247099 | f | 496
16247098 | 16247099 | f | 496
16247094 | 16247095 | f | 496
(7 rows)
此函数创建两个临时表来保存中间集群,一旦无法再合并出更多集群,就返回结果。
-- cluster_contact(): groups contacts into clusters of potential duplicates.
--
-- Every row of contact_info whose ignore_duplicate flag is not true is treated
-- as an undirected link between contact_id_a and contact_id_b. A cluster is a
-- set of contacts connected through such links, directly or transitively.
--
-- Returns one row per contact: clust_id is the smallest contact id in the
-- contact's cluster, contact_id is the contact itself. Rows are ordered by
-- clust_id, contact_id. Pairs with a NULL on either side are skipped.
--
-- Side effects: (re)creates the temp tables contact_edge and contact_match,
-- both dropped at commit.
create or replace function cluster_contact()
returns table (clust_id bigint, contact_id bigint)
language plpgsql as $$
declare
  changed_rows bigint;
begin
  -- Remove leftovers from an earlier call in the same transaction so the
  -- function can be invoked more than once before commit.
  drop table if exists pg_temp.contact_match;
  drop table if exists pg_temp.contact_edge;

  -- Store every link in both directions so labels can travel either way.
  create temp table contact_edge (node_id bigint, neighbor_id bigint) on commit drop;
  insert into contact_edge (node_id, neighbor_id)
  select ci.contact_id_a, ci.contact_id_b
  from contact_info ci
  where ci.ignore_duplicate is not true
    and ci.contact_id_a is not null
    and ci.contact_id_b is not null
  union
  select ci.contact_id_b, ci.contact_id_a
  from contact_info ci
  where ci.ignore_duplicate is not true
    and ci.contact_id_a is not null
    and ci.contact_id_b is not null;
  create index ce_1 on contact_edge (node_id, neighbor_id);

  -- Each contact starts out as its own cluster (its relationship to itself).
  create temp table contact_match (clust_id bigint, contact_id bigint) on commit drop;
  insert into contact_match (clust_id, contact_id)
  select distinct ce.node_id, ce.node_id
  from contact_edge ce;
  create index cm_1 on contact_match (contact_id, clust_id);

  -- Repeatedly lower every contact's label to the smallest label among its
  -- neighbours. Labels only ever decrease, so the loop ends; once no row
  -- changes, all contacts of a connected group carry the same label.
  loop
    with best as (
      select ce.node_id,
             min(nb.clust_id) as new_clust_id
      from contact_edge ce
      join contact_match nb on nb.contact_id = ce.neighbor_id
      group by ce.node_id
    )
    update contact_match cm
    set clust_id = b.new_clust_id
    from best b
    where cm.contact_id = b.node_id
      and b.new_clust_id < cm.clust_id;

    get diagnostics changed_rows = row_count;
    exit when changed_rows = 0;
  end loop;

  -- Column references are qualified to avoid clashing with the OUT columns.
  return query
  select cm.clust_id, cm.contact_id
  from contact_match cm
  order by cm.clust_id, cm.contact_id;
end $$;
我见过开发人员面临的最大心理障碍之一,是忽略了 contact_id 与其自身的关系。这会导致处理过程互不衔接,并使心智模型被 left-side 和 right-side 的区分不必要地复杂化。
select * from cluster_contact();
clust_id | contact_id
----------+------------
16247094 | 16247094
16247094 | 16247095
16247096 | 16247096
16247096 | 16247097
16247096 | 16247098
16247096 | 16247099
(6 rows)
如果您需要澄清此解决方案中的任何步骤或者它是否不适合您,请发表评论。
此外,值得了解的是,fuzzystrmatch 扩展中提供了 Levenshtein 函数,而且效果很好。
如果您希望 clust_id 从 1 开始按顺序编号,请将函数中的 return query 更改为:
-- Renumber the clusters 1, 2, 3, ... in ascending order of their stored
-- clust_id. Sorting by the stored id gives the same order as sorting by the
-- rank, because dense_rank() follows that id.
return query
select dense_rank() over by_cluster as clust_id,
       cm.contact_id
from contact_match cm
window by_cluster as (order by cm.clust_id)
order by cm.clust_id, cm.contact_id;
它将产生:
select * from cluster_contact();
clust_id | contact_id
----------+------------
1 | 16247094
1 | 16247095
2 | 16247096
2 | 16247097
2 | 16247098
2 | 16247099
(6 rows)
我的 postgresql 数据库中有一个看起来像这样的 table
如果集群中的每个联系人都与集群中的另一个联系人共享 contact_id_a 或 contact_id_b 值(或两者),我如何恢复联系人集群?
在上面的屏幕截图示例中,第 1-6 行将属于同一集群,而第 8 行将不属于任何集群。
如何使用 SQL 查询或 SQL 查询结合 Java 代码来实现?
对于上下文,此 table 列出了联系人列表中所有可能重复的联系人。我们想向列表所有者显示所有可能重复的联系人,以便用户可以手动管理这些重复项。
这是我的起始代码:
// Fetch one non-ignored duplicate pair of the list to seed the lookup.
// NOTE(review): list_id is concatenated into the SQL text; bind it with a "?" placeholder
// instead if it can ever originate from user input.
DuplicateCandidate firstDuplicate = db.sql("select * from duplicates where list_id = "+list_id+ " and ignore_duplicate is not true").first(DuplicateCandidate.class);
// Collect every non-ignored pair that mentions either contact of the seed pair, in either column.
// The placeholders are bound in order: a, a, b, b.
String sql = "select * from duplicates where list_id = "+list_id+ " and ignore_duplicate is not true "
+ "and (contact_id_a = ? or contact_id_b = ? or contact_id_a = ? or contact_id_b = ?)";
List<DuplicateCandidate> groupOfDuplicates = db.sql(sql, firstDuplicate.contact_id_a,firstDuplicate.contact_id_a, firstDuplicate.contact_id_b, firstDuplicate.contact_id_b).results(DuplicateCandidate.class);
这将返回第一行以及包含 16247096 或 16247097 的任何其他行,但不会返回那些仅与第二个查询结果中的其他 contact_id 相匹配的相关行。
干杯。
您可以使用递归 CTE。它会遍历由匹配关系构成的图,然后为图中的每一行分配最小标识符。请注意,您的数据没有每一行的唯一标识符,因此首先生成一个:
-- Labels every row of duplicates with min_id, the smallest generated row id
-- reachable from it; rows that end up with the same min_id form one cluster.
with recursive d as (
    -- duplicates has no per-row key, so number the rows to obtain one.
    select row_number() over (order by contact_id_a, contact_id_b) as id, d.*
    from duplicates d
),
cte (id, contact_id_a, contact_id_b, min_id, ids, lev) as (
    -- Anchor: every row starts a walk at itself.
    select id, contact_id_a, contact_id_b, id as min_id, array[id] as ids, 1 as lev
    from d
    union all
    -- Step: extend the walk to any row that shares a contact id with the current
    -- row. The comparison covers both columns on both sides, so pairs chained
    -- across columns, e.g. (1, 2) and (2, 3), are linked as well.
    select d.id, d.contact_id_a, d.contact_id_b, least(d.id, cte.min_id), ids || d.id, lev + 1
    from cte join
         d
         on cte.contact_id_a in (d.contact_id_a, d.contact_id_b)
         or cte.contact_id_b in (d.contact_id_a, d.contact_id_b)
    -- Never revisit a row already on this walk; this is what ends the recursion.
    where d.id <> ALL (cte.ids)
)
-- Keep, for each row, the walk that reached it with the smallest min_id.
select distinct on (id) cte.*
from cte
order by id, min_id;
列 min_id
包含您想要的分组。
这里是一个演示该代码的 db<>fiddle。
像这样的聚类是一个具有未知步骤数的迭代过程。我从未找到可以在递归查询中完成的解决方案。
我已经六年多没做 CRM 了,但下面的函数与我们过去生成匹配组的方法大致相同。逐行(row-by-row)处理在我们的工作量下性能不够好,而在宿主语言中使用 Java 的 HashMap()、HashSet() 以及倒排索引之类的结构,又会产生非常混乱的代码。
假设此架构:
\d contact_info
Table "public.contact_info"
Column | Type | Collation | Nullable | Default
------------------+---------+-----------+----------+---------
contact_id_a | bigint | | |
contact_id_b | bigint | | |
ignore_duplicate | boolean | | | false
list_id | integer | | | 496
select * from contact_info ;
contact_id_a | contact_id_b | ignore_duplicate | list_id
--------------+--------------+------------------+---------
16247096 | 16247097 | f | 496
16247096 | 16247098 | f | 496
16247096 | 16247099 | f | 496
16247097 | 16247098 | f | 496
16247097 | 16247099 | f | 496
16247098 | 16247099 | f | 496
16247094 | 16247095 | f | 496
(7 rows)
此函数创建两个临时表来保存中间集群,一旦无法再合并出更多集群,就返回结果。
-- cluster_contact(): groups contacts into clusters of potential duplicates.
--
-- Every row of contact_info whose ignore_duplicate flag is not true is treated
-- as an undirected link between contact_id_a and contact_id_b. A cluster is a
-- set of contacts connected through such links, directly or transitively.
--
-- Returns one row per contact: clust_id is the smallest contact id in the
-- contact's cluster, contact_id is the contact itself. Rows are ordered by
-- clust_id, contact_id. Pairs with a NULL on either side are skipped.
--
-- Side effects: (re)creates the temp tables contact_edge and contact_match,
-- both dropped at commit.
create or replace function cluster_contact()
returns table (clust_id bigint, contact_id bigint)
language plpgsql as $$
declare
  changed_rows bigint;
begin
  -- Remove leftovers from an earlier call in the same transaction so the
  -- function can be invoked more than once before commit.
  drop table if exists pg_temp.contact_match;
  drop table if exists pg_temp.contact_edge;

  -- Store every link in both directions so labels can travel either way.
  create temp table contact_edge (node_id bigint, neighbor_id bigint) on commit drop;
  insert into contact_edge (node_id, neighbor_id)
  select ci.contact_id_a, ci.contact_id_b
  from contact_info ci
  where ci.ignore_duplicate is not true
    and ci.contact_id_a is not null
    and ci.contact_id_b is not null
  union
  select ci.contact_id_b, ci.contact_id_a
  from contact_info ci
  where ci.ignore_duplicate is not true
    and ci.contact_id_a is not null
    and ci.contact_id_b is not null;
  create index ce_1 on contact_edge (node_id, neighbor_id);

  -- Each contact starts out as its own cluster (its relationship to itself).
  create temp table contact_match (clust_id bigint, contact_id bigint) on commit drop;
  insert into contact_match (clust_id, contact_id)
  select distinct ce.node_id, ce.node_id
  from contact_edge ce;
  create index cm_1 on contact_match (contact_id, clust_id);

  -- Repeatedly lower every contact's label to the smallest label among its
  -- neighbours. Labels only ever decrease, so the loop ends; once no row
  -- changes, all contacts of a connected group carry the same label.
  loop
    with best as (
      select ce.node_id,
             min(nb.clust_id) as new_clust_id
      from contact_edge ce
      join contact_match nb on nb.contact_id = ce.neighbor_id
      group by ce.node_id
    )
    update contact_match cm
    set clust_id = b.new_clust_id
    from best b
    where cm.contact_id = b.node_id
      and b.new_clust_id < cm.clust_id;

    get diagnostics changed_rows = row_count;
    exit when changed_rows = 0;
  end loop;

  -- Column references are qualified to avoid clashing with the OUT columns.
  return query
  select cm.clust_id, cm.contact_id
  from contact_match cm
  order by cm.clust_id, cm.contact_id;
end $$;
我见过开发人员面临的最大心理障碍之一,是忽略了 contact_id 与其自身的关系。这会导致处理过程互不衔接,并使心智模型被 left-side 和 right-side 的区分不必要地复杂化。
select * from cluster_contact();
clust_id | contact_id
----------+------------
16247094 | 16247094
16247094 | 16247095
16247096 | 16247096
16247096 | 16247097
16247096 | 16247098
16247096 | 16247099
(6 rows)
如果您需要澄清此解决方案中的任何步骤或者它是否不适合您,请发表评论。
此外,值得了解的是,fuzzystrmatch 扩展中提供了 Levenshtein 函数,而且效果很好。
如果您希望 clust_id 从 1 开始按顺序编号,请将函数中的 return query 更改为:
-- Renumber the clusters 1, 2, 3, ... in ascending order of their stored
-- clust_id. Sorting by the stored id gives the same order as sorting by the
-- rank, because dense_rank() follows that id.
return query
select dense_rank() over by_cluster as clust_id,
       cm.contact_id
from contact_match cm
window by_cluster as (order by cm.clust_id)
order by cm.clust_id, cm.contact_id;
它将产生:
select * from cluster_contact();
clust_id | contact_id
----------+------------
1 | 16247094
1 | 16247095
2 | 16247096
2 | 16247097
2 | 16247098
2 | 16247099
(6 rows)