通过交集连接 2 个数据集
Joining 2 data sets via intersection
我之前发过这个问题的另一个版本,但这次的数据格式略有不同,我一直没能得到答案……所以再次就这个问题求助。
我有如下一组数据(注意,下面的数据应这样解读:ID 为 1 的 Ford 具有以下属性和值:A:B、B:C 和 C:D):
+------------------------------------------------+
| ID NAME Attribute Attribute Value |
+------------------------------------------------+
| 1 Ford A B |
| 1 Ford B C |
| 1 Ford C D |
| 2 BMW A B |
| 2 BMW C D |
| 2 BMW F G |
| 3 TESLA Z Y |
| 3 TESLA E F |
| 3 TESLA A B |
+------------------------------------------------+
我基本上想将表中的每个 ID 与其余 ID 逐一进行比较并输出结果。第一轮比较是将 ID 1 分别与 ID 2 和 ID 3
进行比较,看看哪些属性匹配,哪些不匹配。
输出(仅展示第一轮比较中的 1 条记录):
+----------------------------------------------------------------------------+
| BaseID BaseNAME Target ID TargetName MatchedOn Baseonly Tgtonly |
+----------------------------------------------------------------------------+
| 1 Ford 2 BMW A:B;C:D B:C F:G |
+----------------------------------------------------------------------------+
之前有好心人帮我用笛卡尔积实现过(当时的数据格式略有不同),但速度有点太慢了。所以我想看看是否有人知道达到预期结果的最佳方法?
这可能会更快:
-- For every pair of cars (lower id = base, higher id = target) list, as
-- ';'-separated "attr:val" strings, what both cars share (l1), what only the
-- base has (l2) and what only the target has (l3).
with
-- One row per car. De-duplicating before the self-join means pairs are built
-- from cars rather than from every combination of attribute rows.
cars as (
  select distinct id, name
    from t
),
-- Each unordered pair once: a.id < b.id excludes self-pairs and mirror pairs.
t1 as (
  select a.id   as ia,
         a.name as na,
         b.id   as ib,
         b.name as nb
    from cars a
   inner join cars b
      on a.id < b.id
),
-- Per pair, three collections of "attr:val" strings built with set operators.
t2 as (
  select pr.ia,
         pr.na,
         pr.ib,
         pr.nb,
         -- present for both cars
         cast(multiset(select attr || ':' || val from t where id = pr.ia
                       intersect
                       select attr || ':' || val from t where id = pr.ib)
              as sys.odcivarchar2list) as a1,
         -- present for the base car only
         cast(multiset(select attr || ':' || val from t where id = pr.ia
                       minus
                       select attr || ':' || val from t where id = pr.ib)
              as sys.odcivarchar2list) as a2,
         -- present for the target car only
         cast(multiset(select attr || ':' || val from t where id = pr.ib
                       minus
                       select attr || ':' || val from t where id = pr.ia)
              as sys.odcivarchar2list) as a3
    from t1 pr
)
-- Flatten each collection into one string. Ordering by the element itself
-- (instead of "order by null") makes the list order repeatable between runs.
select ia,
       na,
       ib,
       nb,
       (select listagg(column_value, ';') within group (order by column_value)
          from table(t2.a1)) as l1,
       (select listagg(column_value, ';') within group (order by column_value)
          from table(t2.a2)) as l2,
       (select listagg(column_value, ';') within group (order by column_value)
          from table(t2.a3)) as l3
  from t2
 order by ia, ib
- 子查询 t1 生成我们要比较的"汽车"(cars)配对
- 子查询 t2 为每一对收集共有属性或各自独有属性的集合。sys.odcivarchar2list 是内置类型,只是一个字符串表(table of strings)
- 最终查询将这些集合转换为字符串列表。结果:
IA NA IB NB L1 L2 L3
-- ------------ --- ----- --------- ------------ -----------
1 Ford 2 BMW A:B;C:D B:C F:G
1 Ford 3 TESLA A:B B:C;C:D E:F;Z:Y
2 BMW 3 TESLA A:B C:D;F:G E:F;Z:Y
我希望这会更快,因为我们没有使用任何用户定义的函数,并且操作次数已降至最低。
另一种方法是使用像这样的函数:
-- dca: find the different or common attributes of two ids in table t.
--   i1, i2 : ids whose (attr, val) rows are compared
--   op     : 'I' -> pairs present for both i1 and i2 (intersect)
--            'M' -> pairs present for i1 but not for i2 (minus)
-- Returns the pairs as one ';'-separated "attr:val" string, ordered by
-- attr, val so the result is repeatable; NULL when no pair qualifies.
-- Any other op matches no WHEN branch; the CASE statement has no ELSE, so
-- PL/SQL raises CASE_NOT_FOUND in that situation.
create or replace function dca(i1 in number, i2 in number, op in char)
  return varchar2
is
  -- 4000 rather than 1000: listagg can return more than 1000 bytes, which a
  -- smaller buffer would reject with ORA-06502.
  ret varchar2(4000);
begin
  case op
    when 'I' then -- intersect
      select listagg(attr || ':' || val, ';') within group (order by attr, val)
        into ret
        from (select attr, val from t where id = i1
              intersect
              select attr, val from t where id = i2);
    when 'M' then -- minus
      select listagg(attr || ':' || val, ';') within group (order by attr, val)
        into ret
        from (select attr, val from t where id = i1
              minus
              select attr, val from t where id = i2);
  end case;
  return ret;
end;
在此查询中:
-- Pair report driven by dca(): one row per pair of cars with the attributes
-- they share (ab), those only the first has (a_b) and those only the second
-- has (b_a).
with
-- One row per car, so the self-join below pairs cars instead of pairing every
-- attribute row with every other attribute row and de-duplicating afterwards.
cars as (
  select distinct id, name
    from t
),
-- Each unordered pair once (a.id < b.id drops self-pairs and mirror pairs).
pairs as (
  select a.id   as ia,
         a.name as na,
         b.id   as ib,
         b.name as nb
    from cars a
   inner join cars b
      on a.id < b.id
)
select ia,
       na,
       ib,
       nb,
       dca(ia, ib, 'I') as ab,
       dca(ia, ib, 'M') as a_b,
       dca(ib, ia, 'M') as b_a
  from pairs
 order by ia, ib;
它也可以,但这是 UDF,在查询中表现较差。
以下方法适用于 Oracle 12+。
在 11g 中,可以改用 listagg 或 UDF 来连接集合元素。
-- Self-contained demo: compares every pair of cars in the inline sample data
-- and reports matched / base-only / target-only "attr:val" lists.
with
  -- collagg: concatenates the elements of a string collection into a single
  -- ';'-separated string. Returns NULL for an empty collection.
  -- The separator is a bare ';' (no trailing space) and the leading separator
  -- is removed by its full length, so the result never starts with a stray
  -- space and has the A:B;C:D shape.
  function collagg(p in sys.ku$_vcnt) return varchar2 is
    delim  constant varchar2(1) := ';';
    joined varchar2(4000);
  begin
    for i in 1 .. p.count loop
      joined := joined || delim || p(i);
    end loop;
    -- Every element was prefixed with delim; drop the first occurrence.
    return substr(joined, length(delim) + 1);
  end;
  -- Sample data: one row per (car, attribute, value).
  t(id, name, attr, val) as (
    select 1, 'Ford',  'A', 'B' from dual union all
    select 1, 'Ford',  'B', 'C' from dual union all
    select 1, 'Ford',  'C', 'D' from dual union all
    select 2, 'BMW',   'A', 'B' from dual union all
    select 2, 'BMW',   'C', 'D' from dual union all
    select 2, 'BMW',   'F', 'G' from dual union all
    select 3, 'TESLA', 'Z', 'Y' from dual union all
    select 3, 'TESLA', 'E', 'F' from dual union all
    select 3, 'TESLA', 'A', 'B' from dual
  ),
  -- One row per car carrying all of its "attr:val" strings as a collection.
  t0 as (
    select src.id,
           src.name,
           cast(collect(cast(src.attr || ':' || src.val as varchar2(4000)))
                as sys.ku$_vcnt) as c
      from t src
     group by src.id, src.name
  )
-- Pair each car with every car that has a higher id and compare collections.
-- NOTE(review): element order inside each list follows the collection order,
-- which this query does not pin down.
select base.id                             as baseid,
       base.name                           as basename,
       tgt.id                              as tgtid,
       tgt.name                            as tgtname,
       collagg(base.c multiset intersect tgt.c) as matchedon,
       collagg(base.c multiset except tgt.c)    as baseonly,
       collagg(tgt.c multiset except base.c)    as tgtonly
  from t0 base
 inner join t0 tgt
    on base.id < tgt.id;
我之前发过这个问题的另一个版本,但这次的数据格式略有不同,我一直没能得到答案……所以再次就这个问题求助。
我有如下一组数据(注意,下面的数据应这样解读:ID 为 1 的 Ford 具有以下属性和值:A:B、B:C 和 C:D):
+------------------------------------------------+
| ID NAME Attribute Attribute Value |
+------------------------------------------------+
| 1 Ford A B |
| 1 Ford B C |
| 1 Ford C D |
| 2 BMW A B |
| 2 BMW C D |
| 2 BMW F G |
| 3 TESLA Z Y |
| 3 TESLA E F |
| 3 TESLA A B |
+------------------------------------------------+
我基本上想将表中的每个 ID 与其余 ID 逐一进行比较并输出结果。第一轮比较是将 ID 1 分别与 ID 2 和 ID 3 进行比较,看看哪些属性匹配,哪些不匹配。
输出(仅展示第一轮比较中的 1 条记录):
+----------------------------------------------------------------------------+
| BaseID BaseNAME Target ID TargetName MatchedOn Baseonly Tgtonly |
+----------------------------------------------------------------------------+
| 1 Ford 2 BMW A:B;C:D B:C F:G |
+----------------------------------------------------------------------------+
之前有好心人帮我用笛卡尔积实现过(当时的数据格式略有不同),但速度有点太慢了。所以我想看看是否有人知道达到预期结果的最佳方法?
这可能会更快:
-- For every pair of cars (lower id = base, higher id = target) list, as
-- ';'-separated "attr:val" strings, what both cars share (l1), what only the
-- base has (l2) and what only the target has (l3).
with
-- One row per car. De-duplicating before the self-join means pairs are built
-- from cars rather than from every combination of attribute rows.
cars as (
  select distinct id, name
    from t
),
-- Each unordered pair once: a.id < b.id excludes self-pairs and mirror pairs.
t1 as (
  select a.id   as ia,
         a.name as na,
         b.id   as ib,
         b.name as nb
    from cars a
   inner join cars b
      on a.id < b.id
),
-- Per pair, three collections of "attr:val" strings built with set operators.
t2 as (
  select pr.ia,
         pr.na,
         pr.ib,
         pr.nb,
         -- present for both cars
         cast(multiset(select attr || ':' || val from t where id = pr.ia
                       intersect
                       select attr || ':' || val from t where id = pr.ib)
              as sys.odcivarchar2list) as a1,
         -- present for the base car only
         cast(multiset(select attr || ':' || val from t where id = pr.ia
                       minus
                       select attr || ':' || val from t where id = pr.ib)
              as sys.odcivarchar2list) as a2,
         -- present for the target car only
         cast(multiset(select attr || ':' || val from t where id = pr.ib
                       minus
                       select attr || ':' || val from t where id = pr.ia)
              as sys.odcivarchar2list) as a3
    from t1 pr
)
-- Flatten each collection into one string. Ordering by the element itself
-- (instead of "order by null") makes the list order repeatable between runs.
select ia,
       na,
       ib,
       nb,
       (select listagg(column_value, ';') within group (order by column_value)
          from table(t2.a1)) as l1,
       (select listagg(column_value, ';') within group (order by column_value)
          from table(t2.a2)) as l2,
       (select listagg(column_value, ';') within group (order by column_value)
          from table(t2.a3)) as l3
  from t2
 order by ia, ib
- 子查询 t1 生成我们要比较的"汽车"(cars)配对
- 子查询 t2 为每一对收集共有属性或各自独有属性的集合。sys.odcivarchar2list 是内置类型,只是一个字符串表(table of strings)
- 最终查询将这些集合转换为字符串列表。结果:
IA NA IB NB L1 L2 L3
-- ------------ --- ----- --------- ------------ -----------
1 Ford 2 BMW A:B;C:D B:C F:G
1 Ford 3 TESLA A:B B:C;C:D E:F;Z:Y
2 BMW 3 TESLA A:B C:D;F:G E:F;Z:Y
我希望这会更快,因为我们没有使用任何用户定义的函数,并且操作次数已降至最低。
另一种方法是使用像这样的函数:
-- dca: find the different or common attributes of two ids in table t.
--   i1, i2 : ids whose (attr, val) rows are compared
--   op     : 'I' -> pairs present for both i1 and i2 (intersect)
--            'M' -> pairs present for i1 but not for i2 (minus)
-- Returns the pairs as one ';'-separated "attr:val" string, ordered by
-- attr, val so the result is repeatable; NULL when no pair qualifies.
-- Any other op matches no WHEN branch; the CASE statement has no ELSE, so
-- PL/SQL raises CASE_NOT_FOUND in that situation.
create or replace function dca(i1 in number, i2 in number, op in char)
  return varchar2
is
  -- 4000 rather than 1000: listagg can return more than 1000 bytes, which a
  -- smaller buffer would reject with ORA-06502.
  ret varchar2(4000);
begin
  case op
    when 'I' then -- intersect
      select listagg(attr || ':' || val, ';') within group (order by attr, val)
        into ret
        from (select attr, val from t where id = i1
              intersect
              select attr, val from t where id = i2);
    when 'M' then -- minus
      select listagg(attr || ':' || val, ';') within group (order by attr, val)
        into ret
        from (select attr, val from t where id = i1
              minus
              select attr, val from t where id = i2);
  end case;
  return ret;
end;
在此查询中:
-- Pair report driven by dca(): one row per pair of cars with the attributes
-- they share (ab), those only the first has (a_b) and those only the second
-- has (b_a).
with
-- One row per car, so the self-join below pairs cars instead of pairing every
-- attribute row with every other attribute row and de-duplicating afterwards.
cars as (
  select distinct id, name
    from t
),
-- Each unordered pair once (a.id < b.id drops self-pairs and mirror pairs).
pairs as (
  select a.id   as ia,
         a.name as na,
         b.id   as ib,
         b.name as nb
    from cars a
   inner join cars b
      on a.id < b.id
)
select ia,
       na,
       ib,
       nb,
       dca(ia, ib, 'I') as ab,
       dca(ia, ib, 'M') as a_b,
       dca(ib, ia, 'M') as b_a
  from pairs
 order by ia, ib;
它也可以,但这是 UDF,在查询中表现较差。
以下方法适用于 Oracle 12+。
在 11g 中,可以改用 listagg 或 UDF 来连接集合元素。
-- Self-contained demo: compares every pair of cars in the inline sample data
-- and reports matched / base-only / target-only "attr:val" lists.
with
  -- collagg: concatenates the elements of a string collection into a single
  -- ';'-separated string. Returns NULL for an empty collection.
  -- The separator is a bare ';' (no trailing space) and the leading separator
  -- is removed by its full length, so the result never starts with a stray
  -- space and has the A:B;C:D shape.
  function collagg(p in sys.ku$_vcnt) return varchar2 is
    delim  constant varchar2(1) := ';';
    joined varchar2(4000);
  begin
    for i in 1 .. p.count loop
      joined := joined || delim || p(i);
    end loop;
    -- Every element was prefixed with delim; drop the first occurrence.
    return substr(joined, length(delim) + 1);
  end;
  -- Sample data: one row per (car, attribute, value).
  t(id, name, attr, val) as (
    select 1, 'Ford',  'A', 'B' from dual union all
    select 1, 'Ford',  'B', 'C' from dual union all
    select 1, 'Ford',  'C', 'D' from dual union all
    select 2, 'BMW',   'A', 'B' from dual union all
    select 2, 'BMW',   'C', 'D' from dual union all
    select 2, 'BMW',   'F', 'G' from dual union all
    select 3, 'TESLA', 'Z', 'Y' from dual union all
    select 3, 'TESLA', 'E', 'F' from dual union all
    select 3, 'TESLA', 'A', 'B' from dual
  ),
  -- One row per car carrying all of its "attr:val" strings as a collection.
  t0 as (
    select src.id,
           src.name,
           cast(collect(cast(src.attr || ':' || src.val as varchar2(4000)))
                as sys.ku$_vcnt) as c
      from t src
     group by src.id, src.name
  )
-- Pair each car with every car that has a higher id and compare collections.
-- NOTE(review): element order inside each list follows the collection order,
-- which this query does not pin down.
select base.id                             as baseid,
       base.name                           as basename,
       tgt.id                              as tgtid,
       tgt.name                            as tgtname,
       collagg(base.c multiset intersect tgt.c) as matchedon,
       collagg(base.c multiset except tgt.c)    as baseonly,
       collagg(tgt.c multiset except base.c)    as tgtonly
  from t0 base
 inner join t0 tgt
    on base.id < tgt.id;