PostgreSQL UNION 不能正确合并行
PostgreSQL UNION don't merge lines properly
我在 PostgreSQL 数据库中有 3 个表:
localities (loc, 12561 rows)
plants (pl, 17052 rows)
specimens or samples (esp, 9211 rows)
pl
和 esp
各有一个字段 loc
,用于指定标记的植物生长在哪里,或者样本(通常是有叶子和花朵的树枝)来自哪里。
我需要一份关于有植物或样本的地方的报告,以及每个地方的植物和样本数量。到目前为止我做的最好的是两个子查询的联合,运行速度非常快(33 毫秒获取 69 行):
(select l.id,l.nome,count(pl.id) pls,null esps
from loc l
left join pl on pl.loc = l.id
where l.id in
(select distinct pl.loc
from pl
where pl.loc > 0)
group by l.id,l.nome
union
select l.id,l.nome,null pls,count(e.id) esps
from loc l
left join esp e on e.loc = l.id
where l.id in
(select distinct e.loc
from esp e
where e.loc > 0)
group by l.id,l.nome)
order by id
重点是,当同一个地方既有植物又有样品时,就变成了截然不同的两条线,比如:
11950 | San Martin | | 5 |
11950 | San Martin | 61 | |
当然我要的是:
11950 | San Martin | 61 | 5 |
在此之前,我曾尝试在一个查询中完成所有操作:
select l.id,l.nome,count(pl.id),count(e.id) esps
from loc l
left join pl on pl.loc = l.id
left join esp e on e.loc = l.id
where l.id in
(select distinct pl.loc
from pl
where pl.loc > 0)
or l.id in
(select distinct e.loc
from esp e
where e.loc > 0)
group by l.id,l.nome
但它 returns 是一个奇怪的重复(它将两个结果相乘并显示两次结果):
11950 | San Martin | 305 | 305 |
我试过不使用子查询,但大约需要 13 秒,太长了。
这应该和
一样简单
select * from (
select
location.*,
(select count(id) from plant where plant.location = location.id) as plants,
(select count(id) from sample where sample.location = location.id) as samples
from location
) subquery
where subquery.plants > 0 or subquery.samples > 0;
id | name | plants | samples
----+------------+--------+---------
1 | San Martin | 2 | 1
2 | Rome | 1 | 2
3 | Dallas | 3 | 1
(3 rows)
这是我快速设置用于试验的数据库:
create table location(id serial primary key, name text);
create table plant(id serial primary key, name text, location integer references location(id));
create table sample(id serial primary key, name text, location integer references location(id));
insert into location (name) values ('San Martin'), ('Rome'), ('Dallas'), ('Ghost Town');
insert into plant (name, location) values ('San Martin Dandelion', 1),('San Martin Camomile', 1), ('Rome Raspberry', 2), ('Dallas Locoweed', 3), ('Dallas Lemongrass', 3), ('Dallas Setaria', 3);
insert into sample (name, location) values ('San Martin Bramble', 1), ('Rome Iris', 2), ('Rome Eucalypt', 2), ('Dallas Dogbane', 3);
tests=# select * from location;
id | name
----+------------
1 | San Martin
2 | Rome
3 | Dallas
4 | Ghost Town
(4 rows)
tests=# select * from plant;
id | name | location
----+----------------------+----------
1 | San Martin Dandelion | 1
2 | San Martin Camomile | 1
3 | Rome Raspberry | 2
4 | Dallas Locoweed | 3
5 | Dallas Lemongrass | 3
6 | Dallas Setaria | 3
(6 rows)
tests=# select * from sample;
id | name | location
----+--------------------+----------
1 | San Martin Bramble | 1
2 | Rome Iris | 2
3 | Rome Eucalypt | 2
4 | Dallas Dogbane | 3
(4 rows)
我没有测试过,但我认为它可能是这样的:
SELECT
l.id,
l.nome,
SUM(CASE WHEN pl.id IS NOT NULL THEN 1 ELSE 0 END) as plants_count,
SUM(CASE WHEN e.id IS NOT NULL THEN 1 ELSE 0 END) as esp_count
FROM loc l
LEFT JOIN pl ON pl.loc = l.id
LEFT JOIN esp e ON e.loc = l.id
GROUP BY l.id,l.nome
重点是计算每种类型的非空 ID。
我创建了测试布局:
create table localities (id integer, loc_name text);
create table plants (plant_id integer, loc_id integer);
create table samples (sample_id integer, loc_id integer);
insert into localities select x, ('Loc ' || x::text) from generate_series(1, 12561) x ;
insert into plants select x, (random()*12561)::integer from generate_series(1, 17052) x;
insert into samples select x, (random()*12561)::integer from generate_series(1, 9211) x;
诀窍是从植物和样本中创建具有相同结构的中间体 table。在数据没有意义的地方(植物没有 sample_id),您添加 null:
select loc_id, plant_id, null as sample_id from plants
union all
select loc_id, null as plant_id, sample_id from samples
这个 table 具有统一的结构,然后您可以在其上聚合(我使用 WITH 使其更具可读性。):
with localities_used as (
select loc_id, plant_id, null as sample_id from plants
union all
select loc_id, null as plant_id, sample_id from samples)
select
localities_used.loc_id,
count(localities_used.plant_id) plant_count,
count(localities_used.sample_id) sample_count
from
localities_used
group by
localities_used.loc_id;
如果您需要来自地方的其他数据,您可以将它们加入聚合 table:
with localities_used as (
select loc_id, plant_id, null as sample_id from plants
union all
select loc_id, null as plant_id, sample_id from samples),
aggregated as (
select
localities_used.loc_id,
count(localities_used.plant_id) plant_count,
count(localities_used.sample_id) sample_count
from
localities_used
group by
localities_used.loc_id)
select * from aggregated left outer join localities on aggregated.loc_id = localities.id;
这在我的笔记本电脑上总共需要 75 毫秒。
我在 PostgreSQL 数据库中有 3 个表:
localities (loc, 12561 rows)
plants (pl, 17052 rows)
specimens or samples (esp, 9211 rows)
pl
和 esp
各有一个字段 loc
,用于指定标记的植物生长在哪里,或者样本(通常是有叶子和花朵的树枝)来自哪里。
我需要一份关于有植物或样本的地方的报告,以及每个地方的植物和样本数量。到目前为止我做的最好的是两个子查询的联合,运行速度非常快(33 毫秒获取 69 行):
(select l.id,l.nome,count(pl.id) pls,null esps
from loc l
left join pl on pl.loc = l.id
where l.id in
(select distinct pl.loc
from pl
where pl.loc > 0)
group by l.id,l.nome
union
select l.id,l.nome,null pls,count(e.id) esps
from loc l
left join esp e on e.loc = l.id
where l.id in
(select distinct e.loc
from esp e
where e.loc > 0)
group by l.id,l.nome)
order by id
重点是,当同一个地方既有植物又有样品时,就变成了截然不同的两条线,比如:
11950 | San Martin | | 5 |
11950 | San Martin | 61 | |
当然我要的是:
11950 | San Martin | 61 | 5 |
在此之前,我曾尝试在一个查询中完成所有操作:
select l.id,l.nome,count(pl.id),count(e.id) esps
from loc l
left join pl on pl.loc = l.id
left join esp e on e.loc = l.id
where l.id in
(select distinct pl.loc
from pl
where pl.loc > 0)
or l.id in
(select distinct e.loc
from esp e
where e.loc > 0)
group by l.id,l.nome
但它 returns 是一个奇怪的重复(它将两个结果相乘并显示两次结果):
11950 | San Martin | 305 | 305 |
我试过不使用子查询,但大约需要 13 秒,太长了。
这应该和
一样简单select * from (
select
location.*,
(select count(id) from plant where plant.location = location.id) as plants,
(select count(id) from sample where sample.location = location.id) as samples
from location
) subquery
where subquery.plants > 0 or subquery.samples > 0;
id | name | plants | samples
----+------------+--------+---------
1 | San Martin | 2 | 1
2 | Rome | 1 | 2
3 | Dallas | 3 | 1
(3 rows)
这是我快速设置用于试验的数据库:
create table location(id serial primary key, name text);
create table plant(id serial primary key, name text, location integer references location(id));
create table sample(id serial primary key, name text, location integer references location(id));
insert into location (name) values ('San Martin'), ('Rome'), ('Dallas'), ('Ghost Town');
insert into plant (name, location) values ('San Martin Dandelion', 1),('San Martin Camomile', 1), ('Rome Raspberry', 2), ('Dallas Locoweed', 3), ('Dallas Lemongrass', 3), ('Dallas Setaria', 3);
insert into sample (name, location) values ('San Martin Bramble', 1), ('Rome Iris', 2), ('Rome Eucalypt', 2), ('Dallas Dogbane', 3);
tests=# select * from location;
id | name
----+------------
1 | San Martin
2 | Rome
3 | Dallas
4 | Ghost Town
(4 rows)
tests=# select * from plant;
id | name | location
----+----------------------+----------
1 | San Martin Dandelion | 1
2 | San Martin Camomile | 1
3 | Rome Raspberry | 2
4 | Dallas Locoweed | 3
5 | Dallas Lemongrass | 3
6 | Dallas Setaria | 3
(6 rows)
tests=# select * from sample;
id | name | location
----+--------------------+----------
1 | San Martin Bramble | 1
2 | Rome Iris | 2
3 | Rome Eucalypt | 2
4 | Dallas Dogbane | 3
(4 rows)
我没有测试过,但我认为它可能是这样的:
SELECT
l.id,
l.nome,
SUM(CASE WHEN pl.id IS NOT NULL THEN 1 ELSE 0 END) as plants_count,
SUM(CASE WHEN e.id IS NOT NULL THEN 1 ELSE 0 END) as esp_count
FROM loc l
LEFT JOIN pl ON pl.loc = l.id
LEFT JOIN esp e ON e.loc = l.id
GROUP BY l.id,l.nome
重点是计算每种类型的非空 ID。
我创建了测试布局:
create table localities (id integer, loc_name text);
create table plants (plant_id integer, loc_id integer);
create table samples (sample_id integer, loc_id integer);
insert into localities select x, ('Loc ' || x::text) from generate_series(1, 12561) x ;
insert into plants select x, (random()*12561)::integer from generate_series(1, 17052) x;
insert into samples select x, (random()*12561)::integer from generate_series(1, 9211) x;
诀窍是从植物和样本中创建具有相同结构的中间体 table。在数据没有意义的地方(植物没有 sample_id),您添加 null:
select loc_id, plant_id, null as sample_id from plants
union all
select loc_id, null as plant_id, sample_id from samples
这个 table 具有统一的结构,然后您可以在其上聚合(我使用 WITH 使其更具可读性。):
with localities_used as (
select loc_id, plant_id, null as sample_id from plants
union all
select loc_id, null as plant_id, sample_id from samples)
select
localities_used.loc_id,
count(localities_used.plant_id) plant_count,
count(localities_used.sample_id) sample_count
from
localities_used
group by
localities_used.loc_id;
如果您需要来自地方的其他数据,您可以将它们加入聚合 table:
with localities_used as (
select loc_id, plant_id, null as sample_id from plants
union all
select loc_id, null as plant_id, sample_id from samples),
aggregated as (
select
localities_used.loc_id,
count(localities_used.plant_id) plant_count,
count(localities_used.sample_id) sample_count
from
localities_used
group by
localities_used.loc_id)
select * from aggregated left outer join localities on aggregated.loc_id = localities.id;
这在我的笔记本电脑上总共需要 75 毫秒。