识别 SQLite table 中缺失的序列号
Identifying missing sequences in SQLite table
我有一个 table 有 1000 条记录。每条记录代表子文件夹中的一个文件以及与该文件相关的一些属性。感兴趣的字段/列如下:
__dirpath = 包含感兴趣文件的每个子文件夹的名称
track = 文件的序列号(这些应该是连续的,范围可以从一到任何数字)
我正在寻找与每个 __dirpath 相关的文件所代表的序列中缺失的数字。
列出序列中每个缺失数字的开始和结束的通用查询如下(来源:https://www.xaprb.com/blog/2005/12/06/find-missing-numbers-in-a-sequence-with-sql/):
-- Generic gap finder: every row whose successor (id + 1) is absent marks the
-- start of a gap; the gap ends just below the next id that does exist.
SELECT
    gaps.start,
    gaps.stop
FROM (
    SELECT
        cur.id + 1 AS start,
        (
            SELECT MIN(nxt.id) - 1
            FROM sequence AS nxt
            WHERE nxt.id > cur.id
        ) AS stop
    FROM sequence AS cur
    LEFT JOIN sequence AS succ
        ON cur.id = succ.id - 1
    WHERE succ.id IS NULL        -- anti-join: keep only rows with no successor
) AS gaps
WHERE gaps.stop IS NOT NULL      -- no larger id exists after the last row, so stop is NULL there
ORDER BY gaps.start, gaps.stop;
然而,在这种情况下,我需要对与具有相同 __dirpath 值的记录相关的每个序列执行相同的操作。假设序列 table 除了通用示例中的 id 字段外还有一个 __dirpath 列,该怎么做?
这是一个带有虚拟数据的 table,并且上述查询适用于此 table,而不考虑 __dirpath:
-- Demo fixture: a single folder 'A' whose ids are missing 5 and 11-14.
-- String literals use single quotes: SQLite parses double-quoted tokens as
-- identifiers and only falls back to treating them as strings through a
-- legacy quirk that can be switched off.
DROP TABLE IF EXISTS sequence;
CREATE TABLE sequence (__dirpath blob, id int NOT NULL);
INSERT INTO sequence (__dirpath, id) VALUES
    ('A', 1), ('A', 2), ('A', 3), ('A', 4), ('A', 6), ('A', 7), ('A', 8), ('A', 9),
    ('A', 10), ('A', 15), ('A', 16), ('A', 17), ('A', 18), ('A', 19), ('A', 20);
如果然后运行以下查询,您将获得正确的答案集:
-- Same gap finder with the folder shown next to each gap.
-- Note: dir is only carried through for display; neither the join nor the
-- subquery compares __dirpath, so ids from every folder are pooled together.
SELECT
    gaps.dir,
    gaps.start,
    gaps.stop
FROM (
    SELECT
        cur.id + 1 AS start,
        (
            SELECT MIN(nxt.id) - 1
            FROM sequence AS nxt
            WHERE nxt.id > cur.id
        ) AS stop,
        cur.__dirpath AS dir
    FROM sequence AS cur
    LEFT JOIN sequence AS succ
        ON cur.id = succ.id - 1
    WHERE succ.id IS NULL        -- anti-join: keep only rows with no successor
) AS gaps
WHERE gaps.stop IS NOT NULL      -- no larger id exists after the last row, so stop is NULL there
ORDER BY gaps.dir, gaps.start, gaps.stop;
结果正确如下:
dir | start | stop
A   | 5     | 5
A   | 11    | 14
如果随后向该 table 添加以下记录:
-- Second folder 'B'; its rows are not listed in id order.
-- Single-quoted literals: double quotes denote identifiers in SQLite and are
-- accepted as strings only through a legacy fallback that can be disabled.
INSERT INTO sequence (__dirpath, id) VALUES
    ('B', 1), ('B', 4), ('B', 5), ('B', 6), ('B', 7), ('B', 117), ('B', 14), ('B', 9),
    ('B', 10), ('B', 15), ('B', 16), ('B', 17), ('B', 18), ('B', 19), ('B', 20);
并重新运行上面的左外连接查询,结果就没有意义了,因为查询同时引用了 __dirpath = "A" 和 __dirpath = "B" 两组记录的 id 值,产生:
dir | start | stop
A   | 11    | 13
A   | 21    | 116
B   | 11    | 13
B   | 21    | 116
所以问题本质上是如何修改查询以引用仅与每个 __dirpath 条目相关的记录。
您必须在关联子查询(correlated subquery)和连接条件中都加入列 __dirpath:
-- Per-folder gap finder: both the successor lookup (join) and the
-- "next existing id" lookup (subquery) are restricted to the same __dirpath.
SELECT
    gaps.dir,
    gaps.start,
    gaps.stop
FROM (
    SELECT
        cur.id + 1 AS start,
        (
            SELECT MIN(nxt.id) - 1
            FROM sequence AS nxt
            WHERE nxt.__dirpath = cur.__dirpath
              AND nxt.id > cur.id
        ) AS stop,
        cur.__dirpath AS dir
    FROM sequence AS cur
    LEFT JOIN sequence AS succ
        ON cur.__dirpath = succ.__dirpath
       AND cur.id = succ.id - 1
    WHERE succ.id IS NULL        -- anti-join: no successor inside the same folder
) AS gaps
WHERE gaps.stop IS NOT NULL      -- the highest id of a folder has no larger id after it
ORDER BY gaps.dir, gaps.start, gaps.stop;
另一种使用 CTE 和窗口函数(window functions)的解决方案:
-- Per-folder gap finder built on window functions (SQLite 3.25 or newer).
-- Consecutive ids are collapsed into runs; each gap lies between one run's
-- highest id and the next run's lowest id.
WITH flagged AS (
    -- flag is 1 when this id does not directly follow the previous id of the
    -- same folder. The LAG default (id - 1) makes a folder's first row yield 0.
    SELECT
        __dirpath,
        id,
        id - 1 <> LAG(id, 1, id - 1) OVER (PARTITION BY __dirpath ORDER BY id) AS flag
    FROM sequence
),
numbered AS (
    -- The running sum of the flags gives every run of consecutive ids its own number.
    SELECT
        __dirpath,
        id,
        SUM(flag) OVER (PARTITION BY __dirpath ORDER BY id) AS grp
    FROM flagged
),
cte AS (
    -- One row per run: its lowest and highest id.
    SELECT
        __dirpath,
        grp,
        MIN(id) AS min_id,
        MAX(id) AS max_id
    FROM numbered
    GROUP BY __dirpath, grp
)
-- cte is already unique per (__dirpath, grp), so each run pairs with at most
-- one following run and no further aggregation is needed.
SELECT
    c1.__dirpath,
    c1.max_id + 1 AS start,
    c2.min_id - 1 AS stop
FROM cte AS c1
INNER JOIN cte AS c2
    ON c2.__dirpath = c1.__dirpath
   AND c2.grp = c1.grp + 1
-- Explicit ordering keeps the output deterministic.
ORDER BY c1.__dirpath, start, stop;
参见demo。
我有一个 table 有 1000 条记录。每条记录代表子文件夹中的一个文件以及与该文件相关的一些属性。感兴趣的字段/列如下:
__dirpath = 包含感兴趣文件的每个子文件夹的名称;track = 文件的序列号(这些应该是连续的,范围可以从一到任何数字)
我正在寻找与每个 __dirpath 相关的文件所代表的序列中缺失的数字。列出序列中每个缺失数字的开始和结束的通用查询如下(来源:https://www.xaprb.com/blog/2005/12/06/find-missing-numbers-in-a-sequence-with-sql/):
-- Generic gap finder: every row whose successor (id + 1) is absent marks the
-- start of a gap; the gap ends just below the next id that does exist.
SELECT
    gaps.start,
    gaps.stop
FROM (
    SELECT
        cur.id + 1 AS start,
        (
            SELECT MIN(nxt.id) - 1
            FROM sequence AS nxt
            WHERE nxt.id > cur.id
        ) AS stop
    FROM sequence AS cur
    LEFT JOIN sequence AS succ
        ON cur.id = succ.id - 1
    WHERE succ.id IS NULL        -- anti-join: keep only rows with no successor
) AS gaps
WHERE gaps.stop IS NOT NULL      -- no larger id exists after the last row, so stop is NULL there
ORDER BY gaps.start, gaps.stop;
然而,在这种情况下,我需要对与具有相同 __dirpath 值的记录相关的每个序列执行相同的操作。假设序列 table 除了通用示例中的 id 字段外还有一个 __dirpath 列,该怎么做?
这是一个带有虚拟数据的 table,并且上述查询适用于此 table,而不考虑 __dirpath:
-- Demo fixture: a single folder 'A' whose ids are missing 5 and 11-14.
-- String literals use single quotes: SQLite parses double-quoted tokens as
-- identifiers and only falls back to treating them as strings through a
-- legacy quirk that can be switched off.
DROP TABLE IF EXISTS sequence;
CREATE TABLE sequence (__dirpath blob, id int NOT NULL);
INSERT INTO sequence (__dirpath, id) VALUES
    ('A', 1), ('A', 2), ('A', 3), ('A', 4), ('A', 6), ('A', 7), ('A', 8), ('A', 9),
    ('A', 10), ('A', 15), ('A', 16), ('A', 17), ('A', 18), ('A', 19), ('A', 20);
如果然后运行以下查询,您将获得正确的答案集:
-- Same gap finder with the folder shown next to each gap.
-- Note: dir is only carried through for display; neither the join nor the
-- subquery compares __dirpath, so ids from every folder are pooled together.
SELECT
    gaps.dir,
    gaps.start,
    gaps.stop
FROM (
    SELECT
        cur.id + 1 AS start,
        (
            SELECT MIN(nxt.id) - 1
            FROM sequence AS nxt
            WHERE nxt.id > cur.id
        ) AS stop,
        cur.__dirpath AS dir
    FROM sequence AS cur
    LEFT JOIN sequence AS succ
        ON cur.id = succ.id - 1
    WHERE succ.id IS NULL        -- anti-join: keep only rows with no successor
) AS gaps
WHERE gaps.stop IS NOT NULL      -- no larger id exists after the last row, so stop is NULL there
ORDER BY gaps.dir, gaps.start, gaps.stop;
结果正确如下:
dir | start | stop
A   | 5     | 5
A   | 11    | 14
如果随后向该 table 添加以下记录:
-- Second folder 'B'; its rows are not listed in id order.
-- Single-quoted literals: double quotes denote identifiers in SQLite and are
-- accepted as strings only through a legacy fallback that can be disabled.
INSERT INTO sequence (__dirpath, id) VALUES
    ('B', 1), ('B', 4), ('B', 5), ('B', 6), ('B', 7), ('B', 117), ('B', 14), ('B', 9),
    ('B', 10), ('B', 15), ('B', 16), ('B', 17), ('B', 18), ('B', 19), ('B', 20);
并重新运行上面的左外连接查询,结果就没有意义了,因为查询同时引用了 __dirpath = "A" 和 __dirpath = "B" 两组记录的 id 值,产生:
dir | start | stop
A   | 11    | 13
A   | 21    | 116
B   | 11    | 13
B   | 21    | 116
所以问题本质上是如何修改查询以引用仅与每个 __dirpath 条目相关的记录。
您必须在关联子查询(correlated subquery)和连接条件中都加入列 __dirpath:
-- Per-folder gap finder: both the successor lookup (join) and the
-- "next existing id" lookup (subquery) are restricted to the same __dirpath.
SELECT
    gaps.dir,
    gaps.start,
    gaps.stop
FROM (
    SELECT
        cur.id + 1 AS start,
        (
            SELECT MIN(nxt.id) - 1
            FROM sequence AS nxt
            WHERE nxt.__dirpath = cur.__dirpath
              AND nxt.id > cur.id
        ) AS stop,
        cur.__dirpath AS dir
    FROM sequence AS cur
    LEFT JOIN sequence AS succ
        ON cur.__dirpath = succ.__dirpath
       AND cur.id = succ.id - 1
    WHERE succ.id IS NULL        -- anti-join: no successor inside the same folder
) AS gaps
WHERE gaps.stop IS NOT NULL      -- the highest id of a folder has no larger id after it
ORDER BY gaps.dir, gaps.start, gaps.stop;
另一种使用 CTE 和窗口函数(window functions)的解决方案:
-- Per-folder gap finder built on window functions (SQLite 3.25 or newer).
-- Consecutive ids are collapsed into runs; each gap lies between one run's
-- highest id and the next run's lowest id.
WITH flagged AS (
    -- flag is 1 when this id does not directly follow the previous id of the
    -- same folder. The LAG default (id - 1) makes a folder's first row yield 0.
    SELECT
        __dirpath,
        id,
        id - 1 <> LAG(id, 1, id - 1) OVER (PARTITION BY __dirpath ORDER BY id) AS flag
    FROM sequence
),
numbered AS (
    -- The running sum of the flags gives every run of consecutive ids its own number.
    SELECT
        __dirpath,
        id,
        SUM(flag) OVER (PARTITION BY __dirpath ORDER BY id) AS grp
    FROM flagged
),
cte AS (
    -- One row per run: its lowest and highest id.
    SELECT
        __dirpath,
        grp,
        MIN(id) AS min_id,
        MAX(id) AS max_id
    FROM numbered
    GROUP BY __dirpath, grp
)
-- cte is already unique per (__dirpath, grp), so each run pairs with at most
-- one following run and no further aggregation is needed.
SELECT
    c1.__dirpath,
    c1.max_id + 1 AS start,
    c2.min_id - 1 AS stop
FROM cte AS c1
INNER JOIN cte AS c2
    ON c2.__dirpath = c1.__dirpath
   AND c2.grp = c1.grp + 1
-- Explicit ordering keeps the output deterministic.
ORDER BY c1.__dirpath, start, stop;
参见demo。