如何将简单的 "semijoin" 查询从 SQL 查询转换为 JavaScript?
How to translate a simple "semijoin" query from a SQL query into JavaScript?
根据,我们有:
SELECT DISTINCT s.id
FROM students s
LEFT JOIN grades g ON g.student_id = s.id
WHERE g.student_id IS NOT NULL
其中使用了LEFT JOIN
。但它可以用这样的“半连接”来实现:
SELECT s.id
FROM students s
WHERE EXISTS (SELECT 1 FROM grades g
WHERE g.student_id = s.id)
如何在 JavaScript 中编写这两个查询,给定两个集合 students
和 grades
?我认为第一个是用“嵌套循环连接”实现的(或者可以用索引连接数据结构优化,但我还不知道)。
const query1Results = nestedJoin(students, grades, (s, g) => {
return Boolean(g.student_id && g.student_id === s.id)
}).map(([s]) => s.id)
function nestedJoin(R, S, compare) {
const out = []
for (const r of R) {
for (const s of S) {
if (compare(r, s)) {
out.push([ r, s ])
}
}
}
return out
}
第二个你会怎么做?
const query2Results = semiJoin(students, grades, (s, g) => {
return g.student_id === s.id
}).map(([s]) => s.id)
function semiJoin(R, S) {
const out = []
for (const r of R) {
for (const s of S) {
if (compare(r, s)) {
out.push([ r, s ])
}
}
}
return out
}
基本上我也是这样实现的
const students = [
{ id: 1 },
{ id: 2 },
{ id: 3 },
{ id: 4 },
{ id: 5 },
{ id: 6 },
]
const grades = [
{ id: 1, student_id: 1 },
{ id: 2, student_id: 1 },
{ id: 3, student_id: 1 },
{ id: 4, student_id: 1 },
{ id: 5, student_id: 1 },
{ id: 6, student_id: 1 },
{ id: 10, student_id: 2 },
{ id: 20, student_id: 2 },
{ id: 30, student_id: 2 },
{ id: 40, student_id: 2 },
{ id: 50, student_id: 2 },
{ id: 60, student_id: 2 },
{ id: 100, student_id: 3 },
{ id: 200, student_id: 3 },
{ id: 300, student_id: 3 },
{ id: 400, student_id: 3 },
{ id: 500, student_id: 3 },
{ id: 600, student_id: 3 },
]
const expectedOutput = [
1, 2, 3
]
来自 U-SQL 对 semijoin 的定义...
Semijoins are U-SQL’s way [to] filter a rowset based on the inclusion of its rows in another rowset
这确实取决于您的 table 设计和索引,但假设您在 grades.student_id
上有一个,等效的 JS 可能看起来像
const students = [{"id":1},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6}]
const grades = [{"id":1,"student_id":1},{"id":2,"student_id":1},{"id":3,"student_id":1},{"id":4,"student_id":1},{"id":5,"student_id":1},{"id":6,"student_id":1},{"id":10,"student_id":2},{"id":20,"student_id":2},{"id":30,"student_id":2},{"id":40,"student_id":2},{"id":50,"student_id":2},{"id":60,"student_id":2},{"id":100,"student_id":3},{"id":200,"student_id":3},{"id":300,"student_id":3},{"id":400,"student_id":3},{"id":500,"student_id":3},{"id":600,"student_id":3}]
// index
const idxGradesStudentId = new Set(grades.map(({ student_id }) => student_id))
function semiJoin(R, S) {
// WHERE EXISTS
return R.filter(({ id }) => idxGradesStudentId.has(id))
.map(({ id }) => id) // SELECT
}
console.log(semiJoin(students, grades))
因此,这不是嵌套循环,而是使用 grades.student_id
上的索引(由 Set 表示)来评估 EXISTS
子句(Set.prototype.has()
),它具有O(1) 查找时间复杂度的好处。
没有索引,您仍然可以执行 exists 操作(通过 Array.prototype.some()),但显然性能会降低。
function semiJoin(R, S) {
return R.filter(({ id }) =>
S.some(({ student_id }) => student_id === id)
).map(({ id }) => id)
}
一些初步的评论:
第一个查询在其 WHERE
子句中有一个 NOT NULL
条件,您没有在您的 JS 代码中专门翻译它。事实上,这样的条件抵消了outer join的特殊作用,使其充当了inner join。所以实际上第一个查询是:
SELECT DISTINCT s.id
FROM students s
INNER JOIN grades g ON g.student_id = s.id
所需的输出实际上应该是一个二维数组,因为通常一个 SQL 语句可以 select 多个表达式。所以每个输出行本身可以是一个数组。在这种情况下,我希望输出为 [[1], [2], [3]]
实际上不可能准确翻译数据库引擎如何执行查询,因为数据库引擎通常会检查表和索引以确定查询将如何执行,并且其中一些可能是实时发生的,这意味着当涉及的表之一中的记录数量发生显着变化时,相同的查询甚至可能会以不同的方式执行。
查询的执行方式(访问表的顺序,首先执行哪个连接,使用或不使用哪个索引,...)称为执行计划。这也符合
SQL 语言的原始精神:它应该让用户
说他们想要什么,而不是如何去做。
话虽如此,这个问题让我想到了一些流行的库,这些库允许使用一些混合语法来表达查询,其中 SQL 子句可以通过方法调用链接在一起。
例如,您的两个查询可以这样写:
res = select("s.id")
.distinct()
.from(students, "s")
.from(grades, "g")
.where("g.student_id", "=", "s.id")
.exec();
并且:
res = select("s.id")
.from(students, "s")
.whereExists(
select(1)
.from(grades, "g")
.where("g.student_id", "=", "s.id")
)
.exec();
这是 JavaScript 中的一个实现,它允许上述表达式起作用。它仅支持 SQL 的一小部分,足以处理这两个 SQL 语句中使用的结构:
class Query {
constructor() {
this.tables = {};
this.selections = [];
this.conditions = [];
this.exists = [];
this.isDistinct = false;
}
static parseField(field) {
try { // Try to interpret the argument as a literal value (like 1)
let value = JSON.parse(field);
return {value};
} catch(e) { // Otherwise interpret it as table.field
let [table, column] = field.split(".");
return {table, column};
}
}
distinct() {
this.isDistinct = true;
return this;
}
from(records, alias) {
this.tables[alias] = records;
return this;
}
where(fieldA, operator, fieldB) {
let {table: tableA, column: columnA} = Query.parseField(fieldA);
let {table: tableB, column: columnB} = Query.parseField(fieldB);
this.conditions.push({tableA, columnA, operator, tableB, columnB});
return this;
}
whereExists(subquery) {
this.exists.push({subquery, not: false});
return this;
}
whereNotExists(subquery) {
this.exists.push({subquery, not: true});
return this;
}
exec(parentRecords={}) {
// Perform cartesian product (by lack of index)
let max = Object.values(this.tables).reduce((size, {length}) => size * length, 1);
let results = [];
for (let i = 0; i < max; i++) {
let k = i;
// Select a record from each table
let records = {...parentRecords}; // Allow access to parent query
for (let alias in this.tables) {
let recId = k % this.tables[alias].length;
k = (k - recId) / this.tables[alias].length;
records[alias] = this.tables[alias][recId];
}
// Apply conditions
let include = this.conditions.every(({tableA, columnA, operator, tableB, columnB}) => {
let a = records[tableA][columnA];
let b = records[tableB][columnB];
if (operator == "=") return a === b;
if (operator == "<") return a < b;
if (operator == ">") return a > b;
// ...etc
}) && this.exists.every(({subquery, not}) => { // Where (NOT) EXIST
return not === !subquery.exec(records).length;
});
if (include) {
// Apply SELECT
results.push(this.selections.map(({value, table, column}) => value ?? records[table][column]));
}
}
// Apply DISTINCT
if (this.isDistinct) {
results = [...new Map(results.map(result => [JSON.stringify(result), result])).values()];
}
return results;
}
}
function select(...fields) {
let query = new Query;
query.selections = fields.map(Query.parseField);
return query;
}
// Example run
const students = [
{ id: 1 },
{ id: 2 },
{ id: 3 },
{ id: 4 },
{ id: 5 },
{ id: 6 },
]
const grades = [
{ id: 1, student_id: 1 },
{ id: 2, student_id: 1 },
{ id: 3, student_id: 1 },
{ id: 4, student_id: 1 },
{ id: 5, student_id: 1 },
{ id: 6, student_id: 1 },
{ id: 10, student_id: 2 },
{ id: 20, student_id: 2 },
{ id: 30, student_id: 2 },
{ id: 40, student_id: 2 },
{ id: 50, student_id: 2 },
{ id: 60, student_id: 2 },
{ id: 100, student_id: 3 },
{ id: 200, student_id: 3 },
{ id: 300, student_id: 3 },
{ id: 400, student_id: 3 },
{ id: 500, student_id: 3 },
{ id: 600, student_id: 3 },
]
let res1 = select("s.id")
.distinct()
.from(students, "s")
.from(grades, "g")
.where("g.student_id", "=", "s.id")
.exec();
console.log(res1);
let res2 = select("s.id")
.from(students, "s")
.whereExists(
select(1)
.from(grades, "g")
.where("g.student_id", "=", "s.id")
)
.exec();
console.log(res2);
根据
SELECT DISTINCT s.id
FROM students s
LEFT JOIN grades g ON g.student_id = s.id
WHERE g.student_id IS NOT NULL
其中使用了LEFT JOIN
。但它可以用这样的“半连接”来实现:
SELECT s.id
FROM students s
WHERE EXISTS (SELECT 1 FROM grades g
WHERE g.student_id = s.id)
如何在 JavaScript 中编写这两个查询,给定两个集合 students
和 grades
?我认为第一个是用“嵌套循环连接”实现的(或者可以用索引连接数据结构优化,但我还不知道
const query1Results = nestedJoin(students, grades, (s, g) => {
return Boolean(g.student_id && g.student_id === s.id)
}).map(([s]) => s.id)
function nestedJoin(R, S, compare) {
const out = []
for (const r of R) {
for (const s of S) {
if (compare(r, s)) {
out.push([ r, s ])
}
}
}
return out
}
第二个你会怎么做?
const query2Results = semiJoin(students, grades, (s, g) => {
return g.student_id === s.id
}).map(([s]) => s.id)
function semiJoin(R, S) {
const out = []
for (const r of R) {
for (const s of S) {
if (compare(r, s)) {
out.push([ r, s ])
}
}
}
return out
}
基本上我也是这样实现的
const students = [
{ id: 1 },
{ id: 2 },
{ id: 3 },
{ id: 4 },
{ id: 5 },
{ id: 6 },
]
const grades = [
{ id: 1, student_id: 1 },
{ id: 2, student_id: 1 },
{ id: 3, student_id: 1 },
{ id: 4, student_id: 1 },
{ id: 5, student_id: 1 },
{ id: 6, student_id: 1 },
{ id: 10, student_id: 2 },
{ id: 20, student_id: 2 },
{ id: 30, student_id: 2 },
{ id: 40, student_id: 2 },
{ id: 50, student_id: 2 },
{ id: 60, student_id: 2 },
{ id: 100, student_id: 3 },
{ id: 200, student_id: 3 },
{ id: 300, student_id: 3 },
{ id: 400, student_id: 3 },
{ id: 500, student_id: 3 },
{ id: 600, student_id: 3 },
]
const expectedOutput = [
1, 2, 3
]
来自 U-SQL 对 semijoin 的定义...
Semijoins are U-SQL’s way [to] filter a rowset based on the inclusion of its rows in another rowset
这确实取决于您的 table 设计和索引,但假设您在 grades.student_id
上有一个,等效的 JS 可能看起来像
const students = [{"id":1},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6}]
const grades = [{"id":1,"student_id":1},{"id":2,"student_id":1},{"id":3,"student_id":1},{"id":4,"student_id":1},{"id":5,"student_id":1},{"id":6,"student_id":1},{"id":10,"student_id":2},{"id":20,"student_id":2},{"id":30,"student_id":2},{"id":40,"student_id":2},{"id":50,"student_id":2},{"id":60,"student_id":2},{"id":100,"student_id":3},{"id":200,"student_id":3},{"id":300,"student_id":3},{"id":400,"student_id":3},{"id":500,"student_id":3},{"id":600,"student_id":3}]
// index
const idxGradesStudentId = new Set(grades.map(({ student_id }) => student_id))
function semiJoin(R, S) {
// WHERE EXISTS
return R.filter(({ id }) => idxGradesStudentId.has(id))
.map(({ id }) => id) // SELECT
}
console.log(semiJoin(students, grades))
因此,这不是嵌套循环,而是使用 grades.student_id
上的索引(由 Set 表示)来评估 EXISTS
子句(Set.prototype.has()
),它具有O(1) 查找时间复杂度的好处。
没有索引,您仍然可以执行 exists 操作(通过 Array.prototype.some()),但显然性能会降低。
function semiJoin(R, S) {
return R.filter(({ id }) =>
S.some(({ student_id }) => student_id === id)
).map(({ id }) => id)
}
一些初步的评论:
第一个查询在其
WHERE
子句中有一个NOT NULL
条件,您没有在您的 JS 代码中专门翻译它。事实上,这样的条件抵消了outer join的特殊作用,使其充当了inner join。所以实际上第一个查询是:SELECT DISTINCT s.id FROM students s INNER JOIN grades g ON g.student_id = s.id
所需的输出实际上应该是一个二维数组,因为通常一个 SQL 语句可以 select 多个表达式。所以每个输出行本身可以是一个数组。在这种情况下,我希望输出为
[[1], [2], [3]]
实际上不可能准确翻译数据库引擎如何执行查询,因为数据库引擎通常会检查表和索引以确定查询将如何执行,并且其中一些可能是实时发生的,这意味着当涉及的表之一中的记录数量发生显着变化时,相同的查询甚至可能会以不同的方式执行。
查询的执行方式(访问表的顺序,首先执行哪个连接,使用或不使用哪个索引,...)称为执行计划。这也符合 SQL 语言的原始精神:它应该让用户 说他们想要什么,而不是如何去做。
话虽如此,这个问题让我想到了一些流行的库,这些库允许使用一些混合语法来表达查询,其中 SQL 子句可以通过方法调用链接在一起。
例如,您的两个查询可以这样写:
res = select("s.id")
.distinct()
.from(students, "s")
.from(grades, "g")
.where("g.student_id", "=", "s.id")
.exec();
并且:
res = select("s.id")
.from(students, "s")
.whereExists(
select(1)
.from(grades, "g")
.where("g.student_id", "=", "s.id")
)
.exec();
这是 JavaScript 中的一个实现,它允许上述表达式起作用。它仅支持 SQL 的一小部分,足以处理这两个 SQL 语句中使用的结构:
class Query {
constructor() {
this.tables = {};
this.selections = [];
this.conditions = [];
this.exists = [];
this.isDistinct = false;
}
static parseField(field) {
try { // Try to interpret the argument as a literal value (like 1)
let value = JSON.parse(field);
return {value};
} catch(e) { // Otherwise interpret it as table.field
let [table, column] = field.split(".");
return {table, column};
}
}
distinct() {
this.isDistinct = true;
return this;
}
from(records, alias) {
this.tables[alias] = records;
return this;
}
where(fieldA, operator, fieldB) {
let {table: tableA, column: columnA} = Query.parseField(fieldA);
let {table: tableB, column: columnB} = Query.parseField(fieldB);
this.conditions.push({tableA, columnA, operator, tableB, columnB});
return this;
}
whereExists(subquery) {
this.exists.push({subquery, not: false});
return this;
}
whereNotExists(subquery) {
this.exists.push({subquery, not: true});
return this;
}
exec(parentRecords={}) {
// Perform cartesian product (by lack of index)
let max = Object.values(this.tables).reduce((size, {length}) => size * length, 1);
let results = [];
for (let i = 0; i < max; i++) {
let k = i;
// Select a record from each table
let records = {...parentRecords}; // Allow access to parent query
for (let alias in this.tables) {
let recId = k % this.tables[alias].length;
k = (k - recId) / this.tables[alias].length;
records[alias] = this.tables[alias][recId];
}
// Apply conditions
let include = this.conditions.every(({tableA, columnA, operator, tableB, columnB}) => {
let a = records[tableA][columnA];
let b = records[tableB][columnB];
if (operator == "=") return a === b;
if (operator == "<") return a < b;
if (operator == ">") return a > b;
// ...etc
}) && this.exists.every(({subquery, not}) => { // Where (NOT) EXIST
return not === !subquery.exec(records).length;
});
if (include) {
// Apply SELECT
results.push(this.selections.map(({value, table, column}) => value ?? records[table][column]));
}
}
// Apply DISTINCT
if (this.isDistinct) {
results = [...new Map(results.map(result => [JSON.stringify(result), result])).values()];
}
return results;
}
}
function select(...fields) {
let query = new Query;
query.selections = fields.map(Query.parseField);
return query;
}
// Example run
const students = [
{ id: 1 },
{ id: 2 },
{ id: 3 },
{ id: 4 },
{ id: 5 },
{ id: 6 },
]
const grades = [
{ id: 1, student_id: 1 },
{ id: 2, student_id: 1 },
{ id: 3, student_id: 1 },
{ id: 4, student_id: 1 },
{ id: 5, student_id: 1 },
{ id: 6, student_id: 1 },
{ id: 10, student_id: 2 },
{ id: 20, student_id: 2 },
{ id: 30, student_id: 2 },
{ id: 40, student_id: 2 },
{ id: 50, student_id: 2 },
{ id: 60, student_id: 2 },
{ id: 100, student_id: 3 },
{ id: 200, student_id: 3 },
{ id: 300, student_id: 3 },
{ id: 400, student_id: 3 },
{ id: 500, student_id: 3 },
{ id: 600, student_id: 3 },
]
let res1 = select("s.id")
.distinct()
.from(students, "s")
.from(grades, "g")
.where("g.student_id", "=", "s.id")
.exec();
console.log(res1);
let res2 = select("s.id")
.from(students, "s")
.whereExists(
select(1)
.from(grades, "g")
.where("g.student_id", "=", "s.id")
)
.exec();
console.log(res2);