如何将简单的 "semijoin" 查询从 SQL 查询转换为 JavaScript?

How to translate a simple "semijoin" query from a SQL query into JavaScript?

根据,我们有:

SELECT DISTINCT s.id
FROM students s
LEFT JOIN grades g ON g.student_id = s.id
WHERE g.student_id IS NOT NULL

其中使用了LEFT JOIN。但它可以用这样的“半连接”来实现:

SELECT s.id
FROM  students s
WHERE EXISTS (SELECT 1 FROM grades g
              WHERE g.student_id = s.id)

如何在 JavaScript 中编写这两个查询,给定两个集合 studentsgrades?我认为第一个是用“嵌套循环连接”实现的(或者可以用索引连接数据结构优化,但我还不知道)。

const query1Results = nestedJoin(students, grades, (s, g) => {
  return Boolean(g.student_id && g.student_id === s.id)
}).map(([s]) => s.id)

function nestedJoin(R, S, compare) {
  const out = []
  
  for (const r of R) {
    for (const s of S) {
      if (compare(r, s)) {
        out.push([ r, s ])
      }
    }
  }

  return out
}

第二个你会怎么做?

const query2Results = semiJoin(students, grades, (s, g) => {
  return g.student_id === s.id
}).map(([s]) => s.id)

function semiJoin(R, S) {
  const out = []
  
  for (const r of R) {
    for (const s of S) {
      if (compare(r, s)) {
        out.push([ r, s ])
      }
    }
  }

  return out
}

基本上我也是这样实现的

const students = [
  { id: 1 },
  { id: 2 },
  { id: 3 },
  { id: 4 },
  { id: 5 },
  { id: 6 },
]
const grades = [
  { id: 1, student_id: 1 },
  { id: 2, student_id: 1 },
  { id: 3, student_id: 1 },
  { id: 4, student_id: 1 },
  { id: 5, student_id: 1 },
  { id: 6, student_id: 1 },
  { id: 10, student_id: 2 },
  { id: 20, student_id: 2 },
  { id: 30, student_id: 2 },
  { id: 40, student_id: 2 },
  { id: 50, student_id: 2 },
  { id: 60, student_id: 2 },
  { id: 100, student_id: 3 },
  { id: 200, student_id: 3 },
  { id: 300, student_id: 3 },
  { id: 400, student_id: 3 },
  { id: 500, student_id: 3 },
  { id: 600, student_id: 3 },
]

const expectedOutput = [
  1, 2, 3
]

来自 U-SQL 对 semijoin 的定义...

Semijoins are U-SQL’s way [to] filter a rowset based on the inclusion of its rows in another rowset

这确实取决于您的 table 设计和索引,但假设您在 grades.student_id 上有一个,等效的 JS 可能看起来像

const students = [{"id":1},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6}]
const grades = [{"id":1,"student_id":1},{"id":2,"student_id":1},{"id":3,"student_id":1},{"id":4,"student_id":1},{"id":5,"student_id":1},{"id":6,"student_id":1},{"id":10,"student_id":2},{"id":20,"student_id":2},{"id":30,"student_id":2},{"id":40,"student_id":2},{"id":50,"student_id":2},{"id":60,"student_id":2},{"id":100,"student_id":3},{"id":200,"student_id":3},{"id":300,"student_id":3},{"id":400,"student_id":3},{"id":500,"student_id":3},{"id":600,"student_id":3}]

// index
const idxGradesStudentId = new Set(grades.map(({ student_id }) => student_id))

function semiJoin(R, S) {
  // WHERE EXISTS
  return R.filter(({ id }) => idxGradesStudentId.has(id))
    .map(({ id }) => id) // SELECT
}

console.log(semiJoin(students, grades))

因此,这不是嵌套循环,而是使用 grades.student_id 上的索引(由 Set 表示)来评估 EXISTS 子句(Set.prototype.has()),它具有O(1) 查找时间复杂度的好处。


没有索引,您仍然可以执行 exists 操作(通过 Array.prototype.some()),但显然性能会降低。

function semiJoin(R, S) {
  return R.filter(({ id }) =>
    S.some(({ student_id }) => student_id === id)
  ).map(({ id }) => id)
}

一些初步的评论:

  • 第一个查询在其 WHERE 子句中有一个 NOT NULL 条件,您没有在您的 JS 代码中专门翻译它。事实上,这样的条件抵消了outer join的特殊作用,使其充当了inner join。所以实际上第一个查询是:

    SELECT DISTINCT s.id
    FROM students s
    INNER JOIN grades g ON g.student_id = s.id
    
  • 所需的输出实际上应该是一个二维数组,因为通常一个 SQL 语句可以 select 多个表达式。所以每个输出行本身可以是一个数组。在这种情况下,我希望输出为 [[1], [2], [3]]

  • 实际上不可能准确翻译数据库引擎如何执行查询,因为数据库引擎通常会检查表和索引以确定查询将如何执行,并且其中一些可能是实时发生的,这意味着当涉及的表之一中的记录数量发生显着变化时,相同的查询甚至可能会以不同的方式执行。

    查询的执行方式(访问表的顺序,首先执行哪个连接,使用或不使用哪个索引,...)称为执行计划。这也符合 SQL 语言的原始精神:它应该让用户 说他们想要什么,而不是如何去做。

话虽如此,这个问题让我想到了一些流行的库,这些库允许使用一些混合语法来表达查询,其中 SQL 子句可以通过方法调用链接在一起。

例如,您的两个查询可以这样写:

res = select("s.id")
    .distinct()
    .from(students, "s")
    .from(grades, "g")
    .where("g.student_id", "=", "s.id")
    .exec();

并且:

res = select("s.id")
    .from(students, "s")
    .whereExists(
        select(1)
        .from(grades, "g")
        .where("g.student_id", "=", "s.id")
    )
    .exec();

这是 JavaScript 中的一个实现,它允许上述表达式起作用。它仅支持 SQL 的一小部分,足以处理这两个 SQL 语句中使用的结构:

class Query {
    constructor() {
        this.tables = {};
        this.selections = [];
        this.conditions = [];
        this.exists = [];
        this.isDistinct = false;
    }
    static parseField(field) {
        try { // Try to interpret the argument as a literal value (like 1)
            let value = JSON.parse(field);
            return {value};
        } catch(e) { // Otherwise interpret it as table.field
            let [table, column] = field.split(".");
            return {table, column};
        }
    }
    distinct() {
        this.isDistinct = true;
        return this;
    }
    from(records, alias) {
        this.tables[alias] = records;
        return this;
    }
    where(fieldA, operator, fieldB) {
        let {table: tableA, column: columnA} = Query.parseField(fieldA);
        let {table: tableB, column: columnB} = Query.parseField(fieldB);
        this.conditions.push({tableA, columnA, operator, tableB, columnB});
        return this;
    }
    whereExists(subquery) {
        this.exists.push({subquery, not: false});
        return this;
    }
    whereNotExists(subquery) {
        this.exists.push({subquery, not: true});
        return this;
    }
    exec(parentRecords={}) {
        // Perform cartesian product (by lack of index)
        let max = Object.values(this.tables).reduce((size, {length}) => size * length, 1);
        let results = [];
        for (let i = 0; i < max; i++) {
            let k = i;
            // Select a record from each table
            let records = {...parentRecords}; // Allow access to parent query
            for (let alias in this.tables) {
                let recId = k % this.tables[alias].length;
                k = (k - recId) / this.tables[alias].length;
                records[alias] = this.tables[alias][recId];
            }
            // Apply conditions
            let include = this.conditions.every(({tableA, columnA, operator, tableB, columnB}) => {
                let a = records[tableA][columnA];
                let b = records[tableB][columnB];
                if (operator == "=") return a === b;
                if (operator == "<") return a < b;
                if (operator == ">") return a > b;
                // ...etc
            }) && this.exists.every(({subquery, not}) => { // Where (NOT) EXIST
                return not === !subquery.exec(records).length;
            });
            if (include) {
                // Apply SELECT
                results.push(this.selections.map(({value, table, column}) => value ?? records[table][column]));
            }
        }
        // Apply DISTINCT
        if (this.isDistinct) {
            results = [...new Map(results.map(result => [JSON.stringify(result), result])).values()];
        }
        return results;
    }
}

function select(...fields) {
    let query = new Query;
    query.selections = fields.map(Query.parseField);
    return query;
}

// Example run

const students = [
  { id: 1 },
  { id: 2 },
  { id: 3 },
  { id: 4 },
  { id: 5 },
  { id: 6 },
]
const grades = [
  { id: 1, student_id: 1 },
  { id: 2, student_id: 1 },
  { id: 3, student_id: 1 },
  { id: 4, student_id: 1 },
  { id: 5, student_id: 1 },
  { id: 6, student_id: 1 },
  { id: 10, student_id: 2 },
  { id: 20, student_id: 2 },
  { id: 30, student_id: 2 },
  { id: 40, student_id: 2 },
  { id: 50, student_id: 2 },
  { id: 60, student_id: 2 },
  { id: 100, student_id: 3 },
  { id: 200, student_id: 3 },
  { id: 300, student_id: 3 },
  { id: 400, student_id: 3 },
  { id: 500, student_id: 3 },
  { id: 600, student_id: 3 },
]


let res1 = select("s.id")
        .distinct()
        .from(students, "s")
        .from(grades, "g")
        .where("g.student_id", "=", "s.id")
        .exec();

console.log(res1);

let res2 = select("s.id")
        .from(students, "s")
        .whereExists(
            select(1)
            .from(grades, "g")
            .where("g.student_id", "=", "s.id")
        )
        .exec();

console.log(res2);