在多对多关系表中查找完全匹配的组
Find exactly matched groups in a many to many relationship table
下表显示了课程和学生之间的多对多关系。
-- Junction table for the many-to-many relationship between courses and
-- students. The composite primary key allows each (course, student) pairing
-- to be stored only once.
CREATE TABLE CourseStudents
(
    CourseId  INT NOT NULL,
    StudentId INT NOT NULL,
    PRIMARY KEY (CourseId, StudentId)
);

-- Sample data: courses 1 and 2 both have students {1, 2}, courses 3 and 4
-- both have students {2, 3}, and course 5 has only student 1.
-- The column list is explicit so the insert does not depend on column order.
INSERT INTO CourseStudents (CourseId, StudentId)
VALUES
    (1, 1), (1, 2),
    (2, 1), (2, 2),
    (3, 3), (3, 2),
    (4, 3), (4, 2),
    (5, 1);
示例数据
| CourseId | StudentId |
|----------|-----------|
| 1 | 1 |
| 1 | 2 |
| 2 | 1 |
| 2 | 2 |
| 3 | 2 |
| 3 | 3 |
| 4 | 2 |
| 4 | 3 |
| 5 | 1 |
我正在寻找一个查询,用来返回拥有完全相同学生的所有课程。我已经写出了如下所示的查询。
-- Lists every set of two or more courses that have exactly the same students.
-- Output: one row per shared student set, with
--   StudentList          - comma-separated student ids, ascending
--   ExactMatchCourseList - comma-separated ids of the courses having that set,
--                          ascending
WITH CourseGroups AS
(
    -- One row per course, with its students collapsed into a comma-separated
    -- string. The inner ORDER BY makes the string canonical, so two courses
    -- with the same students always produce the same StudentList.
    SELECT c.CourseId,
           STUFF((
               -- VARCHAR(11) holds any INT including the sign.
               SELECT ',' + CAST(c2.StudentId AS VARCHAR(11))
               FROM CourseStudents c2
               WHERE c2.CourseId = c.CourseId
               ORDER BY c2.StudentId
               FOR XML PATH ('')), 1, 1, '') AS StudentList  -- STUFF drops the leading comma
    FROM CourseStudents c
    GROUP BY c.CourseId
)
SELECT cg.StudentList,
       STUFF((
           SELECT ',' + CAST(cg2.CourseId AS VARCHAR(11))
           FROM CourseGroups cg2
           WHERE cg2.StudentList = cg.StudentList
           -- Without this ORDER BY the order of ids in the list is not guaranteed.
           ORDER BY cg2.CourseId
           FOR XML PATH ('')), 1, 1, '') AS ExactMatchCourseList
FROM CourseGroups cg
GROUP BY cg.StudentList
-- Keep only student sets shared by at least two courses.
HAVING COUNT(*) > 1;
查询返回:
| StudentList | ExactMatchCourseList |
|-------------|----------------------|
| 1,2 | 1,2 |
| 2,3 | 3,4 |
以上结果很好,但我只需要 ExactMatchCourseList。
我正在处理的表有超过十亿行,所以我需要一个高效的查询,能够在几分钟的运行时间内找出所有匹配的课程。感谢任何帮助。
SqlFiddle
这将为您提供匹配的课程对列表,但如果有三门(或更多)课程的学生完全相同,那么您最终会得到一些额外的结果(每一种两两组合都会出现一行)。我没有时间进一步解决这个问题,但也许它能为您指明正确的方向:
-- Returns pairs of courses (CourseId_1 < CourseId_2) whose student sets are
-- identical. A group of three or more identical courses is reported as every
-- pairwise combination, one row per pair.
WITH CTE_CourseMatches AS (
    -- For each pair of distinct courses that share at least one student,
    -- count the students they have in common. The ">" condition emits each
    -- unordered pair once and excludes a course paired with itself.
    SELECT
        CS1.CourseId AS CourseId_1,
        CS2.CourseId AS CourseId_2,
        COUNT(*) AS cnt
    FROM
        CourseStudents CS1
        INNER JOIN CourseStudents CS2
            ON CS2.StudentId = CS1.StudentId
            AND CS2.CourseId > CS1.CourseId
    GROUP BY
        CS1.CourseId,
        CS2.CourseId
),
CTE_CourseCounts AS (
    -- Total number of students enrolled in each course.
    -- The column is spelled CourseId, matching the table definition, so the
    -- query also resolves under a case-sensitive collation.
    SELECT CourseId, COUNT(*) AS cnt
    FROM CourseStudents
    GROUP BY CourseId
)
SELECT
    CM.CourseId_1,
    CM.CourseId_2
FROM
    CTE_CourseMatches CM
    -- The sets are identical exactly when the number of shared students
    -- equals the total student count of both courses.
    INNER JOIN CTE_CourseCounts CC1 ON CC1.CourseId = CM.CourseId_1 AND CC1.cnt = CM.cnt
    INNER JOIN CTE_CourseCounts CC2 ON CC2.CourseId = CM.CourseId_2 AND CC2.cnt = CM.cnt;
这只会对您的 CourseStudents 表进行 2 次扫描,而不是您当前查询所做的 4 次。如果您在 CourseStudents 表的 CourseId 上添加索引,第一次扫描将只是一次索引扫描。它还使原来的 STUFF 对每门课程只运行一次,而不是对每个学生运行一次之后再按课程分组。我省略了最后那个 STUFF,因为我不确定您是否需要它,或者它只是您计算方式的副产品。
-- Work table holding each distinct CourseId that appears in CourseStudents,
-- so the per-course string aggregation that follows runs once per course.
CREATE TABLE #Course
(
    CourseId INT NOT NULL PRIMARY KEY
);

-- The column list is explicit so the insert does not depend on column order.
INSERT INTO #Course (CourseId)
SELECT CourseId
FROM CourseStudents
GROUP BY CourseId
ORDER BY CourseId;
-- Work table with one row per course and that course's students collapsed
-- into a comma-separated string of ascending student ids.
CREATE TABLE #CourseStudentList
(
    CourseId    INT          NOT NULL PRIMARY KEY,
    StudentList VARCHAR(MAX) NOT NULL
);

-- The column list is explicit so the insert does not depend on column order.
INSERT INTO #CourseStudentList (CourseId, StudentList)
SELECT
    c.CourseId,
    STUFF((
        -- VARCHAR(11) holds any INT including the sign.
        SELECT ',' + CAST(c2.StudentId AS VARCHAR(11))
        FROM CourseStudents c2
        WHERE c2.CourseId = c.CourseId
        -- Sorting makes the string canonical, so equal student sets give equal strings.
        ORDER BY c2.StudentId
        FOR XML PATH ('')), 1, 1, '') AS StudentList  -- STUFF drops the leading comma
FROM
    #Course c
ORDER BY
    c.CourseId;
-- Returns the courses whose student list is shared with at least one other
-- course, together with the number of courses having that list.
WITH ListCounts AS
(
    -- Tag every course with the number of courses that have the same StudentList.
    SELECT
        csl.CourseId,
        csl.StudentList,
        COUNT(*) OVER (PARTITION BY csl.StudentList) AS [Count]
    FROM
        #CourseStudentList csl
)
SELECT
    lc.CourseId,
    lc.StudentList,
    lc.[Count]
FROM
    ListCounts lc
WHERE
    lc.[Count] > 1
ORDER BY
    lc.StudentList;
下表显示了课程和学生之间的多对多关系。
-- Junction table for the many-to-many relationship between courses and
-- students. The composite primary key allows each (course, student) pairing
-- to be stored only once.
CREATE TABLE CourseStudents
(
    CourseId  INT NOT NULL,
    StudentId INT NOT NULL,
    PRIMARY KEY (CourseId, StudentId)
);

-- Sample data: courses 1 and 2 both have students {1, 2}, courses 3 and 4
-- both have students {2, 3}, and course 5 has only student 1.
-- The column list is explicit so the insert does not depend on column order.
INSERT INTO CourseStudents (CourseId, StudentId)
VALUES
    (1, 1), (1, 2),
    (2, 1), (2, 2),
    (3, 3), (3, 2),
    (4, 3), (4, 2),
    (5, 1);
示例数据
| CourseId | StudentId |
|----------|-----------|
| 1 | 1 |
| 1 | 2 |
| 2 | 1 |
| 2 | 2 |
| 3 | 2 |
| 3 | 3 |
| 4 | 2 |
| 4 | 3 |
| 5 | 1 |
我正在寻找一个查询,用来返回拥有完全相同学生的所有课程。我已经写出了如下所示的查询。
-- Lists every set of two or more courses that have exactly the same students.
-- Output: one row per shared student set, with
--   StudentList          - comma-separated student ids, ascending
--   ExactMatchCourseList - comma-separated ids of the courses having that set,
--                          ascending
WITH CourseGroups AS
(
    -- One row per course, with its students collapsed into a comma-separated
    -- string. The inner ORDER BY makes the string canonical, so two courses
    -- with the same students always produce the same StudentList.
    SELECT c.CourseId,
           STUFF((
               -- VARCHAR(11) holds any INT including the sign.
               SELECT ',' + CAST(c2.StudentId AS VARCHAR(11))
               FROM CourseStudents c2
               WHERE c2.CourseId = c.CourseId
               ORDER BY c2.StudentId
               FOR XML PATH ('')), 1, 1, '') AS StudentList  -- STUFF drops the leading comma
    FROM CourseStudents c
    GROUP BY c.CourseId
)
SELECT cg.StudentList,
       STUFF((
           SELECT ',' + CAST(cg2.CourseId AS VARCHAR(11))
           FROM CourseGroups cg2
           WHERE cg2.StudentList = cg.StudentList
           -- Without this ORDER BY the order of ids in the list is not guaranteed.
           ORDER BY cg2.CourseId
           FOR XML PATH ('')), 1, 1, '') AS ExactMatchCourseList
FROM CourseGroups cg
GROUP BY cg.StudentList
-- Keep only student sets shared by at least two courses.
HAVING COUNT(*) > 1;
查询返回:
| StudentList | ExactMatchCourseList |
|-------------|----------------------|
| 1,2 | 1,2 |
| 2,3 | 3,4 |
以上结果很好,但我只需要 ExactMatchCourseList。 我正在处理的表有超过十亿行,所以我需要一个高效的查询,能够在几分钟的运行时间内找出所有匹配的课程。感谢任何帮助。 SqlFiddle
这将为您提供匹配的课程对列表,但如果有三门(或更多)课程的学生完全相同,那么您最终会得到一些额外的结果(每一种两两组合都会出现一行)。我没有时间进一步解决这个问题,但也许它能为您指明正确的方向:
-- Returns pairs of courses (CourseId_1 < CourseId_2) whose student sets are
-- identical. A group of three or more identical courses is reported as every
-- pairwise combination, one row per pair.
WITH CTE_CourseMatches AS (
    -- For each pair of distinct courses that share at least one student,
    -- count the students they have in common. The ">" condition emits each
    -- unordered pair once and excludes a course paired with itself.
    SELECT
        CS1.CourseId AS CourseId_1,
        CS2.CourseId AS CourseId_2,
        COUNT(*) AS cnt
    FROM
        CourseStudents CS1
        INNER JOIN CourseStudents CS2
            ON CS2.StudentId = CS1.StudentId
            AND CS2.CourseId > CS1.CourseId
    GROUP BY
        CS1.CourseId,
        CS2.CourseId
),
CTE_CourseCounts AS (
    -- Total number of students enrolled in each course.
    -- The column is spelled CourseId, matching the table definition, so the
    -- query also resolves under a case-sensitive collation.
    SELECT CourseId, COUNT(*) AS cnt
    FROM CourseStudents
    GROUP BY CourseId
)
SELECT
    CM.CourseId_1,
    CM.CourseId_2
FROM
    CTE_CourseMatches CM
    -- The sets are identical exactly when the number of shared students
    -- equals the total student count of both courses.
    INNER JOIN CTE_CourseCounts CC1 ON CC1.CourseId = CM.CourseId_1 AND CC1.cnt = CM.cnt
    INNER JOIN CTE_CourseCounts CC2 ON CC2.CourseId = CM.CourseId_2 AND CC2.cnt = CM.cnt;
这只会对您的 CourseStudents 表进行 2 次扫描,而不是您当前查询所做的 4 次。如果您在 CourseStudents 表的 CourseId 上添加索引,第一次扫描将只是一次索引扫描。它还使原来的 STUFF 对每门课程只运行一次,而不是对每个学生运行一次之后再按课程分组。我省略了最后那个 STUFF,因为我不确定您是否需要它,或者它只是您计算方式的副产品。
-- Work table holding each distinct CourseId that appears in CourseStudents,
-- so the per-course string aggregation that follows runs once per course.
CREATE TABLE #Course
(
    CourseId INT NOT NULL PRIMARY KEY
);

-- The column list is explicit so the insert does not depend on column order.
INSERT INTO #Course (CourseId)
SELECT CourseId
FROM CourseStudents
GROUP BY CourseId
ORDER BY CourseId;
-- Work table with one row per course and that course's students collapsed
-- into a comma-separated string of ascending student ids.
CREATE TABLE #CourseStudentList
(
    CourseId    INT          NOT NULL PRIMARY KEY,
    StudentList VARCHAR(MAX) NOT NULL
);

-- The column list is explicit so the insert does not depend on column order.
INSERT INTO #CourseStudentList (CourseId, StudentList)
SELECT
    c.CourseId,
    STUFF((
        -- VARCHAR(11) holds any INT including the sign.
        SELECT ',' + CAST(c2.StudentId AS VARCHAR(11))
        FROM CourseStudents c2
        WHERE c2.CourseId = c.CourseId
        -- Sorting makes the string canonical, so equal student sets give equal strings.
        ORDER BY c2.StudentId
        FOR XML PATH ('')), 1, 1, '') AS StudentList  -- STUFF drops the leading comma
FROM
    #Course c
ORDER BY
    c.CourseId;
-- Returns the courses whose student list is shared with at least one other
-- course, together with the number of courses having that list.
WITH ListCounts AS
(
    -- Tag every course with the number of courses that have the same StudentList.
    SELECT
        csl.CourseId,
        csl.StudentList,
        COUNT(*) OVER (PARTITION BY csl.StudentList) AS [Count]
    FROM
        #CourseStudentList csl
)
SELECT
    lc.CourseId,
    lc.StudentList,
    lc.[Count]
FROM
    ListCounts lc
WHERE
    lc.[Count] > 1
ORDER BY
    lc.StudentList;