高性能空间连接——使用 SQL Geography 函数查找最近的记录
Performant spatial join - closest record using SQL Geography functions
在 SQL Server 2012 中优化最近邻空间匹配(Geography 类型的 STDistance)的性能时,我发现逐步增大搜索半径可以提升在我们数据集上的查询性能。
我目前使用下面的三步查询,并希望把它改写成最大递归深度为 3 的递归 CTE,其中每一层的距离阈值为 [LEVEL]*500。
-- Pass 1: for every NewWork row, pick the single nearest CurrWork row whose
-- STDistance from it is at most 500 and store the pair in #MatchWork500.
-- (Distance is presumably in meters, per the Meters alias -- confirm the SRID.)
-- NewWork rows with no candidate inside the radius produce no output row,
-- because CROSS APPLY discards outer rows whose inner query is empty.
SELECT
    nw.WorkID,
    nearest.WorkID AS MatchWorkID,
    nw.Location.STDistance(nearest.Location) AS Meters
INTO #MatchWork500
FROM NewWork AS nw
CROSS APPLY (
    SELECT TOP (1)
        cw.WorkID,
        cw.Location
    FROM CurrWork AS cw
    WHERE nw.Location.STDistance(cw.Location) <= 500
    ORDER BY nw.Location.STDistance(cw.Location)
) AS nearest;
-- Pass 2: widen the search radius to 1000, but only for NewWork rows that
-- did not get a match in #MatchWork500. Results go to #MatchWork1000.
SELECT
    nw.WorkID,
    nearest.WorkID AS MatchWorkID,
    nw.Location.STDistance(nearest.Location) AS Meters
INTO #MatchWork1000
FROM NewWork AS nw
CROSS APPLY (
    SELECT TOP (1)
        cw.WorkID,
        cw.Location
    FROM CurrWork AS cw
    WHERE nw.Location.STDistance(cw.Location) <= 1000
    ORDER BY nw.Location.STDistance(cw.Location)
) AS nearest
-- Anti-join: keep only rows with no entry from the 500 pass.
WHERE NOT EXISTS (
    SELECT 1
    FROM #MatchWork500 AS m500
    WHERE m500.WorkID = nw.WorkID
);
-- Pass 3: widen the search radius to 1500 for NewWork rows that were matched
-- by neither the 500 pass nor the 1000 pass.
-- The target is #MatchWork1500: writing INTO #MatchWork1000 (as originally
-- written) fails because that temp table was already created by pass 2 and is
-- also read by this very statement.
SELECT
    N.WorkID,
    M.WorkID AS MatchWorkID,
    N.Location.STDistance(M.Location) AS Meters
INTO
    #MatchWork1500
FROM
    NewWork N
LEFT JOIN
    #MatchWork500 M500 ON M500.WorkID = N.WorkID
LEFT JOIN
    #MatchWork1000 M1000 ON M1000.WorkID = N.WorkID
CROSS APPLY
    (SELECT TOP (1)
        C.WorkID, C.Location
    FROM
        CurrWork C
    WHERE
        N.Location.STDistance(C.Location) <= 1500
    ORDER BY
        N.Location.STDistance(C.Location)) AS M
WHERE
    -- Anti-join: only rows still unmatched after both earlier passes.
    M500.WorkID IS NULL
    AND M1000.WorkID IS NULL;
每次迭代只需处理此前尚未匹配到的记录,最好能用单个 CTE 实现。请不要给出基于存储过程的答案,感谢帮助。
我最终为此找到了一个高性能的解决方案(剧透:我没能找到高性能的 CTE 方案)。第一步是改进 CurrWork 上的空间索引。所涉及的两个数据集都有 Geography 列,坐标系为 WGS 1984,供参考。下面的索引设置对大多数基于英国的空间数据操作都非常有效,尤其适用于建筑物/地址级别的位置:
-- Spatial index on CurrWork.Location using the geography grid tessellation.
CREATE SPATIAL INDEX [IDX_Location]
ON [CurrWork] ([Location])
USING GEOGRAPHY_GRID
WITH (
    -- Grid density for each of the four tessellation levels.
    GRIDS = (
        LEVEL_1 = HIGH,
        LEVEL_2 = MEDIUM,
        LEVEL_3 = MEDIUM,
        LEVEL_4 = MEDIUM
    ),
    -- Maximum number of tessellation cells recorded per spatial object.
    CELLS_PER_OBJECT = 3,
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    SORT_IN_TEMPDB = OFF,
    DROP_EXISTING = OFF,
    ONLINE = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS = ON
)
ON [PRIMARY]
GO
我发现在我的场景中必须使用索引提示(index hint),不过这并非总是必需的:
-- Table hint: force the optimizer to use the spatial index IDX_Location on CurrWork.
FROM CurrWork AS C WITH(INDEX([IDX_Location]))
为了找到最近的 CurrWork 记录,我发现先按距离进行排名是很有效的第一步:
-- Rank the CurrWork candidates of each NewWork row by distance, with C.WorkID as the secondary sort key.
RANK() OVER(PARTITION BY N.WorkID ORDER BY C.Location.STDistance(N.Location),C.WorkID) rnk
然后在连接条件中使用 M.rnk = 1,将结果限制为仅保留最近的那条记录。完整代码如下……希望对大家有所帮助!
-- Returns every NewWork row together with its closest CurrWork row within
-- @rad; NewWork rows without any candidate inside the radius are still
-- returned, with NULL MatchWorkID and Meters (LEFT JOIN).
DECLARE @rad FLOAT = 10000; --10km search radius

WITH RankedCandidates AS (
    -- All (NewWork, CurrWork) pairs closer than @rad, ranked per NewWork row
    -- by distance, with the CurrWork WorkID as the secondary sort key.
    SELECT
        src.WorkID,
        cw.WorkID AS MatchWorkID,
        cw.Location.STDistance(src.Location) AS Meters,
        RANK() OVER (
            PARTITION BY src.WorkID
            ORDER BY cw.Location.STDistance(src.Location), cw.WorkID
        ) AS rnk
    -- The hint forces use of the spatial index on CurrWork.
    FROM CurrWork AS cw WITH (INDEX([IDX_Location]))
    INNER JOIN NewWork AS src
        ON cw.Location.STDistance(src.Location) < @rad
)
SELECT
    nw.WorkID,
    best.MatchWorkID,
    best.Meters
FROM NewWork AS nw
LEFT JOIN RankedCandidates AS best
    ON best.WorkID = nw.WorkID
    AND best.rnk = 1;
在 SQL Server 2012 中优化最近邻空间匹配(Geography 类型的 STDistance)的性能时,我发现逐步增大搜索半径可以提升在我们数据集上的查询性能。
我目前使用下面的三步查询,并希望把它改写成最大递归深度为 3 的递归 CTE,其中每一层的距离阈值为 [LEVEL]*500。
-- Pass 1: for every NewWork row, pick the single nearest CurrWork row whose
-- STDistance from it is at most 500 and store the pair in #MatchWork500.
-- (Distance is presumably in meters, per the Meters alias -- confirm the SRID.)
-- NewWork rows with no candidate inside the radius produce no output row,
-- because CROSS APPLY discards outer rows whose inner query is empty.
SELECT
    nw.WorkID,
    nearest.WorkID AS MatchWorkID,
    nw.Location.STDistance(nearest.Location) AS Meters
INTO #MatchWork500
FROM NewWork AS nw
CROSS APPLY (
    SELECT TOP (1)
        cw.WorkID,
        cw.Location
    FROM CurrWork AS cw
    WHERE nw.Location.STDistance(cw.Location) <= 500
    ORDER BY nw.Location.STDistance(cw.Location)
) AS nearest;
-- Pass 2: widen the search radius to 1000, but only for NewWork rows that
-- did not get a match in #MatchWork500. Results go to #MatchWork1000.
SELECT
    nw.WorkID,
    nearest.WorkID AS MatchWorkID,
    nw.Location.STDistance(nearest.Location) AS Meters
INTO #MatchWork1000
FROM NewWork AS nw
CROSS APPLY (
    SELECT TOP (1)
        cw.WorkID,
        cw.Location
    FROM CurrWork AS cw
    WHERE nw.Location.STDistance(cw.Location) <= 1000
    ORDER BY nw.Location.STDistance(cw.Location)
) AS nearest
-- Anti-join: keep only rows with no entry from the 500 pass.
WHERE NOT EXISTS (
    SELECT 1
    FROM #MatchWork500 AS m500
    WHERE m500.WorkID = nw.WorkID
);
-- Pass 3: widen the search radius to 1500 for NewWork rows that were matched
-- by neither the 500 pass nor the 1000 pass.
-- The target is #MatchWork1500: writing INTO #MatchWork1000 (as originally
-- written) fails because that temp table was already created by pass 2 and is
-- also read by this very statement.
SELECT
    N.WorkID,
    M.WorkID AS MatchWorkID,
    N.Location.STDistance(M.Location) AS Meters
INTO
    #MatchWork1500
FROM
    NewWork N
LEFT JOIN
    #MatchWork500 M500 ON M500.WorkID = N.WorkID
LEFT JOIN
    #MatchWork1000 M1000 ON M1000.WorkID = N.WorkID
CROSS APPLY
    (SELECT TOP (1)
        C.WorkID, C.Location
    FROM
        CurrWork C
    WHERE
        N.Location.STDistance(C.Location) <= 1500
    ORDER BY
        N.Location.STDistance(C.Location)) AS M
WHERE
    -- Anti-join: only rows still unmatched after both earlier passes.
    M500.WorkID IS NULL
    AND M1000.WorkID IS NULL;
每次迭代只需处理此前尚未匹配到的记录,最好能用单个 CTE 实现。请不要给出基于存储过程的答案,感谢帮助。
我最终为此找到了一个高性能的解决方案(剧透:我没能找到高性能的 CTE 方案)。第一步是改进 CurrWork 上的空间索引。所涉及的两个数据集都有 Geography 列,坐标系为 WGS 1984,供参考。下面的索引设置对大多数基于英国的空间数据操作都非常有效,尤其适用于建筑物/地址级别的位置:
-- Spatial index on CurrWork.Location using the geography grid tessellation.
CREATE SPATIAL INDEX [IDX_Location]
ON [CurrWork] ([Location])
USING GEOGRAPHY_GRID
WITH (
    -- Grid density for each of the four tessellation levels.
    GRIDS = (
        LEVEL_1 = HIGH,
        LEVEL_2 = MEDIUM,
        LEVEL_3 = MEDIUM,
        LEVEL_4 = MEDIUM
    ),
    -- Maximum number of tessellation cells recorded per spatial object.
    CELLS_PER_OBJECT = 3,
    PAD_INDEX = OFF,
    STATISTICS_NORECOMPUTE = OFF,
    SORT_IN_TEMPDB = OFF,
    DROP_EXISTING = OFF,
    ONLINE = OFF,
    ALLOW_ROW_LOCKS = ON,
    ALLOW_PAGE_LOCKS = ON
)
ON [PRIMARY]
GO
我发现在我的场景中必须使用索引提示(index hint),不过这并非总是必需的:
-- Table hint: force the optimizer to use the spatial index IDX_Location on CurrWork.
FROM CurrWork AS C WITH(INDEX([IDX_Location]))
为了找到最近的 CurrWork 记录,我发现先按距离进行排名是很有效的第一步:
-- Rank the CurrWork candidates of each NewWork row by distance, with C.WorkID as the secondary sort key.
RANK() OVER(PARTITION BY N.WorkID ORDER BY C.Location.STDistance(N.Location),C.WorkID) rnk
然后在连接条件中使用 M.rnk = 1,将结果限制为仅保留最近的那条记录。完整代码如下……希望对大家有所帮助!
-- Returns every NewWork row together with its closest CurrWork row within
-- @rad; NewWork rows without any candidate inside the radius are still
-- returned, with NULL MatchWorkID and Meters (LEFT JOIN).
DECLARE @rad FLOAT = 10000; --10km search radius

WITH RankedCandidates AS (
    -- All (NewWork, CurrWork) pairs closer than @rad, ranked per NewWork row
    -- by distance, with the CurrWork WorkID as the secondary sort key.
    SELECT
        src.WorkID,
        cw.WorkID AS MatchWorkID,
        cw.Location.STDistance(src.Location) AS Meters,
        RANK() OVER (
            PARTITION BY src.WorkID
            ORDER BY cw.Location.STDistance(src.Location), cw.WorkID
        ) AS rnk
    -- The hint forces use of the spatial index on CurrWork.
    FROM CurrWork AS cw WITH (INDEX([IDX_Location]))
    INNER JOIN NewWork AS src
        ON cw.Location.STDistance(src.Location) < @rad
)
SELECT
    nw.WorkID,
    best.MatchWorkID,
    best.Meters
FROM NewWork AS nw
LEFT JOIN RankedCandidates AS best
    ON best.WorkID = nw.WorkID
    AND best.rnk = 1;