按加权的类别/属性从集合中选取唯一成员
Select unique members from a set by weighted categories/properties
我有一组对象(5000 多个),每个对象有 7 个不同的属性。其中两个属性是三值的,其余是二值的。每个对象的 7 个属性都有取值。在某些情况下,二值属性可能退化为单值属性。
有时我需要 select 此集合中的前 N 个随机对象,根据每个类别中的标签频率对对象总数进行加权。
目前,所有数据都以(对象, 属性掩码)对的形式存放在一张 SQL Server 表中;不过,如有需要,我可以用任何其他方式重新组织数据。
示例:
- 黑蓝黄(1,2,4)
- 圆方三角(8,16,32)
- 纯色/网格色(64)
- 虚线轮廓/无轮廓(128)
- 等(256)
数据为:
object1|9 <- 1001 black circle only (all other properties are 0)
object2|81 <- 101 0001 black square with solid color (all other properties are 0)
object3|148 <- 1001 0100 yellow square with dashed contour
etc.
说,我最终得到 1k 个对象,其中 600 个黑色对象、300 个黄色对象和 100 个蓝色对象。我需要 select 前 10 个对象。如果我只考虑一个 属性,我将只取任意 6 个黑色、3 个黄色和 1 个蓝色对象。但我还有 6 个其他属性需要考虑并确保我有适量的圆形、正方形和三角形。等等。在这一点上,我什至不知道如何解决这个问题。
如有任何建议,我们将不胜感激。
*编辑:
我按照以下格式重新填充了数据
name | att1 | att2 | ...
obj1 | 1 | 8 | ...
obj2 | 2 | 16 | ...
obj3 | 1 | 32 | ...
有没有办法select TOP N objects weighted by each attribute?每个对象我有 7 个属性;没有空值。
谢谢!
它很混乱,它并不总是获取所需的确切行数或完美的分布,但它非常接近。
那么它是如何工作的:
- ValuesPivotted:将所有不同的取值透视(pivot)为列,并为每一行分配一个随机行号
- TargetDistribution:对于每个不同的值,确定您需要多少
- SelectRows:逐行遍历 ValuesPivotted,判断是否需要跳过当前行(即保留它会使某个取值超出其目标数量);若不需要跳过,则保留该行,并把该行涉及的每个取值的累计数加一。
-- Selects approximately @TargetRowNum random rows from dbo.objects so that the
-- share of every pivoted attribute value in the sample mirrors its share in the
-- whole table. The result is approximate: neither the row count nor the
-- per-value counts are guaranteed to hit their targets exactly.
DECLARE @TargetRowNum INT = 100;

-- Materialise the shuffled, pivoted rows exactly once.
-- SQL Server does not materialise CTEs: each reference to a CTE may re-run its
-- query, and the recursive member below looks the rows up again on every
-- recursion step. With ORDER BY NEWID() a re-run can yield a different shuffle,
-- so keeping this as a CTE could visit the same id several times and never
-- visit others. A temp table pins one shuffle for the whole script.
IF OBJECT_ID('tempdb..#ValuesPivotted') IS NOT NULL
    DROP TABLE #ValuesPivotted;

SELECT O.id
     , RowNum = ROW_NUMBER() OVER (ORDER BY NEWID())   -- random visiting order
     -- One 0/1 flag column per distinct attribute value.
     , [0]   = CASE WHEN O.atr1 = 0   THEN 1 ELSE 0 END
     , [1]   = CASE WHEN O.atr1 = 1   THEN 1 ELSE 0 END
     , [2]   = CASE WHEN O.atr1 = 2   THEN 1 ELSE 0 END
     , [4]   = CASE WHEN O.atr2 = 4   THEN 1 ELSE 0 END
     , [8]   = CASE WHEN O.atr2 = 8   THEN 1 ELSE 0 END
     , [16]  = CASE WHEN O.atr3 = 16  THEN 1 ELSE 0 END
     , [32]  = CASE WHEN O.atr3 = 32  THEN 1 ELSE 0 END
     , [64]  = CASE WHEN O.atr4 = 64  THEN 1 ELSE 0 END
     , [128] = CASE WHEN O.atr4 = 128 THEN 1 ELSE 0 END
INTO #ValuesPivotted
FROM dbo.objects AS O;

-- The recursive walk fetches one row per step by RowNum; the index makes that
-- a seek instead of a scan.
CREATE UNIQUE CLUSTERED INDEX IX_ValuesPivotted_RowNum
    ON #ValuesPivotted (RowNum);

WITH TargetDistribution AS (
    -- Single row: for every value, its fraction of all rows scaled to
    -- @TargetRowNum and rounded to a whole number (still typed FLOAT).
    SELECT Target0   = ROUND(CAST(SUM([0])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target1   = ROUND(CAST(SUM([1])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target2   = ROUND(CAST(SUM([2])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target4   = ROUND(CAST(SUM([4])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target8   = ROUND(CAST(SUM([8])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target16  = ROUND(CAST(SUM([16])  AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target32  = ROUND(CAST(SUM([32])  AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target64  = ROUND(CAST(SUM([64])  AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target128 = ROUND(CAST(SUM([128]) AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
    FROM #ValuesPivotted
),
SelectRows AS (
    -- Anchor: start the walk at the first shuffled row and seed the running
    -- per-value counts with that row's flags.
    -- NOTE: the first row is kept unconditionally, even if one of its values
    -- has a target of 0.
    SELECT VP.id
         , VP.RowNum
         , KeepRow = 1
         , TD.Target0   , Sum0   = VP.[0]
         , TD.Target1   , Sum1   = VP.[1]
         , TD.Target2   , Sum2   = VP.[2]
         , TD.Target4   , Sum4   = VP.[4]
         , TD.Target8   , Sum8   = VP.[8]
         , TD.Target16  , Sum16  = VP.[16]
         , TD.Target32  , Sum32  = VP.[32]
         , TD.Target64  , Sum64  = VP.[64]
         , TD.Target128 , Sum128 = VP.[128]
    FROM #ValuesPivotted AS VP
    CROSS JOIN TargetDistribution AS TD
    WHERE VP.RowNum = 1

    UNION ALL

    -- Recursive step: move to the next shuffled row. SkipRow.Value is NULL when
    -- the row is kept and 0 when it is skipped, so ISNULL(SkipRow.Value, x)
    -- adds the row's flag when kept and adds 0 when skipped.
    -- ISNULL (not COALESCE) is kept deliberately: the column types must match
    -- the anchor member exactly.
    SELECT VP.id
         , VP.RowNum
         , KeepRow = ISNULL(SkipRow.Value, 1)
         , SR.Target0   , Sum0   = SR.Sum0   + ISNULL(SkipRow.Value, VP.[0])
         , SR.Target1   , Sum1   = SR.Sum1   + ISNULL(SkipRow.Value, VP.[1])
         , SR.Target2   , Sum2   = SR.Sum2   + ISNULL(SkipRow.Value, VP.[2])
         , SR.Target4   , Sum4   = SR.Sum4   + ISNULL(SkipRow.Value, VP.[4])
         , SR.Target8   , Sum8   = SR.Sum8   + ISNULL(SkipRow.Value, VP.[8])
         , SR.Target16  , Sum16  = SR.Sum16  + ISNULL(SkipRow.Value, VP.[16])
         , SR.Target32  , Sum32  = SR.Sum32  + ISNULL(SkipRow.Value, VP.[32])
         , SR.Target64  , Sum64  = SR.Sum64  + ISNULL(SkipRow.Value, VP.[64])
         , SR.Target128 , Sum128 = SR.Sum128 + ISNULL(SkipRow.Value, VP.[128])
    FROM SelectRows AS SR
    INNER JOIN #ValuesPivotted AS VP
        ON VP.RowNum = SR.RowNum + 1
    CROSS APPLY (
        -- Keep the row (NULL) only if adding it pushes no value past its
        -- target; otherwise skip it (0).
        SELECT Value =
            CASE WHEN SR.Sum0   + VP.[0]   <= SR.Target0
                  AND SR.Sum1   + VP.[1]   <= SR.Target1
                  AND SR.Sum2   + VP.[2]   <= SR.Target2
                  AND SR.Sum4   + VP.[4]   <= SR.Target4
                  AND SR.Sum8   + VP.[8]   <= SR.Target8
                  AND SR.Sum16  + VP.[16]  <= SR.Target16
                  AND SR.Sum32  + VP.[32]  <= SR.Target32
                  AND SR.Sum64  + VP.[64]  <= SR.Target64
                  AND SR.Sum128 + VP.[128] <= SR.Target128
                 THEN NULL ELSE 0 END
    ) AS SkipRow
    -- Keep recursing while at least one value is still below its target; the
    -- walk also ends when the rows run out.
    WHERE SR.Sum0   < SR.Target0
       OR SR.Sum1   < SR.Target1
       OR SR.Sum2   < SR.Target2
       OR SR.Sum4   < SR.Target4
       OR SR.Sum8   < SR.Target8
       OR SR.Sum16  < SR.Target16
       OR SR.Sum32  < SR.Target32
       OR SR.Sum64  < SR.Target64
       OR SR.Sum128 < SR.Target128
)
SELECT O.*
FROM SelectRows AS SR
INNER JOIN dbo.objects AS O
    ON SR.id = O.id
WHERE SR.KeepRow = 1
-- One recursion level per visited row, so the default limit of 100 must be lifted.
OPTION (MAXRECURSION 0);

DROP TABLE #ValuesPivotted;
编辑:SelectRows 中的 WHERE 子句原先没有起到应有的作用(即在所有目标都满足时停止递归),现已修正。
我有一组对象(5000 多个),每个对象有 7 个不同的属性。其中两个属性是三值的,其余是二值的。每个对象的 7 个属性都有取值。在某些情况下,二值属性可能退化为单值属性。
有时我需要 select 此集合中的前 N 个随机对象,根据每个类别中的标签频率对对象总数进行加权。
目前,所有数据都以(对象, 属性掩码)对的形式存放在一张 SQL Server 表中;不过,如有需要,我可以用任何其他方式重新组织数据。
示例:
- 黑蓝黄(1,2,4)
- 圆方三角(8,16,32)
- 纯色/网格色(64)
- 虚线轮廓/无轮廓(128)
- 等(256)
数据为:
object1|9 <- 1001 black circle only (all other properties are 0)
object2|81 <- 101 0001 black square with solid color (all other properties are 0)
object3|148 <- 1001 0100 yellow square with dashed contour
etc.
说,我最终得到 1k 个对象,其中 600 个黑色对象、300 个黄色对象和 100 个蓝色对象。我需要 select 前 10 个对象。如果我只考虑一个 属性,我将只取任意 6 个黑色、3 个黄色和 1 个蓝色对象。但我还有 6 个其他属性需要考虑并确保我有适量的圆形、正方形和三角形。等等。在这一点上,我什至不知道如何解决这个问题。
如有任何建议,我们将不胜感激。
*编辑:
我按照以下格式重新填充了数据
name | att1 | att2 | ...
obj1 | 1 | 8 | ...
obj2 | 2 | 16 | ...
obj3 | 1 | 32 | ...
有没有办法select TOP N objects weighted by each attribute?每个对象我有 7 个属性;没有空值。
谢谢!
它很混乱,它并不总是获取所需的确切行数或完美的分布,但它非常接近。
那么它是如何工作的:
- ValuesPivotted:将所有不同的取值透视(pivot)为列,并为每一行分配一个随机行号
- TargetDistribution:对于每个不同的值,确定您需要多少
- SelectRows:逐行遍历 ValuesPivotted,判断是否需要跳过当前行(即保留它会使某个取值超出其目标数量);若不需要跳过,则保留该行,并把该行涉及的每个取值的累计数加一。
-- Selects approximately @TargetRowNum random rows from dbo.objects so that the
-- share of every pivoted attribute value in the sample mirrors its share in the
-- whole table. The result is approximate: neither the row count nor the
-- per-value counts are guaranteed to hit their targets exactly.
DECLARE @TargetRowNum INT = 100;

-- Materialise the shuffled, pivoted rows exactly once.
-- SQL Server does not materialise CTEs: each reference to a CTE may re-run its
-- query, and the recursive member below looks the rows up again on every
-- recursion step. With ORDER BY NEWID() a re-run can yield a different shuffle,
-- so keeping this as a CTE could visit the same id several times and never
-- visit others. A temp table pins one shuffle for the whole script.
IF OBJECT_ID('tempdb..#ValuesPivotted') IS NOT NULL
    DROP TABLE #ValuesPivotted;

SELECT O.id
     , RowNum = ROW_NUMBER() OVER (ORDER BY NEWID())   -- random visiting order
     -- One 0/1 flag column per distinct attribute value.
     , [0]   = CASE WHEN O.atr1 = 0   THEN 1 ELSE 0 END
     , [1]   = CASE WHEN O.atr1 = 1   THEN 1 ELSE 0 END
     , [2]   = CASE WHEN O.atr1 = 2   THEN 1 ELSE 0 END
     , [4]   = CASE WHEN O.atr2 = 4   THEN 1 ELSE 0 END
     , [8]   = CASE WHEN O.atr2 = 8   THEN 1 ELSE 0 END
     , [16]  = CASE WHEN O.atr3 = 16  THEN 1 ELSE 0 END
     , [32]  = CASE WHEN O.atr3 = 32  THEN 1 ELSE 0 END
     , [64]  = CASE WHEN O.atr4 = 64  THEN 1 ELSE 0 END
     , [128] = CASE WHEN O.atr4 = 128 THEN 1 ELSE 0 END
INTO #ValuesPivotted
FROM dbo.objects AS O;

-- The recursive walk fetches one row per step by RowNum; the index makes that
-- a seek instead of a scan.
CREATE UNIQUE CLUSTERED INDEX IX_ValuesPivotted_RowNum
    ON #ValuesPivotted (RowNum);

WITH TargetDistribution AS (
    -- Single row: for every value, its fraction of all rows scaled to
    -- @TargetRowNum and rounded to a whole number (still typed FLOAT).
    SELECT Target0   = ROUND(CAST(SUM([0])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target1   = ROUND(CAST(SUM([1])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target2   = ROUND(CAST(SUM([2])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target4   = ROUND(CAST(SUM([4])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target8   = ROUND(CAST(SUM([8])   AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target16  = ROUND(CAST(SUM([16])  AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target32  = ROUND(CAST(SUM([32])  AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target64  = ROUND(CAST(SUM([64])  AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
         , Target128 = ROUND(CAST(SUM([128]) AS FLOAT) / COUNT(*) * @TargetRowNum, 0)
    FROM #ValuesPivotted
),
SelectRows AS (
    -- Anchor: start the walk at the first shuffled row and seed the running
    -- per-value counts with that row's flags.
    -- NOTE: the first row is kept unconditionally, even if one of its values
    -- has a target of 0.
    SELECT VP.id
         , VP.RowNum
         , KeepRow = 1
         , TD.Target0   , Sum0   = VP.[0]
         , TD.Target1   , Sum1   = VP.[1]
         , TD.Target2   , Sum2   = VP.[2]
         , TD.Target4   , Sum4   = VP.[4]
         , TD.Target8   , Sum8   = VP.[8]
         , TD.Target16  , Sum16  = VP.[16]
         , TD.Target32  , Sum32  = VP.[32]
         , TD.Target64  , Sum64  = VP.[64]
         , TD.Target128 , Sum128 = VP.[128]
    FROM #ValuesPivotted AS VP
    CROSS JOIN TargetDistribution AS TD
    WHERE VP.RowNum = 1

    UNION ALL

    -- Recursive step: move to the next shuffled row. SkipRow.Value is NULL when
    -- the row is kept and 0 when it is skipped, so ISNULL(SkipRow.Value, x)
    -- adds the row's flag when kept and adds 0 when skipped.
    -- ISNULL (not COALESCE) is kept deliberately: the column types must match
    -- the anchor member exactly.
    SELECT VP.id
         , VP.RowNum
         , KeepRow = ISNULL(SkipRow.Value, 1)
         , SR.Target0   , Sum0   = SR.Sum0   + ISNULL(SkipRow.Value, VP.[0])
         , SR.Target1   , Sum1   = SR.Sum1   + ISNULL(SkipRow.Value, VP.[1])
         , SR.Target2   , Sum2   = SR.Sum2   + ISNULL(SkipRow.Value, VP.[2])
         , SR.Target4   , Sum4   = SR.Sum4   + ISNULL(SkipRow.Value, VP.[4])
         , SR.Target8   , Sum8   = SR.Sum8   + ISNULL(SkipRow.Value, VP.[8])
         , SR.Target16  , Sum16  = SR.Sum16  + ISNULL(SkipRow.Value, VP.[16])
         , SR.Target32  , Sum32  = SR.Sum32  + ISNULL(SkipRow.Value, VP.[32])
         , SR.Target64  , Sum64  = SR.Sum64  + ISNULL(SkipRow.Value, VP.[64])
         , SR.Target128 , Sum128 = SR.Sum128 + ISNULL(SkipRow.Value, VP.[128])
    FROM SelectRows AS SR
    INNER JOIN #ValuesPivotted AS VP
        ON VP.RowNum = SR.RowNum + 1
    CROSS APPLY (
        -- Keep the row (NULL) only if adding it pushes no value past its
        -- target; otherwise skip it (0).
        SELECT Value =
            CASE WHEN SR.Sum0   + VP.[0]   <= SR.Target0
                  AND SR.Sum1   + VP.[1]   <= SR.Target1
                  AND SR.Sum2   + VP.[2]   <= SR.Target2
                  AND SR.Sum4   + VP.[4]   <= SR.Target4
                  AND SR.Sum8   + VP.[8]   <= SR.Target8
                  AND SR.Sum16  + VP.[16]  <= SR.Target16
                  AND SR.Sum32  + VP.[32]  <= SR.Target32
                  AND SR.Sum64  + VP.[64]  <= SR.Target64
                  AND SR.Sum128 + VP.[128] <= SR.Target128
                 THEN NULL ELSE 0 END
    ) AS SkipRow
    -- Keep recursing while at least one value is still below its target; the
    -- walk also ends when the rows run out.
    WHERE SR.Sum0   < SR.Target0
       OR SR.Sum1   < SR.Target1
       OR SR.Sum2   < SR.Target2
       OR SR.Sum4   < SR.Target4
       OR SR.Sum8   < SR.Target8
       OR SR.Sum16  < SR.Target16
       OR SR.Sum32  < SR.Target32
       OR SR.Sum64  < SR.Target64
       OR SR.Sum128 < SR.Target128
)
SELECT O.*
FROM SelectRows AS SR
INNER JOIN dbo.objects AS O
    ON SR.id = O.id
WHERE SR.KeepRow = 1
-- One recursion level per visited row, so the default limit of 100 must be lifted.
OPTION (MAXRECURSION 0);

DROP TABLE #ValuesPivotted;
编辑:SelectRows 中的 WHERE 子句原先没有起到应有的作用(即在所有目标都满足时停止递归),现已修正。