如何使用SQL抽取每组50%的记录?
How to use SQL to sample 50% of records in each group?
我是 SQL 的新手。我的数据是这样的:
c1 c2 c3 group
1 2 1 1
1 2 2 1
3 5 6 1
2 4 7 2
5 3 4 2
2 1 3 2
6 4 4 2
7 8 3 3
1 2 2 3
3 2 2 3
对于每个组(由列组表示),我想抽取 50% 的记录(保留所有列)并将结果保存到新的 table 中。
您可以使用 NTILE。如果您希望每个组按其他列排序,只需将 ORDER BY
从 NTILE(2) OVER ( PARTITION BY [t].[group] ORDER BY [t].[group] ) AS rn
更改为
IF OBJECT_ID('tempdb..#SampleData') IS NOT NULL
DROP TABLE #SampleData;
CREATE TABLE #SampleData
(
c1 INT ,
c2 INT ,
c3 INT ,
[group] INT
);
IF OBJECT_ID('tempdb..#OutputData') IS NOT NULL
DROP TABLE #OutputData;
CREATE TABLE #OutputData
(
c1 INT ,
c2 INT ,
c3 INT ,
[group] INT
);
INSERT INTO [#SampleData]
( [c1], [c2], [c3], [group] )
VALUES ( 1, -- c1 - int
2, -- c2 - int
1, -- c3 - int
1 -- groupNo - int
),
( 1, -- c1 - int
22, -- c2 - int
12, -- c3 - int
1 -- groupNo - int
),
( 144, -- c1 - int
244, -- c2 - int
144, -- c3 - int
1 -- groupNo - int
),
( 2, -- c1 - int
23, -- c2 - int
4, -- c3 - int
1 -- groupNo - int
),
( 13, -- c1 - int
24, -- c2 - int
14, -- c3 - int
1 -- groupNo - int
),
( 16, -- c1 - int
266, -- c2 - int
133, -- c3 - int
2 -- groupNo - int
),
( 16, -- c1 - int
266, -- c2 - int
133, -- c3 - int
2 -- groupNo - int
),
( 16, -- c1 - int
266, -- c2 - int
133, -- c3 - int
2 -- groupNo - int
),
( 123, -- c1 - int
293, -- c2 - int
123, -- c3 - int
2 -- groupNo - int
),
( 123, -- c1 - int
223, -- c2 - int
123, -- c3 - int
3 -- groupNo - int
)
;WITH cte
AS ( SELECT [t].[c1] ,
[t].[c2] ,
[t].[c3] ,
[t].[group] ,
NTILE(2) OVER ( PARTITION BY [t].[group] ORDER BY [t].[group] ) AS rn
FROM [#SampleData] AS [t]
)
INSERT INTO [#OutputData]
( [c1] ,
[c2] ,
[c3] ,
[group]
)
SELECT [cte].[c1] ,
[cte].[c2] ,
[cte].[c3] ,
[cte].[group]
FROM [cte]
WHERE rn = 1;
SELECT [sd].[c1] ,
[sd].[c2] ,
[sd].[c3] ,
[sd].[group]
FROM [#OutputData] AS [sd];
我是 SQL 的新手。我的数据是这样的:
c1 c2 c3 group
1 2 1 1
1 2 2 1
3 5 6 1
2 4 7 2
5 3 4 2
2 1 3 2
6 4 4 2
7 8 3 3
1 2 2 3
3 2 2 3
对于每个组(由列组表示),我想抽取 50% 的记录(保留所有列)并将结果保存到新的 table 中。
您可以使用 NTILE。如果您希望每个组按其他列排序,只需将 ORDER BY
从 NTILE(2) OVER ( PARTITION BY [t].[group] ORDER BY [t].[group] ) AS rn
IF OBJECT_ID('tempdb..#SampleData') IS NOT NULL
DROP TABLE #SampleData;
CREATE TABLE #SampleData
(
c1 INT ,
c2 INT ,
c3 INT ,
[group] INT
);
IF OBJECT_ID('tempdb..#OutputData') IS NOT NULL
DROP TABLE #OutputData;
CREATE TABLE #OutputData
(
c1 INT ,
c2 INT ,
c3 INT ,
[group] INT
);
INSERT INTO [#SampleData]
( [c1], [c2], [c3], [group] )
VALUES ( 1, -- c1 - int
2, -- c2 - int
1, -- c3 - int
1 -- groupNo - int
),
( 1, -- c1 - int
22, -- c2 - int
12, -- c3 - int
1 -- groupNo - int
),
( 144, -- c1 - int
244, -- c2 - int
144, -- c3 - int
1 -- groupNo - int
),
( 2, -- c1 - int
23, -- c2 - int
4, -- c3 - int
1 -- groupNo - int
),
( 13, -- c1 - int
24, -- c2 - int
14, -- c3 - int
1 -- groupNo - int
),
( 16, -- c1 - int
266, -- c2 - int
133, -- c3 - int
2 -- groupNo - int
),
( 16, -- c1 - int
266, -- c2 - int
133, -- c3 - int
2 -- groupNo - int
),
( 16, -- c1 - int
266, -- c2 - int
133, -- c3 - int
2 -- groupNo - int
),
( 123, -- c1 - int
293, -- c2 - int
123, -- c3 - int
2 -- groupNo - int
),
( 123, -- c1 - int
223, -- c2 - int
123, -- c3 - int
3 -- groupNo - int
)
;WITH cte
AS ( SELECT [t].[c1] ,
[t].[c2] ,
[t].[c3] ,
[t].[group] ,
NTILE(2) OVER ( PARTITION BY [t].[group] ORDER BY [t].[group] ) AS rn
FROM [#SampleData] AS [t]
)
INSERT INTO [#OutputData]
( [c1] ,
[c2] ,
[c3] ,
[group]
)
SELECT [cte].[c1] ,
[cte].[c2] ,
[cte].[c3] ,
[cte].[group]
FROM [cte]
WHERE rn = 1;
SELECT [sd].[c1] ,
[sd].[c2] ,
[sd].[c3] ,
[sd].[group]
FROM [#OutputData] AS [sd];