根据分隔符拆分两列
Split two columns based on delimiter
我有一个 TSQL (MSSQL) table,其中包含以下格式的记录
Id
Column1
Column2
1
a/b/c
apple/banana/cucumber
我想按以下格式拆分记录
Id
Column1
Column2
1
a
apple
1
b
banana
1
c
cucumber
Column1 和 Column2 使用“/”分隔符保持关系,并且以相同的顺序相互关联。
我试图在 CHARINDEX 和 SUBSTRING 的帮助下拆分列,但我无法维持两列之间的关系。
请尝试以下解决方案。
它基于 JSON,将从 SQL Server 2016 开始运行。
SQL
-- DDL and sample data population, start
DECLARE @tbl TABLE (ID INT, ColB varchar(8000), ColC varchar(8000));
INSERT INTO @tbl (ID, ColB, ColC) VALUES
(1,'a/b/c','apple/banana/cucumber');
-- DDL and sample data population, end

-- Splits two parallel delimited columns into rows, pairing items by position.
-- Each list is rewritten as a JSON array so that OPENJSON returns every item
-- together with its zero-based index in [key].
DECLARE @separator CHAR(1) = '/';
-- STRING_ESCAPE(..., 'json') escapes quotes, backslashes and control characters
-- that would otherwise make the generated array invalid JSON. It also turns '/'
-- into '\/', so the separator is escaped the same way before it is searched for.
DECLARE @jsonSeparator NVARCHAR(10) = STRING_ESCAPE(@separator, 'json');

WITH rs AS
(
    SELECT ID
    , ar1 = '["' + REPLACE(STRING_ESCAPE(ColB, 'json'), @jsonSeparator, '","') + '"]'
    , ar2 = '["' + REPLACE(STRING_ESCAPE(ColC, 'json'), @jsonSeparator, '","') + '"]'
    FROM @tbl
)
SELECT rs.ID, ColB.[value] AS [ColB], ColC.[value] AS ColC
FROM rs
CROSS APPLY OPENJSON (rs.ar1, N'$') AS ColB
CROSS APPLY OPENJSON (rs.ar2, N'$') AS ColC
-- Keep only the pairs that sit at the same array index; an item without a
-- partner at that index in the other list is not returned.
WHERE ColB.[key] = ColC.[key];
输出
+----+------+----------+
| ID | ColB | ColC |
+----+------+----------+
| 1 | a | apple |
| 1 | b | banana |
| 1 | c | cucumber |
+----+------+----------+
您可以添加一个函数来拆分字符串。
然后交叉应用到Column1和Column2的拆分部分。
-- Sample table: every row carries two '/'-delimited lists whose items
-- correspond to each other by position.
CREATE TABLE test
(
    Id      INT IDENTITY PRIMARY KEY,
    Column1 VARCHAR(30),
    Column2 VARCHAR(30)
);

INSERT INTO test (Column1, Column2)
VALUES
    ('a/b/c', 'apple/banana/cucumber'),
    ('d/e/f', 'orange/prune/onion');
(以下 UDF 复制自其他来源)
CREATE FUNCTION dbo.fnString_Split
(
    @str nvarchar(4000),
    @delim nchar(1)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
    -- Inline table-valued splitter: one row per item of @str, with its 1-based
    -- position in "ordinal". A recursive CTE walks the string one delimiter at
    -- a time; "pos" is the position of the delimiter that ended the current
    -- item (or LEN(@str) once no delimiter is left).
    WITH RCTE AS (
        -- Anchor: everything before the first delimiter, or the whole string
        -- when @str contains no delimiter.
        SELECT
              1 AS ordinal
            , CASE WHEN CHARINDEX(@delim, @str) > 0
                   THEN CHARINDEX(@delim, @str)
                   ELSE LEN(@str)
              END AS pos
            , LEFT(@str, CASE WHEN CHARINDEX(@delim, @str) > 0
                              THEN CHARINDEX(@delim, @str) - 1
                              ELSE LEN(@str)
                         END) AS value
        UNION ALL
        -- Recursive step: the item that starts right after the previous delimiter.
        SELECT
              ordinal + 1
            , CASE WHEN CHARINDEX(@delim, @str, pos + 1) > 0
                   THEN CHARINDEX(@delim, @str, pos + 1)
                   ELSE LEN(@str)
              END
            , SUBSTRING(@str, pos + 1,
                        CASE WHEN CHARINDEX(@delim, @str, pos + 1) > 0
                             THEN CHARINDEX(@delim, @str, pos + 1) - pos - 1
                             ELSE LEN(@str) - pos
                        END)
        FROM RCTE
        -- Stop once the last delimiter position reaches the end of the string.
        WHERE LEN(@str) > pos
    )
    SELECT ordinal, value
    FROM RCTE
);
-- Pair the n-th item of Column1 with the n-th item of Column2 for every row of test.
SELECT
    src.Id,
    pairs.Column1,
    pairs.Column2
FROM test AS src
CROSS APPLY
(
    SELECT
        left_items.ordinal,
        left_items.value  AS Column1,
        right_items.value AS Column2
    FROM dbo.fnString_Split(src.Column1, '/') AS left_items
    INNER JOIN dbo.fnString_Split(src.Column2, '/') AS right_items
        ON right_items.ordinal = left_items.ordinal
) AS pairs;
Id
Column1
Column2
1
a
apple
1
b
banana
1
c
cucumber
2
d
orange
2
e
prune
2
f
onion
演示 db<>fiddle here
- 首先创建下面的函数来拆分字符串。
- 然后,执行函数代码后面的代码。
-- Function Code
CREATE FUNCTION [dbo].[udf_SplitList]
(
      @InputString varchar(MAX)
    , @Separator   varchar(1)
)
RETURNS @ValuesList TABLE ( ID int IDENTITY(1,1), Value varchar(MAX))
AS
BEGIN
    -- Splits @InputString on @Separator and returns one row per item.
    --   ID    : 1-based position of the item in the list (rows are inserted in order)
    --   Value : the item with leading spaces removed (LTRIM)
    -- A NULL input or NULL separator returns no rows. A trailing separator
    -- produces a final empty item.
    DECLARE @Start bigint = 1;   -- first character of the item being read
    DECLARE @Pos   bigint;       -- position of the next separator, 0 when none is left

    IF @InputString IS NULL OR @Separator IS NULL
        RETURN;

    SET @Pos = CHARINDEX(@Separator, @InputString, @Start);

    -- Scan the string in place instead of rebuilding it on every iteration.
    -- No LEN() is involved, so a space separator and trailing spaces are safe.
    WHILE @Pos > 0
    BEGIN
        INSERT INTO @ValuesList (Value)
        VALUES (LTRIM(SUBSTRING(@InputString, @Start, @Pos - @Start)));

        SET @Start = @Pos + 1;
        SET @Pos = CHARINDEX(@Separator, @InputString, @Start);
    END

    -- Last item: everything after the final separator (or the whole string).
    INSERT INTO @ValuesList (Value)
    VALUES (LTRIM(SUBSTRING(@InputString, @Start, 2147483647)));

    RETURN
END
-- Execution Code
-- Demo: split both lists of every row and pair the items that share a position.
DECLARE @YourTable TABLE (ID int, CodeList varchar(MAX), ValueList varchar(MAX));

INSERT INTO @YourTable
VALUES ( 1, 'a/b/c', 'apple/banana/cucumber');

SELECT
      Paired.Code
    , Paired.Value
FROM @YourTable AS Src
CROSS APPLY
(
    SELECT
          Codes.Value AS Code
        , Vals.Value  AS Value
    FROM dbo.udf_SplitList(Src.CodeList , '/') AS Codes
    INNER JOIN dbo.udf_SplitList(Src.ValueList, '/') AS Vals
        ON Codes.ID = Vals.ID
) AS Paired
;
不使用函数,你可以使用 tally/numbers table 来做同样的事情,如下所示
查看工作演示 here
-- Splits column1 / column2 of table t on '/' without a helper function.
-- NOTE(review): table t (Id, column1, column2) is not defined in this snippet.
-- nums : candidate character positions 1..80 (strings longer than that are not covered).
-- X, Y : one row per delimiter found in column1 / column2 (a '/' is appended so the
--        last item is terminated too); e = delimiter position, b = start of the
--        item that ends there, r = running item number within the row.
; WITH nums AS
(
    SELECT 1 AS num
    UNION ALL
    SELECT num + 1 AS num
    FROM nums
    WHERE num < 80
),
X AS
(
    SELECT
        t.*,
        N.num AS e,
        ROW_NUMBER() OVER (PARTITION BY Id ORDER BY N.num) AS r,
        ISNULL(LAG(N.num) OVER (PARTITION BY Id ORDER BY N.num), 0) + 1 AS b
    FROM t
    LEFT JOIN nums AS N
        ON CHARINDEX('/', column1 + '/', N.num) = N.num
),
Y AS
(
    SELECT
        t.*,
        N.num AS e,
        ROW_NUMBER() OVER (PARTITION BY Id ORDER BY N.num) AS r,
        ISNULL(LAG(N.num) OVER (PARTITION BY Id ORDER BY N.num), 0) + 1 AS b
    FROM t
    LEFT JOIN nums AS N
        ON CHARINDEX('/', column2 + '/', N.num) = N.num
)
SELECT
    X.id,
    SUBSTRING(X.column1, X.b, X.e - X.b),
    SUBSTRING(Y.column2, Y.b, Y.e - Y.b)
FROM X
LEFT JOIN Y
    ON Y.id = X.id
   AND Y.r = X.r
我知道已经有一些答案,其中一个已被接受,但还有一些重要的性能因素需要考虑。如果值的数量总是三个(或者总是很少,比如少于 5 或 6 个),那么级联应用技术将是目前最快的。此解决方案假定始终有 3 个项目。它可以很容易地修改,以处理数量可变但较少的项目。
级联应用解决方案:
-- Cascading APPLY split: assumes every list holds exactly three '/'-separated items.
DECLARE @table TABLE
(
  SomeId INT IDENTITY,
  S1     VARCHAR(1000),
  S2     VARCHAR(1000)
);

INSERT @table (S1, S2) VALUES ('a/b/c','apple/banana/cucumber'),
('d/d2/f','dog/donkey/fish'),('x/y/z','x-ray/yo-yo/zeta');

SELECT
  SomeId = f.SomeId,
  Col1   = f2.C1,
  Col2   = f2.C2
FROM
(
  SELECT
    t.SomeId,
    SUBSTRING(t.S1, 1,         c1.P - 1),
    SUBSTRING(t.S1, c1.P + 1,  c2.P - c1.P - 1),
    SUBSTRING(t.S1, c2.P + 1,  8000),
    SUBSTRING(t.S2, 1,         c1.P2 - 1),
    SUBSTRING(t.S2, c1.P2 + 1, c2.P2 - c1.P2 - 1),
    SUBSTRING(t.S2, c2.P2 + 1, 8000)
  FROM @table AS t
  -- c1: position of the first delimiter in S1 (P) and S2 (P2)
  CROSS APPLY (VALUES(CHARINDEX('/',t.S1),CHARINDEX('/',t.S2))) AS c1(P,P2)
  -- c2: position of the second delimiter, searched from just after the first one
  CROSS APPLY (VALUES(CHARINDEX('/',t.S1,c1.P+1),CHARINDEX('/',t.S2,c1.P2+1))) AS c2(P,P2)
) AS f(SomeId,c1_1,c1_2,c1_3,c2_1,c2_2,c2_3)
-- Unpivot the three (S1 item, S2 item) pairs into three rows.
CROSS APPLY (VALUES (f.c1_1, f.c2_1), (f.c1_2, f.c2_2), (f.c1_3, f.c2_3)) AS f2(C1,C2);
如果您使用的是 SQL Azure,则您有 STRING_SPLIT 和 ordinal option。
-- STRING_SPLIT without an ordinal column: items are paired by their alphabetical
-- rank, so this is only correct when both lists are already in alphabetical
-- order and contain no duplicate items.
DECLARE @table TABLE
(
  SomeId INT IDENTITY,
  S1     VARCHAR(1000),
  S2     VARCHAR(1000)
);

INSERT @table (S1, S2) VALUES ('a/b/c','apple/banana/cucumber'),
('d/d2/f','dog/donkey/fish'),('x/y/z','x-ray/yo-yo/zeta');

-- Both APPLYs together produce every (S1 item, S2 item) combination per row.
-- The sort key is 0 only where the two items have the same rank, and
-- TOP(1) WITH TIES keeps every row that shares that smallest key.
SELECT TOP(1) WITH TIES
  t.SomeId, t.S1, t.S2, Col1 = split1.[value], Col2 = split2.[value]
FROM @table AS t
CROSS APPLY STRING_SPLIT(t.S1 ,'/') AS split1
CROSS APPLY STRING_SPLIT(t.S2 ,'/') AS split2
ORDER BY ABS(
  -- rank of the S2 item among the S2 items of this row
  ROW_NUMBER() OVER (PARTITION BY t.SomeId, split1.value ORDER BY split2.value) -
  -- rank of the S1 item among the S1 items of this row
  ROW_NUMBER() OVER (PARTITION BY t.SomeId, split2.value ORDER BY split1.value));
^^^ 这仅在项目按字母顺序排列时有效(不实用)但是,在 Azure 上,您可以将 ORDER BY 更改为:
-- Needs the ordinal column: call STRING_SPLIT(t.S1, '/', 1) and STRING_SPLIT(t.S2, '/', 1).
ORDER BY ABS(
  -- position of the S2 item within its list
  ROW_NUMBER() OVER (PARTITION BY t.SomeId, split1.value ORDER BY split2.ordinal) -
  -- position of the S1 item within its list
  ROW_NUMBER() OVER (PARTITION BY t.SomeId, split2.value ORDER BY split1.ordinal));
比较所有发布的技术
现在让我们比较迄今为止发布的所有解决方案,以了解 戏剧性 性能差异。我构建了一个基本测试工具并 运行 它,首先是 10 万行,然后是一百万行。解决方案包括:
- dbo.udf_SplitList 来自 Andy3B
- dbo.fnString_Split 来自 LukStorms
- 递归 CTE - Numbers/tally table DhruvJoshi 的解决方案
- DhruvJoshi 解决方案的改进版本,计算速度更快 table
- Yitzhak Khabinsky 的 JSON 解决方案
- 我的 Cascading APPLY 解决方案
dbo.udf_SplitList 解决方案最慢,一百万行需要 246 秒。标量函数慢得可怕,但递归让事情变得更糟。
dbo.fnString_Split 解决方案在 85 秒时效果更好。为了做得更好,我们需要丢失标量 udf。
DhruvJoshi 的递归 CTE 解决方案将我们缩短到 65 秒,提高了 50%。我通过重写数字 table 以不使用递归来改进他的解决方案;这让我们的速度又提高了 50%,减少到 45 秒。
Yitzhak Khabinsky 是第一个基于集合的解决方案;他正在利用 JSON。这里我们有 200%+ 的性能提升,减少到 16 秒。 Cascading APPLY 解决方案比 Yitzhak 的解决方案又提高了 400%+;三秒内 .
要当心标量 UDF、用递归来计数以及循环。基于集合的方案永远胜出。
-- Benchmark data: 100,000 rows. S1 is a fixed three-item list; S2 is the first
-- 18 characters of a NEWID() with '-' replaced by '/'.
IF OBJECT_ID('tempdb..#table') IS NOT NULL
    DROP TABLE #table;
GO

SELECT TOP (100000)
       ROW_NUMBER() OVER (ORDER BY (SELECT NULL))   AS SomeId,
       'A/B/C'                                      AS S1,
       REPLACE(LEFT(NEWID(), 18), '-', '/')         AS S2
INTO #table
-- The self cross join only serves as a row source large enough for TOP.
FROM sys.all_columns AS c1
CROSS JOIN sys.all_columns AS c2
GO
-- Timing batch: dbo.udf_SplitList. Results are assigned to variables, so no
-- result set is returned; the elapsed milliseconds are printed. "GO 3" runs
-- the batch three times.
PRINT CHAR(10)+'dbo.udf_SplitList -Andy3B'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);

SELECT
    @ID = src.SomeId,
    @C1 = pairs.Code,
    @C2 = pairs.Value
FROM #table AS src
CROSS APPLY
(
    SELECT
        codes.Value AS Code,
        vals.Value  AS Value
    FROM dbo.udf_SplitList(src.S1, '/') AS codes
    INNER JOIN dbo.udf_SplitList(src.S2, '/') AS vals
        ON vals.ID = codes.ID
) AS pairs;

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3

-- Timing batch: dbo.fnString_Split, measured the same way.
PRINT CHAR(10)+'dbo.fnString_Split - LukStorms'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);

SELECT
    @ID = src.SomeId,
    @C1 = pairs.Column1,
    @C2 = pairs.Column2
FROM #table AS src
CROSS APPLY
(
    SELECT
        left_items.ordinal,
        left_items.value  AS Column1,
        right_items.value AS Column2
    FROM dbo.fnString_Split(src.S1, '/') AS left_items
    INNER JOIN dbo.fnString_Split(src.S2, '/') AS right_items
        ON left_items.ordinal = right_items.ordinal
) AS pairs;

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3
-- Timing batch: tally split whose numbers 1..80 come from a recursive CTE.
PRINT CHAR(10)+'Recursive CTE - TALLY TABLE DhruvJoshi'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);

WITH nums AS
(
    SELECT 1 AS num
    UNION ALL
    SELECT num + 1 AS num
    FROM nums
    WHERE num < 80
),
-- X / Y: one row per delimiter of S1 / S2 (a '/' is appended so the last item
-- is terminated); e = delimiter position, b = item start, r = item number.
X AS
(
    SELECT
        t.*,
        N.num AS e,
        ROW_NUMBER() OVER (PARTITION BY t.SomeId ORDER BY N.num) AS r,
        ISNULL(LAG(N.num) OVER (PARTITION BY t.SomeId ORDER BY N.num), 0) + 1 AS b
    FROM #table AS t
    LEFT JOIN nums AS N
        ON CHARINDEX('/', t.S1 + '/', N.num) = N.num
),
Y AS
(
    SELECT
        t.*,
        N.num AS e,
        ROW_NUMBER() OVER (PARTITION BY t.SomeId ORDER BY N.num) AS r,
        ISNULL(LAG(N.num) OVER (PARTITION BY t.SomeId ORDER BY N.num), 0) + 1 AS b
    FROM #table AS t
    LEFT JOIN nums AS N
        ON CHARINDEX('/', t.S2 + '/', N.num) = N.num
)
SELECT
    @ID = X.SomeId,
    @C1 = SUBSTRING(X.S1, X.b, X.e - X.b),
    @C2 = SUBSTRING(Y.S2, Y.b, Y.e - Y.b)
FROM X
LEFT JOIN Y
    ON Y.SomeId = X.SomeId
   AND Y.r = X.r;

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3
-- Timing batch: tally split whose numbers 1..10,000 come from cross-joined
-- VALUES lists instead of a recursive CTE.
PRINT CHAR(10)+'TALLY TABLE without Recursion DhruvJoshi'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);
;with nums as
(
  -- 10 x 10 x 10 x 10 rows, numbered 1..10,000
  SELECT num = ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
  FROM      (VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS e1(x),
            (VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS e2(x),
            (VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS e3(x),
            (VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS e4(x)
)
select
  @ID = X.SomeId,
  @C1 = substring(X.S1,X.b,X.e-X.b),
  @C2 = substring(Y.S2,Y.b,Y.e-Y.b)
from
(
  select
    t.*,
    e=N.Num,
    r=row_number() over (partition by t.SomeId order by N.num),
    b=isnull(lag(N.num) over (partition by t.SomeId order by N.num),0)+1
  from #table AS t
  left join nums AS N
    -- The range limit must include the appended '/' (position LEN+1); otherwise
    -- the last item of every list is lost. It belongs in ON, not WHERE, so the
    -- LEFT JOIN is not turned into an inner join.
    on  N.num <= LEN(t.S1 + '/')
    and charindex('/',t.S1+'/',N.num)=N.num
) AS X
left join
(
  select
    t.*,
    e= N.Num,
    r=row_number() over (partition by t.SomeId order by N.num),
    b=isnull(lag(N.num) over (partition by t.SomeId order by N.num),0)+1
  from #table AS t
  left join nums AS N
    on  N.num <= LEN(t.S2 + '/')
    and charindex('/', t.S2+'/',N.num)=N.num
) AS Y
  ON X.SomeId = Y.SomeId and X.r=Y.r;
PRINT DATEDIFF(MS,@st,GETDATE())
GO 3
-- Timing batch: JSON split. Each list becomes a JSON array and OPENJSON items
-- are paired on their array index ([key]).
PRINT CHAR(10)+'JSON - Yitzhak Khabinsky'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);
DECLARE @separator CHAR(1) = '/';

WITH rs AS
(
    SELECT
        t.*,
        '["' + REPLACE(t.S1, @separator, '","') + '"]' AS ar1,
        '["' + REPLACE(t.S2, @separator, '","') + '"]' AS ar2
    FROM #table AS t
)
SELECT
    @id = rs.SomeID,
    @C1 = ColB.[value],
    @C2 = ColC.[value]
FROM rs AS rs
CROSS APPLY OPENJSON (rs.ar1, N'$') AS ColB
CROSS APPLY OPENJSON (rs.ar2, N'$') AS ColC
WHERE ColC.[key] = ColB.[key];

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3

-- Timing batch: cascading APPLY. Assumes exactly three items per list.
PRINT CHAR(10)+'CROSS APPLY TECHNIQUE'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);

SELECT
    @ID = f.SomeId,
    @C1 = f2.C1,
    @C2 = f2.C2
FROM
(
    SELECT
        t.SomeId,
        SUBSTRING(t.S1, 1,         c1.P - 1),
        SUBSTRING(t.S1, c1.P + 1,  c2.P - c1.P - 1),
        SUBSTRING(t.S1, c2.P + 1,  8000),
        SUBSTRING(t.S2, 1,         c1.P2 - 1),
        SUBSTRING(t.S2, c1.P2 + 1, c2.P2 - c1.P2 - 1),
        SUBSTRING(t.S2, c2.P2 + 1, 8000)
    FROM #table AS t
    -- c1 / c2: positions of the first / second delimiter in S1 (P) and S2 (P2)
    CROSS APPLY (VALUES (CHARINDEX('/', t.S1), CHARINDEX('/', t.S2))) AS c1(P, P2)
    CROSS APPLY (VALUES (CHARINDEX('/', t.S1, c1.P + 1), CHARINDEX('/', t.S2, c1.P2 + 1))) AS c2(P, P2)
    -- c3 (third delimiter) is computed but not referenced by the SELECT list
    CROSS APPLY (VALUES (CHARINDEX('/', t.S1, c2.P + 1), CHARINDEX('/', t.S2, c2.P2 + 1))) AS c3(P, P2)
) AS f(SomeId, c1_1, c1_2, c1_3, c2_1, c2_2, c2_3)
CROSS APPLY (VALUES (c1_1, c2_1), (c1_2, c2_2), (c1_3, c2_3)) AS f2(c1, c2);

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3
100K行测试结果:
dbo.udf_SplitList -Andy3B
------------------------------------------------------------------------------------------
Beginning execution loop
24593
24566
24530
Batch execution completed 3 times.
dbo.fnString_Split - LukStorms
------------------------------------------------------------------------------------------
Beginning execution loop
8147
8260
8257
Batch execution completed 3 times.
Recursive CTE - TALLY TABLE DhruvJoshi
------------------------------------------------------------------------------------------
Beginning execution loop
6867
6733
6850
Batch execution completed 3 times.
TALLY TABLE without Recursion DhruvJoshi
------------------------------------------------------------------------------------------
Beginning execution loop
4677
4630
4620
Batch execution completed 3 times.
JSON - Yitzhak Khabinsky
------------------------------------------------------------------------------------------
Beginning execution loop
1667
1653
1670
Batch execution completed 3 times.
CROSS APPLY TECHNIQUE
------------------------------------------------------------------------------------------
Beginning execution loop
283
280
284
Batch execution completed 3 times.
100万行测试结果:
dbo.udf_SplitList -Andy3B
------------------------------------------------------------------------------------------
Beginning execution loop
246057
245296
247017
Batch execution completed 3 times.
dbo.fnString_Split - LukStorms
------------------------------------------------------------------------------------------
Beginning execution loop
85340
83010
83674
Batch execution completed 3 times.
Recursive CTE - TALLY TABLE -DhruvJoshi
------------------------------------------------------------------------------------------
Beginning execution loop
67226
64910
64740
Batch execution completed 3 times.
TALLY TABLE without Recursion DhruvJoshi
------------------------------------------------------------------------------------------
Beginning execution loop
46777
44630
44623
Batch execution completed 3 times.
JSON - Yitzhak Khabinsky
------------------------------------------------------------------------------------------
Beginning execution loop
16710
16830
16520
Batch execution completed 3 times.
CROSS APPLY TECHNIQUE
------------------------------------------------------------------------------------------
Beginning execution loop
2846
2793
2850
Batch execution completed 3 times.
Completion time: 2022-01-18T22:08:50.3264912-06:00
我有一个 TSQL (MSSQL) table,其中包含以下格式的记录
Id | Column1 | Column2 |
---|---|---|
1 | a/b/c | apple/banana/cucumber |
我想按以下格式拆分记录
Id | Column1 | Column2 |
---|---|---|
1 | a | apple |
1 | b | banana |
1 | c | cucumber |
Column1 和 Column2 使用“/”分隔符保持关系,并且以相同的顺序相互关联。
我试图在 CHARINDEX 和 SUBSTRING 的帮助下拆分列,但我无法维持两列之间的关系。
请尝试以下解决方案。
它基于 JSON,将从 SQL Server 2016 开始运行。
SQL
-- DDL and sample data population, start
DECLARE @tbl TABLE (ID INT, ColB varchar(8000), ColC varchar(8000));
INSERT INTO @tbl (ID, ColB, ColC) VALUES
(1,'a/b/c','apple/banana/cucumber');
-- DDL and sample data population, end

-- Splits two parallel delimited columns into rows, pairing items by position.
-- Each list is rewritten as a JSON array so that OPENJSON returns every item
-- together with its zero-based index in [key].
DECLARE @separator CHAR(1) = '/';
-- STRING_ESCAPE(..., 'json') escapes quotes, backslashes and control characters
-- that would otherwise make the generated array invalid JSON. It also turns '/'
-- into '\/', so the separator is escaped the same way before it is searched for.
DECLARE @jsonSeparator NVARCHAR(10) = STRING_ESCAPE(@separator, 'json');

WITH rs AS
(
    SELECT ID
    , ar1 = '["' + REPLACE(STRING_ESCAPE(ColB, 'json'), @jsonSeparator, '","') + '"]'
    , ar2 = '["' + REPLACE(STRING_ESCAPE(ColC, 'json'), @jsonSeparator, '","') + '"]'
    FROM @tbl
)
SELECT rs.ID, ColB.[value] AS [ColB], ColC.[value] AS ColC
FROM rs
CROSS APPLY OPENJSON (rs.ar1, N'$') AS ColB
CROSS APPLY OPENJSON (rs.ar2, N'$') AS ColC
-- Keep only the pairs that sit at the same array index; an item without a
-- partner at that index in the other list is not returned.
WHERE ColB.[key] = ColC.[key];
输出
+----+------+----------+
| ID | ColB | ColC |
+----+------+----------+
| 1 | a | apple |
| 1 | b | banana |
| 1 | c | cucumber |
+----+------+----------+
您可以添加一个函数来拆分字符串。
然后交叉应用到Column1和Column2的拆分部分。
-- Sample table: every row carries two '/'-delimited lists whose items
-- correspond to each other by position.
CREATE TABLE test
(
    Id      INT IDENTITY PRIMARY KEY,
    Column1 VARCHAR(30),
    Column2 VARCHAR(30)
);

INSERT INTO test (Column1, Column2)
VALUES
    ('a/b/c', 'apple/banana/cucumber'),
    ('d/e/f', 'orange/prune/onion');
(以下 UDF 复制自其他来源)
CREATE FUNCTION dbo.fnString_Split
(
    @str nvarchar(4000),
    @delim nchar(1)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
    -- Inline table-valued splitter: one row per item of @str, with its 1-based
    -- position in "ordinal". A recursive CTE walks the string one delimiter at
    -- a time; "pos" is the position of the delimiter that ended the current
    -- item (or LEN(@str) once no delimiter is left).
    WITH RCTE AS (
        -- Anchor: everything before the first delimiter, or the whole string
        -- when @str contains no delimiter.
        SELECT
              1 AS ordinal
            , CASE WHEN CHARINDEX(@delim, @str) > 0
                   THEN CHARINDEX(@delim, @str)
                   ELSE LEN(@str)
              END AS pos
            , LEFT(@str, CASE WHEN CHARINDEX(@delim, @str) > 0
                              THEN CHARINDEX(@delim, @str) - 1
                              ELSE LEN(@str)
                         END) AS value
        UNION ALL
        -- Recursive step: the item that starts right after the previous delimiter.
        SELECT
              ordinal + 1
            , CASE WHEN CHARINDEX(@delim, @str, pos + 1) > 0
                   THEN CHARINDEX(@delim, @str, pos + 1)
                   ELSE LEN(@str)
              END
            , SUBSTRING(@str, pos + 1,
                        CASE WHEN CHARINDEX(@delim, @str, pos + 1) > 0
                             THEN CHARINDEX(@delim, @str, pos + 1) - pos - 1
                             ELSE LEN(@str) - pos
                        END)
        FROM RCTE
        -- Stop once the last delimiter position reaches the end of the string.
        WHERE LEN(@str) > pos
    )
    SELECT ordinal, value
    FROM RCTE
);
-- Pair the n-th item of Column1 with the n-th item of Column2 for every row of test.
SELECT
    src.Id,
    pairs.Column1,
    pairs.Column2
FROM test AS src
CROSS APPLY
(
    SELECT
        left_items.ordinal,
        left_items.value  AS Column1,
        right_items.value AS Column2
    FROM dbo.fnString_Split(src.Column1, '/') AS left_items
    INNER JOIN dbo.fnString_Split(src.Column2, '/') AS right_items
        ON right_items.ordinal = left_items.ordinal
) AS pairs;
Id | Column1 | Column2 |
---|---|---|
1 | a | apple |
1 | b | banana |
1 | c | cucumber |
2 | d | orange |
2 | e | prune |
2 | f | onion |
演示 db<>fiddle here
- 首先创建下面的函数来拆分字符串。
- 然后,执行函数代码后面的代码。
-- Function Code
CREATE FUNCTION [dbo].[udf_SplitList]
(
      @InputString varchar(MAX)
    , @Separator   varchar(1)
)
RETURNS @ValuesList TABLE ( ID int IDENTITY(1,1), Value varchar(MAX))
AS
BEGIN
    -- Splits @InputString on @Separator and returns one row per item.
    --   ID    : 1-based position of the item in the list (rows are inserted in order)
    --   Value : the item with leading spaces removed (LTRIM)
    -- A NULL input or NULL separator returns no rows. A trailing separator
    -- produces a final empty item.
    DECLARE @Start bigint = 1;   -- first character of the item being read
    DECLARE @Pos   bigint;       -- position of the next separator, 0 when none is left

    IF @InputString IS NULL OR @Separator IS NULL
        RETURN;

    SET @Pos = CHARINDEX(@Separator, @InputString, @Start);

    -- Scan the string in place instead of rebuilding it on every iteration.
    -- No LEN() is involved, so a space separator and trailing spaces are safe.
    WHILE @Pos > 0
    BEGIN
        INSERT INTO @ValuesList (Value)
        VALUES (LTRIM(SUBSTRING(@InputString, @Start, @Pos - @Start)));

        SET @Start = @Pos + 1;
        SET @Pos = CHARINDEX(@Separator, @InputString, @Start);
    END

    -- Last item: everything after the final separator (or the whole string).
    INSERT INTO @ValuesList (Value)
    VALUES (LTRIM(SUBSTRING(@InputString, @Start, 2147483647)));

    RETURN
END
-- Execution Code
-- Demo: split both lists of every row and pair the items that share a position.
DECLARE @YourTable TABLE (ID int, CodeList varchar(MAX), ValueList varchar(MAX));

INSERT INTO @YourTable
VALUES ( 1, 'a/b/c', 'apple/banana/cucumber');

SELECT
      Paired.Code
    , Paired.Value
FROM @YourTable AS Src
CROSS APPLY
(
    SELECT
          Codes.Value AS Code
        , Vals.Value  AS Value
    FROM dbo.udf_SplitList(Src.CodeList , '/') AS Codes
    INNER JOIN dbo.udf_SplitList(Src.ValueList, '/') AS Vals
        ON Codes.ID = Vals.ID
) AS Paired
;
不使用函数,你可以使用 tally/numbers table 来做同样的事情,如下所示
查看工作演示 here
-- Splits column1 / column2 of table t on '/' without a helper function.
-- NOTE(review): table t (Id, column1, column2) is not defined in this snippet.
-- nums : candidate character positions 1..80 (strings longer than that are not covered).
-- X, Y : one row per delimiter found in column1 / column2 (a '/' is appended so the
--        last item is terminated too); e = delimiter position, b = start of the
--        item that ends there, r = running item number within the row.
; WITH nums AS
(
    SELECT 1 AS num
    UNION ALL
    SELECT num + 1 AS num
    FROM nums
    WHERE num < 80
),
X AS
(
    SELECT
        t.*,
        N.num AS e,
        ROW_NUMBER() OVER (PARTITION BY Id ORDER BY N.num) AS r,
        ISNULL(LAG(N.num) OVER (PARTITION BY Id ORDER BY N.num), 0) + 1 AS b
    FROM t
    LEFT JOIN nums AS N
        ON CHARINDEX('/', column1 + '/', N.num) = N.num
),
Y AS
(
    SELECT
        t.*,
        N.num AS e,
        ROW_NUMBER() OVER (PARTITION BY Id ORDER BY N.num) AS r,
        ISNULL(LAG(N.num) OVER (PARTITION BY Id ORDER BY N.num), 0) + 1 AS b
    FROM t
    LEFT JOIN nums AS N
        ON CHARINDEX('/', column2 + '/', N.num) = N.num
)
SELECT
    X.id,
    SUBSTRING(X.column1, X.b, X.e - X.b),
    SUBSTRING(Y.column2, Y.b, Y.e - Y.b)
FROM X
LEFT JOIN Y
    ON Y.id = X.id
   AND Y.r = X.r
我知道已经有一些答案,其中一个已被接受,但还有一些重要的性能因素需要考虑。如果值的数量总是三个(或者总是很少,比如少于 5 或 6 个),那么级联应用技术将是目前最快的。此解决方案假定始终有 3 个项目。它可以很容易地修改,以处理数量可变但较少的项目。
级联应用解决方案:
-- Cascading APPLY split: assumes every list holds exactly three '/'-separated items.
DECLARE @table TABLE
(
  SomeId INT IDENTITY,
  S1     VARCHAR(1000),
  S2     VARCHAR(1000)
);

INSERT @table (S1, S2) VALUES ('a/b/c','apple/banana/cucumber'),
('d/d2/f','dog/donkey/fish'),('x/y/z','x-ray/yo-yo/zeta');

SELECT
  SomeId = f.SomeId,
  Col1   = f2.C1,
  Col2   = f2.C2
FROM
(
  SELECT
    t.SomeId,
    SUBSTRING(t.S1, 1,         c1.P - 1),
    SUBSTRING(t.S1, c1.P + 1,  c2.P - c1.P - 1),
    SUBSTRING(t.S1, c2.P + 1,  8000),
    SUBSTRING(t.S2, 1,         c1.P2 - 1),
    SUBSTRING(t.S2, c1.P2 + 1, c2.P2 - c1.P2 - 1),
    SUBSTRING(t.S2, c2.P2 + 1, 8000)
  FROM @table AS t
  -- c1: position of the first delimiter in S1 (P) and S2 (P2)
  CROSS APPLY (VALUES(CHARINDEX('/',t.S1),CHARINDEX('/',t.S2))) AS c1(P,P2)
  -- c2: position of the second delimiter, searched from just after the first one
  CROSS APPLY (VALUES(CHARINDEX('/',t.S1,c1.P+1),CHARINDEX('/',t.S2,c1.P2+1))) AS c2(P,P2)
) AS f(SomeId,c1_1,c1_2,c1_3,c2_1,c2_2,c2_3)
-- Unpivot the three (S1 item, S2 item) pairs into three rows.
CROSS APPLY (VALUES (f.c1_1, f.c2_1), (f.c1_2, f.c2_2), (f.c1_3, f.c2_3)) AS f2(C1,C2);
如果您使用的是 SQL Azure,则您有 STRING_SPLIT 和 ordinal option。
-- STRING_SPLIT without an ordinal column: items are paired by their alphabetical
-- rank, so this is only correct when both lists are already in alphabetical
-- order and contain no duplicate items.
DECLARE @table TABLE
(
  SomeId INT IDENTITY,
  S1     VARCHAR(1000),
  S2     VARCHAR(1000)
);

INSERT @table (S1, S2) VALUES ('a/b/c','apple/banana/cucumber'),
('d/d2/f','dog/donkey/fish'),('x/y/z','x-ray/yo-yo/zeta');

-- Both APPLYs together produce every (S1 item, S2 item) combination per row.
-- The sort key is 0 only where the two items have the same rank, and
-- TOP(1) WITH TIES keeps every row that shares that smallest key.
SELECT TOP(1) WITH TIES
  t.SomeId, t.S1, t.S2, Col1 = split1.[value], Col2 = split2.[value]
FROM @table AS t
CROSS APPLY STRING_SPLIT(t.S1 ,'/') AS split1
CROSS APPLY STRING_SPLIT(t.S2 ,'/') AS split2
ORDER BY ABS(
  -- rank of the S2 item among the S2 items of this row
  ROW_NUMBER() OVER (PARTITION BY t.SomeId, split1.value ORDER BY split2.value) -
  -- rank of the S1 item among the S1 items of this row
  ROW_NUMBER() OVER (PARTITION BY t.SomeId, split2.value ORDER BY split1.value));
^^^ 这仅在项目按字母顺序排列时有效(不实用)但是,在 Azure 上,您可以将 ORDER BY 更改为:
-- Needs the ordinal column: call STRING_SPLIT(t.S1, '/', 1) and STRING_SPLIT(t.S2, '/', 1).
ORDER BY ABS(
  -- position of the S2 item within its list
  ROW_NUMBER() OVER (PARTITION BY t.SomeId, split1.value ORDER BY split2.ordinal) -
  -- position of the S1 item within its list
  ROW_NUMBER() OVER (PARTITION BY t.SomeId, split2.value ORDER BY split1.ordinal));
比较所有发布的技术
现在让我们比较迄今为止发布的所有解决方案,以了解 戏剧性 性能差异。我构建了一个基本测试工具并 运行 它,首先是 10 万行,然后是一百万行。解决方案包括:
- dbo.udf_SplitList 来自 Andy3B
- dbo.fnString_Split 来自 LukStorms
- 递归 CTE - Numbers/tally table DhruvJoshi 的解决方案
- DhruvJoshi 解决方案的改进版本,计算速度更快 table
- Yitzhak Khabinsky 的 JSON 解决方案
- 我的 Cascading APPLY 解决方案
dbo.udf_SplitList 解决方案最慢,一百万行需要 246 秒。标量函数慢得可怕,但递归让事情变得更糟。
dbo.fnString_Split 解决方案在 85 秒时效果更好。为了做得更好,我们需要丢失标量 udf。
DhruvJoshi 的递归 CTE 解决方案将我们缩短到 65 秒,提高了 50%。我通过重写数字 table 以不使用递归来改进他的解决方案;这让我们的速度又提高了 50%,减少到 45 秒。
Yitzhak Khabinsky 是第一个基于集合的解决方案;他正在利用 JSON。这里我们有 200%+ 的性能提升,减少到 16 秒。 Cascading APPLY 解决方案比 Yitzhak 的解决方案又提高了 400%+;三秒内 .
要当心标量 UDF、用递归来计数以及循环。基于集合的方案永远胜出。
-- Benchmark data: 100,000 rows. S1 is a fixed three-item list; S2 is the first
-- 18 characters of a NEWID() with '-' replaced by '/'.
IF OBJECT_ID('tempdb..#table') IS NOT NULL
    DROP TABLE #table;
GO

SELECT TOP (100000)
       ROW_NUMBER() OVER (ORDER BY (SELECT NULL))   AS SomeId,
       'A/B/C'                                      AS S1,
       REPLACE(LEFT(NEWID(), 18), '-', '/')         AS S2
INTO #table
-- The self cross join only serves as a row source large enough for TOP.
FROM sys.all_columns AS c1
CROSS JOIN sys.all_columns AS c2
GO
-- Timing batch: dbo.udf_SplitList. Results are assigned to variables, so no
-- result set is returned; the elapsed milliseconds are printed. "GO 3" runs
-- the batch three times.
PRINT CHAR(10)+'dbo.udf_SplitList -Andy3B'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);

SELECT
    @ID = src.SomeId,
    @C1 = pairs.Code,
    @C2 = pairs.Value
FROM #table AS src
CROSS APPLY
(
    SELECT
        codes.Value AS Code,
        vals.Value  AS Value
    FROM dbo.udf_SplitList(src.S1, '/') AS codes
    INNER JOIN dbo.udf_SplitList(src.S2, '/') AS vals
        ON vals.ID = codes.ID
) AS pairs;

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3

-- Timing batch: dbo.fnString_Split, measured the same way.
PRINT CHAR(10)+'dbo.fnString_Split - LukStorms'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);

SELECT
    @ID = src.SomeId,
    @C1 = pairs.Column1,
    @C2 = pairs.Column2
FROM #table AS src
CROSS APPLY
(
    SELECT
        left_items.ordinal,
        left_items.value  AS Column1,
        right_items.value AS Column2
    FROM dbo.fnString_Split(src.S1, '/') AS left_items
    INNER JOIN dbo.fnString_Split(src.S2, '/') AS right_items
        ON left_items.ordinal = right_items.ordinal
) AS pairs;

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3
-- Timing batch: tally split whose numbers 1..80 come from a recursive CTE.
PRINT CHAR(10)+'Recursive CTE - TALLY TABLE DhruvJoshi'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);

WITH nums AS
(
    SELECT 1 AS num
    UNION ALL
    SELECT num + 1 AS num
    FROM nums
    WHERE num < 80
),
-- X / Y: one row per delimiter of S1 / S2 (a '/' is appended so the last item
-- is terminated); e = delimiter position, b = item start, r = item number.
X AS
(
    SELECT
        t.*,
        N.num AS e,
        ROW_NUMBER() OVER (PARTITION BY t.SomeId ORDER BY N.num) AS r,
        ISNULL(LAG(N.num) OVER (PARTITION BY t.SomeId ORDER BY N.num), 0) + 1 AS b
    FROM #table AS t
    LEFT JOIN nums AS N
        ON CHARINDEX('/', t.S1 + '/', N.num) = N.num
),
Y AS
(
    SELECT
        t.*,
        N.num AS e,
        ROW_NUMBER() OVER (PARTITION BY t.SomeId ORDER BY N.num) AS r,
        ISNULL(LAG(N.num) OVER (PARTITION BY t.SomeId ORDER BY N.num), 0) + 1 AS b
    FROM #table AS t
    LEFT JOIN nums AS N
        ON CHARINDEX('/', t.S2 + '/', N.num) = N.num
)
SELECT
    @ID = X.SomeId,
    @C1 = SUBSTRING(X.S1, X.b, X.e - X.b),
    @C2 = SUBSTRING(Y.S2, Y.b, Y.e - Y.b)
FROM X
LEFT JOIN Y
    ON Y.SomeId = X.SomeId
   AND Y.r = X.r;

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3
-- Timing batch: tally split whose numbers 1..10,000 come from cross-joined
-- VALUES lists instead of a recursive CTE.
PRINT CHAR(10)+'TALLY TABLE without Recursion DhruvJoshi'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);
;with nums as
(
  -- 10 x 10 x 10 x 10 rows, numbered 1..10,000
  SELECT num = ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
  FROM      (VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS e1(x),
            (VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS e2(x),
            (VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS e3(x),
            (VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS e4(x)
)
select
  @ID = X.SomeId,
  @C1 = substring(X.S1,X.b,X.e-X.b),
  @C2 = substring(Y.S2,Y.b,Y.e-Y.b)
from
(
  select
    t.*,
    e=N.Num,
    r=row_number() over (partition by t.SomeId order by N.num),
    b=isnull(lag(N.num) over (partition by t.SomeId order by N.num),0)+1
  from #table AS t
  left join nums AS N
    -- The range limit must include the appended '/' (position LEN+1); otherwise
    -- the last item of every list is lost. It belongs in ON, not WHERE, so the
    -- LEFT JOIN is not turned into an inner join.
    on  N.num <= LEN(t.S1 + '/')
    and charindex('/',t.S1+'/',N.num)=N.num
) AS X
left join
(
  select
    t.*,
    e= N.Num,
    r=row_number() over (partition by t.SomeId order by N.num),
    b=isnull(lag(N.num) over (partition by t.SomeId order by N.num),0)+1
  from #table AS t
  left join nums AS N
    on  N.num <= LEN(t.S2 + '/')
    and charindex('/', t.S2+'/',N.num)=N.num
) AS Y
  ON X.SomeId = Y.SomeId and X.r=Y.r;
PRINT DATEDIFF(MS,@st,GETDATE())
GO 3
-- Timing batch: JSON split. Each list becomes a JSON array and OPENJSON items
-- are paired on their array index ([key]).
PRINT CHAR(10)+'JSON - Yitzhak Khabinsky'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);
DECLARE @separator CHAR(1) = '/';

WITH rs AS
(
    SELECT
        t.*,
        '["' + REPLACE(t.S1, @separator, '","') + '"]' AS ar1,
        '["' + REPLACE(t.S2, @separator, '","') + '"]' AS ar2
    FROM #table AS t
)
SELECT
    @id = rs.SomeID,
    @C1 = ColB.[value],
    @C2 = ColC.[value]
FROM rs AS rs
CROSS APPLY OPENJSON (rs.ar1, N'$') AS ColB
CROSS APPLY OPENJSON (rs.ar2, N'$') AS ColC
WHERE ColC.[key] = ColB.[key];

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3

-- Timing batch: cascading APPLY. Assumes exactly three items per list.
PRINT CHAR(10)+'CROSS APPLY TECHNIQUE'+CHAR(10)+REPLICATE('-',90);
GO
DECLARE @st DATETIME = GETDATE(), @ID INT, @C1 VARCHAR(1000), @C2 VARCHAR(1000);

SELECT
    @ID = f.SomeId,
    @C1 = f2.C1,
    @C2 = f2.C2
FROM
(
    SELECT
        t.SomeId,
        SUBSTRING(t.S1, 1,         c1.P - 1),
        SUBSTRING(t.S1, c1.P + 1,  c2.P - c1.P - 1),
        SUBSTRING(t.S1, c2.P + 1,  8000),
        SUBSTRING(t.S2, 1,         c1.P2 - 1),
        SUBSTRING(t.S2, c1.P2 + 1, c2.P2 - c1.P2 - 1),
        SUBSTRING(t.S2, c2.P2 + 1, 8000)
    FROM #table AS t
    -- c1 / c2: positions of the first / second delimiter in S1 (P) and S2 (P2)
    CROSS APPLY (VALUES (CHARINDEX('/', t.S1), CHARINDEX('/', t.S2))) AS c1(P, P2)
    CROSS APPLY (VALUES (CHARINDEX('/', t.S1, c1.P + 1), CHARINDEX('/', t.S2, c1.P2 + 1))) AS c2(P, P2)
    -- c3 (third delimiter) is computed but not referenced by the SELECT list
    CROSS APPLY (VALUES (CHARINDEX('/', t.S1, c2.P + 1), CHARINDEX('/', t.S2, c2.P2 + 1))) AS c3(P, P2)
) AS f(SomeId, c1_1, c1_2, c1_3, c2_1, c2_2, c2_3)
CROSS APPLY (VALUES (c1_1, c2_1), (c1_2, c2_2), (c1_3, c2_3)) AS f2(c1, c2);

PRINT DATEDIFF(MS,@st,GETDATE())
GO 3
100K行测试结果:
dbo.udf_SplitList -Andy3B
------------------------------------------------------------------------------------------
Beginning execution loop
24593
24566
24530
Batch execution completed 3 times.
dbo.fnString_Split - LukStorms
------------------------------------------------------------------------------------------
Beginning execution loop
8147
8260
8257
Batch execution completed 3 times.
Recursive CTE - TALLY TABLE DhruvJoshi
------------------------------------------------------------------------------------------
Beginning execution loop
6867
6733
6850
Batch execution completed 3 times.
TALLY TABLE without Recursion DhruvJoshi
------------------------------------------------------------------------------------------
Beginning execution loop
4677
4630
4620
Batch execution completed 3 times.
JSON - Yitzhak Khabinsky
------------------------------------------------------------------------------------------
Beginning execution loop
1667
1653
1670
Batch execution completed 3 times.
CROSS APPLY TECHNIQUE
------------------------------------------------------------------------------------------
Beginning execution loop
283
280
284
Batch execution completed 3 times.
100万行测试结果:
dbo.udf_SplitList -Andy3B
------------------------------------------------------------------------------------------
Beginning execution loop
246057
245296
247017
Batch execution completed 3 times.
dbo.fnString_Split - LukStorms
------------------------------------------------------------------------------------------
Beginning execution loop
85340
83010
83674
Batch execution completed 3 times.
Recursive CTE - TALLY TABLE -DhruvJoshi
------------------------------------------------------------------------------------------
Beginning execution loop
67226
64910
64740
Batch execution completed 3 times.
TALLY TABLE without Recursion DhruvJoshi
------------------------------------------------------------------------------------------
Beginning execution loop
46777
44630
44623
Batch execution completed 3 times.
JSON - Yitzhak Khabinsky
------------------------------------------------------------------------------------------
Beginning execution loop
16710
16830
16520
Batch execution completed 3 times.
CROSS APPLY TECHNIQUE
------------------------------------------------------------------------------------------
Beginning execution loop
2846
2793
2850
Batch execution completed 3 times.
Completion time: 2022-01-18T22:08:50.3264912-06:00