在使用 REGEXP 删除结果中的 + 时需要指导
Need guidance in removing + in the result using REGEXP
-- Per-ID, comma-separated list of the distinct COL_A values for one day;
-- the aggregated string is lower-cased only after LISTAGG has run.
SELECT
    ID,
    LOWER(
        LISTAGG(DISTINCT COL_A, ',') WITHIN GROUP (ORDER BY COL_A)
    ) AS COL_1
FROM table_1
WHERE date = '2022-02-02'
GROUP BY ID
ID | COL_1 |
---|---|
12345 | abc,+bda,+beach,relax |
23456 | unknown_user,+unknown_member,+others_to_denote |
当我运行上面的查询时,得到了如上所示的结果。我希望从结果中删除 + 符号。这种情况下可以使用 REGEXP 吗?
如果你只想删除+,那么你可以使用REPLACE
-- Two-argument REPLACE removes every '+' from the already aggregated
-- string; the sample rows are supplied inline through VALUES.
SELECT
    ID,
    COL_1,
    replace(col_1,'+')
FROM VALUES
    ('12345', 'abc,+bda,+beach,relax'),
    ('23456', 'unknown_user,+unknown_member,+others_to_denote')
    AS tab (ID, COL_1)
不过,您应该在 LISTAGG 之前先进行清理,因为按目前的写法,输出中可能会多次出现 abc:
-- Side-by-side comparison for a single ID:
--   COL_1_a  lower-cases each value before DISTINCT is applied,
--   COL_1_b  lower-cases the finished list, i.e. after DISTINCT,
--   the unnamed last column strips '+' from COL_1_b.
SELECT
    column1 AS ID,
    LISTAGG(DISTINCT LOWER(column2), ',') WITHIN GROUP (ORDER BY LOWER(column2)) AS COL_1_a,
    LOWER(LISTAGG(DISTINCT column2, ',') WITHIN GROUP (ORDER BY column2)) AS COL_1_b,
    replace(COL_1_b,'+')
FROM VALUES
    (12345, 'abc'),
    (12345, 'ABC'),
    (12345, '+ABC'),
    (12345, '+ABc'),
    (12345, '+AbC'),
    (12345, '+bda')
GROUP BY ID;
给出:
ID | COL_1_A | COL_1_B | REPLACE(COL_1_B,'+') |
---|---|---|---|
12345 | +abc,+bda,abc | +abc,+abc,+abc,+bda,abc,abc | abc,abc,abc,bda,abc,abc |
因此,先加上一层“清理”:
-- Normalise each value first (lower-case, '+' removed), then aggregate,
-- so that DISTINCT operates on the cleaned values.
WITH cleaned AS (
    SELECT
        column1 AS id,
        REPLACE(LOWER(column2), '+') AS col2_cleaned
    FROM (
        VALUES
            (12345, 'abc'),
            (12345, 'ABC'),
            (12345, '+ABC'),
            (12345, '+ABc'),
            (12345, '+AbC'),
            (12345, '+bda')
    )
)
SELECT
    ID,
    LISTAGG(DISTINCT col2_cleaned, ',') WITHIN GROUP (ORDER BY col2_cleaned) AS COL_1
FROM cleaned
GROUP BY ID;
这样会得到更合理的结果:
ID | COL_1 |
---|---|
12345 | abc,bda |
其他清理要点:
您真的要删除所有的 + 符号吗?有时我们只想删除开头或结尾的符号,此时 REPLACE 可能会删除过多:
-- Compares four ways of stripping '+' from a lower-cased value:
-- everywhere (REPLACE), leading only (LTRIM), trailing only (RTRIM),
-- and both ends (TRIM).
SELECT
    column1                      AS orig,
    REPLACE(LOWER(column1), '+') AS all_cleaned,
    LTRIM(LOWER(column1), '+')   AS lt_cleaned,
    RTRIM(LOWER(column1), '+')   AS rt_cleaned,
    TRIM(LOWER(column1), '+')    AS t_cleaned
FROM VALUES
    ('abc'),
    ('ABC'),
    ('+A+BC'),
    ('+AB+c+'),
    ('+AbC+')
;
ORIG | ALL_CLEANED | LT_CLEANED | RT_CLEANED | T_CLEANED |
---|---|---|---|---|
abc | abc | abc | abc | abc |
ABC | abc | abc | abc | abc |
+A+BC | abc | a+bc | +a+bc | a+bc |
+AB+c+ | abc | ab+c+ | +ab+c | ab+c |
+AbC+ | abc | abc+ | +abc | abc |
但是,如果您只想从开头删除最多 2 个 +(不能更多),也可以使用 REGEXP_REPLACE:
-- Strip at most two '+' characters, and only from the start of the value:
--   ^   anchors the match to the beginning, so a '+' further in is untouched;
--   \\  the backslash is doubled because a single-quoted string constant
--       consumes one level of escaping before the regex engine sees it.
,regexp_replace(lower(column1), '^\\+{1,2}', '')
-- Per-ID, comma-separated list of the distinct COL_A values for one day;
-- the aggregated string is lower-cased only after LISTAGG has run.
SELECT
    ID,
    LOWER(
        LISTAGG(DISTINCT COL_A, ',') WITHIN GROUP (ORDER BY COL_A)
    ) AS COL_1
FROM table_1
WHERE date = '2022-02-02'
GROUP BY ID
ID | COL_1 |
---|---|
12345 | abc,+bda,+beach,relax |
23456 | unknown_user,+unknown_member,+others_to_denote |
当我运行上面的查询时,得到了如上所示的结果。我希望从结果中删除 + 符号。这种情况下可以使用 REGEXP 吗?
如果你只想删除+,那么你可以使用REPLACE
-- Two-argument REPLACE removes every '+' from the already aggregated
-- string; the sample rows are supplied inline through VALUES.
SELECT
    ID,
    COL_1,
    replace(col_1,'+')
FROM VALUES
    ('12345', 'abc,+bda,+beach,relax'),
    ('23456', 'unknown_user,+unknown_member,+others_to_denote')
    AS tab (ID, COL_1)
不过,您应该在 LISTAGG 之前先进行清理,因为按目前的写法,输出中可能会多次出现 abc:
-- Side-by-side comparison for a single ID:
--   COL_1_a  lower-cases each value before DISTINCT is applied,
--   COL_1_b  lower-cases the finished list, i.e. after DISTINCT,
--   the unnamed last column strips '+' from COL_1_b.
SELECT
    column1 AS ID,
    LISTAGG(DISTINCT LOWER(column2), ',') WITHIN GROUP (ORDER BY LOWER(column2)) AS COL_1_a,
    LOWER(LISTAGG(DISTINCT column2, ',') WITHIN GROUP (ORDER BY column2)) AS COL_1_b,
    replace(COL_1_b,'+')
FROM VALUES
    (12345, 'abc'),
    (12345, 'ABC'),
    (12345, '+ABC'),
    (12345, '+ABc'),
    (12345, '+AbC'),
    (12345, '+bda')
GROUP BY ID;
给出:
ID | COL_1_A | COL_1_B | REPLACE(COL_1_B,'+') |
---|---|---|---|
12345 | +abc,+bda,abc | +abc,+abc,+abc,+bda,abc,abc | abc,abc,abc,bda,abc,abc |
因此,先加上一层“清理”:
-- Normalise each value first (lower-case, '+' removed), then aggregate,
-- so that DISTINCT operates on the cleaned values.
WITH cleaned AS (
    SELECT
        column1 AS id,
        REPLACE(LOWER(column2), '+') AS col2_cleaned
    FROM (
        VALUES
            (12345, 'abc'),
            (12345, 'ABC'),
            (12345, '+ABC'),
            (12345, '+ABc'),
            (12345, '+AbC'),
            (12345, '+bda')
    )
)
SELECT
    ID,
    LISTAGG(DISTINCT col2_cleaned, ',') WITHIN GROUP (ORDER BY col2_cleaned) AS COL_1
FROM cleaned
GROUP BY ID;
这样会得到更合理的结果:
ID | COL_1 |
---|---|
12345 | abc,bda |
其他清理要点:
您真的要删除所有的 + 符号吗?有时我们只想删除开头或结尾的符号,此时 REPLACE 可能会删除过多:
-- Compares four ways of stripping '+' from a lower-cased value:
-- everywhere (REPLACE), leading only (LTRIM), trailing only (RTRIM),
-- and both ends (TRIM).
SELECT
    column1                      AS orig,
    REPLACE(LOWER(column1), '+') AS all_cleaned,
    LTRIM(LOWER(column1), '+')   AS lt_cleaned,
    RTRIM(LOWER(column1), '+')   AS rt_cleaned,
    TRIM(LOWER(column1), '+')    AS t_cleaned
FROM VALUES
    ('abc'),
    ('ABC'),
    ('+A+BC'),
    ('+AB+c+'),
    ('+AbC+')
;
ORIG | ALL_CLEANED | LT_CLEANED | RT_CLEANED | T_CLEANED |
---|---|---|---|---|
abc | abc | abc | abc | abc |
ABC | abc | abc | abc | abc |
+A+BC | abc | a+bc | +a+bc | a+bc |
+AB+c+ | abc | ab+c+ | +ab+c | ab+c |
+AbC+ | abc | abc+ | +abc | abc |
但是,如果您只想从开头删除最多 2 个 +(不能更多),也可以使用 REGEXP_REPLACE:
-- Strip at most two '+' characters, and only from the start of the value:
--   ^   anchors the match to the beginning, so a '+' further in is untouched;
--   \\  the backslash is doubled because a single-quoted string constant
--       consumes one level of escaping before the regex engine sees it.
,regexp_replace(lower(column1), '^\\+{1,2}', '')