将 XML 数组展平为 Snowflake 中的一行
Flatten XML array to a single row in Snowflake
我有数千个 XML 文件,每个文件都有一个 key/value 对数组。像这样:
<extras>
<extra>
<key>key_name_1</key>
<value>value_1</value>
</extra>
<extra>
<key>key_name_2</key>
<value>value_2</value>
</extra>
</extras>
我需要编写一个查询,为每个 XML 文件生成一行结果,并把每个键对应的值放在以该键命名的单独列中。在 MySQL 中,我可以将 extractvalue 与 XPath ('extras/extra[key="key_name_1"]/value') 结合使用,但在 Snowflake 中我找不到对应的解决方案。
我试过横向展平然后从这个数组中获取值,但我没有成功。这可能很简单,但我找不到解决方案。
---更新---
我总是可以使用这样的东西:
XMLGET(XMLGET("xml_data", 'extras'):"$"[0], 'value'):"$"
不幸的是,并非每个文件都包含所有的键,因此这种按位置取值的方法不可用。
难道我必须为此做一个UDF?
--- 更新 2 ---
按照以下方式工作:
-- Builds one row per source record, exposing the values of two known keys
-- as separate columns.
-- `table` and "xml_data" are the placeholder names used in the question;
-- substitute the real table and column names.
WITH Extra_1 AS (
    -- One row per record containing an <extra> whose <key> is 'key_name_1'.
    SELECT
        src."id" AS "id",
        XMLGET(extra.value, 'value'):"$" AS "value"
    FROM table AS src,
        -- TO_ARRAY keeps FLATTEN consistent when <extras> has a single child
        -- (in that case "$" is one object instead of an array).
        LATERAL FLATTEN(INPUT => TO_ARRAY(XMLGET(src."xml_data", 'extras'):"$")) extra
    WHERE XMLGET(extra.value, 'key'):"$" = 'key_name_1'
),
Extra_2 AS (
    -- One row per record containing an <extra> whose <key> is 'key_name_2'.
    SELECT
        src."id" AS "id",
        XMLGET(extra.value, 'value'):"$" AS "value"
    FROM table AS src,
        LATERAL FLATTEN(INPUT => TO_ARRAY(XMLGET(src."xml_data", 'extras'):"$")) extra
    WHERE XMLGET(extra.value, 'key'):"$" = 'key_name_2'
)
SELECT
    src."id",
    Extra_1."value" AS "key_name_1",
    Extra_2."value" AS "key_name_2"
FROM table AS src
-- LEFT JOIN: a record that lacks one of the keys still yields a row,
-- with NULL in the corresponding column.
LEFT JOIN Extra_1 ON Extra_1."id" = src."id"
LEFT JOIN Extra_2 ON Extra_2."id" = src."id";
希望有人有更简单的方法来做到这一点。
这就是在 MySQL 中可以轻松完成的事情:
-- MySQL version: each ExtractValue() call evaluates an XPath expression that
-- selects the <value> of the <extra> element whose <key> matches the name.
-- `table` is the placeholder table name used in the question.
SELECT
    src.id,
    ExtractValue(src.xml_data, '/extras/extra[key="key_name_1"]/value') AS key_name_1,
    ExtractValue(src.xml_data, '/extras/extra[key="key_name_2"]/value') AS key_name_2
FROM table AS src;
使用 FLATTEN 和 XMLGET:
-- Sample data: two records, each holding a parsed <extras> document with a
-- different number of <extra> key/value pairs (two and three respectively).
CREATE OR REPLACE TABLE t
AS
SELECT 1 AS id, PARSE_XML('<extras>
<extra>
<key>key_name_1</key>
<value>value_1</value>
</extra>
<extra>
<key>key_name_2</key>
<value>value_2</value>
</extra>
</extras>') AS col_xml
UNION ALL
-- The result column names come from the first branch; the alias below is
-- kept identical (col_xml) so both branches read consistently.
SELECT 2 AS id, PARSE_XML('<extras>
<extra>
<key>key_name_3</key>
<value>value_3</value>
</extra>
<extra>
<key>key_name_4</key>
<value>value_4</value>
</extra>
<extra>
<key>key_name_5</key>
<value>value_5</value>
</extra>
</extras>') AS col_xml;
查询:
-- Returns one row per <extra> element: the record id, the text of its <key>
-- child (k) and the text of its <value> child (val).
SELECT
    t.id,
    XMLGET(s.value, 'key'):"$"::STRING AS k,
    XMLGET(s.value, 'value'):"$"::STRING AS val
FROM t,
    -- TO_ARRAY: when <extras> has exactly one child, "$" is a single object
    -- rather than an array; forcing an array makes FLATTEN emit one row per
    -- child element in both cases.
    LATERAL FLATTEN(INPUT => TO_ARRAY(t.col_xml:"$")) s
-- '@' holds the element's tag name; keep only <extra> children.
WHERE GET(s.value, '@')::STRING = 'extra';
输出:
单行:
-- Returns one row per record with the first three <extra> pairs spread into
-- key1/value1 .. key3/value3. A record with fewer than three pairs yields
-- NULL for the missing ones.
-- The <key>/<value> children are looked up by tag name, so the result does
-- not depend on their order inside <extra>.
SELECT
    t.id,
    XMLGET(XMLGET(t.col_xml, 'extra', 0), 'key'):"$"::STRING   AS key1,
    XMLGET(XMLGET(t.col_xml, 'extra', 0), 'value'):"$"::STRING AS value1,
    XMLGET(XMLGET(t.col_xml, 'extra', 1), 'key'):"$"::STRING   AS key2,
    XMLGET(XMLGET(t.col_xml, 'extra', 1), 'value'):"$"::STRING AS value2,
    -- Third pair: distinct aliases so the output has no duplicate column names.
    XMLGET(XMLGET(t.col_xml, 'extra', 2), 'key'):"$"::STRING   AS key3,
    XMLGET(XMLGET(t.col_xml, 'extra', 2), 'value'):"$"::STRING AS value3
FROM t;
如果您只有 2 个值想要提取,您可以将 Lukasz 的答案与 CASE 合并:
-- Extracts two known keys into columns by inspecting only the first two
-- <extra> elements of each document.
WITH fake_data AS (
    -- Inline sample rows: (id, XML text parsed into a variant).
    SELECT
        column1 AS id,
        PARSE_XML(column2) AS col_xml
    FROM VALUES
        (1, '<extras><extra><key>key_name_1</key><value>value_1</value></extra>
<extra><key>key_name_2</key><value>value_2</value></extra>
</extras>'),
        (2, '<extras><extra><key>key_name_1</key><value>value_3</value></extra>
</extras>'),
        (3, '<extras><extra><key>key_name_2</key><value>value_4</value></extra>
</extras>')
),
first_two_pairs AS (
    -- Children of the first and second <extra> element; per element,
    -- index 0 is read as the <key> child and index 1 as the <value> child.
    SELECT
        fd.id,
        fd.col_xml,
        GET(XMLGET(fd.col_xml, 'extra', 0), '$') AS pair_0,
        GET(XMLGET(fd.col_xml, 'extra', 1), '$') AS pair_1
    FROM fake_data AS fd
)
SELECT
    p.id,
    p.col_xml,
    -- Take the value from whichever of the two pairs carries the wanted key.
    CASE
        WHEN p.pair_0[0]:"$" = 'key_name_1' THEN p.pair_0[1]:"$"::STRING
        WHEN p.pair_1[0]:"$" = 'key_name_1' THEN p.pair_1[1]:"$"::STRING
    END AS key_name_1,
    CASE
        WHEN p.pair_0[0]:"$" = 'key_name_2' THEN p.pair_0[1]:"$"::STRING
        WHEN p.pair_1[0]:"$" = 'key_name_2' THEN p.pair_1[1]:"$"::STRING
    END AS key_name_2
FROM first_two_pairs AS p;
给出:
ID | COL_XML | KEY_NAME_1 | KEY_NAME_2 |
---|---|---|---|
1 | key_name_1 value_1 key_name_2 value_2 | value_1 | value_2 |
2 | key_name_1 value_3 | value_3 | |
3 | key_name_2 value_4 | | value_4 |
如果您有“大量列”,则先将数据展平,然后对其做透视(PIVOT,或者使用其他某种 GROUP BY 方式):
-- Flattens every <extra> element into (id, key, value) rows, then pivots the
-- listed keys into columns, one output row per id.
WITH fake_data AS (
    -- Inline sample rows: (id, XML text parsed into a variant).
    SELECT
        column1 AS id,
        PARSE_XML(column2) AS col_xml
    FROM VALUES
        (1, '<extras><extra><key>key_name_1</key><value>value_1</value></extra>
<extra><key>key_name_2</key><value>value_2</value></extra>
</extras>'),
        (2, '<extras><extra><key>key_name_1</key><value>value_3</value></extra>
</extras>'),
        (3, '<extras><extra><key>key_name_2</key><value>value_4</value></extra>
</extras>')
)
SELECT *
FROM (
    SELECT
        src.id,
        XMLGET(pair.value, 'key'):"$"::TEXT AS key,
        XMLGET(pair.value, 'value'):"$"::TEXT AS value
    FROM fake_data AS src,
        -- TO_ARRAY forces an array even when <extras> has a single child.
        TABLE(FLATTEN(INPUT => TO_ARRAY(src.col_xml:"$"))) pair
) d
PIVOT (MIN(value) FOR key IN ('key_name_1', 'key_name_2')) AS p
ORDER BY id;
给出:
ID | 'key_name_1' | 'key_name_2' |
---|---|---|
1 | value_1 | value_2 |
2 | value_3 | |
3 | | value_4 |
关于此解决方案的说明:FLATTEN 的输入通过 TO_ARRAY 被强制转换为数组。这样,即使 XML 只有一个子元素(此时 "$" 是单个对象而不是数组),它也会被当作只含一个元素的数组处理,从而保证 FLATTEN 的行为一致。
我有数千个 XML 文件,每个文件都有一个 key/value 对数组。像这样:
<extras>
<extra>
<key>key_name_1</key>
<value>value_1</value>
</extra>
<extra>
<key>key_name_2</key>
<value>value_2</value>
</extra>
</extras>
我需要编写一个查询,为每个 XML 文件生成一行结果,并把每个键对应的值放在以该键命名的单独列中。在 MySQL 中,我可以将 extractvalue 与 XPath ('extras/extra[key="key_name_1"]/value') 结合使用,但在 Snowflake 中我找不到对应的解决方案。
我试过横向展平然后从这个数组中获取值,但我没有成功。这可能很简单,但我找不到解决方案。
---更新---
我总是可以使用这样的东西:
XMLGET(XMLGET("xml_data", 'extras'):"$"[0], 'value'):"$"
不幸的是,并非每个文件都包含所有的键,因此这种按位置取值的方法不可用。
难道我必须为此做一个UDF?
--- 更新 2 ---
按照以下方式工作:
-- Builds one row per source record, exposing the values of two known keys
-- as separate columns.
-- `table` and "xml_data" are the placeholder names used in the question;
-- substitute the real table and column names.
WITH Extra_1 AS (
    -- One row per record containing an <extra> whose <key> is 'key_name_1'.
    SELECT
        src."id" AS "id",
        XMLGET(extra.value, 'value'):"$" AS "value"
    FROM table AS src,
        -- TO_ARRAY keeps FLATTEN consistent when <extras> has a single child
        -- (in that case "$" is one object instead of an array).
        LATERAL FLATTEN(INPUT => TO_ARRAY(XMLGET(src."xml_data", 'extras'):"$")) extra
    WHERE XMLGET(extra.value, 'key'):"$" = 'key_name_1'
),
Extra_2 AS (
    -- One row per record containing an <extra> whose <key> is 'key_name_2'.
    SELECT
        src."id" AS "id",
        XMLGET(extra.value, 'value'):"$" AS "value"
    FROM table AS src,
        LATERAL FLATTEN(INPUT => TO_ARRAY(XMLGET(src."xml_data", 'extras'):"$")) extra
    WHERE XMLGET(extra.value, 'key'):"$" = 'key_name_2'
)
SELECT
    src."id",
    Extra_1."value" AS "key_name_1",
    Extra_2."value" AS "key_name_2"
FROM table AS src
-- LEFT JOIN: a record that lacks one of the keys still yields a row,
-- with NULL in the corresponding column.
LEFT JOIN Extra_1 ON Extra_1."id" = src."id"
LEFT JOIN Extra_2 ON Extra_2."id" = src."id";
希望有人有更简单的方法来做到这一点。
这就是在 MySQL 中可以轻松完成的事情:
-- MySQL version: each ExtractValue() call evaluates an XPath expression that
-- selects the <value> of the <extra> element whose <key> matches the name.
-- `table` is the placeholder table name used in the question.
SELECT
    src.id,
    ExtractValue(src.xml_data, '/extras/extra[key="key_name_1"]/value') AS key_name_1,
    ExtractValue(src.xml_data, '/extras/extra[key="key_name_2"]/value') AS key_name_2
FROM table AS src;
使用 FLATTEN 和 XMLGET:
-- Sample data: two records, each holding a parsed <extras> document with a
-- different number of <extra> key/value pairs (two and three respectively).
CREATE OR REPLACE TABLE t
AS
SELECT 1 AS id, PARSE_XML('<extras>
<extra>
<key>key_name_1</key>
<value>value_1</value>
</extra>
<extra>
<key>key_name_2</key>
<value>value_2</value>
</extra>
</extras>') AS col_xml
UNION ALL
-- The result column names come from the first branch; the alias below is
-- kept identical (col_xml) so both branches read consistently.
SELECT 2 AS id, PARSE_XML('<extras>
<extra>
<key>key_name_3</key>
<value>value_3</value>
</extra>
<extra>
<key>key_name_4</key>
<value>value_4</value>
</extra>
<extra>
<key>key_name_5</key>
<value>value_5</value>
</extra>
</extras>') AS col_xml;
查询:
-- Returns one row per <extra> element: the record id, the text of its <key>
-- child (k) and the text of its <value> child (val).
SELECT
    t.id,
    XMLGET(s.value, 'key'):"$"::STRING AS k,
    XMLGET(s.value, 'value'):"$"::STRING AS val
FROM t,
    -- TO_ARRAY: when <extras> has exactly one child, "$" is a single object
    -- rather than an array; forcing an array makes FLATTEN emit one row per
    -- child element in both cases.
    LATERAL FLATTEN(INPUT => TO_ARRAY(t.col_xml:"$")) s
-- '@' holds the element's tag name; keep only <extra> children.
WHERE GET(s.value, '@')::STRING = 'extra';
输出:
单行:
-- Returns one row per record with the first three <extra> pairs spread into
-- key1/value1 .. key3/value3. A record with fewer than three pairs yields
-- NULL for the missing ones.
-- The <key>/<value> children are looked up by tag name, so the result does
-- not depend on their order inside <extra>.
SELECT
    t.id,
    XMLGET(XMLGET(t.col_xml, 'extra', 0), 'key'):"$"::STRING   AS key1,
    XMLGET(XMLGET(t.col_xml, 'extra', 0), 'value'):"$"::STRING AS value1,
    XMLGET(XMLGET(t.col_xml, 'extra', 1), 'key'):"$"::STRING   AS key2,
    XMLGET(XMLGET(t.col_xml, 'extra', 1), 'value'):"$"::STRING AS value2,
    -- Third pair: distinct aliases so the output has no duplicate column names.
    XMLGET(XMLGET(t.col_xml, 'extra', 2), 'key'):"$"::STRING   AS key3,
    XMLGET(XMLGET(t.col_xml, 'extra', 2), 'value'):"$"::STRING AS value3
FROM t;
如果您只有 2 个值想要提取,您可以将 Lukasz 的答案与 CASE 合并:
-- Extracts two known keys into columns by inspecting only the first two
-- <extra> elements of each document.
WITH fake_data AS (
    -- Inline sample rows: (id, XML text parsed into a variant).
    SELECT
        column1 AS id,
        PARSE_XML(column2) AS col_xml
    FROM VALUES
        (1, '<extras><extra><key>key_name_1</key><value>value_1</value></extra>
<extra><key>key_name_2</key><value>value_2</value></extra>
</extras>'),
        (2, '<extras><extra><key>key_name_1</key><value>value_3</value></extra>
</extras>'),
        (3, '<extras><extra><key>key_name_2</key><value>value_4</value></extra>
</extras>')
),
first_two_pairs AS (
    -- Children of the first and second <extra> element; per element,
    -- index 0 is read as the <key> child and index 1 as the <value> child.
    SELECT
        fd.id,
        fd.col_xml,
        GET(XMLGET(fd.col_xml, 'extra', 0), '$') AS pair_0,
        GET(XMLGET(fd.col_xml, 'extra', 1), '$') AS pair_1
    FROM fake_data AS fd
)
SELECT
    p.id,
    p.col_xml,
    -- Take the value from whichever of the two pairs carries the wanted key.
    CASE
        WHEN p.pair_0[0]:"$" = 'key_name_1' THEN p.pair_0[1]:"$"::STRING
        WHEN p.pair_1[0]:"$" = 'key_name_1' THEN p.pair_1[1]:"$"::STRING
    END AS key_name_1,
    CASE
        WHEN p.pair_0[0]:"$" = 'key_name_2' THEN p.pair_0[1]:"$"::STRING
        WHEN p.pair_1[0]:"$" = 'key_name_2' THEN p.pair_1[1]:"$"::STRING
    END AS key_name_2
FROM first_two_pairs AS p;
给出:
ID | COL_XML | KEY_NAME_1 | KEY_NAME_2 |
---|---|---|---|
1 | key_name_1 value_1 key_name_2 value_2 | value_1 | value_2 |
2 | key_name_1 value_3 | value_3 | |
3 | key_name_2 value_4 | | value_4 |
如果您有“大量列”,则先将数据展平,然后对其做透视(PIVOT,或者使用其他某种 GROUP BY 方式):
-- Flattens every <extra> element into (id, key, value) rows, then pivots the
-- listed keys into columns, one output row per id.
WITH fake_data AS (
    -- Inline sample rows: (id, XML text parsed into a variant).
    SELECT
        column1 AS id,
        PARSE_XML(column2) AS col_xml
    FROM VALUES
        (1, '<extras><extra><key>key_name_1</key><value>value_1</value></extra>
<extra><key>key_name_2</key><value>value_2</value></extra>
</extras>'),
        (2, '<extras><extra><key>key_name_1</key><value>value_3</value></extra>
</extras>'),
        (3, '<extras><extra><key>key_name_2</key><value>value_4</value></extra>
</extras>')
)
SELECT *
FROM (
    SELECT
        src.id,
        XMLGET(pair.value, 'key'):"$"::TEXT AS key,
        XMLGET(pair.value, 'value'):"$"::TEXT AS value
    FROM fake_data AS src,
        -- TO_ARRAY forces an array even when <extras> has a single child.
        TABLE(FLATTEN(INPUT => TO_ARRAY(src.col_xml:"$"))) pair
) d
PIVOT (MIN(value) FOR key IN ('key_name_1', 'key_name_2')) AS p
ORDER BY id;
给出:
ID | 'key_name_1' | 'key_name_2' |
---|---|---|
1 | value_1 | value_2 |
2 | value_3 | |
3 | | value_4 |
关于此解决方案的说明:FLATTEN 的输入通过 TO_ARRAY 被强制转换为数组。这样,即使 XML 只有一个子元素(此时 "$" 是单个对象而不是数组),它也会被当作只含一个元素的数组处理,从而保证 FLATTEN 的行为一致。