在 Presto 中编写查询,从深度嵌套的 JSON 数组对象中提取 id
Query to extract ids from a deeply nested json array object in Presto
我正在使用 Presto,并尝试从如下所示的嵌套 JSON 结构中提取所有 'source' 为 'dd' 的元素的 'id'。
{
"results": [
{
"docs": [
{
"id": "apple1",
"source": "dd"
},
{
"id": "apple2",
"source": "aa"
},
{
"id": "apple3",
"source": "dd"
}
],
"group": 99806
}
]
}
希望将 ID [apple1, apple3] 提取到 Presto 的一列中。
我想知道在 Presto 查询中实现此目的的正确方法是什么?
如果您的数据具有您发布的示例中那样的规则结构,您可以组合使用以下步骤:先将该值解析为 JSON,再将其转换为结构化的 SQL 类型(array/map/row),然后使用数组处理函数进行过滤和转换,从而提取您想要的元素:
-- Collect the ids of every doc whose source is 'dd' from a nested JSON document.
-- Stage 1: the raw JSON text, supplied inline as a single-row relation.
WITH raw_input(payload) AS (
    VALUES '{
"results": [
{
"docs": [
{
"id": "apple1",
"source": "dd"
},
{
"id": "apple2",
"source": "aa"
},
{
"id": "apple3",
"source": "dd"
}
],
"group": 99806
}
]
}'
),
-- Stage 2: parse the text and cast it to a typed row so fields can be dereferenced by name.
typed(doc_tree) AS (
    SELECT
        CAST(
            json_parse(payload) AS ROW(
                results ARRAY(
                    ROW(
                        docs ARRAY(ROW(id VARCHAR, source VARCHAR)),
                        "group" BIGINT
                    )
                )
            )
        )
    FROM raw_input
),
-- Stage 3: pull the docs array out of each result and merge them into one array.
all_docs(flat_docs) AS (
    SELECT flatten(transform(doc_tree.results, res -> res.docs))
    FROM typed
)
-- Stage 4: keep only the docs with source = 'dd', then project each one to its id.
SELECT
    transform(
        filter(flat_docs, d -> d.source = 'dd'),
        d -> d.id
    )
FROM all_docs
上面的查询产生:
_col0
------------------
[apple1, apple3]
(1 row)
我正在使用 Presto,并尝试从如下所示的嵌套 JSON 结构中提取所有 'source' 为 'dd' 的元素的 'id'。
{
"results": [
{
"docs": [
{
"id": "apple1",
"source": "dd"
},
{
"id": "apple2",
"source": "aa"
},
{
"id": "apple3",
"source": "dd"
}
],
"group": 99806
}
]
}
希望将 ID [apple1, apple3] 提取到 Presto 的一列中。我想知道在 Presto 查询中实现此目的的正确方法是什么?
如果您的数据具有您发布的示例中那样的规则结构,您可以组合使用以下步骤:先将该值解析为 JSON,再将其转换为结构化的 SQL 类型(array/map/row),然后使用数组处理函数进行过滤和转换,从而提取您想要的元素:
-- Collect the ids of every doc whose source is 'dd' from a nested JSON document.
-- Stage 1: the raw JSON text, supplied inline as a single-row relation.
WITH raw_input(payload) AS (
    VALUES '{
"results": [
{
"docs": [
{
"id": "apple1",
"source": "dd"
},
{
"id": "apple2",
"source": "aa"
},
{
"id": "apple3",
"source": "dd"
}
],
"group": 99806
}
]
}'
),
-- Stage 2: parse the text and cast it to a typed row so fields can be dereferenced by name.
typed(doc_tree) AS (
    SELECT
        CAST(
            json_parse(payload) AS ROW(
                results ARRAY(
                    ROW(
                        docs ARRAY(ROW(id VARCHAR, source VARCHAR)),
                        "group" BIGINT
                    )
                )
            )
        )
    FROM raw_input
),
-- Stage 3: pull the docs array out of each result and merge them into one array.
all_docs(flat_docs) AS (
    SELECT flatten(transform(doc_tree.results, res -> res.docs))
    FROM typed
)
-- Stage 4: keep only the docs with source = 'dd', then project each one to its id.
SELECT
    transform(
        filter(flat_docs, d -> d.source = 'dd'),
        d -> d.id
    )
FROM all_docs
上面的查询产生:
_col0
------------------
[apple1, apple3]
(1 row)