使用 Hive SQL 提取不同字符之间的字符串
Extracting strings between distinct characters using Hive SQL
我有一个名为 geo_data_display 的字段,其中包含国家、地区和 DMA。这 3 个值位于 = 和 & 字符之间:国家位于第一个“=”和第一个“&”之间,地区位于第二个“=”和第二个“&”之间,DMA 位于第三个“=”和第三个“&”之间。下面是该表的可重现版本。country 始终是字符,但 region 和 DMA 可以是数字或字符,并且并非所有国家都有 DMA。
一些示例值是:
country=us&region=tx&dma=625&domain=abc.net&zipcodes=76549
country=us&region=ca&dma=803&domain=abc.com&zipcodes=90404
country=tw&region=hsz&domain=hinet.net&zipcodes=300
country=jp&region=1&dma=a&domain=hinet.net&zipcodes=300
我有一些示例 SQL 但 geo_dma 代码行根本不起作用,geo_region 代码行仅适用于字符值
-- Asker's original attempt, kept verbatim: it splits geo_data_display on '&'
-- and then reads each field by its position in the resulting array.
SELECT
-- Element [0] of the '&' split, with the 'country=' prefix removed, upper-cased.
UPPER(REGEXP_REPLACE(split(geo_data_display, '\&')[0], 'country=', '')) AS geo_country
-- Element [1] of the '&' split, split again on '=', taking element [1], upper-cased.
,UPPER(split(split(geo_data_display, '\&')[1],'\=')[1]) AS geo_region
-- NOTE(review): the whole string is cast to int before it is split, and index [2] is
-- read from a 'key=value' split that only has elements [0] and [1]; either one looks
-- sufficient to make this column fail -- confirm in Hive.
-- NOTE(review): element [2] of the '&' split is positional, so a row without a dma pair
-- would presumably pick up its domain= pair here instead -- verify.
,split(split(cast(geo_data_display as int), '\&')[2],'\=')[2] AS geo_dma
FROM mytable
regexp_extract(string subject, string pattern, int index)
返回使用模式提取的字符串。例如,regexp_extract('foothebar', 'foo(.*?)(bar)', 1) 返回 'the'。
-- Extract each value by its own key name rather than by whichever key happens to follow it,
-- so a row with no dma pair (e.g. country=tw&region=hsz&domain=...) still yields its region.
--   (?:^|&)  anchors the key at the start of the string or directly after an '&'
--   ([^&]*)  captures everything up to the next '&' or the end of the string
-- NOTE(review): regexp_extract is expected to return an empty string, not NULL, when the
-- key is absent -- confirm on your Hive version.
select
regexp_extract(geo_data_display, '(?:^|&)country=([^&]*)', 1) as geo_country,
regexp_extract(geo_data_display, '(?:^|&)region=([^&]*)', 1) as geo_region,
regexp_extract(geo_data_display, '(?:^|&)dma=([^&]*)', 1) as geo_dma
from mytable;
您可以像这样使用 str_to_map:
-- Turn each geo_data_display value into a key/value map ('&' separates the pairs,
-- '=' separates a key from its value), then look the three fields up by key.
WITH t AS (
    SELECT str_to_map(geo_data_display, '&', '=') AS geo_map
    FROM mytable
)
SELECT
    geo_map['country'] AS geo_country,
    geo_map['region'] AS geo_region,
    geo_map['dma'] AS geo_dma
FROM t
;
+--------------+-------------+----------+
| geo_country | geo_region | geo_dma |
+--------------+-------------+----------+
| us | tx | 625 |
| us | ca | 803 |
| tw | hsz | NULL |
| jp | 1 | a |
+--------------+-------------+----------+
请尝试以下操作,
-- Table with a single map column. With this row format, '&' separates the map
-- entries and '=' separates each key from its value.
create table ch8 (details map<string,string>)
row format delimited
collection items terminated by '&'
map keys terminated by '=';
将数据加载到表 ch8 中。
然后使用 CTAS(CREATE TABLE AS SELECT)创建另一张表:
-- Materialize individual entries of ch8.details as ordinary columns of ch9.
create table ch9 as
select
    details["country"]  as country,
    details["region"]   as region,
    details["dma"]      as dma,
    details["domain"]   as domain,
    details["zipcodes"] as zipcode
from ch8;

-- Inspect the result.
select * from ch9;
我有一个名为 geo_data_display 的字段,其中包含国家、地区和 DMA。这 3 个值位于 = 和 & 字符之间:国家位于第一个“=”和第一个“&”之间,地区位于第二个“=”和第二个“&”之间,DMA 位于第三个“=”和第三个“&”之间。下面是该表的可重现版本。country 始终是字符,但 region 和 DMA 可以是数字或字符,并且并非所有国家都有 DMA。
一些示例值是:
country=us&region=tx&dma=625&domain=abc.net&zipcodes=76549
country=us&region=ca&dma=803&domain=abc.com&zipcodes=90404
country=tw&region=hsz&domain=hinet.net&zipcodes=300
country=jp&region=1&dma=a&domain=hinet.net&zipcodes=300
我有一些示例 SQL 但 geo_dma 代码行根本不起作用,geo_region 代码行仅适用于字符值
-- Asker's original attempt, kept verbatim: it splits geo_data_display on '&'
-- and then reads each field by its position in the resulting array.
SELECT
-- Element [0] of the '&' split, with the 'country=' prefix removed, upper-cased.
UPPER(REGEXP_REPLACE(split(geo_data_display, '\&')[0], 'country=', '')) AS geo_country
-- Element [1] of the '&' split, split again on '=', taking element [1], upper-cased.
,UPPER(split(split(geo_data_display, '\&')[1],'\=')[1]) AS geo_region
-- NOTE(review): the whole string is cast to int before it is split, and index [2] is
-- read from a 'key=value' split that only has elements [0] and [1]; either one looks
-- sufficient to make this column fail -- confirm in Hive.
-- NOTE(review): element [2] of the '&' split is positional, so a row without a dma pair
-- would presumably pick up its domain= pair here instead -- verify.
,split(split(cast(geo_data_display as int), '\&')[2],'\=')[2] AS geo_dma
FROM mytable
regexp_extract(string subject, string pattern, int index)
返回使用模式提取的字符串。例如,regexp_extract('foothebar', 'foo(.*?)(bar)', 1) 返回 'the'。
-- Extract each value by its own key name rather than by whichever key happens to follow it,
-- so a row with no dma pair (e.g. country=tw&region=hsz&domain=...) still yields its region.
--   (?:^|&)  anchors the key at the start of the string or directly after an '&'
--   ([^&]*)  captures everything up to the next '&' or the end of the string
-- NOTE(review): regexp_extract is expected to return an empty string, not NULL, when the
-- key is absent -- confirm on your Hive version.
select
regexp_extract(geo_data_display, '(?:^|&)country=([^&]*)', 1) as geo_country,
regexp_extract(geo_data_display, '(?:^|&)region=([^&]*)', 1) as geo_region,
regexp_extract(geo_data_display, '(?:^|&)dma=([^&]*)', 1) as geo_dma
from mytable;
您可以像这样使用 str_to_map:
-- Turn each geo_data_display value into a key/value map ('&' separates the pairs,
-- '=' separates a key from its value), then look the three fields up by key.
WITH t AS (
    SELECT str_to_map(geo_data_display, '&', '=') AS geo_map
    FROM mytable
)
SELECT
    geo_map['country'] AS geo_country,
    geo_map['region'] AS geo_region,
    geo_map['dma'] AS geo_dma
FROM t
;
+--------------+-------------+----------+
| geo_country | geo_region | geo_dma |
+--------------+-------------+----------+
| us | tx | 625 |
| us | ca | 803 |
| tw | hsz | NULL |
| jp | 1 | a |
+--------------+-------------+----------+
请尝试以下操作,
-- Table with a single map column. With this row format, '&' separates the map
-- entries and '=' separates each key from its value.
create table ch8 (details map<string,string>)
row format delimited
collection items terminated by '&'
map keys terminated by '=';
将数据加载到表 ch8 中。
然后使用 CTAS(CREATE TABLE AS SELECT)创建另一张表:
-- Materialize individual entries of ch8.details as ordinary columns of ch9.
create table ch9 as
select
    details["country"]  as country,
    details["region"]   as region,
    details["dma"]      as dma,
    details["domain"]   as domain,
    details["zipcodes"] as zipcode
from ch8;

-- Inspect the result.
select * from ch9;