AWS Athena 在加载分区后出现列错位,值被移到了错误的列中
AWS Athena shifts values into the wrong columns after partitions are loaded
我遇到了以下问题:
- 我在 EMR 集群的 HDFS 上创建了一个不带分区的 Hive 表,
并向其加载了数据。
- 我基于第 1 步中的表创建了另一个 Hive 表,
但增加了从日期时间列派生的分区
列:PARTITIONED BY (year STRING, month STRING, day STRING)。
- 我将数据从非分区 table 加载到分区 table 并获得有效结果。
- 我创建了一个 Athena 数据库和 table 具有与 Hive table 相同的结构。
- 我从 HDFS 本地复制了分区文件,并通过 aws s3 sync 将所有文件传输到 S3 空桶中。所有文件都已正确传输,并且与 HDFS 中的 Hive 目录中的顺序相同。
- 我通过 MSCK REPAIR TABLE 加载了分区并且在输出中没有得到任何错误。
之后我发现很多值都发生了错位,例如本应出现在 "ip" 列中的值却出现在了 "operating_sys" 列中,等等。
我的脚本是:
-- Hive tables
-- Enable dynamic partitioning so the INSERT statements below can take their
-- partition values from the SELECT output instead of literal values.
SET hive.exec.dynamic.partition=true;
-- nonstrict mode: every partition column may be dynamic (no static partition
-- value is required in the PARTITION clause).
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Partitioned page-log table; rows are split into year/month/day partitions.
-- NOTE(review): fields are comma-delimited with no ESCAPED BY clause, so a
-- value that itself contains a comma would be split across columns when the
-- files are read back — confirm against the actual data.
-- NOTE(review): this *page* table is located at '.../events_partitioned' while
-- the *event* table is located at '.../pages_partitioned'; the paths look
-- swapped — confirm before relying on the directory names.
CREATE EXTERNAL TABLE IF NOT EXISTS cloudfront_logs_page_part (
    log_DATE         STRING,
    user_id          STRING,
    page_path        STRING,
    referer          STRING,
    tracking_referer STRING,
    medium           STRING,
    campaign         STRING,
    source           STRING,
    visitor_id       STRING,
    ip               STRING,
    session_id       STRING,
    operating_sys    STRING,
    ad_id            STRING,
    keyword          STRING,
    user_agent       STRING
)
PARTITIONED BY (`year` STRING, `month` STRING, `day` STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/admin/events_partitioned';
-- Partitioned event-log table; rows are split into year/month/day partitions.
-- NOTE(review): fields are comma-delimited with no ESCAPED BY clause, so a
-- value that itself contains a comma (extra_data_json presumably holds JSON —
-- confirm) would be split across columns when the files are read back.
-- NOTE(review): this *event* table is located at '.../pages_partitioned' while
-- the *page* table is located at '.../events_partitioned'; the paths look
-- swapped — confirm before relying on the directory names.
CREATE EXTERNAL TABLE IF NOT EXISTS cloudfront_logs_event_part (
    log_DATE        STRING,
    user_id         STRING,
    category        STRING,
    action          STRING,
    label           STRING,
    value           STRING,
    visitor_id      STRING,
    ip              STRING,
    session_id      STRING,
    operating_sys   STRING,
    extra_data_json STRING
)
PARTITIONED BY (`year` STRING, `month` STRING, `day` STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/admin/pages_partitioned';
-- Copy every row of the non-partitioned page log into the partitioned table.
-- All three partition columns are dynamic: their values come from the last
-- three SELECT expressions, in the same order as the PARTITION clause.
INSERT INTO TABLE cloudfront_logs_page_part
PARTITION (`year`, `month`, `day`)
SELECT
    log_DATE,
    user_id,
    page_path,
    referer,
    tracking_referer,
    medium,
    campaign,
    source,
    visitor_id,
    ip,
    session_id,
    operating_sys,
    ad_id,
    keyword,
    user_agent,
    -- Partition values derived from the log timestamp; assumes log_DATE holds
    -- a date/timestamp string these functions can parse — TODO confirm.
    YEAR(log_DATE)  AS `year`,
    MONTH(log_DATE) AS `month`,
    DAY(log_DATE)   AS `day`
FROM
    cloudfront_logs_page;
-- Copy every row of the non-partitioned event log into the partitioned table.
-- All three partition columns are dynamic: their values come from the last
-- three SELECT expressions, in the same order as the PARTITION clause.
INSERT INTO TABLE cloudfront_logs_event_part
PARTITION (`year`, `month`, `day`)
SELECT
    log_DATE,
    user_id,
    category,
    action,
    label,
    value,
    visitor_id,
    ip,
    session_id,
    operating_sys,
    extra_data_json,
    -- Partition values derived from the log timestamp; assumes log_DATE holds
    -- a date/timestamp string these functions can parse — TODO confirm.
    YEAR(log_DATE)  AS `year`,
    MONTH(log_DATE) AS `month`,
    DAY(log_DATE)   AS `day`
FROM
    cloudfront_logs_event;
-- Athena tables
-- Database that holds the Athena-side tables; the S3 location is elided
-- in this script.
CREATE DATABASE IF NOT EXISTS test
    LOCATION 's3://...';
-- Recreate the Athena table over the page-log files copied to S3.
-- The column list and delimiter mirror the Hive table cloudfront_logs_page_part.
DROP TABLE IF EXISTS test.cloudfront_logs_page_ath;
-- Fix: the table is created in the same database (`test`) that the DROP above
-- targets and that this script creates; it was previously created in
-- `powtoon_hive`, so the DROP and CREATE acted on different tables.
-- NOTE(review): fields are comma-delimited with no ESCAPED BY clause, so a
-- value that itself contains a comma would be split across columns on read —
-- confirm against the actual data.
CREATE EXTERNAL TABLE IF NOT EXISTS test.cloudfront_logs_page_ath (
    log_DATE         STRING,
    user_id          STRING,
    page_path        STRING,
    referer          STRING,
    tracking_referer STRING,
    medium           STRING,
    campaign         STRING,
    source           STRING,
    visitor_id       STRING,
    ip               STRING,
    session_id       STRING,
    operating_sys    STRING,
    ad_id            STRING,
    keyword          STRING,
    user_agent       STRING
)
PARTITIONED BY (`year` STRING, `month` STRING, `day` STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION 's3://.../';
-- Recreate the Athena table over the event-log files copied to S3.
-- The column list and delimiter mirror the Hive table cloudfront_logs_event_part.
DROP TABLE IF EXISTS test.cloudfront_logs_event_ath;
-- NOTE(review): fields are comma-delimited with no ESCAPED BY clause, so a
-- value that itself contains a comma (extra_data_json presumably holds JSON —
-- confirm) would be split across columns on read.
CREATE EXTERNAL TABLE IF NOT EXISTS test.cloudfront_logs_event_ath (
    log_DATE        STRING,
    user_id         STRING,
    category        STRING,
    action          STRING,
    label           STRING,
    value           STRING,
    visitor_id      STRING,
    ip              STRING,
    session_id      STRING,
    operating_sys   STRING,
    extra_data_json STRING
)
PARTITIONED BY (`year` STRING, `month` STRING, `day` STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION 's3://.../';
问题出在哪里?是表结构,还是 Athena 的元数据?
最简单的方法是将原始文件直接转换为分区的 Parquet 列式格式。这样可以同时获得分区、列式存储、谓词下推以及其他各种听起来很花哨的特性所带来的好处。
我遇到了以下问题:
- 我在 EMR 集群的 HDFS 上创建了一个不带分区的 Hive 表,并向其加载了数据。
- 我基于第 1 步中的表创建了另一个 Hive 表,但增加了从日期时间列派生的分区列:PARTITIONED BY (year STRING, month STRING, day STRING)。
- 我将数据从非分区 table 加载到分区 table 并获得有效结果。
- 我创建了一个 Athena 数据库和 table 具有与 Hive table 相同的结构。
- 我从 HDFS 本地复制了分区文件,并通过 aws s3 sync 将所有文件传输到 S3 空桶中。所有文件都已正确传输,并且与 HDFS 中的 Hive 目录中的顺序相同。
- 我通过 MSCK REPAIR TABLE 加载了分区并且在输出中没有得到任何错误。
之后我发现很多值都发生了错位,例如本应出现在 "ip" 列中的值却出现在了 "operating_sys" 列中,等等。
我的脚本是:
-- Hive tables
-- Enable dynamic partitioning so the INSERT statements below can take their
-- partition values from the SELECT output instead of literal values.
SET hive.exec.dynamic.partition=true;
-- nonstrict mode: every partition column may be dynamic (no static partition
-- value is required in the PARTITION clause).
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Partitioned page-log table; rows are split into year/month/day partitions.
-- NOTE(review): fields are comma-delimited with no ESCAPED BY clause, so a
-- value that itself contains a comma would be split across columns when the
-- files are read back — confirm against the actual data.
-- NOTE(review): this *page* table is located at '.../events_partitioned' while
-- the *event* table is located at '.../pages_partitioned'; the paths look
-- swapped — confirm before relying on the directory names.
CREATE EXTERNAL TABLE IF NOT EXISTS cloudfront_logs_page_part (
    log_DATE         STRING,
    user_id          STRING,
    page_path        STRING,
    referer          STRING,
    tracking_referer STRING,
    medium           STRING,
    campaign         STRING,
    source           STRING,
    visitor_id       STRING,
    ip               STRING,
    session_id       STRING,
    operating_sys    STRING,
    ad_id            STRING,
    keyword          STRING,
    user_agent       STRING
)
PARTITIONED BY (`year` STRING, `month` STRING, `day` STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/admin/events_partitioned';
-- Partitioned event-log table; rows are split into year/month/day partitions.
-- NOTE(review): fields are comma-delimited with no ESCAPED BY clause, so a
-- value that itself contains a comma (extra_data_json presumably holds JSON —
-- confirm) would be split across columns when the files are read back.
-- NOTE(review): this *event* table is located at '.../pages_partitioned' while
-- the *page* table is located at '.../events_partitioned'; the paths look
-- swapped — confirm before relying on the directory names.
CREATE EXTERNAL TABLE IF NOT EXISTS cloudfront_logs_event_part (
    log_DATE        STRING,
    user_id         STRING,
    category        STRING,
    action          STRING,
    label           STRING,
    value           STRING,
    visitor_id      STRING,
    ip              STRING,
    session_id      STRING,
    operating_sys   STRING,
    extra_data_json STRING
)
PARTITIONED BY (`year` STRING, `month` STRING, `day` STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/admin/pages_partitioned';
-- Copy every row of the non-partitioned page log into the partitioned table.
-- All three partition columns are dynamic: their values come from the last
-- three SELECT expressions, in the same order as the PARTITION clause.
INSERT INTO TABLE cloudfront_logs_page_part
PARTITION (`year`, `month`, `day`)
SELECT
    log_DATE,
    user_id,
    page_path,
    referer,
    tracking_referer,
    medium,
    campaign,
    source,
    visitor_id,
    ip,
    session_id,
    operating_sys,
    ad_id,
    keyword,
    user_agent,
    -- Partition values derived from the log timestamp; assumes log_DATE holds
    -- a date/timestamp string these functions can parse — TODO confirm.
    YEAR(log_DATE)  AS `year`,
    MONTH(log_DATE) AS `month`,
    DAY(log_DATE)   AS `day`
FROM
    cloudfront_logs_page;
-- Copy every row of the non-partitioned event log into the partitioned table.
-- All three partition columns are dynamic: their values come from the last
-- three SELECT expressions, in the same order as the PARTITION clause.
INSERT INTO TABLE cloudfront_logs_event_part
PARTITION (`year`, `month`, `day`)
SELECT
    log_DATE,
    user_id,
    category,
    action,
    label,
    value,
    visitor_id,
    ip,
    session_id,
    operating_sys,
    extra_data_json,
    -- Partition values derived from the log timestamp; assumes log_DATE holds
    -- a date/timestamp string these functions can parse — TODO confirm.
    YEAR(log_DATE)  AS `year`,
    MONTH(log_DATE) AS `month`,
    DAY(log_DATE)   AS `day`
FROM
    cloudfront_logs_event;
-- Athena tables
-- Database that holds the Athena-side tables; the S3 location is elided
-- in this script.
CREATE DATABASE IF NOT EXISTS test
    LOCATION 's3://...';
-- Recreate the Athena table over the page-log files copied to S3.
-- The column list and delimiter mirror the Hive table cloudfront_logs_page_part.
DROP TABLE IF EXISTS test.cloudfront_logs_page_ath;
-- Fix: the table is created in the same database (`test`) that the DROP above
-- targets and that this script creates; it was previously created in
-- `powtoon_hive`, so the DROP and CREATE acted on different tables.
-- NOTE(review): fields are comma-delimited with no ESCAPED BY clause, so a
-- value that itself contains a comma would be split across columns on read —
-- confirm against the actual data.
CREATE EXTERNAL TABLE IF NOT EXISTS test.cloudfront_logs_page_ath (
    log_DATE         STRING,
    user_id          STRING,
    page_path        STRING,
    referer          STRING,
    tracking_referer STRING,
    medium           STRING,
    campaign         STRING,
    source           STRING,
    visitor_id       STRING,
    ip               STRING,
    session_id       STRING,
    operating_sys    STRING,
    ad_id            STRING,
    keyword          STRING,
    user_agent       STRING
)
PARTITIONED BY (`year` STRING, `month` STRING, `day` STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION 's3://.../';
-- Recreate the Athena table over the event-log files copied to S3.
-- The column list and delimiter mirror the Hive table cloudfront_logs_event_part.
DROP TABLE IF EXISTS test.cloudfront_logs_event_ath;
-- NOTE(review): fields are comma-delimited with no ESCAPED BY clause, so a
-- value that itself contains a comma (extra_data_json presumably holds JSON —
-- confirm) would be split across columns on read.
CREATE EXTERNAL TABLE IF NOT EXISTS test.cloudfront_logs_event_ath (
    log_DATE        STRING,
    user_id         STRING,
    category        STRING,
    action          STRING,
    label           STRING,
    value           STRING,
    visitor_id      STRING,
    ip              STRING,
    session_id      STRING,
    operating_sys   STRING,
    extra_data_json STRING
)
PARTITIONED BY (`year` STRING, `month` STRING, `day` STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION 's3://.../';
问题出在哪里?是表结构,还是 Athena 的元数据?
最简单的方法是将原始文件直接转换为分区的 Parquet 列式格式。这样可以同时获得分区、列式存储、谓词下推以及其他各种听起来很花哨的特性所带来的好处。