Hive:从 ORC 转换为 TEXT 时数据格式发生变化
Hive: Data format changes when converting from ORC to TEXT
我有一个 Hive 表,其结构如下:
-- Source table from the question: one string key column (cstid) plus four
-- array columns (two array<string>, two array<bigint>).
-- Declared with the plain-text input/output format classes; fields are split
-- on ',' and rows on '\n'.
-- No COLLECTION ITEMS TERMINATED BY clause is given for the array columns.
CREATE EXTERNAL TABLE db_test.user_arry(
cstid string,
prdctsslctd array<string>,
indvprc array<bigint>,
dscntamt array<bigint>,
prdctsrjctd array<string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'/location/on/a/hadoop/'
其中的数据格式如下:
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 ["m_jns","cbyht"] ["23","6"] ["1","1"] ["shs","jkt"]
jju89o0 ["top","jeans_wmn"] ["55","45"] [NULL] [NULL]
ju34hd ["laychps","candy","toy"]["3","5","67"]["12","8"]["candy"]
我尝试将这些数据导入另一张表,
该表所有列的数据类型均为 string:
-- Target table from the question: the same five column names as
-- db_test.user_arry, but every column is declared as string.
-- Fields are split on ',' and rows on '\n'; stored as a text file.
-- NOTE(review): the LOCATION below is identical to the one in the
-- db_test.user_arry DDL -- presumably a placeholder path; confirm that the
-- two external tables really point at different directories.
CREATE EXTERNAL TABLE db_test.user_strng(
cstid string,
prdctsslctd string,
indvprc string,
dscntamt string,
prdctsrjctd string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS textfile
LOCATION
'/location/on/a/hadoop/';
使用:
-- Statement used in the question: copies all columns of the array table
-- into the all-string table via SELECT *, with no explicit conversion of the
-- array columns.
insert into db_test.user_strng select * from db_test.user_arry;
实际O/P:
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 m_jnscbyht 236 11 shsjkt
jju89o0 topjeans_wmn 5545 NULL NULL
ju34hd laychpscandytoy 3567 128 candy
预计O/P:
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 "m_jns","cbyht" "23","6" "1","1" "shs","jkt"
jju89o0 "top","jeans_wmn" "55","45" NULL NULL
ju34hd "laychps","candy","toy" "3","5","67" "12","8" "candy"
不知道哪里出了问题或者错过了什么?
Update_1
在源表上执行数组到数组的类型转换(array<bigint> 改为 array<string>)之后的输出:
-- Update_1: redeclare the two array<bigint> columns as array<string>,
-- keeping the column names unchanged.
ALTER TABLE user_arry CHANGE indvprc indvprc array<string>;
ALTER TABLE user_arry CHANGE dscntamt dscntamt array<string>;
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 ["m_jns","cbyht"] ["23","6"] ["1","1"] ["shs","jkt"]
jju89o0 ["top","jeans_wmn"] ["55","45"] [] []
ju34hd ["laychps","candy","toy"]["3","5","67"]["12","8"]["candy"]
所有列均为 string 类型的目标表的最终输出:
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 m_jns cbyht 23 6 1 1 shs jkt
jju89o0 top jeans_wmn 55 45
ju34hd laychps candy toy 3 5 67 12 8 candy
仍然没有得到想要的 o/p。
Update_2
按照建议,将 FIELDS TERMINATED BY ','
更改为 FIELDS TERMINATED BY '\t'
之后,得到了所需格式的数据。
将所有数组类型更改为 array <string>
:
-- Redeclare the two array<bigint> columns as array<string> so that they can
-- be passed to concat_ws. The column names stay the same.
-- (The statement keyword must appear only once: ALTER TABLE <table> CHANGE ...)
ALTER TABLE user_arry CHANGE indvprc indvprc array<string>;
ALTER TABLE user_arry CHANGE dscntamt dscntamt array<string>;
并且根据您的数据示例,array<bigint>
不是像 "23.45"
这样的值的正确格式。 array<string>
应该可以很好地处理您的数据文件。
使用concat_ws
将数组转换为逗号分隔的字符串:
-- Copy rows into the all-string table, flattening every array column into a
-- single comma-separated string with concat_ws.
-- * The key column is cstid; db_test.user_arry has no column called name
--   (name only appears as a heading in the printed results).
-- * concat_ws is applied to array<string> columns, so indvprc and dscntamt
--   must already have been changed from array<bigint> to array<string>.
-- * The joined values contain ',', so the target table needs a field
--   delimiter other than ',' (for example '\t').
insert into db_test.user_strng
select
    cstid,
    concat_ws(',', prdctsslctd) as prdctsslctd,
    concat_ws(',', indvprc)     as indvprc,
    concat_ws(',', dscntamt)    as dscntamt,
    concat_ws(',', prdctsrjctd) as prdctsrjctd
from db_test.user_arry;
我有一个 Hive 表,其结构如下:
-- Source table from the question: one string key column (cstid) plus four
-- array columns (two array<string>, two array<bigint>).
-- Declared with the plain-text input/output format classes; fields are split
-- on ',' and rows on '\n'.
-- No COLLECTION ITEMS TERMINATED BY clause is given for the array columns.
CREATE EXTERNAL TABLE db_test.user_arry(
cstid string,
prdctsslctd array<string>,
indvprc array<bigint>,
dscntamt array<bigint>,
prdctsrjctd array<string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'/location/on/a/hadoop/'
其中的数据格式如下:
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 ["m_jns","cbyht"] ["23","6"] ["1","1"] ["shs","jkt"]
jju89o0 ["top","jeans_wmn"] ["55","45"] [NULL] [NULL]
ju34hd ["laychps","candy","toy"]["3","5","67"]["12","8"]["candy"]
我尝试将这些数据导入另一张表,
该表所有列的数据类型均为 string:
-- Target table from the question: the same five column names as
-- db_test.user_arry, but every column is declared as string.
-- Fields are split on ',' and rows on '\n'; stored as a text file.
-- NOTE(review): the LOCATION below is identical to the one in the
-- db_test.user_arry DDL -- presumably a placeholder path; confirm that the
-- two external tables really point at different directories.
CREATE EXTERNAL TABLE db_test.user_strng(
cstid string,
prdctsslctd string,
indvprc string,
dscntamt string,
prdctsrjctd string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS textfile
LOCATION
'/location/on/a/hadoop/';
使用:
-- Statement used in the question: copies all columns of the array table
-- into the all-string table via SELECT *, with no explicit conversion of the
-- array columns.
insert into db_test.user_strng select * from db_test.user_arry;
实际O/P:
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 m_jnscbyht 236 11 shsjkt
jju89o0 topjeans_wmn 5545 NULL NULL
ju34hd laychpscandytoy 3567 128 candy
预计O/P:
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 "m_jns","cbyht" "23","6" "1","1" "shs","jkt"
jju89o0 "top","jeans_wmn" "55","45" NULL NULL
ju34hd "laychps","candy","toy" "3","5","67" "12","8" "candy"
不知道哪里出了问题或者错过了什么?
Update_1
在源表上执行数组到数组的类型转换(array<bigint> 改为 array<string>)之后的输出:
-- Update_1: redeclare the two array<bigint> columns as array<string>,
-- keeping the column names unchanged.
ALTER TABLE user_arry CHANGE indvprc indvprc array<string>;
ALTER TABLE user_arry CHANGE dscntamt dscntamt array<string>;
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 ["m_jns","cbyht"] ["23","6"] ["1","1"] ["shs","jkt"]
jju89o0 ["top","jeans_wmn"] ["55","45"] [] []
ju34hd ["laychps","candy","toy"]["3","5","67"]["12","8"]["candy"]
所有列均为 string 类型的目标表的最终输出:
--------------------------------------------------------
name | prdctsslctd | indvprc | dscntamt | prdctsrjctd
--------------------------------------------------------
cctg65 m_jns cbyht 23 6 1 1 shs jkt
jju89o0 top jeans_wmn 55 45
ju34hd laychps candy toy 3 5 67 12 8 candy
仍然没有得到想要的 o/p。
Update_2
按照建议,将 FIELDS TERMINATED BY ','
更改为 FIELDS TERMINATED BY '\t'
之后,得到了所需格式的数据。
将所有数组类型更改为 array <string>
:
-- Redeclare the two array<bigint> columns as array<string> so that they can
-- be passed to concat_ws. The column names stay the same.
-- (The statement keyword must appear only once: ALTER TABLE <table> CHANGE ...)
ALTER TABLE user_arry CHANGE indvprc indvprc array<string>;
ALTER TABLE user_arry CHANGE dscntamt dscntamt array<string>;
并且根据您的数据示例,array<bigint>
不是像 "23.45"
这样的值的正确格式。 array<string>
应该可以很好地处理您的数据文件。
使用concat_ws
将数组转换为逗号分隔的字符串:
-- Copy rows into the all-string table, flattening every array column into a
-- single comma-separated string with concat_ws.
-- * The key column is cstid; db_test.user_arry has no column called name
--   (name only appears as a heading in the printed results).
-- * concat_ws is applied to array<string> columns, so indvprc and dscntamt
--   must already have been changed from array<bigint> to array<string>.
-- * The joined values contain ',', so the target table needs a field
--   delimiter other than ',' (for example '\t').
insert into db_test.user_strng
select
    cstid,
    concat_ws(',', prdctsslctd) as prdctsslctd,
    concat_ws(',', indvprc)     as indvprc,
    concat_ws(',', dscntamt)    as dscntamt,
    concat_ws(',', prdctsrjctd) as prdctsrjctd
from db_test.user_arry;