Hive 无法使用嵌套的 Avro 模式创建表
hive can't create table with nested avro schema
我正在尝试使用嵌套的 Avro 模式来创建 Hive 表,但没有成功。我在 CDH 5.7.2 中使用 Hive 1.1。
这是我的嵌套 Avro 模式:
[
{
"type": "record",
"name": "Id",
"namespace": "com.test.app_list",
"doc": "Device ID",
"fields": [
{
"name": "idType",
"type": "int"
},{
"name": "id",
"type": "string"
}
]
},
{
"type": "record",
"name": "AppList",
"namespace": "com.test.app_list",
"doc": "",
"fields": [
{
"name": "appId",
"type": "string",
"avro.java.string": "String"
},
{
"name": "timestamp",
"type": "long"
},
{
"name": "idList",
"type": [{"type": "array", "items": "com.test.app_list.Id"}]
}
]
}
]
以下是我用来建表的 SQL:
-- External Hive table backed by Avro container files.
-- No column list is declared: the AvroSerDe is pointed at the schema file
-- through the 'avro.schema.url' table property.
CREATE EXTERNAL TABLE app_list
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
    STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
        OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
    TBLPROPERTIES (
        'avro.schema.url' = '/hive/schema/test_app_list.avsc'
    );
但是 Hive 报了如下错误:
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: MetaException(message:org.apache.hadoop.hive.serde2.avro.AvroSerdeException Schema for table must be of type RECORD. Received type: UNION)
Hive 文档显示:Supports arbitrarily nested schemas.
来自:https://cwiki.apache.org/confluence/display/Hive/AvroSerDe#AvroSerDe-Overview–WorkingwithAvrofromHive
数据样本:
{
"appId":{"string":"com.test.app"},
"timestamp":{"long":1495893601606},
"idList":{
"array":[
{"idType":15,"id":"6c:5c:14:c3:a5:39"},
{"idType":13,"id":"eb297afe56ff340b6bb7de5c5ab09193"}
]
}
}
但我不知道该怎么做。我需要一些帮助来解决这个问题。谢谢!
Hive 要求 Avro 模式的顶层必须是记录(record)类型,而您的模式顶层是一个联合(union),这就是 Hive 不允许这样做的原因。解决方法是把顶层定义为一个记录,并在其内部创建两个记录类型的字段。
{
"type": "record",
"name": "myRecord",
"namespace": "com.test.app_list",
"fields": [
{
"type": "record",
"name": "Id",
"doc": "Device ID",
"fields": [
{
"name": "idType",
"type": "int"
},{
"name": "id",
"type": "string"
}
]
},
{
"type": "record",
"name": "AppList",
"doc": "",
"fields": [
{
"name": "appId",
"type": "string",
"avro.java.string": "String"
},
{
"name": "timestamp",
"type": "long"
},
{
"name": "idList",
"type": [{"type": "array", "items": "com.test.app_list.Id"}]
}
]
}
]
}
我正在尝试使用嵌套的 Avro 模式来创建 Hive 表,但没有成功。我在 CDH 5.7.2 中使用 Hive 1.1。
这是我的嵌套 Avro 模式:
[
{
"type": "record",
"name": "Id",
"namespace": "com.test.app_list",
"doc": "Device ID",
"fields": [
{
"name": "idType",
"type": "int"
},{
"name": "id",
"type": "string"
}
]
},
{
"type": "record",
"name": "AppList",
"namespace": "com.test.app_list",
"doc": "",
"fields": [
{
"name": "appId",
"type": "string",
"avro.java.string": "String"
},
{
"name": "timestamp",
"type": "long"
},
{
"name": "idList",
"type": [{"type": "array", "items": "com.test.app_list.Id"}]
}
]
}
]
以下是我用来建表的 SQL:
-- External Hive table backed by Avro container files.
-- No column list is declared: the AvroSerDe is pointed at the schema file
-- through the 'avro.schema.url' table property.
CREATE EXTERNAL TABLE app_list
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
    STORED AS
        INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
        OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
    TBLPROPERTIES (
        'avro.schema.url' = '/hive/schema/test_app_list.avsc'
    );
但是 Hive 报了如下错误:
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: MetaException(message:org.apache.hadoop.hive.serde2.avro.AvroSerdeException Schema for table must be of type RECORD. Received type: UNION)
Hive 文档显示:Supports arbitrarily nested schemas.
来自:https://cwiki.apache.org/confluence/display/Hive/AvroSerDe#AvroSerDe-Overview–WorkingwithAvrofromHive
数据样本:
{
"appId":{"string":"com.test.app"},
"timestamp":{"long":1495893601606},
"idList":{
"array":[
{"idType":15,"id":"6c:5c:14:c3:a5:39"},
{"idType":13,"id":"eb297afe56ff340b6bb7de5c5ab09193"}
]
}
}
但我不知道该怎么做。我需要一些帮助来解决这个问题。谢谢!
Hive 要求 Avro 模式的顶层必须是记录(record)类型,而您的模式顶层是一个联合(union),这就是 Hive 不允许这样做的原因。解决方法是把顶层定义为一个记录,并在其内部创建两个记录类型的字段。
{
"type": "record",
"name": "myRecord",
"namespace": "com.test.app_list",
"fields": [
{
"type": "record",
"name": "Id",
"doc": "Device ID",
"fields": [
{
"name": "idType",
"type": "int"
},{
"name": "id",
"type": "string"
}
]
},
{
"type": "record",
"name": "AppList",
"doc": "",
"fields": [
{
"name": "appId",
"type": "string",
"avro.java.string": "String"
},
{
"name": "timestamp",
"type": "long"
},
{
"name": "idList",
"type": [{"type": "array", "items": "com.test.app_list.Id"}]
}
]
}
]
}