NiFi 的 ConvertRecord 处理器无法解析传入的数据
NiFi failed to parse data in convert record
我正在尝试使用 ConvertRecord 处理器将 JSON 转换为 CSV,但得到的唯一错误信息是 `Could not parse incoming data`。由于这条信息的描述性不强,我不知道该如何诊断问题。
我知道我的 Avro 模式是有效的,因为:A) 当我把它添加到模式注册表时,NiFi 没有报告任何与模式相关的错误;B) 我在一个在线验证工具上(原文此处为链接)测试过该模式,也没有发现任何问题。
我也知道我的 JSON 是有效的,因为我可以用 `json.loads()` 把它加载到 Python 中,而且不会出现任何问题。
我只是不太确定哪里出了问题,也不知道如何解决。
JSON
{
"DOC": {
"DOCID": "1234",
"Subjects": {
"Subject_xref": ["2233"]
},
"TXT": {
"COUNTRY": ["United States"],
"ESTATE": ["Mount Vernon"],
"PERSON": ["George Washington"]
},
"RAW_TXT": "George Washington lived in his family home, Mount Vernon, located in the United States.",
"RELINFO": [
{"ID" : "REL-1234-100",
"RELTYPE" : "PER-PROP",
"PERID" : "PER-1234-009",
"PROPID" : "PROP-1234-001",
"SENTID" : "1234-SENT-001",
"PROP_NORM" : "Mount Vernon",
"PROP_MENTION" : "Mount Vernon",
"PER_NORM" : "George Washington",
"PER_MENTION" : "George Washington"}
],
"ENTINFO": [
{"ID": "PER-1234-009", "TYPE": "PERSON", "NORM": "George Washington", "REFID": "PER-1234-009", "MENTION": "George Washington"},
{"ID": "CTRY-1234-003", "TYPE": "COUNTRY", "NORM": "United States", "REFID": "CTRY-1234-003", "MENTION": "United States."},
{"ID": "PROP-1234-001", "TYPE": "ESTATE", "NORM": "Mount Vernon", "REFID": "PROP-1234-001", "MENTION": "Mount Vernon"}
]
}
}
Avro
{
"type": "record",
"namespace": "name.space",
"name": "nlp_output",
"fields": [
{"name": "DOC", "type": {
"name": "DOCDocument", "type": "record", "namespace": "doc.name.space", "fields": [
{"name": "DOCID", "type": ["long","null"], "default": null},
{"name": "Subjects", "type": {
"name": "Subjects", "type": "record", "namespace": "subjects.name.space", "fields": [
{"name": "SubjectIdentificationID", "aliases": ["Subject_xref"], "type": ["long","null"], "default": null}
]
}},
{"name": "TXT", "type": {
"name": "TXT", "type": "record", "namespace": "text.name.space", "fields": [
{"name": "COUNTRY", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""},
{"name": "ESTATE", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""},
{"name": "PERSON", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""}
]
}},
{"name": "RAW_TXT", "type": ["string","null"], "default": null},
{"name": "RELINFO", "type": {
"name": "RelatedEntities", "type": "record", "namespace": "relent.name.space", "fields": [
{"name": "ID", "type": ["string", "null"], "default": null},
{"name": "RELTYPE", "type": ["string", "null"], "default": null},
{"name": "PERID", "type": ["string", "null"], "default": null},
{"name": "PROPID", "type": ["string", "null"], "default": null},
{"name": "SENTID", "type": ["string", "null"], "default": null},
{"name": "PROP_NORM", "type": ["string", "null"], "default": null},
{"name": "PROP_MENTION", "type": ["string", "null"], "default": null},
{"name": "PER_NORM", "type": ["string", "null"], "default": null},
{"name": "PER_MENTION", "type": ["string", "null"], "default": null}
]
}},
{"name": "ENTINFO", "doc": "Sentences stripped of tags for ease of reading", "type": {
"name": "Entities", "type": "record", "namespace": "entities.name.space", "fields": [
{"name": "ID", "type": ["string", "null"], "default": null},
{"name": "TYPE", "type": ["string", "null"], "default": null},
{"name": "NORM", "type": ["string", "null"], "default": null},
{"name": "REFID", "type": ["string", "null"], "default": null},
{"name": "MENTION", "type": ["string", "null"], "default": null}
]
}}
]
}}
]
}
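您的模式与您的 JSON 不匹配。您将 SubjectIdentificationID 定义为 long 或 null,但在 JSON 中 Subject_xref 是一个数组。因此,应将该字段的类型改为数组,如下所示。
(注:JSON 中的 RELINFO 和 ENTINFO 同样是对象数组,而模式中把它们声明为单个 record;如果修改后仍然解析失败,可能还需要将这两个字段也改为以 record 为元素的数组类型。)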
{
"type": "record",
"namespace": "name.space",
"name": "nlp_output",
"fields": [
{"name": "DOC", "type": {
"name": "DOCDocument", "type": "record", "namespace": "doc.name.space", "fields": [
{"name": "DOCID", "type": ["long","null"], "default": null},
{"name": "Subjects", "type": {
"name": "Subjects", "type": "record", "namespace": "subjects.name.space", "fields": [
{"name": "SubjectIdentificationID", "aliases": ["Subject_xref"], "type": {"type": "array", "items": ["long", "null"]}, "default": null}
]
}},
{"name": "TXT", "type": {
"name": "TXT", "type": "record", "namespace": "text.name.space", "fields": [
{"name": "COUNTRY", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""},
{"name": "ESTATE", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""},
{"name": "PERSON", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""}
]
}},
{"name": "RAW_TXT", "type": ["string","null"], "default": null},
{"name": "RELINFO", "type": {
"name": "RelatedEntities", "type": "record", "namespace": "relent.name.space", "fields": [
{"name": "ID", "type": ["string", "null"], "default": null},
{"name": "RELTYPE", "type": ["string", "null"], "default": null},
{"name": "PERID", "type": ["string", "null"], "default": null},
{"name": "PROPID", "type": ["string", "null"], "default": null},
{"name": "SENTID", "type": ["string", "null"], "default": null},
{"name": "PROP_NORM", "type": ["string", "null"], "default": null},
{"name": "PROP_MENTION", "type": ["string", "null"], "default": null},
{"name": "PER_NORM", "type": ["string", "null"], "default": null},
{"name": "PER_MENTION", "type": ["string", "null"], "default": null}
]
}},
{"name": "ENTINFO", "doc": "Sentences stripped of tags for ease of reading", "type": {
"name": "Entities", "type": "record", "namespace": "entities.name.space", "fields": [
{"name": "ID", "type": ["string", "null"], "default": null},
{"name": "TYPE", "type": ["string", "null"], "default": null},
{"name": "NORM", "type": ["string", "null"], "default": null},
{"name": "REFID", "type": ["string", "null"], "default": null},
{"name": "MENTION", "type": ["string", "null"], "default": null}
]
}}
]
}}
]
}
我正在尝试使用 ConvertRecord 处理器将 JSON 转换为 CSV,但得到的唯一错误信息是 `Could not parse incoming data`。由于这条信息的描述性不强,我不知道该如何诊断问题。
我知道我的 Avro 模式是有效的,因为:A) 当我把它添加到模式注册表时,NiFi 没有报告任何与模式相关的错误;B) 我在一个在线验证工具上(原文此处为链接)测试过该模式,也没有发现任何问题。
我也知道我的 JSON 是有效的,因为我可以用 `json.loads()` 把它加载到 Python 中,而且不会出现任何问题。
我只是不太确定哪里出了问题,也不知道如何解决。
JSON
{
"DOC": {
"DOCID": "1234",
"Subjects": {
"Subject_xref": ["2233"]
},
"TXT": {
"COUNTRY": ["United States"],
"ESTATE": ["Mount Vernon"],
"PERSON": ["George Washington"]
},
"RAW_TXT": "George Washington lived in his family home, Mount Vernon, located in the United States.",
"RELINFO": [
{"ID" : "REL-1234-100",
"RELTYPE" : "PER-PROP",
"PERID" : "PER-1234-009",
"PROPID" : "PROP-1234-001",
"SENTID" : "1234-SENT-001",
"PROP_NORM" : "Mount Vernon",
"PROP_MENTION" : "Mount Vernon",
"PER_NORM" : "George Washington",
"PER_MENTION" : "George Washington"}
],
"ENTINFO": [
{"ID": "PER-1234-009", "TYPE": "PERSON", "NORM": "George Washington", "REFID": "PER-1234-009", "MENTION": "George Washington"},
{"ID": "CTRY-1234-003", "TYPE": "COUNTRY", "NORM": "United States", "REFID": "CTRY-1234-003", "MENTION": "United States."},
{"ID": "PROP-1234-001", "TYPE": "ESTATE", "NORM": "Mount Vernon", "REFID": "PROP-1234-001", "MENTION": "Mount Vernon"}
]
}
}
Avro
{
"type": "record",
"namespace": "name.space",
"name": "nlp_output",
"fields": [
{"name": "DOC", "type": {
"name": "DOCDocument", "type": "record", "namespace": "doc.name.space", "fields": [
{"name": "DOCID", "type": ["long","null"], "default": null},
{"name": "Subjects", "type": {
"name": "Subjects", "type": "record", "namespace": "subjects.name.space", "fields": [
{"name": "SubjectIdentificationID", "aliases": ["Subject_xref"], "type": ["long","null"], "default": null}
]
}},
{"name": "TXT", "type": {
"name": "TXT", "type": "record", "namespace": "text.name.space", "fields": [
{"name": "COUNTRY", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""},
{"name": "ESTATE", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""},
{"name": "PERSON", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""}
]
}},
{"name": "RAW_TXT", "type": ["string","null"], "default": null},
{"name": "RELINFO", "type": {
"name": "RelatedEntities", "type": "record", "namespace": "relent.name.space", "fields": [
{"name": "ID", "type": ["string", "null"], "default": null},
{"name": "RELTYPE", "type": ["string", "null"], "default": null},
{"name": "PERID", "type": ["string", "null"], "default": null},
{"name": "PROPID", "type": ["string", "null"], "default": null},
{"name": "SENTID", "type": ["string", "null"], "default": null},
{"name": "PROP_NORM", "type": ["string", "null"], "default": null},
{"name": "PROP_MENTION", "type": ["string", "null"], "default": null},
{"name": "PER_NORM", "type": ["string", "null"], "default": null},
{"name": "PER_MENTION", "type": ["string", "null"], "default": null}
]
}},
{"name": "ENTINFO", "doc": "Sentences stripped of tags for ease of reading", "type": {
"name": "Entities", "type": "record", "namespace": "entities.name.space", "fields": [
{"name": "ID", "type": ["string", "null"], "default": null},
{"name": "TYPE", "type": ["string", "null"], "default": null},
{"name": "NORM", "type": ["string", "null"], "default": null},
{"name": "REFID", "type": ["string", "null"], "default": null},
{"name": "MENTION", "type": ["string", "null"], "default": null}
]
}}
]
}}
]
}
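您的模式与您的 JSON 不匹配。您将 SubjectIdentificationID 定义为 long 或 null,但在 JSON 中 Subject_xref 是一个数组。因此,应将该字段的类型改为数组,如下所示。
(注:JSON 中的 RELINFO 和 ENTINFO 同样是对象数组,而模式中把它们声明为单个 record;如果修改后仍然解析失败,可能还需要将这两个字段也改为以 record 为元素的数组类型。)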
{
"type": "record",
"namespace": "name.space",
"name": "nlp_output",
"fields": [
{"name": "DOC", "type": {
"name": "DOCDocument", "type": "record", "namespace": "doc.name.space", "fields": [
{"name": "DOCID", "type": ["long","null"], "default": null},
{"name": "Subjects", "type": {
"name": "Subjects", "type": "record", "namespace": "subjects.name.space", "fields": [
{"name": "SubjectIdentificationID", "aliases": ["Subject_xref"], "type": {"type": "array", "items": ["long", "null"]}, "default": null}
]
}},
{"name": "TXT", "type": {
"name": "TXT", "type": "record", "namespace": "text.name.space", "fields": [
{"name": "COUNTRY", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""},
{"name": "ESTATE", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""},
{"name": "PERSON", "type": {"type": "array", "items": ["string", "null"]}, "default": null, "doc": ""}
]
}},
{"name": "RAW_TXT", "type": ["string","null"], "default": null},
{"name": "RELINFO", "type": {
"name": "RelatedEntities", "type": "record", "namespace": "relent.name.space", "fields": [
{"name": "ID", "type": ["string", "null"], "default": null},
{"name": "RELTYPE", "type": ["string", "null"], "default": null},
{"name": "PERID", "type": ["string", "null"], "default": null},
{"name": "PROPID", "type": ["string", "null"], "default": null},
{"name": "SENTID", "type": ["string", "null"], "default": null},
{"name": "PROP_NORM", "type": ["string", "null"], "default": null},
{"name": "PROP_MENTION", "type": ["string", "null"], "default": null},
{"name": "PER_NORM", "type": ["string", "null"], "default": null},
{"name": "PER_MENTION", "type": ["string", "null"], "default": null}
]
}},
{"name": "ENTINFO", "doc": "Sentences stripped of tags for ease of reading", "type": {
"name": "Entities", "type": "record", "namespace": "entities.name.space", "fields": [
{"name": "ID", "type": ["string", "null"], "default": null},
{"name": "TYPE", "type": ["string", "null"], "default": null},
{"name": "NORM", "type": ["string", "null"], "default": null},
{"name": "REFID", "type": ["string", "null"], "default": null},
{"name": "MENTION", "type": ["string", "null"], "default": null}
]
}}
]
}}
]
}