Elasticsearch 7.8 嵌套聚合不返回正确的数据
Elasticsearch 7.8 Nested Aggregation not returning correct data
我已经努力了一个星期试图从 Elasticsearch 嵌套聚合索引中获取正确的数据。下面是我的索引映射和插入的两个示例文档。我要查找的是:
- 匹配字段 xforms.sentence.tokens.value 等于 24 的所有文档
- 在匹配的文档集中,仅针对 xforms.sentence.tokens.value 等于 24 的 token,按 xforms.sentence.tokens.tag 分组统计匹配项的数量
因此,以下面插入的示例文档为例,我期望的输出是:
{"JJ": 1, "NN": 1}
{
"_doc": {
"_meta": {},
"_source": {},
"properties": {
"originalText": {
"type": "text"
},
"testDataId": {
"type": "text"
},
"xforms": {
"type": "nested",
"properties": {
"sentence": {
"type": "nested"
},
"predicate": {
"type": "nested"
}
}
},
"corpusId": {
"type": "text"
},
"row": {
"type": "text"
},
"batchId": {
"type": "text"
},
"processor": {
"type": "text"
}
}
}
}
插入的示例文档如下:
{
"_id": "28",
"_source": {
"testDataId": "5e97e9bef033448b893e485baa0fdf15",
"originalText": "Some text with the word 24",
"xforms": [{
"sentence": {
"tokens": [{
"lemma": "Some",
"index": 1,
"after": " ",
"tag": "JJ",
"value": "Some"
},
{
"lemma": "text",
"index": 2,
"after": " ",
"tag": "NN",
"value": "text"
},
{
"lemma": "with",
"index": 3,
"after": " ",
"tag": "NN",
"value": "with"
},
{
"lemma": "the",
"index": 4,
"after": "",
"tag": "CD",
"value": "the"
},
{
"lemma": "word",
"index": 5,
"after": " ",
"tag": "CC",
"value": "word"
},
{
"lemma": "24",
"index": 6,
"after": " ",
"tag": "JJ",
"value": "24"
}
],
"type": "RAW"
},
"originalSentence": "Some text with the word 24 in it",
"id": "e724611d8c024bcb8f0158b60e3df87e"
}]
}
},
{
"_id": "56",
"_source": {
"testDataId": "5e97e9bef033448b893e485baa0fad15",
"originalText": "24 word",
"xforms": [{
"sentence": {
"tokens": [{
"lemma": "24",
"index": 1,
"after": " ",
"tag": "NN",
"value": "24"
},
{
"lemma": "word",
"index": 2,
"after": " ",
"tag": "JJ",
"value": "word"
}
],
"type": "RAW"
},
"originalSentence": "24 word",
"id": "e724611d8c024bcb8f0158b60e3d123"
}]
}
}
{
"aggs": {
"xforms": {
"nested": { //Nested aggregation
"path": "xforms.sentence"
},
"aggs": {
"inner": { //Counting only within the matching doc
"filter": {
"bool": {
"filter": { //Filtering docs with value=24
"terms": {
"xforms.sentence.tokens.value": [
"24"
]
}
}
}
},
"aggs" : {
"tag_count":{ //On filtered doc, doing terms aggregation on tag's keyword version as tag is of type text
"terms":{
"field":"xforms.sentence.tokens.tag.keyword"
}
}
}
}
}
}
}
}
上面的查询返回以下输出:
"aggregations": {
"xforms": {
"doc_count": 2,
"inner": {
"doc_count": 2,
"tag_count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "JJ",
"doc_count": 2
},
{
"key": "NN",
"doc_count": 2
},
{
"key": "CC",
"doc_count": 1
},
{
"key": "CD",
"doc_count": 1
}
]
}
}
}
}
在 @Gibbs 回答的基础上补充:@N Kiram,您还需要将 tokens 设置为 nested:
{
"xforms":{
"type":"nested",
"properties":{
"sentence":{
"type":"nested",
"properties":{
"tokens":{ <----
"type":"nested"
}
}
},
"predicate":{
"type":"nested"
}
}
}
}
只有这样,您的聚合才会产生正确的计数:
{
"aggregations":{
"xforms":{
"doc_count":8,
"inner":{
"doc_count":2,
"tag_count":{
"doc_count_error_upper_bound":0,
"sum_other_doc_count":0,
"buckets":[
{
"key":"JJ",
"doc_count":1
},
{
"key":"NN",
"doc_count":1
}
]
}
}
}
}
}
旁注:您必须重建索引才能应用更改的映射。
我已经努力了一个星期试图从 Elasticsearch 嵌套聚合索引中获取正确的数据。下面是我的索引映射和插入的两个示例文档。我要查找的是:
- 匹配字段xforms.sentence.tokens.value等于24 的所有文档
- 在匹配的文档集中,仅针对 xforms.sentence.tokens.value 等于 24 的 token,按 xforms.sentence.tokens.tag 分组统计匹配项的数量
因此,以下面插入的示例文档为例,我期望的输出是:
{"JJ": 1, "NN": 1}
{
"_doc": {
"_meta": {},
"_source": {},
"properties": {
"originalText": {
"type": "text"
},
"testDataId": {
"type": "text"
},
"xforms": {
"type": "nested",
"properties": {
"sentence": {
"type": "nested"
},
"predicate": {
"type": "nested"
}
}
},
"corpusId": {
"type": "text"
},
"row": {
"type": "text"
},
"batchId": {
"type": "text"
},
"processor": {
"type": "text"
}
}
}
}
插入的示例文档如下:
{
"_id": "28",
"_source": {
"testDataId": "5e97e9bef033448b893e485baa0fdf15",
"originalText": "Some text with the word 24",
"xforms": [{
"sentence": {
"tokens": [{
"lemma": "Some",
"index": 1,
"after": " ",
"tag": "JJ",
"value": "Some"
},
{
"lemma": "text",
"index": 2,
"after": " ",
"tag": "NN",
"value": "text"
},
{
"lemma": "with",
"index": 3,
"after": " ",
"tag": "NN",
"value": "with"
},
{
"lemma": "the",
"index": 4,
"after": "",
"tag": "CD",
"value": "the"
},
{
"lemma": "word",
"index": 5,
"after": " ",
"tag": "CC",
"value": "word"
},
{
"lemma": "24",
"index": 6,
"after": " ",
"tag": "JJ",
"value": "24"
}
],
"type": "RAW"
},
"originalSentence": "Some text with the word 24 in it",
"id": "e724611d8c024bcb8f0158b60e3df87e"
}]
}
},
{
"_id": "56",
"_source": {
"testDataId": "5e97e9bef033448b893e485baa0fad15",
"originalText": "24 word",
"xforms": [{
"sentence": {
"tokens": [{
"lemma": "24",
"index": 1,
"after": " ",
"tag": "NN",
"value": "24"
},
{
"lemma": "word",
"index": 2,
"after": " ",
"tag": "JJ",
"value": "word"
}
],
"type": "RAW"
},
"originalSentence": "24 word",
"id": "e724611d8c024bcb8f0158b60e3d123"
}]
}
}
{
"aggs": {
"xforms": {
"nested": { //Nested aggregation
"path": "xforms.sentence"
},
"aggs": {
"inner": { //Counting only within the matching doc
"filter": {
"bool": {
"filter": { //Filtering docs with value=24
"terms": {
"xforms.sentence.tokens.value": [
"24"
]
}
}
}
},
"aggs" : {
"tag_count":{ //On filtered doc, doing terms aggregation on tag's keyword version as tag is of type text
"terms":{
"field":"xforms.sentence.tokens.tag.keyword"
}
}
}
}
}
}
}
}
上面的查询返回以下输出:
"aggregations": {
"xforms": {
"doc_count": 2,
"inner": {
"doc_count": 2,
"tag_count": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "JJ",
"doc_count": 2
},
{
"key": "NN",
"doc_count": 2
},
{
"key": "CC",
"doc_count": 1
},
{
"key": "CD",
"doc_count": 1
}
]
}
}
}
}
在 @Gibbs 回答的基础上补充:@N Kiram,您还需要将 tokens 设置为 nested:
{
"xforms":{
"type":"nested",
"properties":{
"sentence":{
"type":"nested",
"properties":{
"tokens":{ <----
"type":"nested"
}
}
},
"predicate":{
"type":"nested"
}
}
}
}
只有这样,您的聚合才会产生正确的计数:
{
"aggregations":{
"xforms":{
"doc_count":8,
"inner":{
"doc_count":2,
"tag_count":{
"doc_count_error_upper_bound":0,
"sum_other_doc_count":0,
"buckets":[
{
"key":"JJ",
"doc_count":1
},
{
"key":"NN",
"doc_count":1
}
]
}
}
}
}
}
旁注:您必须重建索引才能应用更改的映射。