Elasticsearch 7.8 嵌套聚合不返回正确的数据

Question

我花了一个星期，试图通过嵌套聚合从 Elasticsearch 索引中获取正确的数据，但一直没有成功。下面是我的索引映射和插入的两个示例文档。我想要实现的是：

匹配字段 xforms.sentence.tokens.value 等于 24 的文档
在匹配的文档集中，统计 xforms.sentence.tokens.value 等于 24 的词元（token）数量，并按 xforms.sentence.tokens.tag 分组

因此，以下文插入的示例文档为例，我期望的输出是：

{"JJ": 1, "NN": 1}

{
  "_doc": {
    "_meta": {},
    "_source": {},
    "properties": {
      "originalText": {
        "type": "text"
      },
      "testDataId": {
        "type": "text"
      },
      "xforms": {
        "type": "nested",
        "properties": {
          "sentence": {
            "type": "nested"
          },
          "predicate": {
            "type": "nested"
          }
        }
      },
      "corpusId": {
        "type": "text"
      },
      "row": {
        "type": "text"
      },
      "batchId": {
        "type": "text"
      },
      "processor": {
        "type": "text"
      }
    }
  }
}

插入的示例文档如下：

{
    "_id": "28",
    "_source": {
        "testDataId": "5e97e9bef033448b893e485baa0fdf15",
        "originalText": "Some text with the word 24",
        "xforms": [{
            "sentence": {
                "tokens": [{
                        "lemma": "Some",
                        "index": 1,
                        "after": " ",
                        "tag": "JJ",
                        "value": "Some"
                    },
                    {
                        "lemma": "text",
                        "index": 2,
                        "after": " ",
                        "tag": "NN",
                        "value": "text"
                    },
                    {
                        "lemma": "with",
                        "index": 3,
                        "after": " ",
                        "tag": "NN",
                        "value": "with"
                    },
                    {
                        "lemma": "the",
                        "index": 4,
                        "after": "",
                        "tag": "CD",
                        "value": "the"
                    },
                    {
                        "lemma": "word",
                        "index": 5,
                        "after": " ",
                        "tag": "CC",
                        "value": "word"
                    },
                    {
                        "lemma": "24",
                        "index": 6,
                        "after": " ",
                        "tag": "JJ",
                        "value": "24"
                    }
                ],
                "type": "RAW"
            },
            "originalSentence": "Some text with the word 24 in it",
            "id": "e724611d8c024bcb8f0158b60e3df87e"
        }]
    }
},
{
    "_id": "56",
    "_source": {
        "testDataId": "5e97e9bef033448b893e485baa0fad15",
        "originalText": "24 word",
        "xforms": [{
            "sentence": {
                "tokens": [{
                        "lemma": "24",
                        "index": 1,
                        "after": " ",
                        "tag": "NN",
                        "value": "24"
                    },
                    {
                        "lemma": "word",
                        "index": 2,
                        "after": " ",
                        "tag": "JJ",
                        "value": "word"
                    }
                ],
                "type": "RAW"
            },
            "originalSentence": "24 word",
            "id": "e724611d8c024bcb8f0158b60e3d123"
        }]
    }
}

Answer 1

{
  "aggs": {
    "xforms": {
      "nested": { //Nested aggregation
        "path": "xforms.sentence"
      },
      "aggs": {
        "inner": { //Counting only within the matching doc
          "filter": {
            "bool": {
              "filter": { //Filtering docs with value=24
                "terms": {
                  "xforms.sentence.tokens.value": [
                    "24"
                  ]
                }
              }
            }
          },
        "aggs" : {
          "tag_count":{ //On filtered doc, doing terms aggregation on tag's keyword version as tag is of type text
            "terms":{
              "field":"xforms.sentence.tokens.tag.keyword"
            }
          }
        }
        }
      }
    }
  }
}

该查询返回以下输出：

"aggregations": {
        "xforms": {
            "doc_count": 2,
            "inner": {
                "doc_count": 2,
                "tag_count": {
                    "doc_count_error_upper_bound": 0,
                    "sum_other_doc_count": 0,
                    "buckets": [
                        {
                            "key": "JJ",
                            "doc_count": 2
                        },
                        {
                            "key": "NN",
                            "doc_count": 2
                        },
                        {
                            "key": "CC",
                            "doc_count": 1
                        },
                        {
                            "key": "CD",
                            "doc_count": 1
                        }
                    ]
                }
            }
        }
    }

Answer 2

对 @Gibbs 的回答做一点补充：@N Kiram，您还需要将 tokens 字段的类型也设置为 nested：

{
  "xforms":{
    "type":"nested",
    "properties":{
      "sentence":{
        "type":"nested",
        "properties":{
          "tokens":{              <----
            "type":"nested"
          }
        }
      },
      "predicate":{
        "type":"nested"
      }
    }
  }
}

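只有这样，您的聚合才会产生正确的计数（注意：tokens 变为 nested 之后，聚合中 nested 的 path 也需要相应改为 xforms.sentence.tokens，这样每个 token 才会作为独立的嵌套文档参与过滤和计数；下面的 xforms.doc_count 为 8，正是两个文档中 token 的总数 6 + 2）：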

{
  "aggregations":{
    "xforms":{
      "doc_count":8,
      "inner":{
        "doc_count":2,
        "tag_count":{
          "doc_count_error_upper_bound":0,
          "sum_other_doc_count":0,
          "buckets":[
            {
              "key":"JJ",
              "doc_count":1
            },
            {
              "key":"NN",
              "doc_count":1
            }
          ]
        }
      }
    }
  }
}

附注：您必须重建索引（reindex），更改后的映射才会生效。

Elasticsearch 7.8 嵌套聚合不返回正确的数据

Elasticsearch 7.8 Nested Aggregation not returning correct data

elasticsearch

elasticsearch-aggregation