Elasticsearch：在另一个字段满足 where 条件的前提下，统计某个字段不同值的数量

Question

我想在我的 Elasticsearch 索引上执行一个与下面这条 Postgres 查询等价的查询：

    select count(distinct(candidate_id)) from candidate_ranking cr
    where badge='1'

请考虑下面的示例索引，其中包含少量文档

{
  "id": 295537,
  "candidate_id": 29492,
  "created_at": "2021-03-30T02:23:42.077149+00:00",
  "badge": "1"
}
{
  "id": 271179,
  "candidate_id": 29492,
  "created_at": "2021-03-30T01:19:59.803999+00:00",
  "badge": "1"
}
{
  "id": 247169,
  "candidate_id": 29492,
  "created_at": "2021-03-30T00:16:04.077245+00:00",
  "badge": "1"
}
{
  "id": 247156,
  "candidate_id": 29332,
  "created_at": "2021-03-30T00:17:04.077245+00:00",
  "badge": "1"
}
{
  "id": 225434,
  "candidate_id": 24493,
  "created_at": "2021-03-29T23:13:59.266074+00:00",
  "badge": null
}
{
  "id": 192999,
  "candidate_id": 24493,
  "created_at": "2021-03-29T22:20:24.942116+00:00",
  "badge": null
}
{
  "id": 177712,
  "candidate_id": 24493,
  "created_at": "2021-03-29T21:33:32.596613+00:00",
  "badge": null
}
{
  "id": 162916,
  "candidate_id": 24493,
  "created_at": "2021-03-29T21:05:03.985032+00:00",
  "badge": null
}
{
  "id": 148136,
  "candidate_id": 23422,
  "created_at": "2021-03-29T20:20:36.482066+00:00",
  "badge": "2"
}
{
  "id": 118558,
  "candidate_id": 23422,
  "created_at": "2021-03-27T01:34:29.628550+00:00",
  "badge": "2"
}
{
  "id": 133354,
  "candidate_id": 23422,
  "created_at": "2021-03-27T02:11:35.811420+00:00",
  "badge": "2"
}

对于上面的数据，期望得到的计数是 2，因为只有 candidate_id 为 29492 和 29332 的文档的 badge 为 "1"。我的 ES 索引中有许多 candidate_id 相同但 created_at 不同的文档。

Answer 1

您需要组合使用多种聚合：terms、top_hits 和 max 聚合。

然后再使用 stats_bucket 聚合来统计桶的数量（即搜索结果中 bucketcount 下的 count 值）。

{
  "size": 0,
  "aggs": {
    "badge_1": {
      "terms": {
        "field": "badge.keyword",
        "include": [
          "1"
        ],
        "size": 10
      },
      "aggs": {
        "unique_id": {
          "terms": {
            "field": "candidate_id",
            "size": 10,
            "order": {
              "latestOrder": "desc"
            }
          },
          "aggs": {
            "top_doc": {
              "top_hits": {
                "size": 1
              }
            },
            "latestOrder": {
              "max": {
                "field": "created_at"
              }
            }
          }
        },
        "bucketcount": {
          "stats_bucket": {
            "buckets_path": "unique_id._count"
          }
        }
      }
    }
  }
}

搜索结果如下：

    "aggregations": {
    "badge_1": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "1",
          "doc_count": 4,
          "unique_id": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
              {
                "key": 29492,
                "doc_count": 3,
                "latestOrder": {
                  "value": 1.617071022077E12,
                  "value_as_string": "2021-03-30T02:23:42.077000Z"
                },
                "top_doc": {
                  "hits": {
                    "total": {
                      "value": 3,
                      "relation": "eq"
                    },
                    "max_score": 1.0,
                    "hits": [
                      {
                        "_index": "67162554",
                        "_type": "_doc",
                        "_id": "1",
                        "_score": 1.0,
                        "_source": {
                          "id": 295537,
                          "candidate_id": 29492,
                          "created_at": "2021-03-30T02:23:42.077149+00:00",
                          "badge": "1"
                        }
                      }
                    ]
                  }
                }
              },
              {
                "key": 29332,
                "doc_count": 1,
                "latestOrder": {
                  "value": 1.617063424077E12,
                  "value_as_string": "2021-03-30T00:17:04.077000Z"
                },
                "top_doc": {
                  "hits": {
                    "total": {
                      "value": 1,
                      "relation": "eq"
                    },
                    "max_score": 1.0,
                    "hits": [
                      {
                        "_index": "67162554",
                        "_type": "_doc",
                        "_id": "4",
                        "_score": 1.0,
                        "_source": {
                          "id": 247156,
                          "candidate_id": 29332,
                          "created_at": "2021-03-30T00:17:04.077245+00:00",
                          "badge": "1"
                        }
                      }
                    ]
                  }
                }
              }
            ]
          },
          "bucketcount": {
            "count": 2,        // note this
            "min": 1.0,
            "max": 3.0,
            "avg": 2.0,
            "sum": 4.0
          }
        }
      ]
    }
  }

Answer 2

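要统计不同值的数量，您可以使用 terms 聚合，然后数一数结果中桶的个数。注意：terms 聚合默认只返回前 10 个桶（size 默认为 10），如果不同值较多，需要调大 size，或者改用 cardinality 聚合直接得到（近似的）去重计数。示例如下：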

  GET test_index/_search
  {
    "size": 0,
    "query": {
      "match": {
        "badge": "1"
      }
    }, 
    "aggs": {
      "candidate_aggs": {
        "terms": {
          "field": "candidate_id"
        }
      }
    }
  }

这将返回以下结果：

"aggregations" : {
"candidate_aggs" : {
  "doc_count_error_upper_bound" : 0,
  "sum_other_doc_count" : 0,
  "buckets" : [
    {
      "key" : 29492,
      "doc_count" : 3
    },
    {
      "key" : 29332,
      "doc_count" : 1
    }
  ]
}
}

Elasticsearch：在另一个字段满足 where 条件的前提下，统计某个字段不同值的数量

elasticsearch query for count of distinct field value with where condition on another field

elasticsearch

elasticsearch-aggregation