Elasticsearch:在另一个字段满足 where 条件的前提下,统计某个字段不同值的数量
elasticsearch query for count of distinct field value with where condition on another field
我想在我的 Elasticsearch 索引上执行一个与下面这条 Postgres 查询等价的查询:
select count(distinct(candidate_id)) from candidate_ranking cr
where badge='1'
请考虑下面的示例索引,其中包含少量文档
{
"id": 295537,
"candidate_id": 29492,
"created_at": "2021-03-30T02:23:42.077149+00:00",
"badge": "1"
}
{
"id": 271179,
"candidate_id": 29492,
"created_at": "2021-03-30T01:19:59.803999+00:00",
"badge": "1"
}
{
"id": 247169,
"candidate_id": 29492,
"created_at": "2021-03-30T00:16:04.077245+00:00",
"badge": "1"
}
{
"id": 247156,
"candidate_id": 29332,
"created_at": "2021-03-30T00:17:04.077245+00:00",
"badge": "1"
}
{
"id": 225434,
"candidate_id": 24493,
"created_at": "2021-03-29T23:13:59.266074+00:00",
"badge": null
}
{
"id": 192999,
"candidate_id": 24493,
"created_at": "2021-03-29T22:20:24.942116+00:00",
"badge": null
}
{
"id": 177712,
"candidate_id": 24493,
"created_at": "2021-03-29T21:33:32.596613+00:00",
"badge": null
}
{
"id": 162916,
"candidate_id": 24493,
"created_at": "2021-03-29T21:05:03.985032+00:00",
"badge": null
}
{
"id": 148136,
"candidate_id": 23422,
"created_at": "2021-03-29T20:20:36.482066+00:00",
"badge": "2"
}
{
"id": 118558,
"candidate_id": 23422,
"created_at": "2021-03-27T01:34:29.628550+00:00",
"badge": "2"
}
{
"id": 133354,
"candidate_id": 23422,
"created_at": "2021-03-27T02:11:35.811420+00:00",
"badge": "2"
}
对于上述情况,我期望得到的计数是 2,因为只有 candidate_id 为 29492 和 29332 的文档的 badge 值为 "1"。我的 ES 索引中包含许多 candidate_id 相同但 created_at 不同的文档。
您需要组合使用多种聚合:terms、top_hits 和 max 聚合。
然后再使用 stats_bucket 聚合统计桶(bucket)的数量,该数量即为不同 candidate_id 的个数。
{
"size": 0,
"aggs": {
"badge_1": {
"terms": {
"field": "badge.keyword",
"include": [
"1"
],
"size": 10
},
"aggs": {
"unique_id": {
"terms": {
"field": "candidate_id",
"size": 10,
"order": {
"latestOrder": "desc"
}
},
"aggs": {
"top_doc": {
"top_hits": {
"size": 1
}
},
"latestOrder": {
"max": {
"field": "created_at"
}
}
}
},
"bucketcount": {
"stats_bucket": {
"buckets_path": "unique_id._count"
}
}
}
}
}
}
搜索结果如下:
"aggregations": {
"badge_1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1",
"doc_count": 4,
"unique_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 29492,
"doc_count": 3,
"latestOrder": {
"value": 1.617071022077E12,
"value_as_string": "2021-03-30T02:23:42.077000Z"
},
"top_doc": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "67162554",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"id": 295537,
"candidate_id": 29492,
"created_at": "2021-03-30T02:23:42.077149+00:00",
"badge": "1"
}
}
]
}
}
},
{
"key": 29332,
"doc_count": 1,
"latestOrder": {
"value": 1.617063424077E12,
"value_as_string": "2021-03-30T00:17:04.077000Z"
},
"top_doc": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "67162554",
"_type": "_doc",
"_id": "4",
"_score": 1.0,
"_source": {
"id": 247156,
"candidate_id": 29332,
"created_at": "2021-03-30T00:17:04.077245+00:00",
"badge": "1"
}
}
]
}
}
}
]
},
"bucketcount": {
"count": 2, // note this
"min": 1.0,
"max": 3.0,
"avg": 2.0,
"sum": 4.0
}
}
]
}
}
要统计不同值的数量,可以使用 terms 聚合,然后数一下结果中桶(bucket)的个数。注意:terms 聚合默认只返回前 10 个桶,不同值较多时需要调大 size,或改用 cardinality 聚合直接得到(近似的)去重计数。
例如:
GET test_index/_search
{
"size": 0,
"query": {
"match": {
"badge": "1"
}
},
"aggs": {
"candidate_aggs": {
"terms": {
"field": "candidate_id"
}
}
}
}
这将返回以下结果:
"aggregations" : {
"candidate_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 29492,
"doc_count" : 3
},
{
"key" : 29332,
"doc_count" : 1
}
]
}
我想在我的 Elasticsearch 索引上执行一个与下面这条 Postgres 查询等价的查询:
select count(distinct(candidate_id)) from candidate_ranking cr
where badge='1'
请考虑下面的示例索引,其中包含少量文档
{
"id": 295537,
"candidate_id": 29492,
"created_at": "2021-03-30T02:23:42.077149+00:00",
"badge": "1"
}
{
"id": 271179,
"candidate_id": 29492,
"created_at": "2021-03-30T01:19:59.803999+00:00",
"badge": "1"
}
{
"id": 247169,
"candidate_id": 29492,
"created_at": "2021-03-30T00:16:04.077245+00:00",
"badge": "1"
}
{
"id": 247156,
"candidate_id": 29332,
"created_at": "2021-03-30T00:17:04.077245+00:00",
"badge": "1"
}
{
"id": 225434,
"candidate_id": 24493,
"created_at": "2021-03-29T23:13:59.266074+00:00",
"badge": null
}
{
"id": 192999,
"candidate_id": 24493,
"created_at": "2021-03-29T22:20:24.942116+00:00",
"badge": null
}
{
"id": 177712,
"candidate_id": 24493,
"created_at": "2021-03-29T21:33:32.596613+00:00",
"badge": null
}
{
"id": 162916,
"candidate_id": 24493,
"created_at": "2021-03-29T21:05:03.985032+00:00",
"badge": null
}
{
"id": 148136,
"candidate_id": 23422,
"created_at": "2021-03-29T20:20:36.482066+00:00",
"badge": "2"
}
{
"id": 118558,
"candidate_id": 23422,
"created_at": "2021-03-27T01:34:29.628550+00:00",
"badge": "2"
}
{
"id": 133354,
"candidate_id": 23422,
"created_at": "2021-03-27T02:11:35.811420+00:00",
"badge": "2"
}
对于上述情况,我期望得到的计数是 2,因为只有 candidate_id 为 29492 和 29332 的文档的 badge 值为 "1"。我的 ES 索引中包含许多 candidate_id 相同但 created_at 不同的文档。
您需要组合使用多种聚合:terms、top_hits 和 max 聚合。
然后再使用 stats_bucket 聚合统计桶(bucket)的数量,该数量即为不同 candidate_id 的个数。
{
"size": 0,
"aggs": {
"badge_1": {
"terms": {
"field": "badge.keyword",
"include": [
"1"
],
"size": 10
},
"aggs": {
"unique_id": {
"terms": {
"field": "candidate_id",
"size": 10,
"order": {
"latestOrder": "desc"
}
},
"aggs": {
"top_doc": {
"top_hits": {
"size": 1
}
},
"latestOrder": {
"max": {
"field": "created_at"
}
}
}
},
"bucketcount": {
"stats_bucket": {
"buckets_path": "unique_id._count"
}
}
}
}
}
}
搜索结果如下:
"aggregations": {
"badge_1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1",
"doc_count": 4,
"unique_id": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 29492,
"doc_count": 3,
"latestOrder": {
"value": 1.617071022077E12,
"value_as_string": "2021-03-30T02:23:42.077000Z"
},
"top_doc": {
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "67162554",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"id": 295537,
"candidate_id": 29492,
"created_at": "2021-03-30T02:23:42.077149+00:00",
"badge": "1"
}
}
]
}
}
},
{
"key": 29332,
"doc_count": 1,
"latestOrder": {
"value": 1.617063424077E12,
"value_as_string": "2021-03-30T00:17:04.077000Z"
},
"top_doc": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "67162554",
"_type": "_doc",
"_id": "4",
"_score": 1.0,
"_source": {
"id": 247156,
"candidate_id": 29332,
"created_at": "2021-03-30T00:17:04.077245+00:00",
"badge": "1"
}
}
]
}
}
}
]
},
"bucketcount": {
"count": 2, // note this
"min": 1.0,
"max": 3.0,
"avg": 2.0,
"sum": 4.0
}
}
]
}
}
要统计不同值的数量,可以使用 terms 聚合,然后数一下结果中桶(bucket)的个数。注意:terms 聚合默认只返回前 10 个桶,不同值较多时需要调大 size,或改用 cardinality 聚合直接得到(近似的)去重计数。例如:
GET test_index/_search
{
"size": 0,
"query": {
"match": {
"badge": "1"
}
},
"aggs": {
"candidate_aggs": {
"terms": {
"field": "candidate_id"
}
}
}
}
这将返回以下结果:
"aggregations" : {
"candidate_aggs" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : 29492,
"doc_count" : 3
},
{
"key" : 29332,
"doc_count" : 1
}
]
}