Elasticsearch 地理搜索异常行为
Elasticsearch geo search strange behavior
几天前,我遇到了 Elasticsearch 中地理搜索的奇怪行为。
我使用 AWS 托管 ES 5.5,显然是通过 REST 接口。
假设我们有 200k 个对象,其位置信息仅表示为点。我使用地理搜索来查找多个多边形内的点。它们显示在下图中。坐标是从对 ES 的最终请求中提取的。
该请求是使用官方 Java 高级 REST 客户端构建的。下面附上请求查询。
我想搜索至少 一个 多边形内的所有对象。
这是查询(实际字段名称和值已替换为存根,Except location 和 locationPoint.coordinates)
{
"size" : 20,
"query" : {
"constant_score" : {
"filter" : {
"bool" : {
"must" : [
{
"terms" : {
"field1" : [
"a",
"b",
"c",
"d",
"e",
"f"
],
"boost" : 1.0
}
},
{
"term" : {
"field2" : {
"value" : "q",
"boost" : 1.0
}
}
},
{
"range" : {
"field3" : {
"from" : "10",
"to" : null,
"include_lower" : true,
"include_upper" : true,
"boost" : 1.0
}
}
},
{
"range" : {
"field4" : {
"from" : "10",
"to" : null,
"include_lower" : true,
"include_upper" : true,
"boost" : 1.0
}
}
},
{
"geo_shape" : {
"location" : {
"shape" : {
"type" : "geometrycollection",
"geometries" : [
{
"type" : "multipolygon",
"orientation" : "right",
"coordinates" : [
[
// coords here
]
]
},
{
"type" : "polygon",
"orientation" : "right",
"coordinates" : [
[
// coords here
]
]
},
{
"type" : "polygon",
"orientation" : "right",
"coordinates" : [
[
// coords here
]
]
},
{
"type" : "polygon",
"orientation" : "right",
"coordinates" : [
[
// coords here
]
]
}
]
},
"relation" : "intersects"
},
"ignore_unmapped" : false,
"boost" : 1.0
}
}
]
}
},
"boost" : 1.0
}
},
"_source" : {
"includes" : [
"field1",
"field2",
"field3",
"field4",
"field8"
],
"excludes" : [ ]
},
"sort" : [
{
"field1" : {
"order" : "desc"
}
}
],
"aggregations" : {
"agg1" : {
"terms" : {
"field" : "field1",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg2" : {
"terms" : {
"field" : "field2",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg3" : {
"terms" : {
"field" : "field3",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg4" : {
"terms" : {
"field" : "field4",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg5" : {
"terms" : {
"field" : "field5",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg6" : {
"terms" : {
"field" : "field6",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg7" : {
"terms" : {
"field" : "field7",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg8" : {
"terms" : {
"field" : "field8",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"map_center" : {
"geo_centroid" : {
"field" : "locationPoint.coordinates"
}
},
"map_bound" : {
"geo_bounds" : {
"field" : "locationPoint.coordinates",
"wrap_longitude" : true
}
}
}
}
注意,字段location映射为geo_shape和字段location.coordinates 映射为 geo_point.
那么接下来就是问题了。下面是请求的结果(点击数)。只有多边形在变化。
# Polygons Hits count
1) 1,2,3,4 5565
2) 1 4897
3) 3,4 75
4) 2 9
5) 1,3,4 5543
6) 1,2 5466
7) 2,3,4 84
因此,如果我将第一个多边形的结果与 2、3、4 个多边形相加,我将无法获得完整请求中的数字。
例如,#1 != #2 + #7,还有#1 != #5 + #4,但是 #7 == #4 + #3
我不明白这是这个请求中的问题还是预期的行为,甚至是 ES 中的错误。
任何人都可以帮助我理解这种 ES 行为的逻辑或指出解决方案吗?
谢谢!
在与 Elasticsearch 团队成员进行了简短的交谈后,我们谈到了 AWS。
AWS 和纯 ES 的构建哈希是不相等的,ES 是由 AWS 团队修改的,我们不知道确切的变化。可能会有一些更改可能会影响已发布问题中的搜索。
在我们继续对话之前,需要在纯 ES 集群上重现此行为。
几天前,我遇到了 Elasticsearch 中地理搜索的奇怪行为。
我使用 AWS 托管 ES 5.5,显然是通过 REST 接口。
假设我们有 200k 个对象,其位置信息仅表示为点。我使用地理搜索来查找多个多边形内的点。它们显示在下图中。坐标是从对 ES 的最终请求中提取的。
我想搜索至少 一个 多边形内的所有对象。 这是查询(实际字段名称和值已替换为存根,Except location 和 locationPoint.coordinates)
{
"size" : 20,
"query" : {
"constant_score" : {
"filter" : {
"bool" : {
"must" : [
{
"terms" : {
"field1" : [
"a",
"b",
"c",
"d",
"e",
"f"
],
"boost" : 1.0
}
},
{
"term" : {
"field2" : {
"value" : "q",
"boost" : 1.0
}
}
},
{
"range" : {
"field3" : {
"from" : "10",
"to" : null,
"include_lower" : true,
"include_upper" : true,
"boost" : 1.0
}
}
},
{
"range" : {
"field4" : {
"from" : "10",
"to" : null,
"include_lower" : true,
"include_upper" : true,
"boost" : 1.0
}
}
},
{
"geo_shape" : {
"location" : {
"shape" : {
"type" : "geometrycollection",
"geometries" : [
{
"type" : "multipolygon",
"orientation" : "right",
"coordinates" : [
[
// coords here
]
]
},
{
"type" : "polygon",
"orientation" : "right",
"coordinates" : [
[
// coords here
]
]
},
{
"type" : "polygon",
"orientation" : "right",
"coordinates" : [
[
// coords here
]
]
},
{
"type" : "polygon",
"orientation" : "right",
"coordinates" : [
[
// coords here
]
]
}
]
},
"relation" : "intersects"
},
"ignore_unmapped" : false,
"boost" : 1.0
}
}
]
}
},
"boost" : 1.0
}
},
"_source" : {
"includes" : [
"field1",
"field2",
"field3",
"field4",
"field8"
],
"excludes" : [ ]
},
"sort" : [
{
"field1" : {
"order" : "desc"
}
}
],
"aggregations" : {
"agg1" : {
"terms" : {
"field" : "field1",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg2" : {
"terms" : {
"field" : "field2",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg3" : {
"terms" : {
"field" : "field3",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg4" : {
"terms" : {
"field" : "field4",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg5" : {
"terms" : {
"field" : "field5",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg6" : {
"terms" : {
"field" : "field6",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg7" : {
"terms" : {
"field" : "field7",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"agg8" : {
"terms" : {
"field" : "field8",
"size" : 10000,
"min_doc_count" : 1,
"shard_min_doc_count" : 0,
"show_term_doc_count_error" : false,
"order" : [
{
"_count" : "desc"
},
{
"_term" : "asc"
}
]
}
},
"map_center" : {
"geo_centroid" : {
"field" : "locationPoint.coordinates"
}
},
"map_bound" : {
"geo_bounds" : {
"field" : "locationPoint.coordinates",
"wrap_longitude" : true
}
}
}
}
注意,字段location映射为geo_shape和字段location.coordinates 映射为 geo_point.
那么接下来就是问题了。下面是请求的结果(点击数)。只有多边形在变化。
# Polygons Hits count
1) 1,2,3,4 5565
2) 1 4897
3) 3,4 75
4) 2 9
5) 1,3,4 5543
6) 1,2 5466
7) 2,3,4 84
因此,如果我将第一个多边形的结果与 2、3、4 个多边形相加,我将无法获得完整请求中的数字。
例如,#1 != #2 + #7,还有#1 != #5 + #4,但是 #7 == #4 + #3
我不明白这是这个请求中的问题还是预期的行为,甚至是 ES 中的错误。
任何人都可以帮助我理解这种 ES 行为的逻辑或指出解决方案吗?
谢谢!
在与 Elasticsearch 团队成员进行了简短的交谈后,我们谈到了 AWS。 AWS 和纯 ES 的构建哈希是不相等的,ES 是由 AWS 团队修改的,我们不知道确切的变化。可能会有一些更改可能会影响已发布问题中的搜索。 在我们继续对话之前,需要在纯 ES 集群上重现此行为。