为什么在elasticsearch中keyword类型比text占用多space?
Why does the keyword type take up much more space than text in elasticsearch?
环境:弹性搜索 5.5.1
首先我的elasticsearch中有两个索引
两个索引唯一不同的是消息字段,索引1中消息字段的类型是关键字,索引2中是文本。
为了保证不受其他字段的影响,我把message字段去掉,对比前后结果:
删除消息字段之前:
删除消息字段后我得到:
很明显message字段占了很多space,keyword的type比text占的多,但是不知道为什么keyword比text占的多?
那么,有人帮我吗?
以下是index1的映射信息索引:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"@agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"@timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
和设置信息:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "3",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index1",
"creation_date": "1531389785215",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "zd8oVbwUQbys1UJ8hJZRmQ",
"version": {
"created": "5050099"
}
}
}
index2的映射信息索引如下:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"@agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"@timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"CommunicationReturnCode": {
"type": "keyword"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"action": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "text"
},
"msgid": {
"type": "keyword"
},
"msgname": {
"type": "keyword"
},
"nodetype": {
"type": "keyword"
},
"orgid": {
"type": "keyword"
},
"orgname": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"processingState": {
"type": "keyword"
},
"processingStatecode": {
"type": "keyword"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"thread": {
"type": "keyword"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"transDescription": {
"type": "keyword"
},
"transactionErrorCode": {
"type": "keyword"
},
"transactionTimeConsuming": {
"type": "keyword"
},
"transcode": {
"type": "keyword"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
和设置信息:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "2",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index2",
"creation_date": "1531467294314",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "yROU2MrMTzip4VXH_zWEXQ",
"version": {
"created": "5050099"
}
}
}
以下是关于文本类型字段的两个分片的索引文件结构之一:
和关键字类型字段:
你可以相信两个文件夹中的文档数量相同,字段的唯一区别是消息字段的类型。
你能解释一下吗?
非常感谢!
在 Elasticsearch 关键字字段中有 doc_values enabled by default, while text fields does not. This means that on your keyword fields it will store the whole field in a column-oriented fashion, in order to be able to perform aggregations or sorting, without relying on fielddata.
此外,一旦对字符串进行词干化、小写等操作,就可以获得更好的压缩效果。
如果您不对其执行聚合或排序,您可以尝试在该字段上禁用 doc_values。
环境:弹性搜索 5.5.1
首先我的elasticsearch中有两个索引 两个索引唯一不同的是消息字段,索引1中消息字段的类型是关键字,索引2中是文本。
为了保证不受其他字段的影响,我把message字段去掉,对比前后结果:
删除消息字段之前:
删除消息字段后我得到:
很明显message字段占了很多space,keyword的type比text占的多,但是不知道为什么keyword比text占的多? 那么,有人帮我吗?
以下是index1的映射信息索引:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"@agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"@timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
和设置信息:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "3",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index1",
"creation_date": "1531389785215",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "zd8oVbwUQbys1UJ8hJZRmQ",
"version": {
"created": "5050099"
}
}
}
index2的映射信息索引如下:
"mappings": {
"system": {
"dynamic": "true",
"_all": {
"enabled": false
},
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"dynamic_templates": [
{
"geo2": {
"match": "*_geo",
"mapping": {
"type": "geo_point"
}
}
},
{
"strings2": {
"match_mapping_type": "string",
"mapping": {
"type": "keyword"
}
}
}
],
"numeric_detection": false,
"properties": {
"@agent_timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"@timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"CommunicationReturnCode": {
"type": "keyword"
},
"Kafkaspeed": {
"type": "keyword"
},
"_index_name": {
"type": "keyword"
},
"action": {
"type": "keyword"
},
"count": {
"type": "long"
},
"datex": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"defaultWriteESspeed": {
"type": "double"
},
"filepathname": {
"type": "keyword"
},
"jsonmessage": {
"type": "text"
},
"key": {
"type": "keyword"
},
"logcount": {
"type": "long"
},
"loglevel": {
"type": "keyword"
},
"message": {
"type": "text"
},
"msgid": {
"type": "keyword"
},
"msgname": {
"type": "keyword"
},
"nodetype": {
"type": "keyword"
},
"orgid": {
"type": "keyword"
},
"orgname": {
"type": "keyword"
},
"paredspeed": {
"type": "float"
},
"processingState": {
"type": "keyword"
},
"processingStatecode": {
"type": "keyword"
},
"seccount": {
"type": "long"
},
"sn": {
"type": "long"
},
"sourceName": {
"type": "keyword"
},
"sourceip": {
"type": "keyword"
},
"sourcename": {
"type": "keyword"
},
"sourceport": {
"type": "long"
},
"sucesscount": {
"type": "long"
},
"thread": {
"type": "keyword"
},
"time_str": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss.SSS"
},
"timestamp": {
"type": "long"
},
"totalcount": {
"type": "long"
},
"transDescription": {
"type": "keyword"
},
"transactionErrorCode": {
"type": "keyword"
},
"transactionTimeConsuming": {
"type": "keyword"
},
"transcode": {
"type": "keyword"
},
"uniqueid": {
"type": "keyword"
}
}
}
}
和设置信息:
"settings": {
"index": {
"refresh_interval": "1s",
"number_of_shards": "2",
"translog": {
"flush_threshold_size": "1024mb",
"sync_interval": "60s",
"durability": "async"
},
"provided_name": "index2",
"creation_date": "1531467294314",
"analysis": {
"analyzer": {
"optionIK": {
"filter": [
"word_delimiter"
],
"type": "custom",
"tokenizer": "ik_max_word"
}
}
},
"number_of_replicas": "0",
"uuid": "yROU2MrMTzip4VXH_zWEXQ",
"version": {
"created": "5050099"
}
}
}
以下是关于文本类型字段的两个分片的索引文件结构之一:
你可以相信两个文件夹中的文档数量相同,字段的唯一区别是消息字段的类型。
你能解释一下吗? 非常感谢!
在 Elasticsearch 关键字字段中有 doc_values enabled by default, while text fields does not. This means that on your keyword fields it will store the whole field in a column-oriented fashion, in order to be able to perform aggregations or sorting, without relying on fielddata.
此外,一旦对字符串进行词干化、小写等操作,就可以获得更好的压缩效果。
如果您不对其执行聚合或排序,您可以尝试在该字段上禁用 doc_values。