ElasticSearch NEST 2.4 中的自定义 "tab" 分词器
Custom "tab" Tokenizer in ElasticSearch NEST 2.4
我有一个包含很多字段的索引,其中一个字段"ServiceCategories"有类似这样的数据:
|Case Management|Developmental Disabilities
我需要用分隔符“|”拆分数据。我尝试这样做:
// Create the index with a custom analyzer intended to split ServiceCategories on '|'.
var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
.Mappings(ms => ms
.Map<ProviderContent>(m => m
.AutoMap()
.Properties(p => p
.String(s => s
.Name(n => n.OrganizationName)
// "raw" sub-field is not analyzed, usable for exact sorting/aggregation.
.Fields(f => f
.String(ss => ss.Name("raw").NotAnalyzed())))
.String(s => s
.Name(n => n.ServiceCategories)
.Analyzer("tab_delim_analyzer"))
.GeoPoint(g => g.Name(n => n.Location).LatLon(true)))))
.Settings(st => st
.Analysis(an => an
.Analyzers(anz => anz
// Custom analyzer: pattern tokenizer + lowercase token filter.
.Custom("tab_delim_analyzer", td => td
.Filters("lowercase")
.Tokenizer("tab_delim_tokenizer")))
.Tokenizers(t => t
.Pattern("tab_delim_tokenizer", tdt => tdt
// BUG: the pattern tokenizer's pattern is a regular expression, and a
// bare "|" is the regex alternation operator (matches the empty string),
// so this splits the text into single characters — see the Analyze
// output later in this document. It must be escaped: @"\|".
.Pattern("|")))));
_elasticClientWrapper.CreateIndex(descriptor);
我的 ServiceCategories(serviceCategories 到 ES)搜索代码使用一个简单的 TermQuery,并将值设置为小写。
使用此搜索参数没有得到结果(其他参数工作正常)。预期结果是从上面的至少一个术语中获得完全匹配。
我也尝试使用经典的分词器让它工作:
// Second attempt: a custom analyzer built on the "classic" tokenizer.
var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
.Mappings(ms => ms
.Map<ProviderContent>(m => m
.AutoMap()
.Properties(p => p
.String(s => s
.Name(n => n.OrganizationName)
.Fields(f => f
.String(ss => ss.Name("raw").NotAnalyzed())))
.String(s => s
.Name(n => n.ServiceCategories)
// NOTE(review): despite the name, "classic_tokenizer" here is a custom
// ANALYZER (defined below). Index analyzer and search analyzer differ
// ("classic" vs "standard"), so index-time and search-time tokens may not
// line up; and the classic tokenizer is not pipe-aware — presumably this
// is why the term query still found nothing. TODO confirm with Analyze API.
.Analyzer("classic_tokenizer")
.SearchAnalyzer("standard"))
.GeoPoint(g => g.Name(n => n.Location).LatLon(true)))))
.Settings(s => s
.Analysis(an => an
.Analyzers(a => a.Custom("classic_tokenizer", ca => ca
.Tokenizer("classic")))));
这也不起作用。任何人都可以帮助我确定我哪里出错了吗?
这是搜索请求:
### ES REQUEST ###
{
"from": 0,
"size": 10,
"sort": [
{
"organizationName": {
"order": "asc"
}
}
],
"query": {
"bool": {
"must": [
{
"match_all": {}
},
{
"term": {
"serviceCategories": {
"value": "developmental disabilities"
}
}
}
]
}
}
}
您的 tab_delim_tokenizer 模式已经很接近了,但还不完全正确 :) 最简单的排查方式是使用 Analyze API 来查看分析器如何对一段文本进行分词。用第一个映射创建索引之后,我们可以检查自定义分析器的实际行为:
// Run the Analyze API against the index's custom analyzer to inspect
// exactly which tokens it produces for a sample input.
client.Analyze(a => a
.Index(_DataSource.ToLower())
.Analyzer("tab_delim_analyzer")
.Text("|Case Management|Developmental Disabilities")
);
其返回结果如下(为简洁起见已截断):
{
"tokens" : [ {
"token" : "|",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
}, {
"token" : "c",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1
}, {
"token" : "a",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2
}, {
"token" : "s",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 3
}, ... ]
}
这证明 tab_delim_tokenizer 并没有按我们预期的方式分词。解决办法是在模式中用 \ 转义 |,并给字符串加上 @ 前缀使其成为逐字字符串字面量(verbatim string literal),即 @"\|"。
这是一个完整的例子
void Main()
{
    // Work against a single local node and one default index.
    var indexName = "default-index";
    var nodePool = new SingleNodeConnectionPool(new Uri("http://localhost:9200"));
    var settings = new ConnectionSettings(nodePool)
        .DefaultIndex(indexName);
    var elasticClient = new ElasticClient(settings);

    // Start from a clean slate so the settings below always take effect.
    if (elasticClient.IndexExists(indexName).Exists)
    {
        elasticClient.DeleteIndex(indexName);
    }

    // Index definition: "tab_delim_analyzer" lowercases and tokenizes on '|'.
    // The tokenizer pattern is a regex, so the pipe must be escaped: @"\|".
    var createIndex = new CreateIndexDescriptor(indexName)
        .Mappings(mappings => mappings
            .Map<ProviderContent>(map => map
                .AutoMap()
                .Properties(props => props
                    .String(org => org
                        .Name(doc => doc.OrganizationName)
                        .Fields(fields => fields
                            .String(raw => raw.Name("raw").NotAnalyzed())))
                    .String(cats => cats
                        .Name(doc => doc.ServiceCategories)
                        .Analyzer("tab_delim_analyzer"))
                    .GeoPoint(geo => geo
                        .Name(doc => doc.Location)
                        .LatLon(true)))))
        .Settings(idx => idx
            .Analysis(analysis => analysis
                .Analyzers(analyzers => analyzers
                    .Custom("tab_delim_analyzer", custom => custom
                        .Filters("lowercase")
                        .Tokenizer("tab_delim_tokenizer")))
                .Tokenizers(tokenizers => tokenizers
                    .Pattern("tab_delim_tokenizer", pattern => pattern
                        .Pattern(@"\|")))));

    elasticClient.CreateIndex(createIndex);

    // check our custom analyzer does what we think it should
    elasticClient.Analyze(analyze => analyze
        .Index(indexName)
        .Analyzer("tab_delim_analyzer")
        .Text("|Case Management|Developmental Disabilities"));

    // index a document and make it immediately available for search
    var document = new ProviderContent
    {
        OrganizationName = "Elastic",
        ServiceCategories = "|Case Management|Developmental Disabilities"
    };
    elasticClient.Index(document, idx => idx.Refresh());

    // search for our document. Use a term query in a bool filter clause
    // (the unary + operator) as we don't need scoring (probably)
    elasticClient.Search<ProviderContent>(search => search
        .From(0)
        .Size(10)
        .Sort(sort => sort
            .Ascending(doc => doc.OrganizationName))
        .Query(q => +q
            .Term(doc => doc.ServiceCategories, "developmental disabilities")));
}
/// <summary>
/// Document type indexed into Elasticsearch. NEST's default field inference
/// camel-cases the property names (the search request and _source in this
/// document show "organizationName", "serviceCategories").
/// </summary>
public class ProviderContent
{
// Sorted on via its not_analyzed "raw" sub-field in the example mapping.
public string OrganizationName { get; set; }
// Pipe-delimited list, e.g. "|Case Management|Developmental Disabilities".
public string ServiceCategories { get; set; }
// Mapped as a geo_point with lat/lon enabled in the index mapping.
public GeoLocation Location { get; set; }
}
搜索返回如下结果:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : null,
"hits" : [ {
"_index" : "default-index",
"_type" : "providercontent",
"_id" : "AVqNNqlQpAW_5iHrnIDQ",
"_score" : null,
"_source" : {
"organizationName" : "Elastic",
"serviceCategories" : "|Case Management|Developmental Disabilities"
},
"sort" : [ "elastic" ]
} ]
}
}
我有一个包含很多字段的索引,其中一个字段"ServiceCategories"有类似这样的数据:
|Case Management|Developmental Disabilities
我需要用分隔符“|”拆分数据。我尝试这样做:
// Create the index with a custom analyzer intended to split ServiceCategories on '|'.
var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
.Mappings(ms => ms
.Map<ProviderContent>(m => m
.AutoMap()
.Properties(p => p
.String(s => s
.Name(n => n.OrganizationName)
// "raw" sub-field is not analyzed, usable for exact sorting/aggregation.
.Fields(f => f
.String(ss => ss.Name("raw").NotAnalyzed())))
.String(s => s
.Name(n => n.ServiceCategories)
.Analyzer("tab_delim_analyzer"))
.GeoPoint(g => g.Name(n => n.Location).LatLon(true)))))
.Settings(st => st
.Analysis(an => an
.Analyzers(anz => anz
// Custom analyzer: pattern tokenizer + lowercase token filter.
.Custom("tab_delim_analyzer", td => td
.Filters("lowercase")
.Tokenizer("tab_delim_tokenizer")))
.Tokenizers(t => t
.Pattern("tab_delim_tokenizer", tdt => tdt
// BUG: the pattern tokenizer's pattern is a regular expression, and a
// bare "|" is the regex alternation operator (matches the empty string),
// so this splits the text into single characters — see the Analyze
// output later in this document. It must be escaped: @"\|".
.Pattern("|")))));
_elasticClientWrapper.CreateIndex(descriptor);
我的 ServiceCategories(serviceCategories 到 ES)搜索代码使用一个简单的 TermQuery,并将值设置为小写。
使用此搜索参数没有得到结果(其他参数工作正常)。预期结果是从上面的至少一个术语中获得完全匹配。
我也尝试使用经典的分词器让它工作:
// Second attempt: a custom analyzer built on the "classic" tokenizer.
var descriptor = new CreateIndexDescriptor(_DataSource.ToLower())
.Mappings(ms => ms
.Map<ProviderContent>(m => m
.AutoMap()
.Properties(p => p
.String(s => s
.Name(n => n.OrganizationName)
.Fields(f => f
.String(ss => ss.Name("raw").NotAnalyzed())))
.String(s => s
.Name(n => n.ServiceCategories)
// NOTE(review): despite the name, "classic_tokenizer" here is a custom
// ANALYZER (defined below). Index analyzer and search analyzer differ
// ("classic" vs "standard"), so index-time and search-time tokens may not
// line up; and the classic tokenizer is not pipe-aware — presumably this
// is why the term query still found nothing. TODO confirm with Analyze API.
.Analyzer("classic_tokenizer")
.SearchAnalyzer("standard"))
.GeoPoint(g => g.Name(n => n.Location).LatLon(true)))))
.Settings(s => s
.Analysis(an => an
.Analyzers(a => a.Custom("classic_tokenizer", ca => ca
.Tokenizer("classic")))));
这也不起作用。任何人都可以帮助我确定我哪里出错了吗?
这是搜索请求:
### ES REQUEST ###
{
"from": 0,
"size": 10,
"sort": [
{
"organizationName": {
"order": "asc"
}
}
],
"query": {
"bool": {
"must": [
{
"match_all": {}
},
{
"term": {
"serviceCategories": {
"value": "developmental disabilities"
}
}
}
]
}
}
}
您的 tab_delim_tokenizer 模式已经很接近了,但还不完全正确 :) 最简单的排查方式是使用 Analyze API 来查看分析器如何对一段文本进行分词。用第一个映射创建索引之后,我们可以检查自定义分析器的实际行为:
// Run the Analyze API against the index's custom analyzer to inspect
// exactly which tokens it produces for a sample input.
client.Analyze(a => a
.Index(_DataSource.ToLower())
.Analyzer("tab_delim_analyzer")
.Text("|Case Management|Developmental Disabilities")
);
其返回结果如下(为简洁起见已截断):
{
"tokens" : [ {
"token" : "|",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
}, {
"token" : "c",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1
}, {
"token" : "a",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2
}, {
"token" : "s",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 3
}, ... ]
}
这证明 tab_delim_tokenizer 并没有按我们预期的方式分词。解决办法是在模式中用 \ 转义 |,并给字符串加上 @ 前缀使其成为逐字字符串字面量(verbatim string literal),即 @"\|"。
这是一个完整的例子
// Full working example: create the index with the corrected tokenizer
// pattern, verify the analyzer, index one document, and search it.
void Main()
{
// Single local node; all operations target one default index.
var pool = new SingleNodeConnectionPool(new Uri("http://localhost:9200"));
var defaultIndex = "default-index";
var connectionSettings = new ConnectionSettings(pool)
.DefaultIndex(defaultIndex);
var client = new ElasticClient(connectionSettings);
// Start from a clean slate so the settings below always take effect.
if (client.IndexExists(defaultIndex).Exists)
client.DeleteIndex(defaultIndex);
var descriptor = new CreateIndexDescriptor(defaultIndex)
.Mappings(ms => ms
.Map<ProviderContent>(m => m
.AutoMap()
.Properties(p => p
.String(s => s
.Name(n => n.OrganizationName)
// Not-analyzed "raw" sub-field, used for exact sorting below.
.Fields(f => f
.String(ss => ss.Name("raw").NotAnalyzed())))
.String(s => s
.Name(n => n.ServiceCategories)
.Analyzer("tab_delim_analyzer")
)
.GeoPoint(g => g
.Name(n => n.Location)
.LatLon(true)
)
)
)
)
.Settings(st => st
.Analysis(an => an
.Analyzers(anz => anz
// Custom analyzer: pattern tokenizer + lowercase filter.
.Custom("tab_delim_analyzer", td => td
.Filters("lowercase")
.Tokenizer("tab_delim_tokenizer")
)
)
.Tokenizers(t => t
.Pattern("tab_delim_tokenizer", tdt => tdt
// The fix: '|' escaped in a verbatim string, so the regex
// splits on literal pipes instead of matching the empty string.
.Pattern(@"\|")
)
)
)
);
client.CreateIndex(descriptor);
// check our custom analyzer does what we think it should
client.Analyze(a => a
.Index(defaultIndex)
.Analyzer("tab_delim_analyzer")
.Text("|Case Management|Developmental Disabilities")
);
// index a document and make it immediately available for search
client.Index(new ProviderContent
{
OrganizationName = "Elastic",
ServiceCategories = "|Case Management|Developmental Disabilities"
}, i => i.Refresh());
// search for our document. Use a term query in a bool filter clause
// as we don't need scoring (probably)
client.Search<ProviderContent>(s => s
.From(0)
.Size(10)
.Sort(so => so
.Ascending(f => f.OrganizationName)
)
// Unary + puts the term query into the bool query's filter context.
.Query(q => +q
.Term(f => f.ServiceCategories, "developmental disabilities")
)
);
}
/// <summary>
/// Document type indexed into Elasticsearch. NEST's default field inference
/// camel-cases the property names (the search request and _source in this
/// document show "organizationName", "serviceCategories").
/// </summary>
public class ProviderContent
{
// Sorted on via its not_analyzed "raw" sub-field in the example mapping.
public string OrganizationName { get; set; }
// Pipe-delimited list, e.g. "|Case Management|Developmental Disabilities".
public string ServiceCategories { get; set; }
// Mapped as a geo_point with lat/lon enabled in the index mapping.
public GeoLocation Location { get; set; }
}
搜索返回如下结果:
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : null,
"hits" : [ {
"_index" : "default-index",
"_type" : "providercontent",
"_id" : "AVqNNqlQpAW_5iHrnIDQ",
"_score" : null,
"_source" : {
"organizationName" : "Elastic",
"serviceCategories" : "|Case Management|Developmental Disabilities"
},
"sort" : [ "elastic" ]
} ]
}
}