Elasticsearch MoreLikeThis 查询从不返回结果

Elastic Search MoreLikeThis Query Never Returns Results

我一定是做错了什么。我正在尝试让 "More Like This" 查询在我们基于 Elasticsearch 的搜索引擎项目中运行起来。思路是:CMS 可以把标签(例如类别)写入页面的 Meta 标签或其他内容中,我们再把这些标签读入 Elasticsearch,并根据输入文档的 ID,用它们来驱动 "more like this" 搜索。

因此,如果输入文档的标签为 catfish、chicken、goat,我希望 Elasticsearch 找到共享这些标签的其他文档,而不是返回标签为 racecar 和 airplane 的文档。

我通过以下方式构建了一个概念验证控制台应用程序:

然后我创建了一个新的 Elasticsearch 索引,其中包含带有 "Tags" 属性的对象(类型为 "MyThing")。该属性的值是从一组可选单词中随机抽取、并以逗号分隔的若干单词。测试时我向索引中插入了 100 到 5000 个文档,也尝试过增加和减少可选单词的数量。

无论我怎么尝试,MoreLikeThis 查询都不返回任何结果,我不明白为什么。

未返回结果的查询:

    // Excerpt of the more_like_this search that comes back empty. The closing
    // parentheses of the Search(...) call are not part of this excerpt.
    // No MinTermFrequency / MinDocumentFrequency is configured here, so the
    // server-side defaults apply.
    var result = EsClient.Search<MyThing>(s => s
        .Index(DEFAULT_INDEX)
        .Query(esQuery =>
        {
            var mainQuery = esQuery
                .MoreLikeThis(mlt => mlt
                    .Include(true)
                    // Compare on the Tags field only (the second argument is presumably a boost - TODO confirm).
                    .Fields(f => f.Field(ff => ff.Tags, 5))
                    // Use the already indexed document with this id as the "like" input.
                    .Like(l => l.Document(d => d.Id(id)))
                );

            return mainQuery;
        }

完整的 "program.cs" 源代码:

using Nest;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Test_MoreLikeThis_ES6
{
    /// <summary>
    /// Console proof of concept: optionally rebuilds a test index filled with randomly
    /// tagged documents, picks one document at random and runs a more_like_this search for it.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Document type stored in the index. <see cref="Tags"/> holds all tags of a
        /// document as one string, joined with ", " (see AddToIndex).
        /// </summary>
        public class MyThing
        {
            public string Tags { get; set; }
        }

        // Address of the Elasticsearch node, name of the test index and number of
        // documents indexed when the index is rebuilt.
        const string ELASTIC_SERVER = "http://localhost:9200";
        const string DEFAULT_INDEX = "my_index";
        const int NUM_RECORDS = 1000;

        // One client for the whole program, using DEFAULT_INDEX as its default index.
        private static Uri es_node = new Uri(ELASTIC_SERVER);
        private static ConnectionSettings settings = new ConnectionSettings(es_node).DefaultIndex(DEFAULT_INDEX);
        private static ElasticClient EsClient = new ElasticClient(settings);

        // Decides how many tags each generated document receives.
        private static Random rnd = new Random();

        /// <summary>
        /// Entry point: asks whether to rebuild the index, then looks up a random
        /// document and searches for documents similar to it.
        /// </summary>
        static void Main(string[] args)
        {
            Console.WriteLine("Rebuild index? (y):");
            var answer = Console.ReadLine().ToLower();
            if (answer == "y")
            {
                RebuildIndex();
                // Documents are indexed one request at a time.
                for (int i = 0; i < NUM_RECORDS; i++)
                {
                    AddToIndex();
                }
            }

            Console.WriteLine("");
            Console.WriteLine("Getting a Thing...");
            // May be null when the search is invalid or returns no hits.
            var aThingId = GetARandomThingId();


            Console.WriteLine("");
            Console.WriteLine("Looking for something similar to document with id " + aThingId);
            Console.WriteLine("");
            Console.WriteLine("");

            GetMoreLikeAThing(aThingId);
        }

        /// <summary>
        /// Fetches a single document via a function_score query with a random_score
        /// function and prints its id and tags.
        /// </summary>
        /// <returns>The id of the hit, or null when the response is invalid or has no hits.</returns>
        private static string GetARandomThingId()
        {
            var firstdocQuery = EsClient
                .Search<MyThing>(s =>
                    s.Size(1)
                    .Query(q => {
                        // Seeded with the current tick count so that each call uses a different seed.
                        return q.FunctionScore(fs => fs.Functions(fn => fn.RandomScore(rs => rs.Seed(DateTime.Now.Ticks).Field("_seq_no"))));
                    })
                );

            if (!firstdocQuery.IsValid || firstdocQuery.Hits.Count == 0) return null;

            var hit = firstdocQuery.Hits.First();
            Console.WriteLine("Found a thing with id '" + hit.Id + "' and tags: " + hit.Source.Tags);
            return hit.Id;
        }

        /// <summary>
        /// Runs a more_like_this query on the Tags field using the indexed document
        /// <paramref name="id"/> as input, prints the outcome and then offers to repeat
        /// the search with another random document.
        /// </summary>
        /// <param name="id">Id of the document to find similar documents for.</param>
        private static void GetMoreLikeAThing(string id)
        {

            // No MinTermFrequency / MinDocumentFrequency is configured on this query,
            // so the server-side defaults apply.
            var result = EsClient.Search<MyThing>(s => s
                .Index(DEFAULT_INDEX)
                .Query(esQuery =>
                {
                    var mainQuery = esQuery
                        .MoreLikeThis(mlt => mlt
                            .Include(true)
                            // Second argument is presumably a field boost - TODO confirm.
                            .Fields(f => f.Field(ff => ff.Tags, 5))
                            .Like(l => l.Document(d => d.Id(id)))
                        );

                    return mainQuery;
                }

            ));

            if (result.IsValid)
            {
                if (result.Hits.Count > 0)
                {
                    Console.WriteLine("These things are similar:");
                    foreach (var hit in result.Hits)
                    {
                        Console.WriteLine("   " + hit.Id + " : " + hit.Source.Tags);
                    }
                }
                else
                {
                    Console.WriteLine("No similar things found.");
                }

            }
            else
            {
                Console.WriteLine("There was an error running the ES query.");
            }

            Console.WriteLine("");
            Console.WriteLine("Enter (y) to get another thing, or anything else to exit");
            var y = Console.ReadLine().ToLower();

            if (y == "y")
            {
                // Repeats via recursion: every nested call runs the exit prompt below
                // again when it returns.
                var aThingId = GetARandomThingId();
                GetMoreLikeAThing(aThingId);
            }

            Console.WriteLine("");
            Console.WriteLine("Any key to exit...");
            Console.ReadKey();

        }

        /// <summary>
        /// Deletes the test index when it already exists, creates it again with
        /// 1 replica and 5 shards, and puts an auto-generated mapping for MyThing.
        /// The responses of these calls are not inspected.
        /// </summary>
        private static void RebuildIndex()
        {
            var existsResponse = EsClient.IndexExists(DEFAULT_INDEX);
            if (existsResponse.Exists) // delete the existing mapping (and data)
            {
                EsClient.DeleteIndex(DEFAULT_INDEX);
            }

            var rebuildResponse = EsClient.CreateIndex(DEFAULT_INDEX, c => c.Settings(s => s.NumberOfReplicas(1).NumberOfShards(5)));
            var response2 = EsClient.Map<MyThing>(m => m.AutoMap());
        }

        /// <summary>
        /// Indexes one MyThing whose Tags value is a random selection of the known
        /// tags joined with ", ", then prints the id returned by the index call.
        /// </summary>
        private static void AddToIndex()
        {
            var myThing = new MyThing();
            var tags = new List<string> {
                    "catfish",
                    "tractor",
                    "racecar",
                    "airplane",
                    "chicken",
                    "goat",
                    "pig",
                    "horse",
                    "goose",
                    "duck"
                };

            // The upper bound of Random.Next is exclusive, so this yields 0 .. tags.Count - 1.
            var randNum = rnd.Next(0, tags.Count);

            // Shuffle by ordering on freshly generated GUID strings, then keep the first randNum tags.
            var rand = tags.OrderBy(o => Guid.NewGuid().ToString()).Take(randNum);
            myThing.Tags = string.Join(", ", rand);

            var ir = new IndexRequest<MyThing>(myThing);
            var indexResponse = EsClient.Index(ir);

            Console.WriteLine("Index response: " + indexResponse.Id + " : " + string.Join(" " , myThing.Tags));
        }
    }
}

这里的问题是:min_term_freq 的默认值为 2,而原型文档中的任何词项 (term) 都无法满足这一条件,因为每个文档中每个标签 (term) 只出现一次。如果将 min_term_freq 降为 1,您就会得到结果。您可能还需要把 min_doc_freq 也设置为 1,并结合一个排除原型文档本身的查询。

下面是一个可以动手试验的示例:

// Elasticsearch node to talk to, the index used by this sample and the number
// of documents generated when the index is rebuilt.
private const string ELASTIC_SERVER = "http://localhost:9200";
private const string DEFAULT_INDEX = "my_index";
private const int NUM_RECORDS = 1000;

// Client shared by all methods; assigned once at the start of Main.
private static ElasticClient _client;

// Source of randomness for the generated test documents.
private static readonly Random _random = new Random();

// Pool of tags the generated documents draw from.
private static readonly IReadOnlyList<string> Tags = new[]
{
    "catfish", "tractor", "racecar", "airplane", "chicken",
    "goat", "pig", "horse", "goose", "duck"
};

/// <summary>
/// Entry point: creates the client, optionally rebuilds and fills the index,
/// then picks a random document and searches for documents similar to it.
/// </summary>
private static void Main()
{
    var pool = new SingleNodeConnectionPool(new Uri(ELASTIC_SERVER));

    var settings = new ConnectionSettings(pool)
        .DefaultIndex(DEFAULT_INDEX);

    _client = new ElasticClient(settings);

    Console.WriteLine("Rebuild index? (y):");
    // Console.ReadLine returns null at end of input (e.g. redirected stdin), so the
    // answer is compared null-safely instead of calling ToLower() on it.
    var answer = Console.ReadLine();
    if (string.Equals(answer, "y", StringComparison.OrdinalIgnoreCase))
    {
        RebuildIndex();
        AddToIndex();
    }

    Console.WriteLine();
    Console.WriteLine("Getting a Thing...");
    var aThingId = GetARandomThingId();

    // GetARandomThingId returns null when the search failed or found nothing;
    // there is no document to compare against in that case.
    if (aThingId == null)
    {
        Console.WriteLine("Could not find any thing in the index; nothing to compare against.");
        return;
    }

    Console.WriteLine();
    Console.WriteLine("Looking for something similar to document with id " + aThingId);
    Console.WriteLine();
    Console.WriteLine();

    GetMoreLikeAThing(aThingId);
}

/// <summary>
/// Document type stored in the index; each document carries a list of tags.
/// </summary>
public class MyThing
{
    /// <summary>
    /// Tags of the document. Defaults to an empty list so that callers which join
    /// or enumerate the tags do not fail on an instance whose tags were never set.
    /// </summary>
    public List<string> Tags { get; set; } = new List<string>();
}

/// <summary>
/// Asks the index for a single document via a function_score query with a
/// random_score function (seeded with the current tick count) and prints its
/// id and tags.
/// </summary>
/// <returns>The id of the document, or null when the search is invalid or has no hits.</returns>
private static string GetARandomThingId()
{
    var response = _client.Search<MyThing>(search => search
        .Size(1)
        .Query(query => query
            .FunctionScore(score => score
                .Functions(functions => functions
                    .RandomScore(random => random
                        .Seed(DateTime.Now.Ticks)
                        .Field("_seq_no"))))));

    // Hits is only inspected when the response itself is valid.
    var hasHit = response.IsValid && response.Hits.Count != 0;
    if (!hasHit)
    {
        return null;
    }

    var picked = response.Hits.First();
    var tagList = string.Join(", ", picked.Source.Tags);
    Console.WriteLine($"Found a thing with id '{picked.Id}' and tags: {tagList}");

    return picked.Id;
}

/// <summary>
/// Interactive loop: prints the documents similar to <paramref name="id"/>, then
/// keeps picking another random document for as long as the user answers "y".
/// </summary>
/// <param name="id">Id of the first document to find similar documents for.</param>
private static void GetMoreLikeAThing(string id)
{
    // A loop instead of recursion: the call stack stays flat and the exit message
    // below is printed exactly once, however many documents were looked at.
    var currentId = id;
    while (true)
    {
        if (currentId == null)
        {
            // GetARandomThingId returns null when nothing could be fetched.
            Console.WriteLine("No document id available to search with.");
            break;
        }

        PrintSimilarThings(currentId);

        Console.WriteLine();
        Console.WriteLine("Enter (y) to get another thing, or anything else to exit");

        // Console.ReadLine returns null at end of input; treat that as "exit".
        var answer = Console.ReadLine();
        if (!string.Equals(answer, "y", StringComparison.OrdinalIgnoreCase))
        {
            break;
        }

        currentId = GetARandomThingId();
    }

    Console.WriteLine();
    Console.WriteLine("Any key to exit...");

    // Console.ReadKey throws when stdin is redirected, so only wait on a real console.
    if (!Console.IsInputRedirected)
    {
        Console.ReadKey();
    }
}

// Runs the more_like_this query for one document id and prints the hits or the failure.
private static void PrintSimilarThings(string id)
{
    var result = _client.Search<MyThing>(s => s
        .Index(DEFAULT_INDEX)
        .Query(esQuery => esQuery
            .MoreLikeThis(mlt => mlt
                    .Include(true)
                    .Fields(f => f.Field(ff => ff.Tags))
                    .Like(l => l.Document(d => d.Id(id)))
                    // Every tag occurs at most once per document, so a term must be
                    // allowed to qualify with a single occurrence.
                    .MinTermFrequency(1)
                    .MinDocumentFrequency(1)
            ) && !esQuery
            // Keep the source document itself out of the hits.
            .Ids(ids => ids
                .Values(id)
            )
        )
    );

    if (!result.IsValid)
    {
        Console.WriteLine("There was an error running the ES query.");
        // Show what the client knows about the failed call.
        Console.WriteLine(result.DebugInformation);
        return;
    }

    if (result.Hits.Count == 0)
    {
        Console.WriteLine("No similar things found.");
        return;
    }

    Console.WriteLine("These things are similar:");
    foreach (var hit in result.Hits)
    {
        Console.WriteLine($"   {hit.Id}: {string.Join(", ", hit.Source.Tags)}");
    }
}

/// <summary>
/// Drops the sample index when it exists and creates it again with a single
/// shard and an auto-generated mapping for <c>MyThing</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The index could not be deleted or created.
/// </exception>
private static void RebuildIndex()
{
    var existsResponse = _client.IndexExists(DEFAULT_INDEX);
    if (existsResponse.Exists) // delete the existing mapping (and data)
    {
        var deleteResponse = _client.DeleteIndex(DEFAULT_INDEX);
        if (!deleteResponse.IsValid)
        {
            throw new InvalidOperationException(
                $"Could not delete index '{DEFAULT_INDEX}': {deleteResponse.DebugInformation}");
        }
    }

    var createResponse = _client.CreateIndex(DEFAULT_INDEX, c => c
        .Settings(s => s
            .NumberOfShards(1)
        )
        .Mappings(m => m
            .Map<MyThing>(mm => mm.AutoMap())
        )
    );

    // Fail here instead of letting later indexing run against a missing or
    // half-created index.
    if (!createResponse.IsValid)
    {
        throw new InvalidOperationException(
            $"Could not create index '{DEFAULT_INDEX}': {createResponse.DebugInformation}");
    }
}

/// <summary>
/// Bulk-indexes all generated documents and blocks until the bulk run has
/// completed or failed.
/// </summary>
/// <remarks>
/// An exception reported by the bulk observable is rethrown on the calling thread.
/// </remarks>
private static void AddToIndex()
{
    var bulkAllObservable = _client.BulkAll(GetMyThings(), b => b
        .RefreshOnCompleted()
        .Size(1000));

    Exception exception = null;

    // The wait handle owns an OS resource; release it once the run has finished.
    using (var waitHandle = new ManualResetEvent(false))
    {
        var bulkAllObserver = new BulkAllObserver(
            onNext: r =>
            {
                Console.WriteLine($"Indexed page {r.Page}");
            },
            onError: e =>
            {
                exception = e;
                waitHandle.Set();
            },
            onCompleted: () => waitHandle.Set());

        bulkAllObservable.Subscribe(bulkAllObserver);

        waitHandle.WaitOne();
    }

    if (exception != null)
    {
        // Rethrow without resetting the stack trace captured where the error occurred.
        System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(exception).Throw();
    }
}

/// <summary>
/// Lazily generates <c>NUM_RECORDS</c> documents, each tagged with a random,
/// alphabetically sorted, non-empty selection of the known tags.
/// </summary>
/// <returns>A sequence of generated documents.</returns>
private static IEnumerable<MyThing> GetMyThings()
{
    for (int i = 0; i < NUM_RECORDS; i++)
    {
        // Random.Next excludes its upper bound: draw from 1 .. Tags.Count so that
        // no document ends up without tags and a document can carry every tag.
        var tagCount = _random.Next(1, Tags.Count + 1);

        // Shuffle by ordering on freshly generated GUIDs, keep the first tagCount
        // tags, then sort them for stable, readable output.
        var randomTags = Tags.OrderBy(o => Guid.NewGuid())
            .Take(tagCount)
            .OrderBy(t => t)
            .ToList();

        yield return new MyThing { Tags = randomTags };
    }
}

下面是一段示例输出:

Found a thing with id 'Ugg9LGkBPK3n91HQD1d5' and tags: airplane, goat
These things are similar:
   4wg9LGkBPK3n91HQD1l5: airplane, goat
   9Ag9LGkBPK3n91HQD1l5: airplane, goat
   Vgg9LGkBPK3n91HQD1d5: airplane, goat, goose
   sQg9LGkBPK3n91HQD1d5: airplane, duck, goat
   lQg9LGkBPK3n91HQD1h5: airplane, catfish, goat
   9gg9LGkBPK3n91HQD1l5: airplane, catfish, goat
   FQg9LGkBPK3n91HQD1p5: airplane, goat, goose
   Jwg9LGkBPK3n91HQD1p5: airplane, goat, goose
   Fwg9LGkBPK3n91HQD1d5: airplane, duck, goat, tractor
   Kwg9LGkBPK3n91HQD1d5: airplane, goat, goose, horse