在代码中使 Lucene.Net 线程安全

Making Lucene.Net thread safe in the code

我正在使用 Lucene.Net 进行搜索,想知道如何处理这个线程问题。

我只有一个 Test 类的实例,但在这种情况下搜索器不是线程安全的:计时器线程可能在处理请求的同时更新索引,而我确实因此看到了异常。请问有什么办法可以使其线程安全?

/// <summary>
/// Sample from the question: keeps an in-memory (RAMDirectory) index that a
/// timer callback rebuilds every 3 minutes, and a Searcher used to serve
/// search requests.
/// NOTE(review): this listing is not self-contained. It references
/// <c>es</c>, <c>Log</c>, <c>EntitySearcher</c>, <c>searchTerm</c> and
/// <c>result</c>, none of which are declared in the code shown here.
/// </summary>
public class Test 
{
    // Lock object used only by the ESearcher setter below.
    private static object syncObj = new object();

    // Periodic timer that drives Timer_Elapsed.
    private System.Threading.Timer timer;

    // Current searcher; replaced by Timer_Elapsed after each index rebuild.
    private Searcher searcher;

    // In-memory directory that holds the index.
    private RAMDirectory idx = new RAMDirectory();

    /// <summary>
    /// Starts the timer with a due time of zero and a period of 3 minutes,
    /// with Timer_Elapsed as the callback.
    /// </summary>
    public Test()
    {
        this.timer = new System.Threading.Timer(this.Timer_Elapsed, null, TimeSpan.Zero, TimeSpan.FromMinutes(3));
    }


    /// <summary>
    /// Accessor for the current searcher.
    /// NOTE(review): only the setter takes the lock on syncObj; the getter
    /// returns the field without locking, so the lock does not coordinate
    /// readers with the writer.
    /// </summary>
    private Searcher ESearcher
    {
        get
        {
            return this.searcher;
        }

        set
        {
            lock (syncObj)
            {
                this.searcher = value;
            }
        }
    }

    /// <summary>
    /// Builds a document with two stored fields: "A" (title, not indexed)
    /// and "B" (content, analyzed).
    /// </summary>
    /// <param name="title">Value stored in field "A".</param>
    /// <param name="content">Value stored and analyzed in field "B".</param>
    /// <returns>The new, not yet indexed, document.</returns>
    public Document CreateDocument(string title, string content)
    {
        Document doc = new Document();
        doc.Add(new Field("A", title, Field.Store.YES, Field.Index.NO));
        doc.Add(new Field("B", content, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /// <summary>
    /// Parses <paramref name="queryString"/> against field "B" and returns
    /// the stored documents of at most 10 hits.
    /// NOTE(review): the query runs on this.ESearcher, but the documents are
    /// loaded through the <paramref name="searcher"/> parameter. These can be
    /// different instances if the timer replaces the searcher in between.
    /// </summary>
    /// <param name="searcher">Searcher used to load the hit documents.</param>
    /// <param name="queryString">Query text handed to the QueryParser.</param>
    /// <returns>The documents for the collected hits.</returns>
    public List<Document> Search(Searcher searcher, string queryString)
    {
        List<Document> documents = new List<Document>();
        QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "B", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
        Query query = parser.Parse(queryString);
        int hitsPerPage = 5;
        // The collector is sized for 2 * hitsPerPage = 10 hits.
        TopScoreDocCollector collector = TopScoreDocCollector.Create(2 * hitsPerPage, true);
        this.ESearcher.Search(query, collector);

        ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

        // Cap the loop at 10, matching the collector size above.
        int hitCount = collector.TotalHits > 10 ? 10 : collector.TotalHits;
        for (int i = 0; i < hitCount; i++)
        {
            ScoreDoc scoreDoc = hits[i];
            int docId = scoreDoc.Doc;
            // docScore is read but never used afterwards.
            float docScore = scoreDoc.Score;
            Document doc = searcher.Doc(docId);
            documents.Add(doc);
        }

        return documents;
    }

    /// <summary>
    /// Timer callback: writes the documents into idx with a fresh
    /// IndexWriter, then replaces the searcher with a new IndexSearcher
    /// over idx.
    /// </summary>
    /// <param name="sender">Timer state object (null, see the constructor); unused.</param>
    private void Timer_Elapsed(object sender)
    {
        this.Log("Started Updating the Search Indexing");
        // Fetch the new data to index.
        // NOTE(review): `es` is not declared anywhere in the shown code;
        // presumably the omitted data fetch produces it.
        // NOTE(review): the `true` argument is presumably the "create" flag
        // (start from an empty index on every run) - confirm against the
        // IndexWriter constructor in use.
        using (IndexWriter writer = new IndexWriter(this.idx, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED))
        {
            foreach (var e in es)
            {
                writer.AddDocument(this.CreateDocument(e.Value.ToString(), e.Key));
            }

            writer.Optimize();
        }

        // Swap in a searcher over the rewritten directory. The previous
        // searcher is not closed here.
        this.ESearcher = new IndexSearcher(this.idx);
        this.Log("Completed Updating the Search Indexing");
    }

    /// <summary>
    /// Entry point for a request: runs a search and builds the result.
    /// NOTE(review): this.EntitySearcher, searchTerm and result are not
    /// declared in the shown code (the only searcher property shown is
    /// ESearcher).
    /// </summary>
    public Result ServeRequest()
    {
        var documents = this.Search(this.EntitySearcher, searchTerm);
        // Placeholder for the omitted logic that builds `result`.
        return result;

    }

}

与其每次都创建新的 IndexSearcher,不如使用 `SearcherManager` 类。

// Create the SearcherManager for the index; LuceneMapDirectory is defined elsewhere (not shown).
// NOTE(review): the meaning of the second (null) argument depends on which
// SearcherManager implementation is in use - confirm.
SearcherManager _searcherManager = new SearcherManager(LuceneMapDirectory, null);

然后,搜索如下:

// Run the query inside the first callback, using the searcher passed in as its argument.
// NOTE(review): the second callback is presumably invoked with any exception
// raised during the search; here it writes the exception to the trace output - confirm.
_searcherManager.ExecuteSearch(searcher =>
        {
          //Execute query using <searcher>
        }, ex => { Trace.WriteLine(ex); });

这段代码有很多地方是"错误"的。

如前所述,锁定并不安全(您需要锁定读取和写入)。

更重要的是,在 Lucene 中有更好的方法来处理这个问题。首先,IndexWriter 本身是线程安全的,它应该是 Directory 的所有者。通常来说,由代码中不同的部分分别打开/关闭 Directory 是一种"不良实践"。

NRT(近实时)索引有一种做法:从 IndexWriter(IW)获取 IndexReader,而不是直接包装 Directory。

只有当索引本质上是只读的,并且只是每天/每周等批量重新生成时,您示例中使用的做法才算是"好的"。

我重写了示例以展示其中一些方法。显然,因为这只是测试代码,所以根据具体用例,还有一些细节需要重构/增强……

/// <summary>
/// Near-real-time (NRT) search sample. A single <see cref="IndexWriter"/> owns
/// the in-memory directory; readers are obtained from the writer and a timer
/// periodically swaps in a fresh searcher when the index has changed.
/// </summary>
/// <remarks>
/// The searcher is published with <see cref="Interlocked.Exchange{T}(ref T, T)"/>
/// and readers never take a lock. Superseded readers/searchers are not closed
/// here because in-flight searches may still be using them; with a
/// RAMDirectory they are simply left to the garbage collector.
/// </remarks>
public class Test
{
    private readonly System.Threading.Timer timer;

    // Replaced atomically by Timer_Elapsed; always read it into a local first.
    private Searcher searcher;

    private readonly IndexWriter writer;
    private IndexReader reader;

    /// <summary>
    /// Creates the writer over a new RAMDirectory, opens the initial NRT
    /// reader/searcher from it, and starts the 3-minute refresh timer.
    /// </summary>
    public Test()
    {
        writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED);
        reader = writer.GetReader();
        searcher = new IndexSearcher(reader);

        // Created last so the callback never observes unassigned fields.
        timer = new System.Threading.Timer(Timer_Elapsed, null, TimeSpan.Zero, TimeSpan.FromMinutes(3));
    }


    /// <summary>
    /// Builds a document with stored fields "A" (title, not indexed) and
    /// "B" (content, analyzed) and adds it directly to the writer.
    /// </summary>
    /// <param name="title">Value stored in field "A".</param>
    /// <param name="content">Value stored and analyzed in field "B".</param>
    public void CreateDocument(string title, string content)
    {
        var doc = new Document();
        doc.Add(new Field("A", title, Field.Store.YES, Field.Index.NO));
        doc.Add(new Field("B", content, Field.Store.YES, Field.Index.ANALYZED));

        writer.AddDocument(doc);
    }

    /// <summary>
    /// Deletes every document and re-adds one document per entry of
    /// <paramref name="es"/> (value = title, key = content).
    /// </summary>
    /// <param name="es">Entries to index.</param>
    public void ReplaceAll(Dictionary<string, string> es)
    {
        // Pause the timer so the searcher is not refreshed half-way through.
        timer.Change(Timeout.Infinite, Timeout.Infinite);
        try
        {
            writer.DeleteAll();
            foreach (var e in es)
            {
                // The original called a non-existent AddDocument(...) here.
                CreateDocument(e.Value, e.Key);
            }
        }
        finally
        {
            // Restart the timer even if indexing failed; a due time of zero
            // makes the refresh run right away.
            timer.Change(TimeSpan.Zero, TimeSpan.FromMinutes(3));
        }
    }

    /// <summary>
    /// Parses <paramref name="queryString"/> against field "B" and returns the
    /// stored documents of the top hits (at most 10).
    /// </summary>
    /// <param name="queryString">Query text handed to the QueryParser.</param>
    /// <returns>The documents for the collected hits, best first.</returns>
    public List<Document> Search(string queryString)
    {
        // Snapshot the field once. Doc ids are only meaningful for the reader
        // that produced them, so the query and the document loads must go
        // through the same searcher even if the timer swaps the field meanwhile.
        Searcher current = searcher;

        var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "B", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
        Query query = parser.Parse(queryString);
        int hitsPerPage = 5;
        var collector = TopScoreDocCollector.Create(2 * hitsPerPage, true);
        current.Search(query, collector);

        // TopDocs() returns at most the collector size (10) entries, so the
        // array length is the safe loop bound.
        ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

        var documents = new List<Document>(hits.Length);
        foreach (ScoreDoc hit in hits)
        {
            documents.Add(current.Doc(hit.Doc));
        }

        return documents;
    }

    /// <summary>
    /// Timer callback: if the index changed since the current reader was
    /// opened, gets a new NRT reader from the writer and publishes a new
    /// searcher over it.
    /// </summary>
    /// <param name="sender">Timer state object (null); unused.</param>
    private void Timer_Elapsed(object sender)
    {
        // Nothing was added or deleted: keep the current searcher.
        if (reader.IsCurrent())
        {
            return;
        }

        reader = writer.GetReader();
        var newSearcher = new IndexSearcher(reader);

        // Atomic publication; the previous searcher is deliberately not closed
        // because concurrent Search calls may still hold it.
        Interlocked.Exchange(ref searcher, newSearcher);
        Debug.WriteLine("Searcher updated");
    }

    /// <summary>
    /// Serves one request by searching for <paramref name="searchTerm"/>.
    /// </summary>
    /// <param name="searchTerm">Query text to search for.</param>
    /// <returns>A new <c>Result</c>; the logic that fills it is omitted in this sample.</returns>
    public Result ServeRequest(string searchTerm)
    {
        var documents = Search(searchTerm);
        // Placeholder: the logic that turns `documents` into the result is omitted.
        var result = new Result();

        return result;

    }
}

注:

  • writer(IndexWriter)"拥有"该 Directory
  • 如果这是基于文件的 Directory,那么您需要有 Open 和 Close 方法来创建/释放 writer(它负责处理 lock 文件)。RAMDirectory 则可以直接交给 GC 回收
  • 使用 Interlocked.Exchange 而不是 lock,因此读取 searcher 成员时的开销为零(此处有风险,务必小心!)
  • 新文档直接添加到 writer 中
  • 如果没有添加新文档,IsCurrent() 可以让刷新的开销为零。根据您添加文档的频率,您可能根本不需要计时器(只需在 Search 的开头调用 Timer_Elapsed——当然要先给它改个名字)。
  • 不要使用 Optimize():它是早期版本的遗留物,出于性能和磁盘 I/O 的原因,强烈建议不要使用

最后,如果您使用的是 Lucene.net v4.8,那么您应该使用 SearcherManager(如另一个答案中所建议的)。但请使用接受 IndexWriter 的构造函数,并将其作为"单例"保留(与 writer 处于相同的作用域)。它会为您处理加锁以及获取新的 reader。