在代码中使 Lucene.Net 线程安全
Making Lucene.Net thread safe in the code
我正在使用 Lucene.Net 进行搜索,想知道如何处理这个线程问题。
我有一个 Test 类的实例,但其中的搜索器不是线程安全的:计时器线程可能在处理请求的同时更新索引,我也确实因此遇到了异常。请问怎样才能使它线程安全?
// Sample from the question: rebuilds an in-memory Lucene index on a timer and
// serves searches from a Searcher field that the timer callback replaces.
// NOTE(review): Log, es, EntitySearcher, searchTerm and result are used below
// but are not declared anywhere in this snippet.
public class Test
{
// Lock object; in this snippet it is only taken by the ESearcher setter.
private static object syncObj = new object();
private System.Threading.Timer timer;
// Searcher shared between the timer callback (writer) and Search (reader).
private Searcher searcher;
// In-memory directory that holds the index.
private RAMDirectory idx = new RAMDirectory();
// Starts the timer: Timer_Elapsed runs immediately (due time zero) and then
// every 3 minutes.
public Test()
{
this.timer = new System.Threading.Timer(this.Timer_Elapsed, null, TimeSpan.Zero, TimeSpan.FromMinutes(3));
}
// Accessor for the shared searcher.
// NOTE(review): only the setter takes the lock; the getter reads the field
// without it, so the lock does not order reads against writes.
private Searcher ESearcher
{
get
{
return this.searcher;
}
set
{
lock (syncObj)
{
this.searcher = value;
}
}
}
// Builds a document with two stored fields: "A" (title, not indexed) and
// "B" (content, analyzed).
public Document CreateDocument(string title, string content)
{
Document doc = new Document();
doc.Add(new Field("A", title, Field.Store.YES, Field.Index.NO));
doc.Add(new Field("B", content, Field.Store.YES, Field.Index.ANALYZED));
return doc;
}
// Parses queryString against field "B", collects the top hits and returns
// the matching stored documents (at most 10).
// NOTE(review): the query runs on this.ESearcher, but the documents are
// loaded through the 'searcher' parameter; nothing here guarantees the two
// refer to the same instance.
public List<Document> Search(Searcher searcher, string queryString)
{
List<Document> documents = new List<Document>();
QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "B", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
Query query = parser.Parse(queryString);
int hitsPerPage = 5;
// The collector keeps up to 2 * hitsPerPage (= 10) top-scoring hits.
TopScoreDocCollector collector = TopScoreDocCollector.Create(2 * hitsPerPage, true);
this.ESearcher.Search(query, collector);
ScoreDoc[] hits = collector.TopDocs().ScoreDocs;
// Caps the loop at 10; this literal is separate from the collector size above.
int hitCount = collector.TotalHits > 10 ? 10 : collector.TotalHits;
for (int i = 0; i < hitCount; i++)
{
ScoreDoc scoreDoc = hits[i];
int docId = scoreDoc.Doc;
// docScore is read but never used afterwards.
float docScore = scoreDoc.Score;
Document doc = searcher.Doc(docId);
documents.Add(doc);
}
return documents;
}
// Timer callback: writes every entry of 'es' into this.idx through a new
// IndexWriter, then publishes a new IndexSearcher over the directory.
private void Timer_Elapsed(object sender)
{
this.Log("Started Updating the Search Indexing");
// Get New data to Index
// NOTE(review): the third constructor argument (true) is presumably the
// 'create' flag, i.e. the index is rebuilt from scratch on every tick; confirm.
using (IndexWriter writer = new IndexWriter(this.idx, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED))
{
foreach (var e in es)
{
writer.AddDocument(this.CreateDocument(e.Value.ToString(), e.Key));
}
writer.Optimize();
}
// Replaces the shared searcher; the previous instance is not closed here.
this.ESearcher = new IndexSearcher(this.idx);
this.Log("Completed Updating the Search Indexing");
}
// Request entry point: runs a search and returns a result.
// NOTE(review): 'EntitySearcher' is presumably meant to be 'ESearcher'; confirm.
public Result ServeRequest()
{
var documents = this.Search(this.EntitySearcher, searchTerm);
//somelogic
return result;
}
}
与其每次都新建 IndexSearcher,不如使用 SearcherManager 类。
// Creates a SearcherManager over LuceneMapDirectory; null is passed as the
// second argument. LuceneMapDirectory is defined outside this snippet.
SearcherManager _searcherManager = new
SearcherManager(LuceneMapDirectory, null);
然后,搜索如下:
// The first callback receives the searcher to run the query with; the second
// receives an exception and writes it to Trace.
// NOTE(review): ExecuteSearch is not defined in this snippet; presumably a
// project-specific helper or extension method, so confirm where it comes from.
_searcherManager.ExecuteSearch(searcher =>
{
//Execute query using <searcher>
}, ex => { Trace.WriteLine(ex); });
这段代码有不少问题。
如前所述,这里的加锁并不安全(读取和写入都需要加锁)。
更重要的是,Lucene 本身提供了更好的方式来处理这个问题。首先,IndexWriter
本身是线程安全的,它应该是 Directory
的所有者。让代码的不同部分各自打开/关闭 Directory 通常是一种不良做法。
NRT(近实时)索引有一种惯用写法:从 IndexWriter 获取 IndexReader
,而不是直接用 reader 去包装 Directory。
只有当索引本质上是只读的、并且只是按天/按周等批量重新生成时,您示例中的做法才算合适。
我重写了这个示例来演示其中的一些做法。显然这只是测试代码,具体到实际用例还会有一些细节需要进一步重构/完善……
/// <summary>
/// Near-real-time search sample: one long-lived <c>IndexWriter</c> owns the
/// in-memory directory, documents are added straight to that writer, and a
/// timer periodically publishes a fresh searcher obtained from the writer.
/// </summary>
public class Test
{
    /// <summary>How often the timer checks whether a newer reader is needed.</summary>
    private static readonly TimeSpan RefreshInterval = TimeSpan.FromMinutes(3);

    /// <summary>Maximum number of documents returned by <see cref="Search"/>.</summary>
    private const int MaxResults = 10;

    private readonly System.Threading.Timer timer;
    private readonly IndexWriter writer;

    // Replaced by the timer callback via Interlocked.Exchange; readers must
    // copy it into a local before use (see Search).
    private Searcher searcher;

    // Only read and written by the constructor and the timer callback.
    private IndexReader reader;

    /// <summary>
    /// Creates the writer over a new RAMDirectory, opens the initial reader and
    /// searcher from it, and starts the refresh timer (first tick immediately).
    /// </summary>
    public Test()
    {
        writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED);
        reader = writer.GetReader();
        searcher = new IndexSearcher(reader);
        timer = new System.Threading.Timer(Timer_Elapsed, null, TimeSpan.Zero, RefreshInterval);
    }

    /// <summary>
    /// Builds a document with stored fields "A" (title, not indexed) and "B"
    /// (content, analyzed) and adds it to the writer.
    /// </summary>
    /// <param name="title">Value stored in field "A".</param>
    /// <param name="content">Value stored and analyzed in field "B".</param>
    public void CreateDocument(string title, string content)
    {
        var doc = new Document();
        doc.Add(new Field("A", title, Field.Store.YES, Field.Index.NO));
        doc.Add(new Field("B", content, Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Replaces the whole index content with the given entries. Each entry's
    /// value becomes the title (field "A") and its key the content (field "B").
    /// </summary>
    /// <param name="es">Entries to index; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="es"/> is null.</exception>
    public void ReplaceAll(Dictionary<string, string> es)
    {
        // Validate before touching the index so a bad call cannot wipe it.
        if (es == null)
        {
            throw new ArgumentNullException("es");
        }

        // pause timer
        timer.Change(Timeout.Infinite, Timeout.Infinite);
        try
        {
            writer.DeleteAll();
            foreach (var e in es)
            {
                CreateDocument(e.Value, e.Key);
            }
        }
        finally
        {
            // restart timer, even if indexing failed, so refreshes never stop for good
            timer.Change(TimeSpan.Zero, RefreshInterval);
        }
    }

    /// <summary>
    /// Parses <paramref name="queryString"/> against field "B" and returns the
    /// stored documents of the top hits (at most <see cref="MaxResults"/>).
    /// </summary>
    /// <param name="queryString">Query text in QueryParser syntax.</param>
    /// <returns>The matching documents, best score first; empty if none match.</returns>
    public List<Document> Search(string queryString)
    {
        // Take one snapshot of the shared field: the timer may swap it at any
        // time, and doc ids are only meaningful for the searcher that produced them.
        Searcher current = searcher;

        var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "B", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
        Query query = parser.Parse(queryString);

        var collector = TopScoreDocCollector.Create(MaxResults, true);
        current.Search(query, collector);

        // The collector was sized to MaxResults, so iterating its hits keeps
        // the loop bound and the collector size in agreement.
        ScoreDoc[] hits = collector.TopDocs().ScoreDocs;
        var documents = new List<Document>(hits.Length);
        foreach (ScoreDoc hit in hits)
        {
            documents.Add(current.Doc(hit.Doc));
        }
        return documents;
    }

    /// <summary>
    /// Timer callback: if the current reader is out of date, obtains a new
    /// reader from the writer and atomically publishes a searcher over it.
    /// </summary>
    private void Timer_Elapsed(object sender)
    {
        if (reader.IsCurrent())
        {
            return;
        }

        IndexReader freshReader = writer.GetReader();
        Searcher freshSearcher = new IndexSearcher(freshReader);
        // NOTE(review): the previous reader/searcher are not closed here; doing
        // so safely would require knowing that no in-flight Search still uses them.
        reader = freshReader;
        Interlocked.Exchange(ref searcher, freshSearcher);
        Debug.WriteLine("Searcher updated");
    }

    /// <summary>Serves one request by running a search for <paramref name="searchTerm"/>.</summary>
    /// <param name="searchTerm">Query text passed to <see cref="Search"/>.</param>
    /// <returns>A new <c>Result</c>; the documents found are not yet copied into it.</returns>
    public Result ServeRequest(string searchTerm)
    {
        var documents = Search(searchTerm);
        //somelogic
        var result = new Result();
        return result;
    }
}
注:
- writer(IndexWriter)"拥有" Directory
- 如果这是一个基于文件的 Directory,那么您还需要
Open
和 Close
方法来创建/释放 writer(以处理 lock
文件)。RAMDirectory 则直接交给 GC 回收即可
- 使用
Interlocked.Exchange
而不是 lock
,因此读取 searcher
成员时没有任何开销(但要小心,这里暗藏风险!)
- 新文档直接添加到 writer
- 如果没有添加新文档,IsCurrent()
可以让刷新几乎零成本地直接返回。根据您添加文档的频率,您可能根本不需要计时器(只需在 Search
的开头调用 Timer_Elapsed
即可,当然应该给它换个更合适的名字)。
- 不要使用
Optimize()
它是早期版本的遗留产物,强烈建议不要使用(出于性能和磁盘 I/O 的考虑)
最后,如果您使用的是 Lucene.net v4.8,那么您应该使用 SearcherManager
(如另一个答案中所建议的)。但要使用接受 IndexWriter
的构造函数,并将它作为"单例"保存(与 writer
的作用域相同)。它会为您处理加锁以及获取新的 reader。
我正在使用 Lucene.Net 进行搜索,想知道如何处理这个线程问题。
我有一个 Test 类的实例,但其中的搜索器不是线程安全的:计时器线程可能在处理请求的同时更新索引,我也确实因此遇到了异常。请问怎样才能使它线程安全?
// Sample from the question: rebuilds an in-memory Lucene index on a timer and
// serves searches from a Searcher field that the timer callback replaces.
// NOTE(review): Log, es, EntitySearcher, searchTerm and result are used below
// but are not declared anywhere in this snippet.
public class Test
{
// Lock object; in this snippet it is only taken by the ESearcher setter.
private static object syncObj = new object();
private System.Threading.Timer timer;
// Searcher shared between the timer callback (writer) and Search (reader).
private Searcher searcher;
// In-memory directory that holds the index.
private RAMDirectory idx = new RAMDirectory();
// Starts the timer: Timer_Elapsed runs immediately (due time zero) and then
// every 3 minutes.
public Test()
{
this.timer = new System.Threading.Timer(this.Timer_Elapsed, null, TimeSpan.Zero, TimeSpan.FromMinutes(3));
}
// Accessor for the shared searcher.
// NOTE(review): only the setter takes the lock; the getter reads the field
// without it, so the lock does not order reads against writes.
private Searcher ESearcher
{
get
{
return this.searcher;
}
set
{
lock (syncObj)
{
this.searcher = value;
}
}
}
// Builds a document with two stored fields: "A" (title, not indexed) and
// "B" (content, analyzed).
public Document CreateDocument(string title, string content)
{
Document doc = new Document();
doc.Add(new Field("A", title, Field.Store.YES, Field.Index.NO));
doc.Add(new Field("B", content, Field.Store.YES, Field.Index.ANALYZED));
return doc;
}
// Parses queryString against field "B", collects the top hits and returns
// the matching stored documents (at most 10).
// NOTE(review): the query runs on this.ESearcher, but the documents are
// loaded through the 'searcher' parameter; nothing here guarantees the two
// refer to the same instance.
public List<Document> Search(Searcher searcher, string queryString)
{
List<Document> documents = new List<Document>();
QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "B", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
Query query = parser.Parse(queryString);
int hitsPerPage = 5;
// The collector keeps up to 2 * hitsPerPage (= 10) top-scoring hits.
TopScoreDocCollector collector = TopScoreDocCollector.Create(2 * hitsPerPage, true);
this.ESearcher.Search(query, collector);
ScoreDoc[] hits = collector.TopDocs().ScoreDocs;
// Caps the loop at 10; this literal is separate from the collector size above.
int hitCount = collector.TotalHits > 10 ? 10 : collector.TotalHits;
for (int i = 0; i < hitCount; i++)
{
ScoreDoc scoreDoc = hits[i];
int docId = scoreDoc.Doc;
// docScore is read but never used afterwards.
float docScore = scoreDoc.Score;
Document doc = searcher.Doc(docId);
documents.Add(doc);
}
return documents;
}
// Timer callback: writes every entry of 'es' into this.idx through a new
// IndexWriter, then publishes a new IndexSearcher over the directory.
private void Timer_Elapsed(object sender)
{
this.Log("Started Updating the Search Indexing");
// Get New data to Index
// NOTE(review): the third constructor argument (true) is presumably the
// 'create' flag, i.e. the index is rebuilt from scratch on every tick; confirm.
using (IndexWriter writer = new IndexWriter(this.idx, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED))
{
foreach (var e in es)
{
writer.AddDocument(this.CreateDocument(e.Value.ToString(), e.Key));
}
writer.Optimize();
}
// Replaces the shared searcher; the previous instance is not closed here.
this.ESearcher = new IndexSearcher(this.idx);
this.Log("Completed Updating the Search Indexing");
}
// Request entry point: runs a search and returns a result.
// NOTE(review): 'EntitySearcher' is presumably meant to be 'ESearcher'; confirm.
public Result ServeRequest()
{
var documents = this.Search(this.EntitySearcher, searchTerm);
//somelogic
return result;
}
}
与其每次都新建 IndexSearcher,不如使用 SearcherManager 类。
// Creates a SearcherManager over LuceneMapDirectory; null is passed as the
// second argument. LuceneMapDirectory is defined outside this snippet.
SearcherManager _searcherManager = new SearcherManager(LuceneMapDirectory, null);
然后,搜索如下:
// The first callback receives the searcher to run the query with; the second
// receives an exception and writes it to Trace.
// NOTE(review): ExecuteSearch is not defined in this snippet; presumably a
// project-specific helper or extension method, so confirm where it comes from.
_searcherManager.ExecuteSearch(searcher =>
{
//Execute query using <searcher>
}, ex => { Trace.WriteLine(ex); });
这段代码有不少问题。
如前所述,这里的加锁并不安全(读取和写入都需要加锁)。
更重要的是,Lucene 本身提供了更好的方式来处理这个问题。首先,IndexWriter
本身是线程安全的,它应该是 Directory
的所有者。让代码的不同部分各自打开/关闭 Directory 通常是一种不良做法。
NRT(近实时)索引有一种惯用写法:从 IndexWriter 获取 IndexReader
,而不是直接用 reader 去包装 Directory。
只有当索引本质上是只读的、并且只是按天/按周等批量重新生成时,您示例中的做法才算合适。
我重写了这个示例来演示其中的一些做法。显然这只是测试代码,具体到实际用例还会有一些细节需要进一步重构/完善……
/// <summary>
/// Near-real-time search sample: one long-lived <c>IndexWriter</c> owns the
/// in-memory directory, documents are added straight to that writer, and a
/// timer periodically publishes a fresh searcher obtained from the writer.
/// </summary>
public class Test
{
    /// <summary>How often the timer checks whether a newer reader is needed.</summary>
    private static readonly TimeSpan RefreshInterval = TimeSpan.FromMinutes(3);

    /// <summary>Maximum number of documents returned by <see cref="Search"/>.</summary>
    private const int MaxResults = 10;

    private readonly System.Threading.Timer timer;
    private readonly IndexWriter writer;

    // Replaced by the timer callback via Interlocked.Exchange; readers must
    // copy it into a local before use (see Search).
    private Searcher searcher;

    // Only read and written by the constructor and the timer callback.
    private IndexReader reader;

    /// <summary>
    /// Creates the writer over a new RAMDirectory, opens the initial reader and
    /// searcher from it, and starts the refresh timer (first tick immediately).
    /// </summary>
    public Test()
    {
        writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED);
        reader = writer.GetReader();
        searcher = new IndexSearcher(reader);
        timer = new System.Threading.Timer(Timer_Elapsed, null, TimeSpan.Zero, RefreshInterval);
    }

    /// <summary>
    /// Builds a document with stored fields "A" (title, not indexed) and "B"
    /// (content, analyzed) and adds it to the writer.
    /// </summary>
    /// <param name="title">Value stored in field "A".</param>
    /// <param name="content">Value stored and analyzed in field "B".</param>
    public void CreateDocument(string title, string content)
    {
        var doc = new Document();
        doc.Add(new Field("A", title, Field.Store.YES, Field.Index.NO));
        doc.Add(new Field("B", content, Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Replaces the whole index content with the given entries. Each entry's
    /// value becomes the title (field "A") and its key the content (field "B").
    /// </summary>
    /// <param name="es">Entries to index; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="es"/> is null.</exception>
    public void ReplaceAll(Dictionary<string, string> es)
    {
        // Validate before touching the index so a bad call cannot wipe it.
        if (es == null)
        {
            throw new ArgumentNullException("es");
        }

        // pause timer
        timer.Change(Timeout.Infinite, Timeout.Infinite);
        try
        {
            writer.DeleteAll();
            foreach (var e in es)
            {
                CreateDocument(e.Value, e.Key);
            }
        }
        finally
        {
            // restart timer, even if indexing failed, so refreshes never stop for good
            timer.Change(TimeSpan.Zero, RefreshInterval);
        }
    }

    /// <summary>
    /// Parses <paramref name="queryString"/> against field "B" and returns the
    /// stored documents of the top hits (at most <see cref="MaxResults"/>).
    /// </summary>
    /// <param name="queryString">Query text in QueryParser syntax.</param>
    /// <returns>The matching documents, best score first; empty if none match.</returns>
    public List<Document> Search(string queryString)
    {
        // Take one snapshot of the shared field: the timer may swap it at any
        // time, and doc ids are only meaningful for the searcher that produced them.
        Searcher current = searcher;

        var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "B", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
        Query query = parser.Parse(queryString);

        var collector = TopScoreDocCollector.Create(MaxResults, true);
        current.Search(query, collector);

        // The collector was sized to MaxResults, so iterating its hits keeps
        // the loop bound and the collector size in agreement.
        ScoreDoc[] hits = collector.TopDocs().ScoreDocs;
        var documents = new List<Document>(hits.Length);
        foreach (ScoreDoc hit in hits)
        {
            documents.Add(current.Doc(hit.Doc));
        }
        return documents;
    }

    /// <summary>
    /// Timer callback: if the current reader is out of date, obtains a new
    /// reader from the writer and atomically publishes a searcher over it.
    /// </summary>
    private void Timer_Elapsed(object sender)
    {
        if (reader.IsCurrent())
        {
            return;
        }

        IndexReader freshReader = writer.GetReader();
        Searcher freshSearcher = new IndexSearcher(freshReader);
        // NOTE(review): the previous reader/searcher are not closed here; doing
        // so safely would require knowing that no in-flight Search still uses them.
        reader = freshReader;
        Interlocked.Exchange(ref searcher, freshSearcher);
        Debug.WriteLine("Searcher updated");
    }

    /// <summary>Serves one request by running a search for <paramref name="searchTerm"/>.</summary>
    /// <param name="searchTerm">Query text passed to <see cref="Search"/>.</param>
    /// <returns>A new <c>Result</c>; the documents found are not yet copied into it.</returns>
    public Result ServeRequest(string searchTerm)
    {
        var documents = Search(searchTerm);
        //somelogic
        var result = new Result();
        return result;
    }
}
注:
- writer(IndexWriter)"拥有" Directory
- 如果这是一个基于文件的 Directory,那么您还需要
Open
和 Close
方法来创建/释放 writer(以处理 lock
文件)。RAMDirectory 则直接交给 GC 回收即可
- 使用
Interlocked.Exchange
而不是 lock
,因此读取 searcher
成员时没有任何开销(但要小心,这里暗藏风险!)
- 新文档直接添加到 writer
- 如果没有添加新文档,
IsCurrent()
可以让刷新几乎零成本地直接返回。根据您添加文档的频率,您可能根本不需要计时器(只需在 Search
的开头调用 Timer_Elapsed
即可,当然应该给它换个更合适的名字)。
- 不要使用
Optimize()
它是早期版本的遗留产物,强烈建议不要使用(出于性能和磁盘 I/O 的考虑)
最后,如果您使用的是 Lucene.net v4.8,那么您应该使用 SearcherManager
(如另一个答案中所建议的)。但要使用接受 IndexWriter
的构造函数,并将它作为"单例"保存(与 writer
的作用域相同)。它会为您处理加锁以及获取新的 reader。