带日期参数的 Lucene 搜索
Lucene Search with Date parameter
我对 Lucene 框架还很陌生。我们正在尝试实现 Lucene 框架,因为我们需要在几毫秒内搜索大量数据。
场景:
我们把 EmployeeDto 对象索引到 Lucene 中。在下面的示例中,我只硬编码了 6 条记录。
搜索查询需要接收 2 个输入参数。
EmployeeDto.java
// Given name; indexed on its own and as the first half of "fullName".
private String firstName;
// Family name; indexed on its own and as the second half of "fullName".
private String lastName;
// Employee identifier, stored in the index as a long.
private Long employeeId;
// Salary, stored in the index as a numeric value.
private Integer salary;
// First day of employment; indexed as epoch milliseconds via Date.getTime().
private Date startDate;
// Last day of employment; indexed as epoch milliseconds via Date.getTime().
private Date terminationDate;
// Getters and setters omitted for brevity.
//getters and setters
EmployeeLucene.java
/**
 * Demo program: indexes six hard-coded employees into an in-memory Lucene 4.0
 * index and runs a prefix query against the "fullName" field.
 *
 * <p>Run as {@code java EmployeeLucene <namePrefix> [date]}. Only the first
 * argument is used to build the query; the second (date) argument is never
 * read, so no date filtering takes place here.
 */
public class EmployeeLucene {

    /**
     * Builds the index, runs the prefix query and prints the matching employees.
     *
     * @param args {@code args[0]} is the name prefix to search for
     * @throws IOException if the index cannot be written or read
     * @throws ParseException if one of the hard-coded sample dates cannot be parsed
     * @throws IllegalArgumentException if no name prefix is given or it is not a valid query
     */
    public static void main(String[] args) throws IOException, ParseException {
        if (args.length == 0) {
            throw new IllegalArgumentException("Usage: java EmployeeLucene <namePrefix> [yyyy-MM-dd]");
        }

        // 0. Specify the analyzer for tokenizing text.
        // The same analyzer should be used for indexing and searching
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        final DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

        // 1. create the index
        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        long starttimeOfLoad = System.currentTimeMillis();
        System.out.println("Data Loading started");
        try {
            addEmployee(w, new EmployeeDto("John", "Smith", 101L, 10000, dateFormat.parse("2010-05-05"), dateFormat.parse("2018-05-05")));
            addEmployee(w, new EmployeeDto("Bill", "Thomas", 102L, 12000, dateFormat.parse("2011-06-06"), dateFormat.parse("2015-03-10")));
            addEmployee(w, new EmployeeDto("Franklin", "Robinson", 102L, 12000, dateFormat.parse("2011-04-04"), dateFormat.parse("2015-07-07")));
            addEmployee(w, new EmployeeDto("Thomas", "Boone", 102L, 12000, dateFormat.parse("2011-02-02"), dateFormat.parse("2015-03-10")));
            addEmployee(w, new EmployeeDto("John", "Smith", 103L, 13000, dateFormat.parse("2019-05-05"), dateFormat.parse("2099-12-31")));
            addEmployee(w, new EmployeeDto("Bill", "Thomas", 102L, 14000, dateFormat.parse("2011-06-06"), dateFormat.parse("2099-12-31")));
        } finally {
            // Release the writer even when indexing fails part-way.
            w.close();
        }
        System.out.println("Data Loaded. Completed in " + (System.currentTimeMillis() - starttimeOfLoad));

        // 2. query
        Query q;
        try {
            q = new QueryParser(Version.LUCENE_40, "fullName", analyzer).parse(args[0] + "*");
        } catch (org.apache.lucene.queryparser.classic.ParseException e) {
            // Fail here with the cause attached instead of searching with a null query.
            throw new IllegalArgumentException("Cannot parse search term: " + args[0], e);
        }

        // 3. search
        long starttime = System.currentTimeMillis();
        int hitsPerPage = 100;
        IndexReader reader = DirectoryReader.open(index);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            List<EmployeeDto> employeeDtoList = new ArrayList<EmployeeDto>();
            for (ScoreDoc hit : hits) {
                Document d = searcher.doc(hit.doc);
                // Stored numeric fields come back as strings, so they are parsed again here.
                employeeDtoList.add(new EmployeeDto(d.get("firstName"), d.get("lastName"),
                        Long.valueOf(d.get("employeeId")), Integer.valueOf(d.get("salary"))));
            }
            System.out.println(employeeDtoList.size());
            System.out.println(employeeDtoList);
            System.out.println("Time taken:" + (System.currentTimeMillis() - starttime) + " ms");
        } finally {
            reader.close();
        }
    }

    /**
     * Converts one employee into a Lucene document and adds it to the index.
     * Names are indexed as analyzed text; id, salary and both dates (as epoch
     * milliseconds) are indexed as stored numeric fields.
     *
     * @param w writer of the target index
     * @param employeeDto employee to index
     * @throws IOException if the document cannot be written
     */
    private static void addEmployee(IndexWriter w, EmployeeDto employeeDto) throws IOException, ParseException {
        Document doc = new Document();
        doc.add(new TextField("fullName", employeeDto.getFirstName() + " " + employeeDto.getLastName(), Field.Store.YES));
        doc.add(new TextField("firstName", employeeDto.getFirstName(), Field.Store.YES));
        doc.add(new TextField("lastName", employeeDto.getLastName(), Field.Store.YES));
        doc.add(new LongField("employeeId", employeeDto.getEmployeeId(), Field.Store.YES));
        doc.add(new LongField("salary", employeeDto.getSalary(), Field.Store.YES));
        doc.add(new LongField("startDate", employeeDto.getStartDate().getTime(), Field.Store.YES));
        doc.add(new LongField("terminationDate", employeeDto.getTerminationDate().getTime(), Field.Store.YES));
        w.addDocument(doc);
    }
}
我这样运行程序:"java EmployeeLucene thom 2014-05-05"。
我本应只得到 2 条结果,但实际得到了 3 条命中。
问题:
- 如何在查询中包含第二个参数?第二个参数应大于 'startDate' 且小于 'terminationDate'。
- 我们能否把 EmployeeDto 本身存入文档,以避免在获得命中结果之后再创建 EmployeeDto 列表?
首先,您将获得三个结果,因为您有三个全名包含字符串 "thom*" 的记录。它们是记录 2、4 和 6。
其次,Lucene 4.0版本真的很老
最后,查询 startDate
和 terminationDate
之间日期的一种方法如下:
// 2. query: name prefix AND startDate < searchDate AND terminationDate > searchDate
BooleanQuery finalQuery = new BooleanQuery();
try {
    // thom* query
    Query fullName = new QueryParser(Version.LUCENE_40, "fullName", analyzer).parse("thom" + "*");
    finalQuery.add(fullName, Occur.MUST); // MUST implies that the keyword must occur.

    long searchDate = DATE_FORMAT.parse("2014-05-05").getTime();

    // startDate strictly before the search date: open lower bound (null), exclusive upper bound.
    Query greaterStartDate = NumericRangeQuery.newLongRange("startDate", null, searchDate, false, false);
    finalQuery.add(greaterStartDate, Occur.MUST); // Using all "MUST" occurs is equivalent to "AND" operator

    // terminationDate strictly after the search date: exclusive lower bound, open upper bound (null).
    Query lessTerminationDate = NumericRangeQuery.newLongRange("terminationDate", searchDate, null, false, false);
    finalQuery.add(lessTerminationDate, Occur.MUST);
} catch (org.apache.lucene.queryparser.classic.ParseException e) {
    // Do not continue with a half-built query: an empty BooleanQuery would silently match nothing.
    throw new IllegalArgumentException("Cannot parse the name query", e);
}
关于"我们能否把 EmployeeDto 本身存入文档,以避免在获得命中结果之后再创建 EmployeeDto 列表":
我不知道。
编辑:版本 7.0.1
// 0. Specify the analyzer for tokenizing text.
// The same analyzer should be used for indexing and searching
StandardAnalyzer analyzer = new StandardAnalyzer();
final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

// 1. create the index
Directory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
long starttimeOfLoad = System.currentTimeMillis();
// try-with-resources closes the writer even when indexing fails part-way
try (IndexWriter w = new IndexWriter(index, config)) {
    System.out.println("Data Loading started");
    addEmployee(w, new EmployeeDto("John", "Smith", 101L, 10000, DATE_FORMAT.parse("2010-05-05"), DATE_FORMAT.parse("2018-05-05")));
    addEmployee(w, new EmployeeDto("Bill", "Thomas", 102L, 12000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2015-10-10")));
    addEmployee(w, new EmployeeDto("Franklin", "Robinson", 102L, 12000, DATE_FORMAT.parse("2011-04-04"), DATE_FORMAT.parse("2015-07-07")));
    addEmployee(w, new EmployeeDto("Thomas", "Boone", 102L, 12000, DATE_FORMAT.parse("2011-02-02"), DATE_FORMAT.parse("2015-03-10")));
    addEmployee(w, new EmployeeDto("John", "Smith", 103L, 13000, DATE_FORMAT.parse("2019-05-05"), DATE_FORMAT.parse("2099-12-31")));
    addEmployee(w, new EmployeeDto("Bill", "Thomas", 102L, 14000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2099-12-31")));
}
System.out.println("Data Loaded. Completed in " + (System.currentTimeMillis() - starttimeOfLoad));

// 2. query: name prefix AND startDate < searchDate AND terminationDate > searchDate
BooleanQuery finalQuery;
try {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();

    // thom* query
    Query fullName = new QueryParser("fullName", analyzer).parse("thom" + "*");
    builder.add(fullName, Occur.MUST); // MUST implies that the keyword must occur.

    long searchDate = DATE_FORMAT.parse("2014-05-05").getTime();

    // LongPoint ranges are inclusive on both ends, so each bound is shifted by one
    // millisecond to obtain the strict "greater than" / "less than" comparison.
    Query greaterStartDate = LongPoint.newRangeQuery("startDatePoint", Long.MIN_VALUE, Math.addExact(searchDate, -1));
    builder.add(greaterStartDate, Occur.MUST); // Using all "MUST" occurs is equivalent to "AND" operator

    Query lessTerminationDate = LongPoint.newRangeQuery("terminationDatePoint", Math.addExact(searchDate, 1), Long.MAX_VALUE);
    builder.add(lessTerminationDate, Occur.MUST);

    finalQuery = builder.build();
} catch (org.apache.lucene.queryparser.classic.ParseException e) {
    // Fail here with the cause attached instead of searching with a null query.
    throw new IllegalArgumentException("Cannot parse the name query", e);
}

// 3. search
long starttime = System.currentTimeMillis();
int hitsPerPage = 100;
try (IndexReader reader = DirectoryReader.open(index)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage);
    searcher.search(finalQuery, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    // 4. display results
    System.out.println("Found " + hits.length + " hits.");
    List<EmployeeDto> employeeDtoList = new ArrayList<EmployeeDto>();
    for (ScoreDoc hit : hits) {
        Document d = searcher.doc(hit.doc);
        // Stored numeric fields come back as strings, so they are parsed again here.
        employeeDtoList.add(new EmployeeDto(d.get("firstName"), d.get("lastName"),
                Long.valueOf(d.get("employeeId")), Integer.valueOf(d.get("salary"))));
    }
    System.out.println(employeeDtoList.size());
    System.out.println(employeeDtoList);
    System.out.println("Time taken:" + (System.currentTimeMillis() - starttime) + " ms");
}
}
/**
 * Turns one employee into a Lucene document and hands it to the writer.
 *
 * <p>Name fields are indexed as analyzed, stored text. The id and salary are
 * stored only. Each date is written twice as epoch milliseconds: a
 * {@code StoredField} so the value can be read back from a hit, and a
 * {@code LongPoint} (suffix "Point") so it can be used in range queries.
 *
 * @param w writer of the target index
 * @param employeeDto employee to index
 * @throws IOException if the document cannot be written
 */
private static void addEmployee(IndexWriter w, EmployeeDto employeeDto) throws IOException {
    String first = employeeDto.getFirstName();
    String last = employeeDto.getLastName();
    long startMillis = employeeDto.getStartDate().getTime();
    long terminationMillis = employeeDto.getTerminationDate().getTime();

    Document document = new Document();

    // searchable, retrievable text
    document.add(new TextField("fullName", first + " " + last, Store.YES));
    document.add(new TextField("firstName", first, Store.YES));
    document.add(new TextField("lastName", last, Store.YES));

    // retrievable only
    document.add(new StoredField("employeeId", employeeDto.getEmployeeId()));
    document.add(new StoredField("salary", employeeDto.getSalary()));

    // dates: stored copy for retrieval + point copy for range queries
    document.add(new StoredField("startDate", startMillis));
    document.add(new LongPoint("startDatePoint", startMillis));
    document.add(new StoredField("terminationDate", terminationMillis));
    document.add(new LongPoint("terminationDatePoint", terminationMillis));

    w.addDocument(document);
}
编辑:在 7.0.1 版本中,日期字段同时以 LongPoint 和 StoredField 两种类型写入索引。
LongPoint 类型可用于 LongPoint.newRangeQuery,但之后无法把它作为值取回(例如当您想知道具体日期时)。
StoredField 类型可以作为存储值取回,但不能用于范围查询。
虽然此示例并没有读取日期字段,但版本 4 中的 LongField(Field.Store.YES)同时具备这两种功能,所以这里把两者都保留了下来。
如果您不打算读取这些值,可以删除日期对应的 StoredField。
我对 Lucene 框架还很陌生。我们正在尝试实现 Lucene 框架,因为我们需要在几毫秒内搜索大量数据。
场景:
我们把 EmployeeDto 对象索引到 Lucene 中。在下面的示例中,我只硬编码了 6 条记录。
搜索查询需要接收 2 个输入参数。
EmployeeDto.java
// Given name; indexed on its own and as the first half of "fullName".
private String firstName;
// Family name; indexed on its own and as the second half of "fullName".
private String lastName;
// Employee identifier, stored in the index as a long.
private Long employeeId;
// Salary, stored in the index as a numeric value.
private Integer salary;
// First day of employment; indexed as epoch milliseconds via Date.getTime().
private Date startDate;
// Last day of employment; indexed as epoch milliseconds via Date.getTime().
private Date terminationDate;
// Getters and setters omitted for brevity.
//getters and setters
EmployeeLucene.java
/**
 * Demo program: indexes six hard-coded employees into an in-memory Lucene 4.0
 * index and runs a prefix query against the "fullName" field.
 *
 * <p>Run as {@code java EmployeeLucene <namePrefix> [date]}. Only the first
 * argument is used to build the query; the second (date) argument is never
 * read, so no date filtering takes place here.
 */
public class EmployeeLucene {

    /**
     * Builds the index, runs the prefix query and prints the matching employees.
     *
     * @param args {@code args[0]} is the name prefix to search for
     * @throws IOException if the index cannot be written or read
     * @throws ParseException if one of the hard-coded sample dates cannot be parsed
     * @throws IllegalArgumentException if no name prefix is given or it is not a valid query
     */
    public static void main(String[] args) throws IOException, ParseException {
        if (args.length == 0) {
            throw new IllegalArgumentException("Usage: java EmployeeLucene <namePrefix> [yyyy-MM-dd]");
        }

        // 0. Specify the analyzer for tokenizing text.
        // The same analyzer should be used for indexing and searching
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        final DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

        // 1. create the index
        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        long starttimeOfLoad = System.currentTimeMillis();
        System.out.println("Data Loading started");
        try {
            addEmployee(w, new EmployeeDto("John", "Smith", 101L, 10000, dateFormat.parse("2010-05-05"), dateFormat.parse("2018-05-05")));
            addEmployee(w, new EmployeeDto("Bill", "Thomas", 102L, 12000, dateFormat.parse("2011-06-06"), dateFormat.parse("2015-03-10")));
            addEmployee(w, new EmployeeDto("Franklin", "Robinson", 102L, 12000, dateFormat.parse("2011-04-04"), dateFormat.parse("2015-07-07")));
            addEmployee(w, new EmployeeDto("Thomas", "Boone", 102L, 12000, dateFormat.parse("2011-02-02"), dateFormat.parse("2015-03-10")));
            addEmployee(w, new EmployeeDto("John", "Smith", 103L, 13000, dateFormat.parse("2019-05-05"), dateFormat.parse("2099-12-31")));
            addEmployee(w, new EmployeeDto("Bill", "Thomas", 102L, 14000, dateFormat.parse("2011-06-06"), dateFormat.parse("2099-12-31")));
        } finally {
            // Release the writer even when indexing fails part-way.
            w.close();
        }
        System.out.println("Data Loaded. Completed in " + (System.currentTimeMillis() - starttimeOfLoad));

        // 2. query
        Query q;
        try {
            q = new QueryParser(Version.LUCENE_40, "fullName", analyzer).parse(args[0] + "*");
        } catch (org.apache.lucene.queryparser.classic.ParseException e) {
            // Fail here with the cause attached instead of searching with a null query.
            throw new IllegalArgumentException("Cannot parse search term: " + args[0], e);
        }

        // 3. search
        long starttime = System.currentTimeMillis();
        int hitsPerPage = 100;
        IndexReader reader = DirectoryReader.open(index);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            List<EmployeeDto> employeeDtoList = new ArrayList<EmployeeDto>();
            for (ScoreDoc hit : hits) {
                Document d = searcher.doc(hit.doc);
                // Stored numeric fields come back as strings, so they are parsed again here.
                employeeDtoList.add(new EmployeeDto(d.get("firstName"), d.get("lastName"),
                        Long.valueOf(d.get("employeeId")), Integer.valueOf(d.get("salary"))));
            }
            System.out.println(employeeDtoList.size());
            System.out.println(employeeDtoList);
            System.out.println("Time taken:" + (System.currentTimeMillis() - starttime) + " ms");
        } finally {
            reader.close();
        }
    }

    /**
     * Converts one employee into a Lucene document and adds it to the index.
     * Names are indexed as analyzed text; id, salary and both dates (as epoch
     * milliseconds) are indexed as stored numeric fields.
     *
     * @param w writer of the target index
     * @param employeeDto employee to index
     * @throws IOException if the document cannot be written
     */
    private static void addEmployee(IndexWriter w, EmployeeDto employeeDto) throws IOException, ParseException {
        Document doc = new Document();
        doc.add(new TextField("fullName", employeeDto.getFirstName() + " " + employeeDto.getLastName(), Field.Store.YES));
        doc.add(new TextField("firstName", employeeDto.getFirstName(), Field.Store.YES));
        doc.add(new TextField("lastName", employeeDto.getLastName(), Field.Store.YES));
        doc.add(new LongField("employeeId", employeeDto.getEmployeeId(), Field.Store.YES));
        doc.add(new LongField("salary", employeeDto.getSalary(), Field.Store.YES));
        doc.add(new LongField("startDate", employeeDto.getStartDate().getTime(), Field.Store.YES));
        doc.add(new LongField("terminationDate", employeeDto.getTerminationDate().getTime(), Field.Store.YES));
        w.addDocument(doc);
    }
}
我这样运行程序:"java EmployeeLucene thom 2014-05-05"。
我本应只得到 2 条结果,但实际得到了 3 条命中。
问题:
- 如何在查询中包含第二个参数?第二个参数应大于 'startDate' 且小于 'terminationDate'。
- 我们能否把 EmployeeDto 本身存入文档,以避免在获得命中结果之后再创建 EmployeeDto 列表?
首先,您将获得三个结果,因为您有三个全名包含字符串 "thom*" 的记录。它们是记录 2、4 和 6。
其次,Lucene 4.0版本真的很老
最后,查询 startDate
和 terminationDate
之间日期的一种方法如下:
// 2. query: name prefix AND startDate < searchDate AND terminationDate > searchDate
BooleanQuery finalQuery = new BooleanQuery();
try {
    // thom* query
    Query fullName = new QueryParser(Version.LUCENE_40, "fullName", analyzer).parse("thom" + "*");
    finalQuery.add(fullName, Occur.MUST); // MUST implies that the keyword must occur.

    long searchDate = DATE_FORMAT.parse("2014-05-05").getTime();

    // startDate strictly before the search date: open lower bound (null), exclusive upper bound.
    Query greaterStartDate = NumericRangeQuery.newLongRange("startDate", null, searchDate, false, false);
    finalQuery.add(greaterStartDate, Occur.MUST); // Using all "MUST" occurs is equivalent to "AND" operator

    // terminationDate strictly after the search date: exclusive lower bound, open upper bound (null).
    Query lessTerminationDate = NumericRangeQuery.newLongRange("terminationDate", searchDate, null, false, false);
    finalQuery.add(lessTerminationDate, Occur.MUST);
} catch (org.apache.lucene.queryparser.classic.ParseException e) {
    // Do not continue with a half-built query: an empty BooleanQuery would silently match nothing.
    throw new IllegalArgumentException("Cannot parse the name query", e);
}
关于"我们能否把 EmployeeDto 本身存入文档,以避免在获得命中结果之后再创建 EmployeeDto 列表":
我不知道。
编辑:版本 7.0.1
// 0. Specify the analyzer for tokenizing text.
// The same analyzer should be used for indexing and searching
StandardAnalyzer analyzer = new StandardAnalyzer();
final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

// 1. create the index
Directory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
long starttimeOfLoad = System.currentTimeMillis();
// try-with-resources closes the writer even when indexing fails part-way
try (IndexWriter w = new IndexWriter(index, config)) {
    System.out.println("Data Loading started");
    addEmployee(w, new EmployeeDto("John", "Smith", 101L, 10000, DATE_FORMAT.parse("2010-05-05"), DATE_FORMAT.parse("2018-05-05")));
    addEmployee(w, new EmployeeDto("Bill", "Thomas", 102L, 12000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2015-10-10")));
    addEmployee(w, new EmployeeDto("Franklin", "Robinson", 102L, 12000, DATE_FORMAT.parse("2011-04-04"), DATE_FORMAT.parse("2015-07-07")));
    addEmployee(w, new EmployeeDto("Thomas", "Boone", 102L, 12000, DATE_FORMAT.parse("2011-02-02"), DATE_FORMAT.parse("2015-03-10")));
    addEmployee(w, new EmployeeDto("John", "Smith", 103L, 13000, DATE_FORMAT.parse("2019-05-05"), DATE_FORMAT.parse("2099-12-31")));
    addEmployee(w, new EmployeeDto("Bill", "Thomas", 102L, 14000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2099-12-31")));
}
System.out.println("Data Loaded. Completed in " + (System.currentTimeMillis() - starttimeOfLoad));

// 2. query: name prefix AND startDate < searchDate AND terminationDate > searchDate
BooleanQuery finalQuery;
try {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();

    // thom* query
    Query fullName = new QueryParser("fullName", analyzer).parse("thom" + "*");
    builder.add(fullName, Occur.MUST); // MUST implies that the keyword must occur.

    long searchDate = DATE_FORMAT.parse("2014-05-05").getTime();

    // LongPoint ranges are inclusive on both ends, so each bound is shifted by one
    // millisecond to obtain the strict "greater than" / "less than" comparison.
    Query greaterStartDate = LongPoint.newRangeQuery("startDatePoint", Long.MIN_VALUE, Math.addExact(searchDate, -1));
    builder.add(greaterStartDate, Occur.MUST); // Using all "MUST" occurs is equivalent to "AND" operator

    Query lessTerminationDate = LongPoint.newRangeQuery("terminationDatePoint", Math.addExact(searchDate, 1), Long.MAX_VALUE);
    builder.add(lessTerminationDate, Occur.MUST);

    finalQuery = builder.build();
} catch (org.apache.lucene.queryparser.classic.ParseException e) {
    // Fail here with the cause attached instead of searching with a null query.
    throw new IllegalArgumentException("Cannot parse the name query", e);
}

// 3. search
long starttime = System.currentTimeMillis();
int hitsPerPage = 100;
try (IndexReader reader = DirectoryReader.open(index)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage);
    searcher.search(finalQuery, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    // 4. display results
    System.out.println("Found " + hits.length + " hits.");
    List<EmployeeDto> employeeDtoList = new ArrayList<EmployeeDto>();
    for (ScoreDoc hit : hits) {
        Document d = searcher.doc(hit.doc);
        // Stored numeric fields come back as strings, so they are parsed again here.
        employeeDtoList.add(new EmployeeDto(d.get("firstName"), d.get("lastName"),
                Long.valueOf(d.get("employeeId")), Integer.valueOf(d.get("salary"))));
    }
    System.out.println(employeeDtoList.size());
    System.out.println(employeeDtoList);
    System.out.println("Time taken:" + (System.currentTimeMillis() - starttime) + " ms");
}
}
/**
 * Turns one employee into a Lucene document and hands it to the writer.
 *
 * <p>Name fields are indexed as analyzed, stored text. The id and salary are
 * stored only. Each date is written twice as epoch milliseconds: a
 * {@code StoredField} so the value can be read back from a hit, and a
 * {@code LongPoint} (suffix "Point") so it can be used in range queries.
 *
 * @param w writer of the target index
 * @param employeeDto employee to index
 * @throws IOException if the document cannot be written
 */
private static void addEmployee(IndexWriter w, EmployeeDto employeeDto) throws IOException {
    String first = employeeDto.getFirstName();
    String last = employeeDto.getLastName();
    long startMillis = employeeDto.getStartDate().getTime();
    long terminationMillis = employeeDto.getTerminationDate().getTime();

    Document document = new Document();

    // searchable, retrievable text
    document.add(new TextField("fullName", first + " " + last, Store.YES));
    document.add(new TextField("firstName", first, Store.YES));
    document.add(new TextField("lastName", last, Store.YES));

    // retrievable only
    document.add(new StoredField("employeeId", employeeDto.getEmployeeId()));
    document.add(new StoredField("salary", employeeDto.getSalary()));

    // dates: stored copy for retrieval + point copy for range queries
    document.add(new StoredField("startDate", startMillis));
    document.add(new LongPoint("startDatePoint", startMillis));
    document.add(new StoredField("terminationDate", terminationMillis));
    document.add(new LongPoint("terminationDatePoint", terminationMillis));

    w.addDocument(document);
}
编辑:在 7.0.1 版本中,日期字段同时以 LongPoint 和 StoredField 两种类型写入索引。
LongPoint 类型可用于 LongPoint.newRangeQuery,但之后无法把它作为值取回(例如当您想知道具体日期时)。
StoredField 类型可以作为存储值取回,但不能用于范围查询。
虽然此示例并没有读取日期字段,但版本 4 中的 LongField(Field.Store.YES)同时具备这两种功能,所以这里把两者都保留了下来。
如果您不打算读取这些值,可以删除日期对应的 StoredField。