diff --git a/benches/bool_queries_with_range.rs b/benches/bool_queries_with_range.rs index 9b28493004..6097295f62 100644 --- a/benches/bool_queries_with_range.rs +++ b/benches/bool_queries_with_range.rs @@ -2,7 +2,7 @@ use binggan::{black_box, BenchGroup, BenchRunner}; use rand::prelude::*; use rand::rngs::StdRng; use rand::SeedableRng; -use tantivy::collector::{Collector, Count, DocSetCollector, TopDocs}; +use tantivy::collector::{Collector, Count, TopDocs}; use tantivy::query::{Query, QueryParser}; use tantivy::schema::{Schema, FAST, INDEXED, TEXT}; use tantivy::{doc, Index, Order, ReloadPolicy, Searcher}; @@ -110,43 +110,39 @@ fn main() { // Prepare corpora with varying scenarios let scenarios = vec![ ( - "dense and 99% a".to_string(), - 10_000_000, - 0.99, + "dense and 0.1% a".to_string(), + 5_000_000, + 0.001, "dense", 0, 9, ), + ("dense and 1% a".to_string(), 5_000_000, 0.01, "dense", 0, 9), + ("dense and 10% a".to_string(), 5_000_000, 0.1, "dense", 0, 9), ( - "dense and 99% a".to_string(), - 10_000_000, - 0.99, + "dense and 50% a".to_string(), + 5_000_000, + 0.5, "dense", - 990, - 999, + 0, + 500, ), ( - "sparse and 99% a".to_string(), - 10_000_000, + "sparse and 50% a".to_string(), + 5_000_000, 0.99, "sparse", 0, 9, ), - ( - "sparse and 99% a".to_string(), - 10_000_000, - 0.99, - "sparse", - 9_999_990, - 9_999_999, - ), ]; let mut runner = BenchRunner::new(); - for (scenario_id, n, p_title_a, num_rand_distribution, range_low, range_high) in scenarios { + for (scenario_id, num_docs, p_title_a, num_rand_distribution, range_low, range_high) in + scenarios + { // Build index for this scenario - let bench_index = build_shared_indices(n, p_title_a, num_rand_distribution); + let bench_index = build_shared_indices(num_docs, p_title_a, num_rand_distribution); // Create benchmark group let mut group = runner.new_group(); @@ -158,7 +154,7 @@ fn main() { let field_names = ["num_rand", "num_asc", "num_rand_fast", "num_asc_fast"]; // Define the three terms we want to test with - let terms = ["a", "b", "z"]; + let terms = ["a"]; // Generate all combinations of terms and field names let mut queries = Vec::new(); @@ -202,8 +198,8 @@ fn run_benchmark_tasks( bench_group, bench_index, query_str, - DocSetCollector, - "all results", + (Count, TopDocs::with_limit(1000).order_by_score()), + "all_results", ); // Test top 100 by the field (if it's a FAST field) @@ -269,6 +265,10 @@ impl SearchTask { .downcast_ref::, tantivy::DocAddress)>>() { top_docs.len() + } else if let Some(top_docs_with_count) = (&result as &dyn std::any::Any) + .downcast_ref::<(usize, Vec<(f32, tantivy::DocAddress)>)>() + { + top_docs_with_count.0 } else if let Some(top_docs) = (&result as &dyn std::any::Any).downcast_ref::>() { diff --git a/src/query/const_score_query.rs b/src/query/const_score_query.rs index d07e6a96f4..f844485c54 100644 --- a/src/query/const_score_query.rs +++ b/src/query/const_score_query.rs @@ -1,6 +1,6 @@ use std::fmt; -use crate::docset::COLLECT_BLOCK_BUFFER_LEN; +use crate::docset::{SeekDangerResult, COLLECT_BLOCK_BUFFER_LEN}; use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; @@ -119,6 +119,10 @@ impl DocSet for ConstScorer { self.docset.seek(target) } + fn seek_danger(&mut self, target: DocId) -> SeekDangerResult { + self.docset.seek_danger(target) + } + fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { self.docset.fill_buffer(buffer) } diff --git a/src/query/range_query/fast_field_range_doc_set.rs b/src/query/range_query/fast_field_range_doc_set.rs index 24d2b1fe3c..28e4db2148 100644 --- a/src/query/range_query/fast_field_range_doc_set.rs +++ b/src/query/range_query/fast_field_range_doc_set.rs @@ -3,6 +3,7 @@ use std::ops::RangeInclusive; use columnar::Column; +use crate::docset::SeekDangerResult; use crate::{DocId, DocSet, TERMINATED}; /// Helper to have a cursor over a vec of docids @@ -184,6 +185,37 @@ impl DocSet for RangeDocSe doc } + /// `seek_danger` only needs to answer whether `target` itself matches, so it does a cheap + /// point lookup on the column instead of scanning forward to materialize the next match (the + /// expensive part of a regular `seek`). + fn seek_danger(&mut self, target: DocId) -> SeekDangerResult { + // Covers `target == TERMINATED` and any target past the last doc: no match is possible. + if target >= self.column.num_docs() { + return SeekDangerResult::SeekLowerBound(TERMINATED); + } + + if self.is_last_seek_distance_large(target) { + self.reset_fetch_range(); + } + self.last_seek_pos_opt = Some(target); + + let is_match = self + .column + .values_for_doc(target) + .any(|value| self.value_range.contains(&value)); + if is_match { + // Leave the docset in a valid state positioned on `target`, so `doc()` returns it and a + // following `advance()` resumes the scan right after it. + self.loaded_docs.get_cleared_data().push(target); + self.next_fetch_start = target + 1; + SeekDangerResult::Found + } else { + // `target` is not in the docset. The next match is strictly greater than `target`, so + // `target + 1` is a valid lower bound. We may leave the docset in an invalid state. + SeekDangerResult::SeekLowerBound(target + 1) + } + } + fn size_hint(&self) -> u32 { // TODO: Implement a better size hint self.column.num_docs() / 10 @@ -209,12 +241,148 @@ impl DocSet for RangeDocSe #[cfg(test)] mod tests { - use std::ops::Bound; + use std::ops::{Bound, RangeInclusive}; + + use columnar::Column; + use super::RangeDocSet; use crate::collector::Count; use crate::directory::RamDirectory; + use crate::docset::{SeekDangerResult, TERMINATED}; use crate::query::RangeQuery; - use crate::{schema, IndexBuilder, TantivyDocument, Term}; + use crate::{schema, DocSet, Index, IndexBuilder, TantivyDocument, Term}; + + /// Builds a single-segment index where doc `i` carries `values_for_doc(i)` in a u64 fast + /// field, then returns its column so we can drive a `RangeDocSet` directly. + fn build_u64_column( + num_docs: usize, + values_for_doc: impl Fn(usize) -> Vec, + ) -> Column { + let mut schema_builder = schema::SchemaBuilder::new(); + let value_field = schema_builder.add_u64_field("value", schema::FAST); + let index = Index::create_in_ram(schema_builder.build()); + { + let mut writer = index.writer_for_tests().unwrap(); + for i in 0..num_docs { + let mut doc = TantivyDocument::new(); + for v in values_for_doc(i) { + doc.add_u64(value_field, v); + } + writer.add_document(doc).unwrap(); + } + writer.commit().unwrap(); + } + let searcher = index.reader().unwrap().searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + searcher + .segment_reader(0) + .fast_fields() + .u64("value") + .unwrap() + } + + fn range_docset( + value_range: RangeInclusive, + num_docs: usize, + values_for_doc: impl Fn(usize) -> Vec, + ) -> RangeDocSet { + RangeDocSet::new(value_range, build_u64_column(num_docs, values_for_doc)) + } + + #[test] + fn seek_danger_found_leaves_valid_state() { + // Even docs match the range, odd docs do not. + let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]); + + // Matching target: `Found`, and the docset is positioned exactly on it. + assert_eq!(docset.seek_danger(10), SeekDangerResult::Found); + assert_eq!(docset.doc(), 10); + // A following advance resumes the scan right after the found doc. + assert_eq!(docset.advance(), 12); + assert_eq!(docset.doc(), 12); + } + + #[test] + fn seek_danger_miss_returns_lower_bound() { + let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]); + + // Odd target does not match: lower bound is strictly greater than the target and never + // skips past the next real match (here doc 12, the first even doc after 11). + match docset.seek_danger(11) { + SeekDangerResult::SeekLowerBound(lower_bound) => { + assert!(lower_bound > 11); + assert!(lower_bound <= 12); + } + SeekDangerResult::Found => panic!("11 should not match"), + } + // After a miss we may be in an invalid state; another seek_danger recovers it. + assert_eq!(docset.seek_danger(12), SeekDangerResult::Found); + assert_eq!(docset.doc(), 12); + } + + #[test] + fn seek_danger_terminated_and_out_of_bounds() { + let mut docset = range_docset(0..=0, 10, |i| vec![(i % 2) as u64]); + assert_eq!( + docset.seek_danger(TERMINATED), + SeekDangerResult::SeekLowerBound(TERMINATED) + ); + // A target past the last doc has no possible match either. + assert_eq!( + docset.seek_danger(10), + SeekDangerResult::SeekLowerBound(TERMINATED) + ); + } + + #[test] + fn seek_danger_multivalued() { + // Doc `i` holds values [i, i+1]; the range {5} matches docs 4 and 5. + let mut docset = range_docset(5..=5, 20, |i| vec![i as u64, i as u64 + 1]); + + assert_eq!(docset.seek_danger(4), SeekDangerResult::Found); + assert_eq!(docset.doc(), 4); + assert_eq!(docset.advance(), 5); + // No further match after doc 5. + assert_eq!(docset.advance(), TERMINATED); + } + + #[test] + fn seek_danger_matches_seek() { + // Cross-check seek_danger against the true next match for every target, on a column with a + // few sparse matches. + let matches = [3u32, 7, 50, 51, 99]; + let num_docs = 100; + let values_for_doc = |i: usize| { + vec![if matches.contains(&(i as u32)) { + 1u64 + } else { + 0u64 + }] + }; + + for target in 0..num_docs as u32 { + // The first matching doc greater than or equal to `target`, i.e. what `seek` returns. + let expected = matches + .iter() + .copied() + .find(|&m| m >= target) + .unwrap_or(TERMINATED); + + let mut danger = range_docset(1..=1, num_docs, values_for_doc); + match danger.seek_danger(target) { + SeekDangerResult::Found => { + assert_eq!(expected, target, "target {target} reported Found"); + assert_eq!(danger.doc(), target); + } + SeekDangerResult::SeekLowerBound(lower_bound) => { + assert_ne!(expected, target, "target {target} should have been Found"); + assert!(lower_bound > target); + // The lower bound must never skip past the true next match. + assert!(lower_bound <= expected); + } + } + } + } #[test] fn range_query_fast_optional_field_minimum() {