Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 24 additions & 24 deletions benches/bool_queries_with_range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use binggan::{black_box, BenchGroup, BenchRunner};
use rand::prelude::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tantivy::collector::{Collector, Count, DocSetCollector, TopDocs};
use tantivy::collector::{Collector, Count, TopDocs};
use tantivy::query::{Query, QueryParser};
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Order, ReloadPolicy, Searcher};
Expand Down Expand Up @@ -110,43 +110,39 @@ fn main() {
// Prepare corpora with varying scenarios
let scenarios = vec![
(
"dense and 99% a".to_string(),
10_000_000,
0.99,
"dense and 0.1% a".to_string(),
5_000_000,
0.001,
"dense",
0,
9,
),
("dense and 1% a".to_string(), 5_000_000, 0.01, "dense", 0, 9),
("dense and 10% a".to_string(), 5_000_000, 0.1, "dense", 0, 9),
(
"dense and 99% a".to_string(),
10_000_000,
0.99,
"dense and 50% a".to_string(),
5_000_000,
0.5,
"dense",
990,
999,
0,
500,
),
(
"sparse and 99% a".to_string(),
10_000_000,
"sparse and 50% a".to_string(),
5_000_000,
0.99,
"sparse",
0,
9,
),
(
"sparse and 99% a".to_string(),
10_000_000,
0.99,
"sparse",
9_999_990,
9_999_999,
),
];

let mut runner = BenchRunner::new();
for (scenario_id, n, p_title_a, num_rand_distribution, range_low, range_high) in scenarios {
for (scenario_id, num_docs, p_title_a, num_rand_distribution, range_low, range_high) in
scenarios
{
// Build index for this scenario
let bench_index = build_shared_indices(n, p_title_a, num_rand_distribution);
let bench_index = build_shared_indices(num_docs, p_title_a, num_rand_distribution);

// Create benchmark group
let mut group = runner.new_group();
Expand All @@ -158,7 +154,7 @@ fn main() {
let field_names = ["num_rand", "num_asc", "num_rand_fast", "num_asc_fast"];

// Define the three terms we want to test with
let terms = ["a", "b", "z"];
let terms = ["a"];

// Generate all combinations of terms and field names
let mut queries = Vec::new();
Expand Down Expand Up @@ -202,8 +198,8 @@ fn run_benchmark_tasks(
bench_group,
bench_index,
query_str,
DocSetCollector,
"all results",
(Count, TopDocs::with_limit(1000).order_by_score()),
Comment thread
PSeitz marked this conversation as resolved.
"all_results",
);

// Test top 100 by the field (if it's a FAST field)
Expand Down Expand Up @@ -269,6 +265,10 @@ impl<C: Collector> SearchTask<C> {
.downcast_ref::<Vec<(Option<u64>, tantivy::DocAddress)>>()
{
top_docs.len()
} else if let Some(top_docs_with_count) = (&result as &dyn std::any::Any)
.downcast_ref::<(usize, Vec<(f32, tantivy::DocAddress)>)>()
{
top_docs_with_count.0
} else if let Some(top_docs) =
(&result as &dyn std::any::Any).downcast_ref::<Vec<(u64, tantivy::DocAddress)>>()
{
Expand Down
6 changes: 5 additions & 1 deletion src/query/const_score_query.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::fmt;

use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::docset::{SeekDangerResult, COLLECT_BLOCK_BUFFER_LEN};
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};

Expand Down Expand Up @@ -119,6 +119,10 @@ impl<TDocSet: DocSet> DocSet for ConstScorer<TDocSet> {
self.docset.seek(target)
}

fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
self.docset.seek_danger(target)
}

fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
self.docset.fill_buffer(buffer)
}
Expand Down
172 changes: 170 additions & 2 deletions src/query/range_query/fast_field_range_doc_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::ops::RangeInclusive;

use columnar::Column;

use crate::docset::SeekDangerResult;
use crate::{DocId, DocSet, TERMINATED};

/// Helper to have a cursor over a vec of docids
Expand Down Expand Up @@ -184,6 +185,37 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
doc
}

/// `seek_danger` only needs to answer whether `target` itself matches, so it does a cheap
/// point lookup on the column instead of scanning forward to materialize the next match (the
/// expensive part of a regular `seek`).
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
// Covers `target == TERMINATED` and any target past the last doc: no match is possible.
if target >= self.column.num_docs() {
return SeekDangerResult::SeekLowerBound(TERMINATED);
}

if self.is_last_seek_distance_large(target) {
self.reset_fetch_range();
}
self.last_seek_pos_opt = Some(target);

let is_match = self
.column
.values_for_doc(target)
.any(|value| self.value_range.contains(&value));
if is_match {
// Leave the docset in a valid state positioned on `target`, so `doc()` returns it and a
// following `advance()` resumes the scan right after it.
self.loaded_docs.get_cleared_data().push(target);
self.next_fetch_start = target + 1;
SeekDangerResult::Found
} else {
// `target` is not in the docset. The next match is strictly greater than `target`, so
// `target + 1` is a valid lower bound. We may leave the docset in an invalid state.
SeekDangerResult::SeekLowerBound(target + 1)
Comment on lines +213 to +215

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid returning only target+1 on sparse misses

When a sparse fast-field range is the non-leading side of an intersection whose lead scorer is moderately dense, Intersection::advance uses this returned lower bound as the next candidate, so returning only target + 1 forces a point lookup for nearly every lead posting until the next range hit. Before this override, the default seek_danger called seek, which let RangeDocSet use its batched range scan to jump to the next matching range doc; this change can therefore turn sparse-range intersections such as a ~50%-matching term AND a handful-of-docs range into millions of per-doc lookups.

Useful? React with 👍 / 👎.

}
}

fn size_hint(&self) -> u32 {
// TODO: Implement a better size hint
self.column.num_docs() / 10
Expand All @@ -209,12 +241,148 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe

#[cfg(test)]
mod tests {
use std::ops::Bound;
use std::ops::{Bound, RangeInclusive};

use columnar::Column;

use super::RangeDocSet;
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::docset::{SeekDangerResult, TERMINATED};
use crate::query::RangeQuery;
use crate::{schema, IndexBuilder, TantivyDocument, Term};
use crate::{schema, DocSet, Index, IndexBuilder, TantivyDocument, Term};

/// Builds a single-segment index where doc `i` carries `values_for_doc(i)` in a u64 fast
/// field, then returns its column so we can drive a `RangeDocSet` directly.
fn build_u64_column(
num_docs: usize,
values_for_doc: impl Fn(usize) -> Vec<u64>,
) -> Column<u64> {
let mut schema_builder = schema::SchemaBuilder::new();
let value_field = schema_builder.add_u64_field("value", schema::FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests().unwrap();
for i in 0..num_docs {
let mut doc = TantivyDocument::new();
for v in values_for_doc(i) {
doc.add_u64(value_field, v);
}
writer.add_document(doc).unwrap();
}
writer.commit().unwrap();
}
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.segment_readers().len(), 1);
searcher
.segment_reader(0)
.fast_fields()
.u64("value")
.unwrap()
}

fn range_docset(
value_range: RangeInclusive<u64>,
num_docs: usize,
values_for_doc: impl Fn(usize) -> Vec<u64>,
) -> RangeDocSet<u64> {
RangeDocSet::new(value_range, build_u64_column(num_docs, values_for_doc))
}

#[test]
fn seek_danger_found_leaves_valid_state() {
// Even docs match the range, odd docs do not.
let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]);

// Matching target: `Found`, and the docset is positioned exactly on it.
assert_eq!(docset.seek_danger(10), SeekDangerResult::Found);
assert_eq!(docset.doc(), 10);
// A following advance resumes the scan right after the found doc.
assert_eq!(docset.advance(), 12);
assert_eq!(docset.doc(), 12);
}

#[test]
fn seek_danger_miss_returns_lower_bound() {
let mut docset = range_docset(0..=0, 100, |i| vec![(i % 2) as u64]);

// Odd target does not match: lower bound is strictly greater than the target and never
// skips past the next real match (here doc 12, the first even doc after 11).
match docset.seek_danger(11) {
SeekDangerResult::SeekLowerBound(lower_bound) => {
assert!(lower_bound > 11);
assert!(lower_bound <= 12);
}
SeekDangerResult::Found => panic!("11 should not match"),
}
// After a miss we may be in an invalid state; another seek_danger recovers it.
assert_eq!(docset.seek_danger(12), SeekDangerResult::Found);
assert_eq!(docset.doc(), 12);
}

#[test]
fn seek_danger_terminated_and_out_of_bounds() {
let mut docset = range_docset(0..=0, 10, |i| vec![(i % 2) as u64]);
assert_eq!(
docset.seek_danger(TERMINATED),
SeekDangerResult::SeekLowerBound(TERMINATED)
);
// A target past the last doc has no possible match either.
assert_eq!(
docset.seek_danger(10),
SeekDangerResult::SeekLowerBound(TERMINATED)
);
}

#[test]
fn seek_danger_multivalued() {
// Doc `i` holds values [i, i+1]; the range {5} matches docs 4 and 5.
let mut docset = range_docset(5..=5, 20, |i| vec![i as u64, i as u64 + 1]);

assert_eq!(docset.seek_danger(4), SeekDangerResult::Found);
assert_eq!(docset.doc(), 4);
assert_eq!(docset.advance(), 5);
// No further match after doc 5.
assert_eq!(docset.advance(), TERMINATED);
}

#[test]
fn seek_danger_matches_seek() {
// Cross-check seek_danger against the true next match for every target, on a column with a
// few sparse matches.
let matches = [3u32, 7, 50, 51, 99];
let num_docs = 100;
let values_for_doc = |i: usize| {
vec![if matches.contains(&(i as u32)) {
1u64
} else {
0u64
}]
};

for target in 0..num_docs as u32 {
// The first matching doc greater than or equal to `target`, i.e. what `seek` returns.
let expected = matches
.iter()
.copied()
.find(|&m| m >= target)
.unwrap_or(TERMINATED);

let mut danger = range_docset(1..=1, num_docs, values_for_doc);
match danger.seek_danger(target) {
SeekDangerResult::Found => {
assert_eq!(expected, target, "target {target} reported Found");
assert_eq!(danger.doc(), target);
}
SeekDangerResult::SeekLowerBound(lower_bound) => {
assert_ne!(expected, target, "target {target} should have been Found");
assert!(lower_bound > target);
// The lower bound must never skip past the true next match.
assert!(lower_bound <= expected);
}
}
}
}

#[test]
fn range_query_fast_optional_field_minimum() {
Expand Down
Loading