Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions src/docset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,31 @@ pub trait DocSet: Send {
buffer.len()
}

/// Fills a given mutable buffer with the next doc ids smaller than `horizon`.
///
/// Unlike [`DocSet::fill_buffer`], this method must not advance past a doc id greater than or
/// equal to `horizon`.
fn fill_buffer_up_to(
&mut self,
horizon: DocId,
buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
) -> usize {
if self.doc() == TERMINATED {
return 0;
}
for (pos, buffer_val) in buffer.iter_mut().enumerate() {
let doc = self.doc();
if doc >= horizon {
return pos;
}
*buffer_val = doc;
if self.advance() == TERMINATED {
return pos + 1;
}
}
buffer.len()
}

/// Returns the current document
/// Right after creating a new `DocSet`, the docset points to the first document.
///
Expand Down Expand Up @@ -251,6 +276,14 @@ impl DocSet for &mut dyn DocSet {
(**self).fill_buffer(buffer)
}

fn fill_buffer_up_to(
&mut self,
horizon: DocId,
buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
) -> usize {
(**self).fill_buffer_up_to(horizon, buffer)
}

fn fill_bitset_block(
&mut self,
min_doc: DocId,
Expand Down
36 changes: 36 additions & 0 deletions src/postings/block_segment_postings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,42 @@ impl BlockSegmentPostings {
self.freq_decoder.output_array()
}

pub(crate) fn copy_docs_and_term_freqs(
&self,
block_offset: usize,
horizon: DocId,
docs: &mut [DocId],
term_freqs: &mut [u32],
) -> usize {
debug_assert_eq!(docs.len(), term_freqs.len());
let block_docs = self.docs();
let remaining_docs_in_block = block_docs.len().saturating_sub(block_offset);
let max_len = remaining_docs_in_block.min(docs.len());
if max_len == 0 {
return 0;
}

let source_docs = &block_docs[block_offset..block_offset + max_len];
let len = if source_docs[max_len - 1] < horizon {
max_len
} else {
source_docs
.iter()
.position(|&doc| doc >= horizon)
.unwrap_or(max_len)
};

docs[..len].copy_from_slice(&source_docs[..len]);

let block_freqs = self.freq_output_array();
if block_freqs.len() >= block_offset + len {
term_freqs[..len].copy_from_slice(&block_freqs[block_offset..block_offset + len]);
} else {
term_freqs[..len].fill(1);
}
len
}

/// Return the frequency at index `idx` of the block.
#[inline]
pub fn freq(&self, idx: usize) -> u32 {
Expand Down
10 changes: 10 additions & 0 deletions src/postings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,16 @@ pub(crate) mod tests {
fn score(&mut self) -> Score {
self.0.score()
}

#[inline]
fn can_score_doc(&self) -> bool {
self.0.can_score_doc()
}

#[inline]
fn score_doc(&mut self, doc: DocId, term_freq: u32) -> Score {
self.0.score_doc(doc, term_freq)
}
}

pub fn test_skip_against_unoptimized<F: Fn() -> Box<dyn DocSet>>(
Expand Down
30 changes: 29 additions & 1 deletion src/postings/segment_postings.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use common::HasLen;

use crate::docset::DocSet;
use crate::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN};
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
Expand Down Expand Up @@ -151,6 +151,34 @@ impl SegmentPostings {
position_reader,
}
}

pub(crate) fn fill_buffer_up_to_with_term_freqs(
&mut self,
horizon: DocId,
docs: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
term_freqs: &mut [u32; COLLECT_BLOCK_BUFFER_LEN],
) -> usize {
let mut num_elems = 0;
while num_elems < COLLECT_BLOCK_BUFFER_LEN && self.doc() < horizon {
let copied = self.block_cursor.copy_docs_and_term_freqs(
self.cur,
horizon,
&mut docs[num_elems..],
&mut term_freqs[num_elems..],
);
if copied == 0 {
break;
}
num_elems += copied;
self.cur += copied;

if self.cur == COMPRESSION_BLOCK_SIZE {
self.cur = 0;
self.block_cursor.advance();
}
}
num_elems
}
}

impl DocSet for SegmentPostings {
Expand Down
10 changes: 10 additions & 0 deletions src/query/all_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,16 @@ impl Scorer for AllScorer {
fn score(&mut self) -> Score {
1.0
}

#[inline]
fn can_score_doc(&self) -> bool {
true
}

#[inline]
fn score_doc(&mut self, _doc: DocId, _term_freq: u32) -> Score {
1.0
}
}

#[cfg(test)]
Expand Down
48 changes: 46 additions & 2 deletions src/query/bm25.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
use std::cell::RefCell;
use std::num::NonZeroUsize;
use std::sync::Arc;

use lru::LruCache;

use crate::fieldnorm::FieldNormReader;
use crate::query::Explanation;
use crate::schema::Field;
Expand Down Expand Up @@ -59,7 +63,9 @@ fn cached_tf_component(fieldnorm: u32, average_fieldnorm: Score) -> Score {
K1 * (1.0 - B + B * fieldnorm as Score / average_fieldnorm)
}

fn compute_tf_cache(average_fieldnorm: Score) -> Arc<[Score; 256]> {
const BM25_TF_CACHE_CAPACITY: usize = 64;

fn compute_tf_cache_uncached(average_fieldnorm: Score) -> Arc<[Score; 256]> {
let mut cache: [Score; 256] = [0.0; 256];
for (fieldnorm_id, cache_mut) in cache.iter_mut().enumerate() {
let fieldnorm = FieldNormReader::id_to_fieldnorm(fieldnorm_id as u8);
Expand All @@ -68,6 +74,36 @@ fn compute_tf_cache(average_fieldnorm: Score) -> Arc<[Score; 256]> {
Arc::new(cache)
}

thread_local! {
static TF_CACHES: RefCell<LruCache<u32, Arc<[Score; 256]>>> = RefCell::new(LruCache::new(
NonZeroUsize::new(BM25_TF_CACHE_CAPACITY).unwrap(),
));
}

/// The cache is shared across all [Bm25Weight] with the same average fieldnorm on the same thread.
/// It is stored in a thread local LRU cache.
///
/// On one query all terms on the same field will share the same average fieldnorm, and thus the
/// same cache. This will lower cache pressure.
///
/// Even between queries (on the same thread), the cache will be reused, which allows the cache to
/// better learn the memory address of the cache and access patterns.
///
/// Thread local is used in order to be defensive about potential contention on the cache.
fn compute_tf_cache(average_fieldnorm: Score) -> Arc<[Score; 256]> {
let cache_key = average_fieldnorm.to_bits();
TF_CACHES.with(|cache_by_average_fieldnorm| {
let mut cache_by_average_fieldnorm = cache_by_average_fieldnorm.borrow_mut();
if let Some(cache) = cache_by_average_fieldnorm.get(&cache_key) {
return cache.clone();
}

let cache = compute_tf_cache_uncached(average_fieldnorm);
cache_by_average_fieldnorm.put(cache_key, cache.clone());
cache
})
}

/// A struct used for computing BM25 scores.
#[derive(Clone)]
pub struct Bm25Weight {
Expand Down Expand Up @@ -229,12 +265,20 @@ impl Bm25Weight {
#[cfg(test)]
mod tests {

use super::idf;
use super::{idf, Bm25Weight};
use crate::{assert_nearly_equals, Score};

#[test]
fn test_idf() {
let score: Score = 2.0;
assert_nearly_equals!(idf(1, 2), score.ln());
}

#[test]
fn test_bm25_tf_cache_is_shared_for_same_average_fieldnorm() {
let weight1 = Bm25Weight::for_one_term(1, 10, 3.0);
let weight2 = Bm25Weight::for_one_term(2, 10, 3.0);

assert!(std::sync::Arc::ptr_eq(&weight1.cache, &weight2.cache));
}
}
12 changes: 8 additions & 4 deletions src/query/boolean_query/boolean_weight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,14 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
num_docs: u32,
) -> Box<dyn Scorer> {
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let union_scorer =
BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
Box::new(union_scorer)
SpecializedScorer::TermUnion(mut term_scorers) => {
if term_scorers.len() == 1 {
Box::new(term_scorers.pop().unwrap())
} else {
let union_scorer =
BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs);
Box::new(union_scorer)
}
}
SpecializedScorer::TermIntersection(term_scorers) => {
let boxed_scorers: Vec<Box<dyn Scorer>> = term_scorers
Expand Down
29 changes: 29 additions & 0 deletions src/query/boost_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,14 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
self.underlying.fill_buffer(buffer)
}

fn fill_buffer_up_to(
&mut self,
horizon: DocId,
buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
) -> usize {
self.underlying.fill_buffer_up_to(horizon, buffer)
}

fn doc(&self) -> u32 {
self.underlying.doc()
}
Expand All @@ -138,6 +146,27 @@ impl<S: Scorer> Scorer for BoostScorer<S> {
fn score(&mut self) -> Score {
self.underlying.score() * self.boost
}

#[inline]
fn can_score_doc(&self) -> bool {
self.underlying.can_score_doc()
}

#[inline]
fn score_doc(&mut self, doc: DocId, term_freq: u32) -> Score {
self.underlying.score_doc(doc, term_freq) * self.boost
}

#[inline]
fn fill_buffer_up_to_with_term_freqs(
&mut self,
horizon: DocId,
docs: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
term_freqs: &mut [u32; COLLECT_BLOCK_BUFFER_LEN],
) -> usize {
self.underlying
.fill_buffer_up_to_with_term_freqs(horizon, docs, term_freqs)
}
}

#[cfg(test)]
Expand Down
10 changes: 10 additions & 0 deletions src/query/const_score_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,16 @@ impl<TDocSet: DocSet + 'static> Scorer for ConstScorer<TDocSet> {
fn score(&mut self) -> Score {
self.score
}

#[inline]
fn can_score_doc(&self) -> bool {
true
}

#[inline]
fn score_doc(&mut self, _doc: DocId, _term_freq: u32) -> Score {
self.score
}
}

#[cfg(test)]
Expand Down
14 changes: 14 additions & 0 deletions src/query/disjunction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,20 @@ mod tests {
fn score(&mut self) -> Score {
self.foo.get(self.cursor).map(|x| x.1).unwrap_or(0.0)
}

#[inline]
fn can_score_doc(&self) -> bool {
true
}

#[inline]
fn score_doc(&mut self, doc: DocId, _term_freq: u32) -> Score {
self.foo
.iter()
.find(|(candidate_doc, _)| *candidate_doc == doc)
.map(|(_, score)| *score)
.unwrap_or(0.0)
}
}

#[test]
Expand Down
10 changes: 10 additions & 0 deletions src/query/empty_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,16 @@ impl Scorer for EmptyScorer {
fn score(&mut self) -> Score {
0.0
}

#[inline]
fn can_score_doc(&self) -> bool {
true
}

#[inline]
fn score_doc(&mut self, _doc: DocId, _term_freq: u32) -> Score {
0.0
}
}

#[cfg(test)]
Expand Down
Loading
Loading