Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Tantivy 0.26 (Unreleased)
================================

## Bugfixes
- Fix `MoreLikeThis` panic on indexes with deleted documents [#2964](https://github.com/quickwit-oss/tantivy/pull/2964)(@stumpylog)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was a little confused about the released vs not state. There's a 0.26.1 tag (https://github.com/quickwit-oss/tantivy/releases/tag/0.26.1) but the header here says 0.26.0 is (Unreleased). So just let me know if this needs moving/updating.

- Align float query coercion during search with the columnar coercion rules [#2692](https://github.com/quickwit-oss/tantivy/pull/2692)(@fulmicoton)
- Fix lenient elastic range queries with trailing closing parentheses [#2816](https://github.com/quickwit-oss/tantivy/pull/2816)(@evance-br)
- Fix intersection `seek()` advancing below current doc id [#2812](https://github.com/quickwit-oss/tantivy/pull/2812)(@fulmicoton)
Expand Down
5 changes: 4 additions & 1 deletion src/query/more_like_this/more_like_this.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,10 +302,13 @@ impl MoreLikeThis {
per_field_term_frequencies: HashMap<Term, usize>,
) -> Result<Vec<ScoreTerm>> {
let mut score_terms: BinaryHeap<Reverse<ScoreTerm>> = BinaryHeap::new();
// Use `max_doc` (includes soft-deleted docs) to match `doc_freq`, which
// also counts deletes until a merge. Using the alive-only `num_docs`
// can yield `doc_freq > num_docs` and trip the assertion in `idf`.
let num_docs = searcher
.segment_readers()
.iter()
.map(|segment_reader| segment_reader.num_docs() as u64)
.map(|segment_reader| segment_reader.max_doc() as u64)
.sum::<u64>();

for (term, term_frequency) in per_field_term_frequencies.iter() {
Expand Down
59 changes: 58 additions & 1 deletion src/query/more_like_this/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,9 @@ impl MoreLikeThisQueryBuilder {
mod tests {
use super::{MoreLikeThisQuery, TargetDocument};
use crate::collector::TopDocs;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, STORED, TEXT};
use crate::{DocAddress, Index, IndexWriter};
use crate::{DocAddress, Index, IndexWriter, Term};

fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -291,4 +292,60 @@ mod tests {
assert_eq!(doc_ids, vec![3, 4]);
Ok(())
}

#[test]
fn test_more_like_this_query_with_deleted_documents() -> crate::Result<()> {
// Regression test: a More Like This query must not panic when the
// index contains soft-deleted documents.
//
// Tantivy deletes are soft: a term's `doc_freq` (read from the term
// dictionary) keeps counting deleted documents until a merge expunges
// them, but `SegmentReader::num_docs()` only counts alive documents.
// `create_score_term` previously used the alive-only count as the idf
// denominator while `doc_freq` still included deleted docs, so
// `doc_freq > num_docs` could hold and trip the `doc_count >= doc_freq`
// assertion inside `idf()`.
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT | STORED);
let body = schema_builder.add_text_field("body", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests()?;
// Keep every document in a single, never-merged segment so the deleted
// documents' postings (and thus their contribution to doc_freq) stick
// around.
index_writer.set_merge_policy(Box::new(NoMergePolicy));

// Six documents share the same body terms, so each body term has a
// doc_freq of 6. Each has a unique title term so they can be deleted
// individually.
for i in 0..6 {
index_writer
.add_document(doc!(title => format!("doc{i}"), body => "shared body content"))?;
}
index_writer.commit()?;

// Delete two documents. Alive count drops to 4, but the body terms'
// doc_freq stays 6 until a merge.
index_writer.delete_term(Term::from_field_text(title, "doc0"));
index_writer.delete_term(Term::from_field_text(title, "doc1"));
index_writer.commit()?;

let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 4);

// Run More Like This against a surviving document. This used to panic.
let query = MoreLikeThisQuery::builder()
.with_min_doc_frequency(1)
.with_min_term_frequency(1)
.with_document(DocAddress::new(0, 2));
let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
doc_ids.sort_unstable();

// Only the four surviving documents should be returned.
assert_eq!(doc_ids, vec![2, 3, 4, 5]);
Ok(())
}
}