Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions requirements-optional.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ numpy>=1.26.0
hnswlib>=0.8.0
faiss-cpu>=1.8.0

# ONNX Runtime for fast, lightweight dense embeddings (preferred over sentence-transformers)
onnxruntime>=1.17.0
tokenizers>=0.15.0
huggingface_hub>=0.20.0

# Tree-sitter for AST-based code chunking (semantic code boundaries)
tree-sitter>=0.21.0
tree-sitter-python>=0.21.0
Expand Down
35 changes: 33 additions & 2 deletions scripts/token_reducer/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,24 @@
DEFAULT_WORD_BUDGET = 350
DEFAULT_HYBRID_MODE = "fallback"
DEFAULT_RETRIEVAL_MODE = "compact"
DEFAULT_EMBEDDING_BACKEND = "hash"
DEFAULT_EMBEDDING_MODEL = "jinaai/jina-embeddings-v2-base-code"
DEFAULT_EMBEDDING_BACKEND = "onnx"
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_ANN_ENGINE = "hnsw"
DEFAULT_ANN_EF_SEARCH = 160
DEFAULT_QUERY_CACHE_TTL_SECONDS = 900
DEFAULT_RELEVANCE_FLOOR = 0.15 # Minimum score threshold for knapsack packing

# ONNX Runtime settings for fast CPU-based dense embeddings
# Model path can be local file or HuggingFace hub model ID
DEFAULT_ONNX_MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"
Comment on lines +25 to +26

Copilot AI Apr 3, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DEFAULT_ONNX_MODEL_PATH is introduced but (based on a repo-wide search) is not referenced anywhere else. If the intended default is already covered by DEFAULT_EMBEDDING_MODEL, consider removing this constant or wiring it into the CLI/config flow so it doesn’t drift unused.

Suggested change
# Model path can be local file or HuggingFace hub model ID
DEFAULT_ONNX_MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"
# The default model identifier is provided by DEFAULT_EMBEDDING_MODEL.

Copilot uses AI. Check for mistakes.
DEFAULT_ONNX_MAX_LENGTH = 512 # Max sequence length for tokenization

# Reciprocal Rank Fusion (RRF) settings
# RRF score = sum(1 / (k + rank)) across retrieval systems
# Higher k values reduce the impact of top positions
DEFAULT_RRF_K = 60 # Standard RRF constant (60 is common default)
DEFAULT_USE_RRF = True # Enable RRF for hybrid retrieval; False uses weighted sum

# Adaptive retrieval tiers — determined at runtime from indexed chunk count.
# Small (< TIER_SMALL_CHUNKS) → FTS5 only; no embeddings used for retrieval, no ANN built.
# Medium (< TIER_MEDIUM_CHUNKS) → FTS5 primary + hash-embedding fallback; no ANN built.
Expand Down Expand Up @@ -55,8 +66,11 @@

_EMBEDDING_MODEL_CACHE: dict[str, object] = {}
_EMBEDDING_VECTOR_CACHE: dict[str, list[float]] = {}
_ONNX_SESSION_CACHE: dict[str, object] = {} # Cache for ONNX Runtime sessions
_SCORING_WEIGHTS: dict[str, float] = DEFAULT_SCORING_WEIGHTS.copy()
_HASH_EMBEDDING_SKIP_VECTOR: bool = DEFAULT_HASH_EMBEDDING_SKIP_VECTOR
_USE_RRF: bool = DEFAULT_USE_RRF
_RRF_K: int = DEFAULT_RRF_K


def configure_scoring_weights(weights: dict[str, float] | None = None) -> None:
Expand Down Expand Up @@ -94,6 +108,23 @@ def should_skip_vector_for_hash() -> bool:
return _HASH_EMBEDDING_SKIP_VECTOR


def should_use_rrf() -> bool:
    """Return the module-level flag that enables Reciprocal Rank Fusion.

    The flag is read from ``_USE_RRF`` and is changed through
    ``configure_rrf()``.
    """
    return _USE_RRF


def get_rrf_k() -> int:
    """Return the currently configured RRF constant ``k``.

    The value is read from the module-level ``_RRF_K`` and is changed
    through ``configure_rrf()``.
    """
    return _RRF_K


def configure_rrf(use_rrf: bool = True, k: int = 60) -> None:
    """Configure the module-level Reciprocal Rank Fusion settings.

    Args:
        use_rrf: Whether hybrid retrieval should fuse results with RRF.
        k: RRF constant used in ``1 / (k + rank)``. Must be positive.

    Raises:
        ValueError: If ``k`` is not positive. The existing settings are left
            untouched in that case.
    """
    global _USE_RRF, _RRF_K
    # Ranks start at 1, so a non-positive k can make (k + rank) zero or
    # negative; reject it here rather than failing later during fusion.
    if k <= 0:
        raise ValueError(f"RRF constant k must be positive, got {k!r}")
    _USE_RRF = use_rrf
    _RRF_K = k


TEXT_EXTENSIONS = {
".md",
".txt",
Expand Down
5 changes: 3 additions & 2 deletions scripts/token_reducer/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,14 +469,15 @@ def index_file_dependencies(
for import_path in imports:
resolved = resolve_import_to_file(import_path, file_path, indexed_files)
try:
conn.execute(
cur = conn.execute(
"""
INSERT OR IGNORE INTO file_dependencies (source_file, target_import, resolved_file)
VALUES (?, ?, ?)
""",
(file_path, import_path, resolved),
)
count += 1
if cur.rowcount > 0:
count += 1
except Exception:
continue

Expand Down
141 changes: 140 additions & 1 deletion scripts/token_reducer/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,15 @@
import sys
from collections.abc import Sequence
from hashlib import blake2b
from pathlib import Path

from .chunker import char_ngrams, tokenize
from .config import _EMBEDDING_MODEL_CACHE, get_weight
from .config import (
_EMBEDDING_MODEL_CACHE,
_ONNX_SESSION_CACHE,
DEFAULT_ONNX_MAX_LENGTH,
get_weight,
)


def embed_text_hash(text: str, dimensions: int) -> list[float]:
Expand Down Expand Up @@ -66,6 +72,118 @@ def embed_text_ml(text: str, model_name: str) -> list[float]:
return [float(x) for x in embedding]


# Candidate ONNX filenames, relative to the model repo/directory root, tried
# in order: quantized int8 first (smallest/fastest), then the full-precision
# model in the "onnx/" subfolder, then a legacy root-level model.onnx.
_ONNX_CANDIDATE_FILENAMES = ["onnx/model_quantized.onnx", "onnx/model.onnx", "model.onnx"]


def _locate_onnx_files(model_path: str) -> tuple[str, str]:
    """Resolve the ONNX model file and ``tokenizer.json`` for *model_path*.

    *model_path* is first treated as a HuggingFace repo ID; if the hub does
    not yield both files it is treated as a local directory.

    Returns:
        ``(onnx_path, tokenizer_path)`` as filesystem path strings.

    Raises:
        RuntimeError: If neither the hub nor the local directory provides
            both an ONNX model file and ``tokenizer.json``.
    """
    hub_error: Exception | None = None
    hub_model_found = False

    try:
        from huggingface_hub import hf_hub_download  # type: ignore
    except Exception as exc:
        # huggingface_hub is optional; without it only local paths can work.
        hf_hub_download = None
        hub_error = exc

    if hf_hub_download is not None:
        for candidate in _ONNX_CANDIDATE_FILENAMES:
            try:
                hub_onnx = hf_hub_download(repo_id=model_path, filename=candidate)
            except Exception as exc:
                hub_error = exc
                continue
            hub_model_found = True
            try:
                hub_tokenizer = hf_hub_download(repo_id=model_path, filename="tokenizer.json")
            except Exception as exc:
                hub_error = exc
                break
            return hub_onnx, hub_tokenizer

    # Local lookup uses its own variables so a model file fetched from the hub
    # is never paired with a tokenizer from an unrelated local directory.
    model_dir = Path(model_path)
    if not model_dir.exists():
        if hub_model_found:
            raise RuntimeError(
                f"Found an ONNX model in HuggingFace repo '{model_path}', but "
                "'tokenizer.json' is missing or could not be downloaded."
            ) from hub_error
        raise RuntimeError(
            f"ONNX model not found at '{model_path}'. "
            "Provide a valid HuggingFace model ID or local directory path."
        ) from hub_error

    local_onnx = next(
        (
            path
            for path in (model_dir / name for name in _ONNX_CANDIDATE_FILENAMES)
            if path.exists()
        ),
        None,
    )
    if local_onnx is None:
        raise RuntimeError(
            f"No ONNX model file found locally in '{model_path}'"
        ) from hub_error

    local_tokenizer = model_dir / "tokenizer.json"
    if not local_tokenizer.exists():
        raise RuntimeError(
            f"Missing tokenizer file in local ONNX model directory '{model_path}'. "
            "Expected 'tokenizer.json' alongside the ONNX model file."
        ) from hub_error

    return str(local_onnx), str(local_tokenizer)


def get_onnx_session(model_path: str):
    """Load or retrieve the cached ONNX Runtime session for embeddings.

    Args:
        model_path: HuggingFace model ID or local directory containing one of
            ``_ONNX_CANDIDATE_FILENAMES`` plus ``tokenizer.json``.

    Returns:
        A ``(session, tokenizer)`` tuple. The same tuple is returned on later
        calls with the same *model_path* (cached in ``_ONNX_SESSION_CACHE``).

    Raises:
        RuntimeError: If onnxruntime/tokenizers are not importable, or the
            model files cannot be located.
    """
    cached = _ONNX_SESSION_CACHE.get(model_path)
    if cached is not None:
        return cached

    try:
        import onnxruntime as ort  # type: ignore
        from tokenizers import Tokenizer  # type: ignore
    except Exception as exc:
        raise RuntimeError(
            "ONNX embedding backend requested but onnxruntime or tokenizers is not installed."
        ) from exc

    onnx_path, tokenizer_path = _locate_onnx_files(model_path)

    session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    tokenizer = Tokenizer.from_file(tokenizer_path)

    entry = (session, tokenizer)
    _ONNX_SESSION_CACHE[model_path] = entry
    return entry


def embed_text_onnx(
    text: str, model_path: str, max_length: int = DEFAULT_ONNX_MAX_LENGTH
) -> list[float]:
    """Generate an L2-normalized embedding for *text* using ONNX Runtime.

    The text is tokenized, truncated/padded to ``max_length`` tokens, run
    through the model, and pooled with an attention-mask-weighted mean so
    padding tokens do not contribute. ``token_type_ids`` is supplied only
    when the model declares that input.

    Args:
        text: Text to embed.
        model_path: Model identifier passed to ``get_onnx_session``.
        max_length: Sequence length fed to the model; must be positive.

    Returns:
        The embedding as a list of floats (unit length unless it is all zeros).

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    import numpy as np

    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    session, tokenizer = get_onnx_session(model_path)

    encoding = tokenizer.encode(text)
    real_len = min(len(encoding.ids), max_length)
    pad_len = max_length - real_len

    tokens = list(encoding.ids[:real_len]) + [0] * pad_len

    # Prefer the tokenizer's own attention mask: if tokenizer.json enables
    # padding, encoding.ids already contains pad tokens that must be masked out.
    tokenizer_mask = getattr(encoding, "attention_mask", None)
    if tokenizer_mask:
        attn_mask = list(tokenizer_mask[:real_len]) + [0] * pad_len
    else:
        attn_mask = [1] * real_len + [0] * pad_len

    ort_inputs: dict[str, object] = {
        "input_ids": np.array([tokens], dtype=np.int64),
        "attention_mask": np.array([attn_mask], dtype=np.int64),
    }

    input_names = {inp.name for inp in session.get_inputs()}
    if "token_type_ids" in input_names:
        if encoding.type_ids:
            type_ids = list(encoding.type_ids[:real_len]) + [0] * pad_len
        else:
            type_ids = [0] * max_length
        ort_inputs["token_type_ids"] = np.array([type_ids], dtype=np.int64)

    outputs = session.run(None, ort_inputs)

    # Attention-mask-weighted mean pooling: exclude padding tokens from average.
    # Assumes the first output is the per-token hidden states, shaped
    # (batch, max_length, hidden_dim) — TODO confirm for non-BERT exports.
    hidden = outputs[0][0]
    mask = np.array(attn_mask, dtype=np.float32)
    masked_sum = (hidden * mask[:, np.newaxis]).sum(axis=0)
    embedding = masked_sum / (mask.sum() + 1e-9)  # epsilon guards an all-zero mask

    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm

    return [float(x) for x in embedding]


def resolve_embedding_backend(
requested_backend: str,
requested_model: str,
Expand All @@ -74,6 +192,17 @@ def resolve_embedding_backend(
if backend == "hash":
return "hash", None

if backend == "onnx":
try:
get_onnx_session(requested_model)
return "onnx", requested_model
except Exception as exc:
print(
f"[warn] ONNX embedding backend unavailable ({exc}). Falling back to hash embeddings.",
file=sys.stderr,
)
return "hash", None

Comment on lines 187 to +205

Copilot AI Apr 3, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New ONNX support was added to resolve_embedding_backend() / embed_text(), but tests/test_embeddings.py does not currently exercise the onnx backend branch (even if only to assert graceful fallback when optional deps/models aren’t available). Adding coverage here would help prevent regressions in the default-backend selection and fallback behavior.

Copilot uses AI. Check for mistakes.
if backend == "ml":
try:
get_sentence_transformer_model(requested_model)
Expand All @@ -99,6 +228,16 @@ def embed_text(
embedding_model: str | None,
) -> tuple[list[float], str, str | None]:
backend = embedding_backend.strip().lower()

if backend == "onnx" and embedding_model:
try:
return embed_text_onnx(text=text, model_path=embedding_model), "onnx", embedding_model
except Exception as exc:
print(
f"[warn] ONNX embedding runtime failed ({exc}). Falling back to hash embeddings.",
file=sys.stderr,
)

if backend == "ml" and embedding_model:
try:
return embed_text_ml(text=text, model_name=embedding_model), "ml", embedding_model
Expand Down
61 changes: 61 additions & 0 deletions scripts/token_reducer/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,12 +321,73 @@ def overlap_ratio(query: str, text: str) -> float:
return len(q_terms & t_terms) / float(len(q_terms))


def reciprocal_rank_fusion(
    fts_hits: list[Candidate],
    vector_hits: list[Candidate],
    k: int | None = None,
) -> list[Candidate]:
    """Combine retrieval results using Reciprocal Rank Fusion (RRF).

    Each chunk's raw RRF score is ``sum(1 / (k + rank))`` over the lists it
    appears in, with ranks starting at 1. A chunk is counted at most once per
    list (its first, i.e. best, position). The raw score is then divided by
    the best achievable score (rank 1 in both lists) so ``final_score`` lies
    in (0, 1]; the ordering is the same as for raw RRF scores.

    Args:
        fts_hits: Candidates from FTS5/BM25 retrieval (already ranked).
        vector_hits: Candidates from vector/semantic retrieval (already ranked).
        k: RRF constant. Higher values reduce the impact of top positions.
            When ``None`` or non-positive, the value configured through
            ``configure_rrf()`` is used.

    Returns:
        The merged candidates sorted by ``final_score`` descending. Candidates
        from ``fts_hits`` are mutated in place: ``final_score`` is set, and
        ``vector_rank`` / ``vector_score`` are copied from the matching vector
        hit when one exists.
    """
    if k is None or k <= 0:
        # Imported lazily so callers passing an explicit k need no config lookup.
        from .config import DEFAULT_RRF_K, get_rrf_k

        k = get_rrf_k()
        if k <= 0:
            # Last-resort guard against a misconfigured constant.
            k = DEFAULT_RRF_K

    rrf_scores: dict[int, float] = {}
    candidates: dict[int, Candidate] = {}

    # FTS5/BM25 ranks.
    for rank, candidate in enumerate(fts_hits, start=1):
        chunk_id = candidate.chunk_id
        if chunk_id in candidates:
            continue  # keep the best (first) position only
        rrf_scores[chunk_id] = 1.0 / (k + rank)
        candidates[chunk_id] = candidate

    # Vector ranks, accumulated on top of any FTS contribution.
    seen_in_vector: set[int] = set()
    for rank, candidate in enumerate(vector_hits, start=1):
        chunk_id = candidate.chunk_id
        if chunk_id in seen_in_vector:
            continue
        seen_in_vector.add(chunk_id)
        rrf_scores[chunk_id] = rrf_scores.get(chunk_id, 0.0) + 1.0 / (k + rank)

        existing = candidates.get(chunk_id)
        if existing is None:
            candidates[chunk_id] = candidate
        else:
            existing.vector_rank = candidate.vector_rank
            existing.vector_score = candidate.vector_score

    # Raw RRF scores are tiny (about 1/(k+rank) per list), far below the
    # default relevance floor of 0.15. NOTE(review): downstream compression is
    # reported to compare final_score against that floor — confirm. Scaling by
    # the best achievable score keeps the ordering and puts scores in (0, 1].
    best_possible = 2.0 / (k + 1)
    for chunk_id, candidate in candidates.items():
        candidate.final_score = rrf_scores[chunk_id] / best_possible

    return sorted(candidates.values(), key=lambda c: c.final_score, reverse=True)


def rerank_candidates(
query: str,
fts_hits: list[Candidate],
vector_hits: list[Candidate],
top_k: int,
) -> tuple[list[Candidate], list[Candidate]]:
from .config import should_use_rrf

# Use RRF if enabled, otherwise fall back to weighted scoring
if should_use_rrf() and vector_hits:
ranked = reciprocal_rank_fusion(fts_hits, vector_hits)
return ranked[:top_k], ranked
Comment on lines +369 to +388

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Action required

1. Rrf scores filtered out 🐞 Bug ≡ Correctness

With RRF enabled, reciprocal_rank_fusion() assigns Candidate.final_score to an RRF value
(~1/(k+rank)), which is always far below the default relevance_floor=0.15, so
compress_candidates() stops immediately and returns no bullets/packet content when vector
retrieval is used.
Agent Prompt
## Issue description
RRF ranking currently overwrites `Candidate.final_score` with very small reciprocal-rank values. Downstream compression uses `final_score` with a default `relevance_floor=0.15`, causing the compressor to break immediately and produce empty results when RRF is active.

## Issue Context
- RRF scores for k=60 are ~0.016 (rank=1) per system, far below the 0.15 floor.
- `compress_candidates()` assumes `final_score` is on the existing weighted-score scale.

## Fix Focus Areas
- scripts/token_reducer/retriever.py[324-429]
- scripts/token_reducer/compressor.py[232-263]
- scripts/token_reducer/config.py[17-34]

## What to change
Choose one consistent approach:
1) **Use RRF for ordering only**: compute an `rrf_score` for sorting, but still compute the existing weighted `final_score` (and overlap/fts/vector scores) before passing candidates to the compressor; or
2) **Adjust relevance floor for RRF**: when RRF is enabled, either disable the relevance-floor early-break or use a much lower floor appropriate to RRF (and ensure the floor is configurable);
3) If you keep RRF as `final_score`, **rescale/normalize** RRF scores to the same magnitude expected by `relevance_floor`.

Add/adjust a small unit-style check (if present in repo) or a lightweight assertion path to prevent returning an empty packet solely due to scoring-scale mismatch.

ⓘ Copy this prompt and use it to remediate the issue with your preferred AI generation tools


# Fallback: Original weighted scoring
merged: dict[int, Candidate] = {}

for candidate in fts_hits:
Expand Down
Loading