Madhan230205 · Madhan230205 · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026 · Copilot
diff --git a/requirements-optional.txt b/requirements-optional.txt
@@ -6,6 +6,11 @@ numpy>=1.26.0
 hnswlib>=0.8.0
 faiss-cpu>=1.8.0
 
+# ONNX Runtime for fast, lightweight dense embeddings (preferred over sentence-transformers)
+onnxruntime>=1.17.0
+tokenizers>=0.15.0
+huggingface_hub>=0.20.0
+
 # Tree-sitter for AST-based code chunking (semantic code boundaries)
 tree-sitter>=0.21.0
 tree-sitter-python>=0.21.0

diff --git a/scripts/token_reducer/config.py b/scripts/token_reducer/config.py
@@ -14,13 +14,24 @@
 DEFAULT_WORD_BUDGET = 350
 DEFAULT_HYBRID_MODE = "fallback"
 DEFAULT_RETRIEVAL_MODE = "compact"
-DEFAULT_EMBEDDING_BACKEND = "hash"
-DEFAULT_EMBEDDING_MODEL = "jinaai/jina-embeddings-v2-base-code"
+DEFAULT_EMBEDDING_BACKEND = "onnx"
+DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 DEFAULT_ANN_ENGINE = "hnsw"
 DEFAULT_ANN_EF_SEARCH = 160
 DEFAULT_QUERY_CACHE_TTL_SECONDS = 900
 DEFAULT_RELEVANCE_FLOOR = 0.15  # Minimum score threshold for knapsack packing
 
+# ONNX Runtime settings for fast CPU-based dense embeddings
+# Model path can be local file or HuggingFace hub model ID
+DEFAULT_ONNX_MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"
-# Model path can be local file or HuggingFace hub model ID
-DEFAULT_ONNX_MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"
+# The default model identifier is provided by DEFAULT_EMBEDDING_MODEL.
-# Model path can be local file or HuggingFace hub model ID
-DEFAULT_ONNX_MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"
+# The default model identifier is provided by DEFAULT_EMBEDDING_MODEL.
+DEFAULT_ONNX_MAX_LENGTH = 512  # Max sequence length for tokenization
+
+# Reciprocal Rank Fusion (RRF) settings
+# RRF score = sum(1 / (k + rank)) across retrieval systems
+# Higher k values reduce the impact of top positions
+DEFAULT_RRF_K = 60  # Standard RRF constant (60 is common default)
+DEFAULT_USE_RRF = True  # Enable RRF for hybrid retrieval; False uses weighted sum
+
 # Adaptive retrieval tiers — determined at runtime from indexed chunk count.
 # Small  (<  TIER_SMALL_CHUNKS)  → FTS5 only; no embeddings used for retrieval, no ANN built.
 # Medium (< TIER_MEDIUM_CHUNKS)  → FTS5 primary + hash-embedding fallback; no ANN built.
@@ -55,8 +66,11 @@
 
 _EMBEDDING_MODEL_CACHE: dict[str, object] = {}
 _EMBEDDING_VECTOR_CACHE: dict[str, list[float]] = {}
+_ONNX_SESSION_CACHE: dict[str, object] = {}  # Cache for ONNX Runtime sessions
 _SCORING_WEIGHTS: dict[str, float] = DEFAULT_SCORING_WEIGHTS.copy()
 _HASH_EMBEDDING_SKIP_VECTOR: bool = DEFAULT_HASH_EMBEDDING_SKIP_VECTOR
+_USE_RRF: bool = DEFAULT_USE_RRF
+_RRF_K: int = DEFAULT_RRF_K
 
 
 def configure_scoring_weights(weights: dict[str, float] | None = None) -> None:
@@ -94,6 +108,23 @@ def should_skip_vector_for_hash() -> bool:
     return _HASH_EMBEDDING_SKIP_VECTOR
 
 
+def should_use_rrf() -> bool:
+    """Return whether hybrid retrieval should use Reciprocal Rank Fusion.
+
+    Reads the module-level ``_USE_RRF`` flag, which is initialised from
+    ``DEFAULT_USE_RRF`` and can be changed at runtime via ``configure_rrf``.
+    """
+    return _USE_RRF
+
+
+def get_rrf_k() -> int:
+    """Return the currently configured RRF constant ``k``.
+
+    Reads the module-level ``_RRF_K`` value, which is initialised from
+    ``DEFAULT_RRF_K`` and can be changed at runtime via ``configure_rrf``.
+    """
+    return _RRF_K
+
+
+def configure_rrf(use_rrf: bool = True, k: int = 60) -> None:
+    """Configure Reciprocal Rank Fusion for hybrid retrieval.
+
+    Args:
+        use_rrf: Value stored in the module-level ``_USE_RRF`` flag.
+        k: RRF constant stored in the module-level ``_RRF_K``. Must be positive:
+            the fused score is ``1 / (k + rank)`` with ranks starting at 1, so a
+            negative ``k`` can produce a zero or negative denominator.
+
+    Raises:
+        ValueError: If ``k`` is not a positive number. Neither setting is
+            modified in that case.
+    """
+    global _USE_RRF, _RRF_K
+    # Validate before assigning so a bad call leaves the previous settings intact.
+    if k <= 0:
+        raise ValueError(f"RRF constant k must be positive, got {k!r}")
+    _USE_RRF = use_rrf
+    _RRF_K = k
+
+
 TEXT_EXTENSIONS = {
     ".md",
     ".txt",

diff --git a/scripts/token_reducer/db.py b/scripts/token_reducer/db.py
@@ -469,14 +469,15 @@ def index_file_dependencies(
     for import_path in imports:
         resolved = resolve_import_to_file(import_path, file_path, indexed_files)
         try:
-            conn.execute(
+            cur = conn.execute(
                 """
                 INSERT OR IGNORE INTO file_dependencies (source_file, target_import, resolved_file)
                 VALUES (?, ?, ?)
                 """,
                 (file_path, import_path, resolved),
             )
-            count += 1
+            if cur.rowcount > 0:
+                count += 1
         except Exception:
             continue
 

diff --git a/scripts/token_reducer/embeddings.py b/scripts/token_reducer/embeddings.py
@@ -4,9 +4,15 @@
 import sys
 from collections.abc import Sequence
 from hashlib import blake2b
+from pathlib import Path
 
 from .chunker import char_ngrams, tokenize
-from .config import _EMBEDDING_MODEL_CACHE, get_weight
+from .config import (
+    _EMBEDDING_MODEL_CACHE,
+    _ONNX_SESSION_CACHE,
+    DEFAULT_ONNX_MAX_LENGTH,
+    get_weight,
+)
 
 
 def embed_text_hash(text: str, dimensions: int) -> list[float]:
@@ -66,6 +72,118 @@ def embed_text_ml(text: str, model_name: str) -> list[float]:
     return [float(x) for x in embedding]
 
 
+# Candidate ONNX filenames tried in order: quantized int8 first (smallest/fastest),
+# then subfolder full-precision, then legacy root-level model.onnx.
+_ONNX_CANDIDATE_FILENAMES = ["onnx/model_quantized.onnx", "onnx/model.onnx", "model.onnx"]
+
+
+def get_onnx_session(model_path: str):
+    """Load, or fetch from cache, the ONNX Runtime session and tokenizer for a model.
+
+    Resolution order:
+      1. If ``huggingface_hub`` is importable, treat ``model_path`` as a Hub repo ID,
+         try each entry of ``_ONNX_CANDIDATE_FILENAMES`` in order, then download
+         ``tokenizer.json`` from the same repo.
+      2. If no model file was obtained that way, treat ``model_path`` as a local
+         directory that must contain one of the candidate files and ``tokenizer.json``.
+
+    Args:
+        model_path: HuggingFace model ID or local model directory. Also used as
+            the key into ``_ONNX_SESSION_CACHE``.
+
+    Returns:
+        A ``(session, tokenizer)`` tuple. The same tuple is returned for later
+        calls with the same ``model_path``.
+
+    Raises:
+        RuntimeError: If onnxruntime/tokenizers cannot be imported, or if no ONNX
+            model file or ``tokenizer.json`` can be located.
+    """
+    cached = _ONNX_SESSION_CACHE.get(model_path)
+    if cached is not None:
+        return cached
+
+    try:
+        import onnxruntime as ort  # type: ignore
+        from tokenizers import Tokenizer  # type: ignore
+    except Exception as exc:
+        raise RuntimeError(
+            "ONNX embedding backend requested but onnxruntime or tokenizers is not installed."
+        ) from exc
+
+    onnx_path: str | None = None
+    tokenizer_path: str | None = None
+    # Remember why the Hub route failed so later errors can chain from the real cause.
+    hf_import_err: Exception | None = None
+    hf_onnx_err: Exception | None = None
+
+    try:
+        from huggingface_hub import hf_hub_download  # type: ignore
+    except ImportError as exc:
+        hf_hub_download = None
+        hf_import_err = exc
+
+    if hf_hub_download is not None:
+        for candidate in _ONNX_CANDIDATE_FILENAMES:
+            try:
+                onnx_path = hf_hub_download(repo_id=model_path, filename=candidate)
+                break
+            except Exception as exc:
+                # Keep the most recent failure; the next candidate may still succeed.
+                hf_onnx_err = exc
+                continue
+
+        if onnx_path is not None:
+            try:
+                tokenizer_path = hf_hub_download(repo_id=model_path, filename="tokenizer.json")
+            except Exception as exc:
+                raise RuntimeError(
+                    f"Found ONNX model in HuggingFace repo '{model_path}', "
+                    "but required file 'tokenizer.json' is missing or could not be downloaded."
+                ) from exc
+
+    if onnx_path is None:
+        # Hub lookup was unavailable or unsuccessful: fall back to a local directory.
+        model_dir = Path(model_path)
+        if model_dir.exists():
+            for candidate in _ONNX_CANDIDATE_FILENAMES:
+                p = model_dir / candidate
+                if p.exists():
+                    onnx_path = str(p)
+                    break
+
+            if onnx_path is None:
+                raise RuntimeError(
+                    f"No ONNX model file found locally in '{model_path}'"
+                ) from hf_onnx_err
+
+            # Check the tokenizer up front so a missing file gives a clear message
+            # instead of an opaque failure inside Tokenizer.from_file.
+            local_tokenizer_path = model_dir / "tokenizer.json"
+            if not local_tokenizer_path.exists():
+                raise RuntimeError(
+                    f"Missing tokenizer file in local ONNX model directory '{model_path}'. "
+                    "Expected 'tokenizer.json' alongside the ONNX model files "
+                    f"({', '.join(_ONNX_CANDIDATE_FILENAMES)}). "
+                    "Provide a valid HuggingFace model ID or a local directory with the required files."
+                ) from hf_onnx_err
+
+            tokenizer_path = str(local_tokenizer_path)
+        elif hf_import_err is not None:
+            raise RuntimeError(
+                "ONNX model not found locally and huggingface_hub is not installed. "
+                "Provide a local directory path or install huggingface_hub to load a HuggingFace model ID."
+            ) from hf_import_err
+        elif hf_onnx_err is not None:
+            raise RuntimeError(
+                f"Unable to download an ONNX model file from HuggingFace repo '{model_path}', "
+                "and no local directory exists at that path."
+            ) from hf_onnx_err
+        else:
+            raise RuntimeError(
+                f"ONNX model not found at '{model_path}'. "
+                "Provide a valid HuggingFace model ID or local directory path."
+            )
+
+    session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+    tokenizer = Tokenizer.from_file(tokenizer_path)
+
+    _ONNX_SESSION_CACHE[model_path] = (session, tokenizer)
+    return session, tokenizer
+
+
+def embed_text_onnx(
+    text: str, model_path: str, max_length: int = DEFAULT_ONNX_MAX_LENGTH
+) -> list[float]:
+    """Embed ``text`` with an ONNX Runtime model and return an L2-normalised vector.
+
+    The token sequence is truncated to ``max_length`` and right-padded with zeros
+    up to ``max_length``. The first model output is mean-pooled over positions
+    using the attention mask as weights, so padded positions do not contribute.
+    ``token_type_ids`` is supplied only when the model declares that input.
+
+    Args:
+        text: Text to embed.
+        model_path: Model identifier passed to ``get_onnx_session``.
+        max_length: Fixed sequence length fed to the model.
+
+    Returns:
+        The pooled embedding as a list of Python floats; it is divided by its
+        norm whenever that norm is greater than zero.
+    """
+    import numpy as np
+
+    session, tokenizer = get_onnx_session(model_path)
+
+    encoded = tokenizer.encode(text)
+    kept = min(len(encoded.ids), max_length)
+    padding = [0] * (max_length - kept)
+
+    token_ids = encoded.ids[:kept] + padding
+    attention = [1] * kept + padding
+
+    feeds: dict[str, object] = {
+        "input_ids": np.array([token_ids], dtype=np.int64),
+        "attention_mask": np.array([attention], dtype=np.int64),
+    }
+
+    # Only feed segment ids to models whose graph actually declares that input.
+    declared_inputs = {node.name for node in session.get_inputs()}
+    if "token_type_ids" in declared_inputs:
+        if encoded.type_ids:
+            segment_ids = list(encoded.type_ids[:kept]) + padding
+        else:
+            segment_ids = [0] * max_length
+        feeds["token_type_ids"] = np.array([segment_ids], dtype=np.int64)
+
+    token_states = session.run(None, feeds)[0][0]  # first output, first batch item
+
+    # Mask-weighted mean: zero out padded rows, then divide by the real-token count.
+    weights = np.array(attention, dtype=np.float32)
+    summed = (token_states * weights[:, np.newaxis]).sum(axis=0)
+    pooled = summed / (weights.sum() + 1e-9)
+
+    length = np.linalg.norm(pooled)
+    if length > 0:
+        pooled = pooled / length
+
+    return list(map(float, pooled))
+
+
 def resolve_embedding_backend(
     requested_backend: str,
     requested_model: str,
@@ -74,6 +192,17 @@ def resolve_embedding_backend(
     if backend == "hash":
         return "hash", None
 
+    if backend == "onnx":
+        try:
+            get_onnx_session(requested_model)
+            return "onnx", requested_model
+        except Exception as exc:
+            print(
+                f"[warn] ONNX embedding backend unavailable ({exc}). Falling back to hash embeddings.",
+                file=sys.stderr,
+            )
+            return "hash", None
+
     if backend == "ml":
         try:
             get_sentence_transformer_model(requested_model)
@@ -99,6 +228,16 @@ def embed_text(
     embedding_model: str | None,
 ) -> tuple[list[float], str, str | None]:
     backend = embedding_backend.strip().lower()
+
+    if backend == "onnx" and embedding_model:
+        try:
+            return embed_text_onnx(text=text, model_path=embedding_model), "onnx", embedding_model
+        except Exception as exc:
+            print(
+                f"[warn] ONNX embedding runtime failed ({exc}). Falling back to hash embeddings.",
+                file=sys.stderr,
+            )
+
     if backend == "ml" and embedding_model:
         try:
             return embed_text_ml(text=text, model_name=embedding_model), "ml", embedding_model

diff --git a/scripts/token_reducer/retriever.py b/scripts/token_reducer/retriever.py
@@ -321,12 +321,73 @@ def overlap_ratio(query: str, text: str) -> float:
     return len(q_terms & t_terms) / float(len(q_terms))
 
 
+def reciprocal_rank_fusion(
+    fts_hits: list[Candidate],
+    vector_hits: list[Candidate],
+    k: int | None = None,
+) -> list[Candidate]:
+    """Combine two ranked candidate lists using Reciprocal Rank Fusion (RRF).
+
+    A candidate's fused score is the sum of ``1 / (k + rank)`` over every list it
+    appears in, with ranks starting at 1. Only list positions are used; the raw
+    retrieval scores do not enter the formula.
+
+    Candidate objects are updated in place: ``final_score`` is overwritten on
+    every returned candidate, and a candidate present in both lists keeps the
+    FTS object with ``vector_rank`` and ``vector_score`` copied from its vector
+    counterpart.
+
+    Args:
+        fts_hits: Candidates from FTS5/BM25 retrieval, best first.
+        vector_hits: Candidates from vector/semantic retrieval, best first.
+        k: RRF constant; higher values reduce the advantage of top positions.
+            ``None`` (the default) or a non-positive value means "use the value
+            configured through ``configure_rrf``", so runtime configuration is
+            honoured by callers that do not pass ``k`` explicitly.
+
+    Returns:
+        All distinct candidates sorted by descending fused score. Ties keep
+        first-seen order (FTS hits before vector-only hits).
+    """
+    if k is None or k <= 0:
+        from .config import get_rrf_k
+
+        k = get_rrf_k()
+
+    rrf_scores: dict[int, float] = {}
+    candidates: dict[int, Candidate] = {}
+
+    # Contribution from the FTS5/BM25 ranking.
+    for rank, candidate in enumerate(fts_hits, start=1):
+        rrf_scores[candidate.chunk_id] = 1.0 / (k + rank)
+        candidates[candidate.chunk_id] = candidate
+
+    # Contribution from the vector ranking, merged onto any existing FTS entry.
+    for rank, candidate in enumerate(vector_hits, start=1):
+        chunk_id = candidate.chunk_id
+        rrf_scores[chunk_id] = rrf_scores.get(chunk_id, 0.0) + (1.0 / (k + rank))
+
+        existing = candidates.get(chunk_id)
+        if existing is None:
+            candidates[chunk_id] = candidate
+        else:
+            existing.vector_rank = candidate.vector_rank
+            existing.vector_score = candidate.vector_score
+
+    for chunk_id, candidate in candidates.items():
+        candidate.final_score = rrf_scores[chunk_id]
+
+    # sorted() is stable, so equal scores keep their insertion order.
+    return sorted(candidates.values(), key=lambda c: c.final_score, reverse=True)
+
+
 def rerank_candidates(
     query: str,
     fts_hits: list[Candidate],
     vector_hits: list[Candidate],
     top_k: int,
 ) -> tuple[list[Candidate], list[Candidate]]:
+    from .config import should_use_rrf
+
+    # Use RRF if enabled, otherwise fall back to weighted scoring
+    if should_use_rrf() and vector_hits:
+        ranked = reciprocal_rank_fusion(fts_hits, vector_hits)
+        return ranked[:top_k], ranked
+
+    # Fallback: Original weighted scoring
     merged: dict[int, Candidate] = {}
 
     for candidate in fts_hits: