From 3198383f2e48fa3f11dd8480ac81b5aaffd17f8c Mon Sep 17 00:00:00 2001
From: Luigi Corsaro <5324491+drake69@users.noreply.github.com>
Date: Tue, 23 Jun 2026 18:59:51 +0200
Subject: [PATCH 1/2] feat(ui+db): counterparts grid, CategoryCorrection table,
 cat_select widget
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- CategoryCorrection table: live implicit benchmark — logga ogni correzione
  utente con source, confidence e consistency_at_correction per calcolare
  accuracy per modello senza dataset etichettato separato (AI-164)
- counterparts_page.py: griglia per-vendor con # tx, source_mode, variabilità
  categoria modale, flag human-checked; inline rule creation da riga (AI-164)
- cat_select widget: selector categoria/sottocategoria riutilizzabile
- i18n: chiavi counterparts aggiunte in IT/EN/FR/DE/ES
---
 app.py                          |   5 +
 core/_build_info.py             |   3 +
 db/models.py                    |  55 +++++++
 db/repository.py                | 188 ++++++++++++++++++++++-
 services/transaction_service.py |  18 ++-
 ui/bulk_edit_page.py            |   4 +-
 ui/counterparts_page.py         | 254 ++++++++++++++++++++++++++++++++
 ui/i18n/de.json                 |  50 ++++++-
 ui/i18n/en.json                 |  50 ++++++-
 ui/i18n/es.json                 |  50 ++++++-
 ui/i18n/fr.json                 |  50 ++++++-
 ui/i18n/it.json                 |  50 ++++++-
 ui/llm_models_page.py           |  52 +++++++
 ui/registry_page.py             |   2 +-
 ui/review_page.py               |   4 +-
 ui/rules_page.py                |   4 +-
 ui/sidebar.py                   |   1 +
 ui/widgets/cat_select.py        |  58 ++++++++
 18 files changed, 877 insertions(+), 21 deletions(-)
 create mode 100644 core/_build_info.py
 create mode 100644 ui/counterparts_page.py
 create mode 100644 ui/widgets/cat_select.py

diff --git a/app.py b/app.py
index 800f282..6672638 100644
--- a/app.py
+++ b/app.py
@@ -11,6 +11,7 @@
   📊 Budget vs Actual   – compare actual spending vs budget targets
   🔍 Review             – manual review of low-confidence items
   📏 Rules              – manage category rules (edit / delete / create)
+  🏪 Counterparts       – per-vendor stats grid with inline rule creation
   🗂️ Taxonomy           – manage categories and subcategories
   ⚙️ Settings           – locale, language, LLM backend preferences
   ✅ Checklist          – monthly tx presence per account (pivot table)
@@ -199,6 +200,10 @@
     from ui.rules_page import render_rules_page
     render_rules_page(engine)
 
+elif page == "counterparts":
+    from ui.counterparts_page import render_counterparts_page
+    render_counterparts_page(engine)
+
 elif page == "taxonomy":
     from ui.taxonomy_page import render_taxonomy_page
     render_taxonomy_page(engine)
diff --git a/core/_build_info.py b/core/_build_info.py
new file mode 100644
index 0000000..2d66aa1
--- /dev/null
+++ b/core/_build_info.py
@@ -0,0 +1,3 @@
+# Generated at build time — do not edit manually.
+BUILD_TIME = "2026-06-23 15:48"
+BUILD_VERSION = "0.1.0"
diff --git a/db/models.py b/db/models.py
index fe86cdd..094ea54 100644
--- a/db/models.py
+++ b/db/models.py
@@ -226,6 +226,7 @@ def create_tables(engine=None):
     _migrate_add_nsi_tag_mapping(engine)
     _migrate_add_category_model(engine)
     _migrate_add_llm_usage_log(engine)
+    _migrate_add_category_correction(engine)
     _migrate_set_onboarding_done_for_existing_users(engine)  # must run last
     _migrate_purge_orphan_schemas(engine)  # cleanup: remove schemas without header_sha256
 
@@ -583,6 +584,34 @@ class NsiTagMapping(Base):
     updated_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))
 
 
+class CategoryCorrection(Base):
+    """User correction log for categories that were assigned automatically.
+
+    repository.update_transaction_category() adds a row only when the category
+    or subcategory actually changes and the previous source was llm, rule or
+    history. Two quality signals are stored next to the original assignment:
+      - original_confidence: confidence recorded on the tx before the change
+      - consistency_at_correction: share (0-100) of categorized txs with the
+        same description that sat in the modal category at correction time
+    repository.get_correction_benchmark() turns these rows into a per-model
+    implicit accuracy: accuracy ≈ 1 - (corrections / total_llm_categorizations).
+    """
+    __tablename__ = "category_correction"
+
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    tx_id = Column(String(64), nullable=False, index=True)  # corrected Transaction id (no FK declared)
+    original_category = Column(String(128))     # category before the correction
+    original_subcategory = Column(String(128))  # subcategory before the correction
+    original_source = Column(String(10))        # llm | rule | history
+    original_model = Column(String(128))        # category_model at correction time
+    original_confidence = Column(String(10))    # high | medium | low
+    new_category = Column(String(128))          # category chosen by the user
+    new_subcategory = Column(String(128))       # subcategory chosen by the user
+    consistency_at_correction = Column(Float, nullable=True)  # % modal cat, same description; NULL if none
+    correction_origin = Column(String(20))      # ledger | counterparts | review | bulk_edit
+    corrected_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))  # timezone-aware UTC default
+
+
 def _migrate_add_import_job(engine) -> None:
     """Create import_job table if not present (idempotent) and add the
     AI-88 ms_* phase-timing columns when missing.
@@ -1126,6 +1155,32 @@ def _migrate_add_category_model(engine) -> None:
                 raise
 
 
+def _migrate_add_category_correction(engine) -> None:
+    """Ensure the category_correction table and its tx_id index exist (idempotent)."""
+    from sqlalchemy import text as _text
+    ddl_statements = (
+        'CREATE TABLE IF NOT EXISTS category_correction ('
+        'id INTEGER PRIMARY KEY AUTOINCREMENT, '
+        'tx_id VARCHAR(64) NOT NULL, '
+        'original_category VARCHAR(128), '
+        'original_subcategory VARCHAR(128), '
+        'original_source VARCHAR(10), '
+        'original_model VARCHAR(128), '
+        'original_confidence VARCHAR(10), '
+        'new_category VARCHAR(128), '
+        'new_subcategory VARCHAR(128), '
+        'consistency_at_correction FLOAT, '
+        'correction_origin VARCHAR(20), '
+        'corrected_at DATETIME)',
+        'CREATE INDEX IF NOT EXISTS ix_category_correction_tx_id '
+        'ON category_correction (tx_id)',
+    )
+    with engine.connect() as conn:
+        for ddl in ddl_statements:
+            conn.execute(_text(ddl))
+        conn.commit()
+
+
 def _migrate_add_llm_usage_log(engine) -> None:
     """Create llm_usage_log table if not present (idempotent)."""
     from sqlalchemy import text as _text
diff --git a/db/repository.py b/db/repository.py
index e980d54..7b131a6 100644
--- a/db/repository.py
+++ b/db/repository.py
@@ -13,6 +13,7 @@
 from core.categorizer import CategoryRule as CoreCategoryRule
 from core.schemas import DocumentSchema
 from db.models import (
+    CategoryCorrection,
     CategoryRule,
     DEFAULT_USER_SETTINGS,
     DescriptionRule,
@@ -288,7 +289,7 @@ def upsert_transaction(session: Session, tx: dict, batch_id: Optional[int] = Non
         date_accounting=tx.get("date_accounting").isoformat() if tx.get("date_accounting") and hasattr(tx["date_accounting"], "isoformat") else tx.get("date_accounting"),
         amount=amount_val,
         currency=tx.get("currency", "EUR"),
-        description=tx.get("description", ""),
+        description=(tx.get("description") or "").strip().upper() or None,
         source_file=tx.get("source_file", ""),
         doc_type=tx.get("doc_type", ""),
         account_label=tx.get("account_label", ""),
@@ -318,16 +319,57 @@ def get_existing_tx_ids(session: Session, tx_ids: list[str]) -> set[str]:
     return {row.id for row in rows}
 
 
+def _compute_consistency(session: Session, description: str) -> float | None:
+    """Share (0-100, one decimal) of categorized txs with this description in the modal category.
+
+    Returns None when no categorized transaction carries the description.
+    """
+    from collections import Counter
+    same_description = session.query(Transaction.category).filter(
+        Transaction.description == description,
+        Transaction.category.isnot(None),
+    )
+    categories = [category for (category,) in same_description.all()]
+    if not categories:
+        return None
+    # most_common(1) yields [(category, count)]; only the count is needed here.
+    top_count = Counter(categories).most_common(1)[0][1]
+    return round(top_count / len(categories) * 100, 1)
+
+
 def update_transaction_category(
     session: Session,
     tx_id: str,
     category: str,
     subcategory: str,
+    origin: str = "unknown",
 ) -> bool:
     from datetime import datetime, timezone
     tx = session.get(Transaction, tx_id)
     if tx is None:
         return False
+    old_cat = tx.category
+    old_sub = tx.subcategory
+    old_src = tx.category_source
+    old_model = tx.category_model
+    old_conf = tx.category_confidence
+    category_changed = old_cat != category or old_sub != subcategory
+    if category_changed and old_src in ("llm", "rule", "history"):
+        consistency = _compute_consistency(session, tx.description or "")
+        correction = CategoryCorrection(
+            tx_id=tx_id,
+            original_category=old_cat,
+            original_subcategory=old_sub,
+            original_source=old_src,
+            original_model=old_model,
+            original_confidence=old_conf,
+            new_category=category,
+            new_subcategory=subcategory,
+            consistency_at_correction=consistency,
+            correction_origin=origin,
+            corrected_at=datetime.now(timezone.utc),
+        )
+        session.add(correction)
     tx.category = category
     tx.subcategory = subcategory
     tx.category_confidence = "high"
@@ -339,6 +381,66 @@ def update_transaction_category(
     return True
 
 
+def get_correction_benchmark(session: Session) -> list[dict]:
+    """Build the live implicit benchmark, one entry per LLM model.
+
+    For each model the denominator is the number of transactions still carrying
+    an LLM category plus the logged corrections of that model's categories.
+
+    Each returned dict has the keys:
+      model, total_categorized, total_corrections, implicit_accuracy,
+      high_conf_errors, avg_consistency_at_error
+    """
+    from collections import Counter, defaultdict
+
+    # Transactions whose category is still the LLM's, counted per model.
+    untouched_query = session.query(Transaction.category_model).filter(
+        Transaction.category_source == "llm",
+        Transaction.category_model.isnot(None),
+    )
+    untouched_per_model: Counter = Counter(
+        model_name for (model_name,) in untouched_query.all()
+    )
+
+    # Logged corrections of LLM-assigned categories, grouped by model.
+    llm_corrections = (
+        session.query(CategoryCorrection)
+        .filter(CategoryCorrection.original_source == "llm")
+        .all()
+    )
+    by_model: dict[str, list] = defaultdict(list)
+    for correction in llm_corrections:
+        if correction.original_model:
+            by_model[correction.original_model].append(correction)
+
+    benchmark = []
+    for model in sorted(set(untouched_per_model) | set(by_model)):
+        model_corrections = by_model.get(model, [])
+        n_corrections = len(model_corrections)
+        # Corrected txs no longer have source 'llm', so add them back to the total.
+        n_categorized = untouched_per_model.get(model, 0) + n_corrections
+
+        if n_categorized > 0:
+            accuracy = round((1 - n_corrections / n_categorized) * 100, 1)
+        else:
+            accuracy = None
+
+        scores = [
+            c.consistency_at_correction
+            for c in model_corrections
+            if c.consistency_at_correction is not None
+        ]
+        benchmark.append({
+            "model": model,
+            "total_categorized": n_categorized,
+            "total_corrections": n_corrections,
+            "implicit_accuracy": accuracy,
+            "high_conf_errors": sum(c.original_confidence == "high" for c in model_corrections),
+            "avg_consistency_at_error": round(sum(scores) / len(scores), 1) if scores else None,
+        })
+    return benchmark
+
+
 def toggle_transaction_giroconto(session: Session, tx_id: str) -> tuple[bool, str]:
     """Toggle a transaction's tx_type between giroconto and expense/income.
 
@@ -655,6 +757,10 @@ def create_category_rule(
 
     Returns (rule, created) where created=False means an existing rule was updated.
     """
+    # Normalize pattern casing: contains/exact match against uppercase descriptions
+    if match_type in ("contains", "exact"):
+        pattern = pattern.upper()
+
     existing = (
         session.query(CategoryRule)
         .filter(CategoryRule.pattern == pattern, CategoryRule.match_type == match_type)
@@ -696,7 +802,8 @@ def update_category_rule(
     if rule is None:
         return False
     if pattern is not None:
-        rule.pattern = pattern
+        _mt = match_type or rule.match_type
+        rule.pattern = pattern.upper() if _mt in ("contains", "exact") else pattern
     if match_type is not None:
         rule.match_type = match_type
     if category is not None:
@@ -1908,3 +2015,80 @@ def get_adaptive_n_ctx_cap(
     # Round up to next 1024 multiple, enforce floor of 2048
     cap = max(int(math.ceil(max_upper / 1024)) * 1024, 2048)
     return cap
+
+
+def get_counterpart_stats(
+    session: Session,
+    tx_types: tuple[str, ...] = ("expense", "income"),
+) -> list[dict]:
+    """Summarise transactions per counterpart, i.e. per distinct description.
+
+    Only transactions with a non-empty description and a tx_type listed in
+    `tx_types` are considered. Counterparts keep first-seen row order.
+
+    Each returned dict has the keys:
+      description, tx_count, avg_amount, modal_category, modal_subcategory,
+      variability_pct, source_mode, human_checked
+    """
+    from collections import Counter, defaultdict
+
+    selected = session.query(
+        Transaction.description,
+        Transaction.amount,
+        Transaction.category,
+        Transaction.subcategory,
+        Transaction.category_source,
+        Transaction.validated_at,
+    ).filter(
+        Transaction.description.isnot(None),
+        Transaction.description != "",
+        Transaction.tx_type.in_(tx_types),
+    )
+
+    by_description: dict[str, list] = defaultdict(list)
+    for record in selected.all():
+        by_description[record.description].append(record)
+
+    summary = []
+    for desc, members in by_description.items():
+        n_tx = len(members)
+        total_abs = sum(abs(float(m.amount or 0)) for m in members)
+        # Modal category over non-empty categories; ("", 0) when none is set.
+        category_freq: Counter = Counter(m.category for m in members if m.category)
+        top_cat, top_cat_n = "", 0
+        if category_freq:
+            top_cat, top_cat_n = category_freq.most_common(1)[0]
+
+        # Modal subcategory among the transactions in the modal category.
+        subcategory_freq: Counter = Counter(
+            m.subcategory
+            for m in members
+            if m.category == top_cat and m.subcategory
+        )
+        top_sub = ""
+        if subcategory_freq:
+            top_sub = subcategory_freq.most_common(1)[0][0]
+
+        # A single distinct source is reported as-is, several as "mixed".
+        distinct_sources = {m.category_source for m in members if m.category_source}
+        if not distinct_sources:
+            source_mode = "unknown"
+        elif len(distinct_sources) > 1:
+            source_mode = "mixed"
+        else:
+            (source_mode,) = distinct_sources
+        summary.append(
+            {
+                "description": desc,
+                "tx_count": n_tx,
+                "avg_amount": total_abs / n_tx,
+                "modal_category": top_cat,
+                "modal_subcategory": top_sub,
+                "variability_pct": (top_cat_n / n_tx * 100) if n_tx else 0.0,
+                "source_mode": source_mode,
+                "human_checked": any(
+                    m.validated_at is not None or m.category_source == "manual" for m in members
+                ),
+            }
+        )
+    return summary
diff --git a/services/transaction_service.py b/services/transaction_service.py
index 36545b8..f297717 100644
--- a/services/transaction_service.py
+++ b/services/transaction_service.py
@@ -56,12 +56,20 @@ def get_recent_for_home(self, since_iso: str) -> list[tuple]:
                 .all()
             )
 
-    def update_category(self, tx_id: str, category: str, subcategory: str) -> bool:
+    def update_category(
+        self, tx_id: str, category: str, subcategory: str, origin: str = "unknown"
+    ) -> bool:  # origin: caller label forwarded to the repository (stored as correction_origin)
         with self._session() as s:
-            result = repository.update_transaction_category(s, tx_id, category, subcategory)
+            result = repository.update_transaction_category(
+                s, tx_id, category, subcategory, origin=origin
+            )  # may also stage a CategoryCorrection row; both are committed below
             s.commit()
             return result
 
+    def get_correction_benchmark(self) -> list[dict]:
+        with self._session() as s:  # query-only call, so no commit is issued
+            return repository.get_correction_benchmark(s)  # one dict per LLM model
+
     def update_context(self, tx_id: str, context: str | None) -> bool:
         with self._session() as s:
             result = repository.update_transaction_context(s, tx_id, context)
@@ -290,6 +298,12 @@ def update_context_bulk(self, ids: list[str], context: str | None) -> int:
             s.commit()
             return updated
 
+    def get_counterpart_stats(
+        self, tx_types: tuple[str, ...] = ("expense", "income")
+    ) -> list[dict]:  # one stats dict per distinct transaction description
+        with self._session() as s:  # query-only call, so no commit is issued
+            return repository.get_counterpart_stats(s, tx_types=tx_types)
+
     def delete_duplicate_groups(self, groups: list[list]) -> int:
         """Delete all but the first transaction in each duplicate group.
 
diff --git a/ui/bulk_edit_page.py b/ui/bulk_edit_page.py
index 8fc57d1..50f2892 100644
--- a/ui/bulk_edit_page.py
+++ b/ui/bulk_edit_page.py
@@ -327,7 +327,7 @@ def _cat_progress_cb(p: float):
             )
 
         if st.button(t("bulk_edit.apply_category_btn"), type="primary", key="bulk_cat_save"):
-            ok = tx_svc.update_category(sel.id, new_cat, new_sub)
+            ok = tx_svc.update_category(sel.id, new_cat, new_sub, origin="bulk_edit")
             if ok:
                 rule_msg = ""
                 n_similar = 0
@@ -346,7 +346,7 @@ def _cat_progress_cb(p: float):
                     similar = tx_svc.get_by_rule_pattern(sel.description, "contains")
                     for stx in similar:
                         if stx.id != sel.id:
-                            tx_svc.update_category(stx.id, new_cat, new_sub)
+                            tx_svc.update_category(stx.id, new_cat, new_sub, origin="bulk_edit")
                             n_similar += 1
                     if n_similar:
                         rule_msg += t("bulk_edit.similar_tx_updated", n=n_similar)
diff --git a/ui/counterparts_page.py b/ui/counterparts_page.py
new file mode 100644
index 0000000..dbb299e
--- /dev/null
+++ b/ui/counterparts_page.py
@@ -0,0 +1,254 @@
+"""Counterparts page — per-vendor stats grid with inline rule creation."""
+from __future__ import annotations
+
+import pandas as pd
+import streamlit as st
+
+from services.rule_service import RuleService
+from services.settings_service import SettingsService
+from services.transaction_service import TransactionService
+from support.logging import setup_logging
+from ui.i18n import t
+from ui.widgets.cat_select import build_cat_options, join_cat_sub, split_cat_sub
+
+logger = setup_logging()
+
+_SOURCE_EMOJI = {  # icon shown before the source_mode label; other values get no icon
+    "llm": "🤖",
+    "rule": "📏",
+    "manual": "✋",
+    "mixed": "🔀",  # counterpart whose transactions have more than one distinct source
+    "unknown": "❓",  # counterpart whose transactions carry no source at all
+}
+
+_VARIABILITY_WARN = 80.0  # the "low consistency" filter keeps rows with variability_pct below this
+
+
+def _build_df(stats: list[dict], cat_options: list[str]) -> pd.DataFrame:
+    """Build the grid frame from counterpart stats.
+
+    Columns are explicit so an empty `stats` still yields a frame sliceable by label.
+    """
+    columns = [
+        t("counterparts.col_counterpart"), t("counterparts.col_tx_count"),
+        t("counterparts.col_avg_amount"), t("counterparts.col_cat_sub"),
+        t("counterparts.col_source"), t("counterparts.col_variability"),
+        t("counterparts.col_checked"), "_description", "_orig_cat_sub",
+    ]
+    rows = []
+    for s in stats:
+        src, var = s["source_mode"], s["variability_pct"]
+        combined = join_cat_sub(s["modal_category"], s["modal_subcategory"])
+        rows.append((
+            s["description"], s["tx_count"], round(s["avg_amount"], 2), combined,
+            f"{_SOURCE_EMOJI.get(src, '')} {src}", f"{var:.0f}%", s["human_checked"],
+            s["description"], combined,  # hidden copies: rule pattern, edit detection
+        ))
+    return pd.DataFrame(rows, columns=columns)
+
+
+_DROPDOWN_CSS = """
+<style>
+/* ag-grid SelectboxColumn dropdown popup */
+.ag-rich-select {
+    background-color: #16213e !important;
+    border: 1px solid #53c28b !important;
+    border-radius: 4px !important;
+}
+.ag-rich-select-list {
+    background-color: #16213e !important;
+}
+.ag-rich-select-row {
+    color: #e0e0e0 !important;
+}
+.ag-rich-select-row:hover,
+.ag-rich-select-row.ag-hover {
+    background-color: #1e3a5f !important;
+    color: #ffffff !important;
+}
+.ag-rich-select-row.ag-rich-select-row-selected {
+    background-color: #0f3460 !important;
+    color: #53c28b !important;
+}
+</style>
+"""
+
+
+def render_counterparts_page(engine) -> None:
+    """Render the counterparts grid and save category edits as exact-match rules.
+
+    The save confirmation is stashed in ``st.session_state`` and shown on the next
+    run, because ``st.rerun()`` ends the current run before it could be read.
+    """
+    st.markdown(_DROPDOWN_CSS, unsafe_allow_html=True)
+    st.header(t("counterparts.title"))
+    st.caption(t("counterparts.caption"))
+    flash = st.session_state.pop("cp_flash", None)  # message left by the previous save
+    if flash:
+        st.success(flash)
+
+    tx_svc = TransactionService(engine)
+    rule_svc = RuleService(engine)
+    cfg_svc = SettingsService(engine)
+
+    taxonomy = cfg_svc.get_taxonomy()
+    cat_options = build_cat_options(taxonomy, include_empty=True)
+
+    stats = tx_svc.get_counterpart_stats()
+    if not stats:
+        st.info(t("counterparts.empty"))
+        return
+
+    # ── Filters ───────────────────────────────────────────────────────────────
+    f1, f2, f3, f4, f5 = st.columns([2, 2, 2, 1, 1])
+    with f1:
+        sort_opts = {
+            t("counterparts.sort_tx_count"): "tx_count",
+            t("counterparts.sort_avg_amount"): "avg_amount",
+            t("counterparts.sort_variability"): "variability_pct",
+            t("counterparts.sort_name"): "description",
+        }
+        sort_label = st.selectbox(
+            t("counterparts.sort_by"), list(sort_opts.keys()), key="cp_sort"
+        )
+        sort_key = sort_opts[sort_label]
+    with f2:
+        sort_asc = st.toggle(t("counterparts.sort_asc"), value=False, key="cp_sort_asc")
+    with f3:
+        source_filter_opts = {
+            t("counterparts.filter_source_all"): None,
+            t("counterparts.filter_source_rule"): "rule",
+            t("counterparts.filter_source_llm"): "llm",
+            t("counterparts.filter_source_mixed"): "mixed",
+            t("counterparts.filter_source_manual"): "manual",
+        }
+        source_label = st.selectbox(
+            t("counterparts.filter_source"), list(source_filter_opts.keys()), key="cp_filter_src"
+        )
+        filter_source = source_filter_opts[source_label]
+    with f4:
+        filter_low_var = st.toggle(
+            t("counterparts.filter_low_var"), value=False, key="cp_filter_var"
+        )
+    with f5:
+        filter_unchecked = st.toggle(
+            t("counterparts.filter_unchecked"), value=False, key="cp_filter_unc"
+        )
+
+    # ── Apply filters & sort ──────────────────────────────────────────────────
+    filtered = stats
+    if filter_source is not None:
+        filtered = [s for s in filtered if s["source_mode"] == filter_source]
+    if filter_low_var:
+        filtered = [s for s in filtered if s["variability_pct"] < _VARIABILITY_WARN]
+    if filter_unchecked:
+        filtered = [s for s in filtered if not s["human_checked"]]
+    filtered = sorted(filtered, key=lambda s: s[sort_key], reverse=not sort_asc)  # leaves `stats` untouched
+
+    df = _build_df(filtered, cat_options)
+
+    _col_counterpart = t("counterparts.col_counterpart")
+    _col_tx          = t("counterparts.col_tx_count")
+    _col_avg         = t("counterparts.col_avg_amount")
+    _col_cat_sub     = t("counterparts.col_cat_sub")
+    _col_src         = t("counterparts.col_source")
+    _col_var         = t("counterparts.col_variability")
+    _col_chk         = t("counterparts.col_checked")
+
+    display_cols = [_col_counterpart, _col_tx, _col_avg, _col_cat_sub, _col_src, _col_var, _col_chk]
+
+    column_config = {
+        _col_counterpart: st.column_config.TextColumn(
+            _col_counterpart, disabled=True, width="large"
+        ),
+        _col_tx: st.column_config.NumberColumn(
+            _col_tx, disabled=True, width="small"
+        ),
+        _col_avg: st.column_config.NumberColumn(
+            _col_avg, disabled=True, format="€ %.2f", width="small"
+        ),
+        _col_cat_sub: st.column_config.SelectboxColumn(
+            _col_cat_sub,
+            options=cat_options,
+            required=False,
+            width="large",
+        ),
+        _col_src: st.column_config.TextColumn(_col_src, disabled=True, width="small"),
+        _col_var: st.column_config.TextColumn(_col_var, disabled=True, width="small"),
+        _col_chk: st.column_config.CheckboxColumn(
+            _col_chk, disabled=True, width="small"
+        ),
+    }
+
+    st.caption(t("counterparts.grid_hint", n=len(filtered)))
+
+    edited = st.data_editor(
+        df[display_cols],
+        column_config=column_config,
+        use_container_width=True,
+        hide_index=True,
+        key="cp_editor",
+        num_rows="fixed",
+    )
+
+    # ── Detect changes ────────────────────────────────────────────────────────
+    changed_rows = []
+    for idx in range(len(df)):
+        orig = df.at[idx, "_orig_cat_sub"]
+        new_val = edited.at[idx, _col_cat_sub]
+        if new_val and new_val != orig:
+            new_cat, new_sub = split_cat_sub(new_val)
+            changed_rows.append(
+                {
+                    "description": df.at[idx, "_description"],
+                    "new_cat": new_cat,
+                    "new_sub": new_sub,
+                    "orig": orig,
+                }
+            )
+
+    if changed_rows:
+        st.info(t("counterparts.changes_pending", n=len(changed_rows)))
+
+        n_affected = sum(
+            len(tx_svc.get_by_rule_pattern(r["description"], "exact"))
+            for r in changed_rows
+        )
+        retroapply = st.checkbox(
+            t("counterparts.retroapply", n=n_affected),
+            value=True,
+            key="cp_retroapply",
+            disabled=n_affected == 0,
+        )
+
+        if st.button(t("counterparts.save_btn"), type="primary", key="cp_save"):
+            saved = 0
+            applied = 0
+            for r in changed_rows:
+                _, created = rule_svc.create_rule(
+                    pattern=r["description"],
+                    match_type="exact",
+                    category=r["new_cat"],
+                    subcategory=r["new_sub"],
+                    priority=10,
+                )
+                saved += 1
+                logger.info(
+                    f"counterparts_page: {'created' if created else 'updated'} rule"
+                    f" pattern={r['description']!r} → {r['new_cat']!r}/{r['new_sub']!r}"
+                )
+                if retroapply:
+                    txs = tx_svc.get_by_rule_pattern(r["description"], "exact")
+                    for tx in txs:
+                        tx_svc.update_category(
+                            tx.id, r["new_cat"], r["new_sub"], origin="counterparts"
+                        )
+                    applied += len(txs)
+
+            msg = t("counterparts.saved_ok", n=saved)
+            if retroapply and applied:
+                msg += " " + t("counterparts.retroapplied", n=applied)
+            st.session_state["cp_flash"] = msg  # shown after the rerun; st.success here would vanish
+            st.rerun()
+    else:
+        st.caption(t("counterparts.no_changes"))
diff --git a/ui/i18n/de.json b/ui/i18n/de.json
index f345540..934a1f8 100644
--- a/ui/i18n/de.json
+++ b/ui/i18n/de.json
@@ -984,5 +984,51 @@
   "llm_models.stats.col.s_per_tx": "s/Tx",
   "llm_models.stats.col.s_per_tx_help": "Mittlere Zeit pro Einzeltransaktion = mittlere Aufrufdauer ÷ batch_size. Leer für Single-Shot-Phasen (Classifier, Footer). Nützlich zur Schätzung der Gesamt-Importzeit.",
   "llm_models.stats.col.mean_s_help": "Mittlere Latenz eines einzelnen LLM-Aufrufs (ein Aufruf kann N Transaktionen im Batch enthalten). Pro-Transaktion siehe Spalte s/Tx.",
-  "upload.error_backend_load": "❌ **{filename}** nicht importiert — LLM-Modell konnte nicht geladen werden.\n\n{error}\n\n👉 Öffnen Sie **🤖 LLM-Modelle** und prüfen Sie die Modelldatei (Test 🧪). Bei Beschädigung erneut herunterladen."
-}
+  "upload.error_backend_load": "❌ **{filename}** nicht importiert — LLM-Modell konnte nicht geladen werden.\n\n{error}\n\n👉 Öffnen Sie **🤖 LLM-Modelle** und prüfen Sie die Modelldatei (Test 🧪). Bei Beschädigung erneut herunterladen.",
+  "nav.counterparts": "🏪 Gegenparteien",
+  "nav.counterparts.desc": "Anbieter-Statistiken mit schneller Regelerfassung",
+  "counterparts.title": "🏪 Gegenparteien",
+  "counterparts.caption": "Händler/Gegenpartei-Liste mit Kategorisierungsstatistiken. Kategorie direkt im Raster bearbeiten und speichern, um eine automatische Regel zu erstellen.",
+  "counterparts.empty": "Keine kategorisierten Transaktionen gefunden.",
+  "counterparts.col_counterpart": "Gegenpartei",
+  "counterparts.col_tx_count": "# Tx",
+  "counterparts.col_avg_amount": "Ø Betrag",
+  "counterparts.col_category": "Kategorie",
+  "counterparts.col_subcategory": "Unterkategorie",
+  "counterparts.col_source": "Quelle",
+  "counterparts.col_variability": "Konsistenz",
+  "counterparts.col_checked": "✓",
+  "counterparts.sort_by": "Sortieren nach",
+  "counterparts.sort_asc": "Aufsteigend",
+  "counterparts.sort_tx_count": "# Transaktionen",
+  "counterparts.sort_avg_amount": "Ø Betrag",
+  "counterparts.sort_variability": "Konsistenz",
+  "counterparts.sort_name": "Name",
+  "counterparts.filter_low_var": "Nur niedrige Konsistenz",
+  "counterparts.filter_unchecked": "Nur nicht geprüft",
+  "counterparts.grid_hint": "{n} Gegenparteien — Kategorie oder Unterkategorie bearbeiten, dann speichern",
+  "counterparts.changes_pending": "{n} Zeile(n) geändert — bereit als Regel(n)",
+  "counterparts.retroapply": "Auch auf {n} bestehende Transaktionen anwenden",
+  "counterparts.save_btn": "💾 Als Regeln speichern",
+  "counterparts.saved_ok": "✅ {n} Regel(n) erstellt oder aktualisiert.",
+  "counterparts.retroapplied": "{n} Transaktionen aktualisiert.",
+  "counterparts.no_changes": "Keine ausstehenden Änderungen.",
+  "counterparts.filter_source": "Quelle",
+  "counterparts.filter_source_all": "Alle",
+  "counterparts.filter_source_rule": "📏 Regel",
+  "counterparts.filter_source_llm": "🤖 LLM",
+  "counterparts.filter_source_mixed": "🔀 Gemischt",
+  "counterparts.filter_source_manual": "✋ Manuell",
+  "llm_models.benchmark.title": "📊 Implizites Benutzer-Benchmark",
+  "llm_models.benchmark.caption": "Geschätzte Genauigkeit pro Modell basierend auf Benutzerkorrekturen.",
+  "llm_models.benchmark.unavailable": "Benchmark-Tabelle noch nicht verfügbar.",
+  "llm_models.benchmark.empty": "Noch keine Korrekturen aufgezeichnet.",
+  "llm_models.benchmark.col_model": "Modell",
+  "llm_models.benchmark.col_total": "LLM-Kategorisierungen",
+  "llm_models.benchmark.col_corrections": "Benutzerkorrekturen",
+  "llm_models.benchmark.col_accuracy": "Implizite Genauigkeit",
+  "llm_models.benchmark.col_hce": "Fehler hoher Konfidenz",
+  "llm_models.benchmark.col_consistency": "Anbieter-Konsistenz (Ø)",
+  "llm_models.benchmark.note": "⚠️ Precision-when-reviewed: nicht berührte Transaktionen werden nicht gezählt.",
+  "counterparts.col_cat_sub": "Kategorie / Unterkategorie"
+}
\ No newline at end of file
diff --git a/ui/i18n/en.json b/ui/i18n/en.json
index b6699c6..066bffb 100644
--- a/ui/i18n/en.json
+++ b/ui/i18n/en.json
@@ -984,5 +984,51 @@
   "llm_models.stats.col.s_per_tx": "s/tx",
   "llm_models.stats.col.s_per_tx_help": "Mean time per single transaction = mean call duration ÷ batch_size. Blank for single-shot phases (classifier, footer) where 1 call ≠ N transactions. Useful for estimating total import time.",
   "llm_models.stats.col.mean_s_help": "Mean latency of a single LLM call (one call may contain N transactions in a batch). For per-transaction time see the s/tx column.",
-  "upload.error_backend_load": "❌ **{filename}** not imported — failed to load the LLM model.\n\n{error}\n\n👉 Open **🤖 LLM Models** and verify the model file is valid (Test 🧪). If the file is corrupted, re-download it from the Download section."
-}
+  "upload.error_backend_load": "❌ **{filename}** not imported — failed to load the LLM model.\n\n{error}\n\n👉 Open **🤖 LLM Models** and verify the model file is valid (Test 🧪). If the file is corrupted, re-download it from the Download section.",
+  "nav.counterparts": "🏪 Counterparts",
+  "nav.counterparts.desc": "Per-vendor statistics with quick rule creation",
+  "counterparts.title": "🏪 Counterparts",
+  "counterparts.caption": "Vendor/counterpart list with categorization statistics. Edit category directly in the grid and save to create or update an automatic rule.",
+  "counterparts.empty": "No categorized transactions found. Import and categorize transactions before using this page.",
+  "counterparts.col_counterpart": "Counterpart",
+  "counterparts.col_tx_count": "# Tx",
+  "counterparts.col_avg_amount": "Avg amount",
+  "counterparts.col_category": "Category",
+  "counterparts.col_subcategory": "Subcategory",
+  "counterparts.col_source": "Source",
+  "counterparts.col_variability": "Consistency",
+  "counterparts.col_checked": "✓",
+  "counterparts.sort_by": "Sort by",
+  "counterparts.sort_asc": "Ascending",
+  "counterparts.sort_tx_count": "# Transactions",
+  "counterparts.sort_avg_amount": "Avg amount",
+  "counterparts.sort_variability": "Consistency",
+  "counterparts.sort_name": "Name",
+  "counterparts.filter_low_var": "Low consistency only",
+  "counterparts.filter_unchecked": "Unchecked only",
+  "counterparts.grid_hint": "{n} counterparts — edit Category or Subcategory in the highlighted cells, then save",
+  "counterparts.changes_pending": "{n} row(s) modified — ready to become rule(s)",
+  "counterparts.retroapply": "Also apply to {n} existing transactions",
+  "counterparts.save_btn": "💾 Save as rules",
+  "counterparts.saved_ok": "✅ {n} rule(s) created or updated.",
+  "counterparts.retroapplied": "{n} transactions updated.",
+  "counterparts.no_changes": "No pending changes. Edit Category or Subcategory in a row to create a rule.",
+  "counterparts.filter_source": "Source",
+  "counterparts.filter_source_all": "All",
+  "counterparts.filter_source_rule": "📏 Rule",
+  "counterparts.filter_source_llm": "🤖 LLM",
+  "counterparts.filter_source_mixed": "🔀 Mixed",
+  "counterparts.filter_source_manual": "✋ Manual",
+  "llm_models.benchmark.title": "📊 Implicit user benchmark",
+  "llm_models.benchmark.caption": "Estimated per-model accuracy based on user category corrections. One correction = model was wrong. Only LLM-tracked categorizations included.",
+  "llm_models.benchmark.unavailable": "Benchmark table not yet available.",
+  "llm_models.benchmark.empty": "No corrections recorded yet. The benchmark fills in as the user corrects categories in Ledger, Review or Counterparts.",
+  "llm_models.benchmark.col_model": "Model",
+  "llm_models.benchmark.col_total": "LLM categorizations",
+  "llm_models.benchmark.col_corrections": "User corrections",
+  "llm_models.benchmark.col_accuracy": "Implicit accuracy",
+  "llm_models.benchmark.col_hce": "High-confidence errors",
+  "llm_models.benchmark.col_consistency": "Vendor consistency (avg)",
+  "llm_models.benchmark.note": "⚠️ Precision-when-reviewed benchmark: untouched transactions are not counted. More corrections → more reliable estimate.",
+  "counterparts.col_cat_sub": "Category / Subcategory"
+}
\ No newline at end of file
diff --git a/ui/i18n/es.json b/ui/i18n/es.json
index a0b2f3f..b30b996 100644
--- a/ui/i18n/es.json
+++ b/ui/i18n/es.json
@@ -984,5 +984,51 @@
   "llm_models.stats.col.s_per_tx": "s/tx",
   "llm_models.stats.col.s_per_tx_help": "Tiempo medio por transacción = duración media de la llamada ÷ batch_size. Vacío para fases single-shot (classifier, footer). Útil para estimar el tiempo total de un import.",
   "llm_models.stats.col.mean_s_help": "Latencia media de una sola llamada LLM (puede contener N transacciones en el lote). Para el tiempo por transacción ver la columna s/tx.",
-  "upload.error_backend_load": "❌ **{filename}** no importado — error al cargar el modelo LLM.\n\n{error}\n\n👉 Abre **🤖 Modelos LLM** y verifica el archivo del modelo (Test 🧪). Si está dañado, descárgalo de nuevo."
-}
+  "upload.error_backend_load": "❌ **{filename}** no importado — error al cargar el modelo LLM.\n\n{error}\n\n👉 Abre **🤖 Modelos LLM** y verifica el archivo del modelo (Test 🧪). Si está dañado, descárgalo de nuevo.",
+  "nav.counterparts": "🏪 Contrapartes",
+  "nav.counterparts.desc": "Estadísticas por proveedor con creación rápida de reglas",
+  "counterparts.title": "🏪 Contrapartes",
+  "counterparts.caption": "Lista de proveedores/contrapartes con estadísticas de categorización.",
+  "counterparts.empty": "No se encontraron transacciones categorizadas.",
+  "counterparts.col_counterpart": "Contraparte",
+  "counterparts.col_tx_count": "# Tx",
+  "counterparts.col_avg_amount": "Importe medio",
+  "counterparts.col_category": "Categoría",
+  "counterparts.col_subcategory": "Subcategoría",
+  "counterparts.col_source": "Fuente",
+  "counterparts.col_variability": "Consistencia",
+  "counterparts.col_checked": "✓",
+  "counterparts.sort_by": "Ordenar por",
+  "counterparts.sort_asc": "Ascendente",
+  "counterparts.sort_tx_count": "# Transacciones",
+  "counterparts.sort_avg_amount": "Importe medio",
+  "counterparts.sort_variability": "Consistencia",
+  "counterparts.sort_name": "Nombre",
+  "counterparts.filter_low_var": "Solo baja consistencia",
+  "counterparts.filter_unchecked": "Solo no validadas",
+  "counterparts.grid_hint": "{n} contrapartes — edita Categoría o Subcategoría, luego guarda",
+  "counterparts.changes_pending": "{n} fila(s) modificada(s) — lista(s) para convertirse en regla(s)",
+  "counterparts.retroapply": "Aplicar también a {n} transacciones existentes",
+  "counterparts.save_btn": "💾 Guardar como reglas",
+  "counterparts.saved_ok": "✅ {n} regla(s) creada(s) o actualizada(s).",
+  "counterparts.retroapplied": "{n} transacciones actualizadas.",
+  "counterparts.no_changes": "Sin cambios pendientes.",
+  "counterparts.filter_source": "Fuente",
+  "counterparts.filter_source_all": "Todas",
+  "counterparts.filter_source_rule": "📏 Regla",
+  "counterparts.filter_source_llm": "🤖 LLM",
+  "counterparts.filter_source_mixed": "🔀 Mixto",
+  "counterparts.filter_source_manual": "✋ Manual",
+  "llm_models.benchmark.title": "📊 Benchmark implícito de usuario",
+  "llm_models.benchmark.caption": "Precisión estimada por modelo basada en correcciones del usuario.",
+  "llm_models.benchmark.unavailable": "Tabla de benchmark no disponible aún.",
+  "llm_models.benchmark.empty": "Aún no hay correcciones registradas.",
+  "llm_models.benchmark.col_model": "Modelo",
+  "llm_models.benchmark.col_total": "Categorizaciones LLM",
+  "llm_models.benchmark.col_corrections": "Correcciones usuario",
+  "llm_models.benchmark.col_accuracy": "Precisión implícita",
+  "llm_models.benchmark.col_hce": "Errores alta confianza",
+  "llm_models.benchmark.col_consistency": "Consistencia vendor (media)",
+  "llm_models.benchmark.note": "⚠️ Benchmark precision-when-reviewed: las transacciones no tocadas no se cuentan.",
+  "counterparts.col_cat_sub": "Categoría / Subcategoría"
+}
\ No newline at end of file
diff --git a/ui/i18n/fr.json b/ui/i18n/fr.json
index e13ef94..8254e45 100644
--- a/ui/i18n/fr.json
+++ b/ui/i18n/fr.json
@@ -984,5 +984,51 @@
   "llm_models.stats.col.s_per_tx": "s/tx",
   "llm_models.stats.col.s_per_tx_help": "Temps moyen par transaction = durée moyenne de l'appel ÷ batch_size. Vide pour les phases single-shot (classifier, footer). Utile pour estimer le temps total d'un import.",
   "llm_models.stats.col.mean_s_help": "Latence moyenne d'un seul appel LLM (un appel peut contenir N transactions dans un lot). Pour le temps par transaction voir la colonne s/tx.",
-  "upload.error_backend_load": "❌ **{filename}** non importé — échec du chargement du modèle LLM.\n\n{error}\n\n👉 Ouvrez **🤖 Modèles LLM** et vérifiez la validité du fichier (Test 🧪). Si corrompu, retéléchargez-le."
-}
+  "upload.error_backend_load": "❌ **{filename}** non importé — échec du chargement du modèle LLM.\n\n{error}\n\n👉 Ouvrez **🤖 Modèles LLM** et vérifiez la validité du fichier (Test 🧪). Si corrompu, retéléchargez-le.",
+  "nav.counterparts": "🏪 Contreparties",
+  "nav.counterparts.desc": "Statistiques par fournisseur avec création rapide de règles",
+  "counterparts.title": "🏪 Contreparties",
+  "counterparts.caption": "Liste des fournisseurs/contreparties avec statistiques de catégorisation.",
+  "counterparts.empty": "Aucune transaction catégorisée trouvée.",
+  "counterparts.col_counterpart": "Contrepartie",
+  "counterparts.col_tx_count": "# Tx",
+  "counterparts.col_avg_amount": "Montant moyen",
+  "counterparts.col_category": "Catégorie",
+  "counterparts.col_subcategory": "Sous-catégorie",
+  "counterparts.col_source": "Source",
+  "counterparts.col_variability": "Cohérence",
+  "counterparts.col_checked": "✓",
+  "counterparts.sort_by": "Trier par",
+  "counterparts.sort_asc": "Croissant",
+  "counterparts.sort_tx_count": "# Transactions",
+  "counterparts.sort_avg_amount": "Montant moyen",
+  "counterparts.sort_variability": "Cohérence",
+  "counterparts.sort_name": "Nom",
+  "counterparts.filter_low_var": "Faible cohérence uniquement",
+  "counterparts.filter_unchecked": "Non validées uniquement",
+  "counterparts.grid_hint": "{n} contreparties — modifiez Catégorie ou Sous-catégorie, puis sauvegardez",
+  "counterparts.changes_pending": "{n} ligne(s) modifiée(s) — prête(s) à devenir règle(s)",
+  "counterparts.retroapply": "Appliquer aussi à {n} transactions existantes",
+  "counterparts.save_btn": "💾 Enregistrer comme règles",
+  "counterparts.saved_ok": "✅ {n} règle(s) créée(s) ou mise(s) à jour.",
+  "counterparts.retroapplied": "{n} transactions mises à jour.",
+  "counterparts.no_changes": "Aucune modification en attente.",
+  "counterparts.filter_source": "Source",
+  "counterparts.filter_source_all": "Toutes",
+  "counterparts.filter_source_rule": "📏 Règle",
+  "counterparts.filter_source_llm": "🤖 LLM",
+  "counterparts.filter_source_mixed": "🔀 Mixte",
+  "counterparts.filter_source_manual": "✋ Manuel",
+  "llm_models.benchmark.title": "📊 Benchmark utilisateur implicite",
+  "llm_models.benchmark.caption": "Précision estimée par modèle basée sur les corrections de l'utilisateur.",
+  "llm_models.benchmark.unavailable": "Table de benchmark non encore disponible.",
+  "llm_models.benchmark.empty": "Aucune correction enregistrée pour l'instant.",
+  "llm_models.benchmark.col_model": "Modèle",
+  "llm_models.benchmark.col_total": "Catégorisations LLM",
+  "llm_models.benchmark.col_corrections": "Corrections utilisateur",
+  "llm_models.benchmark.col_accuracy": "Précision implicite",
+  "llm_models.benchmark.col_hce": "Erreurs haute confiance",
+  "llm_models.benchmark.col_consistency": "Cohérence fournisseur (moy.)",
+  "llm_models.benchmark.note": "⚠️ Benchmark precision-when-reviewed: les transactions non touchées ne sont pas comptées.",
+  "counterparts.col_cat_sub": "Catégorie / Sous-catégorie"
+}
\ No newline at end of file
diff --git a/ui/i18n/it.json b/ui/i18n/it.json
index f840e28..d146d22 100644
--- a/ui/i18n/it.json
+++ b/ui/i18n/it.json
@@ -984,5 +984,51 @@
   "llm_models.stats.col.s_per_tx": "s/tx",
   "llm_models.stats.col.s_per_tx_help": "Tempo medio per singola transazione = durata media della call ÷ batch_size. Vuoto per le fasi single-shot (classifier, footer) dove 1 call ≠ N transazioni. Utile per stimare il tempo totale di un import.",
   "llm_models.stats.col.mean_s_help": "Latenza media per una singola chiamata LLM (può contenere N transazioni nel batch). Per il tempo per tx vedi la colonna s/tx.",
-  "upload.error_backend_load": "❌ **{filename}** non importato — errore nel caricamento del modello LLM.\n\n{error}\n\n👉 Apri **🤖 Modelli LLM** e verifica che il file del modello sia valido (Test 🧪). Se il file è danneggiato, ri-scaricalo dalla sezione Download."
-}
+  "upload.error_backend_load": "❌ **{filename}** non importato — errore nel caricamento del modello LLM.\n\n{error}\n\n👉 Apri **🤖 Modelli LLM** e verifica che il file del modello sia valido (Test 🧪). Se il file è danneggiato, ri-scaricalo dalla sezione Download.",
+  "nav.counterparts": "🏪 Controparti",
+  "nav.counterparts.desc": "Statistiche per controparte/vendor con creazione rapida di regole",
+  "counterparts.title": "🏪 Controparti",
+  "counterparts.caption": "Elenco vendor/controparti con statistiche di categorizzazione. Modifica la categoria direttamente nella griglia e salva per creare o aggiornare una regola automatica.",
+  "counterparts.empty": "Nessuna transazione categorizzata trovata. Importa e categorizza le transazioni prima di usare questa pagina.",
+  "counterparts.col_counterpart": "Controparte",
+  "counterparts.col_tx_count": "# Tx",
+  "counterparts.col_avg_amount": "Valore medio",
+  "counterparts.col_category": "Categoria",
+  "counterparts.col_subcategory": "Sottocategoria",
+  "counterparts.col_source": "Modalità",
+  "counterparts.col_variability": "Variabilità",
+  "counterparts.col_checked": "✓",
+  "counterparts.sort_by": "Ordina per",
+  "counterparts.sort_asc": "Crescente",
+  "counterparts.sort_tx_count": "# Transazioni",
+  "counterparts.sort_avg_amount": "Valore medio",
+  "counterparts.sort_variability": "Variabilità",
+  "counterparts.sort_name": "Nome",
+  "counterparts.filter_low_var": "Solo bassa variabilità",
+  "counterparts.filter_unchecked": "Solo non validate",
+  "counterparts.grid_hint": "{n} controparti — modifica Categoria o Sottocategoria nelle celle evidenziate, poi salva",
+  "counterparts.changes_pending": "{n} riga/righe modificata/e — pronta/e per diventare regola",
+  "counterparts.retroapply": "Applica anche alle {n} transazioni esistenti",
+  "counterparts.save_btn": "💾 Salva come regole",
+  "counterparts.saved_ok": "✅ {n} regola/e creata/e o aggiornata/e.",
+  "counterparts.retroapplied": "{n} transazioni aggiornate.",
+  "counterparts.no_changes": "Nessuna modifica in sospeso. Cambia Categoria o Sottocategoria in una riga per creare una regola.",
+  "counterparts.filter_source": "Modalità",
+  "counterparts.filter_source_all": "Tutte",
+  "counterparts.filter_source_rule": "📏 Regola",
+  "counterparts.filter_source_llm": "🤖 LLM",
+  "counterparts.filter_source_mixed": "🔀 Misto",
+  "counterparts.filter_source_manual": "✋ Manuale",
+  "llm_models.benchmark.title": "📊 Benchmark implicito utente",
+  "llm_models.benchmark.caption": "Accuratezza stimata per modello basata sulle correzioni di categoria effettuate dall'utente. Una correzione = il modello aveva sbagliato. Solo categorizzazioni LLM tracciate.",
+  "llm_models.benchmark.unavailable": "Tabella benchmark non ancora disponibile.",
+  "llm_models.benchmark.empty": "Nessuna correzione registrata. Il benchmark si popola man mano che l'utente corregge le categorie nel Ledger, Review o Controparti.",
+  "llm_models.benchmark.col_model": "Modello",
+  "llm_models.benchmark.col_total": "Categorizzazioni LLM",
+  "llm_models.benchmark.col_corrections": "Correzioni utente",
+  "llm_models.benchmark.col_accuracy": "Accuratezza implicita",
+  "llm_models.benchmark.col_hce": "Errori alta confidence",
+  "llm_models.benchmark.col_consistency": "Coerenza vendor (media)",
+  "llm_models.benchmark.note": "⚠️ Benchmark su \"precision-when-reviewed\": le transazioni mai toccate non entrano nel calcolo. Più correzioni → stima più affidabile.",
+  "counterparts.col_cat_sub": "Categoria / Sottocategoria"
+}
\ No newline at end of file
diff --git a/ui/llm_models_page.py b/ui/llm_models_page.py
index 7f92d8b..6d34eb5 100644
--- a/ui/llm_models_page.py
+++ b/ui/llm_models_page.py
@@ -619,11 +619,63 @@ def render_llm_models_page(engine) -> None:
     st.subheader(t("llm_models.operations.title"))
     _render_stats_7d(engine)
     st.divider()
+    _render_correction_benchmark(engine)
+    st.divider()
     _render_calibrate_stub()
     st.divider()
     _render_download()
 
 
+def _render_correction_benchmark(engine) -> None:
+    """Render the per-model implicit benchmark table built from user category corrections."""
+    import pandas as pd
+    from db import repository
+    from sqlalchemy.orm import sessionmaker
+
+    st.markdown(f"**{t('llm_models.benchmark.title')}**")
+    st.caption(t("llm_models.benchmark.caption"))
+
+    try:
+        _Session = sessionmaker(bind=engine, expire_on_commit=False)
+        s = _Session()
+        try:
+            rows = repository.get_correction_benchmark(s)
+        finally:
+            s.close()  # always release the session, even when the query raises
+    except Exception:  # best-effort panel: any failure degrades to the "unavailable" caption
+        st.caption(t("llm_models.benchmark.unavailable"))
+        return
+
+    if not rows:
+        st.caption(t("llm_models.benchmark.empty"))
+        return
+
+    df = pd.DataFrame(rows)  # assumes each row is a mapping with the keys renamed below — confirm
+    df.rename(columns={
+        "model": t("llm_models.benchmark.col_model"),
+        "total_categorized": t("llm_models.benchmark.col_total"),
+        "total_corrections": t("llm_models.benchmark.col_corrections"),
+        "implicit_accuracy": t("llm_models.benchmark.col_accuracy"),
+        "high_conf_errors": t("llm_models.benchmark.col_hce"),
+        "avg_consistency_at_error": t("llm_models.benchmark.col_consistency"),
+    }, inplace=True)
+
+    col_acc = t("llm_models.benchmark.col_accuracy")
+    col_cons = t("llm_models.benchmark.col_consistency")
+
+    st.dataframe(
+        df.style.format(
+            {
+                col_acc: lambda v: f"{v:.1f}%" if pd.notna(v) else "—",  # NaN-safe: pandas turns None into NaN
+                col_cons: lambda v: f"{v:.1f}%" if pd.notna(v) else "—",  # NaN-safe: pandas turns None into NaN
+            }
+        ).background_gradient(subset=[col_acc], cmap="RdYlGn", vmin=0, vmax=100),
+        use_container_width=True,
+        hide_index=True,
+    )
+    st.caption(t("llm_models.benchmark.note"))
+
+
 def _render_stats_7d(engine) -> None:
     """Aggregate llm_usage_log over the last 7 days, group by caller × model."""
     from sqlalchemy import text as _sql
diff --git a/ui/registry_page.py b/ui/registry_page.py
index 444c43a..75992ff 100644
--- a/ui/registry_page.py
+++ b/ui/registry_page.py
@@ -373,7 +373,7 @@ def render_registry_page(engine):
                         f"a «{_new_cat}». Sottocategorie valide: {', '.join(_valid_subs)}"
                     )
                     continue
-                tx_svc.update_category(tx_id, _new_cat, _new_sub)
+                tx_svc.update_category(tx_id, _new_cat, _new_sub, origin="ledger")
                 n_cat += 1
                 _desc = str(orig["Descrizione"]).strip()
                 if _desc:
diff --git a/ui/review_page.py b/ui/review_page.py
index 68719e5..b26f009 100644
--- a/ui/review_page.py
+++ b/ui/review_page.py
@@ -300,7 +300,7 @@ def render_review_page(engine):
             review_retroactive = False
 
         if st.button(t("review.apply_btn"), type="primary"):
-            ok = tx_svc.update_category(selected_tx.id, new_cat, new_sub)
+            ok = tx_svc.update_category(selected_tx.id, new_cat, new_sub, origin="review")
             if ok:
                 rule_msg = ""
                 if save_rule and selected_tx.description:
@@ -321,7 +321,7 @@ def render_review_page(engine):
                         n_similar = 0
                         for stx in similar:
                             if stx.id != selected_tx.id:
-                                tx_svc.update_category(stx.id, new_cat, new_sub)
+                                tx_svc.update_category(stx.id, new_cat, new_sub, origin="review")
                                 n_similar += 1
                         if n_similar:
                             rule_msg += f" · {n_similar} transazioni simili aggiornate."
diff --git a/ui/rules_page.py b/ui/rules_page.py
index 050404a..fd07804 100644
--- a/ui/rules_page.py
+++ b/ui/rules_page.py
@@ -200,7 +200,7 @@ def render_rules_page(engine):
                     )
                     if ok and also_fix_txs and n_affected > 0:
                         for tx in affected:
-                            tx_svc.update_category(tx.id, new_cat, new_sub)
+                            tx_svc.update_category(tx.id, new_cat, new_sub, origin="rule_apply")
                             if new_ctx:
                                 tx_svc.update_context(tx.id, new_ctx)
                     if ok:
@@ -301,7 +301,7 @@ def render_rules_page(engine):
                 logger.info(f"rules_page: updated existing rule pattern={nr_pattern!r} cat={nr_cat!r} ctx={nr_ctx!r}")
             if nr_also_apply and _nr_preview_txs:
                 for _tx in _nr_preview_txs:
-                    tx_svc.update_category(_tx.id, nr_cat, nr_sub)
+                    tx_svc.update_category(_tx.id, nr_cat, nr_sub, origin="rule_apply")
                     if nr_ctx:
                         tx_svc.update_context(_tx.id, nr_ctx)
                 logger.info(
diff --git a/ui/sidebar.py b/ui/sidebar.py
index 3c2014e..baf9def 100644
--- a/ui/sidebar.py
+++ b/ui/sidebar.py
@@ -16,6 +16,7 @@
     ("budget_vs_actual","budget_vs_actual"),
     ("review",          "review"),
     ("rules",           "rules"),
+    ("counterparts",    "counterparts"),
     ("taxonomy",        "taxonomy"),
     ("llm_models",      "llm_models"),
     ("settings",        "settings"),
diff --git a/ui/widgets/cat_select.py b/ui/widgets/cat_select.py
new file mode 100644
index 0000000..7770f70
--- /dev/null
+++ b/ui/widgets/cat_select.py
@@ -0,0 +1,58 @@
+"""Category + subcategory as a single combined string.
+
+Pattern: "Categoria / Sottocategoria" (separator " / ").
+When the category has no subcategories the string is just "Categoria".
+
+Public API:
+    build_cat_options(taxonomy, *, include_empty=False) -> list[str]
+        Full flat list of valid combined strings, suitable for SelectboxColumn.
+
+    join_cat_sub(category, subcategory) -> str
+        Build the combined string from separate fields.
+
+    split_cat_sub(value) -> (category, subcategory)
+        Parse back to separate fields. Returns ("", "") for empty/None.
+"""
+from __future__ import annotations
+
+from core.categorizer import TaxonomyConfig
+
+SEP = " / "
+
+
+def build_cat_options(taxonomy: TaxonomyConfig, *, include_empty: bool = False) -> list[str]:
+    """Return all valid category+subcategory combinations as combined strings.
+
+    For categories with subcategories: one entry per subcategory ("Cat / Sub").
+    For categories without subcategories: one "Cat" entry. include_empty=True puts "" first.
+    """
+    options: list[str] = []
+    if include_empty:
+        options.append("")  # leading blank choice, only when the caller asks for it
+    for cat in taxonomy.all_expense_categories + taxonomy.all_income_categories:  # expense first, then income
+        subs = taxonomy.valid_subcategories(cat)
+        if subs:
+            for sub in subs:
+                options.append(f"{cat}{SEP}{sub}")
+        else:
+            options.append(cat)  # no subcategories: the bare category is the option
+    return options
+
+
+def join_cat_sub(category: str | None, subcategory: str | None) -> str:
+    """Return "Cat / Sub" when both parts are non-blank, else the stripped category (maybe "")."""
+    cat = (category or "").strip()
+    sub = (subcategory or "").strip()
+    if cat and sub:
+        return f"{cat}{SEP}{sub}"
+    return cat  # a subcategory without a category is dropped
+
+
+def split_cat_sub(value: str | None) -> tuple[str, str]:
+    """Split at the first SEP into stripped (category, subcategory); ("", "") for empty/None."""
+    if not value:
+        return "", ""
+    if SEP in value:
+        cat, sub = value.split(SEP, 1)  # maxsplit=1: any later SEP stays inside the subcategory
+        return cat.strip(), sub.strip()
+    return value.strip(), ""  # no separator: the whole value is the category

From f54ac8819b84dd224efe5561b1c52c85a38dc00c Mon Sep 17 00:00:00 2001
From: Luigi Corsaro <5324491+drake69@users.noreply.github.com>
Date: Thu, 25 Jun 2026 21:37:23 +0200
Subject: [PATCH 2/2] fix(rules+counterparts): case-insensitive rule upsert and
 counterpart grouping

- store rule pattern verbatim (matching already case-insensitive at compare time)
- case-insensitive upsert dedup for contains/exact rules
- group counterparts case-insensitively, keep first-seen casing for display
- add regression tests

Fixes the 4 failing rule tests on #140.
---
 db/repository.py                      | 42 ++++++++++-----
 tests/test_repository_counterparts.py | 78 +++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_repository_counterparts.py

diff --git a/db/repository.py b/db/repository.py
index 7b131a6..177872a 100644
--- a/db/repository.py
+++ b/db/repository.py
@@ -757,15 +757,26 @@ def create_category_rule(
 
     Returns (rule, created) where created=False means an existing rule was updated.
     """
-    # Normalize pattern casing: contains/exact match against uppercase descriptions
-    if match_type in ("contains", "exact"):
-        pattern = pattern.upper()
+    # Pattern is stored verbatim (matching is case-insensitive at compare time:
+    # see categorizer.matches / get_transactions_by_rule_pattern). The upsert
+    # lookup is case-insensitive for contains/exact so "coop"/"COOP" dedup to one.
+    from sqlalchemy import func
 
-    existing = (
-        session.query(CategoryRule)
-        .filter(CategoryRule.pattern == pattern, CategoryRule.match_type == match_type)
-        .first()
-    )
+    if match_type in ("contains", "exact"):
+        existing = (
+            session.query(CategoryRule)
+            .filter(
+                func.upper(CategoryRule.pattern) == pattern.upper(),
+                CategoryRule.match_type == match_type,
+            )
+            .first()
+        )
+    else:
+        existing = (
+            session.query(CategoryRule)
+            .filter(CategoryRule.pattern == pattern, CategoryRule.match_type == match_type)
+            .first()
+        )
     if existing is not None:
         existing.category = category
         existing.subcategory = subcategory
@@ -802,8 +813,8 @@ def update_category_rule(
     if rule is None:
         return False
     if pattern is not None:
-        _mt = match_type or rule.match_type
-        rule.pattern = pattern.upper() if _mt in ("contains", "exact") else pattern
+        # Stored verbatim; rule matching is case-insensitive at compare time.
+        rule.pattern = pattern
     if match_type is not None:
         rule.match_type = match_type
     if category is not None:
@@ -2046,12 +2057,19 @@ def get_counterpart_stats(
         .all()
     )
 
+    # Group case-insensitively so "Coop"/"COOP" collapse into one counterpart,
+    # regardless of how the description casing was stored. The first-seen
+    # original casing is kept for display.
     groups: dict[str, list] = defaultdict(list)
+    display: dict[str, str] = {}
     for row in rows:
-        groups[row.description].append(row)
+        key = (row.description or "").upper()
+        groups[key].append(row)
+        display.setdefault(key, row.description)
 
     stats = []
-    for desc, txs in groups.items():
+    for key, txs in groups.items():
+        desc = display[key]
         tx_count = len(txs)
         avg_amount = sum(abs(float(t.amount or 0)) for t in txs) / tx_count
 
diff --git a/tests/test_repository_counterparts.py b/tests/test_repository_counterparts.py
new file mode 100644
index 0000000..b617a7e
--- /dev/null
+++ b/tests/test_repository_counterparts.py
@@ -0,0 +1,78 @@
+"""Case-insensitive behaviour of counterpart grouping and rule upsert.
+
+These lock the contract that storage casing no longer matters: descriptions
+that differ only by case collapse into a single counterpart, and rule patterns
+are stored verbatim while the upsert dedup is case-insensitive.
+"""
+from __future__ import annotations
+
+import pytest
+from sqlalchemy import create_engine
+
+from db.models import Base, Transaction, get_session
+from db.repository import create_category_rule, get_counterpart_stats
+
+
+@pytest.fixture
+def engine():  # fresh in-memory SQLite engine with all Base tables created
+    eng = create_engine("sqlite:///:memory:", connect_args={"check_same_thread": False})
+    Base.metadata.create_all(eng)
+    return eng
+
+
+@pytest.fixture
+def session(engine):  # yields the session produced by get_session for the test engine
+    with get_session(engine) as s:
+        yield s
+
+
+def _tx(session, *, tx_id: str, description: str, amount: float = -10.0) -> None:  # add one expense row
+    session.add(
+        Transaction(
+            id=tx_id,
+            date="2025-01-01",
+            description=description,
+            amount=amount,
+            currency="EUR",
+            tx_type="expense",
+            category="Spesa",
+            subcategory="Supermercato",
+            category_source="llm",
+            category_confidence="medium",
+            account_label="test",
+        )
+    )
+    session.flush()  # flush (not commit) so the row is visible to queries on this session
+
+
+def test_counterpart_grouping_is_case_insensitive(session):  # three casings of one description -> one group
+    _tx(session, tx_id="t1", description="Coop Roma")
+    _tx(session, tx_id="t2", description="COOP ROMA")
+    _tx(session, tx_id="t3", description="coop roma")
+
+    stats = get_counterpart_stats(session)
+
+    assert len(stats) == 1
+    group = stats[0]
+    assert group["tx_count"] == 3
+    # First-seen casing is kept for display. NOTE(review): assumes rows come back in insertion order — confirm.
+    assert group["description"] == "Coop Roma"
+
+
+def test_rule_upsert_is_case_insensitive_and_keeps_verbatim_pattern(session):  # "coop" then "COOP" -> one rule
+    rule, created = create_category_rule(
+        session, pattern="coop", match_type="contains",
+        category="Spesa", subcategory="Supermercato",
+    )
+    assert created is True
+    assert rule.pattern == "coop"
+
+    # Same pattern with different casing must update the existing rule instead of creating a duplicate.
+    rule2, created2 = create_category_rule(
+        session, pattern="COOP", match_type="contains",
+        category="Spesa", subcategory="Altro",
+    )
+    assert created2 is False
+    assert rule2.id == rule.id
+    # The stored pattern keeps the casing of the first insert; the second call's "COOP" is not written back.
+    assert rule2.pattern == "coop"