From 8c6806a131ede381783c189e40764739d0db7585 Mon Sep 17 00:00:00 2001
From: Ignazio De Santis <ignaziodesantisofficial@gmail.com>
Date: Tue, 26 May 2026 14:57:01 +0800
Subject: [PATCH] feat(benchmark): public eval benchmark, Tier-A telemetry,
 /api/benchmark-latest

Graduate EvalOps from a showcase shell to a working evaluation harness with a
public, reproducible benchmark. Implements the project's published MVP contract.

- eval engine (stdlib, src/evalops_workbench/eval/): dataset loader, SQuAD-style
  scorers (exact match, token F1, contains-gold), three extractive-QA strategies,
  runner + per-case regression diff, pinned-baseline gate, run ledger
- examples/benchmark-v1: 38-case public fixture (CC0 original prose), 26 standard
  + 12 adversarial. Candidate lifts exact match 0.00 -> 0.63 and token F1 +0.38
  while surfacing 12 per-case regressions a regression gate exists to catch
- benchmark_runner publishes a committed artifact (api/_benchmark_latest.json +
  api/_benchmark_history.json), so persistence needs no external store
- api/benchmark-latest.py serves the latest run (schema-conformed, previous_run)
- api/stats.py flipped to mode:"live" with benchmark-derived metrics
  (eval_runs, pass rate, distinct regressions caught); honest degraded fallback
- .github/workflows/benchmark.yml: weekly + on-demand reproducibility re-run that
  commits results back and validates artifact freshness
- dashboard renders the latest benchmark (variant comparison, deltas, regressions)
- project stage Researching -> Prototype

All metrics computed from committed runs; nothing simulated or seeded.
Tests: 38 unittest + 36 vitest green; next build + tsc clean.

Refs: outputs/plans/PLAN_C_PROOF_FIRST.md (Phase 2)
---
 .github/workflows/benchmark.yml               |  77 ++++
 api/_benchmark_history.json                   |  30 ++
 api/_benchmark_latest.json                    | 202 ++++++++++
 api/benchmark-latest.py                       |  76 ++++
 api/stats.py                                  | 244 +++++-------
 examples/benchmark-v1/README.md               |  63 ++++
 examples/benchmark-v1/cases.jsonl             |  38 ++
 examples/benchmark-v1/pinned-baseline.json    |   9 +
 .../archive/evalops-2026-05-26-308b0b20.json  | 202 ++++++++++
 .../benchmark-v1/results/latest-report.md     |  43 +++
 src/app/page.tsx                              | 353 +++++++++++++-----
 src/app/telemetry/page.tsx                    |  93 ++---
 src/evalops_workbench/benchmark_runner.py     | 106 ++++++
 src/evalops_workbench/eval/__init__.py        |  12 +
 src/evalops_workbench/eval/artifact.py        | 134 +++++++
 src/evalops_workbench/eval/baseline.py        |  81 ++++
 src/evalops_workbench/eval/cases.py           |  94 +++++
 src/evalops_workbench/eval/ledger.py          |  39 ++
 src/evalops_workbench/eval/normalize.py       |  34 ++
 src/evalops_workbench/eval/report.py          |  66 ++++
 src/evalops_workbench/eval/runner.py          |  95 +++++
 src/evalops_workbench/eval/scorers.py         |  60 +++
 src/evalops_workbench/eval/targets.py         | 147 ++++++++
 src/evalops_workbench/project.json            |   7 +-
 src/lib/api.ts                                |  85 ++++-
 src/lib/project.ts                            |   4 +-
 tests/test_benchmark_endpoint.py              |  75 ++++
 tests/test_eval_engine.py                     | 272 ++++++++++++++
 tests/test_stats.py                           | 214 ++++-------
 29 files changed, 2510 insertions(+), 445 deletions(-)
 create mode 100644 .github/workflows/benchmark.yml
 create mode 100644 api/_benchmark_history.json
 create mode 100644 api/_benchmark_latest.json
 create mode 100644 api/benchmark-latest.py
 create mode 100644 examples/benchmark-v1/README.md
 create mode 100644 examples/benchmark-v1/cases.jsonl
 create mode 100644 examples/benchmark-v1/pinned-baseline.json
 create mode 100644 examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json
 create mode 100644 examples/benchmark-v1/results/latest-report.md
 create mode 100644 src/evalops_workbench/benchmark_runner.py
 create mode 100644 src/evalops_workbench/eval/__init__.py
 create mode 100644 src/evalops_workbench/eval/artifact.py
 create mode 100644 src/evalops_workbench/eval/baseline.py
 create mode 100644 src/evalops_workbench/eval/cases.py
 create mode 100644 src/evalops_workbench/eval/ledger.py
 create mode 100644 src/evalops_workbench/eval/normalize.py
 create mode 100644 src/evalops_workbench/eval/report.py
 create mode 100644 src/evalops_workbench/eval/runner.py
 create mode 100644 src/evalops_workbench/eval/scorers.py
 create mode 100644 src/evalops_workbench/eval/targets.py
 create mode 100644 tests/test_benchmark_endpoint.py
 create mode 100644 tests/test_eval_engine.py

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..9326774
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,77 @@
+name: Benchmark
+
+# The benchmark is deterministic and reproducible. This workflow re-verifies it
+# weekly (and on demand), refreshes the published artifact, and commits the
+# result back to the repo, where /api/benchmark-latest and /api/stats serve it.
+# It is reproducibility verification, not synthetic daily activity.
+
+on:
+  schedule:
+    - cron: "0 6 * * 1" # Mondays 06:00 UTC
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install package
+        run: pip install -e .
+
+      - name: Run benchmark
+        run: python -m evalops_workbench.benchmark_runner
+
+      - name: Validate published artifact
+        run: |
+          python - <<'PY'
+          import json
+          from datetime import datetime, timezone
+
+          artifact = json.load(open("api/_benchmark_latest.json"))
+          assert artifact["system"] == "evalops", "wrong system"
+          assert artifact["schema_version"] == 1, "schema_version must be 1"
+          assert artifact["benchmark_type"] == "eval", "wrong benchmark_type"
+          assert artifact["metrics"], "metrics missing"
+          assert artifact["generated_at"], "generated_at missing"
+          generated = datetime.strptime(
+              artifact["generated_at"], "%Y-%m-%dT%H:%M:%SZ"
+          ).replace(tzinfo=timezone.utc)
+          age = (datetime.now(timezone.utc) - generated).total_seconds()
+          assert age < 600, f"artifact is stale ({age:.0f}s old)"
+          print(f"artifact valid; {artifact['metrics']['n_cases']} cases; age {age:.0f}s")
+          PY
+
+      - name: Commit results if changed
+        run: |
+          git config user.name "eleventh-bot"
+          git config user.email "noreply@eleventh.dev"
+          git add api/_benchmark_latest.json api/_benchmark_history.json \
+                  examples/benchmark-v1/results examples/benchmark-v1/pinned-baseline.json
+          if git diff --cached --quiet; then
+            echo "No benchmark changes to commit."
+          else
+            git commit -m "chore(benchmark): scheduled run [skip ci]"
+            git push
+          fi
+
+      - name: Live endpoint check (soft)
+        continue-on-error: true
+        run: |
+          sleep 20
+          for path in stats benchmark-latest; do
+            url="https://evalops-workbench.eleventh.dev/api/$path"
+            echo "GET $url"
+            curl -s --max-time 30 -A "Mozilla/5.0 ci" "$url" \
+              | python -c "import sys, json; d = json.load(sys.stdin); print('  ', {k: d.get(k) for k in ('system', 'mode', 'status', 'schema_version', 'benchmark_type')})" \
+              || echo "  (endpoint not reachable yet; redeploy may be in flight)"
+          done
diff --git a/api/_benchmark_history.json b/api/_benchmark_history.json
new file mode 100644
index 0000000..c08a83a
--- /dev/null
+++ b/api/_benchmark_history.json
@@ -0,0 +1,30 @@
+[
+  {
+    "run_id": "evalops-2026-05-26-308b0b20",
+    "generated_at": "2026-05-26T06:49:52Z",
+    "pass_rate": 0.631579,
+    "token_f1": 0.651316,
+    "exact_match": 0.631579,
+    "regressions": 12,
+    "regressed_ids": [
+      "adv-01",
+      "adv-02",
+      "adv-03",
+      "adv-04",
+      "adv-05",
+      "adv-06",
+      "adv-07",
+      "adv-08",
+      "adv-09",
+      "adv-10",
+      "adv-11",
+      "adv-12"
+    ],
+    "gate_verdict": "pass",
+    "variants": [
+      "first_sentence",
+      "overlap_sentence",
+      "span_extract"
+    ]
+  }
+]
diff --git a/api/_benchmark_latest.json b/api/_benchmark_latest.json
new file mode 100644
index 0000000..59472d5
--- /dev/null
+++ b/api/_benchmark_latest.json
@@ -0,0 +1,202 @@
+{
+  "system": "evalops",
+  "benchmark_type": "eval",
+  "run_id": "evalops-2026-05-26-308b0b20",
+  "fixture": "benchmark-v1",
+  "metrics": {
+    "n_cases": 38,
+    "baseline_variant": "overlap_sentence",
+    "candidate_variant": "span_extract",
+    "exact_match": 0.631579,
+    "token_f1": 0.651316,
+    "contains_gold": 0.684211,
+    "improvement_exact_match": 0.631579,
+    "improvement_token_f1": 0.377036,
+    "regressions": 12,
+    "pass_rate": 0.631579,
+    "gate_verdict": "pass"
+  },
+  "variants": [
+    {
+      "name": "first_sentence",
+      "exact_match": 0.0,
+      "token_f1": 0.27428,
+      "contains_gold": 1.0
+    },
+    {
+      "name": "overlap_sentence",
+      "exact_match": 0.0,
+      "token_f1": 0.27428,
+      "contains_gold": 1.0
+    },
+    {
+      "name": "span_extract",
+      "exact_match": 0.631579,
+      "token_f1": 0.651316,
+      "contains_gold": 0.684211
+    }
+  ],
+  "regressions": [
+    {
+      "case_id": "adv-01",
+      "metric": "token_f1",
+      "baseline": 0.153846,
+      "candidate": 0.0,
+      "delta": -0.153846,
+      "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-02",
+      "metric": "token_f1",
+      "baseline": 0.333333,
+      "candidate": 0.0,
+      "delta": -0.333333,
+      "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-03",
+      "metric": "token_f1",
+      "baseline": 0.307692,
+      "candidate": 0.0,
+      "delta": -0.307692,
+      "reason": "token_f1: candidate 0.00 below baseline 0.31 (-0.31)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-04",
+      "metric": "token_f1",
+      "baseline": 0.333333,
+      "candidate": 0.0,
+      "delta": -0.333333,
+      "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-05",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-06",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-07",
+      "metric": "token_f1",
+      "baseline": 0.153846,
+      "candidate": 0.0,
+      "delta": -0.153846,
+      "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-08",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-09",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-10",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-11",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "how-many"
+      ]
+    },
+    {
+      "case_id": "adv-12",
+      "metric": "token_f1",
+      "baseline": 0.2,
+      "candidate": 0.0,
+      "delta": -0.2,
+      "reason": "token_f1: candidate 0.00 below baseline 0.20 (-0.20)",
+      "tags": [
+        "adversarial",
+        "how-many"
+      ]
+    }
+  ],
+  "gate": {
+    "passed": true,
+    "metric": "token_f1",
+    "pinned": null,
+    "observed": 0.651316,
+    "reasons": [
+      "no pinned baseline yet; this run establishes the contract"
+    ]
+  },
+  "artifact_urls": {
+    "report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/latest-report.md",
+    "fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/cases.jsonl",
+    "run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json"
+  },
+  "schema_version": 1,
+  "generated_at": "2026-05-26T06:49:52Z",
+  "previous_run": null
+}
diff --git a/api/benchmark-latest.py b/api/benchmark-latest.py
new file mode 100644
index 0000000..299cee9
--- /dev/null
+++ b/api/benchmark-latest.py
@@ -0,0 +1,76 @@
+"""Public benchmark endpoint: the latest published evaluation run.
+
+Stdlib-only Vercel Python serverless function. Serves the committed artifact at
+``api/_benchmark_latest.json`` (written by ``evalops_workbench.benchmark_runner``
+and refreshed by the weekly cron). The artifact already conforms to the
+benchmark-latest specification in TELEMETRY_SCHEMA.md, so this endpoint reads and
+returns it directly. The contract forbids HTTP 5xx; a missing artifact yields a
+valid ``status: "pending"`` envelope.
+"""
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from http.server import BaseHTTPRequestHandler
+from pathlib import Path
+from typing import Any
+
+SYSTEM_SLUG = "evalops"
+BENCHMARK_TYPE = "eval"
+SCHEMA_VERSION = 1
+ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json"
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def _pending_payload() -> dict[str, Any]:
+    """Honest envelope for the window before the first run is published."""
+    return {
+        "system": SYSTEM_SLUG,
+        "benchmark_type": BENCHMARK_TYPE,
+        "status": "pending",
+        "run_id": None,
+        "metrics": None,
+        "schema_version": SCHEMA_VERSION,
+        "generated_at": _now_iso(),
+    }
+
+
+def build_response() -> dict[str, Any]:
+    try:
+        return json.loads(ARTIFACT_FILE.read_text(encoding="utf-8"))
+    except (FileNotFoundError, json.JSONDecodeError, OSError, ValueError):
+        return _pending_payload()
+
+
+class handler(BaseHTTPRequestHandler):
+    """Vercel Python serverless entrypoint."""
+
+    def _write_common_headers(self) -> None:
+        self.send_header("Cache-Control", "public, max-age=30, stale-while-revalidate=60")
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.send_header("Access-Control-Allow-Methods", "GET, OPTIONS")
+        self.send_header("Access-Control-Allow-Headers", "Content-Type")
+
+    def do_OPTIONS(self) -> None:  # noqa: N802 (interface contract)
+        self.send_response(204)
+        self._write_common_headers()
+        self.end_headers()
+
+    def do_GET(self) -> None:  # noqa: N802 (interface contract)
+        try:
+            payload = build_response()
+        except Exception:  # noqa: BLE001 (last resort: contract forbids 5xx)
+            payload = _pending_payload()
+        body = json.dumps(payload, separators=(",", ":")).encode("utf-8")
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self._write_common_headers()
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, fmt: str, *args: Any) -> None:  # noqa: A002, ARG002
+        return
diff --git a/api/stats.py b/api/stats.py
index 69b9754..3ad5228 100644
--- a/api/stats.py
+++ b/api/stats.py
@@ -1,50 +1,44 @@
-"""Public telemetry endpoint for the showcase deploy.
+"""Public telemetry endpoint for EvalOps Workbench (Tier A, live workload).
 
-Stdlib-only Vercel Python serverless function. Reports honest GitHub-derived
-signals about the codebase, never simulated workload metrics. The Tier B
-endpoint is consumed by the Production Telemetry panel on
-https://eleventh.dev. See:
+Stdlib-only Vercel Python serverless function. The live workload is the public
+benchmark: ``evalops_workbench.benchmark_runner`` runs weekly, persists each
+result to the repo, and this endpoint reports honest metrics derived from that
+durable history. See:
 
   https://github.com/IgnazioDS/IgnazioDS/blob/main/TELEMETRY_SCHEMA.md
+
+Every value is computed from committed run records. Nothing is simulated,
+seeded, or incremented in memory. If no run has been published the endpoint
+degrades honestly (status="degraded", zeroed metrics) and never returns 5xx.
 """
 from __future__ import annotations
 
 import json
 import os
-import re
-import time
 from datetime import datetime, timedelta, timezone
 from http.server import BaseHTTPRequestHandler
 from pathlib import Path
 from typing import Any
-from urllib.error import HTTPError, URLError
-from urllib.request import Request, urlopen
 
-# --- repo identity ---
 SYSTEM_SLUG = "evalops"
-GITHUB_OWNER = "IgnazioDS"
-GITHUB_REPO = "evalops-workbench"
-
-# --- contract constants ---
 SCHEMA_VERSION = 1
-HTTP_TIMEOUT_S = 4.0
-CACHE_TTL_S = 300  # 5 min, stays well under GitHub's 60-req/hr unauth cap
 
-# --- safety caps: never expose values larger than these ---
+ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json"
+HISTORY_FILE = Path(__file__).parent / "_benchmark_history.json"
+STATIC_FILE = Path(__file__).parent / "_telemetry_static.json"
+
+# Sanity caps: never expose values larger than these (defence against a runaway
+# history file). The benchmark publishes one run per scheduled invocation.
 SAFETY_CAPS: dict[str, int] = {
-    "commits_total": 1_000_000,
-    "commits_30d": 100_000,
-    "lines_of_code": 10_000_000,
-    "repo_stars": 1_000_000,
+    "eval_runs_total": 1_000_000,
+    "eval_runs_24h": 10_000,
+    "regressions_caught_30d": 1_000_000,
+    "experiments_tracked": 100_000,
 }
 
-GITHUB_API = "https://api.github.com"
-USER_AGENT = "eleventh-telemetry/1.0 (+https://eleventh.dev)"
-STATIC_FILE = Path(__file__).parent / "_telemetry_static.json"
 
-# Module-scope cache survives across warm Vercel invocations; cold starts pay
-# one GitHub round-trip and prime the cache for ~5min of subsequent requests.
-_cache: dict[str, Any] = {"ts": 0.0, "payload": None}
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
 
 def _cap(name: str, value: int) -> int:
@@ -52,159 +46,102 @@ def _cap(name: str, value: int) -> int:
     return min(value, cap) if cap is not None else value
 
 
-def _now_iso() -> str:
-    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-
-def _load_static() -> dict[str, Any]:
-    """Read the build-time artifact (lines_of_code, built_at). Missing fields
-    are silently treated as absent per the spec ("omit rather than estimate")."""
+def _read_json(path: Path) -> Any:
     try:
-        return json.loads(STATIC_FILE.read_text(encoding="utf-8"))
+        return json.loads(path.read_text(encoding="utf-8"))
     except (FileNotFoundError, json.JSONDecodeError, OSError, ValueError):
-        return {}
+        return None
 
 
-def _http_get(url: str) -> tuple[Any, dict[str, str]]:
-    """Stdlib HTTP GET. Returns (parsed_json, response_headers)."""
-    req = Request(
-        url,
-        headers={"User-Agent": USER_AGENT, "Accept": "application/vnd.github+json"},
-    )
-    with urlopen(req, timeout=HTTP_TIMEOUT_S) as resp:  # noqa: S310 (https only)
-        body = resp.read().decode("utf-8")
-        # Headers is a Message object; convert to plain dict for portability.
-        hdrs = {k.lower(): v for k, v in resp.getheaders()}
-    return json.loads(body), hdrs
-
+def _parse_iso(value: Any) -> datetime | None:
+    if not isinstance(value, str):
+        return None
+    try:
+        return datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+    except ValueError:
+        return None
 
-_LAST_PAGE_RE = re.compile(r'<[^>]*[?&]page=(\d+)[^>]*>;\s*rel="last"')
 
+def _within(record: dict, days: int, now: datetime) -> bool:
+    stamp = _parse_iso(record.get("generated_at"))
+    return stamp is not None and (now - stamp) <= timedelta(days=days)
 
-def _commits_count_from_link_header(link_header: str, when_no_last: int) -> int:
-    """Parse the 'last' page number from GitHub's Link header.
 
-    With per_page=1, the page count IS the total record count. When no Link
-    header is present (single page of results), fall back to ``when_no_last``.
-    """
-    match = _LAST_PAGE_RE.search(link_header or "")
-    if match:
-        return int(match.group(1))
-    return when_no_last
+def _zeroed_metrics() -> dict[str, Any]:
+    return {
+        "eval_runs_total": 0,
+        "eval_runs_24h": 0,
+        "last_pass_rate": 0.0,
+        "rolling_pass_rate_7d": 0.0,
+        "regressions_caught_30d": 0,
+        "experiments_tracked": 0,
+    }
 
 
-def _fetch_metrics() -> tuple[dict[str, Any], str | None]:
-    """Pull GitHub-derived metrics. Returns (metrics, last_commit_at)."""
-    repo, _ = _http_get(f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}")
-    repo_stars = _cap("repo_stars", int(repo.get("stargazers_count") or 0))
-    primary_language = repo.get("language") or "Unknown"
+def _metrics_from_history(history: list[dict], now: datetime) -> dict[str, Any]:
+    if not history:
+        return _zeroed_metrics()
 
-    commits_url = (
-        f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}/commits?per_page=1"
-    )
-    latest_commits, latest_hdrs = _http_get(commits_url)
-    commits_total = _cap(
-        "commits_total",
-        _commits_count_from_link_header(latest_hdrs.get("link", ""), len(latest_commits)),
-    )
-    last_commit_at: str | None = None
-    if latest_commits:
-        last_commit_at = (
-            latest_commits[0].get("commit", {}).get("author", {}).get("date")
-        )
-
-    since = (datetime.now(timezone.utc) - timedelta(days=30)).strftime(
-        "%Y-%m-%dT%H:%M:%SZ"
-    )
-    recent_url = (
-        f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}"
-        f"/commits?per_page=1&since={since}"
-    )
-    recent_commits, recent_hdrs = _http_get(recent_url)
-    commits_30d = _cap(
-        "commits_30d",
-        _commits_count_from_link_header(recent_hdrs.get("link", ""), len(recent_commits)),
+    runs_7d = [r for r in history if _within(r, 7, now)]
+    pass_rates_7d = [float(r.get("pass_rate", 0.0)) for r in runs_7d]
+    rolling_7d = (
+        round(sum(pass_rates_7d) / len(pass_rates_7d), 4)
+        if pass_rates_7d
+        else float(history[-1].get("pass_rate", 0.0))
     )
 
-    metrics: dict[str, Any] = {
-        "commits_30d": commits_30d,
-        "commits_total": commits_total,
-        "primary_language": primary_language,
-        "repo_stars": repo_stars,
+    variants: set[str] = set()
+    # Distinct regressions, not a per-run sum: re-detecting the same case each run
+    # is not catching a new regression, so the 30-day count unions case ids.
+    regressed_30d: set[str] = set()
+    for record in history:
+        variants.update(record.get("variants", []) or [])
+        if _within(record, 30, now):
+            regressed_30d.update(record.get("regressed_ids", []) or [])
+
+    return {
+        "eval_runs_total": _cap("eval_runs_total", len(history)),
+        "eval_runs_24h": _cap("eval_runs_24h", sum(1 for r in history if _within(r, 1, now))),
+        "last_pass_rate": round(float(history[-1].get("pass_rate", 0.0)), 4),
+        "rolling_pass_rate_7d": rolling_7d,
+        "regressions_caught_30d": _cap("regressions_caught_30d", len(regressed_30d)),
+        "experiments_tracked": _cap("experiments_tracked", len(variants)),
     }
-    static = _load_static()
-    loc = static.get("lines_of_code")
-    if isinstance(loc, int) and loc > 0:
-        metrics["lines_of_code"] = _cap("lines_of_code", loc)
-    return metrics, last_commit_at
-
-
-def _zeroed_metrics() -> dict[str, Any]:
-    metrics: dict[str, Any] = {
-        "commits_30d": 0,
-        "commits_total": 0,
-        "primary_language": "Unknown",
-        "repo_stars": 0,
-    }
-    static = _load_static()
-    loc = static.get("lines_of_code")
-    if isinstance(loc, int) and loc > 0:
-        metrics["lines_of_code"] = _cap("lines_of_code", loc)
-    return metrics
 
 
 def _build_response() -> dict[str, Any]:
-    """Compose the full response object. Always returns a parseable dict."""
-    now = time.time()
-    cached = _cache.get("payload")
-    if cached is not None and (now - _cache["ts"]) < CACHE_TTL_S:
-        fresh = dict(cached)
-        fresh["generated_at"] = _now_iso()
-        return fresh
-
-    static = _load_static()
-    last_deployed_at = (
-        os.environ.get("VERCEL_GIT_COMMIT_AUTHOR_DATE") or static.get("built_at")
-    )
-
-    try:
-        metrics, last_commit_at = _fetch_metrics()
+    now = datetime.now(timezone.utc)
+    static = _read_json(STATIC_FILE) or {}
+    last_deployed_at = os.environ.get("VERCEL_GIT_COMMIT_AUTHOR_DATE") or static.get("built_at")
+
+    history = _read_json(HISTORY_FILE)
+    artifact = _read_json(ARTIFACT_FILE)
+
+    if isinstance(history, list) and history:
+        metrics = _metrics_from_history(history, now)
+        last_active_at = (
+            artifact.get("generated_at") if isinstance(artifact, dict) else None
+        ) or history[-1].get("generated_at")
         status = "operational"
-    except (HTTPError, URLError, OSError, json.JSONDecodeError, ValueError, TimeoutError):
-        # Upstream unreachable. Serve last good cache if we have one,
-        # otherwise zeros. Never propagate the error.
-        if cached is not None:
-            stale = dict(cached)
-            stale["status"] = "degraded"
-            stale["generated_at"] = _now_iso()
-            return stale
+    else:
         metrics = _zeroed_metrics()
-        last_commit_at = None
+        last_active_at = None
         status = "degraded"
 
-    response: dict[str, Any] = {
+    return {
         "system": SYSTEM_SLUG,
-        "mode": "showcase",
+        "mode": "live",
         "status": status,
         "last_deployed_at": last_deployed_at,
-        "last_commit_at": last_commit_at,
+        "last_active_at": last_active_at,
         "metrics": metrics,
         "schema_version": SCHEMA_VERSION,
         "generated_at": _now_iso(),
     }
 
-    if status == "operational":
-        _cache["payload"] = response
-        _cache["ts"] = now
-    return response
-
 
 class handler(BaseHTTPRequestHandler):
-    """Vercel Python serverless entrypoint.
-
-    Vercel discovers this class by name; the runtime invokes ``do_GET`` /
-    ``do_OPTIONS`` per the BaseHTTPRequestHandler protocol.
-    """
+    """Vercel Python serverless entrypoint."""
 
     def _write_common_headers(self) -> None:
         self.send_header("Cache-Control", "public, max-age=30, stale-while-revalidate=60")
@@ -220,18 +157,17 @@ def do_OPTIONS(self) -> None:  # noqa: N802 (interface contract)
     def do_GET(self) -> None:  # noqa: N802 (interface contract)
         try:
             payload = _build_response()
-        except Exception:  # noqa: BLE001 (last-resort: contract forbids 5xx)
+        except Exception:  # noqa: BLE001 (last resort: contract forbids 5xx)
             payload = {
                 "system": SYSTEM_SLUG,
-                "mode": "showcase",
+                "mode": "live",
                 "status": "degraded",
                 "last_deployed_at": None,
-                "last_commit_at": None,
+                "last_active_at": None,
                 "metrics": _zeroed_metrics(),
                 "schema_version": SCHEMA_VERSION,
                 "generated_at": _now_iso(),
             }
-
         body = json.dumps(payload, separators=(",", ":")).encode("utf-8")
         self.send_response(200)
         self.send_header("Content-Type", "application/json")
@@ -241,4 +177,4 @@ def do_GET(self) -> None:  # noqa: N802 (interface contract)
         self.wfile.write(body)
 
     def log_message(self, fmt: str, *args: Any) -> None:  # noqa: A002, ARG002
-        return  # Suppress default access log; Vercel captures stdout/stderr.
+        return
diff --git a/examples/benchmark-v1/README.md b/examples/benchmark-v1/README.md
new file mode 100644
index 0000000..c135817
--- /dev/null
+++ b/examples/benchmark-v1/README.md
@@ -0,0 +1,63 @@
+# benchmark-v1
+
+A small, fully reproducible extractive question-answering benchmark. It is the
+workload behind EvalOps Workbench's public telemetry: a real evaluation that
+runs weekly, persists its result to this repo, and is served at
+`/api/benchmark-latest`.
+
+## What it measures
+
+Three deterministic strategies answer each question from its context passage:
+
+| Variant | Strategy |
+| --- | --- |
+| `first_sentence` | Return the opening sentence (floor). |
+| `overlap_sentence` | **Baseline.** Return the whole sentence with the most question-word overlap. Safe but blunt. |
+| `span_extract` | **Candidate.** Find that sentence, then narrow to an answer span (entity / year / number), skipping the entity the question already names. |
+
+Scores use the SQuAD convention: normalized exact match, token-overlap F1, and
+gold containment. Each prediction scores against its best-matching gold answer.
+
+The harness pins the candidate aggregate as a baseline contract
+(`pinned-baseline.json`) and blocks if a future run drops below it, and it
+surfaces every per-case regression where the candidate scored worse than the
+baseline.
+
+## Why this design
+
+The candidate lifts exact match and token F1 substantially by returning the
+answer span instead of the whole sentence, but it regresses on an adversarial
+pack where a distractor entity precedes the answer. That trade-off (a change
+that improves the aggregate while silently degrading specific cases) is exactly
+what a regression-tracking harness exists to make visible.
+
+The system-under-test is deterministic on purpose: anyone can reproduce the
+published numbers with no credentials, no network, and no cost. The harness is
+model-agnostic; a live-LLM target would implement the same interface and is an
+optional extension, never required by this benchmark.
+
+## Contents
+
+- `cases.jsonl` — 38 labelled cases. 26 standard (who / where / when / how-many
+  / what) and 12 adversarial, tagged in each row.
+- `pinned-baseline.json` — the pinned candidate-aggregate contract.
+- `results/latest-report.md` — human-readable report of the most recent run.
+- `results/archive/<run_id>.json` — every published run, by id.
+
+## Provenance and license
+
+Every passage is original prose written for this benchmark, describing
+well-known public-domain facts (geography, science, history). No third-party
+dataset text is included, so there are no upstream licensing constraints. The
+fixture is dedicated to the public domain (CC0).
+
+## Reproduce
+
+```bash
+git clone https://github.com/IgnazioDS/evalops-workbench
+cd evalops-workbench && pip install -e .
+python -m evalops_workbench.benchmark_runner
+```
+
+The run writes `api/_benchmark_latest.json` (served at `/api/benchmark-latest`),
+appends to `api/_benchmark_history.json`, and refreshes the report above.
diff --git a/examples/benchmark-v1/cases.jsonl b/examples/benchmark-v1/cases.jsonl
new file mode 100644
index 0000000..dd09b6c
--- /dev/null
+++ b/examples/benchmark-v1/cases.jsonl
@@ -0,0 +1,38 @@
+{"id": "who-01", "question": "Who developed the theory of general relativity?", "context": "The theory of general relativity was developed by Albert Einstein. It reshaped modern physics.", "answers": ["Albert Einstein"], "tags": ["standard", "who"]}
+{"id": "who-02", "question": "Who discovered radium?", "context": "Radium was discovered by Marie Curie in 1898. The discovery later earned a Nobel Prize.", "answers": ["Marie Curie"], "tags": ["standard", "who"]}
+{"id": "who-03", "question": "Who wrote the play Hamlet?", "context": "The play Hamlet was written by William Shakespeare. It remains widely performed today.", "answers": ["William Shakespeare"], "tags": ["standard", "who"]}
+{"id": "who-04", "question": "Who painted the ceiling of the Sistine Chapel?", "context": "The ceiling of the Sistine Chapel was painted by Michelangelo. The work took roughly four years.", "answers": ["Michelangelo"], "tags": ["standard", "who"]}
+{"id": "who-05", "question": "Who formulated the laws of motion?", "context": "The laws of motion were formulated by Isaac Newton. They underpin classical mechanics.", "answers": ["Isaac Newton"], "tags": ["standard", "who"]}
+{"id": "who-06", "question": "Who proposed the theory of evolution by natural selection?", "context": "The theory of evolution by natural selection was proposed by Charles Darwin. It transformed biology.", "answers": ["Charles Darwin"], "tags": ["standard", "who"]}
+{"id": "where-01", "question": "Where is the Eiffel Tower located?", "context": "The Eiffel Tower is located in Paris. It opened to the public in 1889.", "answers": ["Paris"], "tags": ["standard", "where"]}
+{"id": "where-02", "question": "Where is Mount Everest located?", "context": "Mount Everest is located on the border of Nepal. It is the tallest mountain above sea level.", "answers": ["Nepal"], "tags": ["standard", "where"]}
+{"id": "where-03", "question": "Where is the Colosseum located?", "context": "The Colosseum is located in Rome. It was completed around the year 80.", "answers": ["Rome"], "tags": ["standard", "where"]}
+{"id": "where-04", "question": "Where is the Statue of Liberty located?", "context": "The Statue of Liberty is located in New York. It was a gift from the people of France.", "answers": ["New York"], "tags": ["standard", "where"]}
+{"id": "where-05", "question": "Where were the first modern Olympic Games held?", "context": "The first modern Olympic Games were held in Athens. The event revived an ancient tradition.", "answers": ["Athens"], "tags": ["standard", "where"]}
+{"id": "where-06", "question": "Where is the Great Barrier Reef located?", "context": "The Great Barrier Reef is located off the coast of Australia. It is the largest coral reef system on Earth.", "answers": ["Australia"], "tags": ["standard", "where"]}
+{"id": "when-01", "question": "When was the Declaration of Independence adopted?", "context": "The Declaration of Independence was adopted in 1776. It announced a formal separation.", "answers": ["1776"], "tags": ["standard", "when"]}
+{"id": "when-02", "question": "When did the Berlin Wall fall?", "context": "The Berlin Wall fell in 1989. Its collapse marked a turning point in European history.", "answers": ["1989"], "tags": ["standard", "when"]}
+{"id": "when-03", "question": "When was the telephone patented?", "context": "The telephone was patented in 1876 by Alexander Graham Bell. It changed daily communication.", "answers": ["1876"], "tags": ["standard", "when"]}
+{"id": "when-04", "question": "When did the first manned Moon landing occur?", "context": "The first manned Moon landing occurred in 1969. Astronauts walked on the lunar surface.", "answers": ["1969"], "tags": ["standard", "when"]}
+{"id": "when-05", "question": "When was the Eiffel Tower completed?", "context": "The Eiffel Tower was completed in 1889 for a world exhibition. It soon became a landmark.", "answers": ["1889"], "tags": ["standard", "when"]}
+{"id": "when-06", "question": "When was the Magna Carta sealed?", "context": "The Magna Carta was sealed in 1215. It limited the power of the crown.", "answers": ["1215"], "tags": ["standard", "when"]}
+{"id": "howmany-01", "question": "How many continents are there on Earth?", "context": "There are 7 continents on Earth. They vary greatly in size and population.", "answers": ["7"], "tags": ["standard", "how-many"]}
+{"id": "howmany-02", "question": "How many players does a soccer team field at once?", "context": "A soccer team fields 11 players at a time. Substitutions are permitted during play.", "answers": ["11"], "tags": ["standard", "how-many"]}
+{"id": "howmany-03", "question": "How many strings does a standard guitar have?", "context": "A standard guitar has 6 strings. Variants exist with more strings.", "answers": ["6"], "tags": ["standard", "how-many"]}
+{"id": "howmany-04", "question": "How many keys are on a standard piano?", "context": "A standard piano has 88 keys. The layout repeats across several octaves.", "answers": ["88"], "tags": ["standard", "how-many"]}
+{"id": "howmany-05", "question": "How many moons does Mars have?", "context": "Mars has 2 moons, named Phobos and Deimos. Both are small and irregular in shape.", "answers": ["2"], "tags": ["standard", "how-many"]}
+{"id": "howmany-06", "question": "How many sides does a hexagon have?", "context": "A hexagon has 6 sides. It tiles a plane without gaps.", "answers": ["6"], "tags": ["standard", "how-many"]}
+{"id": "what-01", "question": "What gas do plants absorb during photosynthesis?", "context": "During photosynthesis, plants absorb carbon dioxide. They release oxygen as a byproduct.", "answers": ["carbon dioxide"], "tags": ["standard", "what"]}
+{"id": "what-02", "question": "What is the largest planet in the Solar System?", "context": "The largest planet in the Solar System is Jupiter. It is a gas giant with many moons.", "answers": ["Jupiter"], "tags": ["standard", "what"]}
+{"id": "adv-01", "question": "Who composed the Ninth Symphony?", "context": "The Ninth Symphony was first performed in Vienna and is attributed to Beethoven. Critics praised its choral finale.", "answers": ["Beethoven"], "tags": ["adversarial", "who"]}
+{"id": "adv-02", "question": "Who discovered penicillin?", "context": "The substance penicillin, studied in London, was identified by Alexander Fleming. The finding launched modern antibiotics.", "answers": ["Alexander Fleming"], "tags": ["adversarial", "who"]}
+{"id": "adv-03", "question": "Who wrote the novel Don Quixote?", "context": "The novel Don Quixote, set in Spain, was written by Miguel Cervantes. It is a foundational work of fiction.", "answers": ["Miguel Cervantes"], "tags": ["adversarial", "who"]}
+{"id": "adv-04", "question": "Who directed the film Psycho?", "context": "The film Psycho, shot in Hollywood, was directed by Alfred Hitchcock. It became a defining thriller.", "answers": ["Alfred Hitchcock"], "tags": ["adversarial", "who"]}
+{"id": "adv-05", "question": "Where is the Louvre museum located?", "context": "The Louvre museum, which displays works from Italy, is located in Paris. Millions of people visit it each year.", "answers": ["Paris"], "tags": ["adversarial", "where"]}
+{"id": "adv-06", "question": "Where was Napoleon born?", "context": "Long associated with France, Napoleon was born on the island of Corsica. He later became emperor.", "answers": ["Corsica"], "tags": ["adversarial", "where"]}
+{"id": "adv-07", "question": "Where are the company headquarters located?", "context": "The company, which began operations in Germany, has its headquarters located in Tokyo. It employs thousands of people.", "answers": ["Tokyo"], "tags": ["adversarial", "where"]}
+{"id": "adv-08", "question": "When did Ford introduce the Model T?", "context": "Ford expanded production in 1913 and introduced the Model T in 1908. The car was affordable for many families.", "answers": ["1908"], "tags": ["adversarial", "when"]}
+{"id": "adv-09", "question": "When did the museum first open?", "context": "The museum was renovated in 2001 after first opening in 1937. Its collection spans several centuries.", "answers": ["1937"], "tags": ["adversarial", "when"]}
+{"id": "adv-10", "question": "When was the bridge completed?", "context": "The bridge began construction in 1933 and was completed in 1937. It spans a wide strait.", "answers": ["1937"], "tags": ["adversarial", "when"]}
+{"id": "adv-11", "question": "How many gold medals did the athlete win?", "context": "Competing in 24 events overall, the athlete won 8 gold medals. The tally set a national record.", "answers": ["8"], "tags": ["adversarial", "how-many"]}
+{"id": "adv-12", "question": "How many championships did the team win?", "context": "Across 30 seasons in the league, the team won 5 championships. Supporters celebrated every title.", "answers": ["5"], "tags": ["adversarial", "how-many"]}
diff --git a/examples/benchmark-v1/pinned-baseline.json b/examples/benchmark-v1/pinned-baseline.json
new file mode 100644
index 0000000..71fea75
--- /dev/null
+++ b/examples/benchmark-v1/pinned-baseline.json
@@ -0,0 +1,9 @@
+{
+  "variant": "span_extract",
+  "metric": "token_f1",
+  "aggregate": {
+    "exact_match": 0.631579,
+    "token_f1": 0.651316,
+    "contains_gold": 0.684211
+  }
+}
diff --git a/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json b/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json
new file mode 100644
index 0000000..59472d5
--- /dev/null
+++ b/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json
@@ -0,0 +1,202 @@
+{
+  "system": "evalops",
+  "benchmark_type": "eval",
+  "run_id": "evalops-2026-05-26-308b0b20",
+  "fixture": "benchmark-v1",
+  "metrics": {
+    "n_cases": 38,
+    "baseline_variant": "overlap_sentence",
+    "candidate_variant": "span_extract",
+    "exact_match": 0.631579,
+    "token_f1": 0.651316,
+    "contains_gold": 0.684211,
+    "improvement_exact_match": 0.631579,
+    "improvement_token_f1": 0.377036,
+    "regressions": 12,
+    "pass_rate": 0.631579,
+    "gate_verdict": "pass"
+  },
+  "variants": [
+    {
+      "name": "first_sentence",
+      "exact_match": 0.0,
+      "token_f1": 0.27428,
+      "contains_gold": 1.0
+    },
+    {
+      "name": "overlap_sentence",
+      "exact_match": 0.0,
+      "token_f1": 0.27428,
+      "contains_gold": 1.0
+    },
+    {
+      "name": "span_extract",
+      "exact_match": 0.631579,
+      "token_f1": 0.651316,
+      "contains_gold": 0.684211
+    }
+  ],
+  "regressions": [
+    {
+      "case_id": "adv-01",
+      "metric": "token_f1",
+      "baseline": 0.153846,
+      "candidate": 0.0,
+      "delta": -0.153846,
+      "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-02",
+      "metric": "token_f1",
+      "baseline": 0.333333,
+      "candidate": 0.0,
+      "delta": -0.333333,
+      "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-03",
+      "metric": "token_f1",
+      "baseline": 0.307692,
+      "candidate": 0.0,
+      "delta": -0.307692,
+      "reason": "token_f1: candidate 0.00 below baseline 0.31 (-0.31)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-04",
+      "metric": "token_f1",
+      "baseline": 0.333333,
+      "candidate": 0.0,
+      "delta": -0.333333,
+      "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-05",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-06",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-07",
+      "metric": "token_f1",
+      "baseline": 0.153846,
+      "candidate": 0.0,
+      "delta": -0.153846,
+      "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-08",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-09",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-10",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-11",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "how-many"
+      ]
+    },
+    {
+      "case_id": "adv-12",
+      "metric": "token_f1",
+      "baseline": 0.2,
+      "candidate": 0.0,
+      "delta": -0.2,
+      "reason": "token_f1: candidate 0.00 below baseline 0.20 (-0.20)",
+      "tags": [
+        "adversarial",
+        "how-many"
+      ]
+    }
+  ],
+  "gate": {
+    "passed": true,
+    "metric": "token_f1",
+    "pinned": null,
+    "observed": 0.651316,
+    "reasons": [
+      "no pinned baseline yet; this run establishes the contract"
+    ]
+  },
+  "artifact_urls": {
+    "report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/latest-report.md",
+    "fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/cases.jsonl",
+    "run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json"
+  },
+  "schema_version": 1,
+  "generated_at": "2026-05-26T06:49:52Z",
+  "previous_run": null
+}
diff --git a/examples/benchmark-v1/results/latest-report.md b/examples/benchmark-v1/results/latest-report.md
new file mode 100644
index 0000000..8c3103a
--- /dev/null
+++ b/examples/benchmark-v1/results/latest-report.md
@@ -0,0 +1,43 @@
+# EvalOps benchmark: benchmark-v1
+
+- Run: `evalops-2026-05-26-308b0b20`
+- Generated: 2026-05-26T06:49:52Z
+- Cases: 38
+- Gate verdict: **PASS**
+
+## Variants
+
+| Variant | Exact match | Token F1 | Contains gold |
+| --- | --- | --- | --- |
+| first_sentence | 0.000 | 0.274 | 1.000 |
+| overlap_sentence (baseline) | 0.000 | 0.274 | 1.000 |
+| span_extract (candidate) | 0.632 | 0.651 | 0.684 |
+
+Candidate against baseline: exact match +0.632, token F1 +0.377.
+
+## Regressions (12)
+
+| Case | Baseline | Candidate | Delta | Reason |
+| --- | --- | --- | --- | --- |
+| adv-01 | 0.15 | 0.00 | -0.15 | token_f1: candidate 0.00 below baseline 0.15 (-0.15) |
+| adv-02 | 0.33 | 0.00 | -0.33 | token_f1: candidate 0.00 below baseline 0.33 (-0.33) |
+| adv-03 | 0.31 | 0.00 | -0.31 | token_f1: candidate 0.00 below baseline 0.31 (-0.31) |
+| adv-04 | 0.33 | 0.00 | -0.33 | token_f1: candidate 0.00 below baseline 0.33 (-0.33) |
+| adv-05 | 0.17 | 0.00 | -0.17 | token_f1: candidate 0.00 below baseline 0.17 (-0.17) |
+| adv-06 | 0.17 | 0.00 | -0.17 | token_f1: candidate 0.00 below baseline 0.17 (-0.17) |
+| adv-07 | 0.15 | 0.00 | -0.15 | token_f1: candidate 0.00 below baseline 0.15 (-0.15) |
+| adv-08 | 0.17 | 0.00 | -0.17 | token_f1: candidate 0.00 below baseline 0.17 (-0.17) |
+| adv-09 | 0.18 | 0.00 | -0.18 | token_f1: candidate 0.00 below baseline 0.18 (-0.18) |
+| adv-10 | 0.18 | 0.00 | -0.18 | token_f1: candidate 0.00 below baseline 0.18 (-0.18) |
+| adv-11 | 0.18 | 0.00 | -0.18 | token_f1: candidate 0.00 below baseline 0.18 (-0.18) |
+| adv-12 | 0.20 | 0.00 | -0.20 | token_f1: candidate 0.00 below baseline 0.20 (-0.20) |
+
+## Reproduce
+
+```bash
+git clone https://github.com/IgnazioDS/evalops-workbench
+cd evalops-workbench && pip install -e .
+python -m evalops_workbench.benchmark_runner
+```
+
+Fixture: [`benchmark-v1`](https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/cases.jsonl). Every case, label, and score is reproducible offline with no credentials.
diff --git a/src/app/page.tsx b/src/app/page.tsx
index 523db47..0f02530 100644
--- a/src/app/page.tsx
+++ b/src/app/page.tsx
@@ -4,57 +4,59 @@ import { useEffect, useState } from "react";
 import {
   ArrowRight,
   ExternalLink,
-  GitCommit,
+  FileText,
+  FlaskConical,
+  GitCompare,
   Github,
-  Lightbulb,
-  Star,
-  TrendingUp,
+  ShieldAlert,
+  Target,
   Users,
 } from "lucide-react";
-import { fetchPublicStats, type PublicStats } from "@/lib/api";
+import {
+  fetchBenchmarkLatest,
+  fetchPublicStats,
+  type PublicBenchmark,
+  type PublicStats,
+} from "@/lib/api";
 import { TopBar } from "@/components/layout/TopBar";
 import { Card, CardHeader, CardTitle, CardContent, CardDescription } from "@/components/ui/card";
 import { Button } from "@/components/ui/button";
 import { Badge } from "@/components/ui/badge";
 import { StatusDot } from "@/components/ui/status-dot";
-import { StatCard } from "@/components/dashboard/StatCard";
 import { Skeleton } from "@/components/ui/skeleton";
-import { Sparkline } from "@/components/ui/sparkline";
 import { PROJECT } from "@/lib/project";
-import { formatRelative } from "@/lib/utils";
+import { formatNumber, formatRelative } from "@/lib/utils";
 
-/**
- * Build a deterministic 10-point shape derived from the live value, so
- * StatCard sparklines convey velocity without claiming a measured history
- * the showcase tier doesn't have.
- */
-function shapeFromValue(target: number, points = 10): number[] {
-  if (target <= 0) return Array(points).fill(0);
-  const result: number[] = [];
-  for (let i = 0; i < points; i++) {
-    const ratio = i / (points - 1);
-    const eased = ratio * ratio;
-    const wobble = Math.sin(i + target) * 0.06;
-    result.push(target * (eased + wobble + 0.1));
-  }
-  return result;
+function pct(value: number | null | undefined): string { // 0–1 ratio -> whole percent, e.g. 0.63 -> "63%"
+  if (typeof value !== "number" || Number.isNaN(value)) return "—"; // null / missing / NaN must not read as "0%"
+  return `${Math.round(value * 100)}%`;
+}
+
+function signedPoints(value: number): string { // 0–1 delta -> signed percentage points, e.g. 0.38 -> "+38 pts"
+  const points = Math.round(value * 100);
+  return Number.isFinite(points) ? `${points >= 0 ? "+" : ""}${points} pts` : "—"; // never render "NaN pts"
 }
 
 export default function OverviewPage() {
   const [stats, setStats] = useState<PublicStats | null>(null);
+  const [benchmark, setBenchmark] = useState<PublicBenchmark | null>(null);
   const [loading, setLoading] = useState(true);
 
   useEffect(() => {
-    fetchPublicStats()
-      .then(setStats)
-      .catch(() => null)
+    Promise.allSettled([fetchPublicStats(), fetchBenchmarkLatest()])
+      .then(([statsResult, benchmarkResult]) => {
+        if (statsResult.status === "fulfilled") setStats(statsResult.value);
+        if (benchmarkResult.status === "fulfilled") setBenchmark(benchmarkResult.value);
+      })
       .finally(() => setLoading(false));
   }, []);
 
-  const commitsTotal = (stats?.metrics.commits_total as number | undefined) ?? 0;
-  const commits30d = (stats?.metrics.commits_30d as number | undefined) ?? 0;
-  const stars = (stats?.metrics.repo_stars as number | undefined) ?? 0;
-  const loc = (stats?.metrics.lines_of_code as number | undefined) ?? 0;
+  const metrics = stats?.metrics ?? {};
+  const lastPass = metrics.last_pass_rate as number | undefined;
+  const rolling = metrics.rolling_pass_rate_7d as number | undefined;
+  const regressions = metrics.regressions_caught_30d as number | undefined;
+  const runs = metrics.eval_runs_total as number | undefined;
+  const experiments = metrics.experiments_tracked as number | undefined;
 
   return (
     <>
@@ -97,11 +99,7 @@ export default function OverviewPage() {
                   </a>
                 </Button>
                 <Button asChild size="sm" variant="outline">
-                  <a
-                    href={PROJECT.github_url}
-                    target="_blank"
-                    rel="noreferrer"
-                  >
+                  <a href={PROJECT.github_url} target="_blank" rel="noreferrer">
                     <Github />
                     GitHub
                   </a>
@@ -110,42 +108,34 @@ export default function OverviewPage() {
             </CardContent>
           </Card>
 
-          {/* Stat row — wired to real /api/stats Tier-B values */}
+          {/* Tier-A stat row — wired to real /api/stats live values */}
           <div className="grid grid-cols-1 gap-3 sm:grid-cols-2 xl:grid-cols-4">
-            <StatCard
-              title="Commits · total"
-              value={commitsTotal}
-              subtitle="GitHub history"
-              icon={GitCommit}
-              sparkData={shapeFromValue(commitsTotal)}
+            <Stat
+              title="Last pass rate"
+              value={pct(lastPass)}
+              subtitle="Candidate exact match, latest run"
+              icon={Target}
               loading={loading}
             />
-            <StatCard
-              title="Commits · 30d"
-              value={commits30d}
-              subtitle="Trailing 30 days"
-              icon={TrendingUp}
-              sparkData={shapeFromValue(commits30d)}
+            <Stat
+              title="Pass rate · 7d"
+              value={pct(rolling)}
+              subtitle="Rolling mean"
+              icon={GitCompare}
               loading={loading}
             />
-            <StatCard
-              title="Repo stars"
-              value={stars}
-              subtitle="GitHub"
-              icon={Star}
-              sparkData={shapeFromValue(stars)}
+            <Stat
+              title="Regressions · 30d"
+              value={regressions === undefined ? "—" : formatNumber(regressions)}
+              subtitle="Distinct cases caught"
+              icon={ShieldAlert}
               loading={loading}
             />
-            <StatCard
-              title="Lines of code"
-              value={loc}
-              subtitle={
-                stats?.metrics.primary_language
-                  ? `Mostly ${stats.metrics.primary_language}`
-                  : "All sources"
-              }
-              icon={Lightbulb}
-              sparkData={shapeFromValue(loc)}
+            <Stat
+              title="Eval runs"
+              value={runs === undefined ? "—" : formatNumber(runs)}
+              subtitle={experiments ? `${experiments} variants tracked` : "Recorded runs"}
+              icon={FlaskConical}
               loading={loading}
             />
           </div>
@@ -154,15 +144,9 @@ export default function OverviewPage() {
           <Card>
             <CardHeader className="flex flex-row items-center justify-between border-b border-border-subtle py-3">
               <CardTitle>System status</CardTitle>
-              <Badge
-                variant={
-                  stats?.status === "operational" ? "success" : "warning"
-                }
-              >
+              <Badge variant={stats?.status === "operational" ? "success" : "warning"}>
                 <StatusDot
-                  tone={
-                    stats?.status === "operational" ? "success" : "warning"
-                  }
+                  tone={stats?.status === "operational" ? "success" : "warning"}
                   pulse={stats?.status === "operational"}
                   size="sm"
                 />
@@ -170,15 +154,11 @@ export default function OverviewPage() {
               </Badge>
             </CardHeader>
             <CardContent className="grid grid-cols-2 gap-4 py-4 sm:grid-cols-4">
+              <StatusCell label="Mode" value={stats?.mode ?? "live"} hint="Tier A · live workload" />
               <StatusCell
-                label="Mode"
-                value={stats?.mode ?? "showcase"}
-                hint="Tier B — see schema"
-              />
-              <StatusCell
-                label="Last commit"
-                value={formatRelative(stats?.last_commit_at)}
-                hint={stats?.last_commit_at ?? "never"}
+                label="Last eval run"
+                value={formatRelative(stats?.last_active_at)}
+                hint={stats?.last_active_at ?? "never"}
               />
               <StatusCell
                 label="Last deploy"
@@ -193,7 +173,10 @@ export default function OverviewPage() {
             </CardContent>
           </Card>
 
-          {/* Users + audience */}
+          {/* Latest benchmark */}
+          <BenchmarkCard benchmark={benchmark} loading={loading} />
+
+          {/* Built for + MVP */}
           <div className="grid grid-cols-1 gap-4 lg:grid-cols-3">
             <Card className="lg:col-span-1">
               <CardHeader>
@@ -208,9 +191,9 @@ export default function OverviewPage() {
                   Stack
                 </p>
                 <div className="flex flex-wrap gap-1.5">
-                  {PROJECT.stack.map((s) => (
-                    <Badge key={s} variant="muted">
-                      {s}
+                  {PROJECT.stack.map((item) => (
+                    <Badge key={item} variant="muted">
+                      {item}
                     </Badge>
                   ))}
                 </div>
@@ -218,14 +201,14 @@ export default function OverviewPage() {
             </Card>
             <Card className="lg:col-span-2">
               <CardHeader>
-                <CardTitle>What ships first</CardTitle>
-                <CardDescription>The MVP scope this project commits to.</CardDescription>
+                <CardTitle>What ships now</CardTitle>
+                <CardDescription>The harness capabilities, live in this repo.</CardDescription>
               </CardHeader>
               <CardContent>
                 <ul className="space-y-2.5">
-                  {PROJECT.mvp.map((item, i) => (
+                  {PROJECT.mvp.map((item, index) => (
                     <li
-                      key={i}
+                      key={index}
                       className="flex items-start gap-3 text-sm text-foreground-muted"
                     >
                       <span className="mt-1.5 inline-flex h-1.5 w-1.5 shrink-0 rounded-full bg-brand" />
@@ -242,6 +225,192 @@ export default function OverviewPage() {
   );
 }
 
+function BenchmarkCard({ // Latest-benchmark card: skeleton while loading, empty state, or full run summary.
+  benchmark,
+  loading,
+}: {
+  benchmark: PublicBenchmark | null;
+  loading: boolean;
+}) {
+  if (loading) {
+    return (
+      <Card>
+        <CardContent className="p-6">
+          <Skeleton className="h-40 w-full" />
+        </CardContent>
+      </Card>
+    );
+  }
+  if (!benchmark || benchmark.status === "pending" || !benchmark.metrics) {
+    return (
+      <Card>
+        <CardHeader>
+          <CardTitle>Latest benchmark</CardTitle>
+          <CardDescription>No run has been published yet.</CardDescription>
+        </CardHeader>
+      </Card>
+    );
+  }
+  // From here on a run with metrics is available; optional lists default to empty.
+  const metrics = benchmark.metrics;
+  const variants = benchmark.variants ?? [];
+  const regressions = benchmark.regressions ?? [];
+  const urls = benchmark.artifact_urls;
+  const deltaTone = (delta: number) => (delta < 0 ? "text-warning" : "text-success"); // a drop is never painted green
+  return (
+    <Card>
+      <CardHeader className="flex flex-row items-start justify-between border-b border-border-subtle py-3">
+        <div>
+          <CardTitle>Latest benchmark</CardTitle>
+          <CardDescription>
+            {benchmark.fixture} · {metrics.n_cases} cases · generated{" "}
+            {formatRelative(benchmark.generated_at)}
+          </CardDescription>
+        </div>
+        <Badge variant={metrics.gate_verdict === "pass" ? "success" : "warning"}>
+          gate {metrics.gate_verdict}
+        </Badge>
+      </CardHeader>
+      <CardContent className="space-y-4 py-4">
+        <div className="overflow-x-auto">
+          <table className="w-full text-sm">
+            <thead>
+              <tr className="text-2xs uppercase tracking-wider text-foreground-faint">
+                <th className="py-1.5 text-left font-medium">Variant</th>
+                <th className="py-1.5 text-right font-medium">Exact match</th>
+                <th className="py-1.5 text-right font-medium">Token F1</th>
+                <th className="py-1.5 text-right font-medium">Contains gold</th>
+              </tr>
+            </thead>
+            <tbody className="tabular-nums">
+              {variants.map((variant) => {
+                const role =
+                  variant.name === metrics.candidate_variant
+                    ? "candidate"
+                    : variant.name === metrics.baseline_variant
+                    ? "baseline"
+                    : "";
+                return (
+                  <tr key={variant.name} className="border-t border-border-subtle">
+                    <td className="py-1.5 text-left font-mono text-xs text-foreground">
+                      {variant.name}
+                      {role && (
+                        <span className="ml-1.5 text-2xs text-foreground-subtle">({role})</span>
+                      )}
+                    </td>
+                    <td className="py-1.5 text-right text-foreground-muted">
+                      {pct(variant.exact_match)}
+                    </td>
+                    <td className="py-1.5 text-right text-foreground-muted">
+                      {pct(variant.token_f1)}
+                    </td>
+                    <td className="py-1.5 text-right text-foreground-muted">
+                      {pct(variant.contains_gold)}
+                    </td>
+                  </tr>
+                );
+              })}
+            </tbody>
+          </table>
+        </div>
+
+        <div className="grid grid-cols-1 gap-3 sm:grid-cols-2">
+          <div className="rounded-md border border-border-subtle bg-surface-2 px-3 py-2.5">
+            <p className="text-2xs uppercase tracking-wider text-foreground-faint">
+              Candidate vs baseline
+            </p>
+            <p className="mt-1 text-sm text-foreground">
+              exact match{" "}
+              <span className={`font-semibold ${deltaTone(metrics.improvement_exact_match)}`}>
+                {signedPoints(metrics.improvement_exact_match)}
+              </span>
+              , token F1{" "}
+              <span className={`font-semibold ${deltaTone(metrics.improvement_token_f1)}`}>
+                {signedPoints(metrics.improvement_token_f1)}
+              </span>
+            </p>
+          </div>
+          <div className="rounded-md border border-border-subtle bg-surface-2 px-3 py-2.5">
+            <p className="text-2xs uppercase tracking-wider text-foreground-faint">
+              Regressions surfaced
+            </p>
+            <p className="mt-1 text-sm text-foreground">
+              <span className="font-semibold text-warning">{metrics.regressions}</span> of{" "}
+              {metrics.n_cases} cases scored below baseline
+            </p>
+          </div>
+        </div>
+
+        {regressions.length > 0 && (
+          <ul className="space-y-1.5">
+            {regressions.slice(0, 3).map((regression) => (
+              <li
+                key={regression.case_id}
+                className="flex items-start gap-2 text-2xs text-foreground-muted"
+              >
+                <span className="mt-1 inline-flex h-1.5 w-1.5 shrink-0 rounded-full bg-warning" />
+                <span className="font-mono text-foreground-subtle">{regression.case_id}</span>{" "}
+                {regression.reason}
+              </li>
+            ))}
+          </ul>
+        )}
+
+        {urls && (
+          <div className="flex flex-wrap gap-2 pt-1">
+            <Button asChild size="sm" variant="outline">
+              <a href={urls.report} target="_blank" rel="noreferrer">
+                <FileText />
+                Full report
+              </a>
+            </Button>
+            <Button asChild size="sm" variant="outline">
+              <a href={urls.fixture} target="_blank" rel="noreferrer">
+                Fixture
+              </a>
+            </Button>
+          </div>
+        )}
+      </CardContent>
+    </Card>
+  );
+}
+
+function Stat(props: {
+  title: string;
+  value: string;
+  subtitle?: string;
+  icon: typeof Target;
+  loading: boolean;
+}) {
+  // Metric tile: label + icon header, skeleton in place of the value while loading, optional caption.
+  const { title, value, subtitle, icon: Glyph, loading } = props;
+  const reading = loading ? (
+    <Skeleton className="mt-2 h-7 w-20" />
+  ) : (
+    <p className="mt-2 text-2xl font-semibold tabular-nums text-foreground">{value}</p>
+  );
+  const caption = subtitle && (
+    <p className="mt-0.5 text-2xs text-foreground-subtle">{subtitle}</p>
+  );
+  return (
+    <Card>
+      <div className="p-4">
+        <div className="flex items-start justify-between">
+          <p className="text-2xs font-medium uppercase tracking-wider text-foreground-faint">
+            {title}
+          </p>
+          <div className="flex h-7 w-7 items-center justify-center rounded-md bg-surface-2 text-foreground-muted">
+            <Glyph className="h-3.5 w-3.5" strokeWidth={1.75} />
+          </div>
+        </div>
+        {reading}
+        {caption}
+      </div>
+    </Card>
+  );
+}
+
 function StatusCell({
   label,
   value,
@@ -256,13 +425,9 @@ function StatusCell({
       <p className="text-2xs font-medium uppercase tracking-wider text-foreground-faint">
         {label}
       </p>
-      <p className="mt-1 text-xl font-semibold tabular-nums text-foreground">
-        {value}
-      </p>
+      <p className="mt-1 text-xl font-semibold tabular-nums text-foreground">{value}</p>
       {hint && (
-        <p className="mt-0.5 text-2xs text-foreground-subtle truncate">
-          {hint}
-        </p>
+        <p className="mt-0.5 text-2xs text-foreground-subtle truncate">{hint}</p>
       )}
     </div>
   );
diff --git a/src/app/telemetry/page.tsx b/src/app/telemetry/page.tsx
index f888169..3bfdf58 100644
--- a/src/app/telemetry/page.tsx
+++ b/src/app/telemetry/page.tsx
@@ -3,11 +3,11 @@
 import { useState } from "react";
 import {
   CheckCircle2,
-  Code2,
-  GitCommit,
-  Layers,
+  FlaskConical,
+  GitCompare,
   RefreshCw,
-  Star,
+  ShieldAlert,
+  Target,
 } from "lucide-react";
 import { fetchPublicStats, type PublicStats } from "@/lib/api";
 import { TopBar } from "@/components/layout/TopBar";
@@ -28,6 +28,12 @@ import {
 
 const POLL_INTERVAL_MS = 30_000;
 
+function pct(value: unknown): string {
+  // Number.isFinite also rejects ±Infinity, which !Number.isNaN let through as "Infinity%".
+  if (typeof value !== "number" || !Number.isFinite(value)) return "—";
+  return `${Math.round(value * 100)}%`;
+}
+
 export default function TelemetryPage() {
   const { data: stats, loading, error, refetch } = usePolling<PublicStats>(
     fetchPublicStats,
@@ -77,7 +83,7 @@ export default function TelemetryPage() {
               </div>
               <div className="flex flex-wrap items-center gap-2">
                 <Badge variant={stats?.mode === "live" ? "brand" : "muted"}>
-                  {stats?.mode ?? "showcase"}
+                  {stats?.mode ?? "live"}
                 </Badge>
                 <Badge variant="outline">
                   generated {formatRelative(stats?.generated_at)}
@@ -102,61 +108,57 @@ export default function TelemetryPage() {
             </Card>
           )}
 
-          {/* Tier-B metric grid */}
+          {/* Tier-A metric grid */}
           <div className="grid grid-cols-1 gap-3 sm:grid-cols-2 lg:grid-cols-4">
             <MetricTile
-              label="Commits · total"
-              value={
-                stats
-                  ? formatNumber(stats.metrics.commits_total as number)
-                  : "—"
-              }
-              icon={GitCommit}
+              label="Last pass rate"
+              value={stats ? pct(stats.metrics.last_pass_rate) : "—"}
+              icon={Target}
               loading={loading}
             />
             <MetricTile
-              label="Commits · 30d"
-              value={
-                stats ? formatNumber(stats.metrics.commits_30d as number) : "—"
-              }
-              icon={GitCommit}
+              label="Pass rate · 7d"
+              value={stats ? pct(stats.metrics.rolling_pass_rate_7d) : "—"}
+              icon={GitCompare}
               loading={loading}
             />
             <MetricTile
-              label="Lines of code"
+              label="Regressions · 30d"
               value={
                 stats
-                  ? formatNumber(stats.metrics.lines_of_code as number)
+                  ? formatNumber(Number(stats.metrics.regressions_caught_30d ?? 0))
                   : "—"
               }
-              icon={Code2}
+              icon={ShieldAlert}
               loading={loading}
             />
             <MetricTile
-              label="Repo stars"
+              label="Eval runs · total"
               value={
-                stats ? formatNumber(stats.metrics.repo_stars as number) : "—"
+                stats ? formatNumber(Number(stats.metrics.eval_runs_total ?? 0)) : "—"
               }
-              icon={Star}
+              icon={FlaskConical}
               loading={loading}
             />
             <MetricTile
-              label="Primary language"
+              label="Eval runs · 24h"
               value={
-                (stats?.metrics.primary_language as string | undefined) ?? "—"
+                stats ? formatNumber(Number(stats.metrics.eval_runs_24h ?? 0)) : "—"
               }
-              icon={Layers}
+              icon={FlaskConical}
               loading={loading}
             />
             <MetricTile
-              label="Last commit"
-              value={formatRelative(stats?.last_commit_at)}
-              icon={CheckCircle2}
+              label="Experiments tracked"
+              value={
+                stats ? formatNumber(Number(stats.metrics.experiments_tracked ?? 0)) : "—"
+              }
+              icon={GitCompare}
               loading={loading}
             />
             <MetricTile
-              label="Last deploy"
-              value={formatRelative(stats?.last_deployed_at)}
+              label="Last eval run"
+              value={formatRelative(stats?.last_active_at)}
               icon={CheckCircle2}
               loading={loading}
             />
@@ -187,7 +189,7 @@ export default function TelemetryPage() {
                     />
                     <DetailRow
                       label="Mode"
-                      value={stats?.mode ?? "showcase"}
+                      value={stats?.mode ?? "live"}
                     />
                     <DetailRow
                       label="Schema version"
@@ -230,7 +232,7 @@ export default function TelemetryPage() {
                     <p>
                       This endpoint runs in{" "}
                       <code className="rounded bg-surface-2 px-1 py-0.5 font-mono text-xs">
-                        mode: &quot;showcase&quot;
+                        mode: &quot;live&quot;
                       </code>{" "}
                       per the public schema at{" "}
                       <a
@@ -241,21 +243,26 @@ export default function TelemetryPage() {
                       >
                         TELEMETRY_SCHEMA.md
                       </a>
-                      . Counters are sourced from the GitHub REST API
-                      (commits, language, stars) plus a build-time line-of-code
-                      snapshot, behind a 5-minute module-scope cache.
+                      . Every metric is computed from the committed history of
+                      the public benchmark, which re-runs on a schedule and on
+                      every change. Nothing is simulated, seeded, or incremented
+                      in memory.
                     </p>
                     <p>
-                      The endpoint never returns 5xx — GitHub failures degrade
-                      to{" "}
+                      The endpoint never returns 5xx. With no published run it
+                      degrades to{" "}
                       <code className="rounded bg-surface-2 px-1 py-0.5 font-mono text-xs">
                         status: &quot;degraded&quot;
                       </code>{" "}
-                      with the last cached response (or zeros) and a
-                      contract-valid envelope.
+                      with zeroed metrics and a contract-valid envelope. The
+                      latest run is served at{" "}
+                      <code className="rounded bg-surface-2 px-1 py-0.5 font-mono text-xs">
+                        /api/benchmark-latest
+                      </code>
+                      .
                     </p>
                     <CodeBlock language="bash">
-                      {`curl -i https://${PROJECT.slug}.vercel.app/api/stats`}
+                      {`curl -i https://${PROJECT.slug}.eleventh.dev/api/stats`}
                     </CodeBlock>
                   </div>
                 </TabsContent>
@@ -284,7 +291,7 @@ function MetricTile({
 }: {
   label: string;
   value: string;
-  icon: typeof GitCommit;
+  icon: typeof CheckCircle2;
   loading: boolean;
 }) {
   return (
diff --git a/src/evalops_workbench/benchmark_runner.py b/src/evalops_workbench/benchmark_runner.py
new file mode 100644
index 0000000..33e80c5
--- /dev/null
+++ b/src/evalops_workbench/benchmark_runner.py
@@ -0,0 +1,106 @@
+"""Run the public benchmark and publish the committed artifact.
+
+This is the workload behind EvalOps' Tier-A telemetry: a real, reproducible
+evaluation that runs on a schedule (and on demand), persists its result to the repo,
+and is read back by the public /api/benchmark-latest and /api/stats endpoints.
+
+It is dependency-free and credential-free. Re-running it on the committed
+fixture reproduces the published numbers exactly.
+"""
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+from .eval import artifact as artifact_mod
+from .eval import baseline as baseline_mod
+from .eval import ledger as ledger_mod
+from .eval import report as report_mod
+from .eval.cases import load_cases
+from .eval.runner import compare, run_target
+from .eval.targets import get_target
+
+FIXTURE_ID = "benchmark-v1"
+FIXTURE_REL = "examples/benchmark-v1/cases.jsonl"
+BASELINE_VARIANT = "overlap_sentence"
+CANDIDATE_VARIANT = "span_extract"
+VARIANT_ORDER = ("first_sentence", "overlap_sentence", "span_extract")
+GATE_METRIC = "token_f1"
+
+
+def _repo_root() -> Path:
+    # Layout is <root>/src/evalops_workbench/benchmark_runner.py, so parents[2] is <root>.
+    return Path(__file__).resolve().parents[2]
+
+
+def _now_iso() -> str:
+    return datetime.now(tz=timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z")
+
+
+def run_benchmark(repo_root: Path | None = None, *, write: bool = True) -> dict:
+    """Run all variants, gate the candidate, return the artifact; write outputs when ``write``."""
+    root = repo_root or _repo_root()  # default to the checkout that contains this module
+    fixture_path = root / FIXTURE_REL
+    results_dir = root / "examples" / FIXTURE_ID / "results"
+    archive_dir = results_dir / "archive"
+    pinned_path = root / "examples" / FIXTURE_ID / "pinned-baseline.json"
+    latest_artifact_path = root / "api" / "_benchmark_latest.json"
+    history_path = root / "api" / "_benchmark_history.json"
+    report_path = results_dir / "latest-report.md"
+
+    cases = load_cases(fixture_path)
+    results = {name: run_target(name, get_target(name), cases) for name in VARIANT_ORDER}
+    baseline_result = results[BASELINE_VARIANT]
+    candidate_result = results[CANDIDATE_VARIANT]
+
+    regressions = compare(baseline_result, candidate_result, metric=GATE_METRIC)
+    pinned = baseline_mod.load_pinned(pinned_path)
+    verdict = baseline_mod.evaluate_gate(candidate_result, pinned, metric=GATE_METRIC)
+
+    previous = ledger_mod.previous_record(history_path)  # read before this run is appended
+    generated_at = _now_iso()
+    artifact = artifact_mod.build_artifact(
+        fixture_id=FIXTURE_ID,
+        fixture_rel=FIXTURE_REL,
+        results=results,
+        baseline_name=BASELINE_VARIANT,
+        candidate_name=CANDIDATE_VARIANT,
+        regressions=regressions,
+        verdict=verdict,
+        previous=previous,
+        generated_at=generated_at,
+    )
+
+    if write:
+        archive_dir.mkdir(parents=True, exist_ok=True)
+        latest_artifact_path.parent.mkdir(parents=True, exist_ok=True)
+        _write_json(latest_artifact_path, artifact)
+        _write_json(archive_dir / f"{artifact['run_id']}.json", artifact)
+        report_path.write_text(report_mod.render(artifact), encoding="utf-8")
+        ledger_mod.append_record(history_path, artifact_mod.slim_record(artifact))
+        if pinned is None:  # nothing usable was pinned, so this candidate becomes the pin
+            baseline_mod.save_pinned(pinned_path, candidate_result, metric=GATE_METRIC)
+
+    return artifact
+
+
+def _write_json(path: Path, payload: dict) -> None:
+    path.write_text(f"{json.dumps(payload, indent=2)}\n", encoding="utf-8")
+
+
+def main(argv: list[str] | None = None) -> int:  # argv is accepted but not read
+    artifact = run_benchmark()
+    metrics = artifact["metrics"]
+    print(  # one-line run summary on stdout
+        f"[{artifact['run_id']}] cases={metrics['n_cases']} "
+        f"candidate={metrics['candidate_variant']} "
+        f"exact_match={metrics['exact_match']:.3f} token_f1={metrics['token_f1']:.3f} "
+        f"(vs baseline {metrics['improvement_token_f1']:+.3f} F1) "
+        f"regressions={metrics['regressions']} gate={metrics['gate_verdict']}"
+    )
+    return 0  # NOTE(review): exit code is 0 even when gate_verdict is "fail" — confirm intended
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/evalops_workbench/eval/__init__.py b/src/evalops_workbench/eval/__init__.py
new file mode 100644
index 0000000..6d80e3c
--- /dev/null
+++ b/src/evalops_workbench/eval/__init__.py
@@ -0,0 +1,12 @@
+"""EvalOps evaluation engine.
+
+A local-first, dependency-free evaluation harness: load a labelled dataset,
+run named variants of a system-under-test, score predictions with rubric
+functions, pin a baseline, and surface per-case regressions.
+
+The engine is model-agnostic. The public benchmark ships deterministic
+extractive-QA strategies as the system-under-test so the run is reproducible
+by any third party with zero credentials and zero cost. A live LLM target is
+an optional extension point (see ``targets.Target``), never a requirement.
+"""
+from __future__ import annotations
diff --git a/src/evalops_workbench/eval/artifact.py b/src/evalops_workbench/eval/artifact.py
new file mode 100644
index 0000000..d43da19
--- /dev/null
+++ b/src/evalops_workbench/eval/artifact.py
@@ -0,0 +1,134 @@
+"""Assemble the schema-conformed /api/benchmark-latest payload + slim history record.
+
+The artifact is the public contract. Its shape matches the benchmark-latest
+specification in TELEMETRY_SCHEMA.md: a stable envelope with metrics, the
+per-variant comparison, the surfaced regressions, public artifact URLs, and a
+run-over-run delta.
+"""
+from __future__ import annotations
+
+import hashlib
+
+from .baseline import GateVerdict
+from .runner import Regression, RunResult
+
+SYSTEM_SLUG = "evalops"
+BENCHMARK_TYPE = "eval"
+SCHEMA_VERSION = 1
+_RAW_BASE = "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main"
+
+
+def run_id_for(fixture_id: str, candidate: RunResult, baseline: RunResult, generated_at: str) -> str:
+    """Build ``<slug>-<YYYY-MM-DD>-<hash8>`` from the run date and the two compared aggregates."""
+    fingerprint = "|".join((fixture_id, str(candidate.aggregate), str(baseline.aggregate)))
+    digest = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
+    return f"{SYSTEM_SLUG}-{generated_at[:10]}-{digest[:8]}"
+
+
+def _artifact_urls(run_id: str, fixture_rel: str) -> dict:
+    return {  # raw-GitHub links; report/run paths hard-code examples/benchmark-v1
+        "report": f"{_RAW_BASE}/examples/benchmark-v1/results/latest-report.md",
+        "fixture": f"{_RAW_BASE}/{fixture_rel}",
+        "run": f"{_RAW_BASE}/examples/benchmark-v1/results/archive/{run_id}.json",
+    }
+
+
+def _variant_row(result: RunResult) -> dict:  # one row of the payload's "variants" list
+    return {"name": result.target, **result.aggregate}  # "name" first, then each metric
+
+
+def _previous_block(previous: dict | None, metrics: dict) -> dict | None:
+    """Run-over-run delta against the prior history record, or ``None`` when there is none."""
+    if not previous:
+        return None
+    f1_delta = round(metrics["token_f1"] - float(previous.get("token_f1", 0.0)), 6)
+    em_delta = round(metrics["exact_match"] - float(previous.get("exact_match", 0.0)), 6)
+    regression_delta = metrics["regressions"] - int(previous.get("regressions", 0))
+    return {
+        "run_id": previous.get("run_id"),
+        "generated_at": previous.get("generated_at"),
+        "delta": {"token_f1": f1_delta, "exact_match": em_delta, "regressions": regression_delta},
+    }
+
+
+def build_artifact(
+    *,
+    fixture_id: str,
+    fixture_rel: str,
+    results: dict[str, RunResult],
+    baseline_name: str,
+    candidate_name: str,
+    regressions: list[Regression],
+    verdict: GateVerdict,
+    previous: dict | None,
+    generated_at: str,
+) -> dict:  # assembles the full payload dict in memory; nothing is written here
+    baseline = results[baseline_name]
+    candidate = results[candidate_name]
+    run_id = run_id_for(fixture_id, candidate, baseline, generated_at)
+
+    metrics = {  # headline numbers all describe the candidate variant
+        "n_cases": candidate.n_cases,
+        "baseline_variant": baseline_name,
+        "candidate_variant": candidate_name,
+        "exact_match": candidate.aggregate["exact_match"],
+        "token_f1": candidate.aggregate["token_f1"],
+        "contains_gold": candidate.aggregate["contains_gold"],
+        "improvement_exact_match": round(
+            candidate.aggregate["exact_match"] - baseline.aggregate["exact_match"], 6
+        ),
+        "improvement_token_f1": round(
+            candidate.aggregate["token_f1"] - baseline.aggregate["token_f1"], 6
+        ),
+        "regressions": len(regressions),
+        "pass_rate": candidate.aggregate["exact_match"],  # pass rate == candidate exact match
+        "gate_verdict": "pass" if verdict.passed else "fail",
+    }
+
+    return {
+        "system": SYSTEM_SLUG,
+        "benchmark_type": BENCHMARK_TYPE,
+        "run_id": run_id,
+        "fixture": fixture_id,
+        "metrics": metrics,
+        "variants": [_variant_row(results[name]) for name in results],  # insertion order
+        "regressions": [
+            {
+                "case_id": r.case_id,
+                "metric": r.metric,
+                "baseline": r.baseline,
+                "candidate": r.candidate,
+                "delta": r.delta,
+                "reason": r.reason,
+                "tags": list(r.tags),
+            }
+            for r in regressions
+        ],
+        "gate": {
+            "passed": verdict.passed,
+            "metric": verdict.metric,
+            "pinned": verdict.pinned,
+            "observed": verdict.observed,
+            "reasons": list(verdict.reasons),
+        },
+        "artifact_urls": _artifact_urls(run_id, fixture_rel),
+        "schema_version": SCHEMA_VERSION,
+        "generated_at": generated_at,
+        "previous_run": _previous_block(previous, metrics),  # None when no prior record
+    }
+
+
+def slim_record(artifact: dict) -> dict:
+    """Reduce a full artifact to the compact row kept in the run history.
+
+    Keys are inserted in a fixed order so the serialized row stays stable.
+    """
+    metrics = artifact["metrics"]
+    record = {"run_id": artifact["run_id"], "generated_at": artifact["generated_at"]}
+    # Scalar metrics are copied through unchanged.
+    for key in ("pass_rate", "token_f1", "exact_match", "regressions"):
+        record[key] = metrics[key]
+    record["regressed_ids"] = [reg["case_id"] for reg in artifact["regressions"]]
+    record["gate_verdict"] = metrics["gate_verdict"]
+    record["variants"] = [variant["name"] for variant in artifact["variants"]]
+    return record
diff --git a/src/evalops_workbench/eval/baseline.py b/src/evalops_workbench/eval/baseline.py
new file mode 100644
index 0000000..9603dce
--- /dev/null
+++ b/src/evalops_workbench/eval/baseline.py
@@ -0,0 +1,81 @@
+"""Pinned baseline contract and the regression gate.
+
+EvalOps' thesis: a regression dashboard nobody reads does not prevent
+regressions. The contract is that quality below a pinned baseline blocks. This
+module persists that pinned aggregate (a versioned file in the repo) and
+evaluates a candidate run against it.
+"""
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+from .runner import RunResult
+
+_DEFAULT_METRIC = "token_f1"
+_DEFAULT_TOLERANCE = 0.02
+
+
+@dataclass(frozen=True)
+class GateVerdict:
+    passed: bool  # True when observed reaches the pinned floor, or when nothing is pinned
+    metric: str  # name of the aggregate metric the gate was evaluated on
+    pinned: float | None  # pinned aggregate value; None when no baseline exists yet
+    observed: float  # the candidate's aggregate value for ``metric``
+    reasons: tuple[str, ...]  # human-readable explanation(s) of the verdict
+
+
+def load_pinned(path: str | Path) -> dict | None:  # None means "no usable pinned baseline"
+    path = Path(path)
+    if not path.exists():
+        return None
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, OSError, ValueError):
+        return None  # NOTE(review): a corrupt pin is indistinguishable from a missing one
+    return data if isinstance(data, dict) else None  # non-object JSON is rejected too
+
+
+def save_pinned(path: str | Path, result: RunResult, *, metric: str = _DEFAULT_METRIC) -> None:
+    pin = dict(variant=result.target, metric=metric, aggregate=result.aggregate)
+    Path(path).write_text(f"{json.dumps(pin, indent=2)}\n", encoding="utf-8")
+
+
+def evaluate_gate(
+    candidate: RunResult,
+    pinned: dict | None,
+    *,
+    metric: str = _DEFAULT_METRIC,
+    tolerance: float = _DEFAULT_TOLERANCE,
+) -> GateVerdict:
+    """Gate a candidate run against the pinned aggregate for ``metric``.
+
+    The floor is the pinned value minus ``tolerance``; the candidate passes when its
+    observed value reaches that floor. With no pinned baseline the gate passes and
+    says that this run establishes the contract. A metric missing from either side
+    is read as 0.0.
+    """
+    observed = float(candidate.aggregate.get(metric, 0.0))
+    if pinned is None:
+        first_run = "no pinned baseline yet; this run establishes the contract"
+        return GateVerdict(True, metric, None, observed, (first_run,))
+
+    pinned_value = float(pinned.get("aggregate", {}).get(metric, 0.0))
+    floor = pinned_value - tolerance
+    # 1e-9 of slack, presumably so float noise cannot fail a run sitting on the floor.
+    passed = observed + 1e-9 >= floor
+    if passed:
+        reason = f"{metric} {observed:.3f} holds at or above pinned floor {floor:.3f}"
+    else:
+        reason = (
+            f"{metric} {observed:.3f} dropped below pinned floor {floor:.3f} "
+            f"(pinned {pinned_value:.3f}, tolerance {tolerance:.3f})"
+        )
+    return GateVerdict(
+        passed=passed,
+        metric=metric,
+        pinned=pinned_value,
+        observed=observed,
+        reasons=(reason,),
+    )
diff --git a/src/evalops_workbench/eval/cases.py b/src/evalops_workbench/eval/cases.py
new file mode 100644
index 0000000..35744d6
--- /dev/null
+++ b/src/evalops_workbench/eval/cases.py
@@ -0,0 +1,94 @@
+"""The evaluation dataset: an immutable ``Case`` and loaders for JSONL/JSON/CSV.
+
+A case is one labelled example: a question over a context passage, with one or
+more acceptable gold answers. Loading validates at the boundary and fails fast
+on malformed rows rather than silently scoring against garbage.
+"""
+from __future__ import annotations
+
+import csv
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class Case:
+    """One labelled evaluation example."""
+
+    id: str  # must be unique within a fixture (load_cases rejects duplicates)
+    question: str
+    context: str  # passage the question is asked over
+    answers: tuple[str, ...]  # one or more acceptable gold answers
+    tags: tuple[str, ...] = ()  # optional labels; empty when the row has none
+
+
+def _coerce_answers(raw: object) -> tuple[str, ...]:
+    """Gold answers as a tuple of non-blank strings; blank strings and empty lists raise."""
+    items = (raw,) if isinstance(raw, str) else raw  # a bare string is filtered like a list item
+    if isinstance(items, (list, tuple)):
+        answers = tuple(str(a) for a in items if str(a).strip())
+        if answers:
+            return answers
+    raise ValueError("case 'answers' must be a non-empty string or list of strings")
+
+
+def _coerce_tags(raw: object) -> tuple[str, ...]:
+    """Tags from a ``|``-delimited string or a list/tuple; anything else yields ``()``."""
+    if isinstance(raw, str):
+        parts = raw.split("|")
+    else:
+        parts = [str(t) for t in raw] if isinstance(raw, (list, tuple)) else []
+    # Entries that are blank after stripping are dropped.
+    return tuple(p.strip() for p in parts if p.strip())
+
+
+def _coerce_case(raw: dict) -> Case:
+    """Validate one decoded row and build a ``Case``; raise ``ValueError`` if malformed."""
+    if not isinstance(raw, dict):  # e.g. a JSONL line holding a bare string, number or list
+        raise ValueError(f"case must be an object with named fields, got: {raw!r}")
+    for field in ("id", "question", "context", "answers"):
+        if field not in raw:
+            raise ValueError(f"case is missing required field '{field}': {raw!r}")
+    return Case(
+        id=str(raw["id"]), question=str(raw["question"]), context=str(raw["context"]),
+        answers=_coerce_answers(raw["answers"]), tags=_coerce_tags(raw.get("tags")),
+    )
+
+
+def load_cases(path: str | Path) -> list[Case]:
+    """Load and validate cases from ``.jsonl``, ``.json`` (array) or ``.csv``; raise on bad input."""
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"fixture not found: {path}")
+
+    suffix = path.suffix.lower()  # extension match is case-insensitive
+    if suffix == ".jsonl":
+        rows = [
+            json.loads(line)
+            for line in path.read_text(encoding="utf-8").splitlines()
+            if line.strip()  # blank lines are skipped
+        ]
+    elif suffix == ".json":
+        rows = json.loads(path.read_text(encoding="utf-8"))
+        if not isinstance(rows, list):
+            raise ValueError(f"{path} must contain a JSON array of cases")
+    elif suffix == ".csv":
+        with path.open(encoding="utf-8", newline="") as handle:
+            rows = list(csv.DictReader(handle))  # header row supplies the field names
+    else:
+        raise ValueError(f"unsupported fixture extension: {suffix}")
+
+    cases = [_coerce_case(row) for row in rows]
+    if not cases:
+        raise ValueError(f"fixture {path} contained no cases")
+    _assert_unique_ids(cases)
+    return cases
+
+
+def _assert_unique_ids(cases: list[Case]) -> None:  # raises ValueError on the first repeated id
+    seen: set[str] = set()
+    for case in cases:
+        if case.id in seen:
+            raise ValueError(f"duplicate case id: {case.id}")
+        seen.add(case.id)
diff --git a/src/evalops_workbench/eval/ledger.py b/src/evalops_workbench/eval/ledger.py
new file mode 100644
index 0000000..4c1ebc8
--- /dev/null
+++ b/src/evalops_workbench/eval/ledger.py
@@ -0,0 +1,39 @@
+"""Append-only run history, stored as a JSON array the stdlib endpoints can read.
+
+Each record is a slim summary of one published run. The history backs the
+run-over-run delta on /api/benchmark-latest and the trailing rollups
+(24h / 7d / 30d) on /api/stats. Trimmed to a bounded length so the committed
+artifact never grows without limit.
+"""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+_KEEP = 100
+
+
+def read_records(path: str | Path) -> list[dict]:  # best-effort read of the history file
+    path = Path(path)
+    if not path.exists():
+        return []
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, OSError, ValueError):
+        return []  # unreadable or malformed history is treated as empty
+    return data if isinstance(data, list) else []  # anything but a JSON array reads as empty
+
+
+def previous_record(path: str | Path) -> dict | None:
+    """Return the last record currently on disk, or ``None`` when the history is empty."""
+    history = read_records(path)
+    return history[-1] if history else None
+
+
+def append_record(path: str | Path, record: dict, *, keep: int = _KEEP) -> list[dict]:
+    """Append ``record`` and persist the newest ``keep`` runs (none when ``keep <= 0``)."""
+    records = [*read_records(path), record]
+    # ``records[-0:]`` is the whole list, so a non-positive ``keep`` needs an explicit guard.
+    trimmed = records[-keep:] if keep > 0 else []
+    Path(path).write_text(json.dumps(trimmed, indent=2) + "\n", encoding="utf-8")
+    return trimmed
diff --git a/src/evalops_workbench/eval/normalize.py b/src/evalops_workbench/eval/normalize.py
new file mode 100644
index 0000000..093dd2d
--- /dev/null
+++ b/src/evalops_workbench/eval/normalize.py
@@ -0,0 +1,34 @@
+"""Answer-text normalization, following the SQuAD evaluation convention.
+
+Normalization is the contract that makes scores comparable: "The Nile." and
+"nile" must score as equal. Keeping it in one place means every scorer shares
+the exact same notion of equality.
+"""
+from __future__ import annotations
+
+import re
+import string
+
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.IGNORECASE)
+_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
+_WHITESPACE = re.compile(r"\s+")
+
+
+def normalize_answer(text: str) -> str:
+    """Canonicalize an answer string so scorers compare like with like.
+
+    Steps, in the SQuAD v1.1 order: lowercase, delete ``string.punctuation``
+    characters, blank out the articles a/an/the, collapse whitespace and trim.
+    """
+    if not text:
+        return ""
+    lowered = text.lower()
+    stripped = lowered.translate(_PUNCT_TABLE)
+    without_articles = _ARTICLES.sub(" ", stripped)
+    return _WHITESPACE.sub(" ", without_articles).strip()
+
+
+def tokenize(text: str) -> list[str]:
+    """Split the normalized form of ``text`` on whitespace; empty input gives ``[]``."""
+    # ``"".split()`` already returns ``[]``, so empty text needs no special case.
+    return normalize_answer(text).split()
diff --git a/src/evalops_workbench/eval/report.py b/src/evalops_workbench/eval/report.py
new file mode 100644
index 0000000..cb085e7
--- /dev/null
+++ b/src/evalops_workbench/eval/report.py
@@ -0,0 +1,66 @@
+"""Render a downloadable markdown report from a benchmark artifact."""
+from __future__ import annotations
+
+
+def render(artifact: dict) -> str:  # builds the markdown text only; the caller writes it
+    metrics = artifact["metrics"]
+    lines = [
+        f"# EvalOps benchmark: {artifact['fixture']}",
+        "",
+        f"- Run: `{artifact['run_id']}`",
+        f"- Generated: {artifact['generated_at']}",
+        f"- Cases: {metrics['n_cases']}",
+        f"- Gate verdict: **{metrics['gate_verdict'].upper()}**",
+        "",
+        "## Variants",
+        "",
+        "| Variant | Exact match | Token F1 | Contains gold |",
+        "| --- | --- | --- | --- |",
+    ]
+    for variant in artifact["variants"]:  # one table row per variant
+        suffix = ""
+        if variant["name"] == metrics["candidate_variant"]:
+            suffix = " (candidate)"
+        elif variant["name"] == metrics["baseline_variant"]:
+            suffix = " (baseline)"
+        lines.append(
+            f"| {variant['name']}{suffix} | {variant['exact_match']:.3f} | "
+            f"{variant['token_f1']:.3f} | {variant['contains_gold']:.3f} |"
+        )
+
+    lines += [
+        "",
+        f"Candidate against baseline: exact match {metrics['improvement_exact_match']:+.3f}, "
+        f"token F1 {metrics['improvement_token_f1']:+.3f}.",
+        "",
+        f"## Regressions ({metrics['regressions']})",
+        "",
+    ]
+    if artifact["regressions"]:
+        lines += [
+            "| Case | Baseline | Candidate | Delta | Reason |",
+            "| --- | --- | --- | --- | --- |",
+        ]
+        for reg in artifact["regressions"]:
+            lines.append(
+                f"| {reg['case_id']} | {reg['baseline']:.2f} | {reg['candidate']:.2f} | "
+                f"{reg['delta']:+.2f} | {reg['reason']} |"
+            )
+    else:  # empty regression list: say so instead of emitting an empty table
+        lines.append("None. The candidate scored at or above the baseline on every case.")
+
+    lines += [
+        "",
+        "## Reproduce",
+        "",
+        "```bash",
+        "git clone https://github.com/IgnazioDS/evalops-workbench",
+        "cd evalops-workbench && pip install -e .",
+        "python -m evalops_workbench.benchmark_runner",
+        "```",
+        "",
+        f"Fixture: [`{artifact['fixture']}`]({artifact['artifact_urls']['fixture']}). "
+        "Every case, label, and score is reproducible offline with no credentials.",
+        "",
+    ]
+    return "\n".join(lines)  # the final "" element makes the text end with a newline
diff --git a/src/evalops_workbench/eval/runner.py b/src/evalops_workbench/eval/runner.py
new file mode 100644
index 0000000..c7ed344
--- /dev/null
+++ b/src/evalops_workbench/eval/runner.py
@@ -0,0 +1,95 @@
+"""Run a target over the dataset, aggregate scores, and diff two runs.
+
+``run_target`` produces an immutable ``RunResult`` (per-case scores + aggregate
+means). ``compare`` diffs a candidate against a baseline run and returns the
+per-case regressions — the cases where the candidate scored strictly worse.
+"""
+from __future__ import annotations
+
+from collections.abc import Sequence
+from dataclasses import dataclass
+
+from .cases import Case
+from .scorers import SCORERS, score_prediction
+from .targets import Target
+
+_REGRESSION_EPSILON = 1e-9
+
+
+@dataclass(frozen=True)
+class CaseScore:
+    case_id: str
+    prediction: str  # raw output the target returned for this case
+    scores: dict[str, float]  # metric name -> score for this case
+    tags: tuple[str, ...]  # copied from the case
+
+
+@dataclass(frozen=True)
+class RunResult:
+    target: str  # name of the variant that produced this run
+    n_cases: int
+    aggregate: dict[str, float]  # metric name -> mean over all cases, rounded to 6 places
+    case_scores: tuple[CaseScore, ...]  # per-case detail, in dataset order
+
+
+def run_target(name: str, target: Target, cases: Sequence[Case]) -> RunResult:
+    """Call ``target`` on each case, score its prediction, and average every metric."""
+    sums = dict.fromkeys(SCORERS, 0.0)
+    scored: list[CaseScore] = []
+    for case in cases:
+        answer = target(case)
+        per_metric = score_prediction(answer, case.answers)
+        for metric_name in per_metric:
+            sums[metric_name] += per_metric[metric_name]
+        scored.append(CaseScore(case.id, answer, per_metric, case.tags))
+    n_cases = len(cases)
+    divisor = n_cases or 1  # an empty dataset averages to 0.0 instead of dividing by zero
+    means = {metric_name: round(total / divisor, 6) for metric_name, total in sums.items()}
+    return RunResult(name, n_cases, means, tuple(scored))
+
+
+@dataclass(frozen=True)
+class Regression:
+    case_id: str
+    metric: str  # metric the two runs were compared on
+    baseline: float  # baseline score for this case
+    candidate: float  # candidate score for this case
+    delta: float  # candidate minus baseline
+    tags: tuple[str, ...]
+
+    @property
+    def reason(self) -> str:
+        return (  # one-line explanation with scores shown to two decimals
+            f"{self.metric}: candidate {self.candidate:.2f} below baseline "
+            f"{self.baseline:.2f} ({self.delta:+.2f})"
+        )
+
+
+def compare(
+    baseline: RunResult,
+    candidate: RunResult,
+    *,
+    metric: str = "token_f1",
+    threshold: float = _REGRESSION_EPSILON,
+) -> list[Regression]:
+    """List cases whose candidate ``metric`` falls more than ``threshold`` below baseline.
+
+    Candidate cases with no baseline counterpart are skipped, and the output keeps
+    the candidate run's case order. A metric missing from a case is read as 0.0.
+    """
+    # Index the baseline by case id so each candidate case is a single lookup.
+    reference = {score.case_id: score for score in baseline.case_scores}
+    found: list[Regression] = []
+    for current in candidate.case_scores:
+        if current.case_id not in reference:
+            continue
+        before = reference[current.case_id].scores.get(metric, 0.0)
+        after = current.scores.get(metric, 0.0)
+        # Scores are rounded only for reporting; the comparison uses the raw values.
+        if after < before - threshold:
+            regression = Regression(
+                current.case_id, metric, round(before, 6), round(after, 6),
+                round(after - before, 6), current.tags,
+            )
+            found.append(regression)
+    return found
diff --git a/src/evalops_workbench/eval/scorers.py b/src/evalops_workbench/eval/scorers.py
new file mode 100644
index 0000000..033b7c3
--- /dev/null
+++ b/src/evalops_workbench/eval/scorers.py
@@ -0,0 +1,60 @@
+"""Rubric functions: exact match, token-overlap F1, and gold containment.
+
+Each scorer takes a single prediction string and the tuple of acceptable gold
+answers, and returns a float in [0.0, 1.0]. A prediction scores against its
+best-matching gold (the standard for multi-reference QA).
+"""
+from __future__ import annotations
+
+from collections import Counter
+from collections.abc import Sequence
+
+from .normalize import normalize_answer, tokenize
+
+
+def exact_match(prediction: str, golds: Sequence[str]) -> float:
+    """Return 1.0 when the normalized prediction equals some normalized gold, else 0.0."""
+    target = normalize_answer(prediction)
+    return float(any(normalize_answer(gold) == target for gold in golds))
+
+
+def _pairwise_f1(prediction: str, gold: str) -> float:
+    """Token-overlap F1 between one prediction and one gold answer."""
+    predicted, expected = tokenize(prediction), tokenize(gold)
+    if not (predicted and expected):
+        # Two empty token lists agree; a single empty side cannot match.
+        return float(predicted == expected)
+    # Multiset intersection: a repeated token counts once per matched pair.
+    overlap = sum((Counter(predicted) & Counter(expected)).values())
+    if not overlap:
+        return 0.0
+    precision = overlap / len(predicted)
+    recall = overlap / len(expected)
+    return 2 * precision * recall / (precision + recall)
+
+
+def token_f1(prediction: str, golds: Sequence[str]) -> float:
+    """Highest pairwise token F1 over the gold answers; 0.0 when there are none."""
+    return max([_pairwise_f1(prediction, gold) for gold in golds], default=0.0)
+
+
+def contains_gold(prediction: str, golds: Sequence[str]) -> float:
+    """1.0 when some non-empty normalized gold occurs inside the normalized prediction."""
+    haystack = normalize_answer(prediction)
+    needles = (normalize_answer(gold) for gold in golds)
+    # Empty golds are skipped: "" is a substring of every string and would
+    # otherwise make any prediction count as a hit.
+    hit = any(needle and needle in haystack for needle in needles)
+    return 1.0 if hit else 0.0
+
+
+SCORERS = {
+    "exact_match": exact_match,
+    "token_f1": token_f1,
+    "contains_gold": contains_gold,
+}
+
+
+def score_prediction(prediction: str, golds: Sequence[str]) -> dict[str, float]:
+    """Apply each scorer in ``SCORERS`` and map its name to the resulting score."""
+    return {name: SCORERS[name](prediction, golds) for name in SCORERS}
diff --git a/src/evalops_workbench/eval/targets.py b/src/evalops_workbench/eval/targets.py
new file mode 100644
index 0000000..cc2e4ce
--- /dev/null
+++ b/src/evalops_workbench/eval/targets.py
@@ -0,0 +1,147 @@
+"""The system-under-test: named, deterministic extractive-QA strategies.
+
+These are the "variants" an EvalOps user would compare, made concrete and
+credential-free so the public benchmark reproduces anywhere. Each target maps a
+``Case`` to a predicted answer string. They differ in real, explainable ways, so
+the harness surfaces a genuine quality delta (and genuine per-case regressions),
+not a manufactured one.
+
+A live-LLM target would implement the same ``Target`` signature; it is an
+optional extension, never required by the public benchmark.
+"""
+from __future__ import annotations
+
+import re
+from collections.abc import Callable
+
+from .cases import Case
+from .normalize import normalize_answer
+
+Target = Callable[[Case], str]
+
+_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
+_YEAR = re.compile(r"\b(?:1[0-9]{3}|2[0-9]{3})\b")
+_NUMBER = re.compile(r"\b\d[\d,]*(?:\.\d+)?\b")
+_PROPER_NOUN = re.compile(r"\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b")
+
+_STOPWORDS = frozenset(
+    """
+    a an the of to in on at for and or but is are was were be been being am
+    this that these those with as by from into over under after before it its
+    their his her him she he they them we you your our
+    what when where who whom which why how whose does did do done has have had
+    """.split()
+)
+
+
+def split_sentences(text: str) -> list[str]:
+    """Break ``text`` on whitespace after ., ! or ?; return the stripped, non-empty parts."""
+    return list(filter(None, map(str.strip, _SENTENCE_SPLIT.split(text.strip()))))
+
+
+def _content_tokens(text: str) -> set[str]:
+    return set(normalize_answer(text).split()) - _STOPWORDS  # normalized tokens minus stopwords
+
+
+def _best_sentence(case: Case) -> str:
+    """Pick the context sentence sharing the most content words with the question.
+
+    Equal overlaps go to the sentence with fewer content tokens, then to the
+    earliest one. Baseline and candidate both start from this retrieval step.
+    """
+    candidates = split_sentences(case.context)
+    if not candidates:
+        return case.context.strip()
+    wanted = _content_tokens(case.question)
+    if not wanted:
+        return candidates[0]
+
+    def score(sentence: str) -> tuple[int, int]:
+        words = _content_tokens(sentence)
+        return (len(wanted & words), -len(words))
+
+    return max(candidates, key=score)
+
+
+def _question_word(question: str) -> str:
+    """First normalized token of the question, or "" when it has none."""
+    return next(iter(normalize_answer(question).split()), "")
+
+
+def _first_novel_proper_noun(sentence: str, question: str) -> str | None:
+    """Earliest capitalized phrase sharing no content word with the question.
+
+    Stopword-only phrases (such as a bare "The") and phrases that overlap the
+    question's content words are passed over, which skips the subject being
+    asked about. Taking the *first* survivor is deliberate: it is also why a
+    distractor entity placed before the answer wins.
+    """
+    asked = _content_tokens(question)
+    for found in _PROPER_NOUN.finditer(sentence):
+        phrase = found.group(0)
+        words = _content_tokens(phrase)
+        if words and words.isdisjoint(asked):
+            return phrase
+    return None
+
+
+def first_sentence(case: Case) -> str:
+    """Floor strategy: answer with the context's opening sentence, whatever the question."""
+    opening = split_sentences(case.context)[:1]
+    return opening[0] if opening else case.context.strip()
+
+
+def overlap_sentence(case: Case) -> str:
+    """Baseline: return the whole best-overlap sentence.
+
+    Safe but blunt — it scores partial F1 on most cases and rarely an exact
+    match, because it returns far more than the answer span.
+    """
+    return _best_sentence(case)
+
+
+def span_extract(case: Case) -> str:
+    """Candidate: locate the best sentence, then narrow to an answer span.
+
+    "when"/"what year"/"which year" questions return the first year (else first
+    number); "how many/much/long/old" the first number; "who"/"whom"/"where" the
+    first proper noun the question does not name. Taking the *first* match is
+    what misfires on adversarial phrasing; with no match the sentence is returned.
+    """
+    sentence = _best_sentence(case)
+    qword = _question_word(case.question)
+    lowered = case.question.lower()
+
+    if qword == "when" or "what year" in lowered or "which year" in lowered:
+        match = _YEAR.search(sentence) or _NUMBER.search(sentence)
+        if match:
+            return match.group(0)
+    if qword == "how" and any(
+        phrase in lowered for phrase in ("how many", "how much", "how long", "how old")
+    ):
+        match = _NUMBER.search(sentence)
+        if match:
+            return match.group(0)
+    if qword in {"who", "whom", "where"}:
+        span = _first_novel_proper_noun(sentence, case.question)
+        if span:
+            return span
+
+    # No confident span: fall back to the best sentence (never worse than baseline here).
+    return sentence
+
+
+REGISTRY: dict[str, Target] = {
+    "first_sentence": first_sentence,
+    "overlap_sentence": overlap_sentence,
+    "span_extract": span_extract,
+}
+
+
+def get_target(name: str) -> Target:
+    """Look up a registered target by name; raise ``ValueError`` for unknown names."""
+    target = REGISTRY.get(name)
+    if target is None:
+        known = sorted(REGISTRY)
+        raise ValueError(f"unknown target {name!r}; known targets: {known}")
+    return target
diff --git a/src/evalops_workbench/project.json b/src/evalops_workbench/project.json
index 3a32958..3265713 100644
--- a/src/evalops_workbench/project.json
+++ b/src/evalops_workbench/project.json
@@ -3,15 +3,14 @@
   "name": "EvalOps Workbench",
   "category": "Developer Tool",
   "track": "LLM",
-  "stage": "Researching",
+  "stage": "Prototype",
   "summary": "A local-first evaluation harness for prompts, tools, and agents with regression tracking and experiment history.",
   "problem": "LLM teams lack a lightweight way to compare prompt and tool changes before shipping.",
   "users": "Agent builders, prompt engineers, applied AI teams",
   "stack": [
     "Python",
-    "Typer",
-    "DuckDB",
-    "OpenTelemetry"
+    "GitHub Actions",
+    "Vercel"
   ],
   "why_now": "Evaluation is moving from optional best practice to baseline engineering hygiene.",
   "mvp": [
diff --git a/src/lib/api.ts b/src/lib/api.ts
index aa5474c..1e9faa0 100644
--- a/src/lib/api.ts
+++ b/src/lib/api.ts
@@ -1,6 +1,7 @@
-// Slim API surface for the showcase dashboard.
-// Only the public /api/stats endpoint is real on showcase deploys; the
-// Tier-A BFF endpoints (run, documents, ui/*) don't exist here.
+// Slim API surface for the dashboard.
+// Two public, unauthenticated endpoints are real on this deploy:
+//   /api/stats            — Tier-A telemetry (TELEMETRY_SCHEMA.md)
+//   /api/benchmark-latest — the latest published benchmark run
 
 async function publicFetch<T>(path: string, init?: RequestInit): Promise<T> {
   const res = await fetch(path, {
@@ -13,19 +14,21 @@ async function publicFetch<T>(path: string, init?: RequestInit): Promise<T> {
   return res.json() as Promise<T>;
 }
 
-/** Tier-B telemetry response — see TELEMETRY_SCHEMA.md. */
+/** Tier-A telemetry response — see TELEMETRY_SCHEMA.md. */
 export interface PublicStats {
   system: string;
   mode?: "live" | "showcase";
   status: "operational" | "degraded" | "down";
   last_deployed_at: string | null;
+  last_active_at?: string | null;
   last_commit_at?: string | null;
   metrics: {
-    commits_30d?: number;
-    commits_total?: number;
-    primary_language?: string;
-    repo_stars?: number;
-    lines_of_code?: number;
+    eval_runs_total?: number;
+    eval_runs_24h?: number;
+    last_pass_rate?: number;
+    rolling_pass_rate_7d?: number;
+    regressions_caught_30d?: number;
+    experiments_tracked?: number;
     [key: string]: number | string | undefined;
   };
   schema_version: number;
@@ -35,3 +38,67 @@ export interface PublicStats {
 export function fetchPublicStats(): Promise<PublicStats> {
   return publicFetch<PublicStats>("/api/stats");
 }
+
+/** One strategy's aggregate scores in a benchmark run. */
+export interface BenchmarkVariant {
+  name: string;
+  exact_match: number;
+  token_f1: number;
+  contains_gold: number;
+}
+
+/** A single per-case regression surfaced by the run. */
+export interface BenchmarkRegression {
+  case_id: string;
+  metric: string;
+  baseline: number;
+  candidate: number;
+  delta: number;
+  reason: string;
+  tags: string[];
+}
+
+export interface BenchmarkMetrics {
+  n_cases: number;
+  baseline_variant: string;
+  candidate_variant: string;
+  exact_match: number;
+  token_f1: number;
+  contains_gold: number;
+  improvement_exact_match: number;
+  improvement_token_f1: number;
+  regressions: number;
+  pass_rate: number;
+  gate_verdict: string;
+}
+
+/** /api/benchmark-latest response — see the benchmark-latest spec in TELEMETRY_SCHEMA.md. */
+export interface PublicBenchmark {
+  system: string;
+  benchmark_type: string;
+  status?: string;
+  run_id: string | null;
+  fixture?: string;
+  metrics: BenchmarkMetrics | null;
+  variants?: BenchmarkVariant[];
+  regressions?: BenchmarkRegression[];
+  gate?: {
+    passed: boolean;
+    metric: string;
+    pinned: number | null;
+    observed: number;
+    reasons: string[];
+  };
+  artifact_urls?: { report: string; fixture: string; run: string };
+  schema_version: number;
+  generated_at: string;
+  previous_run?: {
+    run_id: string | null;
+    generated_at: string | null;
+    delta: Record<string, number>;
+  } | null;
+}
+
+export function fetchBenchmarkLatest(): Promise<PublicBenchmark> {
+  return publicFetch<PublicBenchmark>("/api/benchmark-latest");
+}
diff --git a/src/lib/project.ts b/src/lib/project.ts
index 5a59dab..954d7b2 100644
--- a/src/lib/project.ts
+++ b/src/lib/project.ts
@@ -26,13 +26,13 @@ export const PROJECT: ProjectSpec = {
   name: "EvalOps Workbench",
   category: "Developer Tool",
   track: "LLM",
-  stage: "Researching",
+  stage: "Prototype",
   summary:
     "A local-first evaluation harness for prompts, tools, and agents with regression tracking and experiment history.",
   problem:
     "LLM teams lack a lightweight way to compare prompt and tool changes before shipping.",
   users: "Agent builders, prompt engineers, applied AI teams",
-  stack: ["Python", "Typer", "DuckDB", "OpenTelemetry"],
+  stack: ["Python", "GitHub Actions", "Vercel"],
   why_now:
     "Evaluation is moving from optional best practice to baseline engineering hygiene.",
   mvp: [
diff --git a/tests/test_benchmark_endpoint.py b/tests/test_benchmark_endpoint.py
new file mode 100644
index 0000000..c24df67
--- /dev/null
+++ b/tests/test_benchmark_endpoint.py
@@ -0,0 +1,75 @@
+"""Unit tests for the /api/benchmark-latest serverless function.
+
+The module file name contains a hyphen (matching the Vercel route), so it is
+loaded by path rather than imported by name.
+"""
+from __future__ import annotations
+
+import importlib.util
+import json
+import tempfile
+import unittest
+from pathlib import Path
+
+_API_DIR = Path(__file__).resolve().parent.parent / "api"
+
+
+def _load_endpoint():
+    """Load api/benchmark-latest.py by path; its hyphenated name is not importable."""
+    path = _API_DIR / "benchmark-latest.py"
+    spec = importlib.util.spec_from_file_location("benchmark_latest_endpoint", path)
+    assert spec is not None and spec.loader is not None  # validate before first use
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+class BenchmarkEndpointTests(unittest.TestCase):
+    def setUp(self) -> None:
+        self.mod = _load_endpoint()
+        self._orig_artifact = self.mod.ARTIFACT_FILE
+
+    def tearDown(self) -> None:
+        self.mod.ARTIFACT_FILE = self._orig_artifact
+
+    def test_returns_committed_artifact(self) -> None:
+        artifact = {
+            "system": "evalops",
+            "benchmark_type": "eval",
+            "run_id": "evalops-2026-05-26-abc12345",
+            "metrics": {"token_f1": 0.65},
+            "schema_version": 1,
+            "generated_at": "2026-05-26T00:00:00Z",
+        }
+        path = Path(tempfile.mkdtemp()) / "_benchmark_latest.json"
+        path.write_text(json.dumps(artifact), encoding="utf-8")
+        self.mod.ARTIFACT_FILE = path
+
+        response = self.mod.build_response()
+        self.assertEqual(response["system"], "evalops")
+        self.assertEqual(response["run_id"], "evalops-2026-05-26-abc12345")
+        self.assertEqual(response["schema_version"], 1)
+
+    def test_pending_when_artifact_missing(self) -> None:
+        self.mod.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json")
+        response = self.mod.build_response()
+        self.assertEqual(response["system"], "evalops")
+        self.assertEqual(response["status"], "pending")
+        self.assertEqual(response["benchmark_type"], "eval")
+        self.assertIsNone(response["run_id"])
+        self.assertEqual(response["schema_version"], 1)
+
+    def test_seeded_repo_artifact_is_valid(self) -> None:
+        """The committed artifact in the repo must be schema-valid."""
+        response = self.mod.build_response()
+        # In the repo the seed exists; if a developer cleared it, accept pending.
+        if response.get("status") == "pending":
+            self.skipTest("no seeded artifact present")
+        self.assertEqual(response["system"], "evalops")
+        self.assertEqual(response["schema_version"], 1)
+        self.assertIn("metrics", response)
+        self.assertIn("generated_at", response)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_eval_engine.py b/tests/test_eval_engine.py
new file mode 100644
index 0000000..a09c848
--- /dev/null
+++ b/tests/test_eval_engine.py
@@ -0,0 +1,272 @@
+"""Unit tests for the EvalOps evaluation engine."""
+from __future__ import annotations
+
+import json
+import tempfile
+import unittest
+from pathlib import Path
+
+from evalops_workbench.eval import artifact as artifact_mod
+from evalops_workbench.eval import baseline as baseline_mod
+from evalops_workbench.eval import ledger as ledger_mod
+from evalops_workbench.eval.cases import Case, load_cases
+from evalops_workbench.eval.normalize import normalize_answer, tokenize
+from evalops_workbench.eval.runner import compare, run_target
+from evalops_workbench.eval.scorers import (
+    contains_gold,
+    exact_match,
+    score_prediction,
+    token_f1,
+)
+from evalops_workbench.eval.targets import (
+    first_sentence,
+    get_target,
+    overlap_sentence,
+    span_extract,
+    split_sentences,
+)
+
+
+class NormalizeTests(unittest.TestCase):
+    def test_strips_articles_punctuation_and_case(self) -> None:
+        self.assertEqual(normalize_answer("The Nile."), "nile")
+        self.assertEqual(normalize_answer("  A  House! "), "house")
+
+    def test_empty(self) -> None:
+        self.assertEqual(normalize_answer(""), "")
+        self.assertEqual(tokenize("the a an"), [])
+
+    def test_tokenize(self) -> None:
+        self.assertEqual(tokenize("Hello, World!"), ["hello", "world"])
+
+
+class ScorerTests(unittest.TestCase):
+    def test_exact_match_normalizes(self) -> None:
+        self.assertEqual(exact_match("Paris", ["paris"]), 1.0)
+        self.assertEqual(exact_match("Paris, France", ["Paris"]), 0.0)
+
+    def test_token_f1_bounds(self) -> None:
+        self.assertEqual(token_f1("Paris", ["Paris"]), 1.0)
+        self.assertEqual(token_f1("London", ["Paris"]), 0.0)
+        partial = token_f1("the answer is Paris", ["Paris"])
+        self.assertTrue(0.0 < partial < 1.0)
+
+    def test_contains_gold(self) -> None:
+        self.assertEqual(contains_gold("the capital is Paris", ["Paris"]), 1.0)
+        self.assertEqual(contains_gold("London", ["Paris"]), 0.0)
+
+    def test_score_prediction_returns_all(self) -> None:
+        scores = score_prediction("Paris", ["Paris"])
+        self.assertEqual(set(scores), {"exact_match", "token_f1", "contains_gold"})
+
+
+class CaseLoaderTests(unittest.TestCase):
+    def _write(self, name: str, text: str) -> Path:
+        tmp = Path(tempfile.mkdtemp()) / name
+        tmp.write_text(text, encoding="utf-8")
+        return tmp
+
+    def test_load_jsonl_and_coerce_string_answer(self) -> None:
+        path = self._write(
+            "c.jsonl",
+            '{"id":"1","question":"q","context":"c","answers":"a"}\n',
+        )
+        cases = load_cases(path)
+        self.assertEqual(len(cases), 1)
+        self.assertEqual(cases[0].answers, ("a",))
+
+    def test_missing_field_raises(self) -> None:
+        path = self._write("c.jsonl", '{"id":"1","question":"q","context":"c"}\n')
+        with self.assertRaises(ValueError):
+            load_cases(path)
+
+    def test_duplicate_ids_raise(self) -> None:
+        path = self._write(
+            "c.jsonl",
+            '{"id":"1","question":"q","context":"c","answers":"a"}\n'
+            '{"id":"1","question":"q","context":"c","answers":"b"}\n',
+        )
+        with self.assertRaises(ValueError):
+            load_cases(path)
+
+    def test_unsupported_extension_raises(self) -> None:
+        path = self._write("c.txt", "nope")
+        with self.assertRaises(ValueError):
+            load_cases(path)
+
+    def test_missing_file_raises(self) -> None:
+        with self.assertRaises(FileNotFoundError):
+            load_cases(Path(tempfile.mkdtemp()) / "absent.jsonl")
+
+
+class TargetTests(unittest.TestCase):
+    def test_split_sentences(self) -> None:
+        self.assertEqual(split_sentences("One. Two! Three?"), ["One.", "Two!", "Three?"])
+
+    def test_candidate_extracts_entity(self) -> None:
+        case = Case(
+            "t",
+            "Where is the Eiffel Tower located?",
+            "The Eiffel Tower is located in Paris. It opened in 1889.",
+            ("Paris",),
+        )
+        self.assertEqual(span_extract(case), "Paris")
+        self.assertEqual(overlap_sentence(case), "The Eiffel Tower is located in Paris.")
+        self.assertEqual(first_sentence(case), "The Eiffel Tower is located in Paris.")
+
+    def test_candidate_extracts_year_and_number(self) -> None:
+        when = Case("w", "When was the telephone patented?", "The telephone was patented in 1876.", ("1876",))
+        self.assertEqual(span_extract(when), "1876")
+        howmany = Case("h", "How many continents are there on Earth?", "There are 7 continents on Earth.", ("7",))
+        self.assertEqual(span_extract(howmany), "7")
+
+    def test_candidate_regresses_on_distractor(self) -> None:
+        case = Case(
+            "adv",
+            "Who composed the Ninth Symphony?",
+            "The Ninth Symphony was first performed in Vienna and is attributed to Beethoven.",
+            ("Beethoven",),
+        )
+        # The candidate grabs the first novel entity (the distractor), not the answer.
+        self.assertEqual(span_extract(case), "Vienna")
+
+    def test_get_target_unknown_raises(self) -> None:
+        with self.assertRaises(ValueError):
+            get_target("nope")
+
+
+class RunnerTests(unittest.TestCase):
+    def setUp(self) -> None:
+        self.cases = [
+            Case("good", "Where is the Eiffel Tower located?",
+                 "The Eiffel Tower is located in Paris.", ("Paris",)),
+            Case("adv", "Who composed the Ninth Symphony?",
+                 "The Ninth Symphony premiered in Vienna and is attributed to Beethoven.", ("Beethoven",)),
+        ]
+
+    def test_run_target_aggregate(self) -> None:
+        result = run_target("span_extract", get_target("span_extract"), self.cases)
+        self.assertEqual(result.n_cases, 2)
+        self.assertIn("token_f1", result.aggregate)
+        self.assertEqual(len(result.case_scores), 2)
+
+    def test_compare_finds_only_real_regressions(self) -> None:
+        baseline = run_target("overlap_sentence", get_target("overlap_sentence"), self.cases)
+        candidate = run_target("span_extract", get_target("span_extract"), self.cases)
+        regressions = compare(baseline, candidate, metric="token_f1")
+        regressed = {r.case_id for r in regressions}
+        self.assertIn("adv", regressed)
+        self.assertNotIn("good", regressed)
+
+
+class BaselineGateTests(unittest.TestCase):
+    def _result(self, f1: float):
+        case = Case("x", "q", "c", ("a",))
+        result = run_target("first_sentence", lambda _c: "a", [case])
+        # Override aggregate for a controlled gate test.
+        object.__setattr__(result, "aggregate", {"token_f1": f1, "exact_match": f1, "contains_gold": f1})
+        return result
+
+    def test_no_pin_establishes_contract(self) -> None:
+        verdict = baseline_mod.evaluate_gate(self._result(0.5), None)
+        self.assertTrue(verdict.passed)
+        self.assertIsNone(verdict.pinned)
+
+    def test_above_floor_passes_below_fails(self) -> None:
+        pinned = {"aggregate": {"token_f1": 0.60}}
+        self.assertTrue(baseline_mod.evaluate_gate(self._result(0.59), pinned).passed)
+        self.assertFalse(baseline_mod.evaluate_gate(self._result(0.50), pinned).passed)
+
+    def test_pin_roundtrip(self) -> None:
+        path = Path(tempfile.mkdtemp()) / "pin.json"
+        result = self._result(0.7)
+        baseline_mod.save_pinned(path, result)
+        loaded = baseline_mod.load_pinned(path)
+        self.assertEqual(loaded["aggregate"]["token_f1"], 0.7)
+
+
+class LedgerTests(unittest.TestCase):
+    def test_append_trim_previous(self) -> None:
+        path = Path(tempfile.mkdtemp()) / "hist.json"
+        self.assertIsNone(ledger_mod.previous_record(path))
+        ledger_mod.append_record(path, {"run_id": "a", "generated_at": "t1"})
+        ledger_mod.append_record(path, {"run_id": "b", "generated_at": "t2"}, keep=5)
+        self.assertEqual(ledger_mod.previous_record(path)["run_id"], "b")
+        self.assertEqual(len(ledger_mod.read_records(path)), 2)
+
+    def test_trim_to_keep(self) -> None:
+        path = Path(tempfile.mkdtemp()) / "hist.json"
+        for i in range(10):
+            ledger_mod.append_record(path, {"run_id": str(i)}, keep=3)
+        records = ledger_mod.read_records(path)
+        self.assertEqual(len(records), 3)
+        self.assertEqual(records[-1]["run_id"], "9")
+
+
+class ArtifactTests(unittest.TestCase):
+    def _results(self):
+        cases = [
+            Case("good", "Where is the Eiffel Tower located?",
+                 "The Eiffel Tower is located in Paris.", ("Paris",)),
+            Case("adv", "Who composed the Ninth Symphony?",
+                 "The Ninth Symphony premiered in Vienna and is attributed to Beethoven.", ("Beethoven",)),
+        ]
+        return {name: run_target(name, get_target(name), cases)
+                for name in ("first_sentence", "overlap_sentence", "span_extract")}
+
+    def test_build_artifact_shape(self) -> None:
+        results = self._results()
+        regressions = compare(results["overlap_sentence"], results["span_extract"])
+        verdict = baseline_mod.evaluate_gate(results["span_extract"], None)
+        art = artifact_mod.build_artifact(
+            fixture_id="benchmark-v1",
+            fixture_rel="examples/benchmark-v1/cases.jsonl",
+            results=results,
+            baseline_name="overlap_sentence",
+            candidate_name="span_extract",
+            regressions=regressions,
+            verdict=verdict,
+            previous=None,
+            generated_at="2026-05-26T00:00:00Z",
+        )
+        self.assertEqual(art["system"], "evalops")
+        self.assertEqual(art["benchmark_type"], "eval")
+        self.assertEqual(art["schema_version"], 1)
+        self.assertTrue(art["run_id"].startswith("evalops-2026-05-26-"))
+        self.assertIsNone(art["previous_run"])
+        self.assertEqual(len(art["variants"]), 3)
+        self.assertIn("pass_rate", art["metrics"])
+
+    def test_previous_run_delta(self) -> None:
+        results = self._results()
+        verdict = baseline_mod.evaluate_gate(results["span_extract"], None)
+        previous = {"run_id": "prev", "generated_at": "2026-05-25T00:00:00Z",
+                    "token_f1": 0.10, "exact_match": 0.10, "regressions": 1}
+        art = artifact_mod.build_artifact(
+            fixture_id="benchmark-v1", fixture_rel="x", results=results,
+            baseline_name="overlap_sentence", candidate_name="span_extract",
+            regressions=[], verdict=verdict, previous=previous,
+            generated_at="2026-05-26T00:00:00Z",
+        )
+        self.assertEqual(art["previous_run"]["run_id"], "prev")
+        self.assertIn("token_f1", art["previous_run"]["delta"])
+
+    def test_slim_record_keys(self) -> None:
+        results = self._results()
+        verdict = baseline_mod.evaluate_gate(results["span_extract"], None)
+        art = artifact_mod.build_artifact(
+            fixture_id="benchmark-v1", fixture_rel="x", results=results,
+            baseline_name="overlap_sentence", candidate_name="span_extract",
+            regressions=[], verdict=verdict, previous=None,
+            generated_at="2026-05-26T00:00:00Z",
+        )
+        record = artifact_mod.slim_record(art)
+        self.assertEqual(
+            set(record),
+            {"run_id", "generated_at", "pass_rate", "token_f1", "exact_match",
+             "regressions", "regressed_ids", "gate_verdict", "variants"},
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 32c876a..df001fb 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -1,156 +1,97 @@
-"""Unit tests for the /api/stats Vercel serverless function.
+"""Unit tests for the /api/stats Vercel serverless function (Tier A, live).
 
 Covers:
-- happy path: GitHub reachable, response shape matches Tier B contract
-- degraded path: GitHub unreachable, contract still satisfied with status="degraded"
-- safety caps: oversize values are clamped
-- never returns 5xx (handler always emits HTTP 200)
+- live path: benchmark history present, metrics derived from records
+- degraded path: no history yet, contract satisfied with zeroed metrics
+- schema shape matches the evalops Tier-A contract in TELEMETRY_SCHEMA.md
+- safety caps and the never-5xx handler guarantee
 """
 from __future__ import annotations
 
 import io
 import json
 import sys
+import tempfile
 import unittest
 from pathlib import Path
-from unittest.mock import MagicMock, patch
-from urllib.error import URLError
+from unittest.mock import MagicMock
 
-# Add repo root to sys.path so we can import the api/stats.py module.
+# Add the repo's api/ directory to sys.path so we can import api/stats.py.
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "api"))
 import stats  # type: ignore  # noqa: E402
 
+_TIER_A_METRICS = {
+    "eval_runs_total",
+    "eval_runs_24h",
+    "last_pass_rate",
+    "rolling_pass_rate_7d",
+    "regressions_caught_30d",
+    "experiments_tracked",
+}
 
-def _reset_cache() -> None:
-    stats._cache = {"ts": 0.0, "payload": None}
 
+def _write_history(records: list[dict]) -> Path:
+    path = Path(tempfile.mkdtemp()) / "_benchmark_history.json"
+    path.write_text(json.dumps(records), encoding="utf-8")
+    return path
 
-def _fake_response(body: object, link_header: str = "") -> MagicMock:
-    """Build a context-manager-compatible mock that mimics urlopen's return."""
-    raw = json.dumps(body).encode("utf-8")
-    cm = MagicMock()
-    cm.__enter__ = MagicMock(return_value=cm)
-    cm.__exit__ = MagicMock(return_value=False)
-    cm.read = MagicMock(return_value=raw)
-    cm.getheaders = MagicMock(
-        return_value=[("Link", link_header)] if link_header else []
-    )
-    return cm
 
-
-class ResponseShapeTests(unittest.TestCase):
+class LiveResponseTests(unittest.TestCase):
     def setUp(self) -> None:
-        _reset_cache()
-
-    def test_happy_path_matches_contract(self) -> None:
-        repo_payload = {"stargazers_count": 7, "language": "Python"}
-        commit_payload = [
-            {"commit": {"author": {"date": "2026-04-26T12:00:00Z"}}}
-        ]
-
-        def side_effect(req, timeout=None):
-            url = req.full_url
-            if "/commits" not in url:
-                return _fake_response(repo_payload)
-            return _fake_response(
-                commit_payload,
-                link_header=(
-                    f"<https://api.github.com/repositories/x/commits"
-                    f"?per_page=1&page=2>; rel=\"next\", "
-                    f"<https://api.github.com/repositories/x/commits"
-                    f"?per_page=1&page=42>; rel=\"last\""
-                ),
-            )
-
-        with patch.object(stats, "urlopen", side_effect=side_effect):
-            response = stats._build_response()
-
-        self.assertEqual(response["schema_version"], 1)
-        self.assertEqual(response["mode"], "showcase")
+        self._orig_history = stats.HISTORY_FILE
+        self._orig_artifact = stats.ARTIFACT_FILE
+
+    def tearDown(self) -> None:
+        stats.HISTORY_FILE = self._orig_history
+        stats.ARTIFACT_FILE = self._orig_artifact
+
+    def test_live_operational_from_history(self) -> None:
+        stats.HISTORY_FILE = _write_history(
+            [
+                {"run_id": "r1", "generated_at": stats._now_iso(), "pass_rate": 0.6,
+                 "regressions": 2, "regressed_ids": ["a", "b"],
+                 "variants": ["overlap_sentence", "span_extract"]},
+                {"run_id": "r2", "generated_at": stats._now_iso(), "pass_rate": 0.7,
+                 "regressions": 2, "regressed_ids": ["b", "c"],
+                 "variants": ["span_extract", "first_sentence"]},
+            ]
+        )
+        stats.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json")
+        response = stats._build_response()
+
+        self.assertEqual(response["mode"], "live")
         self.assertEqual(response["status"], "operational")
-        self.assertEqual(response["system"], stats.SYSTEM_SLUG)
-        self.assertIn("metrics", response)
-        self.assertEqual(response["metrics"]["repo_stars"], 7)
-        self.assertEqual(response["metrics"]["primary_language"], "Python")
-        self.assertEqual(response["metrics"]["commits_total"], 42)
-        self.assertEqual(response["last_commit_at"], "2026-04-26T12:00:00Z")
-        # generated_at is ISO-8601 with Z suffix.
+        self.assertEqual(response["schema_version"], 1)
+        self.assertEqual(set(response["metrics"]), _TIER_A_METRICS)
+        self.assertEqual(response["metrics"]["eval_runs_total"], 2)
+        self.assertEqual(response["metrics"]["last_pass_rate"], 0.7)
+        # Distinct regressions across the window: union of {a,b} and {b,c} = 3.
+        self.assertEqual(response["metrics"]["regressions_caught_30d"], 3)
+        self.assertEqual(response["metrics"]["experiments_tracked"], 3)
         self.assertTrue(response["generated_at"].endswith("Z"))
 
-    def test_degraded_when_github_unreachable(self) -> None:
-        with patch.object(stats, "urlopen", side_effect=URLError("offline")):
-            response = stats._build_response()
+    def test_degraded_without_history(self) -> None:
+        stats.HISTORY_FILE = Path("/nonexistent/_benchmark_history.json")
+        stats.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json")
+        response = stats._build_response()
 
-        self.assertEqual(response["schema_version"], 1)
-        self.assertEqual(response["mode"], "showcase")
+        self.assertEqual(response["mode"], "live")
         self.assertEqual(response["status"], "degraded")
-        self.assertEqual(response["metrics"]["commits_total"], 0)
-        self.assertEqual(response["metrics"]["repo_stars"], 0)
-        self.assertIsNone(response["last_commit_at"])
-
-    def test_serves_stale_cache_on_subsequent_failure(self) -> None:
-        # First call: successful. Second call: GitHub is down. Expect status
-        # to flip to "degraded" but the metric values from the cache are kept.
-        repo_payload = {"stargazers_count": 11, "language": "Go"}
-        commit_payload = [
-            {"commit": {"author": {"date": "2026-04-25T08:00:00Z"}}}
-        ]
-
-        def good(req, timeout=None):
-            if "/commits" not in req.full_url:
-                return _fake_response(repo_payload)
-            return _fake_response(
-                commit_payload,
-                link_header=(
-                    '<https://api.github.com/repositories/x/commits'
-                    '?per_page=1&page=2>; rel="next", '
-                    '<https://api.github.com/repositories/x/commits'
-                    '?per_page=1&page=99>; rel="last"'
-                ),
-            )
-
-        with patch.object(stats, "urlopen", side_effect=good):
-            first = stats._build_response()
-        self.assertEqual(first["status"], "operational")
-
-        with patch.object(stats, "_fetch_metrics", side_effect=URLError("offline")):
-            # Force cache miss by advancing the clock past the TTL.
-            stats._cache["ts"] = 0.0
-            stale = stats._build_response()
-        self.assertEqual(stale["status"], "degraded")
-        self.assertEqual(stale["metrics"]["repo_stars"], 11)
-        self.assertEqual(stale["metrics"]["commits_total"], 99)
+        self.assertEqual(response["metrics"], stats._zeroed_metrics())
+        self.assertIsNone(response["last_active_at"])
 
 
 class SafetyCapTests(unittest.TestCase):
-    def test_oversize_values_are_clamped(self) -> None:
-        self.assertEqual(stats._cap("repo_stars", 99_999_999), 1_000_000)
-        self.assertEqual(stats._cap("commits_total", 50_000_000), 1_000_000)
-        self.assertEqual(stats._cap("commits_30d", 500_000), 100_000)
-        self.assertEqual(stats._cap("lines_of_code", 999_999_999), 10_000_000)
-        # Unknown key passes through unchanged.
+    def test_caps_clamp(self) -> None:
+        self.assertEqual(stats._cap("eval_runs_total", 9_999_999), 1_000_000)
+        self.assertEqual(stats._cap("experiments_tracked", 999_999), 100_000)
         self.assertEqual(stats._cap("not_a_field", 42), 42)
 
 
 class HandlerTests(unittest.TestCase):
-    """Exercise the BaseHTTPRequestHandler entrypoint end-to-end."""
-
-    def setUp(self) -> None:
-        _reset_cache()
-
     def _invoke(self, method: str = "GET") -> tuple[int, dict[str, str], bytes]:
-        # Build a minimal raw HTTP request the handler can parse.
-        request_text = (
-            f"{method} /api/stats HTTP/1.0\r\nHost: x\r\n\r\n"
-        ).encode("utf-8")
-        rfile = io.BytesIO(request_text)
+        rfile = io.BytesIO(f"{method} /api/stats HTTP/1.0\r\nHost: x\r\n\r\n".encode())
         wfile = io.BytesIO()
-
-        class _Conn:
-            def makefile(self, *_args: object, **_kwargs: object) -> io.BytesIO:
-                return rfile
-
-        # BaseHTTPRequestHandler init runs the request automatically.
         h = stats.handler.__new__(stats.handler)
         h.rfile = rfile
         h.wfile = wfile
@@ -161,39 +102,34 @@ def makefile(self, *_args: object, **_kwargs: object) -> io.BytesIO:
         h.request_version = "HTTP/1.0"
         h.headers = {}
         h.requestline = f"{method} /api/stats HTTP/1.0"
-
         if method == "OPTIONS":
             h.do_OPTIONS()
         else:
-            with patch.object(stats, "urlopen", side_effect=URLError("test")):
-                h.do_GET()
-
+            h.do_GET()
         raw = wfile.getvalue().decode("utf-8", errors="replace")
         head, _, body = raw.partition("\r\n\r\n")
-        status_line = head.split("\r\n", 1)[0]
-        status_code = int(status_line.split(" ", 2)[1])
-        hdrs = {}
+        status_code = int(head.split("\r\n", 1)[0].split(" ", 2)[1])
+        headers = {}
         for line in head.split("\r\n")[1:]:
             if ": " in line:
-                k, v = line.split(": ", 1)
-                hdrs[k] = v
-        return status_code, hdrs, body.encode("utf-8")
+                key, value = line.split(": ", 1)
+                headers[key] = value
+        return status_code, headers, body.encode("utf-8")
 
-    def test_get_returns_200_even_when_upstream_fails(self) -> None:
-        status, hdrs, body = self._invoke("GET")
+    def test_get_returns_200_with_valid_contract(self) -> None:
+        status, headers, body = self._invoke("GET")
         self.assertEqual(status, 200)
-        self.assertEqual(hdrs.get("Content-Type"), "application/json")
-        self.assertEqual(hdrs.get("Access-Control-Allow-Origin"), "*")
-        self.assertIn("max-age=30", hdrs.get("Cache-Control", ""))
+        self.assertEqual(headers.get("Content-Type"), "application/json")
+        self.assertEqual(headers.get("Access-Control-Allow-Origin"), "*")
+        self.assertIn("max-age=30", headers.get("Cache-Control", ""))
         payload = json.loads(body)
         self.assertEqual(payload["schema_version"], 1)
-        self.assertEqual(payload["status"], "degraded")
+        self.assertEqual(payload["mode"], "live")
 
     def test_options_returns_204(self) -> None:
-        status, hdrs, _ = self._invoke("OPTIONS")
+        status, headers, _ = self._invoke("OPTIONS")
         self.assertEqual(status, 204)
-        self.assertEqual(hdrs.get("Access-Control-Allow-Origin"), "*")
-        self.assertEqual(hdrs.get("Access-Control-Allow-Methods"), "GET, OPTIONS")
+        self.assertEqual(headers.get("Access-Control-Allow-Methods"), "GET, OPTIONS")
 
 
 if __name__ == "__main__":