IgnazioDS · IgnazioDS · May 26, 2026
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -0,0 +1,77 @@
+name: Benchmark
+
+# The benchmark is deterministic and reproducible. This workflow re-verifies it
+# weekly (and on demand), refreshes the published artifact, and commits the
+# result back to the repo, where /api/benchmark-latest and /api/stats serve it.
+# It is reproducibility verification, not synthetic daily activity.
+
+# Triggers: a weekly schedule plus on-demand manual dispatch.
+on:
+  schedule:
+    - cron: "0 6 * * 1" # Mondays 06:00 UTC
+  workflow_dispatch:
+
+# Write access to repository contents is needed because the job commits and
+# pushes the refreshed benchmark files in the "Commit results if changed" step.
+permissions:
+  contents: write
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    steps:
+      # Fetch the repository so the benchmark can run against it and its
+      # outputs can be committed back in a later step.
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML keeps the version a string rather than a float.
+          python-version: "3.12"
+
+      # Editable install of the project in the working directory.
+      # NOTE(review): presumably this is what makes the
+      # `evalops_workbench` module importable for the next step; confirm.
+      - name: Install package
+        run: pip install -e .
+
+      # NOTE(review): the runner is expected to rewrite
+      # api/_benchmark_latest.json, since the validation step rejects an
+      # artifact older than 600s; confirm against the runner itself.
+      - name: Run benchmark
+        run: python -m evalops_workbench.benchmark_runner
+
+      - name: Validate published artifact
+        run: |
+          python - <<'PY'
+          # Sanity-check the published artifact: identity fields, required
+          # sections, and freshness (generated less than 600 seconds ago).
+          import json
+          from datetime import datetime, timezone
+
+          artifact = json.load(open("api/_benchmark_latest.json"))
+
+          # (key, required value, failure message) for exact-match fields.
+          exact_fields = (
+              ("system", "evalops", "wrong system"),
+              ("schema_version", 1, "schema_version must be 1"),
+              ("benchmark_type", "eval", "wrong benchmark_type"),
+          )
+          for key, required, message in exact_fields:
+              assert artifact[key] == required, message
+
+          # (key, failure message) for fields that only need to be non-empty.
+          present_fields = (
+              ("metrics", "metrics missing"),
+              ("generated_at", "generated_at missing"),
+          )
+          for key, message in present_fields:
+              assert artifact[key], message
+
+          # generated_at is parsed as UTC; the format requires a literal "Z".
+          stamp = datetime.strptime(artifact["generated_at"], "%Y-%m-%dT%H:%M:%SZ")
+          generated = stamp.replace(tzinfo=timezone.utc)
+          age = (datetime.now(timezone.utc) - generated).total_seconds()
+          assert age < 600, f"artifact is stale ({age:.0f}s old)"
+          print(f"artifact valid; {artifact['metrics']['n_cases']} cases; age {age:.0f}s")
+          PY
+
+      - name: Commit results if changed
+        run: |
+          # Commit under the bot identity; only benchmark outputs are staged.
+          git config user.name "eleventh-bot"
+          git config user.email "noreply@eleventh.dev"
+          git add api/_benchmark_latest.json api/_benchmark_history.json \
+                  examples/benchmark-v1/results examples/benchmark-v1/pinned-baseline.json
+          # `git diff --cached --quiet` exits non-zero when the index differs
+          # from HEAD, i.e. when there is something to commit.
+          if ! git diff --cached --quiet; then
+            git commit -m "chore(benchmark): scheduled run [skip ci]"
+            git push
+          else
+            echo "No benchmark changes to commit."
+          fi
+
+      - name: Live endpoint check (soft)
+        continue-on-error: true
+        run: |
+          # Informational probe only: a failed request prints a note and the
+          # loop carries on.
+          # NOTE(review): the pause presumably gives a redeploy time to start.
+          sleep 20
+          api_base="https://evalops-workbench.eleventh.dev/api"
+          for path in stats benchmark-latest; do
+            url="$api_base/$path"
+            echo "GET $url"
+            # Print a few identifying fields; if the response cannot be
+            # fetched or parsed as JSON, print the "not reachable" note.
+            if ! curl -s --max-time 30 -A "Mozilla/5.0 ci" "$url" \
+              | python -c "import sys, json; d = json.load(sys.stdin); print('  ', {k: d.get(k) for k in ('system', 'mode', 'status', 'schema_version', 'benchmark_type')})"; then
+              echo "  (endpoint not reachable yet; redeploy may be in flight)"
+            fi
+          done
diff --git a/api/_benchmark_history.json b/api/_benchmark_history.json
@@ -0,0 +1,30 @@
+[
+  {
+    "run_id": "evalops-2026-05-26-308b0b20",
+    "generated_at": "2026-05-26T06:49:52Z",
+    "pass_rate": 0.631579,
+    "token_f1": 0.651316,
+    "exact_match": 0.631579,
+    "regressions": 12,
+    "regressed_ids": [
+      "adv-01",
+      "adv-02",
+      "adv-03",
+      "adv-04",
+      "adv-05",
+      "adv-06",
+      "adv-07",
+      "adv-08",
+      "adv-09",
+      "adv-10",
+      "adv-11",
+      "adv-12"
+    ],
+    "gate_verdict": "pass",
+    "variants": [
+      "first_sentence",
+      "overlap_sentence",
+      "span_extract"
+    ]
+  }
+]
diff --git a/api/_benchmark_latest.json b/api/_benchmark_latest.json
@@ -0,0 +1,202 @@
+{
+  "system": "evalops",
+  "benchmark_type": "eval",
+  "run_id": "evalops-2026-05-26-308b0b20",
+  "fixture": "benchmark-v1",
+  "metrics": {
+    "n_cases": 38,
+    "baseline_variant": "overlap_sentence",
+    "candidate_variant": "span_extract",
+    "exact_match": 0.631579,
+    "token_f1": 0.651316,
+    "contains_gold": 0.684211,
+    "improvement_exact_match": 0.631579,
+    "improvement_token_f1": 0.377036,
+    "regressions": 12,
+    "pass_rate": 0.631579,
+    "gate_verdict": "pass"
+  },
+  "variants": [
+    {
+      "name": "first_sentence",
+      "exact_match": 0.0,
+      "token_f1": 0.27428,
+      "contains_gold": 1.0
+    },
+    {
+      "name": "overlap_sentence",
+      "exact_match": 0.0,
+      "token_f1": 0.27428,
+      "contains_gold": 1.0
+    },
+    {
+      "name": "span_extract",
+      "exact_match": 0.631579,
+      "token_f1": 0.651316,
+      "contains_gold": 0.684211
+    }
+  ],
+  "regressions": [
+    {
+      "case_id": "adv-01",
+      "metric": "token_f1",
+      "baseline": 0.153846,
+      "candidate": 0.0,
+      "delta": -0.153846,
+      "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-02",
+      "metric": "token_f1",
+      "baseline": 0.333333,
+      "candidate": 0.0,
+      "delta": -0.333333,
+      "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-03",
+      "metric": "token_f1",
+      "baseline": 0.307692,
+      "candidate": 0.0,
+      "delta": -0.307692,
+      "reason": "token_f1: candidate 0.00 below baseline 0.31 (-0.31)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-04",
+      "metric": "token_f1",
+      "baseline": 0.333333,
+      "candidate": 0.0,
+      "delta": -0.333333,
+      "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)",
+      "tags": [
+        "adversarial",
+        "who"
+      ]
+    },
+    {
+      "case_id": "adv-05",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-06",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-07",
+      "metric": "token_f1",
+      "baseline": 0.153846,
+      "candidate": 0.0,
+      "delta": -0.153846,
+      "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)",
+      "tags": [
+        "adversarial",
+        "where"
+      ]
+    },
+    {
+      "case_id": "adv-08",
+      "metric": "token_f1",
+      "baseline": 0.166667,
+      "candidate": 0.0,
+      "delta": -0.166667,
+      "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-09",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-10",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "when"
+      ]
+    },
+    {
+      "case_id": "adv-11",
+      "metric": "token_f1",
+      "baseline": 0.181818,
+      "candidate": 0.0,
+      "delta": -0.181818,
+      "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
+      "tags": [
+        "adversarial",
+        "how-many"
+      ]
+    },
+    {
+      "case_id": "adv-12",
+      "metric": "token_f1",
+      "baseline": 0.2,
+      "candidate": 0.0,
+      "delta": -0.2,
+      "reason": "token_f1: candidate 0.00 below baseline 0.20 (-0.20)",
+      "tags": [
+        "adversarial",
+        "how-many"
+      ]
+    }
+  ],
+  "gate": {
+    "passed": true,
+    "metric": "token_f1",
+    "pinned": null,
+    "observed": 0.651316,
+    "reasons": [
+      "no pinned baseline yet; this run establishes the contract"
+    ]
+  },
+  "artifact_urls": {
+    "report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/latest-report.md",
+    "fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/cases.jsonl",
+    "run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json"
+  },
+  "schema_version": 1,
+  "generated_at": "2026-05-26T06:49:52Z",
+  "previous_run": null
+}
diff --git a/api/benchmark-latest.py b/api/benchmark-latest.py
@@ -0,0 +1,76 @@
+"""Public benchmark endpoint: the latest published evaluation run.
+
+Stdlib-only Vercel Python serverless function. Serves the committed artifact at
+``api/_benchmark_latest.json`` (written by ``evalops_workbench.benchmark_runner``
+and refreshed by the weekly benchmark workflow). The artifact already conforms to the
+benchmark-latest specification in TELEMETRY_SCHEMA.md, so this endpoint reads and
+returns it directly. The contract forbids HTTP 5xx; a missing artifact yields a
+valid ``status: "pending"`` envelope.
+"""
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from http.server import BaseHTTPRequestHandler
+from pathlib import Path
+from typing import Any
+
+SYSTEM_SLUG = "evalops"
+BENCHMARK_TYPE = "eval"
+SCHEMA_VERSION = 1
+ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json"
+
+
+def _now_iso() -> str:
+    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
+    current = datetime.now(tz=timezone.utc)
+    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def _pending_payload() -> dict[str, Any]:
+    """Build the placeholder envelope served when no artifact can be returned.
+
+    The envelope carries the system identity and schema version, marks the
+    status as ``"pending"``, leaves ``run_id`` and ``metrics`` null, and stamps
+    ``generated_at`` with the time of the call.
+    """
+    envelope: dict[str, Any] = {}
+    # Keys are inserted in the same order the JSON body is serialized in.
+    envelope["system"] = SYSTEM_SLUG
+    envelope["benchmark_type"] = BENCHMARK_TYPE
+    envelope["status"] = "pending"
+    envelope["run_id"] = None
+    envelope["metrics"] = None
+    envelope["schema_version"] = SCHEMA_VERSION
+    envelope["generated_at"] = _now_iso()
+    return envelope
+
+
+def build_response() -> dict[str, Any]:
+    """Return the parsed benchmark artifact, or the pending envelope on failure.
+
+    Any problem reading or decoding ``ARTIFACT_FILE`` (missing file, other I/O
+    error, invalid UTF-8, malformed JSON) falls back to ``_pending_payload()``.
+    """
+    try:
+        raw = ARTIFACT_FILE.read_text(encoding="utf-8")
+        parsed = json.loads(raw)
+    except (FileNotFoundError, json.JSONDecodeError, OSError, ValueError):
+        return _pending_payload()
+    return parsed
+
+
+class handler(BaseHTTPRequestHandler):
+    """Vercel Python serverless entrypoint.
+
+    Answers GET and HEAD with HTTP 200 and the benchmark JSON (HEAD omits the
+    body), and OPTIONS with an empty 204. HEAD is handled explicitly because
+    ``BaseHTTPRequestHandler`` replies 501 to any method that has no
+    ``do_<METHOD>`` handler, which would violate the no-5xx contract.
+    """
+
+    def _write_common_headers(self) -> None:
+        """Emit the caching and CORS headers shared by every response."""
+        self.send_header("Cache-Control", "public, max-age=30, stale-while-revalidate=60")
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.send_header("Access-Control-Allow-Methods", "GET, HEAD, OPTIONS")
+        self.send_header("Access-Control-Allow-Headers", "Content-Type")
+
+    def _send_json(self, *, include_body: bool) -> None:
+        """Send the 200 JSON response; skip the body when ``include_body`` is false."""
+        try:
+            payload = build_response()
+        except Exception:  # noqa: BLE001 (last resort: contract forbids 5xx)
+            payload = _pending_payload()
+        body = json.dumps(payload, separators=(",", ":")).encode("utf-8")
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self._write_common_headers()
+        # Content-Length describes the GET body even when HEAD omits it.
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        if include_body:
+            self.wfile.write(body)
+
+    def do_OPTIONS(self) -> None:  # noqa: N802 (interface contract)
+        """Answer with 204 and the shared headers, no body."""
+        self.send_response(204)
+        self._write_common_headers()
+        self.end_headers()
+
+    def do_GET(self) -> None:  # noqa: N802 (interface contract)
+        """Serve the benchmark payload as compact JSON."""
+        self._send_json(include_body=True)
+
+    def do_HEAD(self) -> None:  # noqa: N802 (interface contract)
+        """Serve the same status and headers as GET, without the body."""
+        self._send_json(include_body=False)
+
+    def log_message(self, fmt: str, *args: Any) -> None:  # noqa: A002, ARG002
+        """Discard the base class's per-request log output."""
+        return