IgnazioDS · IgnazioDS · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -0,0 +1,93 @@
+name: Benchmark
+
+# The benchmark is deterministic and reproducible. This workflow re-verifies it
+# weekly (and on demand), refreshes the published artifact, and commits the
+# result back to the repo, where /api/benchmark-latest and /api/stats serve it.
+# It is reproducibility verification, not synthetic daily activity.
+
+on:
+  schedule:
+    - cron: "0 6 * * 1"  # Mondays 06:00 UTC
+  workflow_dispatch:
+
+# The commit step pushes results back, so the token needs write access.
+permissions:
+  contents: write
+
+# Serialize runs per ref: a manual dispatch overlapping the scheduled run would
+# otherwise race on `git push`. Queue the newer run rather than cancel the older.
+concurrency:
+  group: benchmark-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    # Bound a hung run instead of letting it hold a runner for the 6-hour default.
+    timeout-minutes: 30
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install package
+        run: pip install -e .
+
+      - name: Run benchmark
+        run: python -m evalops_workbench.benchmark_runner
+
+      # Fail the run before committing if the artifact is malformed or was not
+      # regenerated by the step above (older than 10 minutes).
+      - name: Validate published artifact
+        run: |
+          python - <<'PY'
+          import json
+          from datetime import datetime, timezone
+
+          with open("api/_benchmark_latest.json", encoding="utf-8") as fh:
+              artifact = json.load(fh)
+          # .get() so that a missing key fails with its assertion message rather
+          # than an unexplained KeyError.
+          assert artifact.get("system") == "evalops", "wrong system"
+          assert artifact.get("schema_version") == 1, "schema_version must be 1"
+          assert artifact.get("benchmark_type") == "eval", "wrong benchmark_type"
+          assert artifact.get("metrics"), "metrics missing"
+          assert artifact.get("generated_at"), "generated_at missing"
+          generated = datetime.strptime(
+              artifact["generated_at"], "%Y-%m-%dT%H:%M:%SZ"
+          ).replace(tzinfo=timezone.utc)
+          age = (datetime.now(timezone.utc) - generated).total_seconds()
+          assert age < 600, f"artifact is stale ({age:.0f}s old)"
+          print(f"artifact valid; {artifact['metrics']['n_cases']} cases; age {age:.0f}s")
+          PY
+
+      - name: Commit results if changed
+        run: |
+          git config user.name "eleventh-bot"
+          git config user.email "noreply@eleventh.dev"
+          git add api/_benchmark_latest.json api/_benchmark_history.json \
+                  examples/benchmark-v1/results examples/benchmark-v1/pinned-baseline.json
+          if git diff --cached --quiet; then
+            echo "No benchmark changes to commit."
+          else
+            git commit -m "chore(benchmark): scheduled run [skip ci]"
+            git push
+          fi
+
+      # Informational only: continue-on-error keeps an unreachable or not-yet-
+      # redeployed site from failing the run.
+      - name: Live endpoint check (soft)
+        continue-on-error: true
+        run: |
+          sleep 20
+          for path in stats benchmark-latest; do
+            url="https://evalops-workbench.eleventh.dev/api/$path"
+            echo "GET $url"
+            curl -sS --max-time 30 -A "Mozilla/5.0 ci" "$url" \
+              | python -c "import sys, json; d = json.load(sys.stdin); print('  ', {k: d.get(k) for k in ('system', 'mode', 'status', 'schema_version', 'benchmark_type')})" \
+              || echo "  (endpoint not reachable yet; redeploy may be in flight)"
+          done
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
@@ -22,7 +22,6 @@ jobs:
         uses: actions/setup-node@v4
         with:
           node-version: "20"
-          cache: "npm"
 
       - name: Install Python package
         run: pip install -e .
@@ -31,7 +30,7 @@ jobs:
         run: python -m unittest discover -s tests -p 'test_*.py'
 
       - name: Install frontend dependencies
-        run: npm ci
+        run: npm install --no-audit --no-fund
 
       - name: Type-check dashboard
         run: npm run type-check

diff --git a/api/_benchmark_history.json b/api/_benchmark_history.json
@@ -0,0 +1,15 @@
+[
+  {
+    "run_id": "evalops-2026-05-26-d34c4f66",
+    "generated_at": "2026-05-26T07:25:34Z",
+    "pass_rate": 1.0,
+    "avg_score": 1.0,
+    "regressions": 0,
+    "regressed_ids": [],
+    "gate_verdict": "pass",
+    "variants": [
+      "prompt_v1",
+      "prompt_v2"
+    ]
+  }
+]
diff --git a/api/_benchmark_latest.json b/api/_benchmark_latest.json
@@ -0,0 +1,52 @@
+{
+  "system": "evalops",
+  "benchmark_type": "eval",
+  "run_id": "evalops-2026-05-26-d34c4f66",
+  "fixture": "support-qa",
+  "metrics": {
+    "n_cases": 4,
+    "baseline_variant": "prompt_v1",
+    "candidate_variant": "prompt_v2",
+    "baseline_pass_rate": 0.0,
+    "candidate_pass_rate": 1.0,
+    "baseline_avg_score": 0.333,
+    "candidate_avg_score": 1.0,
+    "pass_rate_delta": 1.0,
+    "avg_score_delta": 0.667,
+    "regressions": 0,
+    "improvements": 4,
+    "gate_verdict": "pass"
+  },
+  "variants": [
+    {
+      "name": "prompt_v1",
+      "pass_rate": 0.0,
+      "avg_score": 0.333,
+      "passed_cases": 0,
+      "total_cases": 4
+    },
+    {
+      "name": "prompt_v2",
+      "pass_rate": 1.0,
+      "avg_score": 1.0,
+      "passed_cases": 4,
+      "total_cases": 4
+    }
+  ],
+  "regressions": [],
+  "gate": {
+    "passed": true,
+    "reasons": [],
+    "max_regressions": 0,
+    "max_score_drop": 0.0,
+    "max_pass_rate_drop": 0.0
+  },
+  "artifact_urls": {
+    "report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark/latest-report.md",
+    "fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/support_qa.json",
+    "run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark/archive/evalops-2026-05-26-d34c4f66.json"
+  },
+  "schema_version": 1,
+  "generated_at": "2026-05-26T07:25:34Z",
+  "previous_run": null
+}
diff --git a/api/benchmark-latest.py b/api/benchmark-latest.py
@@ -0,0 +1,104 @@
+"""Public benchmark endpoint: the latest published evaluation run.
+
+Stdlib-only Vercel Python serverless function. Serves the committed artifact at
+``api/_benchmark_latest.json`` (written by ``evalops_workbench.benchmark_runner``
+and refreshed by the weekly cron). The artifact already conforms to the
+benchmark-latest specification in TELEMETRY_SCHEMA.md, so this endpoint reads and
+returns it directly. The contract forbids HTTP 5xx; a missing or unusable
+artifact yields a valid ``status: "pending"`` envelope.
+"""
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from http.server import BaseHTTPRequestHandler
+from pathlib import Path
+from typing import Any
+
+SYSTEM_SLUG = "evalops"
+BENCHMARK_TYPE = "eval"
+SCHEMA_VERSION = 1
+ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json"
+
+
+def _now_iso() -> str:
+    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
+    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def _pending_payload() -> dict[str, Any]:
+    """Honest envelope for the window before the first run is published."""
+    return {
+        "system": SYSTEM_SLUG,
+        "benchmark_type": BENCHMARK_TYPE,
+        "status": "pending",
+        "run_id": None,
+        "metrics": None,
+        "schema_version": SCHEMA_VERSION,
+        "generated_at": _now_iso(),
+    }
+
+
+def build_response() -> dict[str, Any]:
+    """Return the committed artifact, or the pending envelope if it is unusable.
+
+    "Unusable" covers a missing or unreadable file, invalid UTF-8 or JSON, and
+    valid JSON whose top level is not an object.
+    """
+    try:
+        artifact = json.loads(ARTIFACT_FILE.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # OSError covers FileNotFoundError; ValueError covers JSONDecodeError
+        # and UnicodeDecodeError.
+        return _pending_payload()
+    # The declared return type is an object-shaped envelope; ``null``, a list,
+    # or a bare scalar is not one.
+    return artifact if isinstance(artifact, dict) else _pending_payload()
+
+
+class handler(BaseHTTPRequestHandler):
+    """Vercel Python serverless entrypoint."""
+
+    def _write_common_headers(self) -> None:
+        """Emit the caching and CORS headers shared by every response."""
+        self.send_header("Cache-Control", "public, max-age=30, stale-while-revalidate=60")
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.send_header("Access-Control-Allow-Methods", "GET, HEAD, OPTIONS")
+        self.send_header("Access-Control-Allow-Headers", "Content-Type")
+
+    def _send_json(self, *, include_body: bool) -> None:
+        """Send the 200 JSON response; ``include_body=False`` sends headers only."""
+        try:
+            payload = build_response()
+        except Exception:  # noqa: BLE001 (last resort: contract forbids 5xx)
+            payload = _pending_payload()
+        body = json.dumps(payload, separators=(",", ":")).encode("utf-8")
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self._write_common_headers()
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        if include_body:
+            self.wfile.write(body)
+
+    def do_OPTIONS(self) -> None:  # noqa: N802 (interface contract)
+        """Answer a CORS preflight with an empty 204."""
+        self.send_response(204)
+        self._write_common_headers()
+        self.end_headers()
+
+    def do_GET(self) -> None:  # noqa: N802 (interface contract)
+        """Serve the artifact, or the pending envelope, as compact JSON."""
+        self._send_json(include_body=True)
+
+    def do_HEAD(self) -> None:  # noqa: N802 (interface contract)
+        """Send GET's status and headers without a body.
+
+        Without this method ``BaseHTTPRequestHandler`` answers HEAD with 501,
+        which the no-5xx contract forbids.
+        """
+        self._send_json(include_body=False)
+
+    def log_message(self, fmt: str, *args: Any) -> None:  # noqa: A002, ARG002
+        """Silence the base class's per-request logging."""
+        return