Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
name: Benchmark

# The benchmark is deterministic and reproducible. This workflow re-verifies it
# weekly (and on demand), refreshes the published artifact, and commits the
# result back to the repo, where /api/benchmark-latest and /api/stats serve it.
# It is reproducibility verification, not synthetic daily activity.

# Triggers: a weekly schedule plus manual, on-demand runs.
on:
schedule:
- cron: "0 6 * * 1" # Mondays 06:00 UTC
# Allows the workflow to be started by hand.
workflow_dispatch:

# Write access to repository contents is needed because the commit step
# further down pushes the refreshed benchmark files back to the repo.
permissions:
contents: write

jobs:
# Single job: install the package, run the benchmark, validate, commit.
benchmark:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

# Interpreter version is pinned to 3.12 for every run.
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"

# Editable install of the checked-out repo; presumably this is what makes
# the evalops_workbench module importable for the next step.
- name: Install package
run: pip install -e .

# Runs the benchmark runner module; the following validation step expects
# it to have written api/_benchmark_latest.json.
- name: Run benchmark
run: python -m evalops_workbench.benchmark_runner

- name: Validate published artifact
run: |
python - <<'PY'
import json
from datetime import datetime, timezone

# Load the artifact written by the benchmark step. The context manager closes
# the file handle deterministically and the explicit encoding keeps decoding
# independent of the runner's locale.
with open("api/_benchmark_latest.json", encoding="utf-8") as artifact_file:
    artifact = json.load(artifact_file)

# Identity and schema checks: each failure carries its own message so the CI
# log names the violated field.
assert artifact["system"] == "evalops", "wrong system"
assert artifact["schema_version"] == 1, "schema_version must be 1"
assert artifact["benchmark_type"] == "eval", "wrong benchmark_type"
assert artifact["metrics"], "metrics missing"
assert artifact["generated_at"], "generated_at missing"
# The summary line below reads n_cases; check it here so a missing key fails
# with a clear message instead of a bare KeyError after validation "passed".
assert "n_cases" in artifact["metrics"], "metrics.n_cases missing"

# Freshness check: the timestamp must parse as UTC and be under ten minutes
# old, i.e. produced by this workflow run rather than left over from before.
generated = datetime.strptime(
    artifact["generated_at"], "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=timezone.utc)
age = (datetime.now(timezone.utc) - generated).total_seconds()
assert age < 600, f"artifact is stale ({age:.0f}s old)"
print(f"artifact valid; {artifact['metrics']['n_cases']} cases; age {age:.0f}s")
PY

# Stages only the benchmark outputs and pushes a bot-authored commit when the
# staged content differs from HEAD; otherwise it logs that nothing changed.
# The "[skip ci]" marker in the commit message is GitHub's convention for
# skipping push-triggered workflows on that commit.
- name: Commit results if changed
run: |
git config user.name "eleventh-bot"
git config user.email "noreply@eleventh.dev"
git add api/_benchmark_latest.json api/_benchmark_history.json \
examples/benchmark-v1/results examples/benchmark-v1/pinned-baseline.json
if git diff --cached --quiet; then
echo "No benchmark changes to commit."
else
git commit -m "chore(benchmark): scheduled run [skip ci]"
git push
fi

# Best-effort smoke test of the deployed /api/stats and /api/benchmark-latest
# endpoints: each response is parsed as JSON and a handful of envelope fields
# are printed. A fetch or parse failure only prints a notice for that URL.
- name: Live endpoint check (soft)
# A failure of this step must not fail the job.
continue-on-error: true
# The initial sleep presumably gives the redeploy triggered by the push time
# to start before the endpoints are queried.
run: |
sleep 20
for path in stats benchmark-latest; do
url="https://evalops-workbench.eleventh.dev/api/$path"
echo "GET $url"
curl -s --max-time 30 -A "Mozilla/5.0 ci" "$url" \
| python -c "import sys, json; d = json.load(sys.stdin); print(' ', {k: d.get(k) for k in ('system', 'mode', 'status', 'schema_version', 'benchmark_type')})" \
|| echo " (endpoint not reachable yet; redeploy may be in flight)"
done
30 changes: 30 additions & 0 deletions api/_benchmark_history.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[
{
"run_id": "evalops-2026-05-26-308b0b20",
"generated_at": "2026-05-26T06:49:52Z",
"pass_rate": 0.631579,
"token_f1": 0.651316,
"exact_match": 0.631579,
"regressions": 12,
"regressed_ids": [
"adv-01",
"adv-02",
"adv-03",
"adv-04",
"adv-05",
"adv-06",
"adv-07",
"adv-08",
"adv-09",
"adv-10",
"adv-11",
"adv-12"
],
"gate_verdict": "pass",
"variants": [
"first_sentence",
"overlap_sentence",
"span_extract"
]
}
]
202 changes: 202 additions & 0 deletions api/_benchmark_latest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
{
"system": "evalops",
"benchmark_type": "eval",
"run_id": "evalops-2026-05-26-308b0b20",
"fixture": "benchmark-v1",
"metrics": {
"n_cases": 38,
"baseline_variant": "overlap_sentence",
"candidate_variant": "span_extract",
"exact_match": 0.631579,
"token_f1": 0.651316,
"contains_gold": 0.684211,
"improvement_exact_match": 0.631579,
"improvement_token_f1": 0.377036,
"regressions": 12,
"pass_rate": 0.631579,
"gate_verdict": "pass"
},
"variants": [
{
"name": "first_sentence",
"exact_match": 0.0,
"token_f1": 0.27428,
"contains_gold": 1.0
},
{
"name": "overlap_sentence",
"exact_match": 0.0,
"token_f1": 0.27428,
"contains_gold": 1.0
},
{
"name": "span_extract",
"exact_match": 0.631579,
"token_f1": 0.651316,
"contains_gold": 0.684211
}
],
"regressions": [
{
"case_id": "adv-01",
"metric": "token_f1",
"baseline": 0.153846,
"candidate": 0.0,
"delta": -0.153846,
"reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)",
"tags": [
"adversarial",
"who"
]
},
{
"case_id": "adv-02",
"metric": "token_f1",
"baseline": 0.333333,
"candidate": 0.0,
"delta": -0.333333,
"reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)",
"tags": [
"adversarial",
"who"
]
},
{
"case_id": "adv-03",
"metric": "token_f1",
"baseline": 0.307692,
"candidate": 0.0,
"delta": -0.307692,
"reason": "token_f1: candidate 0.00 below baseline 0.31 (-0.31)",
"tags": [
"adversarial",
"who"
]
},
{
"case_id": "adv-04",
"metric": "token_f1",
"baseline": 0.333333,
"candidate": 0.0,
"delta": -0.333333,
"reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)",
"tags": [
"adversarial",
"who"
]
},
{
"case_id": "adv-05",
"metric": "token_f1",
"baseline": 0.166667,
"candidate": 0.0,
"delta": -0.166667,
"reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
"tags": [
"adversarial",
"where"
]
},
{
"case_id": "adv-06",
"metric": "token_f1",
"baseline": 0.166667,
"candidate": 0.0,
"delta": -0.166667,
"reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
"tags": [
"adversarial",
"where"
]
},
{
"case_id": "adv-07",
"metric": "token_f1",
"baseline": 0.153846,
"candidate": 0.0,
"delta": -0.153846,
"reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)",
"tags": [
"adversarial",
"where"
]
},
{
"case_id": "adv-08",
"metric": "token_f1",
"baseline": 0.166667,
"candidate": 0.0,
"delta": -0.166667,
"reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)",
"tags": [
"adversarial",
"when"
]
},
{
"case_id": "adv-09",
"metric": "token_f1",
"baseline": 0.181818,
"candidate": 0.0,
"delta": -0.181818,
"reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
"tags": [
"adversarial",
"when"
]
},
{
"case_id": "adv-10",
"metric": "token_f1",
"baseline": 0.181818,
"candidate": 0.0,
"delta": -0.181818,
"reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
"tags": [
"adversarial",
"when"
]
},
{
"case_id": "adv-11",
"metric": "token_f1",
"baseline": 0.181818,
"candidate": 0.0,
"delta": -0.181818,
"reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)",
"tags": [
"adversarial",
"how-many"
]
},
{
"case_id": "adv-12",
"metric": "token_f1",
"baseline": 0.2,
"candidate": 0.0,
"delta": -0.2,
"reason": "token_f1: candidate 0.00 below baseline 0.20 (-0.20)",
"tags": [
"adversarial",
"how-many"
]
}
],
"gate": {
"passed": true,
"metric": "token_f1",
"pinned": null,
"observed": 0.651316,
"reasons": [
"no pinned baseline yet; this run establishes the contract"
]
},
"artifact_urls": {
"report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/latest-report.md",
"fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/cases.jsonl",
"run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json"
},
"schema_version": 1,
"generated_at": "2026-05-26T06:49:52Z",
"previous_run": null
}
76 changes: 76 additions & 0 deletions api/benchmark-latest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""Public benchmark endpoint: the latest published evaluation run.

Stdlib-only Vercel Python serverless function. Serves the committed artifact at
``api/_benchmark_latest.json`` (written by ``evalops_workbench.benchmark_runner``
and refreshed by the weekly scheduled workflow). The artifact already conforms to the
benchmark-latest specification in TELEMETRY_SCHEMA.md, so this endpoint reads and
returns it directly. The contract forbids HTTP 5xx; a missing artifact yields a
valid ``status: "pending"`` envelope.
"""
from __future__ import annotations

import json
from datetime import datetime, timezone
from http.server import BaseHTTPRequestHandler
from pathlib import Path
from typing import Any

# Value reported in the "system" field of the pending envelope.
SYSTEM_SLUG = "evalops"
# Value reported in the "benchmark_type" field of the pending envelope.
BENCHMARK_TYPE = "eval"
# Value reported in the "schema_version" field of the pending envelope.
SCHEMA_VERSION = 1
# Committed benchmark artifact, resolved next to this module on disk.
ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json"


def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current = datetime.now(tz=timezone.utc)
    return f"{current:%Y-%m-%dT%H:%M:%SZ}"


def _pending_payload() -> dict[str, Any]:
    """Build the placeholder envelope served while no run has been published.

    Returns:
        A dict with ``status`` set to ``"pending"``, ``run_id`` and ``metrics``
        set to ``None``, the module's system/benchmark-type/schema constants,
        and ``generated_at`` set to the current UTC timestamp.
    """
    envelope: dict[str, Any] = {
        "system": SYSTEM_SLUG,
        "benchmark_type": BENCHMARK_TYPE,
        "status": "pending",
    }
    # No run exists yet, so the run-specific fields are explicit nulls.
    envelope["run_id"] = None
    envelope["metrics"] = None
    envelope["schema_version"] = SCHEMA_VERSION
    envelope["generated_at"] = _now_iso()
    return envelope


def build_response() -> dict[str, Any]:
    """Return the published benchmark artifact, or a pending envelope.

    Reads ``ARTIFACT_FILE`` as UTF-8 JSON. If the file cannot be read or
    decoded, or if it parses to something other than a JSON object, the
    pending envelope is returned instead, so the result is always a dict.

    Returns:
        The parsed artifact dict, or the result of ``_pending_payload()``.
    """
    try:
        artifact = json.loads(ARTIFACT_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file (FileNotFoundError is a
        # subclass); ValueError covers json.JSONDecodeError and
        # UnicodeDecodeError, which are both subclasses.
        return _pending_payload()
    if not isinstance(artifact, dict):
        # Valid JSON that is not an object (e.g. ``[]`` or ``null``) is not a
        # usable envelope; treat it like a missing artifact.
        return _pending_payload()
    return artifact


class handler(BaseHTTPRequestHandler):
    """Vercel Python serverless entrypoint.

    ``GET`` answers with the benchmark document as compact JSON and
    ``OPTIONS`` answers with an empty ``204``; both carry the same caching
    and CORS headers.
    """

    def _write_common_headers(self) -> None:
        """Queue the cache-control and CORS headers shared by every reply."""
        shared_headers = (
            ("Cache-Control", "public, max-age=30, stale-while-revalidate=60"),
            ("Access-Control-Allow-Origin", "*"),
            ("Access-Control-Allow-Methods", "GET, OPTIONS"),
            ("Access-Control-Allow-Headers", "Content-Type"),
        )
        for field_name, field_value in shared_headers:
            self.send_header(field_name, field_value)

    def do_OPTIONS(self) -> None:  # noqa: N802 (interface contract)
        """Reply to an ``OPTIONS`` request with ``204`` and no body."""
        self.send_response(204)
        self._write_common_headers()
        self.end_headers()

    def do_GET(self) -> None:  # noqa: N802 (interface contract)
        """Send the benchmark document as a ``200`` JSON response.

        Any exception raised while building the document is swallowed and
        the pending envelope is sent instead, so the status is always 200.
        """
        try:
            document = build_response()
        except Exception:  # noqa: BLE001 (last resort: contract forbids 5xx)
            document = _pending_payload()
        encoded = json.dumps(document, separators=(",", ":")).encode("utf-8")

        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self._write_common_headers()
        # Content-Length is the byte length of the encoded body.
        self.send_header("Content-Length", str(len(encoded)))
        self.end_headers()
        self.wfile.write(encoded)

    def log_message(self, fmt: str, *args: Any) -> None:  # noqa: A002, ARG002
        """Suppress request logging by doing nothing."""
        return None
Loading