From 8c6806a131ede381783c189e40764739d0db7585 Mon Sep 17 00:00:00 2001 From: Ignazio De Santis Date: Tue, 26 May 2026 14:57:01 +0800 Subject: [PATCH] feat(benchmark): public eval benchmark, Tier-A telemetry, /api/benchmark-latest Graduate EvalOps from a showcase shell to a working evaluation harness with a public, reproducible benchmark. Implements the project's published MVP contract. - eval engine (stdlib, src/evalops_workbench/eval/): dataset loader, SQuAD-style scorers (exact match, token F1, contains-gold), three extractive-QA strategies, runner + per-case regression diff, pinned-baseline gate, run ledger - examples/benchmark-v1: 38-case public fixture (CC0 original prose), 26 standard + 12 adversarial. Candidate lifts exact match 0.00 -> 0.63 and token F1 +0.38 while surfacing 12 per-case regressions a regression gate exists to catch - benchmark_runner publishes a committed artifact (api/_benchmark_latest.json + api/_benchmark_history.json), so persistence needs no external store - api/benchmark-latest.py serves the latest run (schema-conformed, previous_run) - api/stats.py flipped to mode:"live" with benchmark-derived metrics (eval_runs, pass rate, distinct regressions caught); honest degraded fallback - .github/workflows/benchmark.yml: weekly + on-demand reproducibility re-run that commits results back and validates artifact freshness - dashboard renders the latest benchmark (variant comparison, deltas, regressions) - project stage Researching -> Prototype All metrics computed from committed runs; nothing simulated or seeded. Tests: 38 unittest + 36 vitest green; next build + tsc clean. 
Refs: outputs/plans/PLAN_C_PROOF_FIRST.md (Phase 2) --- .github/workflows/benchmark.yml | 77 ++++ api/_benchmark_history.json | 30 ++ api/_benchmark_latest.json | 202 ++++++++++ api/benchmark-latest.py | 76 ++++ api/stats.py | 244 +++++------- examples/benchmark-v1/README.md | 63 ++++ examples/benchmark-v1/cases.jsonl | 38 ++ examples/benchmark-v1/pinned-baseline.json | 9 + .../archive/evalops-2026-05-26-308b0b20.json | 202 ++++++++++ .../benchmark-v1/results/latest-report.md | 43 +++ src/app/page.tsx | 353 +++++++++++++----- src/app/telemetry/page.tsx | 93 ++--- src/evalops_workbench/benchmark_runner.py | 106 ++++++ src/evalops_workbench/eval/__init__.py | 12 + src/evalops_workbench/eval/artifact.py | 134 +++++++ src/evalops_workbench/eval/baseline.py | 81 ++++ src/evalops_workbench/eval/cases.py | 94 +++++ src/evalops_workbench/eval/ledger.py | 39 ++ src/evalops_workbench/eval/normalize.py | 34 ++ src/evalops_workbench/eval/report.py | 66 ++++ src/evalops_workbench/eval/runner.py | 95 +++++ src/evalops_workbench/eval/scorers.py | 60 +++ src/evalops_workbench/eval/targets.py | 147 ++++++++ src/evalops_workbench/project.json | 7 +- src/lib/api.ts | 85 ++++- src/lib/project.ts | 4 +- tests/test_benchmark_endpoint.py | 75 ++++ tests/test_eval_engine.py | 272 ++++++++++++++ tests/test_stats.py | 214 ++++------- 29 files changed, 2510 insertions(+), 445 deletions(-) create mode 100644 .github/workflows/benchmark.yml create mode 100644 api/_benchmark_history.json create mode 100644 api/_benchmark_latest.json create mode 100644 api/benchmark-latest.py create mode 100644 examples/benchmark-v1/README.md create mode 100644 examples/benchmark-v1/cases.jsonl create mode 100644 examples/benchmark-v1/pinned-baseline.json create mode 100644 examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json create mode 100644 examples/benchmark-v1/results/latest-report.md create mode 100644 src/evalops_workbench/benchmark_runner.py create mode 100644 
src/evalops_workbench/eval/__init__.py create mode 100644 src/evalops_workbench/eval/artifact.py create mode 100644 src/evalops_workbench/eval/baseline.py create mode 100644 src/evalops_workbench/eval/cases.py create mode 100644 src/evalops_workbench/eval/ledger.py create mode 100644 src/evalops_workbench/eval/normalize.py create mode 100644 src/evalops_workbench/eval/report.py create mode 100644 src/evalops_workbench/eval/runner.py create mode 100644 src/evalops_workbench/eval/scorers.py create mode 100644 src/evalops_workbench/eval/targets.py create mode 100644 tests/test_benchmark_endpoint.py create mode 100644 tests/test_eval_engine.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..9326774 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,77 @@ +name: Benchmark + +# The benchmark is deterministic and reproducible. This workflow re-verifies it +# weekly (and on demand), refreshes the published artifact, and commits the +# result back to the repo, where /api/benchmark-latest and /api/stats serve it. +# It is reproducibility verification, not synthetic daily activity. + +on: + schedule: + - cron: "0 6 * * 1" # Mondays 06:00 UTC + workflow_dispatch: + +permissions: + contents: write + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install package + run: pip install -e . 
+ + - name: Run benchmark + run: python -m evalops_workbench.benchmark_runner + + - name: Validate published artifact + run: | + python - <<'PY' + import json + from datetime import datetime, timezone + + artifact = json.load(open("api/_benchmark_latest.json")) + assert artifact["system"] == "evalops", "wrong system" + assert artifact["schema_version"] == 1, "schema_version must be 1" + assert artifact["benchmark_type"] == "eval", "wrong benchmark_type" + assert artifact["metrics"], "metrics missing" + assert artifact["generated_at"], "generated_at missing" + generated = datetime.strptime( + artifact["generated_at"], "%Y-%m-%dT%H:%M:%SZ" + ).replace(tzinfo=timezone.utc) + age = (datetime.now(timezone.utc) - generated).total_seconds() + assert age < 600, f"artifact is stale ({age:.0f}s old)" + print(f"artifact valid; {artifact['metrics']['n_cases']} cases; age {age:.0f}s") + PY + + - name: Commit results if changed + run: | + git config user.name "eleventh-bot" + git config user.email "noreply@eleventh.dev" + git add api/_benchmark_latest.json api/_benchmark_history.json \ + examples/benchmark-v1/results examples/benchmark-v1/pinned-baseline.json + if git diff --cached --quiet; then + echo "No benchmark changes to commit." 
+ else + git commit -m "chore(benchmark): scheduled run [skip ci]" + git push + fi + + - name: Live endpoint check (soft) + continue-on-error: true + run: | + sleep 20 + for path in stats benchmark-latest; do + url="https://evalops-workbench.eleventh.dev/api/$path" + echo "GET $url" + curl -s --max-time 30 -A "Mozilla/5.0 ci" "$url" \ + | python -c "import sys, json; d = json.load(sys.stdin); print(' ', {k: d.get(k) for k in ('system', 'mode', 'status', 'schema_version', 'benchmark_type')})" \ + || echo " (endpoint not reachable yet; redeploy may be in flight)" + done diff --git a/api/_benchmark_history.json b/api/_benchmark_history.json new file mode 100644 index 0000000..c08a83a --- /dev/null +++ b/api/_benchmark_history.json @@ -0,0 +1,30 @@ +[ + { + "run_id": "evalops-2026-05-26-308b0b20", + "generated_at": "2026-05-26T06:49:52Z", + "pass_rate": 0.631579, + "token_f1": 0.651316, + "exact_match": 0.631579, + "regressions": 12, + "regressed_ids": [ + "adv-01", + "adv-02", + "adv-03", + "adv-04", + "adv-05", + "adv-06", + "adv-07", + "adv-08", + "adv-09", + "adv-10", + "adv-11", + "adv-12" + ], + "gate_verdict": "pass", + "variants": [ + "first_sentence", + "overlap_sentence", + "span_extract" + ] + } +] diff --git a/api/_benchmark_latest.json b/api/_benchmark_latest.json new file mode 100644 index 0000000..59472d5 --- /dev/null +++ b/api/_benchmark_latest.json @@ -0,0 +1,202 @@ +{ + "system": "evalops", + "benchmark_type": "eval", + "run_id": "evalops-2026-05-26-308b0b20", + "fixture": "benchmark-v1", + "metrics": { + "n_cases": 38, + "baseline_variant": "overlap_sentence", + "candidate_variant": "span_extract", + "exact_match": 0.631579, + "token_f1": 0.651316, + "contains_gold": 0.684211, + "improvement_exact_match": 0.631579, + "improvement_token_f1": 0.377036, + "regressions": 12, + "pass_rate": 0.631579, + "gate_verdict": "pass" + }, + "variants": [ + { + "name": "first_sentence", + "exact_match": 0.0, + "token_f1": 0.27428, + "contains_gold": 1.0 + }, + { + 
"name": "overlap_sentence", + "exact_match": 0.0, + "token_f1": 0.27428, + "contains_gold": 1.0 + }, + { + "name": "span_extract", + "exact_match": 0.631579, + "token_f1": 0.651316, + "contains_gold": 0.684211 + } + ], + "regressions": [ + { + "case_id": "adv-01", + "metric": "token_f1", + "baseline": 0.153846, + "candidate": 0.0, + "delta": -0.153846, + "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)", + "tags": [ + "adversarial", + "who" + ] + }, + { + "case_id": "adv-02", + "metric": "token_f1", + "baseline": 0.333333, + "candidate": 0.0, + "delta": -0.333333, + "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)", + "tags": [ + "adversarial", + "who" + ] + }, + { + "case_id": "adv-03", + "metric": "token_f1", + "baseline": 0.307692, + "candidate": 0.0, + "delta": -0.307692, + "reason": "token_f1: candidate 0.00 below baseline 0.31 (-0.31)", + "tags": [ + "adversarial", + "who" + ] + }, + { + "case_id": "adv-04", + "metric": "token_f1", + "baseline": 0.333333, + "candidate": 0.0, + "delta": -0.333333, + "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)", + "tags": [ + "adversarial", + "who" + ] + }, + { + "case_id": "adv-05", + "metric": "token_f1", + "baseline": 0.166667, + "candidate": 0.0, + "delta": -0.166667, + "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)", + "tags": [ + "adversarial", + "where" + ] + }, + { + "case_id": "adv-06", + "metric": "token_f1", + "baseline": 0.166667, + "candidate": 0.0, + "delta": -0.166667, + "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)", + "tags": [ + "adversarial", + "where" + ] + }, + { + "case_id": "adv-07", + "metric": "token_f1", + "baseline": 0.153846, + "candidate": 0.0, + "delta": -0.153846, + "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)", + "tags": [ + "adversarial", + "where" + ] + }, + { + "case_id": "adv-08", + "metric": "token_f1", + "baseline": 0.166667, + "candidate": 0.0, + "delta": -0.166667, + "reason": 
"token_f1: candidate 0.00 below baseline 0.17 (-0.17)", + "tags": [ + "adversarial", + "when" + ] + }, + { + "case_id": "adv-09", + "metric": "token_f1", + "baseline": 0.181818, + "candidate": 0.0, + "delta": -0.181818, + "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)", + "tags": [ + "adversarial", + "when" + ] + }, + { + "case_id": "adv-10", + "metric": "token_f1", + "baseline": 0.181818, + "candidate": 0.0, + "delta": -0.181818, + "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)", + "tags": [ + "adversarial", + "when" + ] + }, + { + "case_id": "adv-11", + "metric": "token_f1", + "baseline": 0.181818, + "candidate": 0.0, + "delta": -0.181818, + "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)", + "tags": [ + "adversarial", + "how-many" + ] + }, + { + "case_id": "adv-12", + "metric": "token_f1", + "baseline": 0.2, + "candidate": 0.0, + "delta": -0.2, + "reason": "token_f1: candidate 0.00 below baseline 0.20 (-0.20)", + "tags": [ + "adversarial", + "how-many" + ] + } + ], + "gate": { + "passed": true, + "metric": "token_f1", + "pinned": null, + "observed": 0.651316, + "reasons": [ + "no pinned baseline yet; this run establishes the contract" + ] + }, + "artifact_urls": { + "report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/latest-report.md", + "fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/cases.jsonl", + "run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json" + }, + "schema_version": 1, + "generated_at": "2026-05-26T06:49:52Z", + "previous_run": null +} diff --git a/api/benchmark-latest.py b/api/benchmark-latest.py new file mode 100644 index 0000000..299cee9 --- /dev/null +++ b/api/benchmark-latest.py @@ -0,0 +1,76 @@ +"""Public benchmark endpoint: the latest published evaluation run. 
+ +Stdlib-only Vercel Python serverless function. Serves the committed artifact at +``api/_benchmark_latest.json`` (written by ``evalops_workbench.benchmark_runner`` +and refreshed by the weekly cron). The artifact already conforms to the +benchmark-latest specification in TELEMETRY_SCHEMA.md, so this endpoint reads and +returns it directly. The contract forbids HTTP 5xx; a missing artifact yields a +valid ``status: "pending"`` envelope. +""" +from __future__ import annotations + +import json +from datetime import datetime, timezone +from http.server import BaseHTTPRequestHandler +from pathlib import Path +from typing import Any + +SYSTEM_SLUG = "evalops" +BENCHMARK_TYPE = "eval" +SCHEMA_VERSION = 1 +ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json" + + +def _now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def _pending_payload() -> dict[str, Any]: + """Honest envelope for the window before the first run is published.""" + return { + "system": SYSTEM_SLUG, + "benchmark_type": BENCHMARK_TYPE, + "status": "pending", + "run_id": None, + "metrics": None, + "schema_version": SCHEMA_VERSION, + "generated_at": _now_iso(), + } + + +def build_response() -> dict[str, Any]: + try: + return json.loads(ARTIFACT_FILE.read_text(encoding="utf-8")) + except (FileNotFoundError, json.JSONDecodeError, OSError, ValueError): + return _pending_payload() + + +class handler(BaseHTTPRequestHandler): + """Vercel Python serverless entrypoint.""" + + def _write_common_headers(self) -> None: + self.send_header("Cache-Control", "public, max-age=30, stale-while-revalidate=60") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + + def do_OPTIONS(self) -> None: # noqa: N802 (interface contract) + self.send_response(204) + self._write_common_headers() + self.end_headers() + + def do_GET(self) -> None: # 
noqa: N802 (interface contract) + try: + payload = build_response() + except Exception: # noqa: BLE001 (last resort: contract forbids 5xx) + payload = _pending_payload() + body = json.dumps(payload, separators=(",", ":")).encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "application/json") + self._write_common_headers() + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, fmt: str, *args: Any) -> None: # noqa: A002, ARG002 + return diff --git a/api/stats.py b/api/stats.py index 69b9754..3ad5228 100644 --- a/api/stats.py +++ b/api/stats.py @@ -1,50 +1,44 @@ -"""Public telemetry endpoint for the showcase deploy. +"""Public telemetry endpoint for EvalOps Workbench (Tier A, live workload). -Stdlib-only Vercel Python serverless function. Reports honest GitHub-derived -signals about the codebase, never simulated workload metrics. The Tier B -endpoint is consumed by the Production Telemetry panel on -https://eleventh.dev. See: +Stdlib-only Vercel Python serverless function. The live workload is the public +benchmark: ``evalops_workbench.benchmark_runner`` runs weekly, persists each +result to the repo, and this endpoint reports honest metrics derived from that +durable history. See: https://github.com/IgnazioDS/IgnazioDS/blob/main/TELEMETRY_SCHEMA.md + +Every value is computed from committed run records. Nothing is simulated, +seeded, or incremented in memory. If no run has been published the endpoint +degrades honestly (status="degraded", zeroed metrics) and never returns 5xx. 
""" from __future__ import annotations import json import os -import re -import time from datetime import datetime, timedelta, timezone from http.server import BaseHTTPRequestHandler from pathlib import Path from typing import Any -from urllib.error import HTTPError, URLError -from urllib.request import Request, urlopen -# --- repo identity --- SYSTEM_SLUG = "evalops" -GITHUB_OWNER = "IgnazioDS" -GITHUB_REPO = "evalops-workbench" - -# --- contract constants --- SCHEMA_VERSION = 1 -HTTP_TIMEOUT_S = 4.0 -CACHE_TTL_S = 300 # 5 min, stays well under GitHub's 60-req/hr unauth cap -# --- safety caps: never expose values larger than these --- +ARTIFACT_FILE = Path(__file__).parent / "_benchmark_latest.json" +HISTORY_FILE = Path(__file__).parent / "_benchmark_history.json" +STATIC_FILE = Path(__file__).parent / "_telemetry_static.json" + +# Sanity caps: never expose values larger than these (defence against a runaway +# history file). The benchmark publishes one run per scheduled invocation. SAFETY_CAPS: dict[str, int] = { - "commits_total": 1_000_000, - "commits_30d": 100_000, - "lines_of_code": 10_000_000, - "repo_stars": 1_000_000, + "eval_runs_total": 1_000_000, + "eval_runs_24h": 10_000, + "regressions_caught_30d": 1_000_000, + "experiments_tracked": 100_000, } -GITHUB_API = "https://api.github.com" -USER_AGENT = "eleventh-telemetry/1.0 (+https://eleventh.dev)" -STATIC_FILE = Path(__file__).parent / "_telemetry_static.json" -# Module-scope cache survives across warm Vercel invocations; cold starts pay -# one GitHub round-trip and prime the cache for ~5min of subsequent requests. 
-_cache: dict[str, Any] = {"ts": 0.0, "payload": None} +def _now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def _cap(name: str, value: int) -> int: @@ -52,159 +46,102 @@ def _cap(name: str, value: int) -> int: return min(value, cap) if cap is not None else value -def _now_iso() -> str: - return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - - -def _load_static() -> dict[str, Any]: - """Read the build-time artifact (lines_of_code, built_at). Missing fields - are silently treated as absent per the spec ("omit rather than estimate").""" +def _read_json(path: Path) -> Any: try: - return json.loads(STATIC_FILE.read_text(encoding="utf-8")) + return json.loads(path.read_text(encoding="utf-8")) except (FileNotFoundError, json.JSONDecodeError, OSError, ValueError): - return {} + return None -def _http_get(url: str) -> tuple[Any, dict[str, str]]: - """Stdlib HTTP GET. Returns (parsed_json, response_headers).""" - req = Request( - url, - headers={"User-Agent": USER_AGENT, "Accept": "application/vnd.github+json"}, - ) - with urlopen(req, timeout=HTTP_TIMEOUT_S) as resp: # noqa: S310 (https only) - body = resp.read().decode("utf-8") - # Headers is a Message object; convert to plain dict for portability. - hdrs = {k.lower(): v for k, v in resp.getheaders()} - return json.loads(body), hdrs - +def _parse_iso(value: Any) -> datetime | None: + if not isinstance(value, str): + return None + try: + return datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc) + except ValueError: + return None -_LAST_PAGE_RE = re.compile(r'<[^>]*[?&]page=(\d+)[^>]*>;\s*rel="last"') +def _within(record: dict, days: int, now: datetime) -> bool: + stamp = _parse_iso(record.get("generated_at")) + return stamp is not None and (now - stamp) <= timedelta(days=days) -def _commits_count_from_link_header(link_header: str, when_no_last: int) -> int: - """Parse the 'last' page number from GitHub's Link header. 
- With per_page=1, the page count IS the total record count. When no Link - header is present (single page of results), fall back to ``when_no_last``. - """ - match = _LAST_PAGE_RE.search(link_header or "") - if match: - return int(match.group(1)) - return when_no_last +def _zeroed_metrics() -> dict[str, Any]: + return { + "eval_runs_total": 0, + "eval_runs_24h": 0, + "last_pass_rate": 0.0, + "rolling_pass_rate_7d": 0.0, + "regressions_caught_30d": 0, + "experiments_tracked": 0, + } -def _fetch_metrics() -> tuple[dict[str, Any], str | None]: - """Pull GitHub-derived metrics. Returns (metrics, last_commit_at).""" - repo, _ = _http_get(f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}") - repo_stars = _cap("repo_stars", int(repo.get("stargazers_count") or 0)) - primary_language = repo.get("language") or "Unknown" +def _metrics_from_history(history: list[dict], now: datetime) -> dict[str, Any]: + if not history: + return _zeroed_metrics() - commits_url = ( - f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}/commits?per_page=1" - ) - latest_commits, latest_hdrs = _http_get(commits_url) - commits_total = _cap( - "commits_total", - _commits_count_from_link_header(latest_hdrs.get("link", ""), len(latest_commits)), - ) - last_commit_at: str | None = None - if latest_commits: - last_commit_at = ( - latest_commits[0].get("commit", {}).get("author", {}).get("date") - ) - - since = (datetime.now(timezone.utc) - timedelta(days=30)).strftime( - "%Y-%m-%dT%H:%M:%SZ" - ) - recent_url = ( - f"{GITHUB_API}/repos/{GITHUB_OWNER}/{GITHUB_REPO}" - f"/commits?per_page=1&since={since}" - ) - recent_commits, recent_hdrs = _http_get(recent_url) - commits_30d = _cap( - "commits_30d", - _commits_count_from_link_header(recent_hdrs.get("link", ""), len(recent_commits)), + runs_7d = [r for r in history if _within(r, 7, now)] + pass_rates_7d = [float(r.get("pass_rate", 0.0)) for r in runs_7d] + rolling_7d = ( + round(sum(pass_rates_7d) / len(pass_rates_7d), 4) + if pass_rates_7d + else 
float(history[-1].get("pass_rate", 0.0)) ) - metrics: dict[str, Any] = { - "commits_30d": commits_30d, - "commits_total": commits_total, - "primary_language": primary_language, - "repo_stars": repo_stars, + variants: set[str] = set() + # Distinct regressions, not a per-run sum: re-detecting the same case nightly + # is not catching a new regression, so the 30-day count unions case ids. + regressed_30d: set[str] = set() + for record in history: + variants.update(record.get("variants", []) or []) + if _within(record, 30, now): + regressed_30d.update(record.get("regressed_ids", []) or []) + + return { + "eval_runs_total": _cap("eval_runs_total", len(history)), + "eval_runs_24h": _cap("eval_runs_24h", sum(1 for r in history if _within(r, 1, now))), + "last_pass_rate": round(float(history[-1].get("pass_rate", 0.0)), 4), + "rolling_pass_rate_7d": rolling_7d, + "regressions_caught_30d": _cap("regressions_caught_30d", len(regressed_30d)), + "experiments_tracked": _cap("experiments_tracked", len(variants)), } - static = _load_static() - loc = static.get("lines_of_code") - if isinstance(loc, int) and loc > 0: - metrics["lines_of_code"] = _cap("lines_of_code", loc) - return metrics, last_commit_at - - -def _zeroed_metrics() -> dict[str, Any]: - metrics: dict[str, Any] = { - "commits_30d": 0, - "commits_total": 0, - "primary_language": "Unknown", - "repo_stars": 0, - } - static = _load_static() - loc = static.get("lines_of_code") - if isinstance(loc, int) and loc > 0: - metrics["lines_of_code"] = _cap("lines_of_code", loc) - return metrics def _build_response() -> dict[str, Any]: - """Compose the full response object. 
Always returns a parseable dict.""" - now = time.time() - cached = _cache.get("payload") - if cached is not None and (now - _cache["ts"]) < CACHE_TTL_S: - fresh = dict(cached) - fresh["generated_at"] = _now_iso() - return fresh - - static = _load_static() - last_deployed_at = ( - os.environ.get("VERCEL_GIT_COMMIT_AUTHOR_DATE") or static.get("built_at") - ) - - try: - metrics, last_commit_at = _fetch_metrics() + now = datetime.now(timezone.utc) + static = _read_json(STATIC_FILE) or {} + last_deployed_at = os.environ.get("VERCEL_GIT_COMMIT_AUTHOR_DATE") or static.get("built_at") + + history = _read_json(HISTORY_FILE) + artifact = _read_json(ARTIFACT_FILE) + + if isinstance(history, list) and history: + metrics = _metrics_from_history(history, now) + last_active_at = ( + artifact.get("generated_at") if isinstance(artifact, dict) else None + ) or history[-1].get("generated_at") status = "operational" - except (HTTPError, URLError, OSError, json.JSONDecodeError, ValueError, TimeoutError): - # Upstream unreachable. Serve last good cache if we have one, - # otherwise zeros. Never propagate the error. - if cached is not None: - stale = dict(cached) - stale["status"] = "degraded" - stale["generated_at"] = _now_iso() - return stale + else: metrics = _zeroed_metrics() - last_commit_at = None + last_active_at = None status = "degraded" - response: dict[str, Any] = { + return { "system": SYSTEM_SLUG, - "mode": "showcase", + "mode": "live", "status": status, "last_deployed_at": last_deployed_at, - "last_commit_at": last_commit_at, + "last_active_at": last_active_at, "metrics": metrics, "schema_version": SCHEMA_VERSION, "generated_at": _now_iso(), } - if status == "operational": - _cache["payload"] = response - _cache["ts"] = now - return response - class handler(BaseHTTPRequestHandler): - """Vercel Python serverless entrypoint. - - Vercel discovers this class by name; the runtime invokes ``do_GET`` / - ``do_OPTIONS`` per the BaseHTTPRequestHandler protocol. 
- """ + """Vercel Python serverless entrypoint.""" def _write_common_headers(self) -> None: self.send_header("Cache-Control", "public, max-age=30, stale-while-revalidate=60") @@ -220,18 +157,17 @@ def do_OPTIONS(self) -> None: # noqa: N802 (interface contract) def do_GET(self) -> None: # noqa: N802 (interface contract) try: payload = _build_response() - except Exception: # noqa: BLE001 (last-resort: contract forbids 5xx) + except Exception: # noqa: BLE001 (last resort: contract forbids 5xx) payload = { "system": SYSTEM_SLUG, - "mode": "showcase", + "mode": "live", "status": "degraded", "last_deployed_at": None, - "last_commit_at": None, + "last_active_at": None, "metrics": _zeroed_metrics(), "schema_version": SCHEMA_VERSION, "generated_at": _now_iso(), } - body = json.dumps(payload, separators=(",", ":")).encode("utf-8") self.send_response(200) self.send_header("Content-Type", "application/json") @@ -241,4 +177,4 @@ def do_GET(self) -> None: # noqa: N802 (interface contract) self.wfile.write(body) def log_message(self, fmt: str, *args: Any) -> None: # noqa: A002, ARG002 - return # Suppress default access log; Vercel captures stdout/stderr. + return diff --git a/examples/benchmark-v1/README.md b/examples/benchmark-v1/README.md new file mode 100644 index 0000000..c135817 --- /dev/null +++ b/examples/benchmark-v1/README.md @@ -0,0 +1,63 @@ +# benchmark-v1 + +A small, fully reproducible extractive question-answering benchmark. It is the +workload behind EvalOps Workbench's public telemetry: a real evaluation that +runs weekly, persists its result to this repo, and is served at +`/api/benchmark-latest`. + +## What it measures + +Three deterministic strategies answer each question from its context passage: + +| Variant | Strategy | +| --- | --- | +| `first_sentence` | Return the opening sentence (floor). | +| `overlap_sentence` | **Baseline.** Return the whole sentence with the most question-word overlap. Safe but blunt. 
| +| `span_extract` | **Candidate.** Find that sentence, then narrow to an answer span (entity / year / number), skipping the entity the question already names. | + +Scores use the SQuAD convention: normalized exact match, token-overlap F1, and +gold containment. Each prediction scores against its best-matching gold answer. + +The harness pins the candidate aggregate as a baseline contract +(`pinned-baseline.json`) and blocks if a future run drops below it, and it +surfaces every per-case regression where the candidate scored worse than the +baseline. + +## Why this design + +The candidate lifts exact match and token F1 substantially by returning the +answer span instead of the whole sentence, but it regresses on an adversarial +pack where a distractor entity precedes the answer. That trade-off (a change +that improves the aggregate while silently degrading specific cases) is exactly +what a regression-tracking harness exists to make visible. + +The system-under-test is deterministic on purpose: anyone can reproduce the +published numbers with no credentials, no network, and no cost. The harness is +model-agnostic; a live-LLM target would implement the same interface and is an +optional extension, never required by this benchmark. + +## Contents + +- `cases.jsonl` — 38 labelled cases. 26 standard (who / where / when / how-many + / what) and 12 adversarial, tagged in each row. +- `pinned-baseline.json` — the pinned candidate-aggregate contract. +- `results/latest-report.md` — human-readable report of the most recent run. +- `results/archive/{run_id}.json` — every published run, by id. + +## Provenance and license + +Every passage is original prose written for this benchmark, describing +well-known public-domain facts (geography, science, history). No third-party +dataset text is included, so there are no upstream licensing constraints. The +fixture is dedicated to the public domain (CC0). 
+ +## Reproduce + +```bash +git clone https://github.com/IgnazioDS/evalops-workbench +cd evalops-workbench && pip install -e . +python -m evalops_workbench.benchmark_runner +``` + +The run writes `api/_benchmark_latest.json` (served at `/api/benchmark-latest`), +appends `api/_benchmark_history.json`, and refreshes the report above. diff --git a/examples/benchmark-v1/cases.jsonl b/examples/benchmark-v1/cases.jsonl new file mode 100644 index 0000000..dd09b6c --- /dev/null +++ b/examples/benchmark-v1/cases.jsonl @@ -0,0 +1,38 @@ +{"id": "who-01", "question": "Who developed the theory of general relativity?", "context": "The theory of general relativity was developed by Albert Einstein. It reshaped modern physics.", "answers": ["Albert Einstein"], "tags": ["standard", "who"]} +{"id": "who-02", "question": "Who discovered radium?", "context": "Radium was discovered by Marie Curie in 1898. The discovery later earned a Nobel Prize.", "answers": ["Marie Curie"], "tags": ["standard", "who"]} +{"id": "who-03", "question": "Who wrote the play Hamlet?", "context": "The play Hamlet was written by William Shakespeare. It remains widely performed today.", "answers": ["William Shakespeare"], "tags": ["standard", "who"]} +{"id": "who-04", "question": "Who painted the ceiling of the Sistine Chapel?", "context": "The ceiling of the Sistine Chapel was painted by Michelangelo. The work took roughly four years.", "answers": ["Michelangelo"], "tags": ["standard", "who"]} +{"id": "who-05", "question": "Who formulated the laws of motion?", "context": "The laws of motion were formulated by Isaac Newton. They underpin classical mechanics.", "answers": ["Isaac Newton"], "tags": ["standard", "who"]} +{"id": "who-06", "question": "Who proposed the theory of evolution by natural selection?", "context": "The theory of evolution by natural selection was proposed by Charles Darwin. 
It transformed biology.", "answers": ["Charles Darwin"], "tags": ["standard", "who"]} +{"id": "where-01", "question": "Where is the Eiffel Tower located?", "context": "The Eiffel Tower is located in Paris. It opened to the public in 1889.", "answers": ["Paris"], "tags": ["standard", "where"]} +{"id": "where-02", "question": "Where is Mount Everest located?", "context": "Mount Everest is located on the border of Nepal. It is the tallest mountain above sea level.", "answers": ["Nepal"], "tags": ["standard", "where"]} +{"id": "where-03", "question": "Where is the Colosseum located?", "context": "The Colosseum is located in Rome. It was completed around the year 80.", "answers": ["Rome"], "tags": ["standard", "where"]} +{"id": "where-04", "question": "Where is the Statue of Liberty located?", "context": "The Statue of Liberty is located in New York. It was a gift from the people of France.", "answers": ["New York"], "tags": ["standard", "where"]} +{"id": "where-05", "question": "Where were the first modern Olympic Games held?", "context": "The first modern Olympic Games were held in Athens. The event revived an ancient tradition.", "answers": ["Athens"], "tags": ["standard", "where"]} +{"id": "where-06", "question": "Where is the Great Barrier Reef located?", "context": "The Great Barrier Reef is located off the coast of Australia. It is the largest coral reef system on Earth.", "answers": ["Australia"], "tags": ["standard", "where"]} +{"id": "when-01", "question": "When was the Declaration of Independence adopted?", "context": "The Declaration of Independence was adopted in 1776. It announced a formal separation.", "answers": ["1776"], "tags": ["standard", "when"]} +{"id": "when-02", "question": "When did the Berlin Wall fall?", "context": "The Berlin Wall fell in 1989. 
Its collapse marked a turning point in European history.", "answers": ["1989"], "tags": ["standard", "when"]} +{"id": "when-03", "question": "When was the telephone patented?", "context": "The telephone was patented in 1876 by Alexander Graham Bell. It changed daily communication.", "answers": ["1876"], "tags": ["standard", "when"]} +{"id": "when-04", "question": "When did the first manned Moon landing occur?", "context": "The first manned Moon landing occurred in 1969. Astronauts walked on the lunar surface.", "answers": ["1969"], "tags": ["standard", "when"]} +{"id": "when-05", "question": "When was the Eiffel Tower completed?", "context": "The Eiffel Tower was completed in 1889 for a world exhibition. It soon became a landmark.", "answers": ["1889"], "tags": ["standard", "when"]} +{"id": "when-06", "question": "When was the Magna Carta sealed?", "context": "The Magna Carta was sealed in 1215. It limited the power of the crown.", "answers": ["1215"], "tags": ["standard", "when"]} +{"id": "howmany-01", "question": "How many continents are there on Earth?", "context": "There are 7 continents on Earth. They vary greatly in size and population.", "answers": ["7"], "tags": ["standard", "how-many"]} +{"id": "howmany-02", "question": "How many players does a soccer team field at once?", "context": "A soccer team fields 11 players at a time. Substitutions are permitted during play.", "answers": ["11"], "tags": ["standard", "how-many"]} +{"id": "howmany-03", "question": "How many strings does a standard guitar have?", "context": "A standard guitar has 6 strings. Variants exist with more strings.", "answers": ["6"], "tags": ["standard", "how-many"]} +{"id": "howmany-04", "question": "How many keys are on a standard piano?", "context": "A standard piano has 88 keys. 
The layout repeats across several octaves.", "answers": ["88"], "tags": ["standard", "how-many"]} +{"id": "howmany-05", "question": "How many moons does Mars have?", "context": "Mars has 2 moons, named Phobos and Deimos. Both are small and irregular in shape.", "answers": ["2"], "tags": ["standard", "how-many"]} +{"id": "howmany-06", "question": "How many sides does a hexagon have?", "context": "A hexagon has 6 sides. It tiles a plane without gaps.", "answers": ["6"], "tags": ["standard", "how-many"]} +{"id": "what-01", "question": "What gas do plants absorb during photosynthesis?", "context": "During photosynthesis, plants absorb carbon dioxide. They release oxygen as a byproduct.", "answers": ["carbon dioxide"], "tags": ["standard", "what"]} +{"id": "what-02", "question": "What is the largest planet in the Solar System?", "context": "The largest planet in the Solar System is Jupiter. It is a gas giant with many moons.", "answers": ["Jupiter"], "tags": ["standard", "what"]} +{"id": "adv-01", "question": "Who composed the Ninth Symphony?", "context": "The Ninth Symphony was first performed in Vienna and is attributed to Beethoven. Critics praised its choral finale.", "answers": ["Beethoven"], "tags": ["adversarial", "who"]} +{"id": "adv-02", "question": "Who discovered penicillin?", "context": "The substance penicillin, studied in London, was identified by Alexander Fleming. The finding launched modern antibiotics.", "answers": ["Alexander Fleming"], "tags": ["adversarial", "who"]} +{"id": "adv-03", "question": "Who wrote the novel Don Quixote?", "context": "The novel Don Quixote, set in Spain, was written by Miguel Cervantes. It is a foundational work of fiction.", "answers": ["Miguel Cervantes"], "tags": ["adversarial", "who"]} +{"id": "adv-04", "question": "Who directed the film Psycho?", "context": "The film Psycho, shot in Hollywood, was directed by Alfred Hitchcock. 
It became a defining thriller.", "answers": ["Alfred Hitchcock"], "tags": ["adversarial", "who"]} +{"id": "adv-05", "question": "Where is the Louvre museum located?", "context": "The Louvre museum, which displays works from Italy, is located in Paris. Millions of people visit it each year.", "answers": ["Paris"], "tags": ["adversarial", "where"]} +{"id": "adv-06", "question": "Where was Napoleon born?", "context": "Long associated with France, Napoleon was born on the island of Corsica. He later became emperor.", "answers": ["Corsica"], "tags": ["adversarial", "where"]} +{"id": "adv-07", "question": "Where are the company headquarters located?", "context": "The company, which began operations in Germany, has its headquarters located in Tokyo. It employs thousands of people.", "answers": ["Tokyo"], "tags": ["adversarial", "where"]} +{"id": "adv-08", "question": "When did Ford introduce the Model T?", "context": "Ford expanded production in 1913 and introduced the Model T in 1908. The car was affordable for many families.", "answers": ["1908"], "tags": ["adversarial", "when"]} +{"id": "adv-09", "question": "When did the museum first open?", "context": "The museum was renovated in 2001 after first opening in 1937. Its collection spans several centuries.", "answers": ["1937"], "tags": ["adversarial", "when"]} +{"id": "adv-10", "question": "When was the bridge completed?", "context": "The bridge began construction in 1933 and was completed in 1937. It spans a wide strait.", "answers": ["1937"], "tags": ["adversarial", "when"]} +{"id": "adv-11", "question": "How many gold medals did the athlete win?", "context": "Competing in 24 events overall, the athlete won 8 gold medals. The tally set a national record.", "answers": ["8"], "tags": ["adversarial", "how-many"]} +{"id": "adv-12", "question": "How many championships did the team win?", "context": "Across 30 seasons in the league, the team won 5 championships. 
Supporters celebrated every title.", "answers": ["5"], "tags": ["adversarial", "how-many"]} diff --git a/examples/benchmark-v1/pinned-baseline.json b/examples/benchmark-v1/pinned-baseline.json new file mode 100644 index 0000000..71fea75 --- /dev/null +++ b/examples/benchmark-v1/pinned-baseline.json @@ -0,0 +1,9 @@ +{ + "variant": "span_extract", + "metric": "token_f1", + "aggregate": { + "exact_match": 0.631579, + "token_f1": 0.651316, + "contains_gold": 0.684211 + } +} diff --git a/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json b/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json new file mode 100644 index 0000000..59472d5 --- /dev/null +++ b/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json @@ -0,0 +1,202 @@ +{ + "system": "evalops", + "benchmark_type": "eval", + "run_id": "evalops-2026-05-26-308b0b20", + "fixture": "benchmark-v1", + "metrics": { + "n_cases": 38, + "baseline_variant": "overlap_sentence", + "candidate_variant": "span_extract", + "exact_match": 0.631579, + "token_f1": 0.651316, + "contains_gold": 0.684211, + "improvement_exact_match": 0.631579, + "improvement_token_f1": 0.377036, + "regressions": 12, + "pass_rate": 0.631579, + "gate_verdict": "pass" + }, + "variants": [ + { + "name": "first_sentence", + "exact_match": 0.0, + "token_f1": 0.27428, + "contains_gold": 1.0 + }, + { + "name": "overlap_sentence", + "exact_match": 0.0, + "token_f1": 0.27428, + "contains_gold": 1.0 + }, + { + "name": "span_extract", + "exact_match": 0.631579, + "token_f1": 0.651316, + "contains_gold": 0.684211 + } + ], + "regressions": [ + { + "case_id": "adv-01", + "metric": "token_f1", + "baseline": 0.153846, + "candidate": 0.0, + "delta": -0.153846, + "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)", + "tags": [ + "adversarial", + "who" + ] + }, + { + "case_id": "adv-02", + "metric": "token_f1", + "baseline": 0.333333, + "candidate": 0.0, + "delta": -0.333333, + "reason": "token_f1: 
candidate 0.00 below baseline 0.33 (-0.33)", + "tags": [ + "adversarial", + "who" + ] + }, + { + "case_id": "adv-03", + "metric": "token_f1", + "baseline": 0.307692, + "candidate": 0.0, + "delta": -0.307692, + "reason": "token_f1: candidate 0.00 below baseline 0.31 (-0.31)", + "tags": [ + "adversarial", + "who" + ] + }, + { + "case_id": "adv-04", + "metric": "token_f1", + "baseline": 0.333333, + "candidate": 0.0, + "delta": -0.333333, + "reason": "token_f1: candidate 0.00 below baseline 0.33 (-0.33)", + "tags": [ + "adversarial", + "who" + ] + }, + { + "case_id": "adv-05", + "metric": "token_f1", + "baseline": 0.166667, + "candidate": 0.0, + "delta": -0.166667, + "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)", + "tags": [ + "adversarial", + "where" + ] + }, + { + "case_id": "adv-06", + "metric": "token_f1", + "baseline": 0.166667, + "candidate": 0.0, + "delta": -0.166667, + "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)", + "tags": [ + "adversarial", + "where" + ] + }, + { + "case_id": "adv-07", + "metric": "token_f1", + "baseline": 0.153846, + "candidate": 0.0, + "delta": -0.153846, + "reason": "token_f1: candidate 0.00 below baseline 0.15 (-0.15)", + "tags": [ + "adversarial", + "where" + ] + }, + { + "case_id": "adv-08", + "metric": "token_f1", + "baseline": 0.166667, + "candidate": 0.0, + "delta": -0.166667, + "reason": "token_f1: candidate 0.00 below baseline 0.17 (-0.17)", + "tags": [ + "adversarial", + "when" + ] + }, + { + "case_id": "adv-09", + "metric": "token_f1", + "baseline": 0.181818, + "candidate": 0.0, + "delta": -0.181818, + "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)", + "tags": [ + "adversarial", + "when" + ] + }, + { + "case_id": "adv-10", + "metric": "token_f1", + "baseline": 0.181818, + "candidate": 0.0, + "delta": -0.181818, + "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)", + "tags": [ + "adversarial", + "when" + ] + }, + { + "case_id": "adv-11", + "metric": "token_f1", 
+ "baseline": 0.181818, + "candidate": 0.0, + "delta": -0.181818, + "reason": "token_f1: candidate 0.00 below baseline 0.18 (-0.18)", + "tags": [ + "adversarial", + "how-many" + ] + }, + { + "case_id": "adv-12", + "metric": "token_f1", + "baseline": 0.2, + "candidate": 0.0, + "delta": -0.2, + "reason": "token_f1: candidate 0.00 below baseline 0.20 (-0.20)", + "tags": [ + "adversarial", + "how-many" + ] + } + ], + "gate": { + "passed": true, + "metric": "token_f1", + "pinned": null, + "observed": 0.651316, + "reasons": [ + "no pinned baseline yet; this run establishes the contract" + ] + }, + "artifact_urls": { + "report": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/latest-report.md", + "fixture": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/cases.jsonl", + "run": "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/results/archive/evalops-2026-05-26-308b0b20.json" + }, + "schema_version": 1, + "generated_at": "2026-05-26T06:49:52Z", + "previous_run": null +} diff --git a/examples/benchmark-v1/results/latest-report.md b/examples/benchmark-v1/results/latest-report.md new file mode 100644 index 0000000..8c3103a --- /dev/null +++ b/examples/benchmark-v1/results/latest-report.md @@ -0,0 +1,43 @@ +# EvalOps benchmark: benchmark-v1 + +- Run: `evalops-2026-05-26-308b0b20` +- Generated: 2026-05-26T06:49:52Z +- Cases: 38 +- Gate verdict: **PASS** + +## Variants + +| Variant | Exact match | Token F1 | Contains gold | +| --- | --- | --- | --- | +| first_sentence | 0.000 | 0.274 | 1.000 | +| overlap_sentence (baseline) | 0.000 | 0.274 | 1.000 | +| span_extract (candidate) | 0.632 | 0.651 | 0.684 | + +Candidate against baseline: exact match +0.632, token F1 +0.377. 
+ +## Regressions (12) + +| Case | Baseline | Candidate | Delta | Reason | +| --- | --- | --- | --- | --- | +| adv-01 | 0.15 | 0.00 | -0.15 | token_f1: candidate 0.00 below baseline 0.15 (-0.15) | +| adv-02 | 0.33 | 0.00 | -0.33 | token_f1: candidate 0.00 below baseline 0.33 (-0.33) | +| adv-03 | 0.31 | 0.00 | -0.31 | token_f1: candidate 0.00 below baseline 0.31 (-0.31) | +| adv-04 | 0.33 | 0.00 | -0.33 | token_f1: candidate 0.00 below baseline 0.33 (-0.33) | +| adv-05 | 0.17 | 0.00 | -0.17 | token_f1: candidate 0.00 below baseline 0.17 (-0.17) | +| adv-06 | 0.17 | 0.00 | -0.17 | token_f1: candidate 0.00 below baseline 0.17 (-0.17) | +| adv-07 | 0.15 | 0.00 | -0.15 | token_f1: candidate 0.00 below baseline 0.15 (-0.15) | +| adv-08 | 0.17 | 0.00 | -0.17 | token_f1: candidate 0.00 below baseline 0.17 (-0.17) | +| adv-09 | 0.18 | 0.00 | -0.18 | token_f1: candidate 0.00 below baseline 0.18 (-0.18) | +| adv-10 | 0.18 | 0.00 | -0.18 | token_f1: candidate 0.00 below baseline 0.18 (-0.18) | +| adv-11 | 0.18 | 0.00 | -0.18 | token_f1: candidate 0.00 below baseline 0.18 (-0.18) | +| adv-12 | 0.20 | 0.00 | -0.20 | token_f1: candidate 0.00 below baseline 0.20 (-0.20) | + +## Reproduce + +```bash +git clone https://github.com/IgnazioDS/evalops-workbench +cd evalops-workbench && pip install -e . +python -m evalops_workbench.benchmark_runner +``` + +Fixture: [`benchmark-v1`](https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main/examples/benchmark-v1/cases.jsonl). Every case, label, and score is reproducible offline with no credentials. 
diff --git a/src/app/page.tsx b/src/app/page.tsx index 523db47..0f02530 100644 --- a/src/app/page.tsx +++ b/src/app/page.tsx @@ -4,57 +4,59 @@ import { useEffect, useState } from "react"; import { ArrowRight, ExternalLink, - GitCommit, + FileText, + FlaskConical, + GitCompare, Github, - Lightbulb, - Star, - TrendingUp, + ShieldAlert, + Target, Users, } from "lucide-react"; -import { fetchPublicStats, type PublicStats } from "@/lib/api"; +import { + fetchBenchmarkLatest, + fetchPublicStats, + type PublicBenchmark, + type PublicStats, +} from "@/lib/api"; import { TopBar } from "@/components/layout/TopBar"; import { Card, CardHeader, CardTitle, CardContent, CardDescription } from "@/components/ui/card"; import { Button } from "@/components/ui/button"; import { Badge } from "@/components/ui/badge"; import { StatusDot } from "@/components/ui/status-dot"; -import { StatCard } from "@/components/dashboard/StatCard"; import { Skeleton } from "@/components/ui/skeleton"; -import { Sparkline } from "@/components/ui/sparkline"; import { PROJECT } from "@/lib/project"; -import { formatRelative } from "@/lib/utils"; +import { formatNumber, formatRelative } from "@/lib/utils"; -/** - * Build a deterministic 10-point shape derived from the live value, so - * StatCard sparklines convey velocity without claiming a measured history - * the showcase tier doesn't have. 
- */ -function shapeFromValue(target: number, points = 10): number[] { - if (target <= 0) return Array(points).fill(0); - const result: number[] = []; - for (let i = 0; i < points; i++) { - const ratio = i / (points - 1); - const eased = ratio * ratio; - const wobble = Math.sin(i + target) * 0.06; - result.push(target * (eased + wobble + 0.1)); - } - return result; +function pct(value: number | undefined): string { + if (value === undefined || Number.isNaN(value)) return "—"; + return `${Math.round(value * 100)}%`; +} + +function signedPoints(value: number): string { + const points = Math.round(value * 100); + return `${points >= 0 ? "+" : ""}${points} pts`; } export default function OverviewPage() { const [stats, setStats] = useState(null); + const [benchmark, setBenchmark] = useState(null); const [loading, setLoading] = useState(true); useEffect(() => { - fetchPublicStats() - .then(setStats) - .catch(() => null) + Promise.allSettled([fetchPublicStats(), fetchBenchmarkLatest()]) + .then(([statsResult, benchmarkResult]) => { + if (statsResult.status === "fulfilled") setStats(statsResult.value); + if (benchmarkResult.status === "fulfilled") setBenchmark(benchmarkResult.value); + }) .finally(() => setLoading(false)); }, []); - const commitsTotal = (stats?.metrics.commits_total as number | undefined) ?? 0; - const commits30d = (stats?.metrics.commits_30d as number | undefined) ?? 0; - const stars = (stats?.metrics.repo_stars as number | undefined) ?? 0; - const loc = (stats?.metrics.lines_of_code as number | undefined) ?? 0; + const metrics = stats?.metrics ?? 
{}; + const lastPass = metrics.last_pass_rate as number | undefined; + const rolling = metrics.rolling_pass_rate_7d as number | undefined; + const regressions = metrics.regressions_caught_30d as number | undefined; + const runs = metrics.eval_runs_total as number | undefined; + const experiments = metrics.experiments_tracked as number | undefined; return ( <> @@ -97,11 +99,7 @@ export default function OverviewPage() { + + + )} + + + ); +} + +function Stat({ + title, + value, + subtitle, + icon: Icon, + loading, +}: { + title: string; + value: string; + subtitle?: string; + icon: typeof Target; + loading: boolean; +}) { + return ( + +
+
+

+ {title} +

+
+ +
+
+ {loading ? ( + + ) : ( +

{value}

+ )} + {subtitle &&

{subtitle}

} +
+
+ ); +} + function StatusCell({ label, value, @@ -256,13 +425,9 @@ function StatusCell({

{label}

-

- {value} -

+

{value}

{hint && ( -

- {hint} -

+

{hint}

)} ); diff --git a/src/app/telemetry/page.tsx b/src/app/telemetry/page.tsx index f888169..3bfdf58 100644 --- a/src/app/telemetry/page.tsx +++ b/src/app/telemetry/page.tsx @@ -3,11 +3,11 @@ import { useState } from "react"; import { CheckCircle2, - Code2, - GitCommit, - Layers, + FlaskConical, + GitCompare, RefreshCw, - Star, + ShieldAlert, + Target, } from "lucide-react"; import { fetchPublicStats, type PublicStats } from "@/lib/api"; import { TopBar } from "@/components/layout/TopBar"; @@ -28,6 +28,12 @@ import { const POLL_INTERVAL_MS = 30_000; +function pct(value: unknown): string { + return typeof value === "number" && !Number.isNaN(value) + ? `${Math.round(value * 100)}%` + : "—"; +} + export default function TelemetryPage() { const { data: stats, loading, error, refetch } = usePolling( fetchPublicStats, @@ -77,7 +83,7 @@ export default function TelemetryPage() {
- {stats?.mode ?? "showcase"} + {stats?.mode ?? "live"} generated {formatRelative(stats?.generated_at)} @@ -102,61 +108,57 @@ export default function TelemetryPage() { )} - {/* Tier-B metric grid */} + {/* Tier-A metric grid */}
@@ -187,7 +189,7 @@ export default function TelemetryPage() { /> This endpoint runs in{" "} - mode: "showcase" + mode: "live" {" "} per the public schema at{" "} TELEMETRY_SCHEMA.md - . Counters are sourced from the GitHub REST API - (commits, language, stars) plus a build-time line-of-code - snapshot, behind a 5-minute module-scope cache. + . Every metric is computed from the committed history of + the public benchmark, which re-runs on a schedule and on + every change. Nothing is simulated, seeded, or incremented + in memory.

- The endpoint never returns 5xx — GitHub failures degrade - to{" "} + The endpoint never returns 5xx. With no published run it + degrades to{" "} status: "degraded" {" "} - with the last cached response (or zeros) and a - contract-valid envelope. + with zeroed metrics and a contract-valid envelope. The + latest run is served at{" "} + + /api/benchmark-latest + + .

- {`curl -i https://${PROJECT.slug}.vercel.app/api/stats`} + {`curl -i https://${PROJECT.slug}.eleventh.dev/api/stats`}
@@ -284,7 +291,7 @@ function MetricTile({ }: { label: string; value: string; - icon: typeof GitCommit; + icon: typeof CheckCircle2; loading: boolean; }) { return ( diff --git a/src/evalops_workbench/benchmark_runner.py b/src/evalops_workbench/benchmark_runner.py new file mode 100644 index 0000000..33e80c5 --- /dev/null +++ b/src/evalops_workbench/benchmark_runner.py @@ -0,0 +1,106 @@ +"""Run the public benchmark and publish the committed artifact. + +This is the workload behind EvalOps' Tier-A telemetry: a real, reproducible +evaluation that runs weekly (and on demand), persists its result to the repo, +and is read back by the public /api/benchmark-latest and /api/stats endpoints. + +It is dependency-free and credential-free. Re-running it on the committed +fixture reproduces the published numbers exactly. +""" +from __future__ import annotations + +import json +from datetime import datetime, timezone +from pathlib import Path + +from .eval import artifact as artifact_mod +from .eval import baseline as baseline_mod +from .eval import ledger as ledger_mod +from .eval import report as report_mod +from .eval.cases import load_cases +from .eval.runner import compare, run_target +from .eval.targets import get_target + +FIXTURE_ID = "benchmark-v1" +FIXTURE_REL = "examples/benchmark-v1/cases.jsonl" +BASELINE_VARIANT = "overlap_sentence" +CANDIDATE_VARIANT = "span_extract" +VARIANT_ORDER = ("first_sentence", "overlap_sentence", "span_extract") +GATE_METRIC = "token_f1" + + +def _repo_root() -> Path: + # src/evalops_workbench/benchmark_runner.py -> repo root is three levels up. + return Path(__file__).resolve().parents[2] + + +def _now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def run_benchmark(repo_root: Path | None = None, *, write: bool = True) -> dict: + """Execute the benchmark and return the artifact. 
Persist it when ``write``.""" + root = repo_root or _repo_root() + fixture_path = root / FIXTURE_REL + results_dir = root / "examples" / FIXTURE_ID / "results" + archive_dir = results_dir / "archive" + pinned_path = root / "examples" / FIXTURE_ID / "pinned-baseline.json" + latest_artifact_path = root / "api" / "_benchmark_latest.json" + history_path = root / "api" / "_benchmark_history.json" + report_path = results_dir / "latest-report.md" + + cases = load_cases(fixture_path) + results = {name: run_target(name, get_target(name), cases) for name in VARIANT_ORDER} + baseline_result = results[BASELINE_VARIANT] + candidate_result = results[CANDIDATE_VARIANT] + + regressions = compare(baseline_result, candidate_result, metric=GATE_METRIC) + pinned = baseline_mod.load_pinned(pinned_path) + verdict = baseline_mod.evaluate_gate(candidate_result, pinned, metric=GATE_METRIC) + + previous = ledger_mod.previous_record(history_path) + generated_at = _now_iso() + artifact = artifact_mod.build_artifact( + fixture_id=FIXTURE_ID, + fixture_rel=FIXTURE_REL, + results=results, + baseline_name=BASELINE_VARIANT, + candidate_name=CANDIDATE_VARIANT, + regressions=regressions, + verdict=verdict, + previous=previous, + generated_at=generated_at, + ) + + if write: + archive_dir.mkdir(parents=True, exist_ok=True) + latest_artifact_path.parent.mkdir(parents=True, exist_ok=True) + _write_json(latest_artifact_path, artifact) + _write_json(archive_dir / f"{artifact['run_id']}.json", artifact) + report_path.write_text(report_mod.render(artifact), encoding="utf-8") + ledger_mod.append_record(history_path, artifact_mod.slim_record(artifact)) + if pinned is None: + baseline_mod.save_pinned(pinned_path, candidate_result, metric=GATE_METRIC) + + return artifact + + +def _write_json(path: Path, payload: dict) -> None: + path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8") + + +def main(argv: list[str] | None = None) -> int: + artifact = run_benchmark() + metrics = artifact["metrics"] 
+ print( + f"[{artifact['run_id']}] cases={metrics['n_cases']} " + f"candidate={metrics['candidate_variant']} " + f"exact_match={metrics['exact_match']:.3f} token_f1={metrics['token_f1']:.3f} " + f"(vs baseline {metrics['improvement_token_f1']:+.3f} F1) " + f"regressions={metrics['regressions']} gate={metrics['gate_verdict']}" + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/evalops_workbench/eval/__init__.py b/src/evalops_workbench/eval/__init__.py new file mode 100644 index 0000000..6d80e3c --- /dev/null +++ b/src/evalops_workbench/eval/__init__.py @@ -0,0 +1,12 @@ +"""EvalOps evaluation engine. + +A local-first, dependency-free evaluation harness: load a labelled dataset, +run named variants of a system-under-test, score predictions with rubric +functions, pin a baseline, and surface per-case regressions. + +The engine is model-agnostic. The public benchmark ships deterministic +extractive-QA strategies as the system-under-test so the run is reproducible +by any third party with zero credentials and zero cost. A live LLM target is +an optional extension point (see ``targets.Target``), never a requirement. +""" +from __future__ import annotations diff --git a/src/evalops_workbench/eval/artifact.py b/src/evalops_workbench/eval/artifact.py new file mode 100644 index 0000000..d43da19 --- /dev/null +++ b/src/evalops_workbench/eval/artifact.py @@ -0,0 +1,134 @@ +"""Assemble the schema-conformed /api/benchmark-latest payload + slim history record. + +The artifact is the public contract. Its shape matches the benchmark-latest +specification in TELEMETRY_SCHEMA.md: a stable envelope with metrics, the +per-variant comparison, the surfaced regressions, public artifact URLs, and a +run-over-run delta. 
+""" +from __future__ import annotations + +import hashlib + +from .baseline import GateVerdict +from .runner import Regression, RunResult + +SYSTEM_SLUG = "evalops" +BENCHMARK_TYPE = "eval" +SCHEMA_VERSION = 1 +_RAW_BASE = "https://raw.githubusercontent.com/IgnazioDS/evalops-workbench/main" + + +def run_id_for(fixture_id: str, candidate: RunResult, baseline: RunResult, generated_at: str) -> str: + """Deterministic, reproducible id: date + short hash of the compared aggregates.""" + digest_source = f"{fixture_id}|{candidate.aggregate}|{baseline.aggregate}" + short = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()[:8] + return f"{SYSTEM_SLUG}-{generated_at[:10]}-{short}" + + +def _artifact_urls(run_id: str, fixture_rel: str) -> dict: + return { + "report": f"{_RAW_BASE}/examples/benchmark-v1/results/latest-report.md", + "fixture": f"{_RAW_BASE}/{fixture_rel}", + "run": f"{_RAW_BASE}/examples/benchmark-v1/results/archive/{run_id}.json", + } + + +def _variant_row(result: RunResult) -> dict: + return {"name": result.target, **result.aggregate} + + +def _previous_block(previous: dict | None, metrics: dict) -> dict | None: + if not previous: + return None + return { + "run_id": previous.get("run_id"), + "generated_at": previous.get("generated_at"), + "delta": { + "token_f1": round(metrics["token_f1"] - float(previous.get("token_f1", 0.0)), 6), + "exact_match": round(metrics["exact_match"] - float(previous.get("exact_match", 0.0)), 6), + "regressions": metrics["regressions"] - int(previous.get("regressions", 0)), + }, + } + + +def build_artifact( + *, + fixture_id: str, + fixture_rel: str, + results: dict[str, RunResult], + baseline_name: str, + candidate_name: str, + regressions: list[Regression], + verdict: GateVerdict, + previous: dict | None, + generated_at: str, +) -> dict: + baseline = results[baseline_name] + candidate = results[candidate_name] + run_id = run_id_for(fixture_id, candidate, baseline, generated_at) + + metrics = { + "n_cases": 
candidate.n_cases, + "baseline_variant": baseline_name, + "candidate_variant": candidate_name, + "exact_match": candidate.aggregate["exact_match"], + "token_f1": candidate.aggregate["token_f1"], + "contains_gold": candidate.aggregate["contains_gold"], + "improvement_exact_match": round( + candidate.aggregate["exact_match"] - baseline.aggregate["exact_match"], 6 + ), + "improvement_token_f1": round( + candidate.aggregate["token_f1"] - baseline.aggregate["token_f1"], 6 + ), + "regressions": len(regressions), + "pass_rate": candidate.aggregate["exact_match"], + "gate_verdict": "pass" if verdict.passed else "fail", + } + + return { + "system": SYSTEM_SLUG, + "benchmark_type": BENCHMARK_TYPE, + "run_id": run_id, + "fixture": fixture_id, + "metrics": metrics, + "variants": [_variant_row(results[name]) for name in results], + "regressions": [ + { + "case_id": r.case_id, + "metric": r.metric, + "baseline": r.baseline, + "candidate": r.candidate, + "delta": r.delta, + "reason": r.reason, + "tags": list(r.tags), + } + for r in regressions + ], + "gate": { + "passed": verdict.passed, + "metric": verdict.metric, + "pinned": verdict.pinned, + "observed": verdict.observed, + "reasons": list(verdict.reasons), + }, + "artifact_urls": _artifact_urls(run_id, fixture_rel), + "schema_version": SCHEMA_VERSION, + "generated_at": generated_at, + "previous_run": _previous_block(previous, metrics), + } + + +def slim_record(artifact: dict) -> dict: + """The compact history row consumed by the ledger and /api/stats rollups.""" + metrics = artifact["metrics"] + return { + "run_id": artifact["run_id"], + "generated_at": artifact["generated_at"], + "pass_rate": metrics["pass_rate"], + "token_f1": metrics["token_f1"], + "exact_match": metrics["exact_match"], + "regressions": metrics["regressions"], + "regressed_ids": [reg["case_id"] for reg in artifact["regressions"]], + "gate_verdict": metrics["gate_verdict"], + "variants": [variant["name"] for variant in artifact["variants"]], + } diff --git 
a/src/evalops_workbench/eval/baseline.py b/src/evalops_workbench/eval/baseline.py new file mode 100644 index 0000000..9603dce --- /dev/null +++ b/src/evalops_workbench/eval/baseline.py @@ -0,0 +1,81 @@ +"""Pinned baseline contract and the regression gate. + +EvalOps' thesis: a regression dashboard nobody reads does not prevent +regressions. The contract is that quality below a pinned baseline blocks. This +module persists that pinned aggregate (a versioned file in the repo) and +evaluates a candidate run against it. +""" +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from .runner import RunResult + +_DEFAULT_METRIC = "token_f1" +_DEFAULT_TOLERANCE = 0.02 + + +@dataclass(frozen=True) +class GateVerdict: + passed: bool + metric: str + pinned: float | None + observed: float + reasons: tuple[str, ...] + + +def load_pinned(path: str | Path) -> dict | None: + path = Path(path) + if not path.exists(): + return None + try: + data = json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, ValueError): + return None + return data if isinstance(data, dict) else None + + +def save_pinned(path: str | Path, result: RunResult, *, metric: str = _DEFAULT_METRIC) -> None: + payload = {"variant": result.target, "metric": metric, "aggregate": result.aggregate} + Path(path).write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8") + + +def evaluate_gate( + candidate: RunResult, + pinned: dict | None, + *, + metric: str = _DEFAULT_METRIC, + tolerance: float = _DEFAULT_TOLERANCE, +) -> GateVerdict: + """Pass unless the candidate aggregate drops below the pinned floor.""" + observed = float(candidate.aggregate.get(metric, 0.0)) + if pinned is None: + return GateVerdict( + passed=True, + metric=metric, + pinned=None, + observed=observed, + reasons=("no pinned baseline yet; this run establishes the contract",), + ) + pinned_value = float(pinned.get("aggregate", {}).get(metric, 0.0)) + 
floor = pinned_value - tolerance + if observed + 1e-9 >= floor: + return GateVerdict( + passed=True, + metric=metric, + pinned=pinned_value, + observed=observed, + reasons=(f"{metric} {observed:.3f} holds at or above pinned floor {floor:.3f}",), + ) + return GateVerdict( + passed=False, + metric=metric, + pinned=pinned_value, + observed=observed, + reasons=( + f"{metric} {observed:.3f} dropped below pinned floor {floor:.3f} " + f"(pinned {pinned_value:.3f}, tolerance {tolerance:.3f})", + ), + ) diff --git a/src/evalops_workbench/eval/cases.py b/src/evalops_workbench/eval/cases.py new file mode 100644 index 0000000..35744d6 --- /dev/null +++ b/src/evalops_workbench/eval/cases.py @@ -0,0 +1,94 @@ +"""The evaluation dataset: an immutable ``Case`` and loaders for JSONL/JSON/CSV. + +A case is one labelled example: a question over a context passage, with one or +more acceptable gold answers. Loading validates at the boundary and fails fast +on malformed rows rather than silently scoring against garbage. +""" +from __future__ import annotations + +import csv +import json +from dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class Case: + """One labelled evaluation example.""" + + id: str + question: str + context: str + answers: tuple[str, ...] + tags: tuple[str, ...] 
= () + + +def _coerce_answers(raw: object) -> tuple[str, ...]: + if isinstance(raw, str): + return (raw,) + if isinstance(raw, (list, tuple)): + answers = tuple(str(a) for a in raw if str(a).strip()) + if answers: + return answers + raise ValueError("case 'answers' must be a non-empty string or list of strings") + + +def _coerce_tags(raw: object) -> tuple[str, ...]: + if raw is None or raw == "": + return () + if isinstance(raw, str): + return tuple(t.strip() for t in raw.split("|") if t.strip()) + if isinstance(raw, (list, tuple)): + return tuple(str(t).strip() for t in raw if str(t).strip()) + return () + + +def _coerce_case(raw: dict) -> Case: + for field in ("id", "question", "context", "answers"): + if field not in raw: + raise ValueError(f"case is missing required field '{field}': {raw!r}") + return Case( + id=str(raw["id"]), + question=str(raw["question"]), + context=str(raw["context"]), + answers=_coerce_answers(raw["answers"]), + tags=_coerce_tags(raw.get("tags")), + ) + + +def load_cases(path: str | Path) -> list[Case]: + """Load cases from a ``.jsonl``, ``.json`` (array), or ``.csv`` file.""" + path = Path(path) + if not path.exists(): + raise FileNotFoundError(f"fixture not found: {path}") + + suffix = path.suffix.lower() + if suffix == ".jsonl": + rows = [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + elif suffix == ".json": + rows = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(rows, list): + raise ValueError(f"{path} must contain a JSON array of cases") + elif suffix == ".csv": + with path.open(encoding="utf-8", newline="") as handle: + rows = list(csv.DictReader(handle)) + else: + raise ValueError(f"unsupported fixture extension: {suffix}") + + cases = [_coerce_case(row) for row in rows] + if not cases: + raise ValueError(f"fixture {path} contained no cases") + _assert_unique_ids(cases) + return cases + + +def _assert_unique_ids(cases: list[Case]) -> None: + seen: set[str] = 
set() + for case in cases: + if case.id in seen: + raise ValueError(f"duplicate case id: {case.id}") + seen.add(case.id) diff --git a/src/evalops_workbench/eval/ledger.py b/src/evalops_workbench/eval/ledger.py new file mode 100644 index 0000000..4c1ebc8 --- /dev/null +++ b/src/evalops_workbench/eval/ledger.py @@ -0,0 +1,39 @@ +"""Append-only run history, stored as a JSON array the stdlib endpoints can read. + +Each record is a slim summary of one published run. The history backs the +run-over-run delta on /api/benchmark-latest and the trailing rollups +(24h / 7d / 30d) on /api/stats. Trimmed to a bounded length so the committed +artifact never grows without limit. +""" +from __future__ import annotations + +import json +from pathlib import Path + +_KEEP = 100 + + +def read_records(path: str | Path) -> list[dict]: + path = Path(path) + if not path.exists(): + return [] + try: + data = json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, ValueError): + return [] + return data if isinstance(data, list) else [] + + +def previous_record(path: str | Path) -> dict | None: + """The most recent record currently on disk (before a new append).""" + records = read_records(path) + return records[-1] if records else None + + +def append_record(path: str | Path, record: dict, *, keep: int = _KEEP) -> list[dict]: + """Append ``record`` and persist, trimming to the most recent ``keep`` runs.""" + records = read_records(path) + records.append(record) + trimmed = records[-keep:] + Path(path).write_text(json.dumps(trimmed, indent=2) + "\n", encoding="utf-8") + return trimmed diff --git a/src/evalops_workbench/eval/normalize.py b/src/evalops_workbench/eval/normalize.py new file mode 100644 index 0000000..093dd2d --- /dev/null +++ b/src/evalops_workbench/eval/normalize.py @@ -0,0 +1,34 @@ +"""Answer-text normalization, following the SQuAD evaluation convention. + +Normalization is the contract that makes scores comparable: "The Nile." 
and +"nile" must score as equal. Keeping it in one place means every scorer shares +the exact same notion of equality. +""" +from __future__ import annotations + +import re +import string + +_ARTICLES = re.compile(r"\b(a|an|the)\b", re.IGNORECASE) +_PUNCT_TABLE = str.maketrans("", "", string.punctuation) +_WHITESPACE = re.compile(r"\s+") + + +def normalize_answer(text: str) -> str: + """Lowercase, strip punctuation and articles, collapse whitespace. + + Mirrors the SQuAD v1.1 official normalization so token-overlap F1 and + exact-match scores match published methodology. + """ + if not text: + return "" + lowered = text.lower() + no_punct = lowered.translate(_PUNCT_TABLE) + no_articles = _ARTICLES.sub(" ", no_punct) + return _WHITESPACE.sub(" ", no_articles).strip() + + +def tokenize(text: str) -> list[str]: + """Normalized whitespace tokens. Empty string yields an empty list.""" + normalized = normalize_answer(text) + return normalized.split() if normalized else [] diff --git a/src/evalops_workbench/eval/report.py b/src/evalops_workbench/eval/report.py new file mode 100644 index 0000000..cb085e7 --- /dev/null +++ b/src/evalops_workbench/eval/report.py @@ -0,0 +1,66 @@ +"""Render a downloadable markdown report from a benchmark artifact.""" +from __future__ import annotations + + +def render(artifact: dict) -> str: + metrics = artifact["metrics"] + lines = [ + f"# EvalOps benchmark: {artifact['fixture']}", + "", + f"- Run: `{artifact['run_id']}`", + f"- Generated: {artifact['generated_at']}", + f"- Cases: {metrics['n_cases']}", + f"- Gate verdict: **{metrics['gate_verdict'].upper()}**", + "", + "## Variants", + "", + "| Variant | Exact match | Token F1 | Contains gold |", + "| --- | --- | --- | --- |", + ] + for variant in artifact["variants"]: + suffix = "" + if variant["name"] == metrics["candidate_variant"]: + suffix = " (candidate)" + elif variant["name"] == metrics["baseline_variant"]: + suffix = " (baseline)" + lines.append( + f"| {variant['name']}{suffix} | 
{variant['exact_match']:.3f} | " + f"{variant['token_f1']:.3f} | {variant['contains_gold']:.3f} |" + ) + + lines += [ + "", + f"Candidate against baseline: exact match {metrics['improvement_exact_match']:+.3f}, " + f"token F1 {metrics['improvement_token_f1']:+.3f}.", + "", + f"## Regressions ({metrics['regressions']})", + "", + ] + if artifact["regressions"]: + lines += [ + "| Case | Baseline | Candidate | Delta | Reason |", + "| --- | --- | --- | --- | --- |", + ] + for reg in artifact["regressions"]: + lines.append( + f"| {reg['case_id']} | {reg['baseline']:.2f} | {reg['candidate']:.2f} | " + f"{reg['delta']:+.2f} | {reg['reason']} |" + ) + else: + lines.append("None. The candidate scored at or above the baseline on every case.") + + lines += [ + "", + "## Reproduce", + "", + "```bash", + "git clone https://github.com/IgnazioDS/evalops-workbench", + "cd evalops-workbench && pip install -e .", + "python -m evalops_workbench.benchmark_runner", + "```", + "", + f"Fixture: [`{artifact['fixture']}`]({artifact['artifact_urls']['fixture']}). " + "Every case, label, and score is reproducible offline with no credentials.", + "", + ] + return "\n".join(lines) diff --git a/src/evalops_workbench/eval/runner.py b/src/evalops_workbench/eval/runner.py new file mode 100644 index 0000000..c7ed344 --- /dev/null +++ b/src/evalops_workbench/eval/runner.py @@ -0,0 +1,95 @@ +"""Run a target over the dataset, aggregate scores, and diff two runs. + +``run_target`` produces an immutable ``RunResult`` (per-case scores + aggregate +means). ``compare`` diffs a candidate against a baseline run and returns the +per-case regressions — the cases where the candidate scored strictly worse. 
+""" +from __future__ import annotations + +from collections.abc import Sequence +from dataclasses import dataclass + +from .cases import Case +from .scorers import SCORERS, score_prediction +from .targets import Target + +_REGRESSION_EPSILON = 1e-9 + + +@dataclass(frozen=True) +class CaseScore: + case_id: str + prediction: str + scores: dict[str, float] + tags: tuple[str, ...] + + +@dataclass(frozen=True) +class RunResult: + target: str + n_cases: int + aggregate: dict[str, float] + case_scores: tuple[CaseScore, ...] + + +def run_target(name: str, target: Target, cases: Sequence[Case]) -> RunResult: + """Score ``target`` against every case and roll up mean aggregates.""" + totals = {metric: 0.0 for metric in SCORERS} + case_scores: list[CaseScore] = [] + for case in cases: + prediction = target(case) + scores = score_prediction(prediction, case.answers) + for metric, value in scores.items(): + totals[metric] += value + case_scores.append(CaseScore(case.id, prediction, scores, case.tags)) + + denominator = len(cases) or 1 + aggregate = {metric: round(total / denominator, 6) for metric, total in totals.items()} + return RunResult(name, len(cases), aggregate, tuple(case_scores)) + + +@dataclass(frozen=True) +class Regression: + case_id: str + metric: str + baseline: float + candidate: float + delta: float + tags: tuple[str, ...] 
+ + @property + def reason(self) -> str: + return ( + f"{self.metric}: candidate {self.candidate:.2f} below baseline " + f"{self.baseline:.2f} ({self.delta:+.2f})" + ) + + +def compare( + baseline: RunResult, + candidate: RunResult, + *, + metric: str = "token_f1", + threshold: float = _REGRESSION_EPSILON, +) -> list[Regression]: + """Per-case regressions: candidate strictly worse than baseline on ``metric``.""" + baseline_by_id = {cs.case_id: cs for cs in baseline.case_scores} + regressions: list[Regression] = [] + for candidate_score in candidate.case_scores: + baseline_score = baseline_by_id.get(candidate_score.case_id) + if baseline_score is None: + continue + before = baseline_score.scores.get(metric, 0.0) + after = candidate_score.scores.get(metric, 0.0) + if after < before - threshold: + regressions.append( + Regression( + case_id=candidate_score.case_id, + metric=metric, + baseline=round(before, 6), + candidate=round(after, 6), + delta=round(after - before, 6), + tags=candidate_score.tags, + ) + ) + return regressions diff --git a/src/evalops_workbench/eval/scorers.py b/src/evalops_workbench/eval/scorers.py new file mode 100644 index 0000000..033b7c3 --- /dev/null +++ b/src/evalops_workbench/eval/scorers.py @@ -0,0 +1,60 @@ +"""Rubric functions: exact match, token-overlap F1, and gold containment. + +Each scorer takes a single prediction string and the tuple of acceptable gold +answers, and returns a float in [0.0, 1.0]. A prediction scores against its +best-matching gold (the standard for multi-reference QA). 
+""" +from __future__ import annotations + +from collections import Counter +from collections.abc import Sequence + +from .normalize import normalize_answer, tokenize + + +def exact_match(prediction: str, golds: Sequence[str]) -> float: + """1.0 if the normalized prediction equals any normalized gold, else 0.0.""" + normalized_pred = normalize_answer(prediction) + return 1.0 if any(normalized_pred == normalize_answer(g) for g in golds) else 0.0 + + +def _pairwise_f1(prediction: str, gold: str) -> float: + pred_tokens = tokenize(prediction) + gold_tokens = tokenize(gold) + if not pred_tokens or not gold_tokens: + # If both are empty they match; if exactly one is empty they do not. + return 1.0 if pred_tokens == gold_tokens else 0.0 + shared = Counter(pred_tokens) & Counter(gold_tokens) + num_shared = sum(shared.values()) + if num_shared == 0: + return 0.0 + precision = num_shared / len(pred_tokens) + recall = num_shared / len(gold_tokens) + return 2 * precision * recall / (precision + recall) + + +def token_f1(prediction: str, golds: Sequence[str]) -> float: + """Best token-overlap F1 across all gold answers.""" + return max((_pairwise_f1(prediction, g) for g in golds), default=0.0) + + +def contains_gold(prediction: str, golds: Sequence[str]) -> float: + """1.0 if the normalized prediction contains a (non-empty) normalized gold.""" + normalized_pred = normalize_answer(prediction) + for gold in golds: + normalized_gold = normalize_answer(gold) + if normalized_gold and normalized_gold in normalized_pred: + return 1.0 + return 0.0 + + +SCORERS = { + "exact_match": exact_match, + "token_f1": token_f1, + "contains_gold": contains_gold, +} + + +def score_prediction(prediction: str, golds: Sequence[str]) -> dict[str, float]: + """Run every rubric function and return a name -> score mapping.""" + return {name: fn(prediction, golds) for name, fn in SCORERS.items()} diff --git a/src/evalops_workbench/eval/targets.py b/src/evalops_workbench/eval/targets.py new file mode 
100644 index 0000000..cc2e4ce --- /dev/null +++ b/src/evalops_workbench/eval/targets.py @@ -0,0 +1,147 @@ +"""The system-under-test: named, deterministic extractive-QA strategies. + +These are the "variants" an EvalOps user would compare, made concrete and +credential-free so the public benchmark reproduces anywhere. Each target maps a +``Case`` to a predicted answer string. They differ in real, explainable ways, so +the harness surfaces a genuine quality delta (and genuine per-case regressions), +not a manufactured one. + +A live-LLM target would implement the same ``Target`` signature; it is an +optional extension, never required by the public benchmark. +""" +from __future__ import annotations + +import re +from collections.abc import Callable + +from .cases import Case +from .normalize import normalize_answer + +Target = Callable[[Case], str] + +_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+") +_YEAR = re.compile(r"\b(?:1[0-9]{3}|2[0-9]{3})\b") +_NUMBER = re.compile(r"\b\d[\d,]*(?:\.\d+)?\b") +_PROPER_NOUN = re.compile(r"\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b") + +_STOPWORDS = frozenset( + """ + a an the of to in on at for and or but is are was were be been being am + this that these those with as by from into over under after before it its + their his her him she he they them we you your our + what when where who whom which why how whose does did do done has have had + """.split() +) + + +def split_sentences(text: str) -> list[str]: + """Split a passage into trimmed, non-empty sentences.""" + return [part.strip() for part in _SENTENCE_SPLIT.split(text.strip()) if part.strip()] + + +def _content_tokens(text: str) -> set[str]: + return {tok for tok in normalize_answer(text).split() if tok not in _STOPWORDS} + + +def _best_sentence(case: Case) -> str: + """The sentence whose content words most overlap the question. + + Ties break toward the shorter sentence (more answer-dense). This is the + shared retrieval step both the baseline and candidate build on. 
+ """ + sentences = split_sentences(case.context) + if not sentences: + return case.context.strip() + question_tokens = _content_tokens(case.question) + if not question_tokens: + return sentences[0] + + def rank(sentence: str) -> tuple[int, int]: + overlap = len(question_tokens & _content_tokens(sentence)) + return (overlap, -len(_content_tokens(sentence))) + + return max(sentences, key=rank) + + +def _question_word(question: str) -> str: + tokens = normalize_answer(question).split() + return tokens[0] if tokens else "" + + +def _first_novel_proper_noun(sentence: str, question: str) -> str | None: + """First proper-noun phrase that the question does not already name. + + Skipping the subject the question is about (and sentence-initial articles + like "The") is what makes the candidate competent on most factoids. Picking + the *first* remaining entity is also what makes it fail when a distractor + entity precedes the answer. + """ + question_tokens = _content_tokens(question) + for match in _PROPER_NOUN.finditer(sentence): + phrase_tokens = _content_tokens(match.group(0)) + if not phrase_tokens or phrase_tokens & question_tokens: + continue + return match.group(0) + return None + + +def first_sentence(case: Case) -> str: + """Floor strategy: always return the opening sentence.""" + sentences = split_sentences(case.context) + return sentences[0] if sentences else case.context.strip() + + +def overlap_sentence(case: Case) -> str: + """Baseline: return the whole best-overlap sentence. + + Safe but blunt — it scores partial F1 on most cases and rarely an exact + match, because it returns far more than the answer span. + """ + return _best_sentence(case) + + +def span_extract(case: Case) -> str: + """Candidate: locate the best sentence, then narrow to an answer span. + + Wins big on factoid questions (who/when/where/how-many) by returning the + span instead of the sentence. 
Can mis-fire on adversarial phrasing (grabs + the first proper noun or number when the answer is a later one), which is + exactly the silent regression the harness exists to catch. + """ + sentence = _best_sentence(case) + qword = _question_word(case.question) + lowered = case.question.lower() + + if qword == "when" or "what year" in lowered or "which year" in lowered: + match = _YEAR.search(sentence) or _NUMBER.search(sentence) + if match: + return match.group(0) + if qword == "how" and any( + phrase in lowered for phrase in ("how many", "how much", "how long", "how old") + ): + match = _NUMBER.search(sentence) + if match: + return match.group(0) + if qword in {"who", "whom", "where"}: + span = _first_novel_proper_noun(sentence, case.question) + if span: + return span + + # No confident span: fall back to the best sentence (never worse than baseline here). + return sentence + + +REGISTRY: dict[str, Target] = { + "first_sentence": first_sentence, + "overlap_sentence": overlap_sentence, + "span_extract": span_extract, +} + + +def get_target(name: str) -> Target: + try: + return REGISTRY[name] + except KeyError: + raise ValueError( + f"unknown target {name!r}; known targets: {sorted(REGISTRY)}" + ) from None diff --git a/src/evalops_workbench/project.json b/src/evalops_workbench/project.json index 3a32958..3265713 100644 --- a/src/evalops_workbench/project.json +++ b/src/evalops_workbench/project.json @@ -3,15 +3,14 @@ "name": "EvalOps Workbench", "category": "Developer Tool", "track": "LLM", - "stage": "Researching", + "stage": "Prototype", "summary": "A local-first evaluation harness for prompts, tools, and agents with regression tracking and experiment history.", "problem": "LLM teams lack a lightweight way to compare prompt and tool changes before shipping.", "users": "Agent builders, prompt engineers, applied AI teams", "stack": [ "Python", - "Typer", - "DuckDB", - "OpenTelemetry" + "GitHub Actions", + "Vercel" ], "why_now": "Evaluation is moving from optional 
best practice to baseline engineering hygiene.", "mvp": [ diff --git a/src/lib/api.ts b/src/lib/api.ts index aa5474c..1e9faa0 100644 --- a/src/lib/api.ts +++ b/src/lib/api.ts @@ -1,6 +1,7 @@ -// Slim API surface for the showcase dashboard. -// Only the public /api/stats endpoint is real on showcase deploys; the -// Tier-A BFF endpoints (run, documents, ui/*) don't exist here. +// Slim API surface for the dashboard. +// Two public, unauthenticated endpoints are real on this deploy: +// /api/stats — Tier-A telemetry (TELEMETRY_SCHEMA.md) +// /api/benchmark-latest — the latest published benchmark run async function publicFetch(path: string, init?: RequestInit): Promise { const res = await fetch(path, { @@ -13,19 +14,21 @@ async function publicFetch(path: string, init?: RequestInit): Promise { return res.json() as Promise; } -/** Tier-B telemetry response — see TELEMETRY_SCHEMA.md. */ +/** Tier-A telemetry response — see TELEMETRY_SCHEMA.md. */ export interface PublicStats { system: string; mode?: "live" | "showcase"; status: "operational" | "degraded" | "down"; last_deployed_at: string | null; + last_active_at?: string | null; last_commit_at?: string | null; metrics: { - commits_30d?: number; - commits_total?: number; - primary_language?: string; - repo_stars?: number; - lines_of_code?: number; + eval_runs_total?: number; + eval_runs_24h?: number; + last_pass_rate?: number; + rolling_pass_rate_7d?: number; + regressions_caught_30d?: number; + experiments_tracked?: number; [key: string]: number | string | undefined; }; schema_version: number; @@ -35,3 +38,67 @@ export interface PublicStats { export function fetchPublicStats(): Promise { return publicFetch("/api/stats"); } + +/** One strategy's aggregate scores in a benchmark run. */ +export interface BenchmarkVariant { + name: string; + exact_match: number; + token_f1: number; + contains_gold: number; +} + +/** A single per-case regression surfaced by the run. 
*/ +export interface BenchmarkRegression { + case_id: string; + metric: string; + baseline: number; + candidate: number; + delta: number; + reason: string; + tags: string[]; +} + +export interface BenchmarkMetrics { + n_cases: number; + baseline_variant: string; + candidate_variant: string; + exact_match: number; + token_f1: number; + contains_gold: number; + improvement_exact_match: number; + improvement_token_f1: number; + regressions: number; + pass_rate: number; + gate_verdict: string; +} + +/** /api/benchmark-latest response — see the benchmark-latest spec in TELEMETRY_SCHEMA.md. */ +export interface PublicBenchmark { + system: string; + benchmark_type: string; + status?: string; + run_id: string | null; + fixture?: string; + metrics: BenchmarkMetrics | null; + variants?: BenchmarkVariant[]; + regressions?: BenchmarkRegression[]; + gate?: { + passed: boolean; + metric: string; + pinned: number | null; + observed: number; + reasons: string[]; + }; + artifact_urls?: { report: string; fixture: string; run: string }; + schema_version: number; + generated_at: string; + previous_run?: { + run_id: string | null; + generated_at: string | null; + delta: Record; + } | null; +} + +export function fetchBenchmarkLatest(): Promise { + return publicFetch("/api/benchmark-latest"); +} diff --git a/src/lib/project.ts b/src/lib/project.ts index 5a59dab..954d7b2 100644 --- a/src/lib/project.ts +++ b/src/lib/project.ts @@ -26,13 +26,13 @@ export const PROJECT: ProjectSpec = { name: "EvalOps Workbench", category: "Developer Tool", track: "LLM", - stage: "Researching", + stage: "Prototype", summary: "A local-first evaluation harness for prompts, tools, and agents with regression tracking and experiment history.", problem: "LLM teams lack a lightweight way to compare prompt and tool changes before shipping.", users: "Agent builders, prompt engineers, applied AI teams", - stack: ["Python", "Typer", "DuckDB", "OpenTelemetry"], + stack: ["Python", "GitHub Actions", "Vercel"], why_now: 
"Evaluation is moving from optional best practice to baseline engineering hygiene.", mvp: [ diff --git a/tests/test_benchmark_endpoint.py b/tests/test_benchmark_endpoint.py new file mode 100644 index 0000000..c24df67 --- /dev/null +++ b/tests/test_benchmark_endpoint.py @@ -0,0 +1,75 @@ +"""Unit tests for the /api/benchmark-latest serverless function. + +The module file name contains a hyphen (matching the Vercel route), so it is +loaded by path rather than imported by name. +""" +from __future__ import annotations + +import importlib.util +import json +import tempfile +import unittest +from pathlib import Path + +_API_DIR = Path(__file__).resolve().parent.parent / "api" + + +def _load_endpoint(): + spec = importlib.util.spec_from_file_location( + "benchmark_latest_endpoint", _API_DIR / "benchmark-latest.py" + ) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +class BenchmarkEndpointTests(unittest.TestCase): + def setUp(self) -> None: + self.mod = _load_endpoint() + self._orig_artifact = self.mod.ARTIFACT_FILE + + def tearDown(self) -> None: + self.mod.ARTIFACT_FILE = self._orig_artifact + + def test_returns_committed_artifact(self) -> None: + artifact = { + "system": "evalops", + "benchmark_type": "eval", + "run_id": "evalops-2026-05-26-abc12345", + "metrics": {"token_f1": 0.65}, + "schema_version": 1, + "generated_at": "2026-05-26T00:00:00Z", + } + path = Path(tempfile.mkdtemp()) / "_benchmark_latest.json" + path.write_text(json.dumps(artifact), encoding="utf-8") + self.mod.ARTIFACT_FILE = path + + response = self.mod.build_response() + self.assertEqual(response["system"], "evalops") + self.assertEqual(response["run_id"], "evalops-2026-05-26-abc12345") + self.assertEqual(response["schema_version"], 1) + + def test_pending_when_artifact_missing(self) -> None: + self.mod.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json") + response = self.mod.build_response() + 
self.assertEqual(response["system"], "evalops") + self.assertEqual(response["status"], "pending") + self.assertEqual(response["benchmark_type"], "eval") + self.assertIsNone(response["run_id"]) + self.assertEqual(response["schema_version"], 1) + + def test_seeded_repo_artifact_is_valid(self) -> None: + """The committed artifact in the repo must be schema-valid.""" + response = self.mod.build_response() + # In the repo the seed exists; if a developer cleared it, accept pending. + if response.get("status") == "pending": + self.skipTest("no seeded artifact present") + self.assertEqual(response["system"], "evalops") + self.assertEqual(response["schema_version"], 1) + self.assertIn("metrics", response) + self.assertIn("generated_at", response) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_eval_engine.py b/tests/test_eval_engine.py new file mode 100644 index 0000000..a09c848 --- /dev/null +++ b/tests/test_eval_engine.py @@ -0,0 +1,272 @@ +"""Unit tests for the EvalOps evaluation engine.""" +from __future__ import annotations + +import json +import tempfile +import unittest +from pathlib import Path + +from evalops_workbench.eval import artifact as artifact_mod +from evalops_workbench.eval import baseline as baseline_mod +from evalops_workbench.eval import ledger as ledger_mod +from evalops_workbench.eval.cases import Case, load_cases +from evalops_workbench.eval.normalize import normalize_answer, tokenize +from evalops_workbench.eval.runner import compare, run_target +from evalops_workbench.eval.scorers import ( + contains_gold, + exact_match, + score_prediction, + token_f1, +) +from evalops_workbench.eval.targets import ( + first_sentence, + get_target, + overlap_sentence, + span_extract, + split_sentences, +) + + +class NormalizeTests(unittest.TestCase): + def test_strips_articles_punctuation_and_case(self) -> None: + self.assertEqual(normalize_answer("The Nile."), "nile") + self.assertEqual(normalize_answer(" A House! 
"), "house") + + def test_empty(self) -> None: + self.assertEqual(normalize_answer(""), "") + self.assertEqual(tokenize("the a an"), []) + + def test_tokenize(self) -> None: + self.assertEqual(tokenize("Hello, World!"), ["hello", "world"]) + + +class ScorerTests(unittest.TestCase): + def test_exact_match_normalizes(self) -> None: + self.assertEqual(exact_match("Paris", ["paris"]), 1.0) + self.assertEqual(exact_match("Paris, France", ["Paris"]), 0.0) + + def test_token_f1_bounds(self) -> None: + self.assertEqual(token_f1("Paris", ["Paris"]), 1.0) + self.assertEqual(token_f1("London", ["Paris"]), 0.0) + partial = token_f1("the answer is Paris", ["Paris"]) + self.assertTrue(0.0 < partial < 1.0) + + def test_contains_gold(self) -> None: + self.assertEqual(contains_gold("the capital is Paris", ["Paris"]), 1.0) + self.assertEqual(contains_gold("London", ["Paris"]), 0.0) + + def test_score_prediction_returns_all(self) -> None: + scores = score_prediction("Paris", ["Paris"]) + self.assertEqual(set(scores), {"exact_match", "token_f1", "contains_gold"}) + + +class CaseLoaderTests(unittest.TestCase): + def _write(self, name: str, text: str) -> Path: + tmp = Path(tempfile.mkdtemp()) / name + tmp.write_text(text, encoding="utf-8") + return tmp + + def test_load_jsonl_and_coerce_string_answer(self) -> None: + path = self._write( + "c.jsonl", + '{"id":"1","question":"q","context":"c","answers":"a"}\n', + ) + cases = load_cases(path) + self.assertEqual(len(cases), 1) + self.assertEqual(cases[0].answers, ("a",)) + + def test_missing_field_raises(self) -> None: + path = self._write("c.jsonl", '{"id":"1","question":"q","context":"c"}\n') + with self.assertRaises(ValueError): + load_cases(path) + + def test_duplicate_ids_raise(self) -> None: + path = self._write( + "c.jsonl", + '{"id":"1","question":"q","context":"c","answers":"a"}\n' + '{"id":"1","question":"q","context":"c","answers":"b"}\n', + ) + with self.assertRaises(ValueError): + load_cases(path) + + def 
test_unsupported_extension_raises(self) -> None: + path = self._write("c.txt", "nope") + with self.assertRaises(ValueError): + load_cases(path) + + def test_missing_file_raises(self) -> None: + with self.assertRaises(FileNotFoundError): + load_cases(Path(tempfile.mkdtemp()) / "absent.jsonl") + + +class TargetTests(unittest.TestCase): + def test_split_sentences(self) -> None: + self.assertEqual(split_sentences("One. Two! Three?"), ["One.", "Two!", "Three?"]) + + def test_candidate_extracts_entity(self) -> None: + case = Case( + "t", + "Where is the Eiffel Tower located?", + "The Eiffel Tower is located in Paris. It opened in 1889.", + ("Paris",), + ) + self.assertEqual(span_extract(case), "Paris") + self.assertEqual(overlap_sentence(case), "The Eiffel Tower is located in Paris.") + self.assertEqual(first_sentence(case), "The Eiffel Tower is located in Paris.") + + def test_candidate_extracts_year_and_number(self) -> None: + when = Case("w", "When was the telephone patented?", "The telephone was patented in 1876.", ("1876",)) + self.assertEqual(span_extract(when), "1876") + howmany = Case("h", "How many continents are there on Earth?", "There are 7 continents on Earth.", ("7",)) + self.assertEqual(span_extract(howmany), "7") + + def test_candidate_regresses_on_distractor(self) -> None: + case = Case( + "adv", + "Who composed the Ninth Symphony?", + "The Ninth Symphony was first performed in Vienna and is attributed to Beethoven.", + ("Beethoven",), + ) + # The candidate grabs the first novel entity (the distractor), not the answer. 
+ self.assertEqual(span_extract(case), "Vienna") + + def test_get_target_unknown_raises(self) -> None: + with self.assertRaises(ValueError): + get_target("nope") + + +class RunnerTests(unittest.TestCase): + def setUp(self) -> None: + self.cases = [ + Case("good", "Where is the Eiffel Tower located?", + "The Eiffel Tower is located in Paris.", ("Paris",)), + Case("adv", "Who composed the Ninth Symphony?", + "The Ninth Symphony premiered in Vienna and is attributed to Beethoven.", ("Beethoven",)), + ] + + def test_run_target_aggregate(self) -> None: + result = run_target("span_extract", get_target("span_extract"), self.cases) + self.assertEqual(result.n_cases, 2) + self.assertIn("token_f1", result.aggregate) + self.assertEqual(len(result.case_scores), 2) + + def test_compare_finds_only_real_regressions(self) -> None: + baseline = run_target("overlap_sentence", get_target("overlap_sentence"), self.cases) + candidate = run_target("span_extract", get_target("span_extract"), self.cases) + regressions = compare(baseline, candidate, metric="token_f1") + regressed = {r.case_id for r in regressions} + self.assertIn("adv", regressed) + self.assertNotIn("good", regressed) + + +class BaselineGateTests(unittest.TestCase): + def _result(self, f1: float): + case = Case("x", "q", "c", ("a",)) + result = run_target("first_sentence", lambda _c: "a", [case]) + # Override aggregate for a controlled gate test. 
+ object.__setattr__(result, "aggregate", {"token_f1": f1, "exact_match": f1, "contains_gold": f1}) + return result + + def test_no_pin_establishes_contract(self) -> None: + verdict = baseline_mod.evaluate_gate(self._result(0.5), None) + self.assertTrue(verdict.passed) + self.assertIsNone(verdict.pinned) + + def test_above_floor_passes_below_fails(self) -> None: + pinned = {"aggregate": {"token_f1": 0.60}} + self.assertTrue(baseline_mod.evaluate_gate(self._result(0.59), pinned).passed) + self.assertFalse(baseline_mod.evaluate_gate(self._result(0.50), pinned).passed) + + def test_pin_roundtrip(self) -> None: + path = Path(tempfile.mkdtemp()) / "pin.json" + result = self._result(0.7) + baseline_mod.save_pinned(path, result) + loaded = baseline_mod.load_pinned(path) + self.assertEqual(loaded["aggregate"]["token_f1"], 0.7) + + +class LedgerTests(unittest.TestCase): + def test_append_trim_previous(self) -> None: + path = Path(tempfile.mkdtemp()) / "hist.json" + self.assertIsNone(ledger_mod.previous_record(path)) + ledger_mod.append_record(path, {"run_id": "a", "generated_at": "t1"}) + ledger_mod.append_record(path, {"run_id": "b", "generated_at": "t2"}, keep=5) + self.assertEqual(ledger_mod.previous_record(path)["run_id"], "b") + self.assertEqual(len(ledger_mod.read_records(path)), 2) + + def test_trim_to_keep(self) -> None: + path = Path(tempfile.mkdtemp()) / "hist.json" + for i in range(10): + ledger_mod.append_record(path, {"run_id": str(i)}, keep=3) + records = ledger_mod.read_records(path) + self.assertEqual(len(records), 3) + self.assertEqual(records[-1]["run_id"], "9") + + +class ArtifactTests(unittest.TestCase): + def _results(self): + cases = [ + Case("good", "Where is the Eiffel Tower located?", + "The Eiffel Tower is located in Paris.", ("Paris",)), + Case("adv", "Who composed the Ninth Symphony?", + "The Ninth Symphony premiered in Vienna and is attributed to Beethoven.", ("Beethoven",)), + ] + return {name: run_target(name, get_target(name), cases) + for 
name in ("first_sentence", "overlap_sentence", "span_extract")} + + def test_build_artifact_shape(self) -> None: + results = self._results() + regressions = compare(results["overlap_sentence"], results["span_extract"]) + verdict = baseline_mod.evaluate_gate(results["span_extract"], None) + art = artifact_mod.build_artifact( + fixture_id="benchmark-v1", + fixture_rel="examples/benchmark-v1/cases.jsonl", + results=results, + baseline_name="overlap_sentence", + candidate_name="span_extract", + regressions=regressions, + verdict=verdict, + previous=None, + generated_at="2026-05-26T00:00:00Z", + ) + self.assertEqual(art["system"], "evalops") + self.assertEqual(art["benchmark_type"], "eval") + self.assertEqual(art["schema_version"], 1) + self.assertTrue(art["run_id"].startswith("evalops-2026-05-26-")) + self.assertIsNone(art["previous_run"]) + self.assertEqual(len(art["variants"]), 3) + self.assertIn("pass_rate", art["metrics"]) + + def test_previous_run_delta(self) -> None: + results = self._results() + verdict = baseline_mod.evaluate_gate(results["span_extract"], None) + previous = {"run_id": "prev", "generated_at": "2026-05-25T00:00:00Z", + "token_f1": 0.10, "exact_match": 0.10, "regressions": 1} + art = artifact_mod.build_artifact( + fixture_id="benchmark-v1", fixture_rel="x", results=results, + baseline_name="overlap_sentence", candidate_name="span_extract", + regressions=[], verdict=verdict, previous=previous, + generated_at="2026-05-26T00:00:00Z", + ) + self.assertEqual(art["previous_run"]["run_id"], "prev") + self.assertIn("token_f1", art["previous_run"]["delta"]) + + def test_slim_record_keys(self) -> None: + results = self._results() + verdict = baseline_mod.evaluate_gate(results["span_extract"], None) + art = artifact_mod.build_artifact( + fixture_id="benchmark-v1", fixture_rel="x", results=results, + baseline_name="overlap_sentence", candidate_name="span_extract", + regressions=[], verdict=verdict, previous=None, + generated_at="2026-05-26T00:00:00Z", + ) + 
record = artifact_mod.slim_record(art) + self.assertEqual( + set(record), + {"run_id", "generated_at", "pass_rate", "token_f1", "exact_match", + "regressions", "regressed_ids", "gate_verdict", "variants"}, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_stats.py b/tests/test_stats.py index 32c876a..df001fb 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -1,156 +1,97 @@ -"""Unit tests for the /api/stats Vercel serverless function. +"""Unit tests for the /api/stats Vercel serverless function (Tier A, live). Covers: -- happy path: GitHub reachable, response shape matches Tier B contract -- degraded path: GitHub unreachable, contract still satisfied with status="degraded" -- safety caps: oversize values are clamped -- never returns 5xx (handler always emits HTTP 200) +- live path: benchmark history present, metrics derived from records +- degraded path: no history yet, contract satisfied with zeroed metrics +- schema shape matches the evalops Tier-A contract in TELEMETRY_SCHEMA.md +- safety caps and the never-5xx handler guarantee """ from __future__ import annotations import io import json import sys +import tempfile import unittest from pathlib import Path -from unittest.mock import MagicMock, patch -from urllib.error import URLError +from unittest.mock import MagicMock -# Add repo root to sys.path so we can import the api/stats.py module. +# Add repo root /api to sys.path so we can import the api/stats.py module. 
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "api")) import stats # type: ignore # noqa: E402 +_TIER_A_METRICS = { + "eval_runs_total", + "eval_runs_24h", + "last_pass_rate", + "rolling_pass_rate_7d", + "regressions_caught_30d", + "experiments_tracked", +} -def _reset_cache() -> None: - stats._cache = {"ts": 0.0, "payload": None} +def _write_history(records: list[dict]) -> Path: + path = Path(tempfile.mkdtemp()) / "_benchmark_history.json" + path.write_text(json.dumps(records), encoding="utf-8") + return path -def _fake_response(body: object, link_header: str = "") -> MagicMock: - """Build a context-manager-compatible mock that mimics urlopen's return.""" - raw = json.dumps(body).encode("utf-8") - cm = MagicMock() - cm.__enter__ = MagicMock(return_value=cm) - cm.__exit__ = MagicMock(return_value=False) - cm.read = MagicMock(return_value=raw) - cm.getheaders = MagicMock( - return_value=[("Link", link_header)] if link_header else [] - ) - return cm - -class ResponseShapeTests(unittest.TestCase): +class LiveResponseTests(unittest.TestCase): def setUp(self) -> None: - _reset_cache() - - def test_happy_path_matches_contract(self) -> None: - repo_payload = {"stargazers_count": 7, "language": "Python"} - commit_payload = [ - {"commit": {"author": {"date": "2026-04-26T12:00:00Z"}}} - ] - - def side_effect(req, timeout=None): - url = req.full_url - if "/commits" not in url: - return _fake_response(repo_payload) - return _fake_response( - commit_payload, - link_header=( - f"; rel=\"next\", " - f"; rel=\"last\"" - ), - ) - - with patch.object(stats, "urlopen", side_effect=side_effect): - response = stats._build_response() - - self.assertEqual(response["schema_version"], 1) - self.assertEqual(response["mode"], "showcase") + self._orig_history = stats.HISTORY_FILE + self._orig_artifact = stats.ARTIFACT_FILE + + def tearDown(self) -> None: + stats.HISTORY_FILE = self._orig_history + stats.ARTIFACT_FILE = self._orig_artifact + + def 
test_live_operational_from_history(self) -> None: + stats.HISTORY_FILE = _write_history( + [ + {"run_id": "r1", "generated_at": stats._now_iso(), "pass_rate": 0.6, + "regressions": 2, "regressed_ids": ["a", "b"], + "variants": ["overlap_sentence", "span_extract"]}, + {"run_id": "r2", "generated_at": stats._now_iso(), "pass_rate": 0.7, + "regressions": 2, "regressed_ids": ["b", "c"], + "variants": ["span_extract", "first_sentence"]}, + ] + ) + stats.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json") + response = stats._build_response() + + self.assertEqual(response["mode"], "live") self.assertEqual(response["status"], "operational") - self.assertEqual(response["system"], stats.SYSTEM_SLUG) - self.assertIn("metrics", response) - self.assertEqual(response["metrics"]["repo_stars"], 7) - self.assertEqual(response["metrics"]["primary_language"], "Python") - self.assertEqual(response["metrics"]["commits_total"], 42) - self.assertEqual(response["last_commit_at"], "2026-04-26T12:00:00Z") - # generated_at is ISO-8601 with Z suffix. + self.assertEqual(response["schema_version"], 1) + self.assertEqual(set(response["metrics"]), _TIER_A_METRICS) + self.assertEqual(response["metrics"]["eval_runs_total"], 2) + self.assertEqual(response["metrics"]["last_pass_rate"], 0.7) + # Distinct regressions across the window: union of {a,b} and {b,c} = 3. 
+ self.assertEqual(response["metrics"]["regressions_caught_30d"], 3) + self.assertEqual(response["metrics"]["experiments_tracked"], 3) self.assertTrue(response["generated_at"].endswith("Z")) - def test_degraded_when_github_unreachable(self) -> None: - with patch.object(stats, "urlopen", side_effect=URLError("offline")): - response = stats._build_response() + def test_degraded_without_history(self) -> None: + stats.HISTORY_FILE = Path("/nonexistent/_benchmark_history.json") + stats.ARTIFACT_FILE = Path("/nonexistent/_benchmark_latest.json") + response = stats._build_response() - self.assertEqual(response["schema_version"], 1) - self.assertEqual(response["mode"], "showcase") + self.assertEqual(response["mode"], "live") self.assertEqual(response["status"], "degraded") - self.assertEqual(response["metrics"]["commits_total"], 0) - self.assertEqual(response["metrics"]["repo_stars"], 0) - self.assertIsNone(response["last_commit_at"]) - - def test_serves_stale_cache_on_subsequent_failure(self) -> None: - # First call: successful. Second call: GitHub is down. Expect status - # to flip to "degraded" but the metric values from the cache are kept. - repo_payload = {"stargazers_count": 11, "language": "Go"} - commit_payload = [ - {"commit": {"author": {"date": "2026-04-25T08:00:00Z"}}} - ] - - def good(req, timeout=None): - if "/commits" not in req.full_url: - return _fake_response(repo_payload) - return _fake_response( - commit_payload, - link_header=( - '; rel="next", ' - '; rel="last"' - ), - ) - - with patch.object(stats, "urlopen", side_effect=good): - first = stats._build_response() - self.assertEqual(first["status"], "operational") - - with patch.object(stats, "_fetch_metrics", side_effect=URLError("offline")): - # Force cache miss by advancing the clock past the TTL. 
- stats._cache["ts"] = 0.0 - stale = stats._build_response() - self.assertEqual(stale["status"], "degraded") - self.assertEqual(stale["metrics"]["repo_stars"], 11) - self.assertEqual(stale["metrics"]["commits_total"], 99) + self.assertEqual(response["metrics"], stats._zeroed_metrics()) + self.assertIsNone(response["last_active_at"]) class SafetyCapTests(unittest.TestCase): - def test_oversize_values_are_clamped(self) -> None: - self.assertEqual(stats._cap("repo_stars", 99_999_999), 1_000_000) - self.assertEqual(stats._cap("commits_total", 50_000_000), 1_000_000) - self.assertEqual(stats._cap("commits_30d", 500_000), 100_000) - self.assertEqual(stats._cap("lines_of_code", 999_999_999), 10_000_000) - # Unknown key passes through unchanged. + def test_caps_clamp(self) -> None: + self.assertEqual(stats._cap("eval_runs_total", 9_999_999), 1_000_000) + self.assertEqual(stats._cap("experiments_tracked", 999_999), 100_000) self.assertEqual(stats._cap("not_a_field", 42), 42) class HandlerTests(unittest.TestCase): - """Exercise the BaseHTTPRequestHandler entrypoint end-to-end.""" - - def setUp(self) -> None: - _reset_cache() - def _invoke(self, method: str = "GET") -> tuple[int, dict[str, str], bytes]: - # Build a minimal raw HTTP request the handler can parse. - request_text = ( - f"{method} /api/stats HTTP/1.0\r\nHost: x\r\n\r\n" - ).encode("utf-8") - rfile = io.BytesIO(request_text) + rfile = io.BytesIO(f"{method} /api/stats HTTP/1.0\r\nHost: x\r\n\r\n".encode()) wfile = io.BytesIO() - - class _Conn: - def makefile(self, *_args: object, **_kwargs: object) -> io.BytesIO: - return rfile - - # BaseHTTPRequestHandler init runs the request automatically. 
h = stats.handler.__new__(stats.handler) h.rfile = rfile h.wfile = wfile @@ -161,39 +102,34 @@ def makefile(self, *_args: object, **_kwargs: object) -> io.BytesIO: h.request_version = "HTTP/1.0" h.headers = {} h.requestline = f"{method} /api/stats HTTP/1.0" - if method == "OPTIONS": h.do_OPTIONS() else: - with patch.object(stats, "urlopen", side_effect=URLError("test")): - h.do_GET() - + h.do_GET() raw = wfile.getvalue().decode("utf-8", errors="replace") head, _, body = raw.partition("\r\n\r\n") - status_line = head.split("\r\n", 1)[0] - status_code = int(status_line.split(" ", 2)[1]) - hdrs = {} + status_code = int(head.split("\r\n", 1)[0].split(" ", 2)[1]) + headers = {} for line in head.split("\r\n")[1:]: if ": " in line: - k, v = line.split(": ", 1) - hdrs[k] = v - return status_code, hdrs, body.encode("utf-8") + key, value = line.split(": ", 1) + headers[key] = value + return status_code, headers, body.encode("utf-8") - def test_get_returns_200_even_when_upstream_fails(self) -> None: - status, hdrs, body = self._invoke("GET") + def test_get_returns_200_with_valid_contract(self) -> None: + status, headers, body = self._invoke("GET") self.assertEqual(status, 200) - self.assertEqual(hdrs.get("Content-Type"), "application/json") - self.assertEqual(hdrs.get("Access-Control-Allow-Origin"), "*") - self.assertIn("max-age=30", hdrs.get("Cache-Control", "")) + self.assertEqual(headers.get("Content-Type"), "application/json") + self.assertEqual(headers.get("Access-Control-Allow-Origin"), "*") + self.assertIn("max-age=30", headers.get("Cache-Control", "")) payload = json.loads(body) self.assertEqual(payload["schema_version"], 1) - self.assertEqual(payload["status"], "degraded") + self.assertEqual(payload["mode"], "live") def test_options_returns_204(self) -> None: - status, hdrs, _ = self._invoke("OPTIONS") + status, headers, _ = self._invoke("OPTIONS") self.assertEqual(status, 204) - self.assertEqual(hdrs.get("Access-Control-Allow-Origin"), "*") - 
self.assertEqual(hdrs.get("Access-Control-Allow-Methods"), "GET, OPTIONS") + self.assertEqual(headers.get("Access-Control-Allow-Methods"), "GET, OPTIONS") if __name__ == "__main__":