From aaec5e9e60fdb10250d7f4c907285964f00f9ff1 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Wed, 17 Jun 2026 09:53:10 +0000 Subject: [PATCH 1/4] bench: report and bundle judge against the same baseline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The report flagged reds against the latest nightly value while the bundle's retry compared against the median shipped in expectations — so when they differed a red could appear that was never retried (breaking retry==red). Single-source the baseline in bench_common.reference_value (median of recent non-null) and use it for both the report's reference and the shipped expectation. Median also makes the reference robust to a noisy last nightly. --- .travis/bench-expectations.py | 7 +++---- .travis/bench-report.py | 9 ++++++--- .travis/bench_common.py | 12 +++++++++++- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/.travis/bench-expectations.py b/.travis/bench-expectations.py index 25724beff6..2989bf4303 100644 --- a/.travis/bench-expectations.py +++ b/.travis/bench-expectations.py @@ -13,7 +13,7 @@ retried. Keys are underscored, as in bench-data; the bundle normalizes '-' -> '_'. A device with no history yields an empty file (retry disabled there, single-shot). 
""" -import argparse, json, os, statistics +import argparse, json, os import bench_common as bc @@ -33,10 +33,9 @@ def main(): if os.path.exists(path): d = json.load(open(path)) for m, arr in d.get("metrics", {}).items(): - vals = [v for v in arr[-a.window:] if v is not None] - if not vals: + expected = bc.reference_value(arr, a.window) + if expected is None: continue - expected = statistics.median(vals) thr = bc.red_threshold(m, cfg, bc.series_noise(arr), expected) if thr is not None: lines.append(f"{m} {expected} {thr}") diff --git a/.travis/bench-report.py b/.travis/bench-report.py index 81745d8d50..966830cf8e 100644 --- a/.travis/bench-report.py +++ b/.travis/bench-report.py @@ -22,8 +22,9 @@ def reference(bench_data, triple, device): - """latest non-null value + recent noise (p90 day-to-day |Δ%|) per metric from - bench-data/<triple>/<device>.json, plus the reference date.""" + """reference value (median of recent non-null, == what bench-expectations ships) + + recent noise (p90 day-to-day |Δ%|) per metric from bench-data/<triple>/<device>.json, + plus the reference date (latest non-null day, for display).""" path = os.path.join(bench_data, triple, f"{device}.json") if not os.path.exists(path): return {}, {}, None @@ -31,10 +32,12 @@ def reference(bench_data, triple, device): start = datetime.date.fromisoformat(d["start_day"]) vals, noise, last_idx = {}, {}, -1 for m, arr in d["metrics"].items(): + ref = bc.reference_value(arr) + if ref is not None: + vals[m] = ref noise[m] = bc.series_noise(arr) for i in range(len(arr) - 1, -1, -1): if arr[i] is not None: - vals[m] = arr[i] last_idx = max(last_idx, i) break ref_day = start + datetime.timedelta(last_idx) if last_idx >= 0 else None diff --git a/.travis/bench_common.py b/.travis/bench_common.py index 1a59ef1091..4e5a211143 100644 --- a/.travis/bench_common.py +++ b/.travis/bench_common.py @@ -5,11 +5,21 @@ exactly the would-be reds). Keeping it single-sourced is what guarantees that every red the maintainer sees was measured more than once. 
""" -import re, tomllib +import re, tomllib, statistics HIGHER_BETTER = re.compile(r"\.(pp\d+|tg\d+)\.") # llm throughput; everything else lower-is-better +def reference_value(arr, window=10): + """The baseline a metric is compared against: the median of its recent non-null + points. Used by BOTH the report (to compute the red) and bench-expectations (the + value shipped to the bundle) — so the bundle's retry and the report's red judge + against the same number. Median (not the latest single value) so a noisy last + nightly doesn't shift the reference. None if there's no data.""" + vals = [v for v in arr[-window:] if v is not None] + return statistics.median(vals) if vals else None + + def load_cfg(path): return tomllib.load(open(path, "rb")) From 872851c0ed3773a0cdad75a7870bb982f39601b5 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Wed, 17 Jun 2026 11:15:35 +0000 Subject: [PATCH 2/4] bench: don't let a cancelled/empty report overwrite a real comment A superseding PR push cancels the bench via concurrency, but the report job ran on always() with no results and posted a vacuous '0 metrics / no regressions' over the real comment. Gate the report on !cancelled() instead of always(), have bench-report skip writing a comment when there are no comparable metrics, and have the post step skip when no comment file exists. --- .github/workflows/bench.yml | 8 +++++++- .travis/bench-report.py | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 1afce2cdf4..eab1514889 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -208,7 +208,9 @@ jobs: report: needs: [ prepare, bench ] - if: ${{ always() && github.event_name == 'pull_request' && needs.prepare.outputs.enabled == 'true' }} + # !cancelled() (not always()) so a superseded/cancelled bench doesn't post a vacuous + # "no regressions" over a real result; the post step also skips when there's no comment. 
+ if: ${{ !cancelled() && github.event_name == 'pull_request' && needs.prepare.outputs.enabled == 'true' }} runs-on: ubuntu-latest permissions: contents: read @@ -239,6 +241,10 @@ jobs: with: script: | const fs = require('fs'); + if (!fs.existsSync('pr-comment.md')) { // no results (cancelled/empty) -> leave any prior comment alone + core.info('no comparison produced; not posting'); + return; + } const body = fs.readFileSync('pr-comment.md', 'utf8'); const marker = ''; const { data: comments } = await github.rest.issues.listComments({ diff --git a/.travis/bench-report.py b/.travis/bench-report.py index 966830cf8e..381c6f74c7 100644 --- a/.travis/bench-report.py +++ b/.travis/bench-report.py @@ -125,6 +125,12 @@ def main(): rows.append({"device": device, "metric": metric, "ref": rv, "pr": prv, "delta": delta, "worse": worse, "mover": mover}) + # No comparable metrics (no device results, or no reference) -> don't write a comment, + # so a cancelled/empty run can't overwrite a real one with a vacuous "no regressions". + if not rows: + print("no comparable metrics; not writing a comment") + return + movers = [r for r in rows if r["mover"]] regr = sorted([r for r in movers if r["worse"]], key=lambda r: -abs(r["delta"])) impr = sorted([r for r in movers if not r["worse"]], From 3b500d9a4da3416e1d54503a1e85eab6a3617997 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Wed, 17 Jun 2026 11:27:44 +0000 Subject: [PATCH 3/4] DEBUG: instrument bench_run path + EXPECTATIONS state (revert after diagnosing mac no-retry) Temporary: print EXPECTATIONS path/-s/line-count/pwd before the bench loop, and whether bench_run takes the single-shot or retry path (once). To pin why the macOS runner logs 0 retries despite a present, correct expectations file. Revert once answered. 
--- .travis/bundle-entrypoint.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.travis/bundle-entrypoint.sh b/.travis/bundle-entrypoint.sh index c691c22ff8..01f38d4cc7 100755 --- a/.travis/bundle-entrypoint.sh +++ b/.travis/bundle-entrypoint.sh @@ -91,7 +91,11 @@ merge_best() { # $1 <- per-metric best of $1 and $2 (min, or max for pp/tg thro bench_run() { # bench_run fn=$1 shift - if [ ! -s "$EXPECTATIONS" ]; then CUR=metrics; "$fn" "$@"; return; fi + if [ ! -s "$EXPECTATIONS" ]; then + [ -z "$_DBG_PATH" ] && { echo "DEBUG[bench]: bench_run -> SINGLE-SHOT (EXPECTATIONS not usable)"; _DBG_PATH=1; } + CUR=metrics; "$fn" "$@"; return + fi + [ -z "$_DBG_PATH" ] && { echo "DEBUG[bench]: bench_run -> RETRY path"; _DBG_PATH=1; } best=$(newtmp); CUR=$best; : > "$best"; "$fn" "$@" tries=0 while [ "$tries" -lt "$RETRY_MAX" ] && out_of_threshold "$best"; do @@ -162,6 +166,8 @@ _llm_measure() { fi } +printf 'DEBUG[bench]: EXPECTATIONS=[%s] -s=%s lines=%s pwd=%s\n' "$EXPECTATIONS" "$([ -s "$EXPECTATIONS" ] && echo yes || echo no)" "$(wc -l < "$EXPECTATIONS" 2>/dev/null || echo NA)" "$(pwd)" + group "net benches" net_bench arm_ml_kws_cnn_m pass $CACHEDIR/ARM-ML-KWS-CNN-M.pb -i 49,10,f32 --partial --input-node Mfcc From 9ea2446e1d7266a7a7c19636f1b7f1fb53884fb8 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Wed, 17 Jun 2026 11:31:52 +0000 Subject: [PATCH 4/4] bench: pin GPU clock during the run, reset on exit The cuda runner free-boosts from idle (210MHz) up to its max, and the boost state varies session-to-session, adding ~16% variance to GPU evaltime that the in-run retry can't damp (shared session). Lock the graphics clock for the bench and reset it on exit via a trap. Best-effort (needs privilege, like the CPU governor pin; no-op without nvidia-smi). Defaults to the max supported clock; set BENCH_GPU_CLOCK if that throttles. 
--- .travis/bundle-entrypoint.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.travis/bundle-entrypoint.sh b/.travis/bundle-entrypoint.sh index 01f38d4cc7..4e2d72f79b 100755 --- a/.travis/bundle-entrypoint.sh +++ b/.travis/bundle-entrypoint.sh @@ -45,6 +45,19 @@ if [ "$(uname)" = "Linux" ] && [ -r /sys/devices/system/cpu/cpu0/cpufreq/scaling echo "$F" > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed fi +# Pin the GPU clock for the run, reset on exit. The cuda runner free-boosts from idle, +# which adds session-to-session variance to evaltime (the metric we most want sharp); +# pinning makes timing deterministic. Best-effort: needs privilege, no-op without +# nvidia-smi. Override the clock with BENCH_GPU_CLOCK (MHz) if the default throttles. +if command -v nvidia-smi > /dev/null 2>&1; then + GPU_CLOCK=${BENCH_GPU_CLOCK:-$(nvidia-smi --query-supported-clocks=graphics --format=csv,noheader,nounits 2>/dev/null | head -1)} + if [ -n "$GPU_CLOCK" ]; then + trap 'nvidia-smi --reset-gpu-clocks > /dev/null 2>&1 || true' EXIT + nvidia-smi -pm 1 > /dev/null 2>&1 || true + nvidia-smi --lock-gpu-clocks="$GPU_CLOCK" > /dev/null 2>&1 || echo "Warning: could not lock GPU clocks (need privilege?)" + fi +fi + # Expectation-guided retry: with an expectations file (EXPECTATIONS, 'metric expected # threshold' lines from bench-data history), re-run a bench whose measured value moved # worse-than-expected by at least its threshold — i.e. far enough to show as a PR red —