From aaec5e9e60fdb10250d7f4c907285964f00f9ff1 Mon Sep 17 00:00:00 2001
From: Mathieu Poumeyrol <kali@zoy.org>
Date: Wed, 17 Jun 2026 09:53:10 +0000
Subject: [PATCH 1/4] bench: report and bundle judge against the same baseline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The report flagged reds against the latest nightly value, while the bundle's retry
compared against the median shipped in expectations — so when the two differed, a red
could appear that was never retried (breaking the retry == red invariant). Single-source
the baseline in bench_common.reference_value (median of the recent non-null values) and
use it for both the report's reference and the shipped expectation. The median also
makes the reference robust to a noisy last nightly.
---
 .travis/bench-expectations.py |  7 +++----
 .travis/bench-report.py       |  9 ++++++---
 .travis/bench_common.py       | 12 +++++++++++-
 3 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/.travis/bench-expectations.py b/.travis/bench-expectations.py
index 25724beff6..2989bf4303 100644
--- a/.travis/bench-expectations.py
+++ b/.travis/bench-expectations.py
@@ -13,7 +13,7 @@
 retried. Keys are underscored, as in bench-data; the bundle normalizes '-' -> '_'.
 A device with no history yields an empty file (retry disabled there, single-shot).
 """
-import argparse, json, os, statistics
+import argparse, json, os
 import bench_common as bc
 
 
@@ -33,10 +33,9 @@ def main():
     if os.path.exists(path):
         d = json.load(open(path))
         for m, arr in d.get("metrics", {}).items():
-            vals = [v for v in arr[-a.window:] if v is not None]
-            if not vals:
+            expected = bc.reference_value(arr, a.window)
+            if expected is None:
                 continue
-            expected = statistics.median(vals)
             thr = bc.red_threshold(m, cfg, bc.series_noise(arr), expected)
             if thr is not None:
                 lines.append(f"{m} {expected} {thr}")
diff --git a/.travis/bench-report.py b/.travis/bench-report.py
index 81745d8d50..966830cf8e 100644
--- a/.travis/bench-report.py
+++ b/.travis/bench-report.py
@@ -22,8 +22,9 @@
 
 
 def reference(bench_data, triple, device):
-    """latest non-null value + recent noise (p90 day-to-day |Δ%|) per metric from
-    bench-data/<triple>/<device>.json, plus the reference date."""
+    """reference value (median of recent non-null, == what bench-expectations ships)
+    + recent noise (p90 day-to-day |Δ%|) per metric from bench-data/<triple>/<device>.json,
+    plus the reference date (latest non-null day, for display)."""
     path = os.path.join(bench_data, triple, f"{device}.json")
     if not os.path.exists(path):
         return {}, {}, None
@@ -31,10 +32,12 @@ def reference(bench_data, triple, device):
     start = datetime.date.fromisoformat(d["start_day"])
     vals, noise, last_idx = {}, {}, -1
     for m, arr in d["metrics"].items():
+        ref = bc.reference_value(arr)
+        if ref is not None:
+            vals[m] = ref
         noise[m] = bc.series_noise(arr)
         for i in range(len(arr) - 1, -1, -1):
             if arr[i] is not None:
-                vals[m] = arr[i]
                 last_idx = max(last_idx, i)
                 break
     ref_day = start + datetime.timedelta(last_idx) if last_idx >= 0 else None
diff --git a/.travis/bench_common.py b/.travis/bench_common.py
index 1a59ef1091..4e5a211143 100644
--- a/.travis/bench_common.py
+++ b/.travis/bench_common.py
@@ -5,11 +5,21 @@
 exactly the would-be reds). Keeping it single-sourced is what guarantees that every
 red the maintainer sees was measured more than once.
 """
-import re, tomllib
+import re, tomllib, statistics
 
 HIGHER_BETTER = re.compile(r"\.(pp\d+|tg\d+)\.")   # llm throughput; everything else lower-is-better
 
 
+def reference_value(arr, window=10):
+    """The baseline a metric is compared against: the median of its recent non-null
+    points. Used by BOTH the report (to compute the red) and bench-expectations (the
+    value shipped to the bundle) — so the bundle's retry and the report's red judge
+    against the same number. Median (not the latest single value) so a noisy last
+    nightly doesn't shift the reference. None if there's no data."""
+    vals = [v for v in arr[-window:] if v is not None]
+    return statistics.median(vals) if vals else None
+
+
 def load_cfg(path):
     return tomllib.load(open(path, "rb"))
 

From 872851c0ed3773a0cdad75a7870bb982f39601b5 Mon Sep 17 00:00:00 2001
From: Mathieu Poumeyrol <kali@zoy.org>
Date: Wed, 17 Jun 2026 11:15:35 +0000
Subject: [PATCH 2/4] bench: don't let a cancelled/empty report overwrite a
 real comment

A superseding PR push cancels the bench via concurrency, but the report job, gated on
always(), still ran with no results and posted a vacuous '0 metrics / no regressions'
over the real comment. Gate the report on !cancelled() instead of always(), have
bench-report skip writing a comment when there are no comparable metrics, and have the
post step skip when no comment file exists.
---
 .github/workflows/bench.yml | 8 +++++++-
 .travis/bench-report.py     | 6 ++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 1afce2cdf4..eab1514889 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -208,7 +208,9 @@ jobs:
 
   report:
     needs: [ prepare, bench ]
-    if: ${{ always() && github.event_name == 'pull_request' && needs.prepare.outputs.enabled == 'true' }}
+    # !cancelled() (not always()) so a superseded/cancelled bench doesn't post a vacuous
+    # "no regressions" over a real result; the post step also skips when there's no comment.
+    if: ${{ !cancelled() && github.event_name == 'pull_request' && needs.prepare.outputs.enabled == 'true' }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -239,6 +241,10 @@ jobs:
         with:
           script: |
             const fs = require('fs');
+            if (!fs.existsSync('pr-comment.md')) {   // no results (cancelled/empty) -> leave any prior comment alone
+              core.info('no comparison produced; not posting');
+              return;
+            }
             const body = fs.readFileSync('pr-comment.md', 'utf8');
             const marker = '<!-- bench-vs-main -->';
             const { data: comments } = await github.rest.issues.listComments({
diff --git a/.travis/bench-report.py b/.travis/bench-report.py
index 966830cf8e..381c6f74c7 100644
--- a/.travis/bench-report.py
+++ b/.travis/bench-report.py
@@ -125,6 +125,12 @@ def main():
             rows.append({"device": device, "metric": metric, "ref": rv, "pr": prv,
                          "delta": delta, "worse": worse, "mover": mover})
 
+    # No comparable metrics (no device results, or no reference) -> don't write a comment,
+    # so a cancelled/empty run can't overwrite a real one with a vacuous "no regressions".
+    if not rows:
+        print("no comparable metrics; not writing a comment")
+        return
+
     movers = [r for r in rows if r["mover"]]
     regr = sorted([r for r in movers if r["worse"]], key=lambda r: -abs(r["delta"]))
     impr = sorted([r for r in movers if not r["worse"]],

From 3b500d9a4da3416e1d54503a1e85eab6a3617997 Mon Sep 17 00:00:00 2001
From: Mathieu Poumeyrol <kali@zoy.org>
Date: Wed, 17 Jun 2026 11:27:44 +0000
Subject: [PATCH 3/4] DEBUG: instrument bench_run path + EXPECTATIONS state
 (revert after diagnosing mac no-retry)

Temporary: print the EXPECTATIONS path, its `-s` test result, its line count and the pwd
before the bench loop, and log (once) whether bench_run takes the single-shot or the retry
path. The goal is to pin down why the macOS runner logs 0 retries despite a present,
correct expectations file. Revert once answered.
---
 .travis/bundle-entrypoint.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.travis/bundle-entrypoint.sh b/.travis/bundle-entrypoint.sh
index c691c22ff8..01f38d4cc7 100755
--- a/.travis/bundle-entrypoint.sh
+++ b/.travis/bundle-entrypoint.sh
@@ -91,7 +91,11 @@ merge_best() {  # $1 <- per-metric best of $1 and $2 (min, or max for pp/tg thro
 bench_run() {  # bench_run <measure-fn> <args...>
     fn=$1
     shift
-    if [ ! -s "$EXPECTATIONS" ]; then CUR=metrics; "$fn" "$@"; return; fi
+    if [ ! -s "$EXPECTATIONS" ]; then
+        [ -z "$_DBG_PATH" ] && { echo "DEBUG[bench]: bench_run -> SINGLE-SHOT (EXPECTATIONS not usable)"; _DBG_PATH=1; }
+        CUR=metrics; "$fn" "$@"; return
+    fi
+    [ -z "$_DBG_PATH" ] && { echo "DEBUG[bench]: bench_run -> RETRY path"; _DBG_PATH=1; }
     best=$(newtmp); CUR=$best; : > "$best"; "$fn" "$@"
     tries=0
     while [ "$tries" -lt "$RETRY_MAX" ] && out_of_threshold "$best"; do
@@ -162,6 +166,8 @@ _llm_measure() {
     fi
 }
 
+printf 'DEBUG[bench]: EXPECTATIONS=[%s]  -s=%s  lines=%s  pwd=%s\n' "$EXPECTATIONS" "$([ -s "$EXPECTATIONS" ] && echo yes || echo no)" "$(wc -l < "$EXPECTATIONS" 2>/dev/null || echo NA)" "$(pwd)"
+
 group "net benches"
 net_bench arm_ml_kws_cnn_m pass $CACHEDIR/ARM-ML-KWS-CNN-M.pb -i 49,10,f32 --partial --input-node Mfcc
 

From 9ea2446e1d7266a7a7c19636f1b7f1fb53884fb8 Mon Sep 17 00:00:00 2001
From: Mathieu Poumeyrol <kali@zoy.org>
Date: Wed, 17 Jun 2026 11:31:52 +0000
Subject: [PATCH 4/4] bench: pin GPU clock during the run, reset on exit

The cuda runner free-boosts from idle (210MHz) up to its max, and the boost state
varies session-to-session, adding ~16% variance to GPU evaltime that the in-run retry
can't damp (shared session). Lock the graphics clock for the bench and reset it on
exit via a trap. Best-effort (needs privilege, like the CPU governor pin; no-op without
nvidia-smi). Defaults to the max supported clock (the first entry nvidia-smi lists);
set BENCH_GPU_CLOCK (in MHz) if that throttles.
---
 .travis/bundle-entrypoint.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.travis/bundle-entrypoint.sh b/.travis/bundle-entrypoint.sh
index 01f38d4cc7..4e2d72f79b 100755
--- a/.travis/bundle-entrypoint.sh
+++ b/.travis/bundle-entrypoint.sh
@@ -45,6 +45,19 @@ if [ "$(uname)" = "Linux" ] && [ -r /sys/devices/system/cpu/cpu0/cpufreq/scaling
     echo "$F" > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed
 fi
 
+# Pin the GPU clock for the run, reset on exit. The cuda runner free-boosts from idle,
+# which adds session-to-session variance to evaltime (the metric we most want sharp);
+# pinning makes timing deterministic. Best-effort: needs privilege, no-op without
+# nvidia-smi. Override the clock with BENCH_GPU_CLOCK (MHz) if the default throttles.
+if command -v nvidia-smi > /dev/null 2>&1; then
+    GPU_CLOCK=${BENCH_GPU_CLOCK:-$(nvidia-smi --query-supported-clocks=graphics --format=csv,noheader,nounits 2>/dev/null | head -1)}
+    if [ -n "$GPU_CLOCK" ]; then
+        trap 'nvidia-smi --reset-gpu-clocks > /dev/null 2>&1 || true' EXIT
+        nvidia-smi -pm 1 > /dev/null 2>&1 || true
+        nvidia-smi --lock-gpu-clocks="$GPU_CLOCK" > /dev/null 2>&1 || echo "Warning: could not lock GPU clocks (need privilege?)"
+    fi
+fi
+
 # Expectation-guided retry: with an expectations file (EXPECTATIONS, 'metric expected
 # threshold' lines from bench-data history), re-run a bench whose measured value moved
 # worse-than-expected by at least its threshold — i.e. far enough to show as a PR red —