sonos · kali · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -208,7 +208,9 @@ jobs:
 
   report:
     needs: [ prepare, bench ]
-    if: ${{ always() && github.event_name == 'pull_request' && needs.prepare.outputs.enabled == 'true' }}
+    # !cancelled() (not always()) so a superseded/cancelled bench doesn't post a vacuous
+    # "no regressions" over a real result; the post step also skips when pr-comment.md is missing.
+    if: ${{ !cancelled() && github.event_name == 'pull_request' && needs.prepare.outputs.enabled == 'true' }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -239,6 +241,10 @@ jobs:
         with:
           script: |
             const fs = require('fs');
+            if (!fs.existsSync('pr-comment.md')) {   // no results (cancelled/empty) -> leave any prior comment alone
+              core.info('no comparison produced; not posting');
+              return;
+            }
             const body = fs.readFileSync('pr-comment.md', 'utf8');
             const marker = '<!-- bench-vs-main -->';
             const { data: comments } = await github.rest.issues.listComments({

diff --git a/.travis/bench-expectations.py b/.travis/bench-expectations.py
@@ -13,7 +13,7 @@
 retried. Keys are underscored, as in bench-data; the bundle normalizes '-' -> '_'.
 A device with no history yields an empty file (retry disabled there, single-shot).
 """
-import argparse, json, os, statistics
+import argparse, json, os
 import bench_common as bc
 
 
@@ -33,10 +33,9 @@ def main():
     if os.path.exists(path):
         d = json.load(open(path))
         for m, arr in d.get("metrics", {}).items():
-            vals = [v for v in arr[-a.window:] if v is not None]
-            if not vals:
+            expected = bc.reference_value(arr, a.window)
+            if expected is None:
                 continue
-            expected = statistics.median(vals)
             thr = bc.red_threshold(m, cfg, bc.series_noise(arr), expected)
             if thr is not None:
                 lines.append(f"{m} {expected} {thr}")

diff --git a/.travis/bench-report.py b/.travis/bench-report.py
@@ -22,19 +22,22 @@
 
 
 def reference(bench_data, triple, device):
-    """latest non-null value + recent noise (p90 day-to-day |Δ%|) per metric from
-    bench-data/<triple>/<device>.json, plus the reference date."""
+    """reference value (median of recent non-null; == what bench-expectations ships when its
+    window argument equals bc.reference_value's default) + recent noise (p90 day-to-day |Δ%|)
+    per metric from bench-data/<triple>/<device>.json, plus the reference date (latest non-null day, for display)."""
     path = os.path.join(bench_data, triple, f"{device}.json")
     if not os.path.exists(path):
         return {}, {}, None
     d = json.load(open(path))
     start = datetime.date.fromisoformat(d["start_day"])
     vals, noise, last_idx = {}, {}, -1
     for m, arr in d["metrics"].items():
+        ref = bc.reference_value(arr)
+        if ref is not None:
+            vals[m] = ref
         noise[m] = bc.series_noise(arr)
         for i in range(len(arr) - 1, -1, -1):
             if arr[i] is not None:
-                vals[m] = arr[i]
                 last_idx = max(last_idx, i)
                 break
     ref_day = start + datetime.timedelta(last_idx) if last_idx >= 0 else None
@@ -122,6 +125,12 @@ def main():
             rows.append({"device": device, "metric": metric, "ref": rv, "pr": prv,
                          "delta": delta, "worse": worse, "mover": mover})
 
+    # No comparable metrics (no device results, or no reference) -> don't write a comment,
+    # so a cancelled/empty run can't overwrite a real one with a vacuous "no regressions".
+    if not rows:
+        print("no comparable metrics; not writing a comment")
+        return
+
     movers = [r for r in rows if r["mover"]]
     regr = sorted([r for r in movers if r["worse"]], key=lambda r: -abs(r["delta"]))
     impr = sorted([r for r in movers if not r["worse"]],

diff --git a/.travis/bench_common.py b/.travis/bench_common.py
@@ -5,11 +5,21 @@
 exactly the would-be reds). Keeping it single-sourced is what guarantees that every
 red the maintainer sees was measured more than once.
 """
-import re, tomllib
+import re, tomllib, statistics
 
 HIGHER_BETTER = re.compile(r"\.(pp\d+|tg\d+)\.")   # llm throughput; everything else lower-is-better
 
 
+def reference_value(arr, window=10):
+    """The baseline a metric is compared against: the median of the non-null points among
+    its last `window` entries. Used by BOTH the report (to compute the red) and
+    bench-expectations (the value shipped to the bundle) — so the bundle's retry and the
+    report's red judge against the same number. Median (not the latest single value) so a
+    noisy last nightly doesn't shift the reference. None if there's no data."""
+    vals = [v for v in arr[-window:] if v is not None]
+    return statistics.median(vals) if vals else None
+
+
 def load_cfg(path):
     return tomllib.load(open(path, "rb"))
 

diff --git a/.travis/bundle-entrypoint.sh b/.travis/bundle-entrypoint.sh
@@ -45,6 +45,19 @@ if [ "$(uname)" = "Linux" ] && [ -r /sys/devices/system/cpu/cpu0/cpufreq/scaling
     echo "$F" > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed
 fi
 
+# Pin the GPU clock for the run, reset on exit. The cuda runner free-boosts from idle,
+# which adds session-to-session variance to evaltime (the metric we most want sharp);
+# pinning makes timing deterministic. Best-effort: needs privilege, no-op without
+# nvidia-smi. Defaults to the first supported graphics clock nvidia-smi lists; override with BENCH_GPU_CLOCK (MHz) if that throttles.
+if command -v nvidia-smi > /dev/null 2>&1; then
+    GPU_CLOCK=${BENCH_GPU_CLOCK:-$(nvidia-smi --query-supported-clocks=graphics --format=csv,noheader,nounits 2>/dev/null | head -1)}
+    if [ -n "$GPU_CLOCK" ]; then
+        trap 'nvidia-smi --reset-gpu-clocks > /dev/null 2>&1 || true' EXIT
+        nvidia-smi -pm 1 > /dev/null 2>&1 || true
+        nvidia-smi --lock-gpu-clocks="$GPU_CLOCK" > /dev/null 2>&1 || echo "Warning: could not lock GPU clocks (need privilege?)"
+    fi
+fi
+
 # Expectation-guided retry: with an expectations file (EXPECTATIONS, 'metric expected
 # threshold' lines from bench-data history), re-run a bench whose measured value moved
 # worse-than-expected by at least its threshold — i.e. far enough to show as a PR red —
@@ -91,7 +104,11 @@ merge_best() {  # $1 <- per-metric best of $1 and $2 (min, or max for pp/tg thro
 bench_run() {  # bench_run <measure-fn> <args...>
     fn=$1
     shift
-    if [ ! -s "$EXPECTATIONS" ]; then CUR=metrics; "$fn" "$@"; return; fi
+    if [ ! -s "$EXPECTATIONS" ]; then
+        [ -z "$_DBG_PATH" ] && { echo "DEBUG[bench]: bench_run -> SINGLE-SHOT (EXPECTATIONS not usable)"; _DBG_PATH=1; }
+        CUR=metrics; "$fn" "$@"; return
+    fi
+    [ -z "$_DBG_PATH" ] && { echo "DEBUG[bench]: bench_run -> RETRY path"; _DBG_PATH=1; }
     best=$(newtmp); CUR=$best; : > "$best"; "$fn" "$@"
     tries=0
     while [ "$tries" -lt "$RETRY_MAX" ] && out_of_threshold "$best"; do
@@ -162,6 +179,8 @@ _llm_measure() {
     fi
 }
 
+printf 'DEBUG[bench]: EXPECTATIONS=[%s]  -s=%s  lines=%s  pwd=%s\n' "$EXPECTATIONS" "$([ -s "$EXPECTATIONS" ] && echo yes || echo no)" "$(wc -l < "$EXPECTATIONS" 2>/dev/null || echo NA)" "$(pwd)"
+
 group "net benches"
 net_bench arm_ml_kws_cnn_m pass $CACHEDIR/ARM-ML-KWS-CNN-M.pb -i 49,10,f32 --partial --input-node Mfcc