Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,9 @@ jobs:

report:
needs: [ prepare, bench ]
if: ${{ always() && github.event_name == 'pull_request' && needs.prepare.outputs.enabled == 'true' }}
# !cancelled() (not always()) so a superseded/cancelled bench doesn't post a vacuous
# "no regressions" over a real result; the post step also skips when there's no comment.
if: ${{ !cancelled() && github.event_name == 'pull_request' && needs.prepare.outputs.enabled == 'true' }}
runs-on: ubuntu-latest
permissions:
contents: read
Expand Down Expand Up @@ -239,6 +241,10 @@ jobs:
with:
script: |
const fs = require('fs');
if (!fs.existsSync('pr-comment.md')) { // no results (cancelled/empty) -> leave any prior comment alone
core.info('no comparison produced; not posting');
return;
}
const body = fs.readFileSync('pr-comment.md', 'utf8');
const marker = '<!-- bench-vs-main -->';
const { data: comments } = await github.rest.issues.listComments({
Expand Down
7 changes: 3 additions & 4 deletions .travis/bench-expectations.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
retried. Keys are underscored, as in bench-data; the bundle normalizes '-' -> '_'.
A device with no history yields an empty file (retry disabled there, single-shot).
"""
import argparse, json, os, statistics
import argparse, json, os
import bench_common as bc


Expand All @@ -33,10 +33,9 @@ def main():
if os.path.exists(path):
d = json.load(open(path))
for m, arr in d.get("metrics", {}).items():
vals = [v for v in arr[-a.window:] if v is not None]
if not vals:
expected = bc.reference_value(arr, a.window)
if expected is None:
continue
expected = statistics.median(vals)
thr = bc.red_threshold(m, cfg, bc.series_noise(arr), expected)
if thr is not None:
lines.append(f"{m} {expected} {thr}")
Expand Down
15 changes: 12 additions & 3 deletions .travis/bench-report.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,22 @@


def reference(bench_data, triple, device):
"""latest non-null value + recent noise (p90 day-to-day |Δ%|) per metric from
bench-data/<triple>/<device>.json, plus the reference date."""
"""reference value (median of recent non-null, == what bench-expectations ships)
+ recent noise (p90 day-to-day |Δ%|) per metric from bench-data/<triple>/<device>.json,
plus the reference date (latest non-null day, for display)."""
path = os.path.join(bench_data, triple, f"{device}.json")
if not os.path.exists(path):
return {}, {}, None
d = json.load(open(path))
start = datetime.date.fromisoformat(d["start_day"])
vals, noise, last_idx = {}, {}, -1
for m, arr in d["metrics"].items():
ref = bc.reference_value(arr)
if ref is not None:
vals[m] = ref
noise[m] = bc.series_noise(arr)
for i in range(len(arr) - 1, -1, -1):
if arr[i] is not None:
vals[m] = arr[i]
last_idx = max(last_idx, i)
break
ref_day = start + datetime.timedelta(last_idx) if last_idx >= 0 else None
Expand Down Expand Up @@ -122,6 +125,12 @@ def main():
rows.append({"device": device, "metric": metric, "ref": rv, "pr": prv,
"delta": delta, "worse": worse, "mover": mover})

# No comparable metrics (no device results, or no reference) -> don't write a comment,
# so a cancelled/empty run can't overwrite a real one with a vacuous "no regressions".
if not rows:
print("no comparable metrics; not writing a comment")
return

movers = [r for r in rows if r["mover"]]
regr = sorted([r for r in movers if r["worse"]], key=lambda r: -abs(r["delta"]))
impr = sorted([r for r in movers if not r["worse"]],
Expand Down
12 changes: 11 additions & 1 deletion .travis/bench_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,21 @@
exactly the would-be reds). Keeping it single-sourced is what guarantees that every
red the maintainer sees was measured more than once.
"""
import re, tomllib
import re, tomllib, statistics

HIGHER_BETTER = re.compile(r"\.(pp\d+|tg\d+)\.") # llm throughput; everything else lower-is-better


def reference_value(arr, window=10):
    """Return the baseline a metric is compared against.

    The baseline is the median of the non-null points among the last
    ``window`` entries of ``arr``. Both the report (to compute the red) and
    bench-expectations (the value shipped to the bundle) call this, so the
    bundle's retry and the report's red judge against the same number. The
    median is used rather than the latest single value so that a noisy last
    nightly doesn't shift the reference.

    Args:
        arr: sequence of metric values, oldest first; ``None`` marks a
            missing point.
        window: how many trailing entries to consider.

    Returns:
        The median of the non-null values in the window, or ``None`` when
        there is no data (empty window, all-null window, or a non-positive
        ``window``).
    """
    # Guard the slice: arr[-0:] is the WHOLE series, and a negative window
    # would slice from the front -- neither is "the most recent points".
    if window <= 0:
        return None
    vals = [v for v in arr[-window:] if v is not None]
    return statistics.median(vals) if vals else None


def load_cfg(path):
return tomllib.load(open(path, "rb"))

Expand Down
21 changes: 20 additions & 1 deletion .travis/bundle-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,19 @@ if [ "$(uname)" = "Linux" ] && [ -r /sys/devices/system/cpu/cpu0/cpufreq/scaling
echo "$F" > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed
fi

# Pin the GPU clock for the run, reset on exit. The cuda runner free-boosts from idle,
# which adds session-to-session variance to evaltime (the metric we most want sharp);
# pinning makes timing deterministic. Best-effort: needs privilege, no-op without
# nvidia-smi. Override the clock with BENCH_GPU_CLOCK (MHz) if the default throttles.
# Only attempt pinning when nvidia-smi is on PATH; otherwise this whole block is a no-op.
if command -v nvidia-smi > /dev/null 2>&1; then
# Clock to pin: BENCH_GPU_CLOCK if set and non-empty, else the first line of
# nvidia-smi's supported graphics clocks list (empty if the query fails).
GPU_CLOCK=${BENCH_GPU_CLOCK:-$(nvidia-smi --query-supported-clocks=graphics --format=csv,noheader,nounits 2>/dev/null | head -1)}
if [ -n "$GPU_CLOCK" ]; then
# Install the reset BEFORE locking, so the clocks are restored on any exit path.
# NOTE(review): this replaces any EXIT trap set earlier in the script -- confirm none exists.
trap 'nvidia-smi --reset-gpu-clocks > /dev/null 2>&1 || true' EXIT
# Best-effort: output is discarded and a failure is ignored.
nvidia-smi -pm 1 > /dev/null 2>&1 || true
# A failed lock only prints a warning; the benches still run, just unpinned.
nvidia-smi --lock-gpu-clocks="$GPU_CLOCK" > /dev/null 2>&1 || echo "Warning: could not lock GPU clocks (need privilege?)"
fi
fi

# Expectation-guided retry: with an expectations file (EXPECTATIONS, 'metric expected
# threshold' lines from bench-data history), re-run a bench whose measured value moved
# worse-than-expected by at least its threshold — i.e. far enough to show as a PR red —
Expand Down Expand Up @@ -91,7 +104,11 @@ merge_best() { # $1 <- per-metric best of $1 and $2 (min, or max for pp/tg thro
bench_run() { # bench_run <measure-fn> <args...>
fn=$1
shift
if [ ! -s "$EXPECTATIONS" ]; then CUR=metrics; "$fn" "$@"; return; fi
if [ ! -s "$EXPECTATIONS" ]; then
[ -z "$_DBG_PATH" ] && { echo "DEBUG[bench]: bench_run -> SINGLE-SHOT (EXPECTATIONS not usable)"; _DBG_PATH=1; }
CUR=metrics; "$fn" "$@"; return
fi
[ -z "$_DBG_PATH" ] && { echo "DEBUG[bench]: bench_run -> RETRY path"; _DBG_PATH=1; }
best=$(newtmp); CUR=$best; : > "$best"; "$fn" "$@"
tries=0
while [ "$tries" -lt "$RETRY_MAX" ] && out_of_threshold "$best"; do
Expand Down Expand Up @@ -162,6 +179,8 @@ _llm_measure() {
fi
}

printf 'DEBUG[bench]: EXPECTATIONS=[%s] -s=%s lines=%s pwd=%s\n' "$EXPECTATIONS" "$([ -s "$EXPECTATIONS" ] && echo yes || echo no)" "$(wc -l < "$EXPECTATIONS" 2>/dev/null || echo NA)" "$(pwd)"

group "net benches"
net_bench arm_ml_kws_cnn_m pass $CACHEDIR/ARM-ML-KWS-CNN-M.pb -i 49,10,f32 --partial --input-node Mfcc

Expand Down
Loading