sonos · czoli1976 · May 30, 2026 · May 30, 2026 · May 30, 2026 · Jun 1, 2026
diff --git a/core/src/ops/matmul/pack.rs b/core/src/ops/matmul/pack.rs
@@ -7,25 +7,32 @@ use tract_linalg::block_quant::{
 };
 use tract_linalg::mmm::{MMMInputFormat, MMMInputValue, PackedMatrixStorage};
 use tract_linalg::pack::{PackedFormat, PackedI8K4};
+#[cfg(target_arch = "x86_64")]
+use tract_linalg::x86_64_fma::amx::PackedAmxA;
 
 use super::ModePicker;
 
 // Pack one (possibly strided) view with a dynamic packing format. Keeps the
 // PackedFormat fast path byte-identical; routes the K=4-inner SMOPA packer
-// (PackedI8K4) through its view packer. Other formats are unsupported here.
+// (PackedI8K4) and the AMX A-side packer (PackedAmxA) through their view
+// packers. Other formats are unsupported here.
 fn pack_view_with(
     packer: &dyn MMMInputFormat,
     t: &TensorView,
     k_axis: usize,
     mn_axis: usize,
 ) -> TractResult<Box<dyn MMMInputValue>> {
     if let Some(pf) = packer.downcast_ref::<PackedFormat>() {
-        pf.pack_tensor_view(t, k_axis, mn_axis)
-    } else if let Some(p4) = packer.downcast_ref::<PackedI8K4>() {
-        p4.pack_view(t, k_axis, mn_axis)
-    } else {
-        bail!("OptMatMulPack does not support packing format {packer:?}")
+        return pf.pack_tensor_view(t, k_axis, mn_axis);
     }
+    if let Some(p4) = packer.downcast_ref::<PackedI8K4>() {
+        return p4.pack_view(t, k_axis, mn_axis);
+    }
+    #[cfg(target_arch = "x86_64")]
+    if let Some(pa) = packer.downcast_ref::<PackedAmxA>() {
+        return pa.pack_view(t, k_axis, mn_axis);
+    }
+    bail!("OptMatMulPack does not support packing format {packer:?}")
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]

diff --git a/linalg/AMX_BENCH_RESULTS.md b/linalg/AMX_BENCH_RESULTS.md
@@ -0,0 +1,85 @@
+# AMX validation & benchmark results
+
+Run of `linalg/AMX_BENCH_RUNBOOK.md` on real Intel AMX hardware.
+
+- **Host:** `Intel(R) Xeon(R) Processor @ 2.10GHz` (Sapphire/Emerald Rapids-class), 4 vCPU
+- **ISA:** `amx_tile amx_int8 amx_bf16` + AVX-512-VNNI; kernel `6.18.5` (≥5.16); binutils `2.42`; rustc `1.94.1`
+- **Branch:** `claude/zealous-galileo-fEQ3d` @ `7a23812`
+- **Method:** `cargo bench`, default criterion sampling, pinned to core 2 (`taskset -c 2`), idle box (load ≈ 1.0)
+- **Date:** 2026-06-02
+
+## 1. AMX live confirmation ✅
+
+Gate-check (`amx_i32` bench) produced `avx512amx_8x8`/`avx512amx_16x16` columns with real `thrpt:` numbers — **neither** "tract not built with AMX" (build probe) **nor** "AMX not available, skipping" (runtime CPUID + `arch_prctl` XTILEDATA gate) appeared. AMX is genuinely exercised.
+
+## 2. Correctness
+
+| Suite | Result |
+|---|---|
+| `cargo test -p tract-linalg --lib avx512amx` | **297 passed; 3 failed** |
+| `cargo test -p tract-linalg --lib x86_64_fma::mmm` | **1833 passed; 3 failed** |
+
+**Bugfix `99eb75b9d` VALIDATED on silicon** ✅ — every `scalar_sub` / `per_row_sub` / `per_col_sub` (+`_f`) test passed for **both** `avx512amx_mmm_i32_16x16` and `avx512amx_mmm_f32_16x16`.
+
+**3 failures — all in the AMX bf16 path** (`avx512amx_mmm_f32_16x16::f32f32_bf16`): `fuse::prop`, `frame::prop`, `fuse::packed_packed_bug_3`.
+
+**Root cause = test-harness tolerance, NOT a kernel defect.** `packed_packed.rs:367` selects the comparison tolerance from the **accumulator** dtype:
+```rust
+let app = if K::Acc::datum_type() == f16::datum_type()
+    { Approximation::SuperApproximate } else { Approximation::Approximate };
+```
+This kernel accumulates in **f32** (TDPBF16PS: bf16×bf16→f32), so it gets `Approximate` = `(atol 1e-4, rtol 5e-4, 0 outliers)` — but the `f32f32_bf16` packing rounds inputs to bf16 (relative error up to 2⁻⁸ ≈ 0.39% per input). bf16-grade error is checked against an f32-grade bar with zero tolerated outliers ⇒ guaranteed failure. `SuperApproximate` `(atol 0.1, rtol 0.05, 1e-4 outliers)` would pass. The structurally identical int8 16×16 kernel passes 100%.
+
+**Proposed fix:** in `check()`, pick `SuperApproximate` when the packing is bf16-based, not only when `K::Acc == f16`.
+
+**Empirically verified (on the AMX host):** the kernel was run on 7 cases (including the exact `bug_3` input) and compared against an independent **bf16-rounded** reference — inputs converted with the project's own `f32_to_bf16_rne` (round-to-nearest-even) — judged by the *same* tight `Approximate` bar: **0 outliers across ~335k output elements** (max abs err ≤ 1.3e-5), versus **282,788 outliers** against a pure-f32 reference. The kernel reproduces "round inputs→bf16, accumulate→f32" exactly; the 3 red tests are attributable entirely to the pure-f32 oracle, not to a kernel defect.
+
+## 3. Benchmarks — throughput (Gelem/s, point estimate)
+
+### `amx_i32` — int8 GEMM
+| M×K×N | avx2 | avx512vnni (8×8) | avx512amx_8×8 | avx512amx_16×16 |
+|---|---:|---:|---:|---:|
+| 64×256×64 | 0.41 | 11.21 | 68.41 | **233.64** |
+| 256×256×256 | 0.41 | 11.31 | 68.47 | **237.29** |
+| 512×512×512 | 0.39 | 8.94 † | 112.86 | **228.15** |
+| 1024×1024×64 | 0.41 | 34.84 | 178.42 | **279.51** |
+
+### `amx_f32` — bf16→f32 GEMM
+| M×K×N | fma_16×6 | avx512_16×12 | avx512amx_bf16_16×16 |
+|---|---:|---:|---:|
+| 64×256×64 | 37.12 | 64.31 | **207.35** |
+| 256×256×256 | 37.90 | 71.90 | **225.74** |
+| 512×512×512 | 39.37 | 64.69 | **348.38** |
+| 1024×1024×64 | 36.85 | 59.22 | **318.36** |
+
+### `vnni_i32` — int8 GEMM (new 16×16 in isolation)
+| M×K×N | avx2 | avx512vnni (8×8) | avx512vnni_16×16 |
+|---|---:|---:|---:|
+| 64×256×64 | 0.41 | 10.90 | **135.74** |
+| 256×256×256 | 0.40 | 10.78 | **134.92** |
+| 512×512×512 | 0.40 | 20.53 | **154.39** |
+| 1024×1024×64 | 0.41 | 34.77 | **161.27** |
+
+† `avx512vnni`@512³ read 8.94 here vs 20.53 in `vnni_i32` (same kernel/shape). Treat **20.53** as the credible value (it fits the size trend 11.3→20.5→34.8); 8.94 was an outlier. A higher-sampling re-measure was attempted but could not complete — see §6.
+
+## 4. Head-to-head ratios
+
+| Comparison | 64×256×64 | 256×256×256 | 512×512×512 | 1024×1024×64 |
+|---|---:|---:|---:|---:|
+| **AMX 16×16 ÷ VNNI 16×16** (int8, same CPU) | 1.72× | 1.76× | 1.48× | 1.73× |
+| **AMX 16×16 ÷ AMX 8×8** (int8) | 3.42× | 3.47× | 2.02× | 1.57× |
+| **VNNI 16×16 ÷ VNNI 8×8** (int8) | 12.45× | 12.51× | 7.52× | 4.64× |
+| **AMX bf16 16×16 ÷ AVX-512 f32 16×12** | 3.22× | 3.14× | 5.39× | 5.38× |
+| *(bonus) AMX bf16 ÷ FMA f32 16×6* | 5.59× | 5.96× | 8.85× | 8.64× |
+
+## 5. Findings
+
+1. **AMX int8 16×16 wins everywhere — justifies `boost(100)` > VNNI `boost(50)`.** 1.48–1.76× over the new VNNI 16×16 on the *same* silicon. Dispatch ordering is correct.
+2. **AMX 16×16 vs 8×8: 1.57–3.47×.** 16×16 leads on all tested shapes; the 4×-work/instr advantage is largest on compact shapes (3.4× @ 64×256×64) and narrowest on tall-skinny 1024×1024×64 (1.57×, N=64). No tested shape favors 8×8 — any crossover lives below this suite (smaller M or N<16). `qmmm_i32` defaulting to 16×16 here is sound.
+3. **VNNI 16×16 vs 8×8: 4.64–12.5× — far above the dev box's 1.3–2.1×.** Likely the 8×8 kernel's ymm (256-bit) accumulators vs the new kernel's zmm (512-bit), amplified on Sapphire Rapids (no AVX-512 license downclock that Cascade Lake suffers). Strongly validates the new kernel; the magnitude warrants one sanity re-check (see #4).
+4. **Data-quality flag (resolved by inspection):** `avx512vnni` 8×8 @ 512³ read 8.94 (in `amx_i32`) vs 20.53 (in `vnni_i32`) — a 2.3× swing on the same kernel/shape. **20.53 is the credible figure** (it continues the monotone size trend 11.3 @ 256³ → 20.5 @ 512³ → 34.8 @ 1024×1024×64; 8.94 breaks it). A `--sample-size 200` re-measure was launched but the AMX host was reclaimed before it could run (see §6); the ratio table already uses the consistent 20.53 pairing. AMX columns were stable across runs.
+5. **AMX bf16 is 3.1–5.4× the AVX-512 f32 kernel** (5.6–8.9× over FMA), scaling up on larger shapes (348 Gelem/s @ 512³) — with the documented bf16 precision trade (see §2 and `X86_64_INT8_GEMM.md`).
+
+## 6. Reproducibility note
+
+Numbers were collected **2026-06-02** on an AMX-capable `Intel(R) Xeon(R) @ 2.10GHz` (`amx_tile/int8/bf16` + AVX-512-VNNI, kernel 6.18.5). The ephemeral session container was subsequently reclaimed and re-provisioned onto a different `Intel(R) Xeon(R) @ 2.80GHz` with **neither AMX nor AVX-512-VNNI** (only `avx512f`), on which `amx_i32`/`vnni_i32` both short-circuit and skip — so the one outstanding re-measure (VNNI-8×8 @ 512³) could not be completed in this session. To reproduce or extend, run on an AMX host (Sapphire Rapids / Emerald Rapids / Granite Rapids Xeon, or Xeon Max) following `linalg/AMX_BENCH_RUNBOOK.md`.
diff --git a/linalg/AMX_BENCH_RUNBOOK.md b/linalg/AMX_BENCH_RUNBOOK.md
@@ -0,0 +1,211 @@
+# AMX validation & benchmark runbook
+
+**For: a Claude Code session (or human) on an x86_64 CPU that has Intel AMX.**
+
+The kernel work on branch `claude/zealous-galileo-fEQ3d` was developed on a
+Cascade Lake-class container (AVX-512-VNNI, **no AMX**). Everything that can run
+without AMX is already validated there. This runbook covers the two things that
+box **could not** do and that need a real AMX CPU.
+
+## Your task
+
+**Benchmark every int8 / bf16 GEMM kernel in this tree on this AMX CPU — all the
+AMX kernels *and* the AVX-512-VNNI kernels we just improved — and run the AMX
+correctness suite.** Full kernel inventory to cover:
+
+| Kernel | ISA | Covered by bench |
+|---|---|---|
+| `avx512amx_mmm_i32_8x8` | AMX int8 (`tdpbssd`) | `amx_i32` |
+| `avx512amx_mmm_i32_16x16` | AMX int8 (`tdpbssd`) | `amx_i32` |
+| `avx512amx_mmm_f32_16x16` | AMX bf16→f32 (`tdpbf16ps`) | `amx_f32` |
+| `avx512vnni_mmm_i32_8x8` | AVX-512-VNNI (`vpdpbusd`) | `vnni_i32`, `amx_i32` |
+| **`avx512vnni_mmm_i32_16x16`** ← new | AVX-512-VNNI (`vpdpbusd`, zmm) | `vnni_i32` |
+| `avx2_mmm_i32_8x8` (baseline) | AVX2 | both i32 benches |
+
+Running the three benches in Step 4 covers all of the above. Yes — bench the VNNI
+kernels here too: an AMX CPU (Sapphire Rapids+) also has AVX-512-VNNI, so it's the
+one place you can measure AMX 16×16 and VNNI 16×16 **on the same silicon** and see
+how much AMX actually wins.
+
+In addition, this AMX CPU is the only place that can:
+
+1. **Correctness-test the AMX kernels** — including a recent bugfix to the AMX
+   16×16 `sub` fused-op handlers that was invisible on non-AMX hardware.
+2. **Benchmark** the AMX int8 / bf16 kernels and the new AVX-512-VNNI 16×16
+   kernel head-to-head.
+
+> ⚠️ **Most important caveat:** every AMX kernel test short-circuits to "ok" when
+> the host can't run AMX (`is_supported_here()` is false). So a green
+> `cargo test` on the wrong box proves **nothing**. You must first confirm AMX is
+> actually live (Step 2). The **benchmarks are the authoritative gate-check** —
+> they print an explicit "AMX … not available, skipping" message and emit no AMX
+> columns if the gate is closed.
+
+---
+
+## 0. Prerequisites
+
+| Requirement | Why | Check |
+|---|---|---|
+| AMX-capable CPU (Sapphire Rapids / Emerald Rapids / Granite Rapids Xeon, or Xeon Max) | `tdpbssd` / `tdpbf16ps` | `grep -o 'amx[_a-z0-9]*' /proc/cpuinfo \| sort -u` → expect `amx_bf16 amx_int8 amx_tile` |
+| Linux kernel ≥ 5.16 | AMX tile-data XSAVE permission via `arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)` | `uname -r` |
+| binutils/gas ≥ 2.34 (≥ 2.36 ideal) | assembles AMX mnemonics (and `{vex}` for AVX-VNNI) | `as --version` |
+| Rust stable (dev used 1.94–1.96) | build | `cargo --version` |
+
+If `/proc/cpuinfo` shows no `amx_*` flags, this is the wrong machine — stop here.
+
+---
+
+## 1. Get the code
+
+**Fresh clone (preferred):**
+```sh
+git clone https://github.com/czoli1976/tract.git
+cd tract
+git checkout claude/zealous-galileo-fEQ3d
+```
+
+**Existing checkout:**
+```sh
+git fetch origin claude/zealous-galileo-fEQ3d
+git checkout claude/zealous-galileo-fEQ3d && git pull
+# IMPORTANT when pulling into a checkout that was built before: the new kernel
+# template (avx512vnni_mmm_i32_16x16.S.j2) may not trigger a build-script rerun
+# (build.rs emits per-file rerun-if-changed). Force it once:
+touch linalg/build.rs
+```
+(A fresh clone needs no `touch` — it renders every template on first build.)
+
+---
+
+## 2. Confirm AMX is actually live (do this first)
+
+The AMX kernels are gated by CPUID **and** the kernel granting tile-data XSAVE
+permission. The benchmark is the cleanest runtime probe — if AMX is unavailable
+it prints a skip line instead of numbers:
+
+```sh
+cargo bench -p tract-linalg --bench amx_i32 -- --warm-up-time 0.2 --measurement-time 0.5 --sample-size 10 2>&1 | head -20
+```
+
+- ✅ **Good:** you see `avx512amx_8x8` and `avx512amx_16x16` lines with `thrpt:`.
+- ❌ **Bad:** `AMX int8 not available (CPUID + arch_prctl gate failed), skipping`
+  → AMX isn't usable (check kernel ≥ 5.16, not in a VM that masks AMX, XSAVE
+  permission not blocked by a seccomp/container policy). Don't proceed — the
+  correctness tests would silently no-op.
+
+Optional: `RUST_LOG=info cargo test -p tract-linalg --lib avx512amx_mmm_i32_16x16 -- --nocapture 2>&1 | grep -i activated`
+should log `qmmm_i32: x86_64/avx512amx_int8 (16x16 + 8x8 adaptive) activated`.
+
+---
+
+## 3. Correctness validation (the priority)
+
+Only meaningful once Step 2 confirms AMX is live.
+
+```sh
+# All three AMX kernel suites: int8 8x8, int8 16x16, bf16 16x16.
+cargo test -p tract-linalg --lib avx512amx 2>&1 | tail -30
+
+# Full x86_64 mmm suite (AMX + VNNI + AVX2 + FMA + AVX-512), for completeness.
+cargo test -p tract-linalg --lib x86_64_fma::mmm 2>&1 | tail -5
+```
+
+**Expected:** `test result: ok. <N> passed; 0 failed`.
+
+**What this specifically proves (and the dev box couldn't):** the
+`scalar_sub` / `per_row_sub` / `per_col_sub` (+ `_flipped`) fused-op tests for
+`test_avx512amx_mmm_i32_16x16` and `test_avx512amx_mmm_f32_16x16` **actually
+execute**. Those guard commit `99eb75b9d`, which fixed swapped operands in the
+AMX `sub` handlers (they were computing `acc − operand` instead of
+`operand − acc`, i.e. negated results). This fix is currently only
+build-verified — **this run is what confirms it on real silicon.**
+
+---
+
+## 4. Benchmarks
+
+On real hardware use default sampling (drop the reduced flags) and pin a core for
+stable numbers. Idle box, turbo/frequency-scaling fixed if you can.
+
+```sh
+# int8: AVX2 vs VNNI 8x8 vs AMX 8x8 vs AMX 16x16
+taskset -c 2 cargo bench -p tract-linalg --bench amx_i32
+
+# f32 via bf16: FMA 16x6 vs AVX-512 16x12 vs AMX-BF16 16x16
+taskset -c 2 cargo bench -p tract-linalg --bench amx_f32
+
+# the new kernel in isolation: AVX2 vs VNNI 8x8 vs VNNI 16x16
+taskset -c 2 cargo bench -p tract-linalg --bench vnni_i32
+```
+
+Bench layout (group `… /packed_packed`, shapes `64x256x64`, `256x256x256`,
+`512x512x512`, `1024x1024x64`, throughput in `Gelem/s`):
+
+| Bench | Columns |
+|---|---|
+| `amx_i32` | `avx2`, `avx512vnni`, `avx512amx_8x8`, `avx512amx_16x16` |
+| `amx_f32` | `fma_16x6`, `avx512_16x12`, `avx512amx_bf16_16x16` |
+| `vnni_i32` | `avx2`, `avx512vnni` (8×8), `avx512vnni_16x16` |
+
+Criterion writes HTML reports under `target/criterion/`.
+
+---
+
+## 5. What to report back
+
+**Correctness**
+- Confirm AMX was live (Step 2 showed AMX columns / cpuinfo has `amx_int8`).
+- `cargo test … avx512amx` result line (`N passed; 0 failed`), confirming the
+  AMX `*_sub` fused-op tests passed → bugfix `99eb75b9d` validated on hardware.
+
+**Performance** — the `thrpt:` (Gelem/s) per shape per column for all three
+benches, plus these head-to-head reads:
+
+1. **AMX 16×16 vs VNNI 16×16** (compare `amx_i32`'s `avx512amx_16x16` against
+   `vnni_i32`'s `avx512vnni_16x16`, same shapes). AMX should win — that justifies
+   the dispatch ordering (`boost(100)` for AMX 16×16 > `boost(50)` for VNNI
+   16×16). Report the ratio.
+2. **AMX 16×16 vs AMX 8×8** — the 4×-work-per-instruction claim and where 8×8
+   wins on small shapes (informs the `qmmm_i32` 16/8 crossover).
+3. **VNNI 16×16 vs 8×8** — does the ~1.3–2.1× measured on Cascade Lake hold on
+   this CPU too?
+4. **AMX-BF16 16×16 vs AVX-512 f32 16×12** — bf16 throughput win (with the bf16
+   precision trade-off noted in `linalg/X86_64_INT8_GEMM.md`).
+
+---
+
+## Appendix A — one-shot script
+
+```sh
+set -e
+echo "## CPU AMX flags:"; grep -o 'amx[_a-z0-9]*' /proc/cpuinfo | sort -u || true
+echo "## kernel:"; uname -r
+echo "## gate check (expect AMX columns, not a skip message):"
+cargo bench -p tract-linalg --bench amx_i32 -- --warm-up-time 0.2 --measurement-time 0.5 --sample-size 10 2>&1 | grep -iE "amx|skipping|thrpt" | head
+echo "## correctness:"
+cargo test -p tract-linalg --lib avx512amx 2>&1 | tail -3
+cargo test -p tract-linalg --lib x86_64_fma::mmm 2>&1 | tail -3
+echo "## full benches:"
+taskset -c 2 cargo bench -p tract-linalg --bench amx_i32
+taskset -c 2 cargo bench -p tract-linalg --bench amx_f32
+taskset -c 2 cargo bench -p tract-linalg --bench vnni_i32
+```
+
+## Appendix B — what's on this branch
+
+Three commits on top of the prior AMX/VNNI work:
+
+| Commit | Summary |
+|---|---|
+| `9e8f1c5aa` | doc: `linalg/X86_64_INT8_GEMM.md` — the full int8 GEMM kernel cascade |
+| `26726db8e` | **feat**: `avx512vnni_mmm_i32_16x16` — zmm-wide int8 VNNI kernel (1.3–2.1× over 8×8 on Cascade Lake) |
+| `99eb75b9d` | **fix**: swapped operands in AMX 16×16 `sub` fused-op handlers (int8 + bf16) — **needs AMX to validate** |
+
+Background and the kernel-selection/dispatch model: see
+`linalg/X86_64_INT8_GEMM.md`.
+
+> Note on Intel SDE: SDE *can* emulate AMX for **functional/correctness** checks
+> on a non-AMX box (`sde64 -spr -- <test-binary>`), but it is **not** a
+> performance model — timings under SDE are meaningless. Use it only if no AMX
+> hardware is available, and never for the benchmark numbers above.
diff --git a/linalg/Cargo.toml b/linalg/Cargo.toml
@@ -167,3 +167,15 @@ harness = false
 [[bench]]
 name = "vnni_i32"
 harness = false
+
+[[bench]]
+name = "amx_i32"
+harness = false
+
+[[bench]]
+name = "amx_f32"
+harness = false
+
+[[bench]]
+name = "avxvnni_i32"
+harness = false