diff --git a/.all_crates.sh b/.all_crates.sh
index 838651e39a..859bbff144 100644
--- a/.all_crates.sh
+++ b/.all_crates.sh
@@ -1,2 +1,2 @@
 
-ALL_CRATES_PATH="data linalg core nnef nnef/nnef-resources pulse-opl pulse extra transformers hir tflite tensorflow onnx-opl onnx gpu metal cuda libcli api/rs api/ffi api/proxy/sys api/proxy cli"
+ALL_CRATES_PATH="data linalg core nnef nnef/nnef-resources pulse-opl pulse extra transformers hir tflite tensorflow onnx-opl onnx gpu metal cuda libcli api api/rs api/ffi api/proxy/sys api/proxy cli"
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 7e01dbd74b..9313697d43 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -9,6 +9,8 @@ updates:
       actions:
         patterns:
           - "*"
+    cooldown:
+      default-days: 7
 
   - package-ecosystem: "cargo"
     directory: "/"
@@ -20,6 +22,8 @@ updates:
       rust-dependencies:
         patterns:
           - "*"
+    cooldown:
+      default-days: 7
 
   - package-ecosystem: "pip"
     directory: "/api/py"
@@ -29,3 +33,5 @@ updates:
     schedule:
       interval: "weekly"
       day: "monday"
+    cooldown:
+      default-days: 7
diff --git a/.github/workflows/asan.yml b/.github/workflows/asan.yml
index c191e79529..e0b8385e70 100644
--- a/.github/workflows/asan.yml
+++ b/.github/workflows/asan.yml
@@ -9,6 +9,9 @@ env:
   CARGO_INCREMENTAL: false
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
 
+permissions:
+  contents: read
+
 jobs:
   sanitizer-address:
     strategy:
@@ -19,7 +22,9 @@ jobs:
     runs-on: ${{matrix.os}}
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
     - name: Rustup update
       run: rustup update
     - name: Run sanitized tests
diff --git a/.github/workflows/binaries.yml b/.github/workflows/binaries.yml
index c263fee75d..6d91302454 100644
--- a/.github/workflows/binaries.yml
+++ b/.github/workflows/binaries.yml
@@ -14,9 +14,14 @@ env:
   CARGO_INCREMENTAL: false
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
 
+permissions:
+  contents: read
+
 jobs:
   assets:
     name: Upload Release Binaries
+    permissions:
+      contents: write
     strategy:
       fail-fast: false
       matrix:
@@ -48,30 +53,39 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - name: Checkout code
-        uses: actions/checkout@v6
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
 
       - name: Extract version tag
         id: version
+        env:
+          VERSION_OVERRIDE: ${{ inputs.version_override }}
+          GH_REF: ${{ github.ref }}
         run: |
-          if [ -n "${{ inputs.version_override }}" ]; then
-            echo "value=${{ inputs.version_override }}" >> $GITHUB_OUTPUT
+          if [ -n "$VERSION_OVERRIDE" ]; then
+            echo "value=$VERSION_OVERRIDE" >> "$GITHUB_OUTPUT"
           else
-            echo "value=$(echo ${{ github.ref }} | cut -f 3 -d / | sed 's/^v//' )" >> $GITHUB_OUTPUT
+            echo "value=$(echo "$GH_REF" | cut -f 3 -d / | sed 's/^v//' )" >> "$GITHUB_OUTPUT"
           fi
 
       - name: Build tract
+        env:
+          TARGET: ${{ matrix.target }}
+          MUSL: ${{ matrix.musl }}
+          VERSION: ${{ steps.version.outputs.value }}
         run: |
           set -ex
-          target=${{matrix.target}}
-          version=${{steps.version.outputs.value}}
+          target="$TARGET"
+          version="$VERSION"
           name=${target}-${version}
 
           rustup update
           rustup target add ${target}
 
-          if [ -n "${{matrix.musl}}" ]
+          if [ -n "$MUSL" ]
           then
-            MUSL_TRIPLE=${{matrix.musl}}
+            MUSL_TRIPLE="$MUSL"
             curl -s https://s3.amazonaws.com/tract-ci-builds/toolchains/${MUSL_TRIPLE}-cross.tgz | tar zx
 
             MUSL_BIN=`pwd`/${MUSL_TRIPLE}-cross/bin
@@ -90,7 +104,7 @@ jobs:
           tar czf tract-${name}.tgz tract-${name}
 
       - name: Upload asset
-        uses: softprops/action-gh-release@v3
+        uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3
         with:
           files: tract-${{matrix.target}}-${{ steps.version.outputs.value }}.tgz
           name: ${{ steps.version.outputs.value }}
diff --git a/.github/workflows/cost_model.yml b/.github/workflows/cost_model.yml
index 754fd4c1e0..8af23aeb1e 100644
--- a/.github/workflows/cost_model.yml
+++ b/.github/workflows/cost_model.yml
@@ -12,6 +12,9 @@ env:
   CARGO_INCREMENTAL: false
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
 
+permissions:
+  contents: read
+
 jobs:
   build:
     name: Upload cost model tasks
@@ -22,11 +25,14 @@ jobs:
         target: [ "aarch64", "armv7" ]
     steps:
       - name: Checkout code
-        uses: actions/checkout@v6
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
 
       - name: Build and upload
-        run: ./.travis/cost_model_task_build.sh ${{matrix.target}} ${{github.event.inputs.dataset_id}}
+        run: ./.travis/cost_model_task_build.sh ${{matrix.target}} ${GITHUB_EVENT_INPUTS_DATASET_ID}
         env:
           AWS_ACCESS_KEY_ID: ${{secrets.TRACT_CI_AWS_ACCESS_KEY_ID}}
           AWS_SECRET_ACCESS_KEY: ${{secrets.TRACT_CI_AWS_SECRET_ACCESS_KEY}}
           AWS_EC2_METADATA_DISABLED: true
+          GITHUB_EVENT_INPUTS_DATASET_ID: ${{github.event.inputs.dataset_id}}
diff --git a/.github/workflows/crates.yml b/.github/workflows/crates.yml
index 735b1e7582..0acd81f364 100644
--- a/.github/workflows/crates.yml
+++ b/.github/workflows/crates.yml
@@ -10,6 +10,9 @@ env:
   CARGO_INCREMENTAL: false
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
 
+permissions:
+  contents: read
+
 jobs:
   prepare-matrix:
     runs-on: ubuntu-latest
@@ -50,7 +53,9 @@ jobs:
       RUSTUP_TOOLCHAIN: ${{matrix.rust}}
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
 
     - name: Cargo test
       run: cargo test -p ${{matrix.crate}}
@@ -65,7 +70,9 @@ jobs:
     env:
       RUSTUP_TOOLCHAIN: ${{matrix.rust}}
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
 
     - name: Cargo test
       run: cargo test -p tract-cuda -p test-cuda
@@ -80,7 +87,9 @@ jobs:
     env:
       RUSTUP_TOOLCHAIN: "1.91.0"
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
     - name: Minimum-BOM GPU smoke
       run: harness/cuda-minimum-deploy/gpu-ci.sh
 
@@ -94,7 +103,9 @@ jobs:
     env:
       RUSTUP_TOOLCHAIN: ${{matrix.rust}}
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
 
     - name: Cargo test
       run: cargo test -p tract-metal -p test-metal
@@ -111,7 +122,9 @@ jobs:
     env:
       RUSTUP_TOOLCHAIN: ${{matrix.rust}}
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
     - run: rustup component add clippy && cargo clippy
     - name: fmt
       run: rustup component add rustfmt && cargo fmt --check
@@ -127,7 +140,9 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
     - name: Install cargo-deny
       run: |
         curl -L https://github.com/EmbarkStudios/cargo-deny/releases/download/$VERSION/cargo-deny-$VERSION-x86_64-unknown-linux-musl.tar.gz \
diff --git a/.github/workflows/cross-platform.yml b/.github/workflows/cross-platform.yml
index 00e27f0f1f..f0c491d29b 100644
--- a/.github/workflows/cross-platform.yml
+++ b/.github/workflows/cross-platform.yml
@@ -5,14 +5,57 @@ on:
   schedule:
     - cron:  '0 5 * * *'
   workflow_dispatch:
+    inputs:
+      pr_number:
+        description: "Optional PR number to test (from fork ok). Leave empty to run on selected branch."
+        required: false
+        type: number
 
 env:
   CARGO_INCREMENTAL: false
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
   RUSTUP_TOOLCHAIN: 1.91.0
 
+permissions:
+  contents: read
+
 jobs:
+  prepare:
+    runs-on: ubuntu-latest
+    outputs:
+      test_ref: ${{ steps.set.outputs.test_ref }}
+      tract_bench_branch_name: ${{ steps.set.outputs.tract_bench_branch_name }}
+    steps:
+      - id: set
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9
+        with:
+          script: |
+            core.info(`event: ${context.eventName}`);
+            core.info(`payload.inputs: ${JSON.stringify(context.payload.inputs)}`);
+            core.info(`payload.pull_request.number: ${context.payload.pull_request?.number}`);
+            const prInput = context.payload.inputs?.pr_number ?? context.payload.pull_request?.number;
+            core.info(`prInput: ${prInput} (type: ${typeof prInput})`);
+            let ref, branch;
+            if (prInput) {
+              const pr = await github.rest.pulls.get({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                pull_number: Number(prInput),
+              });
+              ref = pr.data.head.sha;
+              branch = `pr-${prInput}-${pr.data.head.ref}`;
+            } else {
+              ref = process.env.GITHUB_SHA;
+              branch = process.env.GITHUB_HEAD_REF || process.env.GITHUB_REF_NAME || 'main';
+            }
+            const benchBranch = branch.replace(/\//g, '_');
+            core.info(`test_ref: ${ref}`);
+            core.info(`tract_bench_branch_name: ${benchBranch}`);
+            core.setOutput('test_ref', ref);
+            core.setOutput('tract_bench_branch_name', benchBranch);
+
   linux:
+    needs: prepare
     strategy:
       fail-fast: false
       matrix:
@@ -40,7 +83,11 @@ jobs:
       contents: read
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        ref: ${{ needs.prepare.outputs.test_ref }}
+        fetch-depth: 0
+        persist-credentials: false
 
     - name: Get current date
       id: date
@@ -48,12 +95,12 @@ jobs:
 
     - name: Configure AWS Credentials
       continue-on-error: true
-      uses: aws-actions/configure-aws-credentials@v6
+      uses: aws-actions/configure-aws-credentials@d979d5b3a71173a29b74b5b88418bfda9437d885 # v6
       with:
         role-to-assume: arn:aws:iam::567805100031:role/github-runner-tract-ci
         aws-region: us-east-2
 
-    - uses: actions/cache@v5
+    - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
       with:
         path: |
           ~/.rustup
@@ -65,16 +112,22 @@ jobs:
         key: ${{ runner.os }}-${{matrix.platform}}-${{steps.date.outputs.date}}
 
     - name: Setup wasmtime
-      if: ${{ matrix.platform }} == "wasm32-wasi"
-      uses: bytecodealliance/actions/wasmtime/setup@v1
+      if: matrix.platform == 'wasm32-wasi'
+      uses: bytecodealliance/actions/wasmtime/setup@9152e710e9f7182e4c29ad218e4f335a7b203613 # v1
 
     - name: Cross script
       env:
         PLATFORM: ${{matrix.platform}}
         AWS_EC2_METADATA_DISABLED: true
-      run: .travis/cross.sh
+        NEEDS_PREPARE_OUTPUTS_TRACT_BENCH_BRANCH_NAME: ${{ needs.prepare.outputs.tract_bench_branch_name }}
+        NEEDS_PREPARE_OUTPUTS_TEST_REF: ${{ needs.prepare.outputs.test_ref }}
+      run: |
+        export TRACT_BENCH_BRANCH_NAME="${NEEDS_PREPARE_OUTPUTS_TRACT_BENCH_BRANCH_NAME}"
+        export GITHUB_SHA="${NEEDS_PREPARE_OUTPUTS_TEST_REF}"
+        .travis/cross.sh
 
   apple:
+    needs: prepare
     strategy:
       fail-fast: false
       matrix:
@@ -88,11 +141,15 @@ jobs:
       contents: read
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        ref: ${{ needs.prepare.outputs.test_ref }}
+        fetch-depth: 0
+        persist-credentials: false
 
     - name: Configure AWS Credentials
       continue-on-error: true
-      uses: aws-actions/configure-aws-credentials@v6
+      uses: aws-actions/configure-aws-credentials@d979d5b3a71173a29b74b5b88418bfda9437d885 # v6
       with:
         role-to-assume: arn:aws:iam::567805100031:role/github-runner-tract-ci
         aws-region: us-east-2
@@ -104,4 +161,9 @@ jobs:
     - name: Cross script
       env:
         PLATFORM: ${{matrix.platform}}
-      run: .travis/cross.sh
+        NEEDS_PREPARE_OUTPUTS_TRACT_BENCH_BRANCH_NAME: ${{ needs.prepare.outputs.tract_bench_branch_name }}
+        NEEDS_PREPARE_OUTPUTS_TEST_REF: ${{ needs.prepare.outputs.test_ref }}
+      run: |
+        export TRACT_BENCH_BRANCH_NAME="${NEEDS_PREPARE_OUTPUTS_TRACT_BENCH_BRANCH_NAME}"
+        export GITHUB_SHA="${NEEDS_PREPARE_OUTPUTS_TEST_REF}"
+        .travis/cross.sh
diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml
index 507f1f32dd..bc48fc6c58 100644
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -10,6 +10,9 @@ env:
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
   RUSTUP_TOOLCHAIN: 1.91.0
 
+permissions:
+  contents: read
+
 jobs:
   examples:
     runs-on: ubuntu-latest
@@ -17,7 +20,9 @@ jobs:
       examples: ${{steps.set-matrix.outputs.examples}}
 
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
       - id: set-matrix
         run: |
           echo examples=`find examples -name ci.sh | cut -d/ -f 2 | jq -Rsc '. / "\n" - [""]'` >> "$GITHUB_OUTPUT"
@@ -26,18 +31,23 @@ jobs:
     name: ${{ matrix.ex }}
     runs-on: ubuntu-latest
     needs: examples
+    permissions:
+      id-token: write
+      contents: read
     strategy:
       fail-fast: false
       matrix:
         ex: ${{fromJSON(needs.examples.outputs.examples)}}
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
 
     - name: Configure AWS Credentials
       # if: github.repository == 'sonos/tract'
       continue-on-error: true
-      uses: aws-actions/configure-aws-credentials@v6
+      uses: aws-actions/configure-aws-credentials@d979d5b3a71173a29b74b5b88418bfda9437d885 # v6
       with:
         role-to-assume: arn:aws:iam::567805100031:role/github-runner-tract-ci
         aws-region: us-east-2
@@ -45,17 +55,20 @@ jobs:
     - name: example tests
       env:
         AWS_EC2_METADATA_DISABLED: true
+        MATRIX_EX: ${{matrix.ex}}
       timeout-minutes: 30
       run: |
-        cd examples/${{matrix.ex}}
+        cd examples/${MATRIX_EX}
         ./ci.sh
 
   build-tract-cli:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
       - run: cargo build -p tract-cli --profile opt-no-lto
-      - uses: actions/upload-artifact@v7
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
         with:
           name: tract-cli-x86_64
           path: ./target/opt-no-lto/tract
@@ -63,9 +76,11 @@ jobs:
   build-tract-cli-macos:
     runs-on: macOS
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
       - run: cargo build -p tract-cli --profile opt-no-lto
-      - uses: actions/upload-artifact@v7
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
         with:
           name: tract-cli-aarch64-apple
           path: ./target/opt-no-lto/tract
@@ -76,7 +91,9 @@ jobs:
       examples: ${{steps.set-matrix.outputs.examples}}
 
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
       - id: set-matrix
         run: |
           echo examples=`find examples -name ci-gpu.sh | cut -d/ -f 2 | jq -Rsc '. / "\n" - [""]'` >> "$GITHUB_OUTPUT"
@@ -91,9 +108,11 @@ jobs:
         ex: ${{fromJSON(needs.gpu-examples.outputs.examples)}}
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
 
-    - uses: actions/download-artifact@v8
+    - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
       with:
         name: tract-cli-x86_64
         path: target/opt-no-lto
@@ -102,8 +121,10 @@ jobs:
 
     - name: GPU example tests
       timeout-minutes: 60
+      env:
+        EX: ${{ matrix.ex }}
       run: |
-        cd examples/${{matrix.ex}}
+        cd "examples/$EX"
         ./ci-gpu.sh
 
   gpu-example-metal:
@@ -116,9 +137,11 @@ jobs:
         ex: ${{fromJSON(needs.gpu-examples.outputs.examples)}}
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
 
-    - uses: actions/download-artifact@v8
+    - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
       with:
         name: tract-cli-aarch64-apple
         path: target/opt-no-lto
@@ -128,5 +151,7 @@ jobs:
     - name: Metal GPU example tests
       timeout-minutes: 60
       run: |
-        cd examples/${{matrix.ex}}
+        cd examples/${MATRIX_EX}
         ./ci-gpu.sh
+      env:
+        MATRIX_EX: ${{matrix.ex}}
diff --git a/.github/workflows/full.yml b/.github/workflows/full.yml
index 7b4030a366..fba7c61ce5 100644
--- a/.github/workflows/full.yml
+++ b/.github/workflows/full.yml
@@ -14,6 +14,9 @@ env:
   CARGO_INCREMENTAL: false
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
 
+permissions:
+  contents: read
+
 jobs:
   prepare:
     runs-on: ubuntu-latest
@@ -21,7 +24,7 @@ jobs:
       test_ref: ${{ steps.set.outputs.test_ref }}
     steps:
       - id: set
-        uses: actions/github-script@v9
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9
         with:
           script: |
             const prInput = context.payload.inputs?.pr_number;
@@ -48,14 +51,15 @@ jobs:
       contents: read
     needs: prepare
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
 
     - name: Configure AWS Credentials
       continue-on-error: true
-      uses: aws-actions/configure-aws-credentials@v6
+      uses: aws-actions/configure-aws-credentials@d979d5b3a71173a29b74b5b88418bfda9437d885 # v6
       with:
         role-to-assume: arn:aws:iam::567805100031:role/github-runner-tract-ci
         aws-region: us-east-2
@@ -68,10 +72,11 @@ jobs:
     needs: prepare
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
     - name: Full test 
       env:
         AWS_EC2_METADATA_DISABLED: true
@@ -85,10 +90,11 @@ jobs:
         opset: [1_4_1, 1_5_0, 1_6_0, 1_7_0, 1_8_1, 1_9_0, 1_10_2, 1_11_0, 1_12_0, 1_13_0, 1_14_1, 1_15_0, 1_16_2, 1_17_0, 1_18_0, 1_19_1]
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
     - name: Full test
       run: .travis/onnx-tests.sh ${{ matrix.opset }}
 
@@ -97,10 +103,11 @@ jobs:
     needs: prepare
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
     - name: Full test
       run: .travis/tflite.sh
 
@@ -109,10 +116,11 @@ jobs:
     needs: prepare
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
 
     - name: With assertions
       run: |
@@ -124,10 +132,11 @@ jobs:
     needs: prepare
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
     - name: Without default features
       run: |
         rustup update
@@ -138,10 +147,11 @@ jobs:
     needs: prepare
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
     - name: With complexes
       run: |
         rustup update
@@ -152,10 +162,11 @@ jobs:
     needs: prepare
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
     - name: Check all targets
       run: |
         ROOT=$(pwd) ./.travis/ci-system-setup.sh
@@ -166,10 +177,11 @@ jobs:
     needs: prepare
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
     - name: C smoke tests
       run: |
         cd api/c
@@ -181,17 +193,18 @@ jobs:
     needs: prepare
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
       with:
         ref: ${{ needs.prepare.outputs.test_ref }}
         fetch-depth: 0
+        persist-credentials: false
     - name: Setup Python
-      uses: actions/setup-python@v6
+      uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
       with:
         python-version: "3.13"
 
     - name: Install uv
-      uses: astral-sh/setup-uv@v7
+      uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
 
     - name: Pytest bindings
       timeout-minutes: 60
diff --git a/.github/workflows/large_models.yml b/.github/workflows/large_models.yml
index 702ed26e57..05d29e07ec 100644
--- a/.github/workflows/large_models.yml
+++ b/.github/workflows/large_models.yml
@@ -8,7 +8,10 @@ on:
 
 env:
   LARGE_MODELS: true
-  
+
+permissions:
+  contents: read
+
 jobs:
   cli: 
     name: Build tract on ${{ matrix.os }}
@@ -17,12 +20,14 @@ jobs:
       matrix:
         os: [ macos-latest, ubuntu-latest ]
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
       - run: |
           ROOT=. ./.travis/ci-system-setup.sh
-          cargo build -p tract-cli --profile opt-no-lto --no-default-features --features transformers
+          cargo build -p tract-cli --profile opt-no-lto --no-default-features --features transformers,pulse,cuda-13000
       - run: echo uname=$(uname) >> $GITHUB_ENV
-      - uses: actions/upload-artifact@v7
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
         with:
           name: tract-cli-${{env.uname}}
           path: ./target/opt-no-lto/tract
@@ -79,31 +84,38 @@ jobs:
       contents: read
 
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
     - name: Configure AWS Credentials
       continue-on-error: true
-      uses: aws-actions/configure-aws-credentials@v6
+      uses: aws-actions/configure-aws-credentials@d979d5b3a71173a29b74b5b88418bfda9437d885 # v6
       with:
         role-to-assume: arn:aws:iam::567805100031:role/github-runner-tract-ci
         aws-region: us-east-2
     - run: echo uname=$(uname) >> $GITHUB_ENV
-    - uses: actions/download-artifact@v8
+    - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
       with:
         name: tract-cli-${{env.uname}}
         path: tract-cli-${{env.uname}}
 
     - name: Download and run
+      env:
+        UNAME: ${{ env.uname }}
+        RT_MATRIX: ${{ matrix.rt }}
+        MODEL: ${{ matrix.model }}
+        Q: ${{ matrix.q }}
       run: |
-        chmod +x tract-cli-${{env.uname}}/tract
-        export TRACT_RUN=$GITHUB_WORKSPACE/tract-cli-${{env.uname}}/tract
-        if [ "${{matrix.rt}}" = "gpu" ]
+        chmod +x "tract-cli-${UNAME}/tract"
+        export TRACT_RUN="$GITHUB_WORKSPACE/tract-cli-${UNAME}/tract"
+        if [ "$RT_MATRIX" = "gpu" ]
         then
           case $(uname) in
             Darwin) RT=metal;;
             Linux) RT=cuda;;
           esac
         fi
-        .travis/test-llm.sh ${{matrix.model}} ${{matrix.q}} $RT
+        .travis/test-llm.sh "$MODEL" "$Q" "$RT"
 
   parakeet-tdt-600m-v3:
     name: ${{matrix.os}} / Parakeet TDT 600m v3
@@ -117,16 +129,48 @@ jobs:
       contents: read
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
     - run: echo uname=$(uname) >> $GITHUB_ENV
-    - uses: actions/download-artifact@v8
+    - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
       with:
         name: tract-cli-${{env.uname}}
         path: tract-cli-${{env.uname}}
 
     - name: Download and run
+      env:
+        UNAME: ${{ env.uname }}
       run: |
-        chmod +x tract-cli-${{env.uname}}/tract
-        export TRACT_RUN=$GITHUB_WORKSPACE/tract-cli-${{env.uname}}/tract
+        chmod +x "tract-cli-${UNAME}/tract"
+        export TRACT_RUN="$GITHUB_WORKSPACE/tract-cli-${UNAME}/tract"
         ./harness/parakeet-tdt-600m-v3/ci.sh
 
+  nemotron-speech-streaming-en-06b:
+    name: ${{matrix.os}} / Nemotron speech streaming en 0.6b
+    needs: [ cli ]
+    strategy:
+      matrix:
+        os: [ macOS, cuda-lovelace ]
+      fail-fast: false
+    permissions:
+      id-token: write
+      contents: read
+    runs-on: ${{ matrix.os }}
+    steps:
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
+    - run: echo uname=$(uname) >> $GITHUB_ENV
+    - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
+      with:
+        name: tract-cli-${{env.uname}}
+        path: tract-cli-${{env.uname}}
+
+    - name: Download and run
+      env:
+        UNAME: ${{ env.uname }}
+      run: |
+        chmod +x "tract-cli-${UNAME}/tract"
+        export TRACT_RUN="$GITHUB_WORKSPACE/tract-cli-${UNAME}/tract"
+        ./harness/nemotron-speech-streaming-en-0.6b/ci.sh
diff --git a/.github/workflows/pydoc.yml b/.github/workflows/pydoc.yml
index 222f7f0256..f362fb08b0 100644
--- a/.github/workflows/pydoc.yml
+++ b/.github/workflows/pydoc.yml
@@ -8,27 +8,34 @@ on:
 env:
   CARGO_INCREMENTAL: false
 
+permissions:
+  contents: read
+
 jobs:
   build_doc:
     name: Build doc
     runs-on: ubuntu-latest
     if: github.repository == 'sonos/tract'
+    permissions:
+      contents: write
 
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
 
       - name: Install Rust toolchain
-        uses: dtolnay/rust-toolchain@stable
+        uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable
 
       - name: Set up Python
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
         with:
           python-version: "3.12"
 
       - name: Extract version tag
         id: version
         if: github.event_name == 'release' && github.event.action == 'published'
-        run: echo value=$(echo ${{ github.ref }} | cut -f 3 -d / | sed 's/^v//' ) >> $GITHUB_OUTPUT
+        run: echo value=$(echo ${GITHUB_REF} | cut -f 3 -d / | sed 's/^v//' ) >> $GITHUB_OUTPUT
 
       - name: Build doc
         run: |
@@ -48,7 +55,7 @@ jobs:
           git config user.name "CI bot"
           git config user.email ci-bot@tract.rs
 
-          version="${{ steps.version.outputs.value }}"
+          version="${STEPS_VERSION_OUTPUTS_VALUE}"
           if [ -z "$version" ]; then
             version="dev"
           fi
@@ -86,3 +93,5 @@ jobs:
           # clean up worktree
           cd -
           git worktree remove "$workdir"
+        env:
+          STEPS_VERSION_OUTPUTS_VALUE: ${{ steps.version.outputs.value }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 6302568016..3c62678048 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -9,19 +9,26 @@ env:
   CARGO_INCREMENTAL: false
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
 
+permissions:
+  contents: read
+
 jobs:
   release:
     name: Create release
     runs-on: ubuntu-latest
+    permissions:
+      contents: write
     steps:
       - name: Extract version tag
         id: version
-        run: echo value=$(echo ${{ github.ref }} | cut -f 3 -d / | sed 's/^v//' ) >> $GITHUB_OUTPUT
+        run: echo value=$(echo ${GITHUB_REF} | cut -f 3 -d / | sed 's/^v//' ) >> $GITHUB_OUTPUT
 
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
 
       - name: Create Release
-        uses: softprops/action-gh-release@v3
+        uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3
         with:
           name: tract ${{ steps.version.outputs.value }}
         env:
diff --git a/.github/workflows/tract-ci-bench.yml b/.github/workflows/tract-ci-bench.yml
index 23a81bafda..36d5153334 100644
--- a/.github/workflows/tract-ci-bench.yml
+++ b/.github/workflows/tract-ci-bench.yml
@@ -5,6 +5,9 @@ on:
     - cron:  '1 * * * *' # every hour at minute 1
   workflow_dispatch:
 
+permissions:
+  contents: read
+
 jobs:
   minion:
     strategy:
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 7713b63449..e9cd14640a 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -23,6 +23,9 @@ env:
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
   MACOSX_DEPLOYMENT_TARGET: 10.13
 
+permissions:
+  contents: read
+
 jobs:
   build_wheels:
     name: Build wheels on ${{ matrix.os }}
@@ -33,30 +36,34 @@ jobs:
         os: [ubuntu-22.04, windows-2022, macos-14]
 
     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          persist-credentials: false
         
       - name: Setup | Rust
-        uses: dtolnay/rust-toolchain@stable
+        uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable
 
-      - uses: actions/setup-python@v6
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
         with:
           python-version: "3.13"
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          enable-cache: false
 
       - name: Install rust toolchains
         if: startsWith(matrix.os, 'macOS')
         run: rustup target install x86_64-apple-darwin aarch64-apple-darwin
 
       - name: Build wheels
-        uses: nick-fields/retry@v4
+        uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4
         with:
           max_attempts: 1
           timeout_seconds: 54000 # 15 hours :/
           command: uvx cibuildwheel --output-dir wheelhouse api/py
 
-      - uses: actions/upload-artifact@v7
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
         with:
           name: wheels-${{github.run_id}}-${{matrix.os}}
           path: ./wheelhouse/*.whl
@@ -65,15 +72,19 @@ jobs:
     name: Make SDist
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
 
     - name: Install uv
-      uses: astral-sh/setup-uv@v7
+      uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+      with:
+        enable-cache: false
 
     - name: Build SDist
       run: cd api/py && uv build --sdist
 
-    - uses: actions/upload-artifact@v7
+    - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
       with:
         name: wheels-${{github.run_id}}-src
         path: api/py/dist/*.tar.gz
@@ -84,13 +95,13 @@ jobs:
     if: (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')) || inputs.publish
 
     steps:
-    - uses: actions/download-artifact@v8
+    - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
       with:
         pattern: wheels-${{github.run_id}}-*
         merge-multiple: true
         path: dist
 
-    - uses: pypa/gh-action-pypi-publish@v1.14.0
+    - uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
       with:
         user: __token__
         password: ${{ secrets.PYPI }}
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 92fb40d684..00c57a32f0 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -11,6 +11,9 @@ env:
   CARGO_INCREMENTAL: false
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE20: true
 
+permissions:
+  contents: read
+
 jobs:
   windows:
     strategy:
@@ -22,8 +25,10 @@ jobs:
     runs-on: ${{ matrix.os }}
 
     steps:
-    - uses: actions/checkout@v6
-    - uses: nick-fields/retry@v4
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      with:
+        persist-credentials: false
+    - uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4
       name: Install Rustup using win.rustup.rs
       with:
         timeout_minutes: 10
@@ -34,7 +39,7 @@ jobs:
             $ProgressPreference = "SilentlyContinue"
             Invoke-WebRequest https://win.rustup.rs/ -OutFile rustup-init.exe
             .\rustup-init.exe -y --default-host=x86_64-pc-windows-msvc --profile=minimal
-    - uses: nick-fields/retry@v4
+    - uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4
       name: Install the target
       with:
         timeout_minutes: 10
@@ -44,7 +49,7 @@ jobs:
             rustup toolchain add stable-x86_64-pc-windows-${{matrix.toolchain}}
             rustup default stable-x86_64-pc-windows-${{matrix.toolchain}}
     - name: Install LLVM and Clang
-      uses: KyleMayes/install-llvm-action@v2
+      uses: KyleMayes/install-llvm-action@ebc0426251bc40c7cd31162802432c68818ab8f0 # v2
       with:
         version: "11.0"
     - name: debug
@@ -54,7 +59,7 @@ jobs:
     - name: debug bin
       run: dir "C:\\Program Files\\LLVM\\bin"
     - name: top level cargo check
-      run: cargo check --workspace --exclude test-blas --exclude tract-metal --exclude test-metal --exclude causal_llm
+      run: cargo check --workspace --exclude tract-metal --exclude test-metal --exclude causal_llm
       env:
         LIBCLANG_PATH: "C:\\Program Files\\LLVM\\bin"
     - name: data / linalg / core / nnef / onnx / onnx-opl
diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml
new file mode 100644
index 0000000000..fcc833cbfc
--- /dev/null
+++ b/.github/workflows/zizmor.yml
@@ -0,0 +1,32 @@
+name: Workflow audit (zizmor)
+
+on:
+  pull_request:
+    paths:
+      - '.github/workflows/**'
+      - '.github/dependabot.yml'
+      - '.github/workflows/zizmor.yml'
+  push:
+    branches: [ main ]
+    paths:
+      - '.github/workflows/**'
+      - '.github/dependabot.yml'
+  schedule:
+    - cron: '0 4 * * MON'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  security-events: write
+
+jobs:
+  zizmor:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Run zizmor 🌈
+        uses: zizmorcore/zizmor-action@5f14fd08f7cf1cb1609c1e344975f152c7ee938d # v0.5.6
diff --git a/.travis/bench-compare.sh b/.travis/bench-compare.sh
new file mode 100755
index 0000000000..531847724b
--- /dev/null
+++ b/.travis/bench-compare.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+# .travis/bench-compare.sh <ref-a> <ref-b>
+#
+# Build tract for two git references, run bundle-entrypoint.sh for each,
+# and print a side-by-side evaltime comparison.
+#
+# Only net.*.evaltime.* and llm.*.(pp|tg)*.* metrics are shown.
+# Forwarded env vars:
+#   CACHEDIR    model cache directory (default: ~/.cache/tract-ci-minion-models)
+#   BENCH_OPTS  extra bench flags passed through to tract
+
+set -euo pipefail
+
+REF_A=${1:?'usage: bench-compare.sh <ref-a> <ref-b>'}
+REF_B=${2:?'usage: bench-compare.sh <ref-a> <ref-b>'}
+
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd)
+CACHEDIR=${CACHEDIR:-$HOME/.cache/tract-ci-minion-models}
+
+resolve_ref() {
+    local ref=$1
+    [[ $ref =~ ^[0-9]{4}$ ]] && ref="refs/pull/$ref/head"
+    local sha
+    sha=$(git -C "$REPO_ROOT" rev-parse --verify "$ref^{commit}" 2>/dev/null) \
+        && { echo "$sha"; return; }
+    if [[ $ref == refs/pull/*/head || $ref == refs/pull/*/merge ]]; then
+        echo "==> fetching $ref" >&2
+        git -C "$REPO_ROOT" fetch origin "$ref:$ref" >&2 \
+            || { echo "error: could not fetch '$ref' from origin" >&2; return 1; }
+        sha=$(git -C "$REPO_ROOT" rev-parse --verify "$ref^{commit}" 2>/dev/null) \
+            && { echo "$sha"; return; }
+    fi
+    echo "error: '$ref' does not resolve to a commit" >&2
+    return 1
+}
+ok=1
+COMMIT_A=$(resolve_ref "$REF_A") || ok=0
+COMMIT_B=$(resolve_ref "$REF_B") || ok=0
+[ $ok -eq 1 ] || exit 1
+
+WORK=$(mktemp -d /tmp/bench-compare.XXXXXXXX)
+
+cleanup() {
+    git -C "$REPO_ROOT" worktree remove --force "$WORK/wt_a" 2>/dev/null || true
+    git -C "$REPO_ROOT" worktree remove --force "$WORK/wt_b" 2>/dev/null || true
+    rm -rf "$WORK"
+}
+trap cleanup EXIT
+
+# ── build ─────────────────────────────────────────────────────────────────────
+
+build_ref() {
+    local label=$1 ref=$2
+    local wt="$WORK/wt_$label"
+    echo "==> worktree $ref" >&2
+    git -C "$REPO_ROOT" worktree add --detach "$wt" "$ref" >&2
+    echo "==> build $ref" >&2
+    cargo build --manifest-path "$wt/Cargo.toml" \
+        -p tract-cli -q --release \
+        --target-dir "$WORK/target_$label" >&2
+    echo "$WORK/target_$label/release/tract"
+}
+
+TRACT_A=$(build_ref a "$COMMIT_A")
+TRACT_B=$(build_ref b "$COMMIT_B")
+
+# ── bench ─────────────────────────────────────────────────────────────────────
+
+run_bench() {
+    local label=$1 tract=$2
+    local rundir="$WORK/run_$label"
+    mkdir -p "$rundir"
+    echo "==> bench [$label] $REF_A / $REF_B" >&2
+    (
+        cd "$rundir"
+        export TRACT_RUN="$tract"
+        export CACHEDIR="$CACHEDIR"
+        [ -n "${BENCH_OPTS:-}" ] && export BENCH_OPTS
+        bash "$SCRIPT_DIR/bundle-entrypoint.sh"
+    ) 2>&1 | sed "s/^/  [$label] /" >&2
+    if [ ! -f "$rundir/metrics" ]; then
+        echo "error: bench run failed for '$label' — no metrics produced" >&2
+        return 1
+    fi
+    echo "$rundir/metrics"
+}
+
+METRICS_A=$(run_bench a "$TRACT_A")
+METRICS_B=$(run_bench b "$TRACT_B")
+
+# ── comparison table ──────────────────────────────────────────────────────────
+
+filter_eval() {
+    # evaltime (ns), pp* and tg* (llm tokens/s or ms/token)
+    grep -E '\.(evaltime|pp[0-9]+|tg[0-9]+)\.' "$1" || true
+}
+
+declare -A va vb
+while read -r k v; do va[$k]=$v; done < <(filter_eval "$METRICS_A")
+while read -r k v; do vb[$k]=$v; done < <(filter_eval "$METRICS_B")
+
+all_keys=$(
+    { filter_eval "$METRICS_A"; filter_eval "$METRICS_B"; } \
+        | awk '{print $1}' | sort -u
+)
+
+if [ -t 1 ]; then
+    RED='\033[0;31m' GRN='\033[0;32m' YEL='\033[1;33m' RST='\033[0m'
+else
+    RED='' GRN='' YEL='' RST=''
+fi
+
+hrule() { printf '%.0s-' $(seq 1 "$1"); }
+
+W=62
+printf "\n"
+printf "  %-${W}s  %13s  %13s  %8s  %7s\n" "metric" "$REF_A" "$REF_B" "ratio" "delta%"
+printf "  %-${W}s  %13s  %13s  %8s  %7s\n" \
+    "$(hrule $W)" "$(hrule 13)" "$(hrule 13)" "$(hrule 8)" "$(hrule 7)"
+
+while IFS= read -r key; do
+    [[ -z $key ]] && continue
+    a_val=${va[$key]:-}
+    b_val=${vb[$key]:-}
+    if [[ -n $a_val && -n $b_val ]]; then
+        read -r ratio delta colour < <(awk -v a="$a_val" -v b="$b_val" 'BEGIN {
+            if (a == 0) { print "N/A N/A YEL"; exit }
+            r = b / a
+            c = (r > 1.05) ? "RED" : (r < 0.95) ? "GRN" : "RST"
+            printf "%.4f %+.1f %s\n", r, (r-1)*100, c
+        }')
+        case $colour in
+            RED) c=$RED ;;
+            GRN) c=$GRN ;;
+            YEL) printf "  ${YEL}%-${W}s  %13s  %13s  %8s  %7s${RST}\n" \
+                     "$key" "$a_val" "$b_val" "N/A" "N/A"; continue ;;
+            *)   c=$RST ;;
+        esac
+        printf "  ${c}%-${W}s  %13s  %13s  %8s  %6s%%${RST}\n" \
+            "$key" "$a_val" "$b_val" "$ratio" "$delta"
+    else
+        printf "  ${YEL}%-${W}s  %13s  %13s  %8s  %7s${RST}\n" \
+            "$key" "${a_val:-N/A}" "${b_val:-N/A}" "N/A" "N/A"
+    fi
+done <<< "$all_keys"
+
+printf "\n  evaltime in ns; llm pp/tg units from tract llm-bench output\n"
+printf "  green = REF_B faster (ratio < 0.95)   red = REF_B slower (ratio > 1.05)\n\n"
diff --git a/.travis/bundle-entrypoint.sh b/.travis/bundle-entrypoint.sh
index b6d1e891bf..63cb030275 100755
--- a/.travis/bundle-entrypoint.sh
+++ b/.travis/bundle-entrypoint.sh
@@ -148,7 +148,7 @@ net_bench voicecom_float 2sec $CACHEDIR/snips-voice-commands-cnn-float.pb -i 200
 net_bench trunet pulse1_f32 $CACHEDIR/trunet_dummy.nnef.tgz --nnef-tract-core --pulse 1
 net_bench trunet pulse1_f16 $CACHEDIR/trunet_dummy.nnef.tgz --nnef-tract-core -t f32_to_f16 --pulse 1
 
-. $PRIVATE
+[ -f "$PRIVATE" ] && . "$PRIVATE"
 
 if [ $(uname) = "Darwin" ]
 then
diff --git a/.travis/cli-tests.sh b/.travis/cli-tests.sh
index 3d6d11be38..cb4ca3fa79 100755
--- a/.travis/cli-tests.sh
+++ b/.travis/cli-tests.sh
@@ -26,6 +26,16 @@ do
     $t
 done
 
+echo
+echo $WHITE • harness/pulse-multi-axis $NC
+echo
+
+for t in `find harness/pulse-multi-axis -name runme.sh`
+do
+    echo $WHITE$t$NC
+    $t
+done
+
 echo
 echo $WHITE • onnx/test_cases $NC
 echo
@@ -114,6 +124,8 @@ $TRACT_RUN $MODELS/hey_snips_v4_model17.pb -i S,20,f32 \
 
 $CACHE_FILE trunet_dummy.nnef.tgz
 $TRACT_RUN --nnef-tract-core $MODELS/trunet_dummy.nnef.tgz dump -q
+$TRACT_RUN --nnef-tract-core $MODELS/trunet_dummy.nnef.tgz --pulse 1 \
+    compare --stream --allow-random-input -q
 
 echo $WHITE     LLM $NC
 
diff --git a/.travis/cross.sh b/.travis/cross.sh
index 5c61a6a594..805cd7aece 100755
--- a/.travis/cross.sh
+++ b/.travis/cross.sh
@@ -86,11 +86,19 @@ case "$PLATFORM" in
 
     "aarch64-unknown-linux-gnu-stretch" | "armv7-unknown-linux-gnueabihf-stretch" | "x86_64-unknown-linux-gnu-stretch")
         INNER_PLATFORM=${PLATFORM%-stretch}
+        # aarch64 stretch bench targets Jetson-class boxes that ship with CUDA 12;
+        # the default CUDA 13 cudarc binding wouldn't run there (cudart symbol
+        # rename across the 12/13 boundary).  Force cuda-12000 for that build only.
+        CUDA_FEATURE_ENV=""
+        if [ "$PLATFORM" = "aarch64-unknown-linux-gnu-stretch" ]
+        then
+            CUDA_FEATURE_ENV="-e TRACT_CUDA_FEATURE=cuda-12000"
+        fi
         (cd .travis/docker-debian-stretch; docker build --tag debian-stretch .)
         docker run -v `pwd`:/tract -w /tract \
             -e CI=true \
             -e SKIP_QEMU_TEST=skip \
-            -e PLATFORM=$INNER_PLATFORM debian-stretch \
+            -e PLATFORM=$INNER_PLATFORM $CUDA_FEATURE_ENV debian-stretch \
             ./.travis/cross.sh
         sudo chown -R `whoami` .
         export RUSTC_TRIPLE=$INNER_PLATFORM
@@ -196,7 +204,16 @@ case "$PLATFORM" in
 
         cargo dinghy --platform $PLATFORM $DINGHY_TEST_ARGS check -p tract-ffi
         # keep lto for these two are they're going to devices.
-        cargo dinghy --platform $PLATFORM build --release -p tract-cli -p example-tensorflow-mobilenet-v2
+        if [ -n "$TRACT_CUDA_FEATURE" ]
+        then
+            cargo dinghy --platform $PLATFORM build --release \
+                --no-default-features \
+                --features "onnx,tf,pulse,pulse-opl,tflite,transformers,extra,$TRACT_CUDA_FEATURE" \
+                -p tract-cli
+            cargo dinghy --platform $PLATFORM build --release -p example-tensorflow-mobilenet-v2
+        else
+            cargo dinghy --platform $PLATFORM build --release -p tract-cli -p example-tensorflow-mobilenet-v2
+        fi
         ;;
 
     wasm32-wasi)
diff --git a/.travis/make_bundle.sh b/.travis/make_bundle.sh
index cd2a7dd11c..75dd9bc6b7 100755
--- a/.travis/make_bundle.sh
+++ b/.travis/make_bundle.sh
@@ -2,8 +2,14 @@
 
 set -ex
 
-TRAVIS_COMMIT=${GITHUB_SHA:-$(git rev-parse HEAD 2>/dev/null || echo dummy-commit-id)}
-BRANCH=${GITHUB_HEAD_REF:-$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo main)}
+if [ -n "$GITHUB_ACTIONS" ]
+then
+    TRAVIS_COMMIT=${GITHUB_SHA:-dummy-commit-id}
+    BRANCH=${TRACT_BENCH_BRANCH_NAME:-${GITHUB_HEAD_REF:-main}}
+else
+    TRAVIS_COMMIT=$(git rev-parse HEAD 2>/dev/null || echo dummy-commit-id)
+    BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo main)
+fi
 BRANCH=$(echo $BRANCH | tr '/' '_')
 PLATFORM=${PLATFORM:-dummy-platform}
 
diff --git a/.travis/test-llm.sh b/.travis/test-llm.sh
index def9fd36e5..5d2b152d33 100755
--- a/.travis/test-llm.sh
+++ b/.travis/test-llm.sh
@@ -93,7 +93,9 @@ then
     fi
 fi
 
-$TRACT_RUN -v --nnef-tract-transformers $MODELS/$nnef -O --readings  --assert-maximal-mm-quality-cost 0 $TRACT_EXTRA_ARGS dump -q
+$TRACT_RUN -v --nnef-tract-transformers $MODELS/$nnef -O --readings  --assert-maximal-mm-quality-cost 0 $TRACT_EXTRA_ARGS dump -q \
+    --assert-op-count Rsqrt 0 \
+    --assert-op-count MeanOfSquares 0
 alloc_max=$(cat readings.out | tail -n +2 | awk '{print $10-$11}' | sort -n | tail -1)
 ratio=$((alloc_max * 100 / size))
 
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000000..4692ec13fb
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,283 @@
+# tract -- Agent Guide
+
+tract is Sonos' neural-network inference engine written in Rust.
+It reads ONNX, NNEF, TensorFlow Lite, and TensorFlow models, optimises them,
+and runs them on CPU (x86/ARM), GPU (Metal, CUDA), embedded targets, and WASM.
+
+This file is the operational quick reference. For conceptual background not
+derivable from the source, see [`doc/`](doc/):
+
+- [`doc/intro.md`](doc/intro.md) — tract-OPL design, translate-time vs runtime split
+- [`doc/pipeline.md`](doc/pipeline.md) — load → optimise → run, and the `Runtime` trait
+- [`doc/symbolic-shapes.md`](doc/symbolic-shapes.md) — `TDim`, `Symbol`, and how to bind them
+- [`doc/graph.md`](doc/graph.md) — Graph, Node, Outlet, Fact, model pipeline
+- [`doc/op.md`](doc/op.md) — anatomy of an Op (`Op` / `EvalOp` / `TypedOp` / `InferenceOp`)
+- [`doc/cli-recipe.md`](doc/cli-recipe.md) — `tract` command-line cookbook
+- [`doc/kernel-notes.md`](doc/kernel-notes.md) — tract-linalg kernels and debugging
+- [`doc/nnef/`](doc/nnef/) — reference schemas for the `tract_*` NNEF extensions
+
+---
+
+## Crate map
+
+| Crate | Purpose | Depends on |
+|---|---|---|
+| `data` | `Tensor`, `DType`, `TractResult`, low-level storage | _(none)_ |
+| `linalg` | Micro-kernel dispatch (BLAS-style, hand-rolled SIMD) | `data` |
+| `core` | `TypedModel`, op trait, passes, rewriter, `TypedModelPatch` | `linalg`, `data` |
+| `hir` | Untyped inference graph (pre-type-analysis) | `core` |
+| `nnef` | NNEF load/save, tract-OPL extensions, NNEF ser/de for core ops | `core` |
+| `onnx-opl` | tract-OPL extensions for ONNX-specific ops | `nnef`, `extra` |
+| `onnx` | ONNX importer | `nnef`, `hir`, `onnx-opl`, `extra` |
+| `tflite` | TensorFlow Lite importer | `core` |
+| `tensorflow` | TensorFlow importer | `hir`, `pulse` |
+| `pulse-opl` | Streaming op primitives | `nnef` |
+| `pulse` | Streaming / causal inference | `pulse-opl`, `transformers` |
+| `transformers` | Transformer-specific ops (RmsNorm, Silu, GeluApproximate, ...) | `nnef`¹ |
+| `gpu` | Shared GPU abstractions | `core`, `pulse-opl`, `transformers` |
+| `metal` | Apple Metal backend | `gpu`, `core`, `pulse-opl`, `transformers` |
+| `cuda` | NVIDIA CUDA backend | `gpu`, `core`, `pulse-opl`, `transformers` |
+| `extra` | Miscellaneous ops not yet in core | `nnef`, `pulse` |
+| `cli` / `libcli` | `tract` command-line tool | most of the above |
+| `api/rs` | High-level stable public Rust API | `nnef`, `onnx`, `pulse`, `transformers`, `metal`, `cuda`, ... |
+| `api/ffi` | C FFI over `api/rs` | `api/rs` |
+
+¹ `transformers` has no direct `tract-core` dep; import core types via `tract_nnef::tract_core`.
+
+---
+
+## Build and test
+
+```sh
+# build everything
+cargo build --workspace
+
+# test a single crate
+cargo test -p tract-core
+
+# test the whole workspace
+cargo test --workspace
+
+# format (always repo-wide, never per-crate; pin the toolchain to avoid spurious diffs)
+cargo +1.91.0 fmt --all
+
+# lint
+cargo clippy --workspace
+```
+
+The `harness/` directory contains integration tests that run against real
+models. `.travis/native.sh` runs the full native Linux CI suite; running it
+locally requires `libssl-dev` (needed by the `tflite` step).
+
+Synthetic NNEF tests under `harness/nnef-test-cases/`
+are driven by a `runme.sh` that calls the `tract` CLI with `--assert-output-bundle`
+against a reference `io.npz`. Add new cases there rather than as Rust integration
+tests. If the assertion you need isn't expressible through the CLI, extend the CLI.
+
+### Model inspection
+
+```sh
+# human-readable graph dump
+tract model.nnef.tgz dump
+
+# machine-readable — pipe to jq or python
+tract model.nnef.tgz dump --audit-json | jq '.nodes[] | select(.op_name == "Conv")'
+```
+
+Reach for `--audit-json` when scripting; the plain `dump` output is meant for
+humans and is awkward to parse.
+
+### test-rt
+
+`test-rt` is the cross-backend test framework. It separates test suites from
+runtimes: a suite (e.g. `suite-unit`, `suite-onnx`) defines a set of
+`Test`-trait objects; a runner crate (e.g. `test-unit-core`, `test-metal`,
+`test-cuda`) picks a `Runtime` implementation and runs a subset of those
+suites against it, with its own ignore list.
+
+Layout:
+
+| Crate | Role |
+|---|---|
+| `test-rt/infra` | `Test`, `TestSuite`, `Runtime` traits and test-runner harness |
+| `test-rt/suite-unit` | Unit tests for core ops (conv, einsum, matmul, ...) |
+| `test-rt/suite-onnx` | ONNX backend test suite |
+| `test-rt/test-unit-core` | Runs `suite-unit` on the default CPU runtime |
+| `test-rt/test-onnx-core` | Runs `suite-onnx` on the default CPU runtime |
+| `test-rt/test-metal` | Runs unit + onnx suites on the Metal backend |
+| `test-rt/test-cuda` | Runs unit + onnx suites on the CUDA backend |
+| `test-rt/test-f16` | Runs f16-specific cases |
+| `test-rt/test-tflite` | Runs suite against the TFLite runtime |
+| `test-rt/test-nnef-cycle` | Verifies NNEF round-trip for all suite cases |
+
+To add a new op test: add a case to the relevant `suite-*` crate. The runner
+crates pick it up automatically; add an ignore entry in the runner only if the
+backend genuinely cannot support the case.
+
+---
+
+## Core abstractions
+
+> **Client code** (applications, examples, language bindings) should use `api/rs`
+> only. The internal crates (`core`, `nnef`, `onnx`, ...) are not stable API surface.
+> When asked "is X part of the public API?", check `api/rs/src/lib.rs` — that is
+> the authoritative surface, not the internal crate's `pub` items.
+
+### key principle
+
+tract avoid specializing for one model or an application. Model-wide behaviours or
+optimisations should emerge from op-scoped manipulation composing together. There are
+pragmatic compromises: sometimes introducing "big" primitives than could be implemented 
+with atomic operators (Convolution, Attention, RmsNorm, ...) is unlocking optimisations.
+
+### TypedModel / TypedNode / TypedFact
+
+The main IR. A `TypedModel` is a DAG of `TypedNode`s. Each node holds a boxed
+`Op` and a list of `TypedFact` outputs (element type + symbolic shape).
+
+### Op trait (`core/src/ops/mod.rs`)
+
+Every op implements `Op` and usually `TypedOp`. Key methods:
+
+- `eval` -- eager execution
+- `output_facts` -- shape/type inference
+- `declutter` -- return a `TypedModelPatch` to simplify the op or replace it
+- `codegen` -- return a patch targeting a specific backend/platform
+
+## Model rewriting
+
+`TypedModelPatch` is the **preferred** way to modify a model. Direct mutation of
+`TypedModel` nodes or edges is an exception reserved for construction and
+well-understood bulk transforms -- in almost every other case, implement rule
+functions that return `TypedModelPatch`es, and wrap them in a `Rewriter` to
+define a `ModelTransform`.
+
+### TypedModelPatch (`core/src/model/patch.rs`)
+
+Surgical edits to a model. Build a patch, then `patch.apply(model)`.
+
+### Rewriter (`core/src/model/rewriter.rs`)
+
+Per-op declutter rules collected into a typed dispatch table:
+
+```rust
+Rewriter::default()
+    .with_rule_for::<MyOp>("rule-name", |ctx, model, node, name, op| {
+        rule_if!(some_condition);
+        rule_if_some!(x = maybe_value);
+        // build and return a TypedModelPatch
+        Ok(Some(patch))
+    })
+```
+
+Use `Rewriter` for rules that fire on a single op type. The three guard macros
+(defined in `core/src/transform.rs`) provide early-return ergonomics inside a
+`TractResult<Option<TypedModelPatch>>` body:
+
+| Macro | Exits when |
+|---|---|
+| `rule_if!(cond)` | `cond` is false |
+| `rule_if_let!(pat = expr)` | pattern does not match |
+| `rule_if_some!(pat = expr)` | value is `None` |
+
+### ModelTransform (`core/src/transform.rs`)
+
+Whole-model passes (e.g. float precision translation, block-quant folding).
+Implement the `ModelTransform` trait when you need to walk the entire graph
+rather than react to individual op types.
+
+### When to use which
+
+| Situation | Tool |
+|---|---|
+| Simplify / fuse one op type | `Op::declutter` + `TypedModelPatch` |
+| Cross-op pattern (N ops -> M ops) | `Rewriter` rule |
+| Whole-model structural change | `ModelTransform` |
+| Backend lowering for one op | `Op::codegen` + `TypedModelPatch` |
+
+---
+
+## Op detection via declutter
+
+Tract detects and fuses transformer ops during the declutter pass, not in a
+separate recognition pass:
+
+- `RmsNorm` -- detected in `Reduce::declutter`
+- `Silu` -- detected in `Sigmoid::declutter` (via `element_wise!` `; declutter:` param)
+- `GeluApproximate` -- detected in `Pow::declutter` (chained `declutter_pow`)
+
+---
+
+## Streaming and pulsification
+
+Streaming inference converts a `TypedModel` into a `PulsedModel`
+(`pulse/src/model.rs`) that processes a fixed-size chunk along one axis at each
+step.
+
+- **Streaming axis and pulse size.** Each op pulsifies through an impl in
+  `pulse/src/ops/`. The streaming axis is tracked node-by-node; an op that
+  reorders or merges axes must declare what the streaming axis becomes on its
+  output, otherwise pulsification fails or produces wrong delays.
+- **Delay.** Ops that need past context (conv, attention windows, banded masks)
+  insert a `Delay` op from `pulse-opl/src/delay.rs` to buffer earlier pulses.
+  The accumulated output delay is exposed as `pulse.delay` and used by the CLI
+  assertion path to skip warmup before comparing against a batch reference.
+- **ChangeAxes.** `core/src/optim/change_axes.rs` rewrites axis layout during
+  optimisation. Any change-axes interaction has to preserve the streaming axis
+  identity, so new ops that move axes around need a `change_axes` impl that
+  agrees with their pulsification.
+
+The streaming model has subtle invariants -- don't touch `pulse` / `pulse-opl`
+casually.
+
+---
+
+## NNEF serialisation
+
+NNEF ser/de for core ops lives in `nnef/src/ops/core/`. Ops are registered
+with a primary `tract_core_*` name and backward-compatible `tract_transformers_*`
+aliases where needed.
+
+Re-export shims in `transformers/src/ops/mod.rs` keep downstream crates
+(`cuda`, `metal`, `gpu`, `test-rt`) working without a direct `tract_core` dep.
+
+---
+
+## Commit hygiene
+
+- Run `cargo fmt --all` before every commit. Metal source files need formatting
+  too, even when building on Linux.
+
+---
+
+## Style
+
+  Commit messages:
+  - State what was wrong and the fix -- no consequence chains ("X broke Y broke Z").
+  - One short paragraph; skip "Result:/Consequence:/Symptom:" sections and laundry-lists of every place the bug surfaced.
+
+  Inline code comments:
+  - Default to none. Code MUST be self-explanatory via variable and function naming.
+  - In tract, an inline comment is a signal that something implicit is happening -- hidden constraint, non-obvious invariant, bug workaround.
+  - Existing files may carry stale or chatty comments; new contributions should not add to them.
+  - Comments describe the **current** code only. Don't narrate the diff ("the previous code did X", "this used to be Y", "was a copy-paste of the 32x1 kernel") -- that history belongs in the commit message and will be wrong after the next refactor.
+  - Avoid section banners (// -- Step 2: Pad -> Reshape --), prefer split in functions. It's ok to have long function prototype in private function (within reason) #[allow(clippy::too_many_arguments)] authorized in such case.
+
+  Idioms:
+  - Prefer `as_X()` over `to_X().ok()` for cheap reference-style conversions.
+  - No new `unsafe` without explicit permission. `shunt_outside_unchecked` is a last resort for surgical patches whose safety is locally obvious; reach for safe alternatives first.
+  - Don't add abstraction beyond the task. Three similar lines beat a premature helper.
+
+  Formatting:
+  - Always run `cargo +1.91.0 fmt --all` before committing -- bare `cargo fmt` uses a newer rustfmt and produces spurious diffs CI rejects.
+
+  PR comments and review replies:
+  - Open PR with a short, crystal-clear summary paragraph -- one or two sentences stating what the PR is about and why it matters, before details if necessary.
+  - Follow-up questions and comments on a PR must be handled by humans only, the maintainer is a human, they want to talk to an PR author, not prompt somebody's else LLM.
+
+## Things to avoid
+
+- **Clap extension traits** -- use the clap API directly, even with turbofish.
+- **Mocking internals in tests** -- prefer real model round-trips.
+- **Hand-rolling model-walk loops** -- reach for `Rewriter`, `ModelTransform`,
+  or `TypedModelPatch` instead.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60e75cb240..85f47820ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,60 @@
+# 0.23.0 — soon
+
+### CPU / linalg
+
+- **ARM SME backend (ARMv9.2-A).** New `linalg/arm64/sme` module provides SME GEMM (Phase 1) and SME2 GEMV (`sme_mmv_f32_64x1`, Phase 2A) micro-kernels for Apple M4+, Cortex-X4, and other ARMv9.2-A+ chips. Dispatch is gated on a 512-bit streaming vector length at runtime; SME2 assembler detection skips kernels when the assembler lacks support. Force via `TRACT_CPU_AARCH64_KIND=applem` (or `generic` to disable).
+- **Apple AMX: shape-aware dispatch.** AMX kernel selection is now M/N/K-aware at runtime, yielding 5–43% wins across canary models.
+- **NEON element-wise kernels.** `HardSwish`, `SiLU`, and `GELU` get dedicated aarch64 NEON kernels wired as single graph ops.
+- **x86_64.** M-aware kernel picker; AVX-512 GEMM routed to the 16×8 / 32×5 / 32×6 kernels.
+- **WASM SIMD.** Relaxed-SIMD FMA in all MMM kernels; 32×1 GEMV kernel (8 v128 accumulators, 8-way ILP); vectorised sigmoid and tanh; `rustfft wasm_simd` enabled. Low-accumulator MMM paths recover 8–23% under `+relaxed-simd`; M-band GEMV dispatch woken up (30–37% on small-M). `Executor::RayonGlobal` for `wasm-bindgen-rayon`.
+- **Multithreaded GEMM.** TLS borrow and sync hoisted out of the per-tile inner loop; 2D chunked dispatch with a small-MMM threshold avoids rayon overhead on small operands.
+- **im2col.** Contiguous-x fast path for valid (zero-padding, unit stride) convolutions; grouped lazy im2col extended; depthwise convolutions excluded from lazy im2col path; N=1/2/3 zone dispatch for depthwise.
+- **General.** Same-shape fast path in `BinMiniOp::generic_eval`; `rbytes=96/128` fast paths for mn-major packing.
+- **EinSum** Fold contiguous same-role axes in standard codegen.
+- **BLAS / SGemm integration dropped.**
+
+### ONNX
+
+- `Resize`: `pytorch_half_pixel` coordinate transformer.
+- `Reshape` with 0-dims and rank change fixed (issue #2104).
+
+### NNEF
+
+- Fix serialisation cycle for `ConvInteger`, `SameLower` padding convolution, and `QLinearConv`.
+- `tract_core` NNEF extension is now **opt-out**: the operator set is registered unconditionally; `--nnef-tract-core` is accepted as a no-op for backwards compatibility.
+
+### Pulse for chunked attention layers (experimental)
+
+- **`pulse::Blockify` rewrite pass.** Translates block-diagonal multi-time-axis subgraphs into chunk-parallel form: recognises quadratic sections (EinSum terminators, `DiagGather` initiators, banded masks, Softmax body chains) and rewrites each into a per-chunk section. Covered by ex01–ex10 synthetic harness cases.
+- **`DiagGather` op** (moved from `transformers` into `core`): causal skew-trick gather with ROI-driven narrowing and re-anchoring.
+- **`WindowOnAxis` op**: windowed gather over the streaming axis with configurable pad value.
+- **`AxisOp::Reshape` pulsifier**: auto-inserts alignment `Delay` on streaming-axis size change.
+- Stream-axis LCM merge + slope-based per-pulse sizing for `Range`.
+
+### Runtime / plan
+
+- Per-node shape resolve skipped once all symbols are bound.
+- `TDim::Sym` fast-path in shape resolve; lock-free `guess_scenario` on empty scope.
+- `PropagateRoi` iterates to fixed point and simplifies.
+
+### Scan
+
+- No longer auto-sets `external_state` on RNN / LSTM / GRU import.
+- `force_scan_external_state` ad-hoc transform added; `seq` symbol concretized before forcing.
+- Single-loop `Scan` not inlined unless state management is external.
+
+### CLI
+
+- `--set X=value` now accepts full TDim expressions (symbols, arithmetic), not just integer literals.
+
+### Documentation
+
+- `doc/pipeline.md`: Declutter vs Lowering breakdown; per-runtime variations; timing pitfalls.
+- `doc/symbolic-shapes.md`: TDim, Symbol, and how to bind them.
+- `doc/op.md`: working with a `Tensor`'s data.
+- `doc/cli-recipe.md`: `--audit-json`, `--save-outputs`, timing pitfalls, environment-variable table.
+- `README.md`: refreshed — current backends, modern examples table, Python bindings section, torch-to-nnef pointer.
+
 # 0.23.0-dev.5 - 2026-04-22
 
 ### API — breaking
diff --git a/Cargo.toml b/Cargo.toml
index 3402e1c4fe..81b9e48164 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,6 +35,7 @@ members = [
     "examples/keras-tract-tf2",
     "examples/nemo-parakeet-asr",
     "examples/nemo-nemotron-asr",
+    "examples/nemo-nemotron-streaming-asr",
     "examples/nnef-dump-mobilenet-v2",
     "examples/nnef-mobilenet-v2",
     "examples/onnx-mobilenet-v2",
@@ -44,6 +45,7 @@ members = [
     "examples/stable-diffusion",
     "examples/stable-diffusion-3",
     "examples/stable-diffusion-xl",
+    "examples/wasm-model-bench",
 
     "harness/core-proptest-pulse",
     "harness/nnef-inceptionv3",
@@ -55,7 +57,6 @@ members = [
     "test-rt/suite-unit",
     "test-rt/suite-onnx",
     "test-rt/test-f16",
-    "test-rt/test-blas",
     "test-rt/test-metal",
     "test-rt/test-cuda",
     "test-rt/test-unit-core",
@@ -109,7 +110,6 @@ default-members = [
     "test-rt/suite-unit",
     "test-rt/suite-onnx",
     "test-rt/test-f16",
-    "test-rt/test-blas",
     "test-rt/test-unit-core",
     "test-rt/test-onnx-core",
     "test-rt/test-nnef-cycle",
@@ -119,7 +119,6 @@ default-members = [
 rust-version = "1.91"
 
 [workspace.dependencies]
-accelerate-src = "0.3"
 anstyle = "1.0.2"
 anstyle-parse = "1.0.0"
 anstyle-query = "1.0.0"
@@ -182,7 +181,6 @@ num-complex = "0.4.0"
 num-integer = "0.1.44"
 num-traits = "0.2.14"
 num_cpus = "1"
-openblas-src = { version = "0.10", features = ["static"] }
 parking_lot = "0.12.3"
 pastey = "0.2"
 proptest = "1.0.0"
@@ -196,7 +194,7 @@ readings-probe = "0.1.8"
 regex = "1.5.4"
 ron = "0.12"
 reqwest = { version = "0.13", features = [ "blocking", "rustls-no-provider" ], default-features = false }
-rustfft = { version = "6.1", features = [ "neon" ] }
+rustfft = { version = "6.1", features = [ "neon", "wasm_simd" ] }
 rustls = { version = "0.23", default-features = false, features = [ "ring", "std", "tls12" ] }
 webpki-roots = "1"
 safetensors = "0.7"
@@ -205,19 +203,19 @@ serde = { version = "1.0.127", features = [ "derive" ] }
 serde_json = "1.0"
 simd-adler32 = { version = "0.3.7", features = ["std"] }
 smallvec = "1.6.1"
-string-interner = "0.19"
+string-interner = "0.20"
 tar = "0.4.37"
 tempfile = "3.8"
 tensorflow = "0.21.0"
 tflitec = { git = "https://github.com/kali/tflitec-rs.git", rev="9ceb838" }
 time = "0.3.23"
-tokenizers = "0.22"
+tokenizers = "0.23"
 unicode-normalization = "0.1.19"
 walkdir = "2.3.2"
 zerofrom = "0.1.5"
 tract-api = { version = "=0.23.0-dev.4", path = 'api' }
 tract-core = { version = "0.23.0-pre", path = 'core' }
-tract-cuda = { version = "0.23.0-pre", path = 'cuda' }
+tract-cuda = { version = "0.23.0-pre", path = 'cuda', default-features = false }
 tract-data = { version = "0.23.0-pre", path = 'data' }
 tract-extra = { version = "0.23.0-pre", path = 'extra' }
 tract-gpu = { version = "0.23.0-pre", path = 'gpu' }
diff --git a/README.md b/README.md
index a281af93e2..0755a39815 100644
--- a/README.md
+++ b/README.md
@@ -3,193 +3,176 @@
 ![Rust](https://img.shields.io/badge/rust-%23000000.svg?style=for-the-badge&logo=rust&logoColor=white)
 ![rustc >= 1.91.0](https://img.shields.io/badge/rustc-%3E%3D1.91.0-brightgreen)
 ![MIT/Apache 2](https://img.shields.io/crates/l/tract)
-[![Native Linux test status](https://github.com/snipsco/tract/workflows/Native%20Linux/badge.svg)](https://github.com/snipsco/tract/actions)
-[![Embedded targets status](https://github.com/snipsco/tract/workflows/Embedded%20targets/badge.svg)](https://github.com/snipsco/tract/actions)
+[![Native Linux test status](https://github.com/sonos/tract/workflows/Native%20Linux/badge.svg)](https://github.com/sonos/tract/actions)
+[![Embedded targets status](https://github.com/sonos/tract/workflows/Embedded%20targets/badge.svg)](https://github.com/sonos/tract/actions)
 [![Doc](https://docs.rs/tract-core/badge.svg)](https://docs.rs/tract-core)
-
 [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](https://pypi.org/project/tract/)
 
+Sonos' neural-network inference engine.
 
-Sonos' Neural Network inference engine.
-
-_This project used to be called tfdeploy, or Tensorflow-deploy-rust._
-
-## What ?
-
-`tract` is a Neural Network inference toolkit. It can read ONNX or NNEF, optimize them and run them.
-
-## Quick start, examples
-
-* [MobileNet v2 with ONNX](examples/onnx-mobilenet-v2)
-* [BERT example with ONNX](examples/pytorch-albert-v2)
-* [MobileNet v2 with TensorFlow](examples/tensorflow-mobilenet-v2)
-* [From Keras and TensorFlow 2 to tract](examples/keras-tract-tf2)
-* [ResNet with PyTorch](examples/pytorch-resnet)
-
-There is also [some technical documentation](doc/) and [blog](https://tech-blog.sonos.com/posts/optimising-a-neural-network-for-inference/) posts.
-
-## Tract in the landscape
-
-### ONNX
-
-As of today, `tract` passes successfully about 85% of ONNX backends
-tests. All "real life" integration tests in ONNX test suite are passing: 
-bvlc_alexnet, densenet121, inception_v1, inception_v2, resnet50, shufflenet,
-squeezenet, vgg19, zfnet512.
-
-Notable missing parts are operators dealing with Tensor Sequences and Optional Tensors : tract /really/ wants to flow Tensors and nothing else.
-This is structural. Changing it would be pretty difficult, and it's unclear whether it can be done without impairing performance or maintainability.
-We are not convinced these features have shown their interest in the wild yet, so we prefer to leave them aside.
-
-Other dark corners are specific operators like "Resize" which fit perfectly in the framework but need a complex internal logic that is far
-from our core business. In these cases, we are happy to accept contributions and to help. 
+tract loads ONNX and NNEF models, optimises them, and runs them anywhere —
+from embedded ARM CPUs to NVIDIA / Apple GPUs, in the browser via
+WebAssembly, or on a Linux / macOS / Windows workstation. It is used in
+production at Sonos for wake-word and streaming speech-recognition
+workloads, and also runs LLM, text-to-image, and classical CV models with
+a particular focus on the *translate-once / ship-tiny-runtime* story
+enabled by its NNEF-based intermediate format (tract-OPL).
 
-The following operators are implemented and tested.
+## Quick start
 
-Abs, Acos, Acosh, Add, And, ArgMax, ArgMin, ArrayFeatureExtractor, Asin, Asinh, Atan, Atanh, AveragePool, BatchNormalization, BitShift, BitwiseAnd, BitwiseNot, BitwiseOr, BitwiseXor, BlackmanWindow, Cast, CastLike, CategoryMapper, Ceil, Clip, Compress, Concat, Constant, ConstantLike, ConstantOfShape, Conv, ConvInteger, ConvTranspose, Cos, Cosh, CumSum, DFT, DepthToSpace, DequantizeLinear, Div, Dropout, DynamicQuantizeLinear, Einsum, Elu, Equal, Erf, Exp, Expand, EyeLike, Flatten, Floor, GRU, Gather, GatherElements, GatherND, Gemm, GlobalAveragePool, GlobalLpPool, GlobalMaxPool, Greater, GreaterOrEqual, HammingWindow, HannWindow, HardSigmoid, Hardmax, Identity, If, InstanceNormalization, IsInf, IsNaN, LRN, LSTM, LeakyRelu, Less, LessOrEqual, Log, LogSoftmax, MatMul, MatMulInteger, Max, MaxPool, Mean, MelWeightMatrix, Min, Mod, Mul, Multinomial, Neg, NonMaxSuppression, NonZero, Not, OneHot, Or, PRelu, Pad, ParametricSoftplus, Pow, QLinearConv, QLinearMatMul, QuantizeLinear, RNN, RandomNormal, RandomNormalLike, RandomUniform, RandomUniformLike, Range, Reciprocal, ReduceL1, ReduceL2, ReduceLogSum, ReduceLogSumExp, ReduceMax, ReduceMean, ReduceMin, ReduceProd, ReduceSum, ReduceSumSquare, Relu, Reshape, Resize, Round, Rsqrt, STFT, ScaledTanh, Scan, Scatter, ScatterElements, ScatterND, Selu, Shape, Shrink, Sigmoid, Sign, Sin, Sinh, Size, Slice, Softmax, Softplus, Softsign, SpaceToDepth, Split, Sqrt, Squeeze, Sub, Sum, Tan, Tanh, ThresholdedRelu, Tile, Transpose, TreeEnsembleClassifier, Unsqueeze, Where, Xor
+From [`examples/onnx-mobilenet-v2`](examples/onnx-mobilenet-v2):
 
-We test these operators against from ONNX 1.4.1 (operator set 9), up to ONNX 1.13.0 (operator set 18).
+```rust
+use tract::prelude::*;
+tract::impl_ndarray_interop!();
 
-We are using ONNX test suite, but it does not cover everything.
-We also deliberately ignore some tests, or restricting their scope depending on what we feel is realistic.
-Sometimes these decisions are just wrong, and sometimes they become wrong as time goes by and the fields moves in unexpected directions.
-So if you are puzzled by an ONNX model that does not work in tract, we are happy to take a look.
+let model = tract::onnx()?
+    .load("mobilenetv2-7.onnx")?
+    .into_model()?;
 
-### NNEF
+// prepare() optimises and compiles the model for the chosen runtime
+let runtime = tract::runtime_for_name("default")?;
+let runnable = runtime.prepare(model)?;
 
-Long story short, TensorFlow and ONNX formats are good for designing and
-training networks. They need to move fast to follow the research field, tend to
-integrate new features and operators greedily. They also exhibit a high level
-of expressivity to facilitate network design.
+let result = runnable.run([input.tract()?])?;
+```
 
-On the other hand, only a subset of operators and network features actually
-reach production, so systems running production network do not have to deal
-with so many operators. Furthermore, some information required for training can
-be stripped from the network before going to production for prediction.
+The [`tract`](https://crates.io/crates/tract) crate (`api/rs/src/lib.rs`) is the authoritative public API. The
+internal crates (`tract-core`, `tract-nnef`, `tract-onnx`, ...) are not
+stable surface and shouldn't be depended on directly.
 
-NNEF tries to bridge the gap between training frameworks and inference by
-proposing a format dedicated to production and prediction.
+For Python, see the [`tract`](https://pypi.org/project/tract/) package on PyPI.
 
-Tract supports NNEF:
+## Examples
 
-* tract_nnef can load and execute NNEF networks
-* tract supports most of the NNEF specification, the most notable exception
-    being the ROI operators
-* tract introduces tract-OPL, a series of NNEF extensions to support other
-    operators (or extend some operators semantics) in order to represent the
-    full range of tract-core neural network support: any network understood by
-    tract should be serializable to tract-OPL. This is a work in progress.
-* tract command line can translate networks from TensorFlow or ONNX to NNEF/OPL.
+[`examples/`](examples/) has runnable demos covering the workloads tract
+targets today:
 
-### tract-opl version compatibility
+| Example | What |
+|---|---|
+| [`onnx-mobilenet-v2`](examples/onnx-mobilenet-v2) | Minimal CV starter |
+| [`tflite-mobilenet-v3`](examples/tflite-mobilenet-v3) | TFLite import path |
+| [`causal_llm`](examples/causal_llm) | Transformer text generation |
+| [`nemo-parakeet-asr`](examples/nemo-parakeet-asr) / [`nemo-nemotron-streaming-asr`](examples/nemo-nemotron-streaming-asr) | Speech recognition, including streaming via pulsification |
+| [`stable-diffusion`](examples/stable-diffusion) / [`stable-diffusion-3`](examples/stable-diffusion-3) / [`stable-diffusion-xl`](examples/stable-diffusion-xl) | Text-to-image |
+| [`face_detection_yolov8onnx_example`](examples/face_detection_yolov8onnx_example) / [`face_similarity_arcface_onnx`](examples/face_similarity_arcface_onnx) | Modern object detection / face recognition |
+| [`wasm-model-bench`](examples/wasm-model-bench) | Running tract in the browser |
 
-A remainder: NNEF is not expressive enough to represent all ONNX. tract-OPL extends
-NNEF using proprietary to support what is missing. Notable extensions are pulse
-operators, recurring operators (as Scan) and symbolic extensions.
+## Resources
 
-There is no strict check in place here, so... implementation is not bullet proof.
-* NNEF part aims at being very stable. It is strongly constrained with compatibility
-with NNEF specification.
-* tract-opl is a bit more in flux. Nevertheless we try to maintain the following
-golden rule:
+Technical documentation lives under [`doc/`](doc/) (start at [`doc/intro.md`](doc/intro.md));
+the [`doc/cli-recipe.md`](doc/cli-recipe.md) page collects practical CLI recipes.
+The Sonos engineering [blog](https://tech-blog.sonos.com/posts/optimising-a-neural-network-for-inference/)
+has a long-form post on tract internals.
 
-     `models serialized with tract 0.x.y should work with tract 0.x.z where z >= y`
+## Python
 
-* in practice, breaking changes have been relatively rare so far. Most models are
-forward and retro compatible from when tract has acquired NNEF support.
+tract is also available as the [`tract`](https://pypi.org/project/tract/) package on PyPI,
+built on top of the same Rust core:
 
-Notable breakage occurred:
-* 0.16.3 (forward compatible) on Scan operator
-* 0.17.0 for binary decision tree classifier
+```sh
+pip install tract
+```
 
-Starting with `0.17.0`, a model property is injected in tract-opl files (`tract_nnef_ser_version`)
-to tag which version of tract generated the file. As most models will remain compatible,
-tract will not do any version check. It is up to the application developer to do so.
+The API mirrors the Rust pipeline: load a model, set input facts, optimise, then run.
+Documentation: [sonos.github.io/tract](https://sonos.github.io/tract). Source lives in [`api/py/`](api/py/).
 
-A softer version tag exists as `tract_nnef_format_version`. pre-0.17.0 version set it to
-`alpha1`, post-0.17.0 set it `beta1`. Don't put too much emphasis into the "alpha-ness" naming 
-of versions here.
+## Runtimes
 
-### Note: support for TensorFlow 1.x
+| Runtime | Name | Crate | Notes |
+|---|---|---|---|
+| CPU (x86, ARMv6/7/8, ARM SVE) | `"default"` | `tract-linalg` | Default. Hand-rolled SIMD micro-kernels. |
+| Apple Metal | `"metal"` | `tract-metal` | Apple GPUs. |
+| NVIDIA CUDA | `"cuda"` | `tract-cuda` | NVIDIA GPUs. |
+| WebAssembly | `"default"` | _via standard wasm32 targets_ | Browser / WASI deployment. |
 
-Even if `tract` is very far from supporting any arbitrary model, it can run
-Google Inception v3 and Snips wake word models. Missing operators are relatively 
-easy to add. The lack of easy to reuse test suite, and the wide diversity of 
-operators in Tensorflow make it difficult to target a full support.
+All runtimes share the `TypedModel` IR and the same loaders, so a model
+optimised on one platform can be moved to another.
 
-The following operators are implemented and tested:
+## Streaming and pulsification
 
-Abs, Add, AddN, AddV2, Assign, AvgPool, BatchToSpaceND, BiasAdd, BlockLSTM, Cast, Ceil, ConcatV2, Const, Conv2D, DepthwiseConv2dNative, Div, Enter, Equal, Exit, ExpandDims, FakeQuantWithMinMaxVars, Fill, FloorMod, FusedBatchNorm, GatherNd, GatherV2, Greater, GreaterEqual, Identity, Less, LessEqual, Log, LogicalAnd, LogicalOr, LoopCond, MatMul, Max, MaxPool, Maximum, Mean, Merge, Min, Minimum, Mul, Neg, NoOp, Pack, Pad, Placeholder, Pow, Prod, RandomUniform, RandomUniformInt, Range, RealDiv, Relu, Relu6, Reshape, Rsqrt, Shape, Sigmoid, Slice, Softmax, SpaceToBatchND, Squeeze, StridedSlice, Sub, Sum, Switch, Tanh, Tile, Transpose, VariableV2
+tract has first-class support for *pulsified* inference: a network that
+operates on full sequences during training is translated into one that
+processes a fixed-size pulse along its streaming axis at each step. This
+lets the same model serve both batch evaluation and low-latency real-time
+inference (wake-word, streaming ASR, ...).
 
-Additionally, the complexity of TensorFlow 2 make it very unlikely that a direct
-support will ever exist in tract. But many TensorFlow 2 models can be
-converted to ONNX and then loaded in tract.
+The translate-time logic lives in `tract-pulse`; runtime ships only the
+small `tract-pulse-opl` crate. See
+[`AGENTS.md` § Streaming and pulsification](AGENTS.md#streaming-and-pulsification)
+for the engineering view, and
+[`examples/nemo-nemotron-streaming-asr`](examples/nemo-nemotron-streaming-asr)
+for a working demo.
 
-## Example of supported networks
+## Formats and tract-OPL
 
-These models among others, are used to track tract performance evolution as
-part of the Continuous Integration jobs. See [.travis/README.md](readme) and 
-[.travis/bundle-entrypoint.sh](.travis/bundle-entrypoint.sh) for more
-information.
+| Format | Load | Save |
+|---|---|---|
+| ONNX | ✓ | — |
+| NNEF (+ tract-OPL extensions) | ✓ | ✓ |
+| TensorFlow Lite (legacy) | ✓ | ✓ |
+| TensorFlow 1 frozen graph (legacy) | ✓ | — |
 
-### Keyword spotting on Arm Cortex-M Microcontrollers
+PyTorch models can be exported directly to NNEF using
+[torch-to-nnef](https://sonos.github.io/torch-to-nnef/latest/)
+([source](https://github.com/sonos/torch-to-nnef)), an open-source
+PyTorch-to-NNEF converter maintained alongside tract — useful when you want
+to skip the detour through ONNX.
 
-https://github.com/ARM-software/ML-KWS-for-MCU
+tract-OPL is an NNEF-compatible intermediate representation that extends
+NNEF with the operators needed to express a full tract-core model. The
+recommended deployment workflow is:
 
-ARM demonstrated the capabilities of the Cortex-M family by providing
-tutorials and pre-trained models for keyword spotting. While the exercise
-is ultimately meant for micro-controllers, `tract` can run the intermediate
-TensorFlow models.
+1. **Once, at build time:** convert from ONNX / TF / TFLite to NNEF using
+   the `tract` CLI:
+   ```sh
+   tract model.onnx dump --nnef model.nnef.tgz
+   ```
+2. **At runtime:** ship only `tract-core` + `tract-nnef`, plus
+   `tract-onnx-opl` if the model uses ONNX-only operators, and
+   `tract-pulse-opl` if it is pulsified.
 
-For instance, on a Raspberry Pi Zero, the "CNN M" model runs in about 70
-micro-seconds, and 11 micro-seconds on a Raspberry Pi 3.
+This keeps the runtime footprint small (no protobuf, no training-framework
+loaders). See [`doc/intro.md`](doc/intro.md) for the full design rationale.
 
-### Snips wake word models
+### tract-OPL stability
 
-https://arxiv.org/abs/1811.07684
+NNEF parts are tied to the NNEF specification and very stable. tract-OPL
+extensions are a bit more in flux, but we observe the rule:
 
-Snips uses `tract` to run the wake word detectors. While earlier models were
-class-based and did not require any special treatment, `tract` pulsing
-capabilities made it possible to run WaveNet models efficiently enough for a
-Raspberry Pi Zero.
+> A model serialised with tract `0.x.y` should work with tract `0.x.z` where `z >= y`.
 
-### Inception v3
+Models embed a `tract_nnef_ser_version` property identifying the generating
+tract version; tract itself does not enforce a version check, so it is up
+to the application to do so if needed. See [`CHANGELOG.md`](CHANGELOG.md)
+for the running list of notable serialisation-format changes.
 
-|      Device         |      Family    |  TensorFlow-lite  |  tract  |
-|---------------------|----------------|-------------------|---------|
-|  Raspberry Pi Zero  |    Armv6 VFP   |        113s       |   39s   |
-|  Raspberry Pi 2     |    Armv7 NEON  |         25s       |    7s   |
-|  Raspberry Pi 3     |  aarch32 NEON  |          5s       |    5s   |
+## TensorFlow 1 (legacy)
 
-Notes:
+tract still loads TF1 frozen graphs and supports the operator set needed
+for the classical CV and wake-word models that originally drove its design
+(Inception v3, Snips wake words, ...). TensorFlow 2 is not directly
+supported — convert to ONNX first.
 
- * while the Raspberry Pi 3 is an Armv8 device, this bench is running
-     on Raspbian, an armv6 operating system, crippling the performance
-     of both benches
- * there exists other benches on the internet that show better
-     performance results for TensorFlow (not -Lite) on the Pi 3.
-     They use all four cores of the device. Both TensorFlow-Lite and tract
-     here have been made to run on a single-core.
+## License
 
-# License
+Files in `tensorflow/protos` are copied from the
+[TensorFlow](https://github.com/tensorflow/tensorflow) project and files in
+`onnx/protos` from the [ONNX](https://github.com/onnx/onnx) project; neither
+is covered by the licence statement below.
 
-Note: files in the `tensorflow/protos` directory are copied from the
-[TensorFlow](https://github.com/tensorflow/tensorflow) project and are not
-covered by the following licence statement.
+### Apache 2.0 / MIT
 
-Note: files in the `onnx/protos` directory are copied from the
-[ONNX](https://github.com/onnx/onnx) project and are not
-covered by the following license statement.
+All original work is licensed under either of
 
-## Apache 2.0/MIT
+* Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
 
-All original work licensed under either of
- * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
- * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
 at your option.
 
-## Contribution
+### Contribution
 
-Unless you explicitly state otherwise, any Contribution intentionally submitted
-for inclusion in the work by you, as defined in the Apache-2.0 license, shall
-be dual licensed as above, without any additional terms or conditions.
+Unless you explicitly state otherwise, any Contribution intentionally
+submitted for inclusion in the work by you, as defined in the Apache-2.0
+license, shall be dual licensed as above, without any additional terms or
+conditions.
diff --git a/api/ffi/src/lib.rs b/api/ffi/src/lib.rs
index 6774a608b9..ef56c44ed6 100644
--- a/api/ffi/src/lib.rs
+++ b/api/ffi/src/lib.rs
@@ -118,10 +118,10 @@ pub unsafe extern "C" fn tract_nnef_create(nnef: *mut *mut TractNnef) -> TRACT_R
 }
 
 #[unsafe(no_mangle)]
-pub unsafe extern "C" fn tract_nnef_enable_tract_core(nnef: *mut TractNnef) -> TRACT_RESULT {
+pub unsafe extern "C" fn tract_nnef_disable_tract_core(nnef: *mut TractNnef) -> TRACT_RESULT {
     wrap(|| unsafe {
         check_not_null!(nnef);
-        (*nnef).0.enable_tract_core()
+        (*nnef).0.disable_tract_core()
     })
 }
 
diff --git a/api/proxy/src/lib.rs b/api/proxy/src/lib.rs
index 874f2802c1..cf96954014 100644
--- a/api/proxy/src/lib.rs
+++ b/api/proxy/src/lib.rs
@@ -87,8 +87,8 @@ impl NnefInterface for Nnef {
         Ok(Model(model))
     }
 
-    fn enable_tract_core(&mut self) -> Result<()> {
-        check!(sys::tract_nnef_enable_tract_core(self.0))
+    fn disable_tract_core(&mut self) -> Result<()> {
+        check!(sys::tract_nnef_disable_tract_core(self.0))
     }
 
     fn enable_tract_extra(&mut self) -> Result<()> {
diff --git a/api/proxy/sys/tract.h b/api/proxy/sys/tract.h
index 9b04e61d1c..4f805f53e5 100644
--- a/api/proxy/sys/tract.h
+++ b/api/proxy/sys/tract.h
@@ -97,7 +97,7 @@ void tract_free_cstring(char *ptr);
  */
 enum TRACT_RESULT tract_nnef_create(struct TractNnef **nnef);
 
-enum TRACT_RESULT tract_nnef_enable_tract_core(struct TractNnef *nnef);
+enum TRACT_RESULT tract_nnef_disable_tract_core(struct TractNnef *nnef);
 
 enum TRACT_RESULT tract_nnef_enable_tract_extra(struct TractNnef *nnef);
 
diff --git a/api/py/pyproject.toml b/api/py/pyproject.toml
index 1f13673ef2..7043dd3362 100644
--- a/api/py/pyproject.toml
+++ b/api/py/pyproject.toml
@@ -2,7 +2,7 @@
 requires = [
     "setuptools >=80, <83",
     "setuptools_rust >=1.12, <1.13",
-    "wheel >=0.46.3, <0.47",
+    "wheel >=0.47.0, <0.48",
     "toml >=0.10, <0.11"
 ]
 
diff --git a/api/py/tests/mobilenet_onnx_test.py b/api/py/tests/mobilenet_onnx_test.py
index 059dc78f07..507c59af11 100644
--- a/api/py/tests/mobilenet_onnx_test.py
+++ b/api/py/tests/mobilenet_onnx_test.py
@@ -48,7 +48,7 @@ def test_state():
     assert numpy.argmax(confidences) == 652
 
 def test_nnef_register():
-    tract.nnef().with_tract_core().with_onnx().with_pulse().with_tract_extra()
+    tract.nnef().with_onnx().with_pulse().with_tract_extra()
 
 def test_nnef():
     model = (
@@ -133,7 +133,7 @@ def test_pulse():
     assert str(typed.output_fact(0)) == "5,1000,f32"
     properties = typed.property_keys()
     properties.sort()
-    assert properties == ["pulse.delay", "pulse.input_axes", "pulse.output_axes"]
+    assert properties == ["pulse.delay", "pulse.input_axes", "pulse.output_axes", "pulse.streaming_symbol"]
     assert typed.property("pulse.delay").to_numpy() == [0]
 
 def test_pulse_builder():
@@ -168,7 +168,7 @@ def test_runtime_properties():
     runnable = typed.into_runnable()
     properties = runnable.property_keys()
     properties.sort()
-    assert properties == ["pulse.delay", "pulse.input_axes", "pulse.output_axes"]
+    assert properties == ["pulse.delay", "pulse.input_axes", "pulse.output_axes", "pulse.streaming_symbol"]
     assert runnable.property("pulse.delay").to_numpy() == [0]
 
 def test_f32_to_f16():
@@ -222,7 +222,7 @@ def test_typed_model_to_nnef_and_back():
     typed = model.into_model()
     with tempfile.TemporaryDirectory() as tmpdirname:
         tmpdirname = Path(tmpdirname)
-        nnef = tract.nnef().with_tract_core()
+        nnef = tract.nnef()
 
         path = tmpdirname / "nnef-dir"
         nnef.write_model_to_dir(typed, path)
@@ -271,7 +271,7 @@ def test_profile():
     assert next(filter(lambda node: "cost" in node and "FMA(F32)" in node["cost"], profile["nodes"]), None) != None
 
 def test_transform_registry():
-    nnef = tract.nnef().with_tract_core()
+    nnef = tract.nnef()
     model = nnef.load("mobilenet_v2_1.0.onnx.nnef.tgz")
 
     #Convert model to half
@@ -284,7 +284,7 @@ def test_transform_registry():
     assert str(model.input_fact(0)) == "1,3,224,224,f32"
 
 def test_fact_and_dims():
-    nnef = tract.nnef().with_tract_core()
+    nnef = tract.nnef()
     model = nnef.load("mobilenet_v2_1.0.onnx.nnef.tgz")
     fact = model.parse_fact("B,S+P,64,f32")
     assert fact.datum_type() == tract.DatumType.F32
@@ -298,7 +298,7 @@ def test_fact_and_dims():
     assert int(fourteen) == 14
 
 def test_fact_and_dims_iterators():
-    nnef = tract.nnef().with_tract_core()
+    nnef = tract.nnef()
     model = nnef.load("mobilenet_v2_1.0.onnx.nnef.tgz")
     facts = model.input_facts()
     assert len(facts) == 1
@@ -310,7 +310,7 @@ def test_fact_and_dims_iterators():
     assert int(dims[3]) == 224
 
 def test_runtime_fact_iterator():
-    nnef = tract.nnef().with_tract_core()
+    nnef = tract.nnef()
     runnable = nnef.load("mobilenet_v2_1.0.onnx.nnef.tgz").into_runnable()
     inputs = runnable.input_facts();
     assert len(inputs) == 1
diff --git a/api/py/tract/nnef.py b/api/py/tract/nnef.py
index 1efff7e1a6..4faa7dc28d 100644
--- a/api/py/tract/nnef.py
+++ b/api/py/tract/nnef.py
@@ -47,12 +47,14 @@ def load(self, path: Union[str, Path]) -> Model:
         check(lib.tract_nnef_load(self.ptr, path, byref(model)))
         return Model(model)
 
-    def with_tract_core(self) -> "Nnef":
+    def without_tract_core(self) -> "Nnef":
         """
-        Enable tract-opl extensions to NNEF to covers tract-core operator set
+        Force the framework to emit strict NNEF instead of using the tract_core
+        extension. The tract_core extension is enabled by default; call this to
+        opt out.
         """
         self._valid()
-        check(lib.tract_nnef_enable_tract_core(self.ptr))
+        check(lib.tract_nnef_disable_tract_core(self.ptr))
         return self
 
     def with_tract_extra(self) -> "Nnef":
diff --git a/api/rs/Cargo.toml b/api/rs/Cargo.toml
index cc97e859aa..457a943486 100644
--- a/api/rs/Cargo.toml
+++ b/api/rs/Cargo.toml
@@ -34,7 +34,7 @@ serde_json.workspace = true
 tract-metal.workspace = true
 
 [target.'cfg(any(target_os = "linux", target_os = "windows"))'.dependencies]
-tract-cuda.workspace = true
+tract-cuda = { workspace = true, default-features = false }
 
 
 [dev-dependencies]
@@ -44,7 +44,23 @@ tempfile.workspace = true
 serde_json.workspace = true
 
 [features]
+default = ["cuda-13000"]
 complex = []
 # default = [ "dylib" ]
 # dylib = []
 # staticlib = []
+
+# CUDA driver-API selectors (linux/windows targets only). Pass-through to
+# tract-cuda. Exactly one must be active; the default pulls cuda-13000.
+cuda-12000 = ["tract-cuda/cuda-12000"]
+cuda-12010 = ["tract-cuda/cuda-12010"]
+cuda-12020 = ["tract-cuda/cuda-12020"]
+cuda-12030 = ["tract-cuda/cuda-12030"]
+cuda-12040 = ["tract-cuda/cuda-12040"]
+cuda-12050 = ["tract-cuda/cuda-12050"]
+cuda-12060 = ["tract-cuda/cuda-12060"]
+cuda-12080 = ["tract-cuda/cuda-12080"]
+cuda-12090 = ["tract-cuda/cuda-12090"]
+cuda-13000 = ["tract-cuda/cuda-13000"]
+cuda-13010 = ["tract-cuda/cuda-13010"]
+cuda-13020 = ["tract-cuda/cuda-13020"]
diff --git a/api/rs/src/lib.rs b/api/rs/src/lib.rs
index 241039b936..636829d586 100644
--- a/api/rs/src/lib.rs
+++ b/api/rs/src/lib.rs
@@ -92,8 +92,8 @@ impl NnefInterface for Nnef {
         Ok(Model(m))
     }
 
-    fn enable_tract_core(&mut self) -> Result<()> {
-        self.0.enable_tract_core();
+    fn disable_tract_core(&mut self) -> Result<()> {
+        self.0.disable_tract_core();
         Ok(())
     }
 
@@ -410,7 +410,7 @@ impl RunnableInterface for Runnable {
                 &self.0,
                 &BenchLimits::default(),
                 &mut annotations,
-                &RunTensors { sources: vec![inputs] },
+                &RunTensors { sources: vec![inputs], streaming_input_len: None },
                 None,
                 true,
             )?;
diff --git a/api/src/lib.rs b/api/src/lib.rs
index 0a87760cb6..df3a35d6fc 100644
--- a/api/src/lib.rs
+++ b/api/src/lib.rs
@@ -24,8 +24,9 @@ pub trait NnefInterface: Debug + Sized {
     /// data is the content of a NNEF model, as a `tar` file or a `tar.gz` file.
     fn load_buffer(&self, data: &[u8]) -> Result<Self::Model>;
 
-    /// Allow the framework to use tract_core extensions instead of a stricter NNEF definition.
-    fn enable_tract_core(&mut self) -> Result<()>;
+    /// Force the framework to emit strict NNEF instead of using the tract_core extension.
+    /// The tract_core extension is enabled by default; call this to opt out.
+    fn disable_tract_core(&mut self) -> Result<()>;
 
     /// Allow the framework to use tract_extra extensions.
     fn enable_tract_extra(&mut self) -> Result<()>;
@@ -46,13 +47,13 @@ pub trait NnefInterface: Debug + Sized {
     /// the node names in serialized form.
     fn enable_extended_identifier_syntax(&mut self) -> Result<()>;
 
-    /// Convenience function, similar with enable_tract_core but allowing method chaining.
-    fn with_tract_core(mut self) -> Result<Self> {
-        self.enable_tract_core()?;
+    /// Convenience function, similar to disable_tract_core but allowing method chaining.
+    fn without_tract_core(mut self) -> Result<Self> {
+        self.disable_tract_core()?;
         Ok(self)
     }
 
-    /// Convenience function, similar with enable_tract_core but allowing method chaining.
+    /// Convenience function, similar with enable_tract_extra but allowing method chaining.
     fn with_tract_extra(mut self) -> Result<Self> {
         self.enable_tract_extra()?;
         Ok(self)
diff --git a/api/tests/mobilenet/mod.rs b/api/tests/mobilenet/mod.rs
index ea4204f5fd..2487a2c999 100644
--- a/api/tests/mobilenet/mod.rs
+++ b/api/tests/mobilenet/mod.rs
@@ -156,7 +156,7 @@ fn test_pulse() -> anyhow::Result<()> {
     assert_eq!(typed.output_fact(0)?.to_string(), "5,1000,f32");
     let mut properties = typed.property_keys()?;
     properties.sort();
-    assert_eq!(&properties, &["pulse.delay", "pulse.input_axes", "pulse.output_axes"]);
+    assert_eq!(&properties, &["pulse.delay", "pulse.input_axes", "pulse.output_axes", "pulse.streaming_symbol"]);
     assert_eq!(typed.property("pulse.delay")?.as_slice::<i64>()?, &[0i64]);
     Ok(())
 }
@@ -194,7 +194,7 @@ fn test_runtime_properties() -> anyhow::Result<()> {
     let runnable = typed.into_runnable()?;
     let mut properties = runnable.property_keys()?;
     properties.sort();
-    assert_eq!(&properties, &["pulse.delay", "pulse.input_axes", "pulse.output_axes"]);
+    assert_eq!(&properties, &["pulse.delay", "pulse.input_axes", "pulse.output_axes", "pulse.streaming_symbol"]);
     assert_eq!(runnable.property("pulse.delay")?.as_slice::<i64>()?, &[0i64]);
     Ok(())
 }
@@ -276,7 +276,7 @@ fn test_typed_model_to_nnef_and_back() -> anyhow::Result<()> {
     model.analyse()?;
     let typed = model.into_model()?;
     let dir = tempfile::tempdir()?;
-    let nnef = nnef()?.with_tract_core()?;
+    let nnef = nnef()?;
 
     let path = dir.path().join("nnef-dir");
     nnef.write_model_to_dir(&path, &typed)?;
@@ -336,7 +336,7 @@ fn test_profile() -> anyhow::Result<()> {
 fn test_transform_registry() -> anyhow::Result<()> {
     ensure_models()?;
 
-    let nnef = nnef()?.with_tract_core()?;
+    let nnef = nnef()?;
     let mut model = nnef.load("mobilenet_v2_1.0.onnx.nnef.tgz")?;
 
     // Convert model to half
@@ -354,7 +354,7 @@ fn test_transform_registry() -> anyhow::Result<()> {
 #[test]
 fn test_fact_and_dims() -> anyhow::Result<()> {
     ensure_models()?;
-    let nnef = nnef()?.with_tract_core()?;
+    let nnef = nnef()?;
     let model = nnef.load("mobilenet_v2_1.0.onnx.nnef.tgz")?;
     let fact = model.parse_fact("B,S+P,64,f32")?;
     assert_eq!(fact.datum_type()?, f32::datum_type());
@@ -371,7 +371,7 @@ fn test_fact_and_dims() -> anyhow::Result<()> {
 #[test]
 fn test_fact_and_dims_iterators() -> anyhow::Result<()> {
     ensure_models()?;
-    let nnef = nnef()?.with_tract_core()?;
+    let nnef = nnef()?;
     let model = nnef.load("mobilenet_v2_1.0.onnx.nnef.tgz")?;
     let fact = model.input_facts()?.collect::<Vec<_>>();
     assert!(fact.len() == 1);
@@ -387,7 +387,7 @@ fn test_fact_and_dims_iterators() -> anyhow::Result<()> {
 #[test]
 fn test_runtime_fact_iterator() -> anyhow::Result<()> {
     ensure_models()?;
-    let nnef = nnef()?.with_tract_core()?;
+    let nnef = nnef()?;
     let runnable = nnef.load("mobilenet_v2_1.0.onnx.nnef.tgz")?.into_runnable()?;
     let inputs = runnable.input_facts()?.collect::<Vec<_>>();
     assert!(inputs.len() == 1);
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index 2f9661089d..e17edb2504 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -75,7 +75,7 @@ tract-metal.workspace = true
 
 [target.'cfg(any(target_os = "linux", target_os = "windows"))'.dependencies]
 cudarc.workspace = true
-tract-cuda.workspace = true
+tract-cuda = { workspace = true, default-features = false }
 
 [features]
 default = [
@@ -86,6 +86,7 @@ default = [
   "tflite",
   "transformers",
   "extra",
+  "cuda-13000",
 ]
 apple-amx-ios = ["tract-linalg/apple-amx-ios"]
 onnx = ["tract-onnx", "tract-libcli/hir", "tract-libcli/onnx"]
@@ -97,3 +98,21 @@ tflite = ["tract-tflite"]
 transformers = ["tract-transformers", "tract-libcli/transformers"]
 conform = ["tract-tensorflow/conform"]
 multithread-mm = ["tract-linalg/multithread-mm"]
+
+# CUDA driver-API selectors (linux/windows targets only). Picking one binds
+# cudarc against the matching CUDA enum/struct layout; the resulting binary
+# runs on any driver whose API is >= the chosen one. Exactly one must be
+# active. The default pulls cuda-13000 in; cross-compiles aimed at older
+# drivers (e.g. aarch64 with CUDA 12) override with `--no-default-features`.
+cuda-12000 = ["tract-cuda/cuda-12000"]
+cuda-12010 = ["tract-cuda/cuda-12010"]
+cuda-12020 = ["tract-cuda/cuda-12020"]
+cuda-12030 = ["tract-cuda/cuda-12030"]
+cuda-12040 = ["tract-cuda/cuda-12040"]
+cuda-12050 = ["tract-cuda/cuda-12050"]
+cuda-12060 = ["tract-cuda/cuda-12060"]
+cuda-12080 = ["tract-cuda/cuda-12080"]
+cuda-12090 = ["tract-cuda/cuda-12090"]
+cuda-13000 = ["tract-cuda/cuda-13000"]
+cuda-13010 = ["tract-cuda/cuda-13010"]
+cuda-13020 = ["tract-cuda/cuda-13020"]
diff --git a/cli/src/compare.rs b/cli/src/compare.rs
index 7b346e3bd3..b16650f24c 100644
--- a/cli/src/compare.rs
+++ b/cli/src/compare.rs
@@ -356,14 +356,57 @@ pub fn handle_stream(
     }
 
     let stream_dim = max_delay + 3 * input_pulse + input_pulse / 2;
-    let concrete_sym_values = SymbolValues::default().with(&stream_symbol, stream_dim as _);
+    let mut concrete_sym_values = SymbolValues::default().with(&stream_symbol, stream_dim as _);
+    // If Blockify rewrote the model (T → k·S), it stashes the new chunk
+    // symbol and chunk size in pulsed properties.  Bind the chunk symbol
+    // to `stream_dim / k` so per-fact `stream.dim.eval_to_i64` calls below
+    // resolve through it.  (Each `PulsedModel::new` mints a fresh chunk
+    // symbol, so we read from *this* pulsed instance.)
+    if let (Some(chunk_sym_t), Some(chunk_size_t)) = (
+        pulsed.properties.get(tract_pulse::blockify::BLOCKIFY_CHUNK_SYMBOL),
+        pulsed.properties.get(tract_pulse::blockify::BLOCKIFY_CHUNK_SIZE),
+    ) {
+        let chunk_sym_name = chunk_sym_t.to_plain_array_view::<String>()?[[0]].clone();
+        let chunk_sym = pulsed.symbols.sym(&chunk_sym_name);
+        let chunk_size = chunk_size_t.cast_to_scalar::<i64>()? as usize;
+        ensure!(
+            stream_dim % chunk_size == 0,
+            "stream_dim {stream_dim} not divisible by Blockify chunk size {chunk_size}"
+        );
+        concrete_sym_values = concrete_sym_values.with(&chunk_sym, (stream_dim / chunk_size) as _);
+    }
 
     // Second pass: build full metadata with fixed_output_len
     for pulsed_node in pulsed.nodes() {
         if let Ok(fact) = pulsed.outlet_fact(OutletId::new(pulsed_node.id, 0)) {
             if let Some(stream) = &fact.stream {
-                let output_pulse = fact.pulse().context("no pulse")?.to_usize()?;
-                let fixed_output_len = stream.dim.eval_to_i64(&concrete_sym_values)? as usize;
+                let output_pulse = fact
+                    .pulse()
+                    .with_context(|| {
+                        format!(
+                            "evaluating pulse on node #{} {} (slot 0)",
+                            pulsed_node.id, pulsed_node.name
+                        )
+                    })?
+                    .to_usize()
+                    .with_context(|| {
+                        format!(
+                            "evaluating pulse to usize on node #{} {} (slot 0), shape {:?}",
+                            pulsed_node.id, pulsed_node.name, fact.shape
+                        )
+                    })?;
+                let fixed_output_len = stream
+                    .dim
+                    .eval_to_i64(&concrete_sym_values)
+                    .with_context(|| {
+                        format!(
+                            "evaluating stream.dim {:?} on node #{} {} (slot 0); known symbols: {:?}",
+                            stream.dim,
+                            pulsed_node.id,
+                            pulsed_node.name,
+                            concrete_sym_values,
+                        )
+                    })? as usize;
                 pulse_meta.insert(
                     pulsed_node.name.clone(),
                     PulseInfo {
@@ -473,7 +516,8 @@ pub fn handle_stream(
     }
 
     // Concretize the reference model and delegate to compare()
-    let concrete_ref = Arc::new(reference.clone().concretize_dims(&concrete_sym_values)?);
+    let concrete_ref =
+        Arc::new(reference.clone().substitute_symbols(&concrete_sym_values.to_dim_map())?);
     compare(
         false,
         &concrete_ref,
@@ -531,10 +575,16 @@ where
                             .and_then(|r| r.as_ref().ok())
                             .cloned()
                     };
+                    // The GPU translator wraps nodes that absorb adjacent
+                    // axis ops with a `.fused_axis_op` suffix.  Strip it for
+                    // lookup so per-node bisection lines up GPU outputs with
+                    // the CPU reference.
+                    let stripped_name =
+                        node.name.strip_suffix(".fused_axis_op").unwrap_or(&node.name);
                     let reference: Option<TValue> = tract
                         .outlet_label((node.id, slot).into())
                         .and_then(get_value)
-                        .or_else(|| get_value(&node.name).filter(|_| slot == 0));
+                        .or_else(|| get_value(stripped_name).filter(|_| slot == 0));
 
                     let Some(reference) = reference else {
                         tags.style = Some(Yellow.into());
@@ -547,13 +597,34 @@ where
                         node.outputs[slot].fact.to_typed_fact().unwrap(),
                     );
                     let needed_type = clarified_fact.datum_type;
-                    let needed_shape =
-                        clarified_fact.shape.eval_to_usize(&session_state.resolved_symbols)?;
+                    let needed_shape = clarified_fact
+                        .shape
+                        .eval_to_usize(&session_state.resolved_symbols)
+                        .with_context(|| {
+                            format!(
+                                "evaluating shape {:?} on node #{} {} (slot {}); known symbols: {:?}",
+                                clarified_fact.shape,
+                                node.id,
+                                node.name,
+                                slot,
+                                session_state.resolved_symbols,
+                            )
+                        })?;
 
                     if **needed_shape != *reference.shape() {
                         let Ok(reshaped) = reference.clone().into_shape(&needed_shape) else {
-                            comparison_error = Some(format!("Incompatible shape on output {slot} reference is {reference:?}, model expects {:?}.", needed_shape));
-                            tags.style = Some(Red.into());
+                            // Pulsification can change intermediate-node shapes
+                            // (e.g. Blockify rewrites a `[T,T]` score into a
+                            // chunked `[S, k, k]`); the streamed value can't
+                            // be directly compared against the reference.
+                            // Mark unchecked rather than failing — the model
+                            // outputs are still validated end-to-end.
+                            tags.style = Some(Yellow.into());
+                            tags.labels.push(format!(
+                                "Skipped: incompatible shape on output {slot}, reference {:?}, model expects {:?}",
+                                reference.shape(), needed_shape,
+                            ));
+                            unchecked.insert(node.id);
                             continue;
                         };
                         reference = reshaped;
@@ -594,8 +665,19 @@ where
                         )
                     }
 
-                    if !cumulative && returning[slot].is_plain() {
-                        returning[slot] = reference.into_tvalue();
+                    if !cumulative {
+                        use tract_gpu::tensor::{DeviceTensorExt, IntoDevice};
+                        if returning[slot].is_plain() {
+                            returning[slot] = reference.into_tvalue();
+                        } else if returning[slot].as_device_tensor().is_some() {
+                            // Device-resident output: stage the CPU reference
+                            // back onto the device so downstream device ops can
+                            // consume it.  Without this, cumulative=off would
+                            // silently keep tract's (potentially-drifted) value
+                            // and per-node bisection becomes useless on GPU.
+                            returning[slot] =
+                                reference.into_device()?.into_tensor().into_tvalue();
+                        }
                     }
                 }
                 if let Some(e) = comparison_error {
diff --git a/cli/src/main.rs b/cli/src/main.rs
index 8f7aab7a42..bde9564ce1 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -89,7 +89,6 @@ pub const STAGES: &[&str] = &[
 fn main() -> TractResult<()> {
     use clap::*;
     let mut app = command!()
-        .allow_hyphen_values(true)
         .arg(arg!(--readings "Start readings instrumentation"))
         .arg(arg!(--"readings-heartbeat" [MS] "Heartbeat for readings background collector").default_value("5"))
         .arg(arg!(verbose: -v ... "Sets the level of verbosity.").action(clap::ArgAction::Count))
@@ -153,7 +152,8 @@ fn main() -> TractResult<()> {
         .arg(arg!(--"tflite-cycle" "Perform TFLITE dump and reload before optimizing"))
 
         .arg(arg!(--"no-nnef-tract-core" "Disable usage of tract-core extension in NNEF dump and load"))
-        .arg(arg!(--"nnef-tract-core" "Allow usage of tract-core extension in NNEF dump and load")).hide(true)
+        // deprecated: tract-core is now enabled by default
+        .arg(arg!(--"nnef-tract-core" "no-op, kept for backward compatibility").hide(true))
         .arg(arg!(--"nnef-tract-resource" "Allow usage of tract-resource extension in NNEF dump and load"))
         .arg(arg!(--"nnef-tract-onnx" "Allow usage of tract-onnx extension in NNEF dump and load"))
         .arg(arg!(--"nnef-tract-pulse" "Allow usage of tract-pulse extension in NNEF dump and load"))
@@ -979,8 +979,8 @@ fn nnef(matches: &clap::ArgMatches) -> tract_nnef::internal::Nnef {
             panic!("tract is build without tract-transformers support")
         }
     }
-    if !matches.get_flag("no-nnef-tract-core") {
-        fw = fw.with_tract_core();
+    if matches.get_flag("no-nnef-tract-core") {
+        fw = fw.without_tract_core();
     }
     if matches.get_flag("nnef-tract-resource") || matches.get_flag("opl") {
         use tract_nnef_resources::internal::JsonLoader;
diff --git a/cli/src/params.rs b/cli/src/params.rs
index 4f8bea628c..345287b33f 100644
--- a/cli/src/params.rs
+++ b/cli/src/params.rs
@@ -456,6 +456,29 @@ impl Parameters {
         Ok(values)
     }
 
+    /// Parse `--set X=value` into a `Symbol → TDim` substitution map.
+    /// `value` may be a plain integer or any TDim expression parseable
+    /// against the model's symbol scope (e.g. `2*S` to rebase the
+    /// streaming symbol onto a finer-grained chunk symbol before
+    /// pulsification).  Feeds straight into `model.substitute_symbols`.
+    pub fn parse_set_subs(
+        typed_model: &TypedModel,
+        set: impl Iterator<Item = impl AsRef<str>>,
+    ) -> TractResult<std::collections::HashMap<Symbol, TDim>> {
+        let mut subs = std::collections::HashMap::new();
+        for set in set {
+            let set = set.as_ref();
+            let (key, value) = set
+                .split_once('=')
+                .with_context(|| format!("--set must be in the X=value form, got {set}"))?;
+            let dim = tract_core::internal::parse_tdim(&typed_model.symbols, value)
+                .with_context(|| format!("--set: parsing TDim expression for {key}={value}"))?;
+            let sym = typed_model.get_or_intern_symbol(key);
+            subs.insert(sym, dim);
+        }
+        Ok(subs)
+    }
+
     pub fn parse_npz(
         input: &str,
         get_values: bool,
@@ -755,27 +778,15 @@ impl Parameters {
         }
 
         if let Some(set) = matches.get_many::<String>("set") {
-            let values = Self::parse_set_and_hint(typed_model.as_ref().unwrap(), set)?;
-            stage!("set", typed_model -> typed_model, |mut m: TypedModel| {
-                for node in m.eval_order()? {
-                    let node = m.node_mut(node);
-                    if let Some(op) = node.op_as_mut::<Const>() {
-                        if op.val().datum_type() == DatumType::TDim { {
-                            // get inner value to Arc<Tensor>
-                            let mut constant:Tensor = (**op.val()).clone();
-                            // Generally a shape or hyperparam
-                            constant
-                                .try_as_plain_mut()?
-                                .as_slice_mut::<TDim>()?
-                                .iter_mut()
-                                .for_each(|x| *x = x.eval(&values));
-
-                            *op = Const::new(constant.into_arc_tensor())?;
-                        }
-                        }
-                    }
-                }
-                m.concretize_dims(&values)
+            // --set delegates to model.substitute_symbols with a
+            // Symbol → TDim map (same path the `concretize_symbols`
+            // model transform takes).  Values may be plain integers or
+            // TDim expressions (e.g. `--set T=2*S` to rebase the
+            // streaming symbol).  Const TDim tensors are rewritten
+            // through Const's own substitute_symbols hook.
+            let subs = Self::parse_set_subs(typed_model.as_ref().unwrap(), set)?;
+            stage!("set", typed_model -> typed_model, move |m: TypedModel| {
+                m.substitute_symbols(&subs)
             });
             stage!("set-declutter", typed_model -> typed_model, |mut m| {
                 let mut dec = tract_core::optim::Optimizer::declutter();
diff --git a/cli/src/run.rs b/cli/src/run.rs
index 907820db83..20ae6ed145 100644
--- a/cli/src/run.rs
+++ b/cli/src/run.rs
@@ -182,9 +182,49 @@ where
         Vec::new()
     };
 
+    // Pre-compute the streaming-symbol binding so we can re-apply it every
+    // turn (run_plan_with_eval calls reset_turn at the end, wiping
+    // resolved_symbols).  Without this, PulsePad's `end_input.eval(...)`
+    // hits the `usize::MAX` fallback and the tail-pad fill silently never
+    // triggers — pulse output then disagrees with batch on the boundary.
+    let pulse_sym_binding: Option<(Symbol, i64)> =
+        if let (Some(sym_name), Some(stream_len), Some(first_input)) = (
+            state
+                .model()
+                .properties
+                .get("pulse.streaming_symbol")
+                .and_then(|t| t.to_plain_array_view::<String>().ok())
+                .and_then(|a| a.first().cloned()),
+            inputs.streaming_input_len,
+            inputs.sources.first().and_then(|t| t.first()),
+        ) {
+            let input_axes = state
+                .model()
+                .properties
+                .get("pulse.input_axes")
+                .context("Expect pulse.input_axes when pulse.streaming_symbol is set")?
+                .cast_to::<i64>()?;
+            let input_axis = input_axes.try_as_plain()?.as_slice::<i64>()?[0] as usize;
+            let pulse_value = first_input.shape()[input_axis];
+            // Linear case: stream.dim = pulse_value · symbol.  For non-linear
+            // dims (e.g. `4·s + 1`) this would be wrong, but blockified models
+            // arrive here with a chunk-aligned linear dim by construction.
+            if pulse_value > 0 && stream_len % pulse_value == 0 {
+                let sym = state.model().symbols.sym(&sym_name);
+                Some((sym, (stream_len / pulse_value) as i64))
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
     let mut sources = inputs.sources;
     for turn in 0..sources.len() {
         let inputs = std::mem::replace(&mut sources[turn], TVec::new());
+        if let Some((sym, val)) = &pulse_sym_binding {
+            state.turn_state.resolved_symbols.set(sym, *val);
+        }
         let turn_results =
             state.run_plan_with_eval(inputs, |session_state, state, node, input| {
                 if steps {
diff --git a/cli/src/utils.rs b/cli/src/utils.rs
index 27824fc669..08aab519c9 100644
--- a/cli/src/utils.rs
+++ b/cli/src/utils.rs
@@ -119,6 +119,13 @@ pub fn check_inferred(got: &[InferenceFact], expected: &[InferenceFact]) -> Trac
 }
 
 pub fn clarify_tvalue(t: &TValue) -> TractResult<TValue> {
+    use tract_gpu::tensor::DeviceTensorExt;
+    if let Some(device_tensor) = t.as_device_tensor() {
+        // Pull the device-resident tensor back to host so the comparison
+        // machinery (close_enough, cast_to_dt) can read it as plain storage.
+        let host = device_tensor.to_host()?;
+        return Ok(host.into_tensor().into_tvalue());
+    }
     Ok((*t).clone())
 }
 
diff --git a/core/Cargo.toml b/core/Cargo.toml
index a0372b5acf..f2a31c93b1 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -15,12 +15,9 @@ rust-version.workspace = true
 maintenance = { status = "actively-developed" }
 
 [dependencies]
-accelerate-src = { workspace = true, optional = true }
 anyhow.workspace = true
 anymap3.workspace = true
 bit-set.workspace = true
-blis-src = { version = "0.2", features = ["static", "pthreads"], optional = true }
-cblas = { version = "0.5", optional = true }
 derive-new.workspace = true
 downcast-rs.workspace = true
 erased-serde.workspace = true
@@ -33,7 +30,6 @@ ndarray.workspace = true
 num-integer.workspace = true
 num-traits.workspace = true
 num-complex.workspace = true
-openblas-src = { workspace=true, optional = true }
 pastey.workspace = true
 rustfft.workspace = true
 smallvec.workspace = true
@@ -45,10 +41,6 @@ inventory.workspace = true
 [features]
 default = [ ]
 complex = [ "tract-data/complex", "tract-linalg/complex" ]
-blas = [ "cblas" ]
-accelerate = [ "blas", "accelerate-src" ]
-blis = [ "blas", "blis-src" ]
-openblas = [ "blas", "openblas-src" ]
 paranoid_assertions = []
 
 [dev-dependencies]
@@ -65,3 +57,7 @@ proptest.workspace = true
 criterion = { version = "0.8", default-features = false, features = ["plotters", "cargo_bench_support"] }
 # Wasm doesn't support the `fork` feature of proptest.
 proptest = { version = "1.0.0", default-features = false, features = ["std", "bit-set"] }
+
+[[bench]]
+name = "plan_overhead"
+harness = false
diff --git a/core/benches/plan_overhead.rs b/core/benches/plan_overhead.rs
new file mode 100644
index 0000000000..7de71ecc85
--- /dev/null
+++ b/core/benches/plan_overhead.rs
@@ -0,0 +1,42 @@
+//! Plan-loop dispatch overhead micro-bench.
+//!
+//! Builds chains of N trivial ops (Add(unique-const)) and measures `plan.run()`
+//! wall-time. Per-op dispatch overhead is `ns_per_run / n_nodes` for large N
+//! (small N is dominated by run-entry cost).
+//!
+//! Use this to detect regressions in `do_exec_plan_with_eval`'s per-node loop.
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use tract_core::internal::*;
+use tract_core::ops::math::add;
+
+const VEC_LEN: usize = 8;
+
+fn build_chain(n: usize) -> Arc<TypedRunnableModel> {
+    let mut model = TypedModel::default();
+    let input_fact = f32::fact([VEC_LEN]);
+    let mut prev = model.add_source("input", input_fact).unwrap();
+    for i in 0..n {
+        let v = (i as f32 + 1.0) * 1e-6;
+        let c = model.add_const(format!("c{i}"), tensor1(&vec![v; VEC_LEN])).unwrap();
+        prev = model.wire_node(format!("a{i}"), add(), &[prev, c]).unwrap()[0];
+    }
+    model.select_output_outlets(&[prev]).unwrap();
+    model.into_optimized().unwrap().into_runnable().unwrap()
+}
+
+fn bench_chain(c: &mut Criterion) {
+    let mut group = c.benchmark_group("plan_overhead");
+    let input: TValue = tensor1(&vec![1.0f32; VEC_LEN]).into();
+
+    for &n in &[1, 10, 100, 1000] {
+        let plan = build_chain(n);
+        group.bench_function(format!("chain_n{n}"), |b| {
+            b.iter(|| plan.run(tvec![input.clone()]).unwrap())
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_chain);
+criterion_main!(benches);
diff --git a/core/src/lib.rs b/core/src/lib.rs
index 6df5d01eb1..5dce8100ab 100644
--- a/core/src/lib.rs
+++ b/core/src/lib.rs
@@ -45,15 +45,6 @@
 //! tract-tensorflow or tract-onnx crates.
 //!
 
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-#[cfg(feature = "blis")]
-extern crate blis_src;
-#[cfg(feature = "blas")]
-extern crate cblas;
-#[cfg(feature = "openblas")]
-extern crate openblas_src;
-
 extern crate bit_set;
 #[macro_use]
 extern crate derive_new;
diff --git a/core/src/model/fact.rs b/core/src/model/fact.rs
index 9091eb8b46..9b97cf2463 100644
--- a/core/src/model/fact.rs
+++ b/core/src/model/fact.rs
@@ -56,6 +56,22 @@ impl ShapeFact {
         }
     }
 
+    /// Substitute symbols by TDim expressions in every dim of the shape.
+    /// Concrete shapes pass through unchanged.
+    #[inline]
+    pub fn substitute(
+        &self,
+        subs: &std::collections::HashMap<Symbol, TDim>,
+    ) -> TractResult<Cow<'_, ShapeFact>> {
+        if self.is_concrete() {
+            Ok(Cow::Borrowed(self))
+        } else {
+            Ok(Cow::Owned(
+                self.iter().map(|d| d.substitute_all(subs)).collect::<TractResult<ShapeFact>>()?,
+            ))
+        }
+    }
+
     #[inline]
     pub fn eval_to_usize(&self, values: &SymbolValues) -> TractResult<Cow<'_, TVec<usize>>> {
         if let Some(c) = &self.concrete {
diff --git a/core/src/model/typed.rs b/core/src/model/typed.rs
index 5a78620f36..ea6b2c0fe1 100644
--- a/core/src/model/typed.rs
+++ b/core/src/model/typed.rs
@@ -256,8 +256,8 @@ impl TypedModel {
         Ok(())
     }
 
-    pub fn concretize_dims(&self, values: &SymbolValues) -> TractResult<TypedModel> {
-        values.translate_model(self)
+    pub fn substitute_symbols(&self, subs: &HashMap<Symbol, TDim>) -> TractResult<TypedModel> {
+        crate::model::translator::Translate::translate_model(subs, self)
     }
 
     pub fn prop_consts(&mut self) -> TractResult<()> {
@@ -311,7 +311,7 @@ impl TypedModel {
 }
 
 use crate::model::translator::Translate;
-impl Translate<TypedFact, Box<dyn TypedOp>, TypedFact, Box<dyn TypedOp>> for SymbolValues {
+impl Translate<TypedFact, Box<dyn TypedOp>, TypedFact, Box<dyn TypedOp>> for HashMap<Symbol, TDim> {
     fn translate_node(
         &self,
         source: &TypedModel,
@@ -320,7 +320,7 @@ impl Translate<TypedFact, Box<dyn TypedOp>, TypedFact, Box<dyn TypedOp>> for Sym
         mapping: &HashMap<OutletId, OutletId>,
     ) -> TractResult<TVec<OutletId>> {
         target.check_consistency()?;
-        let outlets = node.op.concretize_dims(source, node, target, mapping, self)?;
+        let outlets = node.op.substitute_symbols(source, node, target, mapping, self)?;
         for &outlet in &outlets {
             let fact = &mut target.nodes[outlet.node].outputs[outlet.slot].fact;
             if fact.shape.volume().is_zero()
diff --git a/core/src/ops/array/broadcast.rs b/core/src/ops/array/broadcast.rs
index 8e48529244..1b86834413 100644
--- a/core/src/ops/array/broadcast.rs
+++ b/core/src/ops/array/broadcast.rs
@@ -1,4 +1,8 @@
+use tract_data::itertools::izip;
+
+use crate::broadcast::multi_broadcast;
 use crate::internal::*;
+use crate::ops::binary::TypedBinOp;
 
 #[derive(Debug, Clone, new, Hash, PartialEq, Eq)]
 pub struct MultiBroadcastTo {
@@ -30,6 +34,80 @@ impl EvalOp for MultiBroadcastTo {
 }
 
 impl TypedOp for MultiBroadcastTo {
+    fn axes_mapping(
+        &self,
+        inputs: &[&TypedFact],
+        outputs: &[&TypedFact],
+    ) -> TractResult<AxesMapping> {
+        // ONNX-style broadcasting right-aligns input over output, so when
+        // output_rank > input_rank the leading output axes are pure
+        // broadcast axes with no input correspondence. natural_for_rank's
+        // square shape would skip them and trip the optimizer's axes-mapping
+        // check (caught under paranoid_assertions).
+        let in_rank = inputs[0].rank();
+        let out_rank = outputs[0].rank();
+        let leading = out_rank.saturating_sub(in_rank);
+        let mut axes = tvec!();
+        let mut alphabet = 'a'..;
+        for o in 0..leading {
+            axes.push(
+                Axis::new(alphabet.next().unwrap(), inputs.len(), outputs.len()).output(0, o),
+            );
+        }
+        for i in 0..in_rank.min(out_rank) {
+            axes.push(
+                Axis::new(alphabet.next().unwrap(), inputs.len(), outputs.len())
+                    .input(0, i)
+                    .output(0, leading + i),
+            );
+        }
+        AxesMapping::new(inputs.len(), outputs.len(), axes)
+    }
+
+    fn change_axes(
+        &self,
+        model: &TypedModel,
+        node: &TypedNode,
+        _io: InOut,
+        change: &AxisOp,
+    ) -> TractResult<Option<AxisChangeConsequence>> {
+        // Only propagate axis changes that touch passthrough axes — those
+        // where the input and output shapes agree. Touching a broadcast
+        // axis (input=1, output=N) would make the input and output rank
+        // diverge through the change and break the broadcast relationship,
+        // and propagating Rm of a non-trivial axis into a Source produces
+        // the "Removing non-trivial axis" hard error from change_shape.
+        let input_shape = &model.outlet_fact(node.inputs[0])?.shape;
+        let canonical = change.canonical();
+        let touched: TVec<usize> = match canonical.as_ref() {
+            AxisOp::Add(ix) | AxisOp::Rm(ix) => tvec![*ix],
+            AxisOp::Move(from, to) => {
+                rule_if!(input_shape.rank() == self.shape.rank());
+                tvec![*from, *to]
+            }
+            _ => return Ok(None),
+        };
+        for &ix in &touched {
+            if ix < self.shape.rank()
+                && ix < input_shape.rank()
+                && input_shape[ix] != self.shape[ix]
+            {
+                return Ok(None);
+            }
+        }
+
+        let mut shape = self.shape.clone();
+        if change.change_shape(&mut shape, false).is_ok() {
+            return Ok(Some(AxisChangeConsequence::new(
+                model,
+                node,
+                Some(Box::new(MultiBroadcastTo { shape })),
+                change,
+            )));
+        }
+        Ok(None)
+    }
+
     fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
         ensure!(inputs.len() == 1);
         let mut fact = inputs[0].datum_type.fact(self.shape.clone());
@@ -46,17 +124,18 @@ impl TypedOp for MultiBroadcastTo {
         crate::optim::propagate_roi::bubble_roi(model, node)
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
         let input = mapping[&node.inputs[0]];
-        let op =
-            Self { shape: self.shape.iter().map(|d| d.eval(values)).collect::<TVec<_>>().into() };
+        let shape: TVec<_> =
+            self.shape.iter().map(|d| d.substitute_all(subs)).collect::<TractResult<_>>()?;
+        let op = Self { shape: shape.into() };
         target.wire_node(&node.name, op, &[input])
     }
 
@@ -67,11 +146,141 @@ impl TypedOp for MultiBroadcastTo {
     ) -> TractResult<Option<TypedModelPatch>> {
         let input_fact = model.outlet_fact(node.inputs[0])?;
         if input_fact.shape == self.shape {
-            TypedModelPatch::shunt_one_op(model, node)
-        } else {
-            Ok(None)
+            return TypedModelPatch::shunt_one_op(model, node);
         }
+        // Swap with an AxisOp successor: `Broadcast(x, S) → AxisOp` becomes
+        // `AxisOp(x) → Broadcast(σ(S))` whenever the AxisOp transforms every
+        // axis the broadcast actually expanded.  Fires per-successor, so this
+        // works under fan-out (the original broadcast stays in place for
+        // siblings; only the matched AxisOp branch is rerouted).
+        for succ in &*node.outputs[0].successors {
+            let succ = model.node(succ.node);
+            let Some(op) = succ.op_as::<AxisOp>() else { continue };
+            let mut shape = self.shape.clone();
+            if izip!(0.., &*input_fact.shape, &*self.shape)
+                .filter(|(_, l, r)| l != r)
+                .all(|(axis, _, _)| op.transform_axis(axis).is_some())
+                && op.change_shape(&mut shape, false).is_ok()
+            {
+                let mut patch = TypedModelPatch::default();
+                let mut wire = patch.tap_model(model, node.inputs[0])?;
+                wire = patch.wire_node(&succ.name, op.clone(), &[wire])?[0];
+                wire = patch.wire_node(&node.name, MultiBroadcastTo { shape }, &[wire])?[0];
+                patch.shunt_outside(model, succ.id.into(), wire)?;
+                return Ok(Some(patch));
+            }
+        }
+        if let [succ] = &*node.outputs[0].successors {
+            let succ = model.node(succ.node);
+            if succ.op_is::<TypedBinOp>() {
+                let our_slot = node.outputs[0].successors[0].slot;
+                let other_slot = 1 - our_slot;
+                let other_operand = succ.inputs[other_slot];
+                let other_fact = model.outlet_fact(other_operand)?;
+                let output_fact = model.outlet_fact(succ.id.into())?;
+                if input_fact.rank() == other_fact.rank()
+                    && multi_broadcast(&[&input_fact.shape, &other_fact.shape])
+                        .is_ok_and(|s| &*s == &*output_fact.shape)
+                {
+                    let mut operands = tvec!(node.inputs[0], other_operand);
+                    if our_slot == 1 {
+                        operands.swap(0, 1);
+                    }
+                    return TypedModelPatch::rewire(
+                        &model,
+                        &operands,
+                        &[succ.id.into()],
+                        &|p, inputs| p.wire_node(&succ.name, succ.op.clone(), &inputs),
+                    )
+                    .map(Some);
+                }
+            }
+        }
+        Ok(None)
     }
 
     as_op!();
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ops::change_axes::AxisOp;
+    use crate::ops::logic::And;
+
+    /// `Broadcast → Move` with the broadcast feeding a SINGLE successor.
+    /// Pre-existing path: the swap rewrite kicks in.
+    #[test]
+    fn broadcast_move_single_successor_swaps() -> TractResult<()> {
+        let mut model = TypedModel::default();
+        let t = model.symbols.sym("T");
+        let pad = model.add_source("pad", bool::fact(&[t.to_dim()]))?;
+        let unsq = model.wire_node("unsq", AxisOp::Add(0), &[pad])?[0];
+        let bcast = model.wire_node(
+            "bcast",
+            MultiBroadcastTo { shape: ShapeFact::from_dims([t.to_dim(), t.to_dim()]) },
+            &[unsq],
+        )?[0];
+        let mv = model.wire_node("move", AxisOp::Move(0, 1), &[bcast])?[0];
+        model.select_output_outlets(&[mv])?;
+
+        let model = model.into_decluttered()?;
+
+        let move_count = model
+            .nodes()
+            .iter()
+            .filter(|n| matches!(n.op_as::<AxisOp>(), Some(AxisOp::Move(0, 1))))
+            .count();
+        assert_eq!(move_count, 0, "Move should have been pushed through Broadcast and absorbed");
+        Ok(())
+    }
+
+    /// `Broadcast → {Move, And-direct}` — the encoder-style pad-mask outer-AND
+    /// pattern.  Pre-fix: declutter bailed because broadcast had > 1 successor;
+    /// the Move stayed.  Post-fix: the Move-branch gets its own swapped
+    /// chain, the direct-AND branch still consumes the original broadcast.
+    #[test]
+    fn broadcast_move_fanout_pushes_through_one_branch() -> TractResult<()> {
+        let mut model = TypedModel::default();
+        let t = model.symbols.sym("T");
+        let pad = model.add_source("pad", bool::fact(&[t.to_dim()]))?;
+        let unsq = model.wire_node("unsq", AxisOp::Add(0), &[pad])?[0];
+        let bcast = model.wire_node(
+            "bcast",
+            MultiBroadcastTo { shape: ShapeFact::from_dims([t.to_dim(), t.to_dim()]) },
+            &[unsq],
+        )?[0];
+        let mv = model.wire_node("move", AxisOp::Move(0, 1), &[bcast])?[0];
+        let and = model.wire_node("and", TypedBinOp(Box::new(And), None), &[bcast, mv])?[0];
+        model.select_output_outlets(&[and])?;
+
+        let model = model.into_decluttered()?;
+
+        // Expected: fan-out swap-through fires on the Move branch, then the
+        // existing Broadcast→TypedBinOp rule fires on each (now single-
+        // successor) broadcast, eliminating both — the AND ends up
+        // broadcasting [1, T] and [T, 1] implicitly.
+        let bcast_count = model.nodes().iter().filter(|n| n.op_is::<MultiBroadcastTo>()).count();
+        assert_eq!(
+            bcast_count, 0,
+            "Both broadcasts should be subsumed into AND's implicit broadcasting"
+        );
+
+        let and_node =
+            model.nodes().iter().find(|n| n.op_is::<TypedBinOp>()).expect("AND should survive");
+        assert_eq!(and_node.inputs.len(), 2);
+        let and_input_shapes: Vec<_> = and_node
+            .inputs
+            .iter()
+            .map(|i| model.outlet_fact(*i).unwrap().shape.to_tvec())
+            .collect();
+        let expected_a = tvec![1.to_dim(), t.to_dim()];
+        let expected_b = tvec![t.to_dim(), 1.to_dim()];
+        let (a, b) = (&and_input_shapes[0], &and_input_shapes[1]);
+        assert!(
+            (a == &expected_a && b == &expected_b) || (a == &expected_b && b == &expected_a),
+            "AND should receive [1, T] and [T, 1]; got {a:?} and {b:?}"
+        );
+        Ok(())
+    }
+}
diff --git a/core/src/ops/array/gather.rs b/core/src/ops/array/gather.rs
index 962fd0d534..3457b537d4 100644
--- a/core/src/ops/array/gather.rs
+++ b/core/src/ops/array/gather.rs
@@ -208,6 +208,53 @@ impl TypedOp for Gather {
         }
     }
 
+    fn axes_mapping(
+        &self,
+        inputs: &[&TypedFact],
+        _outputs: &[&TypedFact],
+    ) -> TractResult<AxesMapping> {
+        // Output = data[..axis] ++ indices ++ data[axis+1..].  Track:
+        // - data axes [0..axis)        → output [0..axis)
+        // - data axis  self.axis       consumed (no output)
+        // - data axes (axis..data_rank)→ output [axis + indices_rank..)
+        // - indices axes [0..ir)       → output [axis..axis + ir)
+        // Fall back to disconnected for exotic data (block-quant): the
+        // storage rank can differ from the logical rank.
+        if !inputs[0].is_plain() {
+            return AxesMapping::disconnected(
+                inputs,
+                &[&inputs[0].datum_type.fact(&[0i64.to_dim()])],
+            );
+        }
+        let data_rank = inputs[0].rank();
+        let indices_rank = inputs[1].rank();
+        let mut axes: TVec<crate::axes::Axis> = tvec!();
+        let mut alphabet = 'a'..;
+        for k in 0..self.axis {
+            axes.push(
+                crate::axes::Axis::new(alphabet.next().unwrap(), 2, 1).input(0, k).output(0, k),
+            );
+        }
+        axes.push(crate::axes::Axis::new(alphabet.next().unwrap(), 2, 1).input(0, self.axis));
+        for k in self.axis + 1..data_rank {
+            let out_pos = k - 1 + indices_rank;
+            axes.push(
+                crate::axes::Axis::new(alphabet.next().unwrap(), 2, 1)
+                    .input(0, k)
+                    .output(0, out_pos),
+            );
+        }
+        for k in 0..indices_rank {
+            let out_pos = self.axis + k;
+            axes.push(
+                crate::axes::Axis::new(alphabet.next().unwrap(), 2, 1)
+                    .input(1, k)
+                    .output(0, out_pos),
+            );
+        }
+        AxesMapping::new(2, 1, axes)
+    }
+
     fn declutter(
         &self,
         model: &TypedModel,
diff --git a/core/src/ops/array/pad.rs b/core/src/ops/array/pad.rs
index 02e7938038..b31d436a56 100644
--- a/core/src/ops/array/pad.rs
+++ b/core/src/ops/array/pad.rs
@@ -127,7 +127,7 @@ impl TypedOp for Pad {
         node: &TypedNode,
     ) -> TractResult<Option<TVec<Option<TDim>>>> {
         let output_fact = model.outlet_fact(OutletId::new(node.id, 0))?;
-        let Some(roi) = &output_fact.region_of_interest else { return Ok(None) };
+        rule_if_some!(roi = &output_fact.region_of_interest);
         // For each padded axis, substitute 🎯axis → 🎯axis - before
         let mut input_roi = roi.clone();
         for (axis, &(before, _)) in self.pads.iter().enumerate() {
diff --git a/core/src/ops/array/slice.rs b/core/src/ops/array/slice.rs
index b4b095bde8..15a12fb328 100644
--- a/core/src/ops/array/slice.rs
+++ b/core/src/ops/array/slice.rs
@@ -105,7 +105,7 @@ impl TypedOp for Slice {
         node: &TypedNode,
     ) -> TractResult<Option<TVec<Option<TDim>>>> {
         let output_fact = model.outlet_fact(OutletId::new(node.id, 0))?;
-        let Some(roi) = &output_fact.region_of_interest else { return Ok(None) };
+        rule_if_some!(roi = &output_fact.region_of_interest);
         if self.start.is_zero() {
             return Ok(Some(tvec![Some(roi.clone())]));
         }
@@ -178,16 +178,19 @@ impl TypedOp for Slice {
         }
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
-        let op =
-            Slice { axis: self.axis, start: self.start.eval(values), end: self.end.eval(values) };
+        let op = Slice {
+            axis: self.axis,
+            start: self.start.substitute_all(subs)?,
+            end: self.end.substitute_all(subs)?,
+        };
         let inputs = node.inputs.iter().map(|i| mapping[i]).collect::<TVec<_>>();
         target.wire_node(&node.name, op, &inputs)
     }
diff --git a/core/src/ops/array/tile.rs b/core/src/ops/array/tile.rs
index 2d6842ecdc..00e7e10a33 100644
--- a/core/src/ops/array/tile.rs
+++ b/core/src/ops/array/tile.rs
@@ -45,15 +45,16 @@ impl EvalOp for Tile {
 impl TypedOp for Tile {
     as_op!();
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
-        let multipliers = self.multipliers.iter().map(|m| m.eval(values)).collect();
+        let multipliers =
+            self.multipliers.iter().map(|m| m.substitute_all(subs)).collect::<TractResult<_>>()?;
         target.wire_node(&node.name, Self { multipliers }, &[mapping[&node.inputs[0]]])
     }
 
diff --git a/core/src/ops/binary.rs b/core/src/ops/binary.rs
index d79362183a..c366026c3e 100644
--- a/core/src/ops/binary.rs
+++ b/core/src/ops/binary.rs
@@ -46,18 +46,26 @@ pub trait BinMiniOp:
 
     fn generic_eval(&self, a: TValue, b: TValue, c_dt: DatumType) -> TractResult<Tensor> {
         if let Some(tensor) = self.maybe_eval_qbinary_as_float_op(&a, &b, &c_dt)? {
-            Ok(tensor)
+            return Ok(tensor);
+        }
+        // Same-shape fast path: skip `multi_broadcast` allocation when shapes
+        // are already equal (very common: residuals, mask application, etc.).
+        // Correctness: equal shapes imply broadcast shape == a.shape() and the
+        // existing slow path would have taken this same branch.
+        if c_dt == a.datum_type() && a.shape() == b.shape() {
+            let mut a = a.into_tensor();
+            self.eval_in_a(&mut a, &b)?;
+            return Ok(a);
+        }
+        let c_shape = crate::broadcast::multi_broadcast(&[a.shape(), b.shape()])?;
+        if &*c_shape == a.shape() && c_dt == a.datum_type() {
+            let mut a = a.into_tensor();
+            self.eval_in_a(&mut a, &b)?;
+            Ok(a)
         } else {
-            let c_shape = crate::broadcast::multi_broadcast(&[a.shape(), b.shape()])?;
-            if &*c_shape == a.shape() && c_dt == a.datum_type() {
-                let mut a = a.into_tensor();
-                self.eval_in_a(&mut a, &b)?;
-                Ok(a)
-            } else {
-                let mut c = unsafe { Tensor::uninitialized_dt(c_dt, &c_shape)? };
-                self.eval_out_of_place(&mut c, &a, &b)?;
-                Ok(c)
-            }
+            let mut c = unsafe { Tensor::uninitialized_dt(c_dt, &c_shape)? };
+            self.eval_out_of_place(&mut c, &a, &b)?;
+            Ok(c)
         }
     }
     fn eval(&self, a: TValue, b: TValue, c_dt: DatumType) -> TractResult<Tensor> {
@@ -508,7 +516,14 @@ fn declutter_neutral(
 }
 
 /// When one input is the absorbing element (e.g. 0 for Mul, false for And),
-/// replace the entire op with the uniform (absorbing) input.
+/// replace the entire op with a uniform-value tensor of the output shape.
+///
+/// We can't shunt the uniform input directly: it may be lower-rank or have
+/// broadcast-from-1 dims that don't match the op's output shape (e.g.
+/// `Mul([4, 1], scalar-0)` outputs `[4, 1]`, not `[1]`).  Wire a
+/// `MultiBroadcastTo` from the uniform constant to the output shape;
+/// subsequent declutter folds it into a pure constant when the shape is
+/// fully concrete.
 fn declutter_absorbing(
     model: &TypedModel,
     node: &TypedNode,
@@ -520,13 +535,36 @@ fn declutter_absorbing(
             .map(|absorb| tensor0(absorb).close_enough(&uniform.uni, false).is_ok())
             .unwrap_or(false);
         if is_absorbing {
+            let output_fact = model.outlet_fact(node.id.into())?;
+            let output_dt = output_fact.datum_type;
+            let output_shape = output_fact.shape.clone();
             let uni_inlet = if uniform.left_is_uniform { 0 } else { 1 };
-            return Ok(Some(TypedModelPatch::rewire(
-                model,
-                &[node.inputs[uni_inlet]],
-                &[node.id.into()],
-                &|_, inputs| Ok(inputs.into()),
-            )?));
+            let uni_input_shape = &model.outlet_fact(node.inputs[uni_inlet])?.shape;
+            // Fast path: shapes and types match — shunt the absorbing input directly.
+            if uni_input_shape == &output_shape && uniform.uni.datum_type() == output_dt {
+                return Ok(Some(TypedModelPatch::rewire(
+                    model,
+                    &[node.inputs[uni_inlet]],
+                    &[node.id.into()],
+                    &|_, inputs| Ok(inputs.into()),
+                )?));
+            }
+            // General path: create a constant encoded in the output type.
+            // This handles both shape mismatches and quantization mismatches
+            // (e.g. absorbing input is QU8(Z:61 S:1) but output is QU8(Z:0 S:0.5)).
+            let absorb_val = mini_op.absorbing_element().unwrap();
+            let absorbing_const =
+                tensor0(absorb_val as f32).cast_to_dt(output_dt)?.into_owned().into_arc_tensor();
+            let mut patch = TypedModelPatch::default();
+            let uni_const =
+                patch.add_const(format!("{}.absorbing_const", node.name), absorbing_const)?;
+            let bcast = patch.wire_node(
+                format!("{}.absorbing_bcast", node.name),
+                crate::ops::array::MultiBroadcastTo { shape: output_shape },
+                &[uni_const],
+            )?[0];
+            patch.shunt_outside(model, node.id.into(), bcast)?;
+            return Ok(Some(patch));
         }
     }
     Ok(None)
@@ -627,6 +665,20 @@ impl EvalOp for OptBinByScalar {
 
     fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
         let (a, b) = args_2!(inputs);
+        // Same as OptBinUnicast: the fast path uses at_prefix + as_slice_mut
+        // and relies on natural C-order strides for the slice math. Fall back
+        // to the generic eval if either operand has non-natural strides or a
+        // storage size that doesn't match its declared shape (e.g. after
+        // Tensor::insert_axis which leaves non-natural strides behind).
+        let a_natural = a.len() == a.shape().iter().product::<usize>()
+            && a.strides() == &*Tensor::natural_strides(a.shape());
+        let b_natural = b.len() == b.shape().iter().product::<usize>()
+            && b.strides() == &*Tensor::natural_strides(b.shape());
+        if !a_natural || !b_natural {
+            let c_dt = self.binop.result_datum_type(a.datum_type(), b.datum_type())?;
+            return Ok(tvec!(self.binop.eval(a, b, c_dt)?.into_tvalue()));
+        }
+
         // Not a requirement as TensorView doesn't require a owned tensor but in reality
         // "a "should be mutable (it's omitted here as Rust compiler advise to remove it)
         let a = a.into_tensor();
@@ -757,6 +809,25 @@ impl EvalOp for OptBinUnicast {
 
     fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
         let (a, b) = args_2!(inputs);
+        // The unicast fast path indexes each input's storage via at_prefix +
+        // as_slice_mut, which uses `strides[i-1]` to size the resulting slice
+        // (data/src/tensor/view.rs:99). That formula only matches ∏(shape[i..])
+        // when the tensor has natural C-order strides. Producers like
+        // Tensor::insert_axis leave non-natural strides on a tensor (e.g.
+        // shape `[1, 1, 640]` with strides `[1, 1, 1]` after two insert_axis
+        // on a `[640]` tensor), which silently breaks the slice math. Fall
+        // back to the generic broadcasting eval when either operand is not in
+        // natural strides (or has a storage size that doesn't match the
+        // declared shape).
+        let a_natural = a.len() == a.shape().iter().product::<usize>()
+            && a.strides() == &*Tensor::natural_strides(a.shape());
+        let b_natural = b.len() == b.shape().iter().product::<usize>()
+            && b.strides() == &*Tensor::natural_strides(b.shape());
+        if !a_natural || !b_natural {
+            let c_dt = self.binop.result_datum_type(a.datum_type(), b.datum_type())?;
+            return Ok(tvec!(self.binop.eval(a, b, c_dt)?.into_tvalue()));
+        }
+
         // Not a requirement as TensorView doesn't require a owned tensor but in reality
         // "a "should be mutable (it's omitted here as Rust compiler advise to remove it)
         let a = a.into_tensor();
@@ -832,6 +903,46 @@ macro_rules! bin_to_super_type {
 
             fn eval_out_of_place(&self, c: &mut Tensor, a: &Tensor, b: &Tensor) -> TractResult<()> {
                 $(if $out_of_place(c, a, b)? { return Ok(()) } )?
+                    // Same-shape fast path: bypass ndarray Zip when c, a, b
+                    // share the same shape (and hence same len for plain
+                    // storage). Iterate over slices directly.
+                    if c.shape() == a.shape() && a.shape() == b.shape() {
+                        $(
+                            $(if c.datum_type() == $typ::datum_type() {
+                                let cab: fn(&mut $typ, &$typ, &$typ) -> () = $cab;
+                                let a_plain = a.try_as_plain()?;
+                                let a_slice = a_plain.as_slice::<$typ>()?;
+                                let b_plain = b.try_as_plain()?;
+                                let b_slice = b_plain.as_slice::<$typ>()?;
+                                let mut c_plain = c.try_as_plain_mut()?;
+                                let c_slice = c_plain.as_slice_mut::<$typ>()?;
+                                debug_assert_eq!(c_slice.len(), a_slice.len());
+                                debug_assert_eq!(c_slice.len(), b_slice.len());
+                                for ((cv, av), bv) in c_slice.iter_mut().zip(a_slice.iter()).zip(b_slice.iter()) {
+                                    cab(cv, av, bv);
+                                }
+                                return Ok(())
+                            })*
+                        )*
+                        $(
+                            $(
+                                $(if a.datum_type().unquantized() == <$typ_dt>::datum_type().unquantized() {
+                                    let cab: fn(&mut $typ_dt, &$typ_dt, &$typ_dt, i32, f32) -> () = $cab_dt;
+                                    let (zp, scale) = a.datum_type().qparams().map(|q| q.zp_scale()).unwrap_or((0, 1.));
+                                    let a_plain = a.try_as_plain()?;
+                                    let a_slice = a_plain.as_slice::<$typ_dt>()?;
+                                    let b_plain = b.try_as_plain()?;
+                                    let b_slice = b_plain.as_slice::<$typ_dt>()?;
+                                    let mut c_plain = c.try_as_plain_mut()?;
+                                    let c_slice = c_plain.as_slice_mut::<$typ_dt>()?;
+                                    for ((cv, av), bv) in c_slice.iter_mut().zip(a_slice.iter()).zip(b_slice.iter()) {
+                                        cab(cv, av, bv, zp, scale);
+                                    }
+                                    return Ok(())
+                                })*
+                            )*
+                        )?
+                    }
                     $(
                         $(if c.datum_type() == $typ::datum_type() {
                             let a = a.to_plain_array_view::<$typ>()?;
@@ -872,6 +983,40 @@ macro_rules! bin_to_super_type {
             fn eval_in_a(&self, a: &mut Tensor, b: &Tensor) -> TractResult<()> {
                 // c and a are same type
                 $(if $eval_in_a(a, b)? { return Ok(()) } )?
+                // Same-shape fast path: bypass ndarray Zip when a and b share
+                // the same shape (and hence same len for plain storage).
+                if a.shape() == b.shape() {
+                    $(
+                        $(if b.datum_type() == $typ::datum_type() {
+                            let cab: fn(&mut $typ, &$typ, &$typ) -> () = $cab;
+                            let b_plain = b.try_as_plain()?;
+                            let b_slice = b_plain.as_slice::<$typ>()?;
+                            let mut a_plain = a.try_as_plain_mut()?;
+                            let a_slice = a_plain.as_slice_mut::<$typ>()?;
+                            debug_assert_eq!(a_slice.len(), b_slice.len());
+                            for (av, bv) in a_slice.iter_mut().zip(b_slice.iter()) {
+                                cab(av, &av.clone(), bv);
+                            }
+                            return Ok(())
+                        })*
+                    )*
+                    $(
+                        $(
+                            $(if a.datum_type().unquantized() == <$typ_dt>::datum_type().unquantized() {
+                                let cab: fn(&mut $typ_dt, &$typ_dt, &$typ_dt, i32, f32) -> () = $cab_dt;
+                                let (zp, scale) = a.datum_type().qparams().map(|q| q.zp_scale()).unwrap_or((0, 1.));
+                                let b_plain = b.try_as_plain()?;
+                                let b_slice = b_plain.as_slice::<$typ_dt>()?;
+                                let mut a_plain = a.try_as_plain_mut()?;
+                                let a_slice = a_plain.as_slice_mut::<$typ_dt>()?;
+                                for (av, bv) in a_slice.iter_mut().zip(b_slice.iter()) {
+                                    cab(av, &(av.clone()), bv, zp, scale);
+                                }
+                                return Ok(())
+                            })*
+                        )*
+                    )?
+                }
                 $(
                     $(if b.datum_type() == $typ::datum_type() {
                         let cab: fn(&mut $typ, &$typ, &$typ) -> () = $cab;
@@ -1074,3 +1219,54 @@ pub(crate) fn one_input_is_uniform(
     }
     Ok(None)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Reproducer for the OptBinUnicast panic seen on Nemotron decoder CI
+    /// (cuda-lovelace + Darwin). A 1-D tensor that goes through `insert_axis`
+    /// twice ends up with declared shape `[1, 1, 640]` but strides `[1, 1, 1]`
+    /// instead of the natural `[640, 640, 1]`. TensorView::at_prefix then
+    /// returns a view whose `len()` reads `strides[1] = 1`, so the unicast
+    /// kernel sees `a.len = 1, b.len = 640` and OOBs into the tile buffer.
+    ///
+    /// Pre-fix this test panics inside `linalg/src/frame/unicast.rs` with
+    /// "range end index 640 out of range for slice of length …". With the
+    /// natural-strides guard in `OptBinUnicast::eval`, the call falls back to
+    /// `BinMiniOp::eval` and produces correct output.
+    #[test]
+    fn opt_bin_unicast_falls_back_on_non_natural_strides() {
+        // Construct `a` the way the LSTM bias path does: build a 640-element
+        // 1-D tensor, then insert two leading unit dims.
+        let a_data: Vec<f32> = (0..640).map(|i| i as f32).collect();
+        let mut a = tensor1(&a_data);
+        a.insert_axis(0).unwrap();
+        a.insert_axis(0).unwrap();
+        assert_eq!(a.shape(), &[1, 1, 640]);
+        assert_eq!(a.strides(), &[1, 1, 1]);
+        assert_ne!(a.strides(), &*Tensor::natural_strides(a.shape()));
+
+        // `b` is a normal contiguous tensor of the same declared shape.
+        let b_data: Vec<f32> = vec![1.0; 640];
+        let mut b = tensor1(&b_data);
+        b.insert_axis(0).unwrap();
+        b.insert_axis(0).unwrap();
+        // Reset b to natural strides so we exercise only the a-broken path
+        // and let the b-side go through cleanly.
+        b = b.into_shape(&[1, 1, 640]).unwrap();
+
+        let linalg_fn = tract_linalg::bin_unicast(f32::datum_type(), BinOp::Add)
+            .expect("f32 unicast Add kernel available");
+        let op = OptBinUnicast { binop: Box::new(Add), eval_fn: Arc::from(linalg_fn) };
+
+        let out = op.eval(tvec!(a.into_tvalue(), b.into_tvalue())).unwrap();
+        let out = &out[0];
+        assert_eq!(out.shape(), &[1, 1, 640]);
+        let plain = out.try_as_plain().unwrap();
+        let out_slice = plain.as_slice::<f32>().unwrap();
+        for (i, v) in out_slice.iter().enumerate() {
+            assert_eq!(*v, i as f32 + 1.0, "mismatch at {i}");
+        }
+    }
+}
diff --git a/core/src/ops/cast.rs b/core/src/ops/cast.rs
index f38eaf8f65..4f1b4d2906 100644
--- a/core/src/ops/cast.rs
+++ b/core/src/ops/cast.rs
@@ -1,4 +1,5 @@
 use crate::internal::*;
+use crate::ops::array::MultiBroadcastTo;
 
 pub fn cast(to: DatumType) -> Cast {
     Cast { to }
@@ -95,10 +96,28 @@ impl TypedOp for Cast {
         node: &TypedNode,
     ) -> TractResult<Option<TypedModelPatch>> {
         if model.outlet_fact(node.inputs[0])?.datum_type == self.to {
-            TypedModelPatch::shunt_one_op(model, node)
-        } else {
-            Ok(None)
+            return TypedModelPatch::shunt_one_op(model, node);
+        }
+        // linear_prec (fan-in=1, fan-out=1) rather than single_prec: swapping
+        // through a fan-out predecessor clones it, and the clone breaks
+        // downstream pattern detectors (e.g. Square+Reduce<Sum>+Mul fusion into
+        // Reduce<MeanOfSquares>, which then feeds RmsNorm detection).
+        //
+        // AxisOp is intentionally NOT in the predicate: pulling Cast above an
+        // AxisOp (Reshape/Move/Add/Rm) prevents the CUDA conversion from
+        // fusing the post-AxisOp Cast into the downstream GEMM-class kernel,
+        // leaving ~64 standalone CudaCast ops on OpenELM-270M (TG128 -4%).
+        if let Some(prec) = model.linear_prec(node.id)?
+            && (prec.op_is::<IntoShape>() || prec.op_is::<MultiBroadcastTo>())
+        {
+            let mut patch = TypedModelPatch::default();
+            let mut wire = tvec!(patch.tap_model(model, prec.inputs[0])?);
+            wire = patch.wire_node(&node.name, &node.op, &wire)?;
+            wire = patch.wire_node(&prec.name, &prec.op, &wire)?;
+            patch.shunt_outside(model, node.id.into(), wire[0])?;
+            return Ok(Some(patch));
         }
+        Ok(None)
     }
 
     fn axes_mapping(
diff --git a/core/src/ops/change_axes.rs b/core/src/ops/change_axes.rs
index 7fcd52e966..52fad5df56 100644
--- a/core/src/ops/change_axes.rs
+++ b/core/src/ops/change_axes.rs
@@ -5,7 +5,6 @@ use crate::internal::*;
 use crate::model::{TypedModel, TypedNode};
 use crate::ops::identity::Identity;
 use AxisOp::*;
-use num_traits::One;
 use tract_itertools::Itertools;
 use tract_linalg::block_quant::{BlockQuantFact, BlockQuantStorage};
 use tract_ndarray::{ArrayViewD, ArrayViewMutD};
@@ -326,7 +325,17 @@ impl AxisOp {
             Reshape(at, from, to) => {
                 let from_volume = from.iter().product::<TDim>();
                 let to_volume = to.iter().product::<TDim>();
-                ensure!(from_volume == to_volume, "{from_volume} should be equal to {to_volume}");
+                // Two algebraically equal volumes can land in different
+                // factored forms when the same dimension is built two ways
+                // (e.g. (B+2BY)·(1+Y) vs B·(1+Y)·(1+2Y) on Conformer-style
+                // streaming attention).  Compare polynomial expansions so
+                // structural mismatch on factor ordering doesn't fail the
+                // check.
+                ensure!(
+                    from_volume.clone().expand_polynomial()
+                        == to_volume.clone().expand_polynomial(),
+                    "{from_volume} should be equal to {to_volume}"
+                );
                 ensure!(*at + from.len() <= shape.len());
                 if shape.len() >= from.len() + *at
                     && tract_itertools::izip!(shape.iter().skip(*at), from)
@@ -657,7 +666,8 @@ impl EvalOp for AxisOp {
 }
 
 /// Remap coordinate symbols in a TDim expression according to an AxisOp.
-/// Returns None if the remapping cannot be determined (e.g. general reshape).
+/// Returns None if the remapping cannot be determined (e.g. general reshape
+/// with both ends > 1).
 fn remap_uniform_tdim(expr: &TDim, axis_op: &AxisOp) -> Option<TDim> {
     let syms = expr.symbols();
     let coord_syms: Vec<(usize, Symbol)> = syms
@@ -673,13 +683,76 @@ fn remap_uniform_tdim(expr: &TDim, axis_op: &AxisOp) -> Option<TDim> {
         return Some(expr.clone());
     }
 
-    // Reshape: only handle trivial all-ones case.
-    if let AxisOp::Reshape(_, from_dims, to_dims) = axis_op.canonical().as_ref() {
-        return if from_dims.iter().all(|d| d.is_one()) && to_dims.iter().all(|d| d.is_one()) {
-            Some(expr.clone())
-        } else {
-            None
-        };
+    if let AxisOp::Reshape(at, from_dims, to_dims) = axis_op.canonical().as_ref() {
+        // Trivial all-ones case: shape change is purely cosmetic, value is unaffected.
+        if from_dims.iter().all(|d| d.is_one()) && to_dims.iter().all(|d| d.is_one()) {
+            return Some(expr.clone());
+        }
+        // Pure split: from = [D], to = [d_0, …, d_{k-1}], Π = D.  The input
+        // axis-`at` position decomposes as
+        //     pos[at] = Σ_i pos[at+i]_new · stride_i
+        // with `stride_i = Π_{j>i} to_dims[j]` (last stride is 1).  Other
+        // input axes shift right by `k-1` (the net rank change).
+        if from_dims.len() == 1 {
+            let from_dim = from_dims[0].clone();
+            let to_product: TDim = to_dims.iter().fold(TDim::Val(1), |acc, d| acc * d.clone());
+            if to_product == from_dim {
+                let k_to = to_dims.len();
+                let mut map: HashMap<Symbol, TDim> = HashMap::default();
+                for (k, sym) in &coord_syms {
+                    let scope = sym.scope()?;
+                    let new_expr = if *k < *at {
+                        TDim::Sym(sym.clone())
+                    } else if *k == *at {
+                        let mut sum = TDim::Val(0);
+                        let mut stride = TDim::Val(1);
+                        for i in (0..k_to).rev() {
+                            let new_sym = scope.coord_sym(*at + i);
+                            sum = sum + TDim::Sym(new_sym) * stride.clone();
+                            stride = stride * to_dims[i].clone();
+                        }
+                        sum
+                    } else {
+                        TDim::Sym(scope.coord_sym(*k + k_to - 1))
+                    };
+                    map.insert(sym.clone(), new_expr);
+                }
+                return expr.substitute_all(&map).ok().map(|e| e.reduce());
+            }
+        }
+        // Pure merge: from = [d_0, …, d_{k-1}], to = [D].  We can express
+        // `pos[at+i]_old` from `pos[at]_new` only via integer division and
+        // modulo, which TDim doesn't carry.  Special-case the easy form
+        // where all but one of the merged dims is 1 — then the lone
+        // non-trivial sub-axis just maps to the new merged axis.
+        if to_dims.len() == 1 {
+            let to_dim = to_dims[0].clone();
+            let from_product: TDim = from_dims.iter().fold(TDim::Val(1), |acc, d| acc * d.clone());
+            if from_product == to_dim {
+                let k_from = from_dims.len();
+                let mut map: HashMap<Symbol, TDim> = HashMap::default();
+                for (k, sym) in &coord_syms {
+                    let scope = sym.scope()?;
+                    let new_expr = if *k < *at {
+                        TDim::Sym(sym.clone())
+                    } else if *k < *at + k_from {
+                        let i = *k - *at;
+                        let only_nontrivial =
+                            from_dims.iter().enumerate().all(|(j, d)| j == i || d.is_one());
+                        if only_nontrivial {
+                            TDim::Sym(scope.coord_sym(*at))
+                        } else {
+                            return None;
+                        }
+                    } else {
+                        TDim::Sym(scope.coord_sym(*k - (k_from - 1)))
+                    };
+                    map.insert(sym.clone(), new_expr);
+                }
+                return expr.substitute_all(&map).ok().map(|e| e.reduce());
+            }
+        }
+        return None;
     }
 
     // For Add/Rm/Move: use transform_axis and substitute all at once to avoid
@@ -833,19 +906,19 @@ impl TypedOp for AxisOp {
         Ok(Some(op))
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
         let op = if let AxisOp::Reshape(axis, from, to) = self {
             AxisOp::Reshape(
                 *axis,
-                from.iter().map(|d| d.eval(values)).collect(),
-                to.iter().map(|d| d.eval(values)).collect(),
+                from.iter().map(|d| d.substitute_all(subs)).collect::<TractResult<_>>()?,
+                to.iter().map(|d| d.substitute_all(subs)).collect::<TractResult<_>>()?,
             )
         } else {
             self.clone()
@@ -879,9 +952,7 @@ impl TypedOp for AxisOp {
         model: &TypedModel,
         node: &TypedNode,
     ) -> TractResult<Option<TypedModelPatch>> {
-        if node.outputs[0].fact.exotic_fact.is_some() {
-            return Ok(None);
-        }
+        rule_if!(node.outputs[0].fact.exotic_fact.is_none());
         if let Some(shape) = node.outputs[0].fact.shape.as_concrete()
             && !matches!(self, AxisOp::Move(_, _))
         {
@@ -964,47 +1035,32 @@ pub fn perm_to_ops(input: &[usize]) -> TVec<AxisOp> {
 
 pub fn compute_shape_with_tf_rules(input: &[TDim], shape_spec: &[TDim]) -> TractResult<TVec<TDim>> {
     let mut shape: TVec<TDim> = shape_spec.into();
-    fn deal_with_zero<'a>(
-        mut input_dims: std::iter::Peekable<impl Iterator<Item = &'a TDim>>,
-        shape: &mut [TDim],
-    ) -> TractResult<()> {
-        let mut remaining_dim_input = 1.to_dim();
-        for slot in shape.iter_mut() {
-            if *slot == (-1).into() {
-                break;
-            }
-            if *slot == 0.into() {
-                if remaining_dim_input != TDim::one() {
-                    bail!("Invalid remaining dim");
-                }
-                *slot = (*input_dims.peek().context("Invalid")?).clone();
-            }
-            loop {
-                let quotient = remaining_dim_input.maybe_div(slot);
-                if quotient.is_err() || quotient.as_ref().unwrap().1 != 1 {
-                    remaining_dim_input *= input_dims.next().context("Invalid")?;
-                } else {
-                    break;
-                }
-            }
-            remaining_dim_input = remaining_dim_input.maybe_div(slot)?.0;
+    // Replace 0s with corresponding input dims (positional, per ONNX/TF spec)
+    for (i, s) in shape.iter_mut().enumerate() {
+        if *s == 0.into() {
+            *s = input
+                .get(i)
+                .with_context(|| {
+                    format!("Reshape: 0 at position {i} but input only has {} dims", input.len())
+                })?
+                .clone();
         }
-        Ok(())
     }
-
-    deal_with_zero(input.iter().peekable(), &mut shape)?;
-    shape.reverse();
-    deal_with_zero(input.iter().rev().peekable(), &mut shape)?;
-    shape.reverse();
-
+    let input_vol: TDim = input.iter().product();
     if let Some(pos) = shape.iter().position(|d| *d == (-1).into()) {
-        let input_vol: TDim = input.iter().product();
         let shape_vol: TDim = shape.iter().filter(|d| **d != (-1).into()).product();
         let div = input_vol.maybe_div(&shape_vol)?;
         if div.1 != 1 {
             bail!("invalid")
         }
         shape[pos] = div.0;
+    } else {
+        let shape_vol: TDim = shape.iter().product();
+        if input_vol != shape_vol {
+            bail!(
+                "Reshape volume mismatch: input {input:?} (vol={input_vol}) vs shape {shape:?} (vol={shape_vol})"
+            );
+        }
     }
     Ok(shape)
 }
@@ -1046,7 +1102,9 @@ pub fn to_axis_ops_with_tf_rules(
                         }
                     }
                 }
-                todo!()
+                bail!(
+                    "Could not find matching reshape grouping: current_input={current_input:?} final_output={final_output:?} common={common}"
+                )
             }
         } else if final_output.len() > current_input.len() {
             stack.push(AxisOp::Add(current_input.len()));
@@ -1628,6 +1686,15 @@ mod proptests {
         )
     }
 
+    #[test]
+    fn compute_zero_with_rank_change() {
+        // Moonshine RoPE: input rank 4, output rank 5, two leading 0s
+        assert_eq!(
+            &*compute_shape_with_tf_rules(s![1, 52, 8, 32], s!(0, 0, 8, 16, 2)).unwrap(),
+            s![1, 52, 8, 16, 2]
+        )
+    }
+
     #[test]
     fn axis_op_rm_begin() {
         assert_eq!(&*to_axis_ops_with_tf_rules(s![1, 2, 3], s!(2, 3)).unwrap(), &[Rm(0)])
diff --git a/core/src/ops/cnn/conv/blocked.rs b/core/src/ops/cnn/conv/blocked.rs
new file mode 100644
index 0000000000..c2a03e6d64
--- /dev/null
+++ b/core/src/ops/cnn/conv/blocked.rs
@@ -0,0 +1,380 @@
+//! Direct, register-blocked convolution for the "channel-mixing temporal conv"
+//! shape class: NCHW, kernel width 1 (extent only on H), unit stride/dilation on
+//! the contiguous W axis, grouped, with a *small* number of output channels per
+//! group (`ocg`).
+//!
+//! For such convs the im2col + matmul lowering is inefficient: the per-group
+//! matmul is `M = ocg` (tiny, e.g. 5) × `K = icg·KH` × `N = H·W`, so the matmul
+//! kernel's m-tile is mostly wasted — exactly the same pathology as a low-M GEMV.
+//! ORT side-steps it with a direct conv.
+//!
+//! This op computes the conv directly: for each (group, output-row, block of the
+//! contiguous W axis) it holds `ocg` accumulators in registers and reduces over
+//! `(kh, icg)`, loading each input row ONCE and reusing it across all `ocg`
+//! outputs (the same input-reuse a GEMM gets). Measured on df_dec's `df_convp.1`
+//! (group=2, 64→10ch, kernel [5,1], 100×96): 0.77 ms native / 0.79 ms wasm vs
+//! 1.72 / 2.42 ms for tract's lazy im2col and 1.13 ms for ORT — a 2.2–3.1× win,
+//! bit-exact.
+//!
+//! Eligibility is checked in `Conv::codegen`; anything outside the supported
+//! shape class falls back to im2col.
+
+use crate::internal::*;
+
+/// Width of the inner SIMD-vectorised block over the contiguous W axis.
+const WB: usize = 16;
+
+/// Direct blocked conv. Inputs: X [N, C, H, W] (NCHW, f32), kernel
+/// [OC, ICG·KH] (group-major: row `oc` holds its group's `icg·KH` weights,
+/// i-major/h-minor), bias [OC]. Output [N, OC, H_out, W].
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub struct BlockedConv {
+    pub n: usize,
+    pub c_in: usize,
+    pub h_in: usize,
+    pub w: usize,
+    pub oc: usize,
+    pub group: usize,
+    pub kh: usize,
+    pub stride_h: usize,
+    pub dil_h: usize,
+    pub pad_before_h: usize,
+    pub h_out: usize,
+}
+
+impl BlockedConv {
+    #[inline]
+    fn icg(&self) -> usize {
+        self.c_in / self.group
+    }
+    #[inline]
+    fn ocg(&self) -> usize {
+        self.oc / self.group
+    }
+}
+
+impl Op for BlockedConv {
+    fn name(&self) -> StaticName {
+        "BlockedConv".into()
+    }
+
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!(
+            "N={} C={}->OC={} group={} kh={} (icg={} ocg={}) HxW={}x{} -> H_out={} pad_before={} stride_h={} dil_h={}",
+            self.n,
+            self.c_in,
+            self.oc,
+            self.group,
+            self.kh,
+            self.icg(),
+            self.ocg(),
+            self.h_in,
+            self.w,
+            self.h_out,
+            self.pad_before_h,
+            self.stride_h,
+            self.dil_h,
+        )])
+    }
+
+    op_as_typed_op!();
+}
+
+impl EvalOp for BlockedConv {
+    fn is_stateless(&self) -> bool {
+        true
+    }
+
+    fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
+        let x_t = inputs[0].cast_to::<f32>()?;
+        let k_t = inputs[1].cast_to::<f32>()?;
+        let b_t = inputs[2].cast_to::<f32>()?;
+        // SAFETY: just cast to f32; conv I/O tensors are standard (contiguous) layout.
+        let x = unsafe { x_t.as_slice_unchecked::<f32>() };
+        let kernel = unsafe { k_t.as_slice_unchecked::<f32>() };
+        let bias_raw = unsafe { b_t.as_slice_unchecked::<f32>() };
+        // Normalise bias to a per-output-channel vector (it may arrive as a
+        // scalar zero, empty, or already [oc]).
+        let bias_vec: Vec<f32> = match bias_raw.len() {
+            0 => vec![0.0; self.oc],
+            1 => vec![bias_raw[0]; self.oc],
+            _ => bias_raw.to_vec(),
+        };
+        let bias = bias_vec.as_slice();
+
+        let mut output =
+            unsafe { Tensor::uninitialized::<f32>(&[self.n, self.oc, self.h_out, self.w])? };
+        let out = unsafe { output.as_slice_mut_unchecked::<f32>() };
+
+        let ocg = self.ocg();
+        match ocg {
+            1 => self.run::<1>(x, kernel, bias, out),
+            2 => self.run::<2>(x, kernel, bias, out),
+            3 => self.run::<3>(x, kernel, bias, out),
+            4 => self.run::<4>(x, kernel, bias, out),
+            5 => self.run::<5>(x, kernel, bias, out),
+            6 => self.run::<6>(x, kernel, bias, out),
+            8 => self.run::<8>(x, kernel, bias, out),
+            _ => self.run_generic(x, kernel, bias, out),
+        }
+
+        Ok(tvec!(output.into_tvalue()))
+    }
+}
+
+impl BlockedConv {
+    /// Const-OCG fast path: `ocg` accumulators of WB lanes held in registers.
+    ///
+    /// The hot loop (full WB-wide blocks) touches `acc` ONLY at compile-time
+    /// constant offsets `[ocl][j]` (ocl<OCG, j<WB, both const) and stores it
+    /// whole — so LLVM's SROA promotes the OCG·WB accumulators to SSA registers
+    /// and keeps them resident across the runtime `(kh, icg)` reduction (this is
+    /// what makes it ~2.4× faster than a runtime-length-access variant, matching
+    /// the standalone microbench). `get_unchecked` keeps the runtime-derived
+    /// input/kernel/output indices bounds-check-free; all are provably in range
+    /// from the shape invariants. The `w % WB` remainder uses a scalar tail.
+    // Index loops are deliberate here: const offsets into `acc` are what let SROA
+    // keep the accumulators register-resident; iterator forms regressed codegen.
+    #[allow(clippy::needless_range_loop)]
+    fn run<const OCG: usize>(&self, x: &[f32], kernel: &[f32], bias: &[f32], out: &mut [f32]) {
+        let (icg, w, h_in, h_out, kh) = (self.icg(), self.w, self.h_in, self.h_out, self.kh);
+        let (sh, dh, pb) =
+            (self.stride_h as isize, self.dil_h as isize, self.pad_before_h as isize);
+        let kstride_oc = icg * kh; // weights row stride per output channel
+        let n_full = w / WB; // full WB-wide blocks; remainder handled after
+        for ni in 0..self.n {
+            let x_n = &x[ni * self.c_in * h_in * w..];
+            let out_n = &mut out[ni * self.oc * h_out * w..];
+            for g in 0..self.group {
+                let oc0 = g * OCG;
+                let ic0 = g * icg;
+                for oh in 0..h_out {
+                    // ---- full WB blocks: all-const acc access -> register-resident ----
+                    for blk in 0..n_full {
+                        let wb = blk * WB;
+                        let mut acc = [[0f32; WB]; OCG];
+                        for ocl in 0..OCG {
+                            let b = bias[oc0 + ocl];
+                            for j in 0..WB {
+                                acc[ocl][j] = b;
+                            }
+                        }
+                        for kh_i in 0..kh {
+                            let ih = oh as isize * sh + kh_i as isize * dh - pb;
+                            if ih < 0 || ih >= h_in as isize {
+                                continue;
+                            }
+                            let row0 = ((ic0 * h_in + ih as usize) * w + wb) as isize;
+                            for icl in 0..icg {
+                                let row_base = (row0 + (icl * h_in * w) as isize) as usize;
+                                for ocl in 0..OCG {
+                                    let wv = unsafe {
+                                        *kernel.get_unchecked(
+                                            (oc0 + ocl) * kstride_oc + icl * kh + kh_i,
+                                        )
+                                    };
+                                    let a = &mut acc[ocl];
+                                    for j in 0..WB {
+                                        a[j] += unsafe { *x_n.get_unchecked(row_base + j) } * wv;
+                                    }
+                                }
+                            }
+                        }
+                        for ocl in 0..OCG {
+                            let ob = ((oc0 + ocl) * h_out + oh) * w + wb;
+                            for j in 0..WB {
+                                unsafe { *out_n.get_unchecked_mut(ob + j) = acc[ocl][j] };
+                            }
+                        }
+                    }
+                    // ---- remainder (w % WB != 0): scalar tail accumulated in place ----
+                    let wb = n_full * WB;
+                    if wb < w {
+                        let rem = w - wb;
+                        for ocl in 0..OCG {
+                            let b = bias[oc0 + ocl];
+                            let ob = ((oc0 + ocl) * h_out + oh) * w + wb;
+                            for j in 0..rem {
+                                out_n[ob + j] = b;
+                            }
+                        }
+                        for kh_i in 0..kh {
+                            let ih = oh as isize * sh + kh_i as isize * dh - pb;
+                            if ih < 0 || ih >= h_in as isize {
+                                continue;
+                            }
+                            let ih = ih as usize;
+                            for icl in 0..icg {
+                                let row_base = ((ic0 + icl) * h_in + ih) * w + wb;
+                                for ocl in 0..OCG {
+                                    let wv = kernel[(oc0 + ocl) * kstride_oc + icl * kh + kh_i];
+                                    let ob = ((oc0 + ocl) * h_out + oh) * w + wb;
+                                    for j in 0..rem {
+                                        out_n[ob + j] += x_n[row_base + j] * wv;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// Generic fallback for `ocg` outside the const-dispatched set. Correct but
+    /// not register-blocked (heap accumulators). Rarely hit for the eligible class.
+    #[allow(clippy::needless_range_loop)]
+    fn run_generic(&self, x: &[f32], kernel: &[f32], bias: &[f32], out: &mut [f32]) {
+        let (icg, ocg, w, h_in, h_out, kh) =
+            (self.icg(), self.ocg(), self.w, self.h_in, self.h_out, self.kh);
+        let (sh, dh, pb) =
+            (self.stride_h as isize, self.dil_h as isize, self.pad_before_h as isize);
+        let kstride_oc = icg * kh;
+        let mut acc = vec![0f32; ocg * w];
+        for ni in 0..self.n {
+            let x_n = &x[ni * self.c_in * h_in * w..];
+            let out_n = &mut out[ni * self.oc * h_out * w..];
+            for g in 0..self.group {
+                let oc0 = g * ocg;
+                let ic0 = g * icg;
+                for oh in 0..h_out {
+                    for ocl in 0..ocg {
+                        let b = bias[oc0 + ocl];
+                        for j in 0..w {
+                            acc[ocl * w + j] = b;
+                        }
+                    }
+                    for kh_i in 0..kh {
+                        let ih = oh as isize * sh + kh_i as isize * dh - pb;
+                        if ih < 0 || ih >= h_in as isize {
+                            continue;
+                        }
+                        let ih = ih as usize;
+                        for icl in 0..icg {
+                            let ic = ic0 + icl;
+                            let row = &x_n[(ic * h_in + ih) * w..(ic * h_in + ih) * w + w];
+                            for ocl in 0..ocg {
+                                let wv = kernel[(oc0 + ocl) * kstride_oc + icl * kh + kh_i];
+                                let a = &mut acc[ocl * w..ocl * w + w];
+                                for j in 0..w {
+                                    a[j] += row[j] * wv;
+                                }
+                            }
+                        }
+                    }
+                    for ocl in 0..ocg {
+                        let ob = ((oc0 + ocl) * h_out + oh) * w;
+                        out_n[ob..ob + w].copy_from_slice(&acc[ocl * w..ocl * w + w]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl TypedOp for BlockedConv {
+    fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
+        ensure!(inputs.len() == 3, "BlockedConv expects 3 inputs (X, kernel, bias)");
+        Ok(tvec!(f32::datum_type().fact([self.n, self.oc, self.h_out, self.w])))
+    }
+
+    fn cost(&self, _inputs: &[&TypedFact]) -> TractResult<TVec<(Cost, TDim)>> {
+        let macs = self.n * self.oc * self.h_out * self.w * self.icg() * self.kh;
+        Ok(tvec!((Cost::FMA(f32::datum_type()), macs.to_dim())))
+    }
+
+    as_op!();
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Independent scalar reference for the eligible conv class (NCHW, kw=1,
+    /// unit stride/dilation on W). `kernel` is `[oc, icg*kh]` (group-major,
+    /// i-major/h-minor); input channel for output `oc` is `(oc/ocg)*icg + icl`.
+    #[allow(clippy::too_many_arguments)]
+    fn reference(op: &BlockedConv, x: &[f32], kernel: &[f32], bias: &[f32]) -> Vec<f32> {
+        let (icg, ocg) = (op.icg(), op.ocg());
+        let (h_in, w, kh) = (op.h_in, op.w, op.kh);
+        let (sh, dh, pb) = (op.stride_h as isize, op.dil_h as isize, op.pad_before_h as isize);
+        let mut out = vec![0f32; op.n * op.oc * op.h_out * w];
+        for ni in 0..op.n {
+            for oc in 0..op.oc {
+                let g = oc / ocg;
+                for oh in 0..op.h_out {
+                    for wi in 0..w {
+                        let mut acc = bias[oc];
+                        for kh_i in 0..kh {
+                            let ih = oh as isize * sh + kh_i as isize * dh - pb;
+                            if ih < 0 || ih >= h_in as isize {
+                                continue;
+                            }
+                            let ih = ih as usize;
+                            for icl in 0..icg {
+                                let ic = g * icg + icl;
+                                let xv = x[((ni * op.c_in + ic) * h_in + ih) * w + wi];
+                                acc += xv * kernel[oc * (icg * kh) + icl * kh + kh_i];
+                            }
+                        }
+                        out[((ni * op.oc + oc) * op.h_out + oh) * w + wi] = acc;
+                    }
+                }
+            }
+        }
+        out
+    }
+
+    fn run_case(c_in: usize, oc: usize, group: usize, kh: usize, h_in: usize, w: usize, pb: usize) {
+        let icg = c_in / group;
+        let h_out = h_in + pb - (kh - 1); // stride=dil=1, pad_after=0
+        let op = BlockedConv {
+            n: 1,
+            c_in,
+            h_in,
+            w,
+            oc,
+            group,
+            kh,
+            stride_h: 1,
+            dil_h: 1,
+            pad_before_h: pb,
+            h_out,
+        };
+        let x: Vec<f32> = (0..c_in * h_in * w).map(|i| ((i as f32 * 0.137).sin()) * 0.7).collect();
+        let kernel: Vec<f32> =
+            (0..oc * icg * kh).map(|i| ((i as f32 * 0.091).cos()) * 0.3).collect();
+        let bias: Vec<f32> = (0..oc).map(|i| (i as f32 * 0.05) - 0.1).collect();
+
+        let want = reference(&op, &x, &kernel, &bias);
+        let got = op
+            .eval(tvec![
+                Tensor::from_shape(&[1, c_in, h_in, w], &x).unwrap().into_tvalue(),
+                Tensor::from_shape(&[oc, icg * kh], &kernel).unwrap().into_tvalue(),
+                Tensor::from_shape(&[oc], &bias).unwrap().into_tvalue(),
+            ])
+            .unwrap();
+        let got_view = got[0].to_plain_array_view::<f32>().unwrap();
+        let got = got_view.as_slice().unwrap();
+        assert_eq!(got.len(), want.len());
+        let max_abs = got.iter().zip(&want).map(|(a, b)| (a - b).abs()).fold(0.0, f32::max);
+        assert!(
+            max_abs < 1e-5,
+            "BlockedConv mismatch (c_in={c_in} oc={oc} g={group} kh={kh} h={h_in} w={w} pb={pb}): max_abs={max_abs}"
+        );
+    }
+
+    #[test]
+    fn blocked_conv_matches_reference() {
+        // df_convp.1-like: group=2, ocg=5, kh=5, causal pad, w multiple of WB.
+        run_case(64, 10, 2, 5, 12, 96, 4);
+        // full block + remainder (w=20 = 16 + 4), ocg=2.
+        run_case(4, 4, 2, 3, 5, 20, 1);
+        // remainder-only (w=5 < WB), ocg=3.
+        run_case(8, 6, 2, 4, 7, 5, 2);
+        // group=1, ocg=3, no padding.
+        run_case(6, 3, 1, 3, 8, 33, 0);
+        // ocg=1 edge.
+        run_case(4, 2, 2, 2, 6, 17, 1);
+    }
+}
diff --git a/core/src/ops/cnn/conv/conv.rs b/core/src/ops/cnn/conv/conv.rs
index 379304b595..d593ae67b5 100644
--- a/core/src/ops/cnn/conv/conv.rs
+++ b/core/src/ops/cnn/conv/conv.rs
@@ -33,7 +33,7 @@ use crate::ops::matmul::optimized::{OptMatMul, ProtoFusedSpec};
 use crate::ops::nn::{BaseDataShape, DataFormat, DataShape};
 
 use tract_linalg::mmm::{MMMInputFormat, MatMatMul};
-use tract_linalg::pack::PackedFormat;
+use tract_linalg::pack::{PackedFormat, PackedI8K4};
 
 #[derive(Debug, Clone, new, Hash, PartialEq, Eq)]
 pub struct Conv {
@@ -123,13 +123,11 @@ impl Conv {
                 &[kernel],
             )?
         } else {
-            let format = format
-                .downcast_ref::<PackedFormat>()
-                .context("Expect regular packing for numeric weights")?;
+            // PackedFormat or a custom numeric packer (e.g. PackedI8K4).
             model.wire_node(
                 format!("{name}.prep_kernel.pack"),
                 OptMatMulPack {
-                    packers: vec![format.clone()],
+                    packers: vec![dyn_clone::clone_box(format)],
                     k_axis: 2,
                     mn_axis: 1,
                     mode_picker: ModePicker::Single,
@@ -240,7 +238,11 @@ impl Conv {
             &sum_ker_n_g_c,
         )?;
 
-        ensure!(mmm.packings()[packing].1.downcast_ref::<PackedFormat>().is_some());
+        ensure!(
+            mmm.packings()[packing].1.downcast_ref::<PackedFormat>().is_some()
+                || mmm.packings()[packing].1.downcast_ref::<PackedI8K4>().is_some(),
+            "Im2Col/QSumB support PackedFormat or PackedI8K4 activation packings"
+        );
         let mut sum_x = model.wire_node(
             format!("{name}.sum_x"),
             super::QSumB { dt: b_fact.datum_type, n, r: mmm.nr(), k },
@@ -465,7 +467,10 @@ impl Conv {
         let size_of_b = x_fact.datum_type.size_of() as isize;
         let n_byte_offsets: Vec<isize> =
             geo.patch.centers_offsets().into_iter().map(|x| x * size_of_b).collect();
-        let k_byte_offsets: Vec<isize> = (0..self.input_channels())
+        // For grouped convs, k offsets cover one group's input slice (ci_per_group channels);
+        // each group reads from a different base offset (group_stride_bytes apart).
+        let ci_per_group = self.input_channels() / self.group;
+        let k_byte_offsets: Vec<isize> = (0..ci_per_group)
             .flat_map(|ici| {
                 geo.patch
                     .standard_layout_data_field
@@ -473,6 +478,7 @@ impl Conv {
                     .map(move |x| (x + (ici * c_stride) as isize) * size_of_b)
             })
             .collect();
+        let group_stride_bytes = (ci_per_group * c_stride) as isize * size_of_b;
         let (mmm_output_shape, c_axis, h_axis) = self.mmm_output_shape(&geo.output_shape)?;
         let packer = mmm.packings()[packing]
             .1
@@ -487,7 +493,7 @@ impl Conv {
         let params = LazyIm2colParams { packer, n_byte_offsets, k_byte_offsets };
         let x = model.wire_node(
             format!("{name}.lazyIm2col"),
-            LazyIm2Col { params: Arc::new(params) },
+            LazyIm2Col { params: Arc::new(params), group: self.group, group_stride_bytes },
             &[x],
         )?[0];
 
@@ -662,6 +668,86 @@ impl Conv {
         Ok(model.wire_node(name, op, &[x, kernel[0], bias])?[0])
     }
 
+    /// Eligibility for the direct register-blocked conv (see `blocked.rs`):
+    /// f32 NCHW, kernel width 1 (extent on H only), unit stride/dilation on the
+    /// contiguous W axis, grouped with a *small* number of out-channels per group
+    /// (where the im2col matmul's M-tile would be mostly wasted). Concrete shape
+    /// required. Returns the fully-parameterised op, or None to fall back.
+    fn try_blocked_conv(&self, input_fact: &TypedFact) -> Option<super::BlockedConv> {
+        // The direct blocked conv beats im2col on wasm (no AMX; the gather +
+        // wasted-M-tile matmul is slow) but LOSES on native, where shape-aware
+        // AMX dispatch already handles the tiny-M matmul well. So: on by default
+        // on wasm, opt-in on native. Env overrides either way for A/B.
+        let enabled = if cfg!(target_family = "wasm") {
+            std::env::var("TRACT_DISABLE_BLOCKED_CONV").is_err()
+        } else {
+            std::env::var("TRACT_ENABLE_BLOCKED_CONV").is_ok()
+        };
+        if !enabled {
+            return None;
+        }
+        if self.q_params.is_some() {
+            return None;
+        }
+        if input_fact.datum_type != f32::datum_type() {
+            return None;
+        }
+        if self.pool_spec.data_format != crate::ops::nn::DataFormat::NCHW {
+            return None;
+        }
+        if self.pool_spec.rank() != 2 || self.pool_spec.kernel_shape[1] != 1 {
+            return None;
+        }
+        if self.pool_spec.stride(1) != 1 || self.pool_spec.dilation(1) != 1 {
+            return None;
+        }
+        let group = self.group;
+        let oc = self.output_channels();
+        let c_in = self.input_channels();
+        if group == 0 || !oc.is_multiple_of(group) || !c_in.is_multiple_of(group) {
+            return None;
+        }
+        let ocg = oc / group;
+        // Win condition: tiny per-group output count makes the im2col matmul's
+        // m-tile wasteful. Large ocg packs the tile fine — leave it to im2col.
+        if ocg == 0 || ocg > 8 {
+            return None;
+        }
+        let concrete = input_fact.shape.as_concrete()?;
+        let shape = self.pool_spec.data_format.shape(concrete).ok()?;
+        let h_axis = shape.h_axis();
+        let h_in = concrete[h_axis];
+        let w = concrete[h_axis + 1];
+        let pads = self.pool_spec.computed_padding(shape.hw_dims());
+        Some(super::BlockedConv {
+            n: *shape.n().unwrap_or(&1),
+            c_in,
+            h_in,
+            w,
+            oc,
+            group,
+            kh: self.pool_spec.kernel_shape[0],
+            stride_h: self.pool_spec.stride(0),
+            dil_h: self.pool_spec.dilation(0),
+            pad_before_h: pads[0].pad_before,
+            h_out: pads[0].convoluted,
+        })
+    }
+
+    fn wire_as_blocked_conv(
+        &self,
+        model: &mut TypedModel,
+        name: &str,
+        wire: &[OutletId],
+        op: super::BlockedConv,
+    ) -> TractResult<OutletId> {
+        let &[x, kernel, bias] = wire else { bail!("Wrong number of inputs") };
+        // Kernel → [group, ocg, icg·kh] (group-major, i-major/h-minor); its flat
+        // layout is exactly the [oc, icg·kh] the op indexes.
+        let g_o_ihw = self.wire_kernel_as_g_o_ihw(model, name, kernel)?;
+        Ok(model.wire_node(name, op, &[x, g_o_ihw[0], bias])?[0])
+    }
+
     fn declutter_stride_slice_to_downsample(
         &self,
         model: &TypedModel,
@@ -1092,9 +1178,7 @@ impl TypedOp for Conv {
         io: InOut,
         change: &AxisOp,
     ) -> TractResult<Option<AxisChangeConsequence>> {
-        if io == InOut::In(1) {
-            return Ok(None);
-        }
+        rule_if!(io != InOut::In(1));
         if io == InOut::In(2)
             && let &AxisOp::Rm(_) = change
         {
@@ -1118,9 +1202,7 @@ impl TypedOp for Conv {
                     ),
                 }));
             }
-            if change.transform_axis(n).map(|axis| axis > 0).unwrap_or(true) {
-                return Ok(None);
-            }
+            rule_if!(change.transform_axis(n).map(|axis| axis == 0).unwrap_or(false));
         }
         // format swap: chw <-> hwc
         let (new_format, axis_move) = match self.pool_spec.data_format {
@@ -1145,9 +1227,7 @@ impl TypedOp for Conv {
             }));
         }
         // geo axis manips
-        if model.node_input_facts(node.id)?[1].is_exotic() {
-            return Ok(None);
-        }
+        rule_if!(!model.node_input_facts(node.id)?[1].is_exotic());
         use AxisOp::*;
         let h_axis = shape.h_axis();
         let hw_axes = shape.hw_axes();
@@ -1197,16 +1277,21 @@ impl TypedOp for Conv {
                 patch.shunt_outside(model, node.id.into(), wire[0])?;
                 patch.obliterate(node.id)?;
                 Ok(Some(patch))
+            } else if let Some(op) = self.try_blocked_conv(input_fact) {
+                // Direct register-blocked conv for the small-ocg NCHW kw=1 class;
+                // beats lazy im2col by avoiding the gather + wasted M-tile matmul.
+                let mut patch = TypedModelPatch::new("blocked-conv");
+                let inputs = patch.taps(model, &node.inputs)?;
+                let wire = self
+                    .wire_as_blocked_conv(&mut patch, &node.name, &inputs, op)
+                    .context("wire_as_blocked_conv")?;
+                patch.shunt_outside(model, OutletId::new(node.id, 0), wire)?;
+                patch.obliterate(node.id)?;
+                Ok(Some(patch))
             } else if input_fact
                 .shape
                 .as_concrete()
-                .map(|s| {
-                    should_use_lazy(
-                        &self.pool_spec.data_format.shape(s.into()).unwrap(),
-                        &self.pool_spec,
-                        self.group,
-                    )
-                })
+                .map(|s| should_use_lazy(&self.pool_spec, self.group, s, input_fact.datum_type))
                 .unwrap_or(false)
             {
                 let mut patch = TypedModelPatch::new("lazy-im2col");
@@ -1246,10 +1331,93 @@ impl TypedOp for Conv {
     as_op!();
 }
 
-fn should_use_lazy(input_shape: &DataShape, pool_spec: &PoolSpec, group: usize) -> bool {
-    input_shape.n().unwrap_or(&1) == &1
-        && group == 1
-        && pool_spec.kernel_shape.iter().product::<usize>() > 5
+/// Default minimum kernel volume for picking LazyIm2col over eager Im2col.
+///
+/// LazyIm2col has per-output-position gather indirection overhead; eager Im2col has
+/// materialisation overhead (one big alloc + strided memcpy). For tiny kernels the
+/// indirection wins; for bigger kernels the materialisation cost dominates. This default
+/// is conservative — empirically lazy already wins for kernel volumes ≥ 4 on Apple AMX
+/// (and likely lower on memory-constrained targets like embedded ARM). Override via
+/// `TRACT_LAZY_IM2COL_MIN_KERNEL` env var to experiment with lower thresholds.
+const DEFAULT_LAZY_IM2COL_MIN_KERNEL: usize = 6;
+
+fn lazy_im2col_min_kernel() -> usize {
+    use std::sync::OnceLock;
+    static V: OnceLock<usize> = OnceLock::new();
+    *V.get_or_init(|| {
+        std::env::var("TRACT_LAZY_IM2COL_MIN_KERNEL")
+            .ok()
+            .and_then(|s| s.parse::<usize>().ok())
+            .unwrap_or(DEFAULT_LAZY_IM2COL_MIN_KERNEL)
+    })
+}
+
+/// Default eager-Im2col scratch-size ceiling, in bytes, above which LazyIm2col is
+/// preferred regardless of kernel volume.
+///
+/// Eager Im2col materialises a `[k, n]` packed scratch of `k·n·sizeof` bytes — it is
+/// allocated, written, then read back by the matmul. While that scratch is small it
+/// stays hot in cache and the round-trip is cheap, so the kernel-volume rule above
+/// governs. Once it is large, the materialisation becomes a pure memory-bandwidth tax
+/// (write + read of multiple MB every inference) that outweighs LazyIm2col's per-panel
+/// gather indirection — so prefer lazy. The kernel-volume rule alone misses this case:
+/// a *small* kernel over a *large* output (big `n`) still materialises multiple MB.
+///
+/// The crossover is target-dependent. On WASM the materialisation tax bites harder
+/// (no hardware-prefetch help, bounds-checked stores), so lazy wins from ~1 MiB of
+/// scratch upward. On native CPUs the caches and prefetchers absorb a few MB, so the
+/// crossover sits higher (~4 MiB, measured on Apple Silicon). Hence the per-family
+/// defaults below. Override on either target via `TRACT_LAZY_IM2COL_MAX_EAGER_BYTES`;
+/// this value is the key knob for the canary-model regression gate.
+#[cfg(target_family = "wasm")]
+const DEFAULT_LAZY_IM2COL_MAX_EAGER_BYTES: usize = 1024 * 1024;
+#[cfg(not(target_family = "wasm"))]
+const DEFAULT_LAZY_IM2COL_MAX_EAGER_BYTES: usize = 4 * 1024 * 1024;
+
+fn lazy_im2col_max_eager_bytes() -> usize {
+    use std::sync::OnceLock;
+    static V: OnceLock<usize> = OnceLock::new();
+    *V.get_or_init(|| {
+        std::env::var("TRACT_LAZY_IM2COL_MAX_EAGER_BYTES")
+            .ok()
+            .and_then(|s| s.parse::<usize>().ok())
+            .unwrap_or(DEFAULT_LAZY_IM2COL_MAX_EAGER_BYTES)
+    })
+}
+
+fn should_use_lazy(
+    pool_spec: &PoolSpec,
+    group: usize,
+    input_shape: &[usize],
+    dt: DatumType,
+) -> bool {
+    // Depthwise convs (group == in_channels == out_channels) have a specialised
+    // `DepthWise` op downstream that's much faster than the generic im2col + matmul
+    // path on every backend we measured (Apple AMX, x64, aarch64). Don't intercept
+    // them here — let the dispatch in `conv.rs` reach `wire_as_depth_wise`.
+    let is_depthwise =
+        group > 1 && group == pool_spec.input_channels && group == pool_spec.output_channels;
+    if is_depthwise {
+        return false;
+    }
+    let Ok(output_shape) = pool_spec.output_shape(input_shape) else { return false };
+    // LazyIm2col's offset tables are built for a single batch.
+    if output_shape.n().unwrap_or(&1) != &1 {
+        return false;
+    }
+    let kernel_volume = pool_spec.kernel_shape.iter().product::<usize>();
+    // Primary rule: kernel volume. LazyIm2col's per-output-position gather indirection
+    // is cheap relative to materialising the scratch for a sizeable kernel.
+    if kernel_volume >= lazy_im2col_min_kernel() {
+        return true;
+    }
+    // Shape-aware rule: prefer lazy when the eager scratch (`k·n·sizeof`) is large,
+    // even for a small kernel. `n` is the output spatial volume — the dimension the
+    // kernel-volume rule ignores but which actually drives the materialisation cost.
+    let n: usize = output_shape.hw_dims().iter().product();
+    let k = pool_spec.input_channels * kernel_volume / group;
+    let eager_scratch_bytes = k.saturating_mul(n).saturating_mul(dt.size_of());
+    eager_scratch_bytes >= lazy_im2col_max_eager_bytes()
 }
 
 #[allow(non_snake_case)]
diff --git a/core/src/ops/cnn/conv/depth_wise.rs b/core/src/ops/cnn/conv/depth_wise.rs
index 210e4c76f2..94de33b3bf 100644
--- a/core/src/ops/cnn/conv/depth_wise.rs
+++ b/core/src/ops/cnn/conv/depth_wise.rs
@@ -151,29 +151,27 @@ macro_rules! impl_eval {
                    zone, c_stride_i, c_stride_o, k_stride_i, iptr, kptr, bias, optr,
                    )
                    } else */
-                if zone.values_offsets.len() == 4 {
-                    [<process_zone_n_ $suffix>]::<T, 4, 4>(
+                match zone.values_offsets.len() {
+                    1 => [<process_zone_n_ $suffix>]::<T, 1, 4>(
                         dw, zone, c_stride_i, c_stride_o, k_stride_i, iptr, kptr, bias, optr, add, mul,
-                        )
-                        /*
-                           } else if zone.values_offsets.len() == 5 {
-                           dw.process_zone_n::<T, 5, 2>(
-                           zone, c_stride_i, c_stride_o, k_stride_i, iptr, kptr, bias, optr,
-                           )
-                           } else if zone.values_offsets.len() == 9 {
-                           dw.process_zone_n::<T, 9, 1>(
-                           zone, c_stride_i, c_stride_o, k_stride_i, iptr, kptr, bias, optr,
-                           )
-                           */
-                } else {
-                    zone.visit_output(&dw.patch, |visitor| {
+                    ),
+                    2 => [<process_zone_n_ $suffix>]::<T, 2, 4>(
+                        dw, zone, c_stride_i, c_stride_o, k_stride_i, iptr, kptr, bias, optr, add, mul,
+                    ),
+                    3 => [<process_zone_n_ $suffix>]::<T, 3, 4>(
+                        dw, zone, c_stride_i, c_stride_o, k_stride_i, iptr, kptr, bias, optr, add, mul,
+                    ),
+                    4 => [<process_zone_n_ $suffix>]::<T, 4, 4>(
+                        dw, zone, c_stride_i, c_stride_o, k_stride_i, iptr, kptr, bias, optr, add, mul,
+                    ),
+                    _ => zone.visit_output(&dw.patch, |visitor| {
                         for c in 0..*dw.input_shape.c() as isize {
                             let iptr = iptr.offset(c_stride_i * c);
                             let optr = optr.offset(c_stride_o * c);
                             let kptr = kptr.offset(k_stride_i * c);
                             [<inner_loop_ $suffix>]::<T>(iptr, kptr, bias, optr, c, visitor, add, mul)
                         }
-                    })
+                    }),
                 }
             }}
 
diff --git a/core/src/ops/cnn/conv/im2col.rs b/core/src/ops/cnn/conv/im2col.rs
index 34407dd0f9..6cec51ba95 100644
--- a/core/src/ops/cnn/conv/im2col.rs
+++ b/core/src/ops/cnn/conv/im2col.rs
@@ -1,7 +1,8 @@
 use tract_linalg::mmm::{
-    EagerPackedInput, MMMInputValue, MatMatMul, PackedExoticFact, PackedMatrixStorage,
+    EagerPackedInput, MMMInputFormat, MMMInputValue, MatMatMul, PackedExoticFact,
+    PackedMatrixStorage,
 };
-use tract_linalg::pack::{PackedFormat, PackingWriter};
+use tract_linalg::pack::{PackedFormat, PackedI8K4, PackingWriter};
 
 use crate::internal::*;
 use ndarray::prelude::*;
@@ -23,7 +24,8 @@ struct SymbolicGeometry {
     group: usize,
     pool_spec: PoolSpec,
     pool_geometry: PoolGeometry,
-    b_pack: PackedFormat,
+    // The kernel's activation packing: PackedFormat (K-major) or PackedI8K4 (K=4-inner).
+    out_format: Box<dyn MMMInputFormat>,
     k: usize,
 }
 
@@ -32,7 +34,7 @@ struct ConcreteGeometry {
     pool: ConcretePoolGeometry,
     pub n: usize,
     k: usize,
-    pub b_pack: PackedFormat,
+    pub out_format: Box<dyn MMMInputFormat>,
     pub ci_per_group: usize,
     patcher: Patcher,
     input_shape_with_n: DataShape,
@@ -40,10 +42,10 @@ struct ConcreteGeometry {
 }
 
 impl GeometryBound<SymbolicGeometry, ConcreteGeometry> {
-    pub fn b_pack(&self) -> &PackedFormat {
+    pub fn out_format(&self) -> &dyn MMMInputFormat {
         match self {
-            GeometryBound::Symbolic(s) => &s.b_pack,
-            GeometryBound::Concrete(s) => &s.b_pack,
+            GeometryBound::Symbolic(s) => &*s.out_format,
+            GeometryBound::Concrete(s) => &*s.out_format,
         }
     }
     pub fn k(&self) -> usize {
@@ -88,7 +90,7 @@ impl ResolveTo<ConcreteGeometry> for SymbolicGeometry {
             n,
             k: self.k,
             ci_per_group,
-            b_pack: self.b_pack.clone(),
+            out_format: self.out_format.clone(),
             patcher,
             input_shape_with_n,
             packed_shape,
@@ -105,15 +107,10 @@ impl Im2Col {
         mmm: Box<dyn MatMatMul>,
         packing: usize,
     ) -> TractResult<Im2Col> {
-        let b_pack = mmm.packings()[packing]
-            .1
-            .downcast_ref::<PackedFormat>()
-            .context("Im2Col expects regular packed format")?
-            .clone();
-
+        let out_format = dyn_clone::clone_box(&*mmm.packings()[packing].1);
         let pool_geometry = pool_spec.compute_geo(input_full_shape)?;
         let geometry: GeometryBound<_, _> =
-            SymbolicGeometry { group, pool_spec: pool_spec.clone(), pool_geometry, b_pack, k }
+            SymbolicGeometry { group, pool_spec: pool_spec.clone(), pool_geometry, out_format, k }
                 .into();
         let geometry = geometry.optimize_if(input_full_shape.as_concrete())?;
         Ok(Im2Col { pool_spec, group, geometry })
@@ -156,8 +153,21 @@ impl EvalOp for Im2Col {
             if !self.pool_spec.data_format.has_n() {
                 input.insert_axis(0)?;
             }
-            let panel_bytes =
-                geometry.b_pack.single_panel_len(geometry.k) * input.datum_type().size_of();
+            let dt = input.datum_type();
+            let r = geometry.out_format.r();
+            // Buffer geometry. zero_init for PackedI8K4: the K=4-inner writer skips
+            // the K-padding lanes (k..k_aligned), which SMOPA accumulates — they must
+            // be 0. PackedFormat has no K padding; its mn-padding maps to discarded
+            // output rows, so uninitialized is fine (matches prior behaviour).
+            let (single_panel_len, buf_align, zero_init) =
+                if let Some(pf) = geometry.out_format.downcast_ref::<PackedFormat>() {
+                    (pf.single_panel_len(geometry.k), pf.alignment(), false)
+                } else if let Some(p4) = geometry.out_format.downcast_ref::<PackedI8K4>() {
+                    (p4.single_panel_len(geometry.k), p4.alignment(), true)
+                } else {
+                    bail!("Im2Col: unsupported packing format {:?}", geometry.out_format)
+                };
+            let panel_bytes = single_panel_len * dt.size_of();
 
             let n_batches = *geometry.input_shape_with_n.n().unwrap_or(&1);
             let n_groups = self.group;
@@ -169,12 +179,15 @@ impl EvalOp for Im2Col {
                     let n =
                         if geometry.pool.output_shape.shape.contains(&0) { 0 } else { geometry.n };
                     let mut data = Tensor::uninitialized_aligned_dt(
-                        input.datum_type(),
-                        &[geometry.b_pack.len(geometry.k, n)],
-                        geometry.b_pack.alignment(),
+                        dt,
+                        &[n.divceil(r) * single_panel_len],
+                        buf_align,
                     )?;
+                    if zero_init {
+                        data.as_bytes_mut().fill(0);
+                    }
                     if n > 0 {
-                        dispatch_copy_by_size!(Patcher::patch(input.datum_type())(
+                        dispatch_copy_by_size!(Patcher::patch(dt)(
                             &geometry.patcher,
                             &geometry,
                             &input,
@@ -185,7 +198,7 @@ impl EvalOp for Im2Col {
                     }
                     values.push(Box::new(EagerPackedInput {
                         fact: PackedExoticFact {
-                            format: Box::new(geometry.b_pack.clone()),
+                            format: geometry.out_format.clone(),
                             k: geometry.k,
                             mn: n.to_dim(),
                         },
@@ -211,7 +224,7 @@ impl TypedOp for Im2Col {
         let output_shape = self.pool_spec.output_shape(&inputs[0].shape)?;
         let mn = output_shape.hw_dims().iter().product::<TDim>();
         let pof = PackedExoticFact {
-            format: Box::new(self.geometry.b_pack().clone()),
+            format: dyn_clone::clone_box(self.geometry.out_format()),
             k: self.geometry.k(),
             mn,
         };
@@ -259,34 +272,57 @@ impl Patcher {
         pack: &'p mut TensorView,
         g: usize,
         pad_value: Option<&Tensor>,
+    ) -> TractResult<()> {
+        // Pick the packing writer for the kernel's output format, then run the
+        // (writer-generic) patcher. PackedFormat keeps the K-major fast path;
+        // PackedI8K4 writes the SMOPA K=4-inner layout in the same single pass.
+        let ptr = unsafe { pack.as_slice_mut_unchecked::<T>().as_mut_ptr() };
+        if let Some(pf) = geo.out_format.downcast_ref::<PackedFormat>() {
+            let mut w = pf.write_with_k_outer(ptr, geo.k, geo.n);
+            self.run::<T, _>(geo, input, g, pad_value, &mut w)
+        } else if let Some(p4) = geo.out_format.downcast_ref::<PackedI8K4>() {
+            let mut w = p4.write_with_k_outer(ptr, geo.k, geo.n);
+            self.run::<T, _>(geo, input, g, pad_value, &mut w)
+        } else {
+            bail!("Im2Col: unsupported packing format {:?}", geo.out_format)
+        }
+    }
+
+    fn run<T: Copy + Datum + num_traits::Zero, W: PackingWriter<T>>(
+        &self,
+        geo: &ConcreteGeometry,
+        input: &TensorView,
+        g: usize,
+        pad_value: Option<&Tensor>,
+        writer: &mut W,
     ) -> TractResult<()> {
         match self {
-            Patcher::Valid1d => Self::valid_1d::<T>(geo, input, pack, g),
-            Patcher::Valid2d => Self::valid_2d::<T>(geo, input, pack, g),
-            Patcher::Padded2d => Self::padded_2d::<T>(
+            Patcher::Valid1d => Self::valid_1d::<T, W>(geo, input, g, writer),
+            Patcher::Valid2d => Self::valid_2d::<T, W>(geo, input, g, writer),
+            Patcher::Padded2d => Self::padded_2d::<T, W>(
                 geo,
                 input,
-                pack,
                 g,
                 pad_value.unwrap_or(&Tensor::zero_scalar::<T>()?),
+                writer,
             ),
-            _ => Self::generic::<T>(
+            _ => Self::generic::<T, W>(
                 geo,
                 input,
-                pack,
                 g,
                 pad_value.unwrap_or(&Tensor::zero_scalar::<T>()?),
+                writer,
             ),
         }
     }
 
     #[inline(never)]
-    fn generic<'p, T: Copy + Datum>(
-        geometry: &'p ConcreteGeometry,
+    fn generic<T: Copy + Datum, W: PackingWriter<T>>(
+        geometry: &ConcreteGeometry,
         input: &TensorView,
-        pack: &'p mut TensorView,
         g: usize,
         pad_value: &Tensor,
+        writer: &mut W,
     ) -> TractResult<()> {
         unsafe {
             let pad_value = *pad_value.to_scalar_unchecked();
@@ -307,33 +343,50 @@ impl Patcher {
                     }
                 }
             }
-            geometry.b_pack.pack(pack, mega_matrix.view(), 0, 1);
+            // mega_matrix is [k, n] (k-major); feed K-outer to the writer, which
+            // lays out the kernel's packing (K-major for PackedFormat, K=4-inner
+            // for PackedI8K4) — byte-identical to PackedFormat::pack for the former.
+            let mv = mega_matrix.as_slice_unchecked::<T>();
+            for kk in 0..geometry.k {
+                writer.write_slice(&mv[kk * geometry.n..(kk + 1) * geometry.n]);
+            }
             Ok(())
         }
     }
 
     #[inline(never)]
-    fn valid_1d<'p, T: Copy + Datum>(
-        geometry: &'p ConcreteGeometry,
+    fn valid_1d<T: Copy + Datum, W: PackingWriter<T>>(
+        geometry: &ConcreteGeometry,
         input: &TensorView,
-        pack: &'p mut TensorView,
         g: usize,
+        writer: &mut W,
     ) -> TractResult<()> {
         unsafe {
             let x_stride = *geometry.input_shape_with_n.h_stride() as isize
                 * geometry.pool.patch.spec.strides[0] as isize;
             let c_stride = *geometry.input_shape_with_n.c_stride() as isize;
-            let pack = pack.as_slice_mut_unchecked::<T>();
-            let mut writer =
-                geometry.b_pack.write_with_k_outer(pack.as_mut_ptr(), geometry.k, geometry.n);
             let iptr = input.as_ptr_unchecked::<T>();
             let iptr = iptr.add(g * geometry.ci_per_group * geometry.input_shape_with_n.c_stride());
+            let output_x = *geometry.pool.patch.output_shape.get_unchecked(0);
+            // Fast path: stride-1 contiguous read along x. Replaces the
+            // per-element pointer-arithmetic loop with a single write_slice
+            // (memcpy when the slice fits in the current panel).
+            // Byte-identical to the slow path (write_slice's contract).
+            let contiguous_x = x_stride == 1;
             for ci in 0..geometry.ci_per_group {
                 let iptr = iptr.offset(ci as isize * c_stride);
                 for koffset in &geometry.pool.patch.standard_layout_data_field {
                     let iptr = iptr.offset(*koffset);
-                    for x in 0..*geometry.pool.patch.output_shape.get_unchecked(0) {
-                        writer.write(*iptr.offset(x as isize * x_stride));
+                    if contiguous_x {
+                        let row = std::slice::from_raw_parts(iptr, output_x);
+                        writer.write_slice(row);
+                    } else {
+                        // Hoist multiplication out of inner loop.
+                        let mut iptr_x = iptr;
+                        for _ in 0..output_x {
+                            writer.write(*iptr_x);
+                            iptr_x = iptr_x.offset(x_stride);
+                        }
                     }
                 }
             }
@@ -342,16 +395,15 @@ impl Patcher {
     }
 
     #[inline(never)]
-    fn padded_2d<'p, T: Copy + Datum>(
-        geometry: &'p ConcreteGeometry,
+    fn padded_2d<T: Copy + Datum, W: PackingWriter<T>>(
+        geometry: &ConcreteGeometry,
         input: &TensorView,
-        pack: &'p mut TensorView,
         g: usize,
         pad_value: &Tensor,
+        writer: &mut W,
     ) -> TractResult<()> {
         unsafe {
             let pad_value = *pad_value.to_scalar_unchecked();
-            let pack = pack.as_slice_mut_unchecked::<T>();
             let y_stride = geometry.pool.patch.spec.strides[0] as isize;
             let x_stride = geometry.pool.patch.spec.strides[1] as isize;
             let shape = &geometry.input_shape_with_n;
@@ -361,8 +413,6 @@ impl Patcher {
             let input_heigth = shape.hw_dims()[0] as isize;
             let input_width = shape.hw_dims()[1] as isize;
             let kernel_len = geometry.pool.patch.standard_layout_data_field.len();
-            let mut writer =
-                geometry.b_pack.write_with_k_outer(pack.as_mut_ptr(), geometry.k, geometry.n);
             let iptr = input.as_ptr_unchecked::<T>();
             let iptr = iptr.add(g * geometry.ci_per_group * shape.c_stride());
             let output_width = *geometry.pool.patch.output_shape.get_unchecked(1);
@@ -388,22 +438,22 @@ impl Patcher {
                             Self::padded_2d_invalid_x_loop(
                                 valid_x_start as usize,
                                 pad_value,
-                                &mut writer,
+                                &mut *writer,
                             );
                             Self::padded_2d_valid_x_loop(
                                 valid_x_start,
                                 valid_x_end,
                                 x_stride_ptr,
                                 iptr,
-                                &mut writer,
+                                &mut *writer,
                             );
                             Self::padded_2d_invalid_x_loop(
                                 output_width - valid_x_end as usize,
                                 pad_value,
-                                &mut writer,
+                                &mut *writer,
                             );
                         } else {
-                            Self::padded_2d_invalid_x_loop(output_width, pad_value, &mut writer);
+                            Self::padded_2d_invalid_x_loop(output_width, pad_value, &mut *writer);
                         }
                     }
                 }
@@ -413,10 +463,10 @@ impl Patcher {
     }
 
     #[inline(never)]
-    unsafe fn padded_2d_invalid_x_loop<T: Copy + Datum>(
+    unsafe fn padded_2d_invalid_x_loop<T: Copy + Datum, W: PackingWriter<T>>(
         count: usize,
         pad_value: T,
-        writer: &mut tract_linalg::pack::KOutWriter<T>,
+        writer: &mut W,
     ) {
         for _ in 0..count {
             writer.write(pad_value);
@@ -424,46 +474,68 @@ impl Patcher {
     }
 
     #[inline(never)]
-    unsafe fn padded_2d_valid_x_loop<T: Copy + Datum>(
+    unsafe fn padded_2d_valid_x_loop<T: Copy + Datum, W: PackingWriter<T>>(
         x_min: isize,
         x_max: isize,
         x_stride_ptr: isize,
         iptr: *const T,
-        writer: &mut tract_linalg::pack::KOutWriter<T>,
+        writer: &mut W,
     ) {
-        for x in x_min..x_max {
-            writer.write(unsafe { *iptr.offset(x * x_stride_ptr) });
+        // Fast path: x_stride_ptr == 1 means consecutive x values are at
+        // consecutive memory addresses, so the inner loop is a contiguous
+        // slice write — byte-identical to the per-element loop.
+        if x_stride_ptr == 1 && x_max > x_min {
+            unsafe {
+                let row = std::slice::from_raw_parts(iptr.offset(x_min), (x_max - x_min) as usize);
+                writer.write_slice(row);
+            }
+        } else {
+            for x in x_min..x_max {
+                writer.write(unsafe { *iptr.offset(x * x_stride_ptr) });
+            }
         }
     }
 
     #[inline(never)]
-    fn valid_2d<'p, T: Copy + Datum>(
-        geometry: &'p ConcreteGeometry,
+    fn valid_2d<T: Copy + Datum, W: PackingWriter<T>>(
+        geometry: &ConcreteGeometry,
         input: &TensorView,
-        pack: &'p mut TensorView,
         g: usize,
+        writer: &mut W,
     ) -> TractResult<()> {
         unsafe {
-            let pack = pack.as_slice_mut_unchecked::<T>();
             let shape = &geometry.input_shape_with_n;
             let y_stride = geometry.pool.patch.spec.strides[0] as isize;
             let x_stride = geometry.pool.patch.spec.strides[1] as isize;
             let y_stride_ptr = y_stride * *shape.h_stride() as isize;
             let x_stride_ptr = x_stride * *shape.w_stride() as isize;
             let c_stride_ptr = *shape.c_stride() as isize;
-            let mut writer =
-                geometry.b_pack.write_with_k_outer(pack.as_mut_ptr(), geometry.k, geometry.n);
             let iptr = input.as_ptr_unchecked::<T>();
             let iptr = iptr.add(g * geometry.ci_per_group * shape.c_stride());
+            let output_y = *geometry.pool.patch.output_shape.get_unchecked(0);
+            let output_x = *geometry.pool.patch.output_shape.get_unchecked(1);
+            // Fast path: stride-1 contiguous reads along x within each y-row.
+            // Each y-row becomes a single write_slice (memcpy when the slice
+            // fits in the current panel). Byte-identical to the slow path.
+            let contiguous_x = x_stride_ptr == 1;
             for ci in 0..geometry.ci_per_group {
                 let iptr = iptr.offset(ci as isize * c_stride_ptr);
                 for koffset in &geometry.pool.patch.standard_layout_data_field {
                     let iptr = iptr.offset(*koffset);
-                    for y in 0..*geometry.pool.patch.output_shape.get_unchecked(0) {
-                        let iptr = iptr.offset(y as isize * y_stride_ptr);
-                        for x in 0..*geometry.pool.patch.output_shape.get_unchecked(1) {
-                            writer.write(*iptr.offset(x as isize * x_stride_ptr));
+                    let mut iptr_y = iptr;
+                    for _ in 0..output_y {
+                        if contiguous_x {
+                            let row = std::slice::from_raw_parts(iptr_y, output_x);
+                            writer.write_slice(row);
+                        } else {
+                            // Hoist x multiplication out of inner loop.
+                            let mut iptr_x = iptr_y;
+                            for _ in 0..output_x {
+                                writer.write(*iptr_x);
+                                iptr_x = iptr_x.offset(x_stride_ptr);
+                            }
                         }
+                        iptr_y = iptr_y.offset(y_stride_ptr);
                     }
                 }
             }
diff --git a/core/src/ops/cnn/conv/lazy_im2col.rs b/core/src/ops/cnn/conv/lazy_im2col.rs
index 9eaaf19170..6bb5d9ff46 100644
--- a/core/src/ops/cnn/conv/lazy_im2col.rs
+++ b/core/src/ops/cnn/conv/lazy_im2col.rs
@@ -80,6 +80,13 @@ impl ExoticFact for LazyIm2colParams {
 #[derive(Clone, Debug, Hash, PartialEq, Eq)]
 pub struct LazyIm2Col {
     pub params: Arc<LazyIm2colParams>,
+    /// Number of groups for grouped convolution (1 if ungrouped). The output is then a
+    /// `[1, group]`-batched packed tensor; each group reads from a different slice of the
+    /// input via a per-group offset of `g * ci_per_group * c_stride * size_of`.
+    pub group: usize,
+    /// Byte stride between consecutive groups in the input tensor's flat byte buffer.
+    /// Equal to `ci_per_group * c_stride * size_of(input)`. Unused (0) when `group == 1`.
+    pub group_stride_bytes: isize,
 }
 
 impl Op for LazyIm2Col {
@@ -98,9 +105,16 @@ impl EvalOp for LazyIm2Col {
     fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
         let tensor = args_1!(inputs);
         let dt = tensor.datum_type();
-        let input: Box<dyn MMMInputValue> =
-            Box::new(LazyIm2colInput { tensor, im2col: self.params.clone() });
-        let output = PackedMatrixStorage::new_batched(&[1, 1], vec![input]).into_tensor(dt);
+        let mut values: Vec<Box<dyn MMMInputValue>> = Vec::with_capacity(self.group);
+        for g in 0..self.group {
+            let group_offset_bytes = g as isize * self.group_stride_bytes;
+            values.push(Box::new(LazyIm2colInput {
+                tensor: tensor.clone(),
+                im2col: self.params.clone(),
+                group_offset_bytes,
+            }));
+        }
+        let output = PackedMatrixStorage::new_batched(&[1, self.group], values).into_tensor(dt);
         Ok(tvec!(output.into_tvalue()))
     }
 }
@@ -110,9 +124,9 @@ impl TypedOp for LazyIm2Col {
         let exotic_fact = DynPackedExoticFact {
             k: self.params.k_byte_offsets.len().to_dim(),
             mn: self.params.n_byte_offsets.len().to_dim(),
-            packers: vec![self.params.packer.clone()],
+            packers: vec![Box::new(self.params.packer.clone()) as Box<dyn MMMInputFormat>],
         };
-        Ok(tvec!(inputs[0].datum_type.fact([1, 1]).with_exotic_fact(exotic_fact)))
+        Ok(tvec!(inputs[0].datum_type.fact([1, self.group]).with_exotic_fact(exotic_fact)))
     }
 
     as_op!();
@@ -122,6 +136,9 @@ impl TypedOp for LazyIm2Col {
 struct LazyIm2colInput {
     tensor: TValue,
     im2col: Arc<LazyIm2colParams>,
+    /// Per-group base byte offset added to every gather. For group g this is
+    /// `g * ci_per_group * c_stride * size_of(input)`. Zero for ungrouped convs.
+    group_offset_bytes: isize,
 }
 
 impl Display for LazyIm2colInput {
@@ -132,7 +149,7 @@ impl Display for LazyIm2colInput {
 
 impl Hash for LazyIm2colInput {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        (self.tensor.as_bytes(), &self.im2col).hash(state);
+        (self.tensor.as_bytes(), &self.im2col, self.group_offset_bytes).hash(state);
     }
 }
 
@@ -149,7 +166,7 @@ impl LazyIm2colInput {
         let k_byte_offsets = self.im2col.k_byte_offsets.as_ptr();
         let n_byte_offsets = self.im2col.n_byte_offsets.as_ptr();
         unsafe {
-            let ptr = self.tensor.as_ptr_unchecked::<u8>();
+            let ptr = self.tensor.as_ptr_unchecked::<u8>().offset(self.group_offset_bytes);
             let o1 = *n_byte_offsets.offset(n);
             let o2 = *n_byte_offsets.offset(n + 1);
             let o3 = *n_byte_offsets.offset(n + 2);
@@ -187,7 +204,7 @@ impl LazyIm2colInput {
         n: isize,
     ) {
         unsafe {
-            let ptr = self.tensor.as_ptr_unchecked::<u8>();
+            let ptr = self.tensor.as_ptr_unchecked::<u8>().offset(self.group_offset_bytes);
             let k_byte_offsets = self.im2col.k_byte_offsets.as_ptr();
             let n_byte_offsets = self.im2col.n_byte_offsets.as_ptr();
             let o1 = *n_byte_offsets.offset(n);
@@ -221,7 +238,7 @@ impl LazyIm2colInput {
         n: isize,
     ) {
         unsafe {
-            let ptr = self.tensor.as_ptr_unchecked::<u8>();
+            let ptr = self.tensor.as_ptr_unchecked::<u8>().offset(self.group_offset_bytes);
             let k_byte_offsets = self.im2col.k_byte_offsets.as_ptr();
             let n_byte_offsets = self.im2col.n_byte_offsets.as_ptr();
             let o1 = *n_byte_offsets.offset(n);
@@ -249,7 +266,7 @@ impl LazyIm2colInput {
         n: isize,
     ) {
         unsafe {
-            let ptr = self.tensor.as_ptr_unchecked::<u8>();
+            let ptr = self.tensor.as_ptr_unchecked::<u8>().offset(self.group_offset_bytes);
             let k_byte_offsets = self.im2col.k_byte_offsets.as_ptr();
             let n_byte_offsets = self.im2col.n_byte_offsets.as_ptr();
             let o1 = *n_byte_offsets.offset(n);
@@ -280,7 +297,7 @@ impl LazyIm2colInput {
             _ => (),
         }
         unsafe {
-            let ptr = self.tensor.as_ptr_unchecked::<u8>();
+            let ptr = self.tensor.as_ptr_unchecked::<u8>().offset(self.group_offset_bytes);
             let k_byte_offsets = self.im2col.k_byte_offsets.as_ptr();
             let n_byte_offsets = self.im2col.n_byte_offsets.as_ptr();
             for k in k_range.start..k_range.end {
diff --git a/core/src/ops/cnn/conv/mod.rs b/core/src/ops/cnn/conv/mod.rs
index 0cdca69b88..c8d496a773 100644
--- a/core/src/ops/cnn/conv/mod.rs
+++ b/core/src/ops/cnn/conv/mod.rs
@@ -1,4 +1,5 @@
 mod block_quant;
+mod blocked;
 #[allow(clippy::module_inception)]
 mod conv;
 mod depth_wise;
@@ -9,6 +10,7 @@ mod q_sum_b;
 use crate::internal::*;
 use crate::ops::cnn::Deconv;
 
+pub use self::blocked::BlockedConv;
 pub use self::conv::Conv;
 pub use self::im2col::Im2Col;
 pub(crate) use self::q_sum_b::QSumB;
diff --git a/core/src/ops/cnn/conv/q_sum_b.rs b/core/src/ops/cnn/conv/q_sum_b.rs
index 434abbdf70..53f04aefcb 100644
--- a/core/src/ops/cnn/conv/q_sum_b.rs
+++ b/core/src/ops/cnn/conv/q_sum_b.rs
@@ -1,5 +1,6 @@
 use crate::internal::*;
 use tract_linalg::mmm::{MMMInputValue, PackedMatrixStorage};
+use tract_linalg::pack::PackedI8K4;
 use tract_ndarray::prelude::*;
 
 #[derive(Debug, Clone, Hash, PartialEq, Eq)]
@@ -80,14 +81,27 @@ impl QSumB {
         output: &mut [i32],
     ) -> TractResult<()> {
         let (r, k, n) = (input.format().r(), input.k(), input.mn());
+        // PackedI8K4 is K=4-inner: element (ik, ir) at (ik/4)*r*4 + ir*4 + ik%4,
+        // and the panel is k padded up to a multiple of 4. PackedFormat is K-major.
+        let is_k4 = input.format().downcast_ref::<PackedI8K4>().is_some();
+        let panel_len = if is_k4 { r * k.div_ceil(4) * 4 } else { r * k };
         let panels = n.divceil(r);
         for ipanel in 0..panels {
             let panel = input.panel_bytes(ipanel, None)?;
-            let panel: &[T] = unsafe { std::slice::from_raw_parts(panel as *const T, r * k) };
+            let panel: &[T] = unsafe { std::slice::from_raw_parts(panel as *const T, panel_len) };
             let mut vec = vec![0i32; r];
-            for ik in 0..k {
-                for ir in 0..r {
-                    vec[ir] += panel[ik * r + ir].as_();
+            if is_k4 {
+                for ik in 0..k {
+                    let kbase = (ik / 4) * r * 4 + ik % 4;
+                    for ir in 0..r {
+                        vec[ir] += panel[kbase + ir * 4].as_();
+                    }
+                }
+            } else {
+                for ik in 0..k {
+                    for ir in 0..r {
+                        vec[ir] += panel[ik * r + ir].as_();
+                    }
                 }
             }
             let len = r.min(n - r * ipanel);
diff --git a/core/src/ops/cnn/deconv/deconv_sum.rs b/core/src/ops/cnn/deconv/deconv_sum.rs
index 4e43560763..04479a9b2e 100644
--- a/core/src/ops/cnn/deconv/deconv_sum.rs
+++ b/core/src/ops/cnn/deconv/deconv_sum.rs
@@ -106,17 +106,17 @@ impl TypedOp for DeconvSum {
         Ok(tvec!(inputs[0].datum_type.fact(shape)))
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
         target.wire_node(
             &node.name,
-            Self { input_shape: self.input_shape.eval(values)?.into_owned(), ..self.clone() },
+            Self { input_shape: self.input_shape.substitute(subs)?.into_owned(), ..self.clone() },
             &[mapping[&node.inputs[0]], mapping[&node.inputs[1]]],
         )
     }
diff --git a/core/src/ops/downsample/scan.rs b/core/src/ops/downsample/scan.rs
index fff2fb4cd7..1e097f617a 100644
--- a/core/src/ops/downsample/scan.rs
+++ b/core/src/ops/downsample/scan.rs
@@ -81,10 +81,10 @@ pub fn pull_downsample_over_scan(
                 inputs.push(ds);
             }
             InputMapping::Scan(info) => {
-                if info.chunk > 0 && !(info.chunk as usize).is_multiple_of(down_op.stride as usize)
-                {
-                    return Ok(None);
-                }
+                rule_if!(
+                    info.chunk <= 0
+                        || (info.chunk as usize).is_multiple_of(down_op.stride as usize)
+                );
                 info.chunk = info.chunk.unsigned_abs().divceil(down_op.stride as usize) as isize
                     * info.chunk.signum();
                 let tap = patch.tap_model(model, scan_node.inputs[slot])?;
diff --git a/core/src/ops/einsum/as_blas.rs b/core/src/ops/einsum/as_blas.rs
deleted file mode 100644
index c195c88cac..0000000000
--- a/core/src/ops/einsum/as_blas.rs
+++ /dev/null
@@ -1,172 +0,0 @@
-use tract_ndarray::Dimension;
-
-use crate::transform::ModelTransform;
-use crate::{broadcast, internal::*};
-use std::fmt::Debug;
-
-use super::prefix_matmul::{PrefixMatMul, rewrite_einsum_to_prefix_matmul};
-
-#[derive(Debug, Default)]
-pub struct AsBlas;
-
-impl ModelTransform for AsBlas {
-    fn name(&self) -> StaticName {
-        "as_blas".into()
-    }
-
-    fn transform(&self, model: &mut TypedModel) -> TractResult<()> {
-        rewrite_einsum_to_prefix_matmul(model, true)?;
-        Rewriter::default()
-            .with_rule_for("matmul-to-sgemm", matmul_to_sgemm)
-            .rewrite(&(), model)?;
-        Ok(())
-    }
-}
-
-fn matmul_to_sgemm(
-    _ctx: &(),
-    model: &TypedModel,
-    node: &TypedNode,
-    _node_name: &str,
-    op: &PrefixMatMul,
-) -> TractResult<Option<TypedModelPatch>> {
-    if !op.transpose_a
-        && !op.transpose_b
-        && !op.transpose_c
-        && op.quantize_output.is_none()
-        && model.node_input_facts(node.id)?.iter().all(|f| f.datum_type == f32::datum_type())
-    {
-        TypedModelPatch::replace_single_op(model, node, &node.inputs, SGemm::default()).map(Some)
-    } else {
-        Ok(None)
-    }
-}
-
-#[derive(Debug, Default, Clone, PartialEq, Eq)]
-pub struct SGemm {}
-
-impl Op for SGemm {
-    fn name(&self) -> StaticName {
-        "SGemm".into()
-    }
-
-    op_as_typed_op!();
-}
-
-impl SGemm {
-    fn output_shape<D: DimLike>(&self, a: &[D], b: &[D]) -> TractResult<TVec<D>> {
-        ensure!(a.len() == b.len());
-        let a_rank = a.len();
-        let b_rank = b.len();
-        let m = a[a_rank - 2].clone();
-        let n = b[b_rank - 1].clone();
-        let mut c_shape = broadcast::multi_broadcast(&[&a[..a_rank - 2], &b[..b_rank - 2]])
-            .context("Unable to broadcast")?;
-        c_shape.push(m);
-        c_shape.push(n);
-        Ok(c_shape)
-    }
-}
-
-impl EvalOp for SGemm {
-    fn is_stateless(&self) -> bool {
-        true
-    }
-
-    fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
-        let (a, b) = args_2!(inputs);
-        let a_ptr = a.as_ptr::<f32>()?;
-        let b_ptr = b.as_ptr::<f32>()?;
-        let c_shape = self.output_shape(a.shape(), b.shape())?;
-        let rank = c_shape.len();
-        let m = c_shape[rank - 2];
-        let n = c_shape[rank - 1];
-        let k = a.shape()[rank - 1];
-        unsafe {
-            let mut c = Tensor::uninitialized::<f32>(&c_shape)?;
-            let c_ptr = c.as_ptr_mut::<f32>()?;
-            let silent_a_axis = c.rank() - a.rank();
-            let silent_b_axis = c.rank() - b.rank();
-            for prefix in ndarray::indices(&c_shape[0..rank - 2]) {
-                let mut a_ptr = a_ptr;
-                let mut b_ptr = b_ptr;
-                let mut c_ptr = c_ptr;
-                for (axis, x) in prefix.as_array_view().iter().enumerate() {
-                    if axis >= silent_a_axis && a.shape()[axis - silent_a_axis] != 1 {
-                        a_ptr = a_ptr.offset(*x as isize * a.strides()[axis - silent_a_axis]);
-                    }
-                    if axis >= silent_b_axis && b.shape()[axis - silent_b_axis] != 1 {
-                        b_ptr = b_ptr.offset(*x as isize * b.strides()[axis - silent_b_axis]);
-                    }
-                    c_ptr = c_ptr.offset(*x as isize * c.strides()[axis]);
-                }
-                if m == 1 {
-                    cblas::sgemv(
-                        cblas::Layout::RowMajor,
-                        cblas::Transpose::Ordinary,
-                        k as _,
-                        n as _,
-                        1.0,
-                        std::slice::from_raw_parts(b_ptr, n * k),
-                        n as _,
-                        std::slice::from_raw_parts(a_ptr, k),
-                        1,
-                        0.0,
-                        std::slice::from_raw_parts_mut(c_ptr, n),
-                        1,
-                    )
-                } else if n == 1 {
-                    cblas::sgemv(
-                        cblas::Layout::RowMajor,
-                        cblas::Transpose::None,
-                        m as _,
-                        k as _,
-                        1.0,
-                        std::slice::from_raw_parts(a_ptr, m * k),
-                        k as _,
-                        std::slice::from_raw_parts(b_ptr, k),
-                        1,
-                        0.0,
-                        std::slice::from_raw_parts_mut(c_ptr, m),
-                        1,
-                    )
-                } else {
-                    cblas::sgemm(
-                        cblas::Layout::RowMajor,
-                        cblas::Transpose::None,
-                        cblas::Transpose::None,
-                        m as _,
-                        n as _,
-                        k as _,
-                        1.0,
-                        std::slice::from_raw_parts(a_ptr, m * k),
-                        k as _,
-                        std::slice::from_raw_parts(b_ptr, k * n),
-                        n as _,
-                        0.0,
-                        std::slice::from_raw_parts_mut(c_ptr, m * n),
-                        n as _,
-                    )
-                }
-            }
-
-            Ok(tvec!(c.into_tvalue()))
-        }
-    }
-}
-
-impl TypedOp for SGemm {
-    fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
-        ensure!(inputs[0].datum_type == f32::datum_type());
-        ensure!(inputs[1].datum_type == f32::datum_type());
-        Ok(tvec!(f32::fact(&self.output_shape(&inputs[0].shape, &inputs[1].shape)?)))
-    }
-
-    fn cost(&self, inputs: &[&TypedFact]) -> TractResult<TVec<(Cost, TDim)>> {
-        let fma = self.output_shape(&inputs[0].shape, &inputs[1].shape)?.iter().product::<TDim>()
-            * inputs[0].shape.last().unwrap();
-        Ok(tvec!((Cost::FMA(f32::datum_type()), fma)))
-    }
-
-    as_op!();
-}
diff --git a/core/src/ops/einsum/einsum_matmul.rs b/core/src/ops/einsum/einsum_matmul.rs
index d2d0f73692..cf497af0c1 100644
--- a/core/src/ops/einsum/einsum_matmul.rs
+++ b/core/src/ops/einsum/einsum_matmul.rs
@@ -3,7 +3,6 @@ use std::ops::Deref;
 
 use tract_itertools::{izip, multiunzip};
 use tract_linalg::block_quant::PackedBlockQuantFormat;
-use tract_linalg::pack::PackedFormat;
 
 use super::*;
 use crate::ops::cast::cast;
@@ -49,6 +48,25 @@ fn merge_same_role_axes_rule(
     let b_order: Vec<char> = op.axes.axes(InOut::In(1)).map(|a| a.repr).collect();
     let c_order: Vec<char> = op.axes.axes(InOut::Out(0)).map(|a| a.repr).collect();
 
+    // Per-input shapes, used both to reject broadcast merges and to gauge
+    // whether a merge earns its reshape / axis-permute cost.
+    let input_facts = model.node_input_facts(node.id)?;
+    let input_shapes = op.actual_input_shapes_from_facts(&input_facts)?;
+    // An axis is "non-unit" if its extent is not statically 1 (symbolic extents
+    // count as potentially large). A fold only reduces the number of matmul
+    // invocations when it combines at least two non-unit axes; folding a unit
+    // axis — the streaming axis at pulse=1, or a batch axis of 1 — leaves the
+    // matmul geometry untouched and only adds reshapes (and, in the k-axis
+    // branch, a MoveAxis), so we decline those.
+    let is_non_unit = |c: &char| -> bool {
+        let dim = a_order
+            .iter()
+            .position(|x| x == c)
+            .map(|p| &input_shapes[0][p])
+            .or_else(|| b_order.iter().position(|x| x == c).map(|p| &input_shapes[1][p]));
+        dim.map_or(true, |d| d.as_i64() != Some(1))
+    };
+
     // Find first group of 2+ same-role axes that are consecutive in all inputs.
     // Scan each input's axis order for runs of same-role axes.
     let role_map: std::collections::HashMap<char, Role> = axes.iter().cloned().collect();
@@ -100,10 +118,28 @@ fn merge_same_role_axes_rule(
         }
     }
 
+    if let Some(ref group) = best_group {
+        // Reject the group if any axis has mismatched per-input dims. This
+        // catches broadcasting cases — e.g. GQA `bhgmk,bhgnk->bhgmn` where g has
+        // dim 2 in input[0] and dim 1 in input[1]. Merging would collapse the
+        // broadcast structure into a non-broadcast dim mismatch that downstream
+        // OptMatMul codegen / kernels cannot handle.
+        let dims_match = group.iter().all(|c| {
+            match (a_order.iter().position(|x| x == c), b_order.iter().position(|x| x == c)) {
+                (Some(p0), Some(p1)) => input_shapes[0][p0] == input_shapes[1][p1],
+                _ => true,
+            }
+        });
+        // Decline merges that combine fewer than two non-unit axes: they don't
+        // shrink the matmul loop count, they only add reshapes / a MoveAxis.
+        let worth_merging = group.iter().filter(|c| is_non_unit(c)).count() >= 2;
+        if !dims_match || !worth_merging {
+            best_group = None;
+        }
+    }
+
     if let Some(group) = best_group {
         // Found a mergeable group. Emit the patch.
-        let input_facts = model.node_input_facts(node.id)?;
-        let input_shapes = op.actual_input_shapes_from_facts(&input_facts)?;
         let output_shape = super::eval::output_shape(&op.axes, &input_shapes)?;
 
         let drop_set: Vec<char> = group[1..].to_vec();
@@ -232,6 +268,14 @@ fn merge_same_role_axes_rule(
             if left_role != right_role || mid_role != Some(k_role) {
                 continue;
             }
+            // Only move the k-axis aside if the resulting merge is worth it:
+            // both axes we'd bring together must be non-unit. Otherwise the
+            // MoveAxis buys nothing (e.g. a 1×1 conv at pulse=1, where the
+            // batch / streaming axes are 1 and the only real axis was already
+            // the matmul m).
+            if !is_non_unit(&left) || !is_non_unit(&right) {
+                continue;
+            }
             // left and right must also be consecutive in other inputs
             // (output order is handled by the EinSum formula)
             let other_input_orders: Vec<&Vec<char>> = [(0, &a_order), (1, &b_order)]
@@ -819,23 +863,21 @@ fn optimized_mat_mul(
     let name = &node.name;
 
     let pack_a: Box<dyn TypedOp> = if input_facts[0].konst.is_some() {
-        if let Some(pf) = left_pack.downcast_ref::<PackedFormat>() {
-            Box::new(OptMatMulPack {
-                packers: vec![pf.clone()],
-                mode_picker: ModePicker::Single,
-                k_axis: op.a_k(),
-                mn_axis: op.a_m(),
-            })
-        } else if let Some(packed_format) =
-            left_pack.downcast_ref::<PackedBlockQuantFormat>().cloned()
-        {
+        if let Some(packed_format) = left_pack.downcast_ref::<PackedBlockQuantFormat>().cloned() {
             Box::new(OptSimpleMatMulPack {
                 packed_format,
                 k: input_shapes[0][op.a_k()].to_usize().unwrap(),
                 m: input_shapes[0][op.a_m()].to_usize().unwrap(),
             })
         } else {
-            bail!("Unexpected static input format {left_pack:?}");
+            // PackedFormat or a custom packer (e.g. PackedI8K4); OptMatMulPack
+            // dispatches on the concrete format at pack time.
+            Box::new(OptMatMulPack {
+                packers: vec![left_pack],
+                mode_picker: ModePicker::Single,
+                k_axis: op.a_k(),
+                mn_axis: op.a_m(),
+            })
         }
     } else {
         Box::new(OptMatMulPack {
@@ -843,11 +885,8 @@ fn optimized_mat_mul(
                 .iter()
                 .map(|(mmm, p, pe)| {
                     pe.as_ref()
-                        .map(|pe| &pe.from)
-                        .unwrap_or(&mmm.packings()[*p].0)
-                        .downcast_ref::<PackedFormat>()
-                        .unwrap()
-                        .clone()
+                        .map(|pe| pe.from.clone())
+                        .unwrap_or_else(|| mmm.packings()[*p].0.clone())
                 })
                 .collect(),
             mode_picker: mode_picker.clone(),
@@ -862,12 +901,7 @@ fn optimized_mat_mul(
         OptMatMulPack {
             k_axis: op.b_k(),
             mn_axis: op.b_n(),
-            packers: impls
-                .iter()
-                .map(|(mmm, p, _)| {
-                    mmm.packings()[*p].1.downcast_ref::<PackedFormat>().unwrap().clone()
-                })
-                .collect(),
+            packers: impls.iter().map(|(mmm, p, _)| mmm.packings()[*p].1.clone()).collect(),
             mode_picker: mode_picker.clone(),
         },
         &[taps[1]],
diff --git a/core/src/ops/einsum/kernel_selection.rs b/core/src/ops/einsum/kernel_selection.rs
index e2a7ed988a..aa00942b18 100644
--- a/core/src/ops/einsum/kernel_selection.rs
+++ b/core/src/ops/einsum/kernel_selection.rs
@@ -53,8 +53,14 @@ pub fn strategize(model: &TypedModel, node: &TypedNode, op: &EinSumMatMul) -> Tr
         return Ok(single_strat(it));
     }
     if op.n.as_i64().is_some_and(|n| n > 1) {
-        let it =
-            impls.into_iter().max_by_key(|(m, _, pe)| (pe.is_none(), m.nr() * m.mr())).unwrap();
+        // For a 2D matmul (n > 1) a GEMV kernel (nr == 1) is a poor fit: it
+        // processes one output column per pass. Demote nr == 1 so it never wins
+        // the `nr * mr` tie against a square tile (e.g. i8 64x1 vs 8x8 both have
+        // nr*mr == 64). Ordering among nr > 1 kernels is left untouched.
+        let it = impls
+            .into_iter()
+            .max_by_key(|(m, _, pe)| (pe.is_none(), m.nr() > 1, m.nr() * m.mr()))
+            .unwrap();
         return Ok(single_strat(it));
     }
     let mut grouped_by_left_packing = Vec::<(&dyn MMMInputFormat, Vec<_>)>::new();
@@ -79,7 +85,21 @@ pub fn strategize(model: &TypedModel, node: &TypedNode, op: &EinSumMatMul) -> Tr
             (p, best_for_mmv, best_for_mmm)
         })
         .max_by_key(|(_, mmv, mmm)| {
-            (mmv.0.nr() == 1 && mmm.0.nr() > 1, mmv.2.is_none(), mmm.0.mr(), mmm.0.nr())
+            // When no group offers the ideal (true GEMV nr==1 + true matrix nr>1)
+            // pair, still prefer a group whose matrix-role kernel is a real matrix
+            // (nr > 1) over a GEMV-only group. Without this, int8 — whose GEMV
+            // (64x1), SMLAL (8x8) and SDOT (8x8_dot) kernels each use a different
+            // packing, so no single group is ideal — falls through to `mmm.mr` and
+            // picks the 64x1 GEMV even for symbolic (dynamic) n. f32/f16/block-quant
+            // are unaffected: they have a packing group that IS ideal (e.g. f32
+            // 32x1/32x3, q40 32x1/32x3), so the first key already decides.
+            (
+                mmv.0.nr() == 1 && mmm.0.nr() > 1,
+                mmv.2.is_none(),
+                mmm.0.nr() > 1,
+                mmm.0.mr(),
+                mmm.0.nr(),
+            )
         })
         .unwrap();
 
@@ -117,7 +137,11 @@ pub fn list_impls(
         .mmm_impls()
         .iter()
         .filter(|mmm| {
-            op.acceptable_accumulators().contains(&mmm.internal_type())
+            // Only consider kernels runnable on this CPU: e.g. the SDOT i8 kernel
+            // carries a FEAT_DotProd platform predicate, and must not be selected on
+            // a CPU that would trap on the instruction.
+            mmm.is_supported_here()
+                && op.acceptable_accumulators().contains(&mmm.internal_type())
                 && mmm.stores().contains(&op.operating_dt.unquantized())
         })
         .flat_map(move |mmm| {
diff --git a/core/src/ops/einsum/mod.rs b/core/src/ops/einsum/mod.rs
index cb6db25e1b..82988bdf68 100644
--- a/core/src/ops/einsum/mod.rs
+++ b/core/src/ops/einsum/mod.rs
@@ -7,8 +7,6 @@ use crate::tract_data::itertools::Itertools;
 
 mod eval;
 
-#[cfg(feature = "blas")]
-pub mod as_blas;
 pub mod einsum_matmul;
 pub mod kernel_selection;
 pub mod prefix_matmul;
@@ -89,9 +87,8 @@ impl EinSum {
         let mut taps = tvec!();
         for (ix, input) in node.inputs.iter().enumerate() {
             let mut tap = patch.tap_model(model, *input)?;
-            if new_axis.inputs[ix].len() > 1 {
-                return Ok(None); // FIXME maybe
-            } else if new_axis.inputs[ix].is_empty() {
+            rule_if!(new_axis.inputs[ix].len() <= 1); // FIXME maybe
+            if new_axis.inputs[ix].is_empty() {
                 let insert_at = self.axes.rank(InOut::In(ix));
                 tap = patch.wire_node(
                     format!("{}.prop_axis.{}.input_{}", &node.name, new_axis.repr, ix),
@@ -223,7 +220,84 @@ impl TypedOp for EinSum {
         model: &TypedModel,
         node: &TypedNode,
     ) -> TractResult<Option<TVec<Option<TDim>>>> {
-        crate::optim::propagate_roi::bubble_roi(model, node)
+        // First try bubble_roi: works for inputs that cover all ROI coord
+        // axes mentioned in the output ROI.  For inputs that DON'T cover
+        // every coord axis (= contracted/projected-out axes from this
+        // input's perspective), try the closed-form chunked-band recogniser
+        // which yields a constant band on the input's kept axis after
+        // existentially quantifying the projected axes.
+        let output_fact = model.outlet_fact(OutletId::new(node.id, 0))?;
+        let Some(roi) = &output_fact.region_of_interest else { return Ok(None) };
+        let input_facts: TVec<&TypedFact> =
+            node.inputs.iter().map(|i| model.outlet_fact(*i)).collect::<TractResult<_>>()?;
+        let output_facts = tvec![output_fact];
+        let inputs_ref: Vec<&TypedFact> = input_facts.iter().copied().collect();
+        let outputs_ref: Vec<&TypedFact> = output_facts.iter().copied().collect();
+        let mapping = self.axes_mapping(&inputs_ref, &outputs_ref)?;
+        let roi_coord_axes: Vec<(usize, Symbol)> = roi
+            .symbols()
+            .into_iter()
+            .filter_map(|s| crate::ops::logic::sym_to_coord_axis(&s).map(|k| (k, s)))
+            .collect();
+
+        let project_for_input = |input_ix: usize| -> Option<TDim> {
+            // Classify each output ROI coord axis: projected (no input axis)
+            // or preserved (maps to input).
+            let mut projected: Vec<Symbol> = vec![];
+            let mut preserved: Vec<(Symbol, usize)> = vec![];
+            for (out_pos, sym) in &roi_coord_axes {
+                let logical = mapping
+                    .iter_all_axes()
+                    .find(|a| a.outputs.first().is_some_and(|o| o.contains(out_pos)))?;
+                match logical.inputs[input_ix].first() {
+                    None => projected.push(sym.clone()),
+                    Some(&in_pos) => {
+                        if input_facts[input_ix].shape[in_pos] != output_fact.shape[*out_pos] {
+                            return None;
+                        }
+                        preserved.push((sym.clone(), in_pos));
+                    }
+                }
+            }
+            if projected.is_empty() {
+                // All axes preserved — fall through to standard remap.
+                let mut sub_map: HashMap<Symbol, TDim> = HashMap::new();
+                for (sym, in_pos) in &preserved {
+                    if crate::ops::logic::sym_to_coord_axis(sym) != Some(*in_pos) {
+                        let scope = sym.scope()?;
+                        sub_map.insert(sym.clone(), TDim::Sym(scope.coord_sym(*in_pos)));
+                    }
+                }
+                return if sub_map.is_empty() {
+                    Some(roi.clone())
+                } else {
+                    roi.substitute_all(&sub_map).ok()
+                };
+            }
+            // Try the chunked-band recogniser: one projected axis × one
+            // preserved axis at a time.
+            for p_sym in &projected {
+                for (k_sym, k_in_pos) in &preserved {
+                    if let Some(band) = crate::optim::propagate_roi::recognise_chunked_band_project(
+                        roi, p_sym, k_sym,
+                    ) {
+                        // Result mentions k_sym (output frame).  Remap to
+                        // input axis position.
+                        if crate::ops::logic::sym_to_coord_axis(k_sym) != Some(*k_in_pos) {
+                            let scope = k_sym.scope()?;
+                            let mut m: HashMap<Symbol, TDim> = HashMap::new();
+                            m.insert(k_sym.clone(), TDim::Sym(scope.coord_sym(*k_in_pos)));
+                            return band.substitute_all(&m).ok();
+                        }
+                        return Some(band);
+                    }
+                }
+            }
+            None
+        };
+        let result: TVec<Option<TDim>> =
+            (0..node.inputs.len()).map(|ix| project_for_input(ix)).collect();
+        Ok(Some(result))
     }
 
     fn axes_mapping(
@@ -341,6 +415,9 @@ impl TypedOp for EinSum {
         if let Some(patch) = declutter_broadcast(self, session, model, node)? {
             return Ok(Some(patch));
         }
+        if let Some(patch) = unit_k_to_broadcast_mul(self, model, node)? {
+            return Ok(Some(patch));
+        }
         Ok(None)
     }
 
@@ -353,6 +430,14 @@ impl TypedOp for EinSum {
             (self.q_params.is_none() && node.inputs.len() == 2)
                 || (self.q_params.is_some() && node.inputs.len() == 9)
         );
+        // Some EinSums are introduced during codegen itself (e.g. ConvTranspose lowering
+        // emits an EinSum + DeconvSum pair). Those don't get a chance to go through declutter
+        // before being lowered, so we re-check the unit-K → broadcast-Mul rule here as a
+        // fast path. For EinSums that already existed at declutter time, this is a no-op
+        // (the declutter pass would already have rewritten them).
+        if let Some(patch) = unit_k_to_broadcast_mul(self, model, node)? {
+            return Ok(Some(patch));
+        }
         einsum_matmul::detect_rule(&(), model, node, &node.name, self)
     }
 
@@ -379,18 +464,14 @@ fn declutter_reshape_folding_input_axis(
             axes = axes.with_extra_axis(*label, InOut::In(extra_input), 0)?;
         }
         let folded_axis = op.axes.axis((InOut::In(slot), at))?;
-        if folded_axis.outputs[0].len() > 1 {
-            return Ok(None);
-        };
+        rule_if!(folded_axis.outputs[0].len() <= 1);
         let mut patch = TypedModelPatch::default();
         let mut taps = patch.taps(model, &node.inputs)?;
         for (input, tap) in taps.iter_mut().enumerate() {
             if folded_axis.inputs[input].len() == 0 {
                 continue;
             };
-            if folded_axis.inputs[input].len() > 1 {
-                return Ok(None);
-            };
+            rule_if!(folded_axis.inputs[input].len() <= 1);
             let pos = folded_axis.inputs[input][0];
             for label in &extra_labels {
                 axes = axes.with_extra_axis_occurency(*label, InOut::In(input), pos)?;
@@ -442,3 +523,152 @@ fn declutter_broadcast(
     }
     Ok(None)
 }
+
+/// Rewrite an EinSum whose contraction product is statically 1 as a broadcast Mul.
+///
+/// Triggers when:
+/// - All "k-like" axes (present in both inputs, absent from output) have shape 1 in both inputs, OR
+/// - There are no k-like axes at all (Hadamard products like `mn,mn->mn`, outer products like
+///   `m,n->mn`, or any pure broadcast pattern).
+///
+/// In both cases the einsum has no real contraction work — it's a broadcast multiplication
+/// dressed up as an einsum. Lowering it as a matmul leaves the GEMM kernel running per-tile
+/// setup (clear, panel-load, store) for at most one FMA, so a direct broadcast Mul is much
+/// faster on Native (and a net semantic simplification regardless of perf).
+///
+/// Quantized einsums are left untouched: the existing `dequant` path in `EinSumMatMul::codegen`
+/// produces a non-q einsum that this rule then catches naturally on the next declutter pass.
+fn unit_k_to_broadcast_mul(
+    op: &EinSum,
+    model: &TypedModel,
+    node: &TypedNode,
+) -> TractResult<Option<TypedModelPatch>> {
+    if op.q_params.is_some() || node.inputs.len() != 2 {
+        return Ok(None);
+    }
+    let input_facts = model.node_input_facts(node.id)?;
+    let input_shapes = op.actual_input_shapes_from_facts(&input_facts)?;
+    let k_axes: TVec<&Axis> = op
+        .axes
+        .iter_all_axes()
+        .filter(|a| a.inputs[0].len() == 1 && a.inputs[1].len() == 1 && a.outputs[0].is_empty())
+        .collect();
+    // Bail if any k-axis is non-trivial — that's a real contraction, leave it to matmul lowering.
+    let any_nontrivial_k = k_axes.iter().any(|a| {
+        !input_shapes[0][a.inputs[0][0]].is_one() || !input_shapes[1][a.inputs[1][0]].is_one()
+    });
+    if any_nontrivial_k {
+        return Ok(None);
+    }
+    // Scope: only fire when this einsum's output is consumed by a DeconvSum (i.e. it was
+    // emitted by the ConvTranspose lowering pipeline in `Deconv::wire_with_deconv_sum`).
+    // That's the original target case (DFN3 / GTCRN depthwise ConvTranspose with 1×N kernel
+    // collapsing to K=1 — see PR #2183). Other K=1 einsums (e.g. degenerate Q@K^T inside
+    // SDPA when head_dim=1, random-shape proptests with K=1) are intentionally left alone:
+    // backend-specific pipelines (Metal SDPA fusion, MetalMul rank-4 broadcast-segment limit,
+    // …) pattern-match on the matmul shape and break when we substitute a Mul.
+    let has_deconv_sum_consumer = node.outputs.first().map_or(false, |o| {
+        o.successors.iter().any(|inlet| model.node(inlet.node).op.name() == "DeconvSum")
+    });
+    if !has_deconv_sum_consumer {
+        return Ok(None);
+    }
+
+    let one = TDim::one();
+    // Reject "non-trivial single-side disappearing" axes — those need a real reduction.
+    for axis in op.axes.iter_all_axes() {
+        let in_left =
+            axis.inputs[0].first().map(|pos| &input_shapes[0][*pos]).unwrap_or(&one) != &one;
+        let in_right =
+            axis.inputs[1].first().map(|pos| &input_shapes[1][*pos]).unwrap_or(&one) != &one;
+        let in_out = !axis.outputs[0].is_empty();
+        if (in_left ^ in_right) && !in_out {
+            return Ok(None);
+        }
+    }
+
+    let c_axes: Vec<char> = op.axes.axes(InOut::Out(0)).map(|a| a.repr).collect();
+    if c_axes.is_empty() {
+        return Ok(None);
+    }
+
+    let k_reprs: TVec<char> = k_axes.iter().map(|a| a.repr).collect();
+    let mut patch = TypedModelPatch::new("EinSum unit-K → broadcast Mul");
+    let mut wires: TVec<OutletId> = patch.taps(model, &node.inputs)?;
+    let name = &node.name;
+
+    for (slot, wire) in wires.iter_mut().enumerate() {
+        // Promote inputs to operating_dt so the result type matches EinSum::output_facts
+        // (e.g. i8 inputs with i32 operating_dt for an integer matmul that has been dequantized).
+        let cur_dt = patch.outlet_fact(*wire)?.datum_type;
+        if cur_dt != op.operating_dt {
+            *wire = patch.wire_node(
+                format!("{name}.cast_in{slot}"),
+                crate::ops::cast::cast(op.operating_dt),
+                &[*wire],
+            )?[0];
+        }
+
+        // Drop k axes (sorted descending so positions stay valid).
+        let mut k_positions: Vec<usize> = k_axes.iter().map(|a| a.inputs[slot][0]).collect();
+        k_positions.sort_by(|a, b| b.cmp(a));
+        for (i, pos) in k_positions.into_iter().enumerate() {
+            *wire =
+                patch.wire_node(format!("{name}.rm_k_in{slot}.{i}"), AxisOp::Rm(pos), &[*wire])?[0];
+        }
+
+        let mut current: Vec<char> = op
+            .axes
+            .axes(InOut::In(slot))
+            .map(|a| a.repr)
+            .filter(|c| !k_reprs.contains(c))
+            .collect();
+
+        // Drop any remaining axes not in output (must be size 1 by precondition above).
+        let mut to_drop: Vec<(usize, char)> = current
+            .iter()
+            .enumerate()
+            .filter(|(_, c)| !c_axes.contains(c))
+            .map(|(i, c)| (i, *c))
+            .collect();
+        to_drop.sort_by(|a, b| b.0.cmp(&a.0));
+        for (pos, c) in to_drop {
+            *wire = patch.wire_node(
+                format!("{name}.rm_extra_in{slot}_{c}"),
+                AxisOp::Rm(pos),
+                &[*wire],
+            )?[0];
+            current.remove(pos);
+        }
+
+        // Insert unit axes for output axes missing from this input.
+        for (target_pos, &t) in c_axes.iter().enumerate() {
+            if !current.contains(&t) {
+                *wire = patch.wire_node(
+                    format!("{name}.add_in{slot}_{t}"),
+                    AxisOp::Add(target_pos),
+                    &[*wire],
+                )?[0];
+                current.insert(target_pos, t);
+            }
+        }
+
+        // Permute to match output axis order.
+        for (target_pos, &t) in c_axes.iter().enumerate() {
+            let cur_pos = current.iter().position(|&c| c == t).unwrap();
+            if cur_pos != target_pos {
+                *wire = patch.wire_node(
+                    format!("{name}.move_in{slot}_{t}"),
+                    AxisOp::Move(cur_pos, target_pos),
+                    &[*wire],
+                )?[0];
+                let removed = current.remove(cur_pos);
+                current.insert(target_pos, removed);
+            }
+        }
+    }
+
+    let result = patch.wire_node(name, crate::ops::math::mul(), &wires)?;
+    patch.shunt_outside(model, node.id.into(), result[0])?;
+    Ok(Some(patch))
+}
diff --git a/core/src/ops/einsum/prefix_matmul.rs b/core/src/ops/einsum/prefix_matmul.rs
index e5cc5d9f41..28c87272f8 100644
--- a/core/src/ops/einsum/prefix_matmul.rs
+++ b/core/src/ops/einsum/prefix_matmul.rs
@@ -529,6 +529,62 @@ mod test {
         .check()
     }
 
+    fn check_k1(expr: &str, a_shape: &[usize], b_shape: &[usize]) -> TractResult<()> {
+        let a_len = a_shape.iter().product::<usize>();
+        let b_len = b_shape.iter().product::<usize>();
+        let a =
+            tensor1(&(0..a_len).map(|i| (i + 1) as f32).collect_vec()).into_shape(a_shape).unwrap();
+        let b = tensor1(&(0..b_len).map(|i| (i + 7) as f32 * 0.5).collect_vec())
+            .into_shape(b_shape)
+            .unwrap();
+        EinSumProblem { expr: expr.to_string(), a, b }.check()
+    }
+
+    // K=1 means the einsum's contraction degenerates to broadcast-mul. detect_rule
+    // short-circuits to a Mul op rather than dispatching the GEMM kernel for a single
+    // FMA per tile. The gmk,Ngnk->Ngmn pattern with k=1 is what depthwise ConvTranspose
+    // lowers to, and the per-tile GEMM overhead dominates in that case.
+    #[test]
+    fn k1_amk_akn_amn() -> TractResult<()> {
+        check_k1("amk,akn->amn", &[2, 3, 1], &[2, 1, 4])
+    }
+
+    #[test]
+    fn k1_gmk_ngnk_ngmn() -> TractResult<()> {
+        check_k1("gmk,Ngnk->Ngmn", &[3, 2, 1], &[1, 3, 4, 1])
+    }
+
+    #[test]
+    fn k1_mk_kn_mn() -> TractResult<()> {
+        check_k1("mk,kn->mn", &[2, 1], &[1, 3])
+    }
+
+    // The unit_k_to_broadcast_mul declutter rule is scoped to einsums whose output is
+    // consumed by DeconvSum (the original ConvTranspose-K=1 target). For these isolated
+    // einsum tests the rule won't fire — outputs still go through OptMatMul correctly,
+    // just slower than a broadcast Mul would be. The end-to-end ConvTranspose case is
+    // covered by integration tests (DFN3 erb_dec, GTCRN) and verified bit-exact in the
+    // PR description.
+    #[test]
+    fn k1_no_k_axis_outer() -> TractResult<()> {
+        check_k1("m,n->mn", &[3], &[4])
+    }
+
+    #[test]
+    fn k1_high_rank_no_decov_sum() -> TractResult<()> {
+        // From a CI proptest failure: a high-rank einsum with K=1, no DeconvSum
+        // downstream → rule must not fire, OptMatMul handles it correctly.
+        check_k1("wmexk,wxnk->ewnxm", &[2, 1, 2, 2, 1], &[2, 2, 1, 1])
+    }
+
+    #[test]
+    fn k1_sdpa_shape_no_deconv_sum() -> TractResult<()> {
+        // Regression for SDPA failure mode: when head_dim=1, SDPA's score einsum
+        // looks like a K=1 case structurally, but it's followed by Softmax (not
+        // DeconvSum). Rule must NOT fire — Metal's SDPA fusion would otherwise break.
+        check_k1("bhmk,bhnk->bhmn", &[1, 3, 4, 1], &[1, 3, 4, 1])
+    }
+
     #[test]
     fn q() -> TractResult<()> {
         let qp = QParams::ZpScale { zero_point: 0, scale: 0.1 };
diff --git a/core/src/ops/element_wise.rs b/core/src/ops/element_wise.rs
index 13e188217e..b66217cc77 100644
--- a/core/src/ops/element_wise.rs
+++ b/core/src/ops/element_wise.rs
@@ -1,4 +1,5 @@
 use crate::internal::*;
+use crate::ops::array::MultiBroadcastTo;
 use downcast_rs::Downcast;
 use dyn_eq::DynEq;
 use std::fmt;
@@ -163,8 +164,14 @@ impl TypedOp for ElementWiseOp {
         model: &TypedModel,
         node: &TypedNode,
     ) -> TractResult<Option<TypedModelPatch>> {
-        if let Some(prec) = model.single_prec(node.id)?
-            && (prec.op_is::<AxisOp>() || prec.op_is::<IntoShape>())
+        // linear_prec (fan-in=1, fan-out=1) rather than single_prec: swapping
+        // through a fan-out predecessor clones it, and the clone can break
+        // downstream pattern detectors (e.g. Square+Reduce<Sum>+Mul fusion
+        // into Reduce<MeanOfSquares> feeding RmsNorm detection).
+        if let Some(prec) = model.linear_prec(node.id)?
+            && (prec.op_is::<AxisOp>()
+                || prec.op_is::<IntoShape>()
+                || prec.op_is::<MultiBroadcastTo>())
         {
             let mut patch = TypedModelPatch::default();
             let mut wire = tvec!(patch.tap_model(model, prec.inputs[0])?);
diff --git a/core/src/ops/konst.rs b/core/src/ops/konst.rs
index 165f486521..a2754d50d0 100644
--- a/core/src/ops/konst.rs
+++ b/core/src/ops/konst.rs
@@ -76,18 +76,18 @@ impl TypedOp for Const {
         Ok(tvec!((Cost::Params(self.0.datum_type().unquantized()), self.0.len().into())))
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         _mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
         let op = if self.0.datum_type() == TDim::datum_type() {
             let mut tensor = self.0.clone().into_tensor();
             for d in tensor.try_as_plain_mut()?.as_slice_mut::<TDim>()? {
-                *d = d.eval(values);
+                *d = d.substitute_all(subs)?;
             }
             Const(tensor.into_arc_tensor(), self.1.clone())
         } else {
diff --git a/core/src/ops/logic.rs b/core/src/ops/logic.rs
index a78e19a4aa..f67b1c1bf2 100644
--- a/core/src/ops/logic.rs
+++ b/core/src/ops/logic.rs
@@ -238,7 +238,7 @@ impl TypedOp for Iff {
         // which injects a concrete Const(0/1) that this rule then folds.
         let cond_fact = model.outlet_fact(node.inputs[0])?;
         rule_if_some!(uniform = &cond_fact.uniform);
-        let Ok(cond_val) = uniform.cast_to_scalar::<bool>() else { return Ok(None) };
+        rule_if_let!(Ok(cond_val) = uniform.cast_to_scalar::<bool>());
         let branch = if cond_val { node.inputs[1] } else { node.inputs[2] };
         let mut patch = TypedModelPatch::default();
         let wire = patch.tap_model(model, branch)?;
diff --git a/core/src/ops/logic/comparison.rs b/core/src/ops/logic/comparison.rs
index 43bb4dfcdd..b4e56de291 100644
--- a/core/src/ops/logic/comparison.rs
+++ b/core/src/ops/logic/comparison.rs
@@ -27,9 +27,7 @@ fn eval_tdim_symbolic(
     inputs: &TVec<TValue>,
     prove: impl Fn(&TDim, &TDim) -> TractResult<bool>,
 ) -> TractResult<Option<TVec<TValue>>> {
-    if inputs[0].datum_type() != TDim::datum_type() {
-        return Ok(None);
-    }
+    rule_if!(inputs[0].datum_type() == TDim::datum_type());
     let mut a = inputs[0].clone().into_tensor();
     let mut b = inputs[1].clone().into_tensor();
     for a in a.try_as_plain_mut()?.as_slice_mut::<TDim>()? {
diff --git a/core/src/ops/math/mod.rs b/core/src/ops/math/mod.rs
index b451ed4e91..181f323b6e 100644
--- a/core/src/ops/math/mod.rs
+++ b/core/src/ops/math/mod.rs
@@ -96,6 +96,7 @@ bin_to_super_type!(mul, Mul,
                     },
                    linalg: Mul,
                    neutral_element: 1,
+                   absorbing_element: 0,
                    out_of_place: |c:&mut Tensor, a:&Tensor, b: &Tensor| -> TractResult<bool> {
                        if c.datum_type() == TDim::datum_type() &&
                            a.datum_type() == TDim::datum_type() && b.datum_type() == TDim::datum_type() {
diff --git a/core/src/ops/matmul/optimized.rs b/core/src/ops/matmul/optimized.rs
index 6c2378abda..301b0aa24a 100644
--- a/core/src/ops/matmul/optimized.rs
+++ b/core/src/ops/matmul/optimized.rs
@@ -499,9 +499,7 @@ impl TypedOp for OptMatMul {
                 || self.mmm.iter().all(|m| m.stores().contains(&cast_to)))
             && let Some(ProtoFusedSpec::Store(stores)) = self.micro_ops.last()
         {
-            if stores.iter().any(|s| matches!(s, OutputStoreSpec::Strides { .. })) {
-                return Ok(None);
-            }
+            rule_if!(stores.iter().all(|s| !matches!(s, OutputStoreSpec::Strides { .. })));
             let c_fact = cast_to.fact(self.c_fact.shape.clone());
             let mut patch =
                 TypedModelPatch::fuse_with_next(model, node, Self { c_fact, ..self.clone() })?;
diff --git a/core/src/ops/matmul/pack.rs b/core/src/ops/matmul/pack.rs
index 41ecb42c81..934abfee86 100644
--- a/core/src/ops/matmul/pack.rs
+++ b/core/src/ops/matmul/pack.rs
@@ -1,17 +1,43 @@
 use crate::axes::Axis;
 use crate::internal::*;
 use ndarray::*;
+use tract_linalg::WeightType;
 use tract_linalg::block_quant::{
     BlockQuantStorage, PackedBlockQuantFact, PackedBlockQuantFormat, block_quant_slice,
 };
-use tract_linalg::mmm::{MMMInputValue, PackedMatrixStorage};
-use tract_linalg::pack::PackedFormat;
+use tract_linalg::mmm::{MMMInputFormat, MMMInputValue, PackedMatrixStorage};
+use tract_linalg::pack::{PackedFormat, PackedI8K4};
+#[cfg(target_arch = "x86_64")]
+use tract_linalg::x86_64_fma::amx::PackedAmxA;
 
 use super::ModePicker;
 
+// Pack one (possibly strided) view with a dynamic packing format. Keeps the
+// PackedFormat fast path byte-identical; routes the K=4-inner SMOPA packer
+// (PackedI8K4) and the AMX A-side packer (PackedAmxA) through their view
+// packers. Other formats are unsupported here.
+fn pack_view_with(
+    packer: &dyn MMMInputFormat,
+    t: &TensorView,
+    k_axis: usize,
+    mn_axis: usize,
+) -> TractResult<Box<dyn MMMInputValue>> {
+    if let Some(pf) = packer.downcast_ref::<PackedFormat>() {
+        return pf.pack_tensor_view(t, k_axis, mn_axis);
+    }
+    if let Some(p4) = packer.downcast_ref::<PackedI8K4>() {
+        return p4.pack_view(t, k_axis, mn_axis);
+    }
+    #[cfg(target_arch = "x86_64")]
+    if let Some(pa) = packer.downcast_ref::<PackedAmxA>() {
+        return pa.pack_view(t, k_axis, mn_axis);
+    }
+    bail!("OptMatMulPack does not support packing format {packer:?}")
+}
+
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct OptMatMulPack {
-    pub(crate) packers: Vec<PackedFormat>,
+    pub(crate) packers: Vec<Box<dyn MMMInputFormat>>,
     pub(crate) mode_picker: ModePicker,
     pub(crate) k_axis: usize,
     pub(crate) mn_axis: usize,
@@ -88,7 +114,7 @@ impl OptMatMulPack {
             let packer = &self.packers[mode];
             let output_shape: TVec<usize> = self.output_shape(input.shape());
             let stores = if output_shape.iter().all(|d| *d == 1) {
-                let packed = packer.pack_tensor_view(&input.view(), self.k_axis, self.mn_axis)?;
+                let packed = pack_view_with(&**packer, &input.view(), self.k_axis, self.mn_axis)?;
                 PackedMatrixStorage::new_batched(&output_shape, vec![packed])
                     .into_tensor(input.datum_type())
             } else {
@@ -106,7 +132,8 @@ impl OptMatMulPack {
                         .map(|(x, s)| *x as isize * s)
                         .sum::<isize>()
                         * input.datum_type().size_of() as isize;
-                    values.push(packer.pack_tensor_view(
+                    values.push(pack_view_with(
+                        &**packer,
                         &TensorView::from_bytes(&input, offset, input.shape(), input.strides()),
                         self.k_axis,
                         self.mn_axis,
@@ -131,12 +158,17 @@ impl OptMatMulPack {
 pub struct DynPackedExoticFact {
     pub k: TDim,
     pub mn: TDim,
-    pub packers: Vec<PackedFormat>,
+    pub packers: Vec<Box<dyn MMMInputFormat>>,
 }
 
 impl ExoticFact for DynPackedExoticFact {
     fn buffer_sizes(&self) -> TVec<TDim> {
-        tvec!(self.k.clone() * &self.mn * self.packers[0].dt.size_of())
+        let elem_bytes = match self.packers[0].precursor() {
+            WeightType::Plain(dt) => dt.size_of(),
+            // OptMatMulPack only ever carries plain (PackedFormat / PackedI8K4) packers.
+            WeightType::BlockQuant(_) => 1,
+        };
+        tvec!(self.k.clone() * &self.mn * elem_bytes)
     }
 }
 
diff --git a/core/src/ops/mod.rs b/core/src/ops/mod.rs
index 4ca32b6f62..b69eefc5c8 100644
--- a/core/src/ops/mod.rs
+++ b/core/src/ops/mod.rs
@@ -291,15 +291,17 @@ pub trait TypedOp:
         Ok(None)
     }
 
-    /// Transform the op into by providing a value to one or more symbols.
+    /// Transform the op by substituting one or more symbols with TDim
+    /// expressions (a concrete integer is `TDim::Val(v)`; an expression
+    /// can be any other TDim, including symbolic ones).
     #[allow(unused_variables)]
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
         let inputs = node.inputs.iter().map(|i| mapping[i]).collect::<TVec<_>>();
         target.wire_node(&node.name, node.op.clone(), &inputs)
diff --git a/core/src/ops/nn/gelu_approximate.rs b/core/src/ops/nn/gelu_approximate.rs
index cbc36d62e2..81cc8997d5 100644
--- a/core/src/ops/nn/gelu_approximate.rs
+++ b/core/src/ops/nn/gelu_approximate.rs
@@ -19,11 +19,16 @@ element_wise!(gelu_approximate, GeluApproximate { fast_impl: bool },
         Ok(())
     },
     [f32] => |op, xs| {
-        let pow = if op.fast_impl { 2 } else { 3 };
-        xs.iter_mut().for_each(|x| {
-            *x = gelu_approx_f32(*x, pow);
-        });
-        Ok(())
+        if op.fast_impl {
+            // pow=2 fast path: no linalg kernel yet, scalar fallback.
+            xs.iter_mut().for_each(|x| {
+                *x = gelu_approx_f32(*x, 2);
+            });
+            Ok(())
+        } else {
+            // pow=3 canonical path: linalg NEON kernel composes with tanh.
+            (tract_linalg::ops().gelu_f32)().run(xs)
+        }
     };
     cost: |dt| {tvec!((Cost::FMA(dt), 15))}
 );
diff --git a/core/src/ops/nn/mod.rs b/core/src/ops/nn/mod.rs
index 08223c43fe..54b16814d2 100644
--- a/core/src/ops/nn/mod.rs
+++ b/core/src/ops/nn/mod.rs
@@ -26,7 +26,7 @@ element_wise!(sigmoid, Sigmoid,
 
 element_wise!(hard_swish, HardSwish,
 [f16] => |_, xs| { xs.iter_mut().for_each(|x| *x = *x * f16::from_f32(0.0).max(f16::from_f32(1.0).min(f16::from_f32(1. / 6.) * *x + f16::from_f32(0.5)))); Ok(()) },
-[f32] => |_, xs| { xs.iter_mut().for_each(|x| *x = *x * 0f32.max(1f32.min((1. / 6.) * *x + 0.5))); Ok(()) }
+[f32] => |_, xs| { (tract_linalg::ops().hardswish_f32)().run(xs) }
                                          );
 
 element_wise!(leaky_relu, LeakyRelu { alpha: f32 },
diff --git a/core/src/ops/nn/rms_norm.rs b/core/src/ops/nn/rms_norm.rs
index 65a85b7645..b224a894c3 100644
--- a/core/src/ops/nn/rms_norm.rs
+++ b/core/src/ops/nn/rms_norm.rs
@@ -30,8 +30,15 @@ impl EvalOp for RmsNorm {
         let input = args_1!(inputs);
 
         let input_f32 = input.cast_to::<f32>()?.into_owned();
+        // eps inherits the input dtype from the declutter pattern (F16 when the
+        // surrounding LayerNorm chain is F16). The MeanOfSquares + Add + Rsqrt
+        // + Mul chain below all runs at F32, so eps must be cast to match —
+        // otherwise the Add::eval call below panics with
+        //   "tensor is F32, accessed as F16"
+        // when input is F16.
+        let eps = self.eps.cast_to::<f32>()?.into_owned();
         let a1 = Reducer::MeanOfSquares.reduce(&[self.axis], &input_f32)?;
-        let mut a2 = Add.eval(a1.into_tvalue(), self.eps.clone().into_tvalue(), DatumType::F32)?;
+        let mut a2 = Add.eval(a1.into_tvalue(), eps.into_tvalue(), DatumType::F32)?;
         Rsqrt {}.eval_in_place(&mut a2, None)?;
         let a3 = Mul.eval(a2.into_tvalue(), input_f32.into_tvalue(), DatumType::F32)?;
         Ok(tvec![a3.cast_to_dt(input.datum_type())?.into_owned().into()])
@@ -101,9 +108,7 @@ impl TypedOp for RmsNorm {
         _start: &TDim,
         _end: &TDim,
     ) -> TractResult<Option<TVec<OutletId>>> {
-        if output_axis == self.axis {
-            return Ok(None);
-        }
+        rule_if!(output_axis != self.axis);
         patch.wire_node(&node.name, self.clone(), inputs).map(Some)
     }
 
@@ -170,3 +175,34 @@ pub fn detect_rms_norm(
     patch.shunt_outside(model, mul_succ.id.into(), out[0])?;
     Ok(Some(patch))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ops::nn::RmsNorm;
+
+    /// Regression: the declutter pattern (`detect_rms_norm`) stores `eps` with
+    /// the input dtype (F16 when the surrounding LayerNorm chain is F16) — see
+    /// `rule_if!(eps.datum_type() == dt)` above. The eval path runs at F32, so
+    /// it must cast `self.eps` to F32 before using it. Without the cast in
+    /// `RmsNorm::eval`, this test panics with "tensor is F32, accessed as F16".
+    #[test]
+    fn eval_with_f16_eps_and_f16_input() {
+        let to_h = |x: f32| f16::from_f32(x);
+        let input = tensor1(&[to_h(1.0), to_h(2.0), to_h(3.0), to_h(4.0)]);
+        let eps = tensor0(to_h(1e-5)).into_arc_tensor();
+        let op = RmsNorm { axis: 0, eps };
+        let out = op.eval(tvec!(input.clone().into())).expect("eval should not panic");
+        let out = out.into_iter().next().unwrap().into_tensor();
+        assert_eq!(out.datum_type(), DatumType::F16);
+        assert_eq!(out.shape(), &[4]);
+        // Reference: rms = sqrt((1+4+9+16)/4 + eps) = sqrt(7.5 + 1e-5) ≈ 2.7386
+        // normalised: [1, 2, 3, 4] / 2.7386 ≈ [0.365, 0.730, 1.095, 1.461]
+        let got = unsafe { out.as_slice_unchecked::<f16>() };
+        let expected = [0.365_f32, 0.730, 1.095, 1.461];
+        for (i, (g, e)) in got.iter().zip(expected.iter()).enumerate() {
+            let diff = (g.to_f32() - e).abs();
+            assert!(diff < 0.01, "lane {i}: got {} expected {}", g.to_f32(), e);
+        }
+    }
+}
diff --git a/core/src/ops/nn/silu.rs b/core/src/ops/nn/silu.rs
index 4712424b84..2ff53123f5 100644
--- a/core/src/ops/nn/silu.rs
+++ b/core/src/ops/nn/silu.rs
@@ -13,12 +13,7 @@ element_wise!(silu, Silu,
         });
         Ok(())
     },
-    [f32] => |_, xs| {
-        let mut sigmoid = xs.to_vec();
-        (tract_linalg::ops().sigmoid_f32)().run(&mut sigmoid)?;
-        xs.iter_mut().zip(sigmoid).for_each(|(x, s)| *x *= s);
-        Ok(())
-    };
+    [f32] => |_, xs| { (tract_linalg::ops().silu_f32)().run(xs) };
     cost: |dt| {tvec!((Cost::FMA(dt), 12), (Cost::Div(dt), 1))};
     declutter: detect_silu
 );
diff --git a/core/src/ops/scan/decluttered.rs b/core/src/ops/scan/decluttered.rs
index 22116c67de..9d638b4cd3 100644
--- a/core/src/ops/scan/decluttered.rs
+++ b/core/src/ops/scan/decluttered.rs
@@ -14,6 +14,17 @@ use super::*;
 pub struct Scan {
     pub skip: usize,
     pub reset_every_turn: bool,
+    /// True iff the caller manages State inputs externally — they supply a
+    /// fresh value every run (typically reading the Scan's last_value_slot
+    /// output and feeding it back into the next call's State input). This
+    /// is set explicitly at construction time, e.g. by the ONNX LSTM/GRU/
+    /// RNN importer when both `initial_h` and `Y_h` are exposed (parakeet
+    /// decoder). When true, declutter_single_loop can safely inline a
+    /// stateful single-iter Scan because the caller's per-call value
+    /// reaches the body directly. When false (default), tract carries
+    /// State across calls internally and inlining would break recurrence
+    /// (issue #2157).
+    pub external_state: bool,
     pub body: TypedModel,
     pub decluttered: bool,
     pub input_mapping: Vec<InputMapping>,
@@ -56,6 +67,7 @@ impl Scan {
         Ok(Scan {
             skip,
             reset_every_turn: false,
+            external_state: false,
             body,
             decluttered: false,
             input_mapping,
@@ -91,9 +103,25 @@ impl Scan {
         let inputs = model.node_input_facts(node.id)?;
         let iters =
             super::iteration_count(&self.input_mapping, &inputs).context("No scan input")?;
-        if !iters.is_one() {
-            return Ok(None);
-        }
+        rule_if!(iters.is_one());
+        // Inlining wires the body's State input directly from the outer
+        // initial-state input on every call. At runtime (optimized.rs
+        // OpState::eval), the State input is only seeded from inputs[slot]
+        // on the first call or when reset_every_turn is set; otherwise the
+        // body input is fed from the carried hidden_state.
+        //
+        // Inlining is therefore safe iff the caller does not rely on
+        // tract's across-call carry, which we encode as one of:
+        //   - reset_every_turn (carry is cleared every turn anyway), or
+        //   - external_state (caller plumbs the State input every call,
+        //     e.g. parakeet decoder).
+        // Otherwise (internal-managed state, e.g. DFN3 GRU under pulse),
+        // bail. See issue #2157.
+        rule_if!(
+            self.reset_every_turn
+                || self.external_state
+                || !self.input_mapping.iter().any(InputMapping::is_state)
+        );
         let mut patch = TypedModelPatch::new("Inline single loop scan");
         patch.model = self.body.clone();
         for (outer_wire, inner_wire) in izip!(&node.inputs, &self.body.inputs) {
@@ -866,22 +894,22 @@ impl TypedOp for Scan {
         Ok(None)
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
         let inputs = node.inputs.iter().map(|o| mapping[o]).collect::<TVec<_>>();
         let op = Self {
             output_mapping: self
                 .output_mapping
                 .iter()
-                .map(|om| om.concretize_dims(values))
+                .map(|om| om.substitute_symbols(subs))
                 .collect::<TractResult<Vec<_>>>()?,
-            body: self.body.concretize_dims(values)?,
+            body: self.body.substitute_symbols(subs)?,
             ..self.clone()
         };
         target.wire_node(&node.name, op, &inputs)
diff --git a/core/src/ops/scan/mod.rs b/core/src/ops/scan/mod.rs
index a0e5446eff..74530f8ec8 100644
--- a/core/src/ops/scan/mod.rs
+++ b/core/src/ops/scan/mod.rs
@@ -52,9 +52,16 @@ impl<F: Clone> OutputMapping<F> {
 }
 
 impl<F: Clone + DimLike> OutputMapping<F> {
-    pub fn concretize_dims(&self, values: &SymbolValues) -> TractResult<OutputMapping<F>> {
+    pub fn substitute_symbols(
+        &self,
+        subs: &std::collections::HashMap<Symbol, F>,
+    ) -> TractResult<OutputMapping<F>> {
         Ok(Self {
-            full_dim_hint: self.full_dim_hint.as_ref().map(|h| h.eval(values)),
+            full_dim_hint: self
+                .full_dim_hint
+                .as_ref()
+                .map(|h| h.substitute_all(subs))
+                .transpose()?,
             ..self.clone()
         })
     }
diff --git a/core/src/ops/source.rs b/core/src/ops/source.rs
index c06cee617a..4b4c32244d 100644
--- a/core/src/ops/source.rs
+++ b/core/src/ops/source.rs
@@ -66,15 +66,16 @@ impl TypedOp for TypedSource {
         )))
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         _mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
-        let shape: TVec<_> = self.fact.shape.iter().map(|d| d.eval(values)).collect();
+        let shape: TVec<_> =
+            self.fact.shape.iter().map(|d| d.substitute_all(subs)).collect::<TractResult<_>>()?;
         target.wire_node(&node.name, Self { fact: self.fact.datum_type.fact(&*shape) }, &[])
     }
 
diff --git a/core/src/optim/mod.rs b/core/src/optim/mod.rs
index 88b23298cc..44db70c1f3 100644
--- a/core/src/optim/mod.rs
+++ b/core/src/optim/mod.rs
@@ -8,6 +8,7 @@ mod concat_then_einsum;
 mod op_optim;
 mod prop_const;
 pub mod propagate_roi;
+pub mod propagate_uniform_tdim;
 mod push_split_down;
 mod slice;
 mod uniform_mask;
@@ -15,6 +16,7 @@ mod uniform_mask;
 use self::change_axes::ChangeAxes;
 use self::prop_const::PropConst;
 use self::propagate_roi::PropagateRoi;
+use self::propagate_uniform_tdim::PropagateUniformTdim;
 use self::push_split_down::PushSplitDown;
 use self::slice::PushSliceUp;
 use self::uniform_mask::FoldUniformMask;
@@ -33,6 +35,27 @@ pub trait TypedPass: Debug + Send + Sync + dyn_clone::DynClone {
     }
 }
 
+#[derive(Clone, Debug, Default)]
+struct MergeConsecutiveSameRoleAxes;
+
+impl TypedPass for MergeConsecutiveSameRoleAxes {
+    fn reset(&mut self) -> TractResult<()> {
+        Ok(())
+    }
+    fn next(
+        &mut self,
+        _session: &mut OptimizerSession,
+        _model: &TypedModel,
+    ) -> TractResult<Option<TypedModelPatch>> {
+        Ok(None)
+    }
+    fn run_direct(&mut self, model: &mut TypedModel) -> TractResult<bool> {
+        let before = model.nodes.len();
+        crate::ops::einsum::einsum_matmul::merge_consecutive_same_role_axes(model)?;
+        Ok(model.nodes.len() != before)
+    }
+}
+
 dyn_clone::clone_trait_object!(TypedPass);
 
 #[derive(Debug)]
@@ -69,6 +92,7 @@ impl Optimizer {
     pub fn declutter() -> Optimizer {
         Optimizer::passes(vec![
             Box::<PropConst>::default(),
+            Box::<PropagateUniformTdim>::default(),
             Box::<PropagateRoi>::default(),
             Box::<FoldUniformMask>::default(),
             Box::new(OpOptim("declutter", TypedOp::declutter_with_session, 0)),
@@ -82,6 +106,7 @@ impl Optimizer {
     pub fn codegen() -> Optimizer {
         Optimizer::passes(vec![
             Box::<PropConst>::default(),
+            Box::<MergeConsecutiveSameRoleAxes>::default(),
             Box::new(OpOptim(
                 "codegen",
                 |op, _session, model, node| TypedOp::codegen(op, model, node),
diff --git a/core/src/optim/propagate_roi.rs b/core/src/optim/propagate_roi.rs
index a7f8e62c62..df0c00ca9a 100644
--- a/core/src/optim/propagate_roi.rs
+++ b/core/src/optim/propagate_roi.rs
@@ -34,7 +34,7 @@ fn roi_union(a: &TDim, b: &TDim) -> TDim {
 /// broadcast from dim=1, or absent), returns None for that input.
 pub fn bubble_roi(model: &TypedModel, node: &TypedNode) -> TractResult<Option<TVec<Option<TDim>>>> {
     let output_fact = model.outlet_fact(OutletId::new(node.id, 0))?;
-    let Some(roi) = &output_fact.region_of_interest else { return Ok(None) };
+    rule_if_some!(roi = &output_fact.region_of_interest);
 
     let input_facts: TVec<&TypedFact> =
         node.inputs.iter().map(|i| model.outlet_fact(*i)).collect::<TractResult<_>>()?;
@@ -71,6 +71,195 @@ pub fn bubble_roi(model: &TypedModel, node: &TypedNode) -> TractResult<Option<TV
     Ok(Some(result))
 }
 
+/// Recognise a chunked-band predicate on output coords `(p, k_axis)` of the
+/// shape produced by `DiagGather::input_roi`'s `c → r + q − offset`
+/// substitution applied to a `Mul(Ge(L, q/k − c/k), Ge(q/k − c/k, 0))` band,
+/// and return the projected band on `k_axis` after existentially
+/// quantifying `p` over its dim bound.
+///
+/// Specifically, recognises:
+///
+///   `Mul(Ge(L_val, A), Ge(A, 0))`
+///
+/// where `A = ⌊p/k⌋ − ⌊(p + k_axis − offset)/k⌋` with `p` the projected
+/// coord symbol (e.g. query) and `k_axis` the kept coord symbol (e.g.
+/// rel-pos index).  Closed-form projection: as `p` varies, `A` takes
+/// values in `{−⌈(k_axis − offset)/k⌉, −⌊(k_axis − offset)/k⌋}`, so the
+/// existential `0 ≤ A ≤ L_val` is satisfiable iff
+///
+///   `k_axis ∈ [offset − (L_val + 1)·k + 1, offset + (k − 1)]`
+///
+/// — a constant band of width `(L_val + 2)·k − 1`.
+///
+/// Returns `None` if the pattern doesn't match.
+pub fn recognise_chunked_band_project(roi: &TDim, p_sym: &Symbol, k_sym: &Symbol) -> Option<TDim> {
+    // Match Mul(Ge(L, A), Ge(A, R)).
+    let TDim::Mul(terms) = roi else { return None };
+    if terms.len() != 2 {
+        return None;
+    }
+    let TDim::Ge(top_l, top_r) = &terms[0] else { return None };
+    let TDim::Ge(bot_l, bot_r) = &terms[1] else { return None };
+
+    // Identify which orientation: top = Ge(L, A) and bot = Ge(A, R)?
+    // We need the same `A` to appear as second arg of first and first arg
+    // of second.
+    let (l_val, a, r_val) = if top_r.as_ref() == bot_l.as_ref() {
+        (top_l.as_ref(), top_r.as_ref(), bot_r.as_ref())
+    } else if top_l.as_ref() == bot_r.as_ref() {
+        // Reverse: top is Ge(A, L'), bot is Ge(R', A) — swap roles.
+        (bot_l.as_ref(), top_l.as_ref(), top_r.as_ref())
+    } else {
+        return None;
+    };
+
+    // R side must be 0 (the band is 0 ≤ X ≤ L).
+    if r_val != &TDim::Val(0) {
+        return None;
+    }
+    let big_l = l_val.to_i64().ok()?;
+    if big_l < 0 {
+        return None;
+    }
+
+    // `A` may have a constant offset c factored out by the simplifier (e.g.
+    // when the original offset isn't a multiple of k, the simplifier
+    // rewrites `(p+r-9)/k` as `(p+r+5)/k - 1` for k=14).  Peel c off so
+    // we can match the inner diff-of-divs, then re-fold c·k into the
+    // recovered offset.
+    let (a_no_const, c) = split_const(a);
+    let (k, p_num, q_num) = match_diff_of_divs(&a_no_const)?;
+    let derived_inner_offset = (p_num + TDim::Sym(k_sym.clone()) - q_num).reduce();
+    if derived_inner_offset.symbols().contains(p_sym)
+        || derived_inner_offset.symbols().contains(k_sym)
+    {
+        return None;
+    }
+    let actual_offset = (derived_inner_offset + TDim::Val(c * k as i64)).reduce();
+
+    // The projected band on k_sym: [offset − (L+1)·k + 1, offset + (k − 1)].
+    let high = (actual_offset.clone() + TDim::Val(k as i64 - 1)).reduce();
+    let low = (actual_offset - TDim::Val((big_l + 1) * k as i64 - 1)).reduce();
+    Some(
+        TDim::Mul(vec![
+            TDim::Ge(Box::new(high), Box::new(TDim::Sym(k_sym.clone()))),
+            TDim::Ge(Box::new(TDim::Sym(k_sym.clone())), Box::new(low)),
+        ])
+        .reduce(),
+    )
+}
+
+/// Split `expr` into `(expr_without_constant, constant_part)`.  If `expr`
+/// is `Add([...constants..., ...non-constants...])`, sum up the constant
+/// terms and return the non-constant remainder.  Otherwise returns
+/// `(expr, 0)`.
+fn split_const(expr: &TDim) -> (TDim, i64) {
+    if let TDim::Add(terms) = expr {
+        let mut c = 0i64;
+        let mut rest: Vec<TDim> = vec![];
+        for t in terms {
+            match t {
+                TDim::Val(v) => c += *v,
+                _ => rest.push(t.clone()),
+            }
+        }
+        let new_expr = if rest.is_empty() {
+            TDim::Val(0)
+        } else if rest.len() == 1 {
+            rest.into_iter().next().unwrap()
+        } else {
+            TDim::Add(rest)
+        };
+        return (new_expr, c);
+    }
+    (expr.clone(), 0)
+}
+
+/// If `expr` matches `Div(p_expr, k) − Div(q_expr, k)` (in either order),
+/// returns `(k, p_expr, q_expr)` where `p_expr` is the numerator with the
+/// positive coefficient.
+fn match_diff_of_divs(expr: &TDim) -> Option<(u64, TDim, TDim)> {
+    let TDim::Add(terms) = expr else { return None };
+    if terms.len() != 2 {
+        return None;
+    }
+    let mut pos_div: Option<(TDim, u64)> = None;
+    let mut neg_div: Option<(TDim, u64)> = None;
+    for t in terms {
+        match t {
+            TDim::Div(inner, k) => {
+                pos_div = Some(((**inner).clone(), *k));
+            }
+            TDim::MulInt(-1, inner) => {
+                if let TDim::Div(num, k) = inner.as_ref() {
+                    neg_div = Some(((**num).clone(), *k));
+                }
+            }
+            _ => {}
+        }
+    }
+    let (p_expr, k1) = pos_div?;
+    let (q_expr, k2) = neg_div?;
+    if k1 != k2 {
+        return None;
+    }
+    Some((k1, p_expr, q_expr))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Closed-form recognition: chunked-band predicate after DG substitution
+    /// `c → r + q − offset` should project `q` out and yield a constant band
+    /// on `r` of width `(L+2)·k − 1`, centred around `offset`.
+    #[test]
+    fn recognise_chunked_band_yields_constant_band() {
+        let scope = SymbolScope::default();
+        let p = scope.coord_sym(0); // q (projected)
+        let k_ax = scope.coord_sym(1); // r (kept)
+        let offset = 9i64;
+        let k: u64 = 14;
+        let big_l = 5i64;
+
+        // A = p/k − (p + k_ax − offset)/k
+        let num1 = TDim::Sym(p.clone());
+        let num2 = TDim::Sym(p.clone()) + TDim::Sym(k_ax.clone()) - TDim::Val(offset);
+        let a = (TDim::Div(Box::new(num1), k) - TDim::Div(Box::new(num2), k)).reduce();
+        let band = TDim::Mul(vec![
+            TDim::Ge(Box::new(TDim::Val(big_l)), Box::new(a.clone())),
+            TDim::Ge(Box::new(a), Box::new(TDim::Val(0))),
+        ])
+        .reduce();
+        eprintln!("input band: {band}");
+
+        let projected =
+            recognise_chunked_band_project(&band, &p, &k_ax).expect("recogniser should match");
+        eprintln!("projected: {projected}");
+
+        // Expected: r ∈ [offset − (L+1)·k + 1, offset + (k − 1)]
+        //         = [9 − 84 + 1, 9 + 13] = [-74, 22] (width 97)
+        let high_expected = offset + k as i64 - 1; // 22
+        let low_expected = offset - (big_l + 1) * k as i64 + 1; // -74
+        let TDim::Mul(terms) = &projected else { panic!("expected Mul") };
+        assert_eq!(terms.len(), 2);
+        // Position-independent: one Ge term is `Ge(high, r)` (= r ≤ high),
+        // the other is `Ge(r, low)` (= r ≥ low).
+        let mut saw_high = false;
+        let mut saw_low = false;
+        for t in terms {
+            let TDim::Ge(l, r) = t else { panic!("expected Ge inside Mul") };
+            if **l == TDim::Val(high_expected) && **r == TDim::Sym(k_ax.clone()) {
+                saw_high = true;
+            } else if **l == TDim::Sym(k_ax.clone()) && **r == TDim::Val(low_expected) {
+                saw_low = true;
+            }
+        }
+        assert!(saw_high, "missing Ge(high={high_expected}, r); got: {projected}");
+        assert!(saw_low, "missing Ge(r, low={low_expected}); got: {projected}");
+    }
+}
+
 impl super::TypedPass for PropagateRoi {
     fn reset(&mut self) -> TractResult<()> {
         Ok(())
@@ -86,44 +275,56 @@ impl super::TypedPass for PropagateRoi {
 
     fn run_direct(&mut self, model: &mut TypedModel) -> TractResult<bool> {
         let order = model.eval_order()?;
-        let mut changed = false;
-
-        // Collect ROI demands from all nodes.
-        let mut demands: HashMap<OutletId, Option<TDim>> = HashMap::new();
-
-        for &node_id in &order {
-            let node = &model.nodes()[node_id];
-            let Some(input_rois) = node.op.as_typed().unwrap().input_roi(model, node)? else {
-                continue;
-            };
-            for (ix, roi) in input_rois.into_iter().enumerate() {
-                let outlet = node.inputs[ix];
-                match (demands.get(&outlet), &roi) {
-                    (_, None) => {
-                        demands.insert(outlet, None);
+        let mut any_changed = false;
+
+        loop {
+            let mut changed = false;
+            let mut demands: HashMap<OutletId, Option<TDim>> = HashMap::new();
+
+            for &node_id in &order {
+                let node = &model.nodes()[node_id];
+                let Some(input_rois) = node.op.as_typed().unwrap().input_roi(model, node)? else {
+                    continue;
+                };
+                for (ix, roi) in input_rois.into_iter().enumerate() {
+                    let outlet = node.inputs[ix];
+                    match (demands.get(&outlet), &roi) {
+                        (_, None) => {
+                            demands.insert(outlet, None);
+                        }
+                        (Option::None, Some(roi)) => {
+                            demands.insert(outlet, Some(roi.clone()));
+                        }
+                        (Some(None), Some(_)) => {}
+                        (Some(Some(existing)), Some(new)) => {
+                            demands.insert(outlet, Some(roi_union(existing, new)));
+                        }
                     }
-                    (Option::None, Some(roi)) => {
-                        demands.insert(outlet, Some(roi.clone()));
+                }
+            }
+
+            // Apply demands to model facts.
+            for (outlet, demand) in demands {
+                if let Some(roi) = demand {
+                    let roi = roi.simplify();
+                    // ROI of 1 means "all positions matter" — equivalent to None.
+                    if roi == TDim::Val(1) {
+                        continue;
                     }
-                    (Some(None), Some(_)) => {}
-                    (Some(Some(existing)), Some(new)) => {
-                        demands.insert(outlet, Some(roi_union(existing, new)));
+                    let fact = &mut model.nodes_mut()[outlet.node].outputs[outlet.slot].fact;
+                    if fact.region_of_interest.as_ref() != Some(&roi) {
+                        fact.region_of_interest = Some(roi);
+                        changed = true;
                     }
                 }
             }
-        }
 
-        // Apply demands to model facts.
-        for (outlet, demand) in demands {
-            if let Some(roi) = demand {
-                let fact = &mut model.nodes_mut()[outlet.node].outputs[outlet.slot].fact;
-                if fact.region_of_interest.as_ref() != Some(&roi) {
-                    fact.region_of_interest = Some(roi);
-                    changed = true;
-                }
+            any_changed |= changed;
+            if !changed {
+                break;
             }
         }
 
-        Ok(changed)
+        Ok(any_changed)
     }
 }
diff --git a/core/src/optim/propagate_uniform_tdim.rs b/core/src/optim/propagate_uniform_tdim.rs
new file mode 100644
index 0000000000..28efb0687c
--- /dev/null
+++ b/core/src/optim/propagate_uniform_tdim.rs
@@ -0,0 +1,75 @@
+use crate::internal::*;
+use crate::optim::OptimizerSession;
+
+/// Forward pass that refreshes `TypedFact::uniform_tdim` annotations by
+/// re-running each op's `output_facts` against its current input facts.
+///
+/// The default declutter pipeline computes a node's `uniform_tdim` once at
+/// load time, then reuses the cached fact.  Some declutter rewrites
+/// (notably `Iff` folding when the condition is provably constant) shunt a
+/// node's input edge without re-running the consumer's `output_facts` —
+/// the consumer's cached fact then references the stale upstream fact and
+/// loses any newly-available `uniform_tdim` annotation.  Every wire
+/// downstream of the shunt then sees `uniform_tdim = None`, and passes
+/// like `FoldUniformMask` or Blockify section detection silently miss it.
+///
+/// This pass walks the model in topological order, calls `output_facts`
+/// fresh on each node, and copies the recomputed `uniform_tdim` over the
+/// cached one when it differs.  Other fact fields are untouched (the
+/// existing declutter loop is responsible for them).  Iterates to fixpoint
+/// since a refreshed annotation upstream may unlock more refreshes
+/// downstream.
+#[derive(Clone, Debug, Default)]
+pub struct PropagateUniformTdim;
+
+impl super::TypedPass for PropagateUniformTdim {
+    fn reset(&mut self) -> TractResult<()> {
+        Ok(())
+    }
+
+    fn next(
+        &mut self,
+        _session: &mut OptimizerSession,
+        _model: &TypedModel,
+    ) -> TractResult<Option<TypedModelPatch>> {
+        Ok(None)
+    }
+
+    fn run_direct(&mut self, model: &mut TypedModel) -> TractResult<bool> {
+        let order = model.eval_order()?;
+        let mut any_changed = false;
+        loop {
+            let mut changed = false;
+            for &node_id in &order {
+                let typed_op = match model.nodes()[node_id].op.as_typed() {
+                    Some(op) => op,
+                    None => continue,
+                };
+                let input_facts: TVec<TypedFact> = model.nodes()[node_id]
+                    .inputs
+                    .iter()
+                    .map(|i| model.outlet_fact(*i).cloned())
+                    .collect::<TractResult<_>>()?;
+                let input_refs: TVec<&TypedFact> = input_facts.iter().collect();
+                let new_facts = match typed_op.output_facts(&input_refs) {
+                    Ok(f) => f,
+                    Err(_) => continue,
+                };
+                for (slot, new_fact) in new_facts.iter().enumerate() {
+                    let current_uniform_tdim =
+                        model.nodes()[node_id].outputs[slot].fact.uniform_tdim.clone();
+                    if current_uniform_tdim != new_fact.uniform_tdim {
+                        model.nodes_mut()[node_id].outputs[slot].fact.uniform_tdim =
+                            new_fact.uniform_tdim.clone();
+                        changed = true;
+                    }
+                }
+            }
+            if !changed {
+                break;
+            }
+            any_changed = true;
+        }
+        Ok(any_changed)
+    }
+}
diff --git a/core/src/optim/slice.rs b/core/src/optim/slice.rs
index 3f511de8ab..00204cf9e5 100644
--- a/core/src/optim/slice.rs
+++ b/core/src/optim/slice.rs
@@ -182,6 +182,11 @@ fn should_slice_output(
     rule_if_let!(Ok(mut boundaries) =
         boundaries.iter().map(|x| x.to_usize()).collect::<TractResult<TVec<usize>>>());
     rule_if_let!(Ok(end) = node.outputs[0].fact.shape[axis].to_usize());
+    // op_slices_to_slice_op requires boundaries to cover the full
+    // [0..full_len] range. When every slicing successor starts at
+    // some offset > 0, the collected boundaries miss the leading 0;
+    // sort+dedup handles the case where a slicer already starts at 0.
+    boundaries.push(0);
     boundaries.push(end);
     boundaries.sort();
     boundaries.dedup();
@@ -247,3 +252,30 @@ pub fn rewire_sliced_outputs(
     }
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ops::math;
+
+    /// Two slice successors that both start at > 0. None covers the
+    /// prefix [0..2]. Before the fix, `should_slice_output` returned
+    /// boundaries `[2, 4, 5, 8]` (start at 2, not 0), and
+    /// `op_slices_to_slice_op` failed `boundaries[0] == 0.to_dim()`.
+    #[test]
+    fn push_slice_up_multi_succ_no_prefix() -> TractResult<()> {
+        let mut model = TypedModel::default();
+        let a = model.add_source("a", f32::fact([8]))?;
+        let b = model.add_source("b", f32::fact([8]))?;
+        let sum = model.wire_node("sum", math::add(), &[a, b])?[0];
+        let s1 =
+            model.wire_node("s1", Slice { axis: 0, start: 2.to_dim(), end: 4.to_dim() }, &[sum])?
+                [0];
+        let s2 =
+            model.wire_node("s2", Slice { axis: 0, start: 5.to_dim(), end: 8.to_dim() }, &[sum])?
+                [0];
+        model.select_output_outlets(&[s1, s2])?;
+        let _ = model.into_decluttered()?;
+        Ok(())
+    }
+}
diff --git a/core/src/optim/uniform_mask.rs b/core/src/optim/uniform_mask.rs
index 909d39fed3..bb2c4b5c17 100644
--- a/core/src/optim/uniform_mask.rs
+++ b/core/src/optim/uniform_mask.rs
@@ -16,6 +16,19 @@ use crate::optim::OptimizerSession;
 /// The per-op logic is limited to `try_fold_node` (which input indices may carry a bool
 /// `uniform_tdim`). All transformation logic in `try_fold_uniform_bool_input` and
 /// `split_op_two_regions` is op-agnostic.
+///
+/// # Interaction with `ScaledMaskedSoftmax`
+///
+/// SMS is intentionally opaque to this pass.  Its band-mask handling lives in
+/// `ScaledMaskedSoftmax::input_roi` (plants band ROI on the scores input) and
+/// `::declutter` (materialises the band via `materialise_band_roi_on_input`).
+/// Both fire before pulse and require no `FoldUniformMask` participation.
+///
+/// Practical consequence: the relative order of `detect_scaled_masked_softmax`
+/// (in tract-transformers) and this pass doesn't matter for correctness — once
+/// SMS exists, FoldUniformMask just skips it, and the ROI path handles the
+/// band.  If SMS isn't yet detected (plain `Iff + softmax`), FoldUniformMask
+/// still folds the Iff mask region by region as usual.
 #[derive(Clone, Debug, Default)]
 pub struct FoldUniformMask(usize);
 
diff --git a/core/src/runtime.rs b/core/src/runtime.rs
index e9a30e96a8..06ecbca35a 100644
--- a/core/src/runtime.rs
+++ b/core/src/runtime.rs
@@ -216,9 +216,7 @@ pub fn runtimes() -> impl Iterator<Item = &'static dyn Runtime> {
 }
 
 pub fn runtime_for_name(s: &str) -> TractResult<Option<&'static dyn Runtime>> {
-    let Some(rt) = inventory::iter::<InventorizedRuntime>().find(|rt| rt.name() == s) else {
-        return Ok(None);
-    };
+    rule_if_some!(rt = inventory::iter::<InventorizedRuntime>().find(|rt| rt.name() == s));
     rt.check()?;
     Ok(Some(rt.0))
 }
diff --git a/core/src/transform.rs b/core/src/transform.rs
index bc96d6bf6b..b83a592f81 100644
--- a/core/src/transform.rs
+++ b/core/src/transform.rs
@@ -1,8 +1,6 @@
 use std::borrow::Cow;
 
 use crate::internal::*;
-#[cfg(feature = "blas")]
-use crate::ops::einsum::as_blas::AsBlas;
 use crate::ops::matmul::de_block_quant::BlockQuantTransform;
 use std::fmt::Debug;
 
@@ -248,9 +246,18 @@ pub fn get_transform_with_params(
     Ok(None)
 }
 
+/// Per-symbol substitution: either a concrete integer or a TDim
+/// expression string parsed against the model's symbol scope.
+#[derive(Debug, serde::Deserialize)]
+#[serde(untagged)]
+pub enum SymbolValueSpec {
+    Int(i64),
+    Expr(String),
+}
+
 #[derive(Debug, Default, serde::Deserialize)]
 pub struct ConcretizeSymbolsConfig {
-    pub values: std::collections::HashMap<String, i64>,
+    pub values: std::collections::HashMap<String, SymbolValueSpec>,
 }
 
 #[derive(Debug)]
@@ -262,11 +269,19 @@ impl ModelTransform for ConcretizeSymbolsTransform {
     }
 
     fn transform(&self, model: &mut TypedModel) -> TractResult<()> {
-        let mut table = SymbolValues::default();
-        for (k, v) in &self.0.values {
-            table = table.with(&model.symbols.sym(k), *v);
+        let mut subs = std::collections::HashMap::new();
+        for (k, spec) in &self.0.values {
+            let sym = model.symbols.sym(k);
+            let dim = match spec {
+                SymbolValueSpec::Int(v) => TDim::Val(*v),
+                SymbolValueSpec::Expr(s) => model
+                    .symbols
+                    .parse_tdim(s)
+                    .with_context(|| format!("Parsing TDim expression {s:?} for symbol {k}"))?,
+            };
+            subs.insert(sym, dim);
         }
-        *model = model.concretize_dims(&table)?;
+        *model = model.substitute_symbols(&subs)?;
         Ok(())
     }
 }
@@ -275,9 +290,54 @@ register_model_transform!("concretize_symbols", ConcretizeSymbolsConfig, |config
     ConcretizeSymbolsTransform(config)
 )));
 
+/// Ad-hoc fix-up for NNEF artifacts exported before Scan grew the
+/// `external_state` flag (issue #2157). For every Scan in the model:
+/// 1. Substitute the scan-axis symbol on the Scan input with 1 across the
+///    whole model (caller is bound by the per-call seq=1 contract that
+///    external state management implies).
+/// 2. Set `external_state = true`.
+///
+/// After this transform, the standard declutter pipeline sees `iters == 1`
+/// on each Scan and `declutter_single_loop` inlines the body. Apply only
+/// when the loaded model is known to use external state management, e.g.
+/// the parakeet decoder. Cheaper than re-exporting cached NNEF.
+#[derive(Debug)]
+struct ForceScanExternalState;
+
+impl ModelTransform for ForceScanExternalState {
+    fn name(&self) -> StaticName {
+        "force_scan_external_state".into()
+    }
+
+    fn transform(&self, model: &mut TypedModel) -> TractResult<()> {
+        use crate::ops::scan::{InputMapping, Scan};
+        let mut subs: HashMap<Symbol, TDim> = HashMap::new();
+        for node in &model.nodes {
+            let Some(scan) = node.op_as::<Scan>() else { continue };
+            for (slot, mapping) in scan.input_mapping.iter().enumerate() {
+                let InputMapping::Scan(info) = mapping else { continue };
+                let outer = node.inputs[slot];
+                let dim = &model.outlet_fact(outer)?.shape[info.axis];
+                if let TDim::Sym(s) = dim {
+                    subs.insert(s.clone(), TDim::Val(1));
+                }
+            }
+        }
+        if !subs.is_empty() {
+            *model = model.substitute_symbols(&subs)?;
+        }
+        for node in &mut model.nodes {
+            if let Some(scan) = node.op_as_mut::<Scan>() {
+                scan.external_state = true;
+            }
+        }
+        Ok(())
+    }
+}
+
+register_simple_model_transform!("force_scan_external_state", ForceScanExternalState);
+
 register_simple_model_transform!("softmax_fast_compact", SoftmaxFastCompact);
-#[cfg(feature = "blas")]
-register_simple_model_transform!("as_blas", AsBlas);
 register_simple_model_transform!("block_quant", BlockQuantTransform);
 
 #[derive(Debug, serde::Deserialize, Default)]
diff --git a/cuda/Cargo.toml b/cuda/Cargo.toml
index aa37e8488f..95c4884a8a 100644
--- a/cuda/Cargo.toml
+++ b/cuda/Cargo.toml
@@ -41,7 +41,22 @@ proptest.workspace = true
 rand.workspace = true
 
 [features]
+# Pick exactly one cuda-XXXXX to bind cudarc against. Higher minor versions
+# bind more symbols; the resulting binary still runs against any CUDA driver
+# whose API version is >= the chosen one (the runtime check in utils.rs
+# enforces this).
+cuda-12000 = ["cudarc/cuda-12000"]
+cuda-12010 = ["cudarc/cuda-12010"]
+cuda-12020 = ["cudarc/cuda-12020"]
+cuda-12030 = ["cudarc/cuda-12030"]
+cuda-12040 = ["cudarc/cuda-12040"]
+cuda-12050 = ["cudarc/cuda-12050"]
+cuda-12060 = ["cudarc/cuda-12060"]
+cuda-12080 = ["cudarc/cuda-12080"]
+cuda-12090 = ["cudarc/cuda-12090"]
 cuda-13000 = ["cudarc/cuda-13000"]
+cuda-13010 = ["cudarc/cuda-13010"]
+cuda-13020 = ["cudarc/cuda-13020"]
 default = ["cuda-13000"]
 
 [[bench]]
diff --git a/cuda/src/context.rs b/cuda/src/context.rs
index d2e23d1141..c81805b412 100644
--- a/cuda/src/context.rs
+++ b/cuda/src/context.rs
@@ -416,9 +416,7 @@ impl TractCudaStream {
     /// Record a start/end event pair around a kernel launch.
     /// Call this from `TractLaunchArgs::launch()` when profiling is active.
     pub fn record_profile_events(&self) -> TractResult<Option<(CudaEvent, CudaEvent)>> {
-        if !self.is_profiling() {
-            return Ok(None);
-        }
+        rule_if!(self.is_profiling());
         let flags = Some(cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT);
         let start = self.inner.record_event(flags)?;
         Ok(Some((start, self.inner.context().new_event(flags)?)))
diff --git a/cuda/src/kernels/array/diag_gather.rs b/cuda/src/kernels/array/diag_gather.rs
new file mode 100644
index 0000000000..481d8e435c
--- /dev/null
+++ b/cuda/src/kernels/array/diag_gather.rs
@@ -0,0 +1,207 @@
+use crate::context::{TractCudaStream, cuda_context};
+use crate::kernels::launch_args::TractLaunchArgs;
+use crate::kernels::{LibraryName, MAX_THREADS, get_cuda_view};
+use anyhow::ensure;
+use cudarc::driver::{CudaStream, LaunchConfig, PushKernelArg};
+use std::fmt;
+use tract_core::internal::*;
+use tract_gpu::tensor::DeviceTensor;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct DiagGather;
+
+impl fmt::Display for DiagGather {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl DiagGather {
+    pub fn is_supported_dt(dt: DatumType) -> bool {
+        matches!(dt, DatumType::F32 | DatumType::F16)
+    }
+
+    pub fn kernel_name(&self, dt: DatumType) -> TractResult<String> {
+        ensure!(Self::is_supported_dt(dt), "Unsupported dt {:?} for cuda diag_gather op", dt);
+        let tname = DeviceTensor::tname(dt)?;
+        Ok(format!("diag_gather_{tname}"))
+    }
+
+    pub fn eval(
+        &self,
+        stream: &TractCudaStream,
+        input: &DeviceTensor,
+        offset: i64,
+        out_len: usize,
+    ) -> TractResult<DeviceTensor> {
+        let rank = input.rank();
+        ensure!(rank >= 2);
+        let mut out_shape: TVec<usize> = input.shape().into();
+        out_shape[rank - 1] = out_len;
+        let output = unsafe { DeviceTensor::uninitialized_dt(input.datum_type(), &out_shape)? };
+        self.dispatch_eval(stream, input, offset, out_len, &output)?;
+        stream.synchronize()?;
+        Ok(output)
+    }
+
+    pub fn dispatch_eval(
+        &self,
+        stream: &TractCudaStream,
+        input: &DeviceTensor,
+        offset: i64,
+        out_len: usize,
+        output: &DeviceTensor,
+    ) -> TractResult<()> {
+        let rank = input.rank();
+        ensure!(rank >= 2);
+        ensure!(output.rank() == rank);
+        ensure!(output.datum_type() == input.datum_type());
+        let in_shape = input.shape();
+        let out_shape = output.shape();
+        // Output shares all leading axes with input, only the last differs.
+        ensure!(in_shape[..rank - 2] == out_shape[..rank - 2]);
+        ensure!(in_shape[rank - 2] == out_shape[rank - 2]);
+        ensure!(out_shape[rank - 1] == out_len);
+        // i64 -> i32 down-cast: model widths are well under i32::MAX in practice.
+        let offset_i32: i32 = offset.try_into().context("DiagGather offset overflows i32")?;
+        let out_len_i32: i32 = out_len.try_into().context("DiagGather out_len overflows i32")?;
+
+        // Flatten the (rank-2) leading axes into one batch axis.  Assumes the
+        // leading block is plain row-major (encoder use: rank-4 BxHxTxR with
+        // natural strides), so the batch stride is `t_q * (R or out_len)`.
+        let in_strides = input.strides();
+        let out_strides = output.strides();
+        let batch: usize = in_shape[..rank - 2].iter().product();
+        let t_q = in_shape[rank - 2];
+        let r_in = in_shape[rank - 1];
+        let in_stride_b: i32 = if rank >= 3 { (t_q * r_in) as i32 } else { 0 };
+        let in_stride_i = in_strides[rank - 2] as i32;
+        let in_stride_r = in_strides[rank - 1] as i32;
+        let out_stride_b: i32 = if rank >= 3 { (t_q * out_len) as i32 } else { 0 };
+        let out_stride_i = out_strides[rank - 2] as i32;
+        let out_stride_k = out_strides[rank - 1] as i32;
+
+        let i_view = get_cuda_view(input);
+        let o_view = get_cuda_view(output);
+
+        let func = cuda_context()
+            .load_pipeline(LibraryName::Array, self.kernel_name(input.datum_type())?)?;
+
+        let mut launch_args = TractLaunchArgs::new(stream, &func);
+        launch_args.push_view(&i_view);
+        launch_args.push_view(&o_view);
+        launch_args.push::<i32>(offset_i32);
+        launch_args.push::<i32>(batch as i32);
+        launch_args.push::<i32>(t_q as i32);
+        launch_args.push::<i32>(r_in as i32);
+        launch_args.push::<i32>(out_len_i32);
+        launch_args.push::<i32>(in_stride_b);
+        launch_args.push::<i32>(in_stride_i);
+        launch_args.push::<i32>(in_stride_r);
+        launch_args.push::<i32>(out_stride_b);
+        launch_args.push::<i32>(out_stride_i);
+        launch_args.push::<i32>(out_stride_k);
+
+        // Grid: x = out_len cols, y = T_q rows, z = batch.  Threads per block
+        // along x = min(out_len, 256) rounded down to multiple of 32.
+        let block_x = out_len.min(MAX_THREADS).max(32);
+        let grid_x = out_len.div_ceil(block_x);
+        let cfg = LaunchConfig {
+            grid_dim: (grid_x as _, t_q as _, batch as _),
+            block_dim: (block_x as _, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        launch_args.launch(cfg)
+    }
+}
+
+pub fn cuda_diag_gather_dispatch(
+    input: &DeviceTensor,
+    offset: i64,
+    out_len: usize,
+    output: &DeviceTensor,
+) -> TractResult<()> {
+    crate::with_cuda_stream(|stream| {
+        DiagGather.dispatch_eval(stream, input, offset, out_len, output)
+    })
+}
+
+crate::register_cuda_op!(tract_transformers::ops::diag_gather::DiagGather, |source, node, op| {
+    rule_if!(DiagGather::is_supported_dt(source.node_input_facts(node.id)?[0].datum_type));
+    Ok(Some(Box::new(tract_gpu::ops::diag_gather::GpuDiagGather::new(
+        op.offset.clone(),
+        op.out_len.clone(),
+        "Cuda",
+        cuda_diag_gather_dispatch,
+    ))))
+});
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tract_core::internal::Tensor;
+    use tract_gpu::tensor::IntoDevice;
+    use tract_transformers::ops::diag_gather as cpu_dg;
+
+    fn run_against_cpu(shape: &[usize], offset: i64, out_len: usize) -> TractResult<()> {
+        use tract_core::plan::TurnState;
+        crate::with_cuda_stream(|stream| {
+            let len: usize = shape.iter().product();
+            let data: Vec<f32> = (0..len).map(|i| i as f32).collect();
+            let cpu_in = Tensor::from_shape(shape, &data)?;
+            let cuda_in = cpu_in.clone().into_device()?;
+
+            // CPU DiagGather only implements eval_with_session (it resolves
+            // TDims against the session's resolved_symbols); pass an empty
+            // TurnState since the TDims here are already concrete.
+            let cpu_op =
+                cpu_dg::DiagGather { offset: (offset as i64).to_dim(), out_len: out_len.to_dim() };
+            let session = TurnState::default();
+            let cpu_out = cpu_op.eval_with_session(0, &session, tvec![cpu_in.into_tvalue()])?[0]
+                .clone()
+                .into_tensor();
+            let cuda_out = DiagGather.eval(stream, &cuda_in, offset, out_len)?;
+            cpu_out
+                .close_enough(&cuda_out.to_host()?.into_tensor(), Approximation::Exact)
+                .with_context(|| format!("shape={shape:?} offset={offset} out_len={out_len}"))
+        })
+    }
+
+    #[test]
+    fn test_diag_gather_skew_basic() -> TractResult<()> {
+        // Classic skew-trick shape: [B*H, T, 2T-1] -> [B*H, T, T] with offset = T-1.
+        let t = 4;
+        run_against_cpu(&[2, t, 2 * t - 1], (t - 1) as i64, t)
+    }
+
+    #[test]
+    fn test_diag_gather_rank4_encoder_like() -> TractResult<()> {
+        // Mirrors the encoder shape: [B, H, T_q, R].
+        let t = 14;
+        run_against_cpu(&[1, 8, t, 2 * t - 1], (t - 1) as i64, t)
+    }
+
+    #[test]
+    fn test_diag_gather_out_of_bounds_zero_fill() -> TractResult<()> {
+        // out_len > R so some `r = offset + k - i` fall outside [0, R) and
+        // must be zero-filled (matches CPU contract).
+        let r = 5;
+        let t = 4;
+        run_against_cpu(&[1, t, r], 1, 8)
+    }
+
+    #[test]
+    fn test_diag_gather_partial_overlap() -> TractResult<()> {
+        // offset = 0: row i reads input[..., i, -i..-i+out_len], so the
+        // first `i` columns of each row are out of bounds and zeroed.
+        let t = 4;
+        let r = 6;
+        run_against_cpu(&[1, t, r], 0, t)
+    }
+
+    #[test]
+    fn test_diag_gather_rank2() -> TractResult<()> {
+        // Smallest valid rank: no leading batch axes.
+        run_against_cpu(&[5, 9], 4, 5)
+    }
+}
diff --git a/cuda/src/kernels/array/gather.rs b/cuda/src/kernels/array/gather.rs
new file mode 100644
index 0000000000..1dc6c48fda
--- /dev/null
+++ b/cuda/src/kernels/array/gather.rs
@@ -0,0 +1,197 @@
+use crate::context::{TractCudaStream, cuda_context};
+use crate::kernels::launch_args::TractLaunchArgs;
+use crate::kernels::{LibraryName, MAX_THREADS, get_cuda_view};
+use anyhow::ensure;
+use cudarc::driver::{CudaStream, LaunchConfig, PushKernelArg};
+use std::fmt;
+use tract_core::internal::*;
+use tract_gpu::tensor::DeviceTensor;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct Gather;
+
+impl fmt::Display for Gather {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl Gather {
+    pub fn is_supported_dt(dt: DatumType) -> bool {
+        matches!(dt, DatumType::F32 | DatumType::F16)
+    }
+
+    pub fn kernel_name(&self, dt: DatumType) -> TractResult<String> {
+        ensure!(Self::is_supported_dt(dt), "Unsupported dt {:?} for cuda gather op", dt);
+        let tname = DeviceTensor::tname(dt)?;
+        Ok(format!("gather_{tname}"))
+    }
+
+    pub fn eval(
+        &self,
+        stream: &TractCudaStream,
+        data: &DeviceTensor,
+        indices: &DeviceTensor,
+        axis: usize,
+    ) -> TractResult<DeviceTensor> {
+        ensure!(data.rank() > axis);
+        let mut out_shape: TVec<usize> = data.shape()[..axis].into();
+        out_shape.extend(indices.shape().iter().copied());
+        out_shape.extend(data.shape()[axis + 1..].iter().copied());
+        let output = unsafe { DeviceTensor::uninitialized_dt(data.datum_type(), &out_shape)? };
+        self.dispatch_eval(stream, data, indices, axis, &output)?;
+        stream.synchronize()?;
+        Ok(output)
+    }
+
+    pub fn dispatch_eval(
+        &self,
+        stream: &TractCudaStream,
+        data: &DeviceTensor,
+        indices: &DeviceTensor,
+        axis: usize,
+        output: &DeviceTensor,
+    ) -> TractResult<()> {
+        ensure!(data.rank() > axis);
+        ensure!(indices.datum_type() == i64::datum_type());
+        ensure!(output.datum_type() == data.datum_type());
+
+        let data_shape = data.shape();
+        let pre: usize = data_shape[..axis].iter().product();
+        let a_size: usize = data_shape[axis];
+        let post: usize = data_shape[axis + 1..].iter().product();
+        let n_indices: usize = indices.shape().iter().product();
+
+        // Output volume must match (pre, n_indices, post) under natural strides;
+        // the GpuGather::output_facts code computes this same shape.
+        let expected: usize = pre * n_indices * post;
+        ensure!(
+            output.shape().iter().product::<usize>() == expected,
+            "Gather output shape mismatch: data={:?} axis={} indices={:?} output={:?}",
+            data_shape,
+            axis,
+            indices.shape(),
+            output.shape()
+        );
+
+        let d_view = get_cuda_view(data);
+        let i_view = get_cuda_view(indices);
+        let o_view = get_cuda_view(output);
+
+        let func = cuda_context()
+            .load_pipeline(LibraryName::Array, self.kernel_name(data.datum_type())?)?;
+
+        let mut launch_args = TractLaunchArgs::new(stream, &func);
+        launch_args.push_view(&d_view);
+        launch_args.push_view(&i_view);
+        launch_args.push_view(&o_view);
+        launch_args.push::<i32>(pre as i32);
+        launch_args.push::<i32>(a_size as i32);
+        launch_args.push::<i32>(post as i32);
+        launch_args.push::<i32>(n_indices as i32);
+
+        let block_x = post.min(MAX_THREADS).max(32);
+        let grid_x = post.div_ceil(block_x);
+        let cfg = LaunchConfig {
+            grid_dim: (grid_x as _, n_indices as _, pre as _),
+            block_dim: (block_x as _, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        launch_args.launch(cfg)
+    }
+}
+
+pub fn cuda_gather_dispatch(
+    data: &DeviceTensor,
+    indices: &DeviceTensor,
+    axis: usize,
+    output: &DeviceTensor,
+) -> TractResult<()> {
+    crate::with_cuda_stream(|stream| Gather.dispatch_eval(stream, data, indices, axis, output))
+}
+
+crate::register_cuda_op!(tract_core::ops::array::Gather, |source, node, op| {
+    let facts = source.node_input_facts(node.id)?;
+    // Plain-tensor path only.  The CPU op also handles block-quant and packed
+    // matrix storage; those decompose into a dequantization step that the GPU
+    // path doesn't (yet) cover.
+    rule_if!(facts[0].is_plain());
+    rule_if!(Gather::is_supported_dt(facts[0].datum_type));
+    rule_if!(facts[1].datum_type == i64::datum_type());
+    rule_if!(op.output_type.is_none() || op.output_type == Some(facts[0].datum_type));
+    Ok(Some(Box::new(tract_gpu::ops::gather::GpuGather::new(
+        op.axis,
+        "Cuda",
+        cuda_gather_dispatch,
+    ))))
+});
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tract_core::internal::Tensor;
+    use tract_core::ops::array::Gather as CpuGather;
+    use tract_gpu::tensor::IntoDevice;
+
+    fn run_against_cpu(
+        data_shape: &[usize],
+        indices_shape: &[usize],
+        indices_data: &[i64],
+        axis: usize,
+    ) -> TractResult<()> {
+        crate::with_cuda_stream(|stream| {
+            let n: usize = data_shape.iter().product();
+            let data = Tensor::from_shape(
+                data_shape,
+                &(0..n).map(|i| i as f32 / 10.0).collect::<Vec<_>>(),
+            )?;
+            let indices = Tensor::from_shape(indices_shape, indices_data)?;
+            let cuda_data = data.clone().into_device()?;
+            let cuda_indices = indices.clone().into_device()?;
+
+            let cpu_op = CpuGather::new(axis);
+            let cpu_out = cpu_op.eval(tvec![data.into_tvalue(), indices.into_tvalue()])?[0]
+                .clone()
+                .into_tensor();
+            let cuda_out = Gather.eval(stream, &cuda_data, &cuda_indices, axis)?;
+            cpu_out
+                .close_enough(&cuda_out.to_host()?.into_tensor(), Approximation::Exact)
+                .with_context(|| {
+                    format!(
+                        "data={data_shape:?} indices={indices_shape:?} axis={axis} \
+                         indices_data={indices_data:?}"
+                    )
+                })
+        })
+    }
+
+    /// Embedding lookup: rank-2 table, rank-2 index — the nemotron decoder shape.
+    #[test]
+    fn test_gather_embedding() -> TractResult<()> {
+        run_against_cpu(&[1025, 640], &[1, 1], &[42], 0)
+    }
+
+    /// Multi-batch embedding lookup.
+    #[test]
+    fn test_gather_embedding_multi() -> TractResult<()> {
+        run_against_cpu(&[100, 16], &[2, 3], &[0, 1, 99, 50, 25, 7], 0)
+    }
+
+    /// Non-zero axis: pre-batch axes flatten correctly.
+    #[test]
+    fn test_gather_axis_1() -> TractResult<()> {
+        run_against_cpu(&[3, 10, 4], &[2], &[0, 9], 1)
+    }
+
+    /// Negative indices wrap (axis size = 100, so -1 → 99, -100 → 0).
+    #[test]
+    fn test_gather_negative_indices() -> TractResult<()> {
+        run_against_cpu(&[100, 4], &[3], &[-1, -100, -50], 0)
+    }
+
+    /// Scalar index input (rank-0).
+    #[test]
+    fn test_gather_scalar_index() -> TractResult<()> {
+        run_against_cpu(&[5, 8], &[], &[3], 0)
+    }
+}
diff --git a/cuda/src/kernels/array/mod.rs b/cuda/src/kernels/array/mod.rs
index 309b21d06f..ff2e25ba02 100644
--- a/cuda/src/kernels/array/mod.rs
+++ b/cuda/src/kernels/array/mod.rs
@@ -1,12 +1,18 @@
 mod cast;
 mod copy;
+mod diag_gather;
 mod dispatch;
+mod gather;
 mod rotate_half;
 
 pub use cast::Cast;
 pub use cast::cuda_cast_dispatch;
 pub use copy::Memcpy;
+pub use diag_gather::DiagGather;
+pub use diag_gather::cuda_diag_gather_dispatch;
 pub use dispatch::cuda_copy_nd_dispatch;
+pub use gather::Gather;
+pub use gather::cuda_gather_dispatch;
 pub use rotate_half::RotateHalf;
 pub use rotate_half::cuda_rotate_half_dispatch;
 
@@ -26,5 +32,17 @@ pub fn all_functions() -> Vec<String> {
             .flat_map(|(dt1, dt2)| Cast.kernel_name(dt1, dt2).into_iter()),
     );
 
+    functions.extend(
+        tract_gpu::tensor::DeviceTensor::SUPPORTED_DT
+            .into_iter()
+            .flat_map(|dt| DiagGather.kernel_name(dt).into_iter()),
+    );
+
+    functions.extend(
+        tract_gpu::tensor::DeviceTensor::SUPPORTED_DT
+            .into_iter()
+            .flat_map(|dt| Gather.kernel_name(dt).into_iter()),
+    );
+
     functions.into_iter().collect()
 }
diff --git a/cuda/src/kernels/cu/array.cu b/cuda/src/kernels/cu/array.cu
index 5ed198dad4..260aae7029 100644
--- a/cuda/src/kernels/cu/array.cu
+++ b/cuda/src/kernels/cu/array.cu
@@ -219,3 +219,63 @@ INSTANTIATE_CAST_FROM(u64, uint64_t)
 // Rotate half: only float types
 INSTANTIATE_ROTATE_HALF(f32, float)
 INSTANTIATE_ROTATE_HALF(f16, __half)
+
+// Diagonal gather (Transformer-XL rel-pos skew, folded):
+//   out[..., i, k] = in[..., i, offset + k - i], 0 on out-of-bounds.
+// Leading axes are flattened by the host into one batch axis.  Each thread
+// owns one (b, i, k) output element — bandwidth-bound, no shared memory.
+#define INSTANTIATE_DIAG_GATHER(name, T)                                       \
+    extern "C" __global__ void diag_gather_##name(                             \
+        const T *input, T *output, const int32_t offset,                       \
+        const int32_t batch, const int32_t t_q, const int32_t r_in,            \
+        const int32_t out_len, const int32_t in_stride_b,                      \
+        const int32_t in_stride_i, const int32_t in_stride_r,                  \
+        const int32_t out_stride_b, const int32_t out_stride_i,                \
+        const int32_t out_stride_k) {                                          \
+        const int32_t k = blockIdx.x * blockDim.x + threadIdx.x;               \
+        if (k >= out_len)                                                      \
+            return;                                                            \
+        const int32_t i = blockIdx.y;                                          \
+        const int32_t b = blockIdx.z;                                          \
+        const int32_t r = offset + k - i;                                      \
+        T *out_ptr = output + b * out_stride_b + i * out_stride_i +            \
+                     k * out_stride_k;                                         \
+        if (r >= 0 && r < r_in) {                                              \
+            const T *in_ptr =                                                  \
+                input + b * in_stride_b + i * in_stride_i + r * in_stride_r;   \
+            *out_ptr = *in_ptr;                                                \
+        } else {                                                               \
+            *out_ptr = (T)0;                                                   \
+        }                                                                      \
+    }
+
+INSTANTIATE_DIAG_GATHER(f32, float)
+INSTANTIATE_DIAG_GATHER(f16, __half)
+
+// Gather along one axis:
+//   out[i_pre, i_n, i_post] = data[i_pre, indices[i_n], i_post]
+// where the host flattens to (pre × a_size × post) for data and
+// (pre × n_indices × post) for output.  `n_indices` here is the *flat*
+// indices count (product of the indices tensor's shape).  Negative indices
+// wrap with `a_size`, matching the CPU contract.
+#define INSTANTIATE_GATHER(name, T)                                            \
+    extern "C" __global__ void gather_##name(                                  \
+        const T *data, const int64_t *indices, T *output, const int32_t pre,   \
+        const int32_t a_size, const int32_t post, const int32_t n_indices) {   \
+        const int32_t i_post = blockIdx.x * blockDim.x + threadIdx.x;          \
+        if (i_post >= post)                                                    \
+            return;                                                            \
+        const int32_t i_n = blockIdx.y;                                        \
+        const int32_t i_pre = blockIdx.z;                                      \
+        int64_t k = indices[i_n];                                              \
+        if (k < 0)                                                             \
+            k += a_size;                                                       \
+        const int64_t in_off =                                                 \
+            ((int64_t)i_pre * a_size + k) * post + i_post;                     \
+        const int64_t out_off =                                                \
+            ((int64_t)i_pre * n_indices + i_n) * post + i_post;                \
+        output[out_off] = data[in_off];                                        \
+    }
+
+INSTANTIATE_GATHER(f32, float)
+INSTANTIATE_GATHER(f16, __half)
diff --git a/cuda/src/kernels/cu/nn.cu b/cuda/src/kernels/cu/nn.cu
index 7eb34dc4e9..fa3151acaa 100644
--- a/cuda/src/kernels/cu/nn.cu
+++ b/cuda/src/kernels/cu/nn.cu
@@ -588,6 +588,136 @@ __device__ void scaled_masked_softmax(
             out_stride_4);                                                     \
     }
 
+// Bool-mask variant: mask is char (0/1), substitutes -inf at masked positions
+// before the softmax.  When post_mask is non-zero, fully-masked rows (all
+// inputs -inf) are written as 0 instead of the NaN the naive softmax would
+// emit.  Partially-masked rows are unaffected: exp(-inf) = 0 already zeros
+// masked positions in the output.
+template <typename T, int BLOCK_SIZE>
+__device__ void scaled_bool_masked_softmax(
+    const T *x, const char *mask, const float scale, T *dst,
+    const int32_t post_mask, const int32_t shape_0, const int32_t shape_1,
+    const int32_t shape_2, const int32_t shape_3, const int32_t shape_4,
+    const int32_t stride_0, const int32_t stride_1, const int32_t stride_2,
+    const int32_t stride_3, const int32_t stride_4,
+    const int32_t mask_stride_0, const int32_t mask_stride_1,
+    const int32_t mask_stride_2, const int32_t mask_stride_3,
+    const int32_t mask_stride_4, const int32_t out_stride_0,
+    const int32_t out_stride_1, const int32_t out_stride_2,
+    const int32_t out_stride_3, const int32_t out_stride_4) {
+    int32_t z0 = blockIdx.z / shape_1;
+    int32_t z1 = blockIdx.z % shape_1;
+    x += blockIdx.x * stride_3 + blockIdx.y * stride_2 + z1 * stride_1 +
+         z0 * stride_0;
+    mask += blockIdx.x * mask_stride_3 + blockIdx.y * mask_stride_2 +
+            z1 * mask_stride_1 + z0 * mask_stride_0;
+    dst += blockIdx.x * out_stride_3 + blockIdx.y * out_stride_2 +
+           z1 * out_stride_1 + z0 * out_stride_0;
+
+    const int block_size = BLOCK_SIZE == 0 ? blockDim.x : BLOCK_SIZE;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    extern __shared__ float data_soft_max_f32[];
+    float *buf_iw = data_soft_max_f32;
+    float *vals = buf_iw + WARP_SIZE;
+
+    float max_val = -CUDART_INF_F;
+    _Pragma("unroll") for (int col0 = 0; col0 < shape_4; col0 += block_size) {
+        const int col = col0 + threadIdx.x;
+        if (col >= shape_4) {
+            break;
+        }
+
+        const bool m = mask[col * mask_stride_4] != 0;
+        const float val = m ? ((float)x[col * stride_4]) * scale : -CUDART_INF_F;
+        vals[col] = val;
+        max_val = max(max_val, val);
+    }
+
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf_iw[lane_id] = -CUDART_INF_F;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf_iw[lane_id];
+        max_val = warp_reduce_max(max_val);
+    }
+
+    float tmp = 0.0f;
+    _Pragma("unroll") for (int col0 = 0; col0 < shape_4; col0 += block_size) {
+        const int col = col0 + threadIdx.x;
+        if (col >= shape_4) {
+            break;
+        }
+
+        const float val = expf(vals[col] - max_val);
+        tmp += val;
+        vals[col] = val;
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __syncthreads();
+        if (warp_id == 0) {
+            buf_iw[lane_id] = 0.0f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf_iw[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    // Row-uniform: tmp <= 0 (or NaN) iff every position was masked.  When
+    // post_mask is set we write 0 in that case to scrub the NaN; otherwise
+    // we fall through to the normal 1/sum path and let it propagate.
+    const bool zero_row = post_mask && !(tmp > 0.0f);
+    const float inv_sum = 1.0f / tmp;
+
+    _Pragma("unroll") for (int col0 = 0; col0 < shape_4; col0 += block_size) {
+        const int col = col0 + threadIdx.x;
+        if (col >= shape_4) {
+            return;
+        }
+        dst[col * out_stride_4] = zero_row ? (T)0.0f : (T)(vals[col] * inv_sum);
+    }
+}
+
+#define INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, bname,                 \
+                                               block_size_template)            \
+    extern "C" __global__ void scaled_bool_masked_softmax_##bname##name(       \
+        const T *x, const char *mask, const float scale, T *dst,               \
+        const int32_t post_mask, const int32_t shape_0,                        \
+        const int32_t shape_1, const int32_t shape_2, const int32_t shape_3,   \
+        const int32_t shape_4, const int32_t stride_0,                         \
+        const int32_t stride_1, const int32_t stride_2,                        \
+        const int32_t stride_3, const int32_t stride_4,                        \
+        const int32_t mask_stride_0, const int32_t mask_stride_1,              \
+        const int32_t mask_stride_2, const int32_t mask_stride_3,              \
+        const int32_t mask_stride_4, const int32_t out_stride_0,               \
+        const int32_t out_stride_1, const int32_t out_stride_2,                \
+        const int32_t out_stride_3, const int32_t out_stride_4) {              \
+        scaled_bool_masked_softmax<T, block_size_template>(                    \
+            x, mask, scale, dst, post_mask, shape_0, shape_1, shape_2,         \
+            shape_3, shape_4, stride_0, stride_1, stride_2, stride_3,          \
+            stride_4, mask_stride_0, mask_stride_1, mask_stride_2,             \
+            mask_stride_3, mask_stride_4, out_stride_0, out_stride_1,          \
+            out_stride_2, out_stride_3, out_stride_4);                         \
+    }
+
 #define INSTANTIATE_RMS_NORM(name, T, bname, block_size)                       \
     extern "C" __global__ void rms_norm_##bname##name(                         \
         const T *x, T *dst, const int32_t shape_0, const int32_t shape_1,      \
@@ -652,8 +782,24 @@ INSTANTIATE_SOFTMAX(f16, __half, , 1024)
     INSTANTIATE_SCALED_MASKED_SOFTMAX(name, T, 32768_, 1024)                   \
     INSTANTIATE_SCALED_MASKED_SOFTMAX(name, T, 0_, 0)
 
+#define INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX_FOR_T(name, T)                  \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 32_, 32)                   \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 64_, 64)                   \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 128_, 126)                 \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 256_, 256)                 \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 512_, 512)                 \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 1024_, 1024)               \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 2048_, 1024)               \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 4096_, 1024)               \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 8192_, 1024)               \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 16384_, 1024)              \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 32768_, 1024)              \
+    INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX(name, T, 0_, 0)
+
 INSTANTIATE_SCALED_MASKED_SOFTMAX_FOR_T(f32, float)
 INSTANTIATE_SCALED_MASKED_SOFTMAX_FOR_T(f16, __half)
+INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX_FOR_T(f32, float)
+INSTANTIATE_SCALED_BOOL_MASKED_SOFTMAX_FOR_T(f16, __half)
 
 INSTANTIATE_REDUCE(f32, float, small_, 32)
 INSTANTIATE_REDUCE(f32, float, , 1024)
diff --git a/cuda/src/kernels/nn/mod.rs b/cuda/src/kernels/nn/mod.rs
index 668fad0763..a2bb0e78e1 100644
--- a/cuda/src/kernels/nn/mod.rs
+++ b/cuda/src/kernels/nn/mod.rs
@@ -54,7 +54,11 @@ pub fn all_functions() -> Vec<String> {
         tract_gpu::tensor::DeviceTensor::SUPPORTED_DT
             .into_iter()
             .flat_map(|dt| sms_block_sizes().into_iter().map(move |bs| (dt, bs as usize)))
-            .flat_map(|(dt, bs)| ScaledMaskedSoftmax.kernel_name(dt, bs).into_iter()),
+            .flat_map(|(dt, bs)| {
+                [false, true]
+                    .into_iter()
+                    .flat_map(move |mb| ScaledMaskedSoftmax.kernel_name(dt, mb, bs).into_iter())
+            }),
     );
 
     functions.extend(
diff --git a/cuda/src/kernels/nn/scaled_masked_softmax.rs b/cuda/src/kernels/nn/scaled_masked_softmax.rs
index 8e5d80a0c8..3653a490af 100644
--- a/cuda/src/kernels/nn/scaled_masked_softmax.rs
+++ b/cuda/src/kernels/nn/scaled_masked_softmax.rs
@@ -18,14 +18,25 @@ impl ScaledMaskedSoftmax {
         matches!(dt, DatumType::F32 | DatumType::F16)
     }
 
-    pub fn kernel_name(&self, dt: DatumType, block_size: usize) -> TractResult<String> {
+    pub fn is_supported_mask_dt(input_dt: DatumType, mask_dt: DatumType) -> bool {
+        mask_dt == input_dt || mask_dt == bool::datum_type()
+    }
+
+    pub fn kernel_name(
+        &self,
+        input_dt: DatumType,
+        mask_is_bool: bool,
+        block_size: usize,
+    ) -> TractResult<String> {
         ensure!(
-            Self::is_supported_dt(dt),
-            "Unsupported dt {:?} for cuda scaled masked softmaxop",
-            dt
+            Self::is_supported_dt(input_dt),
+            "Unsupported dt {:?} for cuda scaled masked softmax op",
+            input_dt
         );
-        let tname = DeviceTensor::tname(dt)?;
-        Ok(format!("scaled_masked_softmax_{block_size}_{tname}"))
+        let tname = DeviceTensor::tname(input_dt)?;
+        let stem =
+            if mask_is_bool { "scaled_bool_masked_softmax" } else { "scaled_masked_softmax" };
+        Ok(format!("{stem}_{block_size}_{tname}"))
     }
 
     pub fn eval(
@@ -34,9 +45,10 @@ impl ScaledMaskedSoftmax {
         input: &DeviceTensor,
         scale: &Tensor,
         mask: &DeviceTensor,
+        post_softmax_mask: bool,
     ) -> TractResult<DeviceTensor> {
         let output = unsafe { DeviceTensor::uninitialized_dt(input.datum_type(), input.shape())? };
-        self.dispatch_eval(stream, input, scale, mask, &output)?;
+        self.dispatch_eval(stream, input, scale, mask, post_softmax_mask, &output)?;
         stream.synchronize()?;
         Ok(output)
     }
@@ -47,19 +59,22 @@ impl ScaledMaskedSoftmax {
         input: &DeviceTensor,
         scale: &Tensor,
         mask: &DeviceTensor,
+        post_softmax_mask: bool,
         output: &DeviceTensor,
     ) -> TractResult<()> {
         ensure!(output.shape() == input.shape());
         ensure!(input.rank() >= 2 && input.rank() <= 5);
         ensure!(mask.rank() == input.rank());
         ensure!(output.datum_type() == input.datum_type());
-        ensure!(mask.datum_type() == input.datum_type());
+        let mask_is_bool = mask.datum_type() == bool::datum_type();
+        ensure!(Self::is_supported_mask_dt(input.datum_type(), mask.datum_type()));
+        // post_softmax_mask is meaningful only with a bool mask (CPU contract).
+        ensure!(!post_softmax_mask || mask_is_bool);
 
         let shape = pad(input.shape(), 1);
         let strides = pad(input.strides(), 0);
         let mask_strides = pad(&compute_broadcast_strides::<i32>(mask.shape(), mask.strides())?, 0);
         let output_strides = pad(output.strides(), 0);
-        let inner_len = shape[4];
 
         let i_view = get_cuda_view(input);
         let mask_view = get_cuda_view(mask);
@@ -74,14 +89,19 @@ impl ScaledMaskedSoftmax {
         let block_size =
             if inner_len.is_power_of_two() && inner_len > 32 { inner_len.min(1024) } else { 0 };
 
-        let func = cuda_context()
-            .load_pipeline(LibraryName::NN, self.kernel_name(input.datum_type(), block_size)?)?;
+        let func = cuda_context().load_pipeline(
+            LibraryName::NN,
+            self.kernel_name(input.datum_type(), mask_is_bool, block_size)?,
+        )?;
 
         let mut launch_args = TractLaunchArgs::new(stream, &func);
         launch_args.push_view(&i_view);
         launch_args.push_view(&mask_view);
         launch_args.push::<f32>(scale.cast_to_scalar::<f32>()?);
         launch_args.push_view(&o_view);
+        if mask_is_bool {
+            launch_args.push::<i32>(post_softmax_mask as i32);
+        }
         launch_args.push_slice_i32(&shape);
         launch_args.push_slice_i32(&strides);
         launch_args.push_slice_i32(&mask_strides);
@@ -111,22 +131,28 @@ pub fn cuda_scaled_masked_softmax_dispatch(
     input: &DeviceTensor,
     scale: &Tensor,
     mask: &DeviceTensor,
+    post_softmax_mask: bool,
     output: &DeviceTensor,
 ) -> TractResult<()> {
     crate::with_cuda_stream(|stream| {
-        ScaledMaskedSoftmax.dispatch_eval(stream, input, scale, mask, output)
+        ScaledMaskedSoftmax.dispatch_eval(stream, input, scale, mask, post_softmax_mask, output)
     })
 }
 
 crate::register_cuda_op!(
     tract_transformers::ops::scaled_masked_softmax::ScaledMaskedSoftmax,
     |source, node, op| {
-        rule_if!(!op.post_softmax_mask);
-        rule_if!(ScaledMaskedSoftmax::is_supported_dt(
-            source.node_input_facts(node.id)?[0].datum_type
+        let facts = source.node_input_facts(node.id)?;
+        rule_if!(ScaledMaskedSoftmax::is_supported_dt(facts[0].datum_type));
+        rule_if!(ScaledMaskedSoftmax::is_supported_mask_dt(
+            facts[0].datum_type,
+            facts[1].datum_type,
         ));
+        // post_softmax_mask requires a bool mask (CPU contract).
+        rule_if!(!op.post_softmax_mask || facts[1].datum_type == bool::datum_type());
         Ok(Some(Box::new(tract_gpu::ops::scaled_masked_softmax::GpuScaledMaskedSoftmax::new(
             op.scale.clone(),
+            op.post_softmax_mask,
             "Cuda",
             cuda_scaled_masked_softmax_dispatch,
         ))))
@@ -170,13 +196,63 @@ mod tests {
                 .eval(tvec![a.to_host()?.into_tvalue(), mask.to_host()?.into_tvalue()])?[0]
                 .clone()
                 .into_tensor();
-            let cuda_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask)?;
+            let cuda_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask, false)?;
             cpu_output
                 .close_enough(&cuda_output.to_host()?.into_tensor(), Approximation::Approximate)?;
             Ok(())
         })
     }
 
+    /// Bool-mask path with a fully-masked row.  Without post_softmax_mask
+    /// the output is NaN (matches CPU); with it on, the NaN is scrubbed to 0.
+    #[test]
+    fn test_scaled_bool_masked_softmax_post_mask_scrubs_nan() -> TractResult<()> {
+        crate::with_cuda_stream(|stream| {
+            let m = 3;
+            let n = 5;
+            let scale: Arc<_> = tensor0(0.125f32).into();
+            // Row 0: fully masked.  Row 1: partially masked.  Row 2: fully unmasked.
+            let mask_data: Vec<bool> = (0..m)
+                .flat_map(|r| {
+                    (0..n).map(move |c| match r {
+                        0 => false,
+                        1 => c >= 2,
+                        _ => true,
+                    })
+                })
+                .collect();
+            let mask = Tensor::from_shape(&[1, 1, m, n], &mask_data)?.into_device()?;
+            let a = Tensor::from_shape(
+                &[1, 1, m, n],
+                &(0..m * n).map(|f| f as f32).collect::<Vec<_>>(),
+            )?
+            .into_device()?;
+
+            for post in [false, true] {
+                let cpu = scaled_masked_softmax::ScaledMaskedSoftmax {
+                    scale: scale.clone(),
+                    post_softmax_mask: post,
+                };
+                let cpu_out = cpu
+                    .eval(tvec![a.to_host()?.into_tvalue(), mask.to_host()?.into_tvalue()])?[0]
+                    .clone()
+                    .into_tensor();
+                let cuda_out = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask, post)?;
+                let cuda_host = cuda_out.to_host()?.into_tensor();
+                let cpu_slice = cpu_out.view().as_slice::<f32>().unwrap();
+                let cuda_slice = cuda_host.view().as_slice::<f32>().unwrap();
+                for (i, (c, g)) in cpu_slice.iter().zip(cuda_slice.iter()).enumerate() {
+                    if c.is_nan() {
+                        assert!(g.is_nan(), "post={post} idx={i}: cpu NaN, cuda {g}");
+                    } else {
+                        assert!((c - g).abs() < 1e-5, "post={post} idx={i}: cpu {c} cuda {g}");
+                    }
+                }
+            }
+            Ok(())
+        })
+    }
+
     proptest::proptest! {
         #[test]
         fn scaled_masked_softmax_prop_f32(pb in any::<ScaledMaskedSoftmaxProblem<f32>>()) {
@@ -269,7 +345,7 @@ mod tests {
                 let mask =
                     Tensor::from_shape(self.mask_shape.as_slice(), &self.mask)?.into_device()?;
                 let scale: Arc<_> = tensor0::<F>(0.125f32.as_()).into();
-                let cuda_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask)?;
+                let cuda_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask, false)?;
                 Ok(cuda_output.to_host()?.into_tensor())
             })
         }
diff --git a/cuda/src/ops/quant_q81.rs b/cuda/src/ops/quant_q81.rs
index f724f1062a..10c763d2a6 100644
--- a/cuda/src/ops/quant_q81.rs
+++ b/cuda/src/ops/quant_q81.rs
@@ -104,15 +104,15 @@ impl TypedOp for CudaGgmlQuantQ81 {
         .with_context(|| format!("Error while computing facts for {:?}", self.name()))
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
-        let op = Self::new(self.io_facts.in_fact.eval(values)?.into_owned())?;
+        let op = Self::new(self.io_facts.in_fact.substitute(subs)?.into_owned())?;
         target.wire_node(&node.name, op, &[mapping[&node.inputs[0]]])
     }
     as_op!();
diff --git a/cuda/src/rewrite_rules/fuse_axis_op.rs b/cuda/src/rewrite_rules/fuse_axis_op.rs
index cc76e2fb68..975eab207c 100644
--- a/cuda/src/rewrite_rules/fuse_axis_op.rs
+++ b/cuda/src/rewrite_rules/fuse_axis_op.rs
@@ -113,9 +113,7 @@ pub fn fuse_axis_op(
 
     let node_name = &node.name;
 
-    let Some(in_nodes) = model.all_prec(node.id)? else {
-        return Ok(None);
-    };
+    rule_if_some!(in_nodes = model.all_prec(node.id)?);
 
     let mut grouped_axis_ops: TVec<TVec<GpuAxisOp>> = tvec![];
     let mut tap_inputs = tvec![];
@@ -205,7 +203,7 @@ pub fn fuse_move_axis(
     }
 
     // Fuse consecutive MoveAxis if possible
-    let Some(cursor) = model.single_succ(axis_node.id)? else { return Ok(None) };
+    rule_if_some!(cursor = model.single_succ(axis_node.id)?);
     if let (AxisOp::Move(from_1, to_1), AxisOp::Move(from_2, to_2)) = (
         axis_op.inner.clone(),
         cursor.op_as::<GpuAxisOp>().map(|ax_op| ax_op.inner.clone()).unwrap_or(AxisOp::Add(0)),
@@ -230,7 +228,7 @@ pub fn fuse_move_axis(
     }
 
     // Add(x) -> Move(x, y)
-    let Some(cursor) = model.single_prec(axis_node.id)? else { return Ok(None) };
+    rule_if_some!(cursor = model.single_prec(axis_node.id)?);
     if let (AxisOp::Move(from_1, to_1), AxisOp::Add(ax)) = (
         axis_op.inner.clone(),
         cursor.op_as::<GpuAxisOp>().map(|ax_op| ax_op.inner.clone()).unwrap_or(AxisOp::Rm(0)),
diff --git a/cuda/src/rewrite_rules/pad_q40_weights.rs b/cuda/src/rewrite_rules/pad_q40_weights.rs
index 480e1739a2..cb1c68a258 100644
--- a/cuda/src/rewrite_rules/pad_q40_weights.rs
+++ b/cuda/src/rewrite_rules/pad_q40_weights.rs
@@ -55,16 +55,10 @@ pub fn pad_q40_weights(
     _node_name: &str,
     op: &Const,
 ) -> TractResult<Option<TypedModelPatch>> {
-    let Some(dev_tensor) = op.val().to_device_tensor().ok() else {
-        return Ok(None);
-    };
+    rule_if_some!(dev_tensor = op.val().to_device_tensor().ok());
 
-    let DeviceTensor::Owned(t) = dev_tensor else {
-        return Ok(None);
-    };
-    let Some(cuda_tensor) = t.downcast_ref::<CudaTensor>() else {
-        return Ok(None);
-    };
+    rule_if_let!(DeviceTensor::Owned(t) = dev_tensor);
+    rule_if_some!(cuda_tensor = t.downcast_ref::<CudaTensor>());
 
     let bqf = cuda_tensor
         .exotic_fact()
@@ -77,9 +71,7 @@ pub fn pad_q40_weights(
     // all axis ops and CudaFusedAxisOp internal reshapes).  The raw tensor
     // may have a per-head shape like [m, num_heads, head_dim] that gets
     // collapsed before the GEMM.
-    let Some(effective_shape) = effective_gemm_shape(model, node, bqf.shape())? else {
-        return Ok(None);
-    };
+    rule_if_some!(effective_shape = effective_gemm_shape(model, node, bqf.shape())?);
     let effective_k = *effective_shape.last().unwrap();
     rule_ensure!(effective_k % Q40_ROW_PADDING != 0);
 
diff --git a/cuda/src/transform.rs b/cuda/src/transform.rs
index ee588d1108..97dc5bc9fa 100644
--- a/cuda/src/transform.rs
+++ b/cuda/src/transform.rs
@@ -135,9 +135,7 @@ fn try_make_cuda_op(
     });
 
     let input_facts = source.node_input_facts(node.id)?;
-    if !input_facts.iter().all(|f| DeviceTensor::is_supported_dt(f.datum_type)) {
-        return Ok(None);
-    }
+    rule_if!(input_facts.iter().all(|f| DeviceTensor::is_supported_dt(f.datum_type)));
 
     // Copy-based ops are fully generic (no backend-specific dispatch needed).
     if let Some(op) = tract_gpu::ops::copy_based::try_make_copy_based_op(source, node)? {
@@ -448,8 +446,61 @@ impl Translate<TypedFact, Box<dyn TypedOp>, TypedFact, Box<dyn TypedOp>> for Cud
             }
         }
 
-        // Single-op translation
-        if let Some(gpu_op) = try_make_cuda_op(source, node)? {
+        // Single-op translation.  Pre-check that the gpu_op accepts the
+        // already-translated target-side input facts: some translators
+        // (notably the AxisOp path: GpuAxisOp can carry a `Reshape(from,
+        // to)` whose dims were synthesised from the source shape, but an
+        // upstream node may have been translated into a different shape
+        // — e.g. pulsification of an upstream matmul producing a smaller
+        // axis).  Without the pre-check, those stale reshapes pass
+        // try_make_cuda_op and then bail inside `wire_node`'s output_facts
+        // call, aborting the entire CUDA transform.  Fall back to the CPU
+        // op so the model stays runnable.
+        // Snapshot target-side input facts before any further mutation; clone
+        // out so we release the borrow on `target` before wiring below.
+        let target_inputs: TVec<TypedFact> = node
+            .inputs
+            .iter()
+            .map(|i| target.outlet_fact(mapping[i]).map(|f| f.clone()))
+            .collect::<TractResult<_>>()?;
+        // Mirror what `sync_inputs_if_required(ToDevice)` will do at wire time:
+        // wrap any non-device input as a device fact so the GPU op's
+        // `output_facts` sees the same uniform device-fact inputs it will get
+        // after sync nodes are inserted.  Without this, mixed host/device
+        // inputs (e.g. an LLM kv-cache concat: host past + device current) make
+        // `output_facts` bail with "Inconsistent facts", wrongly tripping the
+        // CPU fallback.
+        let target_inputs_post_sync: TVec<TypedFact> = target_inputs
+            .iter()
+            .map(|f| -> TractResult<TypedFact> {
+                if f.as_device_fact().is_some() {
+                    Ok(f.clone())
+                } else {
+                    Ok(tract_gpu::fact::DeviceFact::from_host(f.clone())?.into_exotic_fact())
+                }
+            })
+            .collect::<TractResult<_>>()?;
+        let target_input_post_sync_refs: TVec<&TypedFact> =
+            target_inputs_post_sync.iter().collect();
+        let force_cpu = std::env::var("TRACT_CUDA_FORCE_CPU")
+            .ok()
+            .map(|s| s.split(',').any(|pat| !pat.is_empty() && node.name.contains(pat)))
+            .unwrap_or(false);
+        let maybe_gpu_op = if force_cpu { None } else { try_make_cuda_op(source, node)? };
+        if let Some(ref op) = maybe_gpu_op
+            && std::env::var("TRACT_CUDA_TRANSLATE_DEBUG").is_ok()
+            && let Err(e) = op.output_facts(&target_input_post_sync_refs)
+        {
+            eprintln!(
+                "cuda-translate-fallback: {} ({}) inputs={:?} -> {e:?}",
+                node.name,
+                op.name(),
+                target_inputs_post_sync,
+            );
+        }
+        if let Some(gpu_op) = maybe_gpu_op
+            && gpu_op.output_facts(&target_input_post_sync_refs).is_ok()
+        {
             let device_inputs =
                 sync_inputs_if_required(target, node, mapping, DeviceSyncKind::ToDevice)?;
             let outlet_ids = target.wire_node(node.name.clone(), gpu_op, &device_inputs)?;
diff --git a/cuda/src/utils.rs b/cuda/src/utils.rs
index 1498816bef..5cdd971387 100644
--- a/cuda/src/utils.rs
+++ b/cuda/src/utils.rs
@@ -14,15 +14,60 @@ use crate::ops::GgmlQuantQ81Fact;
 static CULIBS_MISSING: OnceLock<Option<&'static str>> = OnceLock::new();
 static DEPENDENCIES_OK: OnceLock<()> = OnceLock::new();
 
-// Ensure exactly cuda-13000 is enabled.
-// Prevent accidental change of feature gate without
-// updating this required API version used for compatibility check.
-// please update the 3 references bellow if cudarc gate is updated to a newer version.
+/// CUDA Driver API version this build of tract-cuda is bound against. Used both
+/// for the runtime driver-version check (in `ensure_cuda_driver_compatible`)
+/// and for the per-version cubin cache path. Derived at compile time from the
+/// active `cuda-XXXXX` feature; cudarc binds to that same enum/struct layout,
+/// so the two must agree. Adding a new minor here means adding the matching
+/// cargo feature too.
+#[cfg(feature = "cuda-12000")]
+pub const REQUIRED_CUDA_API: i32 = 12000;
+#[cfg(feature = "cuda-12010")]
+pub const REQUIRED_CUDA_API: i32 = 12010;
+#[cfg(feature = "cuda-12020")]
+pub const REQUIRED_CUDA_API: i32 = 12020;
+#[cfg(feature = "cuda-12030")]
+pub const REQUIRED_CUDA_API: i32 = 12030;
+#[cfg(feature = "cuda-12040")]
+pub const REQUIRED_CUDA_API: i32 = 12040;
+#[cfg(feature = "cuda-12050")]
+pub const REQUIRED_CUDA_API: i32 = 12050;
+#[cfg(feature = "cuda-12060")]
+pub const REQUIRED_CUDA_API: i32 = 12060;
+#[cfg(feature = "cuda-12080")]
+pub const REQUIRED_CUDA_API: i32 = 12080;
+#[cfg(feature = "cuda-12090")]
+pub const REQUIRED_CUDA_API: i32 = 12090;
+#[cfg(feature = "cuda-13000")]
 pub const REQUIRED_CUDA_API: i32 = 13000;
-#[cfg(not(feature = "cuda-13000"))]
+#[cfg(feature = "cuda-13010")]
+pub const REQUIRED_CUDA_API: i32 = 13010;
+#[cfg(feature = "cuda-13020")]
+pub const REQUIRED_CUDA_API: i32 = 13020;
+
+// Exactly one cuda-XXXXX feature must be enabled. cudarc itself panics at its
+// build script if zero are enabled, but it does not catch the case of two
+// being enabled simultaneously — and a mismatch between REQUIRED_CUDA_API and
+// cudarc's actual binding would be a silent ABI hazard. Enumerate the
+// supported set explicitly so a duplicate triggers `REQUIRED_CUDA_API` re-def
+// at compile time.
+#[cfg(not(any(
+    feature = "cuda-12000",
+    feature = "cuda-12010",
+    feature = "cuda-12020",
+    feature = "cuda-12030",
+    feature = "cuda-12040",
+    feature = "cuda-12050",
+    feature = "cuda-12060",
+    feature = "cuda-12080",
+    feature = "cuda-12090",
+    feature = "cuda-13000",
+    feature = "cuda-13010",
+    feature = "cuda-13020",
+)))]
 compile_error!(
-    "Tract CUDA backend currently supports only cudarc feature 'cuda-13000'. \
-        Enabled in Cargo features.",
+    "Tract CUDA backend requires exactly one of the cuda-XXXXX features \
+     (cuda-12000..cuda-13020) to be enabled. Enable it in Cargo features.",
 );
 
 /// CUDA Driver API status code type (CUresult is an enum, but it's ABI-compatible with int).
diff --git a/data/src/dim/parse.rs b/data/src/dim/parse.rs
index cbf4e4e1fe..adb7637d56 100644
--- a/data/src/dim/parse.rs
+++ b/data/src/dim/parse.rs
@@ -84,6 +84,7 @@ fn atom<'i>(symbol_table: &SymbolScope, i: &'i str) -> R<'i, TDim> {
         map(numeric, TDim::Val),
         map(|i| func(symbol_table, "min", i), TDim::Min),
         map(|i| func(symbol_table, "max", i), TDim::Max),
+        map(|i| func(symbol_table, "broadcast", i), TDim::Broadcast),
         map(|i| func(symbol_table, "floor", i), |xs| xs[0].clone()),
         map(|i| identifier(symbol_table, i), TDim::Sym),
         map(pair(recognize(stag("-")), |i| atom(symbol_table, i)), |(_, dim)| dim * -1),
@@ -175,6 +176,24 @@ mod test {
         );
     }
 
+    #[test]
+    fn parse_broadcast_func() {
+        let table = SymbolScope::default();
+        assert_eq!(
+            parse_tdim(&table, "broadcast(P,S)").unwrap(),
+            TDim::Broadcast(vec!(table.sym("P").into(), table.sym("S").into()))
+        );
+    }
+
+    #[test]
+    fn parse_broadcast_display_roundtrip() {
+        let table = SymbolScope::default();
+        let original = TDim::Broadcast(vec![table.sym("P").into(), table.sym("S").into()]);
+        let printed = format!("{original}");
+        let reparsed = parse_tdim(&table, &printed).unwrap();
+        assert_eq!(reparsed, original);
+    }
+
     #[test]
     fn parse_inequality_0() {
         let table = SymbolScope::default();
diff --git a/data/src/dim/sym.rs b/data/src/dim/sym.rs
index 96ac758a47..d975174b60 100644
--- a/data/src/dim/sym.rs
+++ b/data/src/dim/sym.rs
@@ -492,6 +492,12 @@ impl SymbolValues {
     pub fn get(&self, s: &Symbol) -> Option<i64> {
         self.values.get(s).copied()
     }
+
+    /// View the bindings as `Symbol → TDim` (each `i64` lifted to `TDim::Val`),
+    /// for callers that need to plug into APIs taking `HashMap<Symbol, TDim>`.
+    pub fn to_dim_map(&self) -> HashMap<Symbol, TDim> {
+        self.values.iter().map(|(s, v)| (s.clone(), TDim::Val(*v))).collect()
+    }
 }
 
 #[cfg(test)]
diff --git a/data/src/dim/tree.rs b/data/src/dim/tree.rs
index be09079344..6d87d4baba 100644
--- a/data/src/dim/tree.rs
+++ b/data/src/dim/tree.rs
@@ -30,7 +30,13 @@ impl std::error::Error for TooEarly {}
 
 macro_rules! b( ($e:expr) => { Box::new($e) } );
 
-#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+// `Hash` stays structural while `PartialEq` accepts an algebraic second chance:
+// see the `PartialEq` impl below for the rationale (the simplifier's internal
+// `HashMap<TDim, _>` only ever compares within same-canonical-form buckets, so
+// the standard `a == b => hash(a) == hash(b)` contract being violated outside
+// that path is acceptable here).
+#[allow(clippy::derived_hash_with_manual_eq)]
+#[derive(Clone, Eq, Hash, Debug)]
 pub enum TDim {
     Val(i64),
     Sym(Symbol),
@@ -49,6 +55,72 @@ pub enum TDim {
 
 use TDim::*;
 
+/// Structural equality on the TDim tree — what `#[derive(PartialEq)]` would
+/// produce.  Used as the fast-path inside `PartialEq` (and by the simplifier's
+/// internal `HashMap<TDim, _>`, which compares within same-hash buckets where
+/// structural equality is the only thing that matters).
+fn eq_structural(a: &TDim, b: &TDim) -> bool {
+    match (a, b) {
+        (Val(x), Val(y)) => x == y,
+        (Sym(x), Sym(y)) => x == y,
+        (Add(x), Add(y))
+        | (Mul(x), Mul(y))
+        | (Broadcast(x), Broadcast(y))
+        | (Min(x), Min(y))
+        | (Max(x), Max(y)) => {
+            x.len() == y.len() && x.iter().zip(y).all(|(a, b)| eq_structural(a, b))
+        }
+        (MulInt(p, x), MulInt(q, y)) => p == q && eq_structural(x, y),
+        (Div(x, p), Div(y, q)) => p == q && eq_structural(x, y),
+        (Ge(a, b), Ge(c, d)) | (Eq(a, b), Eq(c, d)) => eq_structural(a, c) && eq_structural(b, d),
+        _ => false,
+    }
+}
+
+// Thread-local guard: while simplifying the difference inside `eq`, fall back
+// to the structural-only path for any nested `==` calls.  Without this guard
+// the simplifier's internal `HashMap<TDim, i64>` would re-enter `eq` from
+// inside `(self - other).simplify()`, recursing without bound on
+// non-structurally-equal inputs.
+std::thread_local! {
+    static EQ_GUARD: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
+}
+
+impl PartialEq for TDim {
+    fn eq(&self, other: &Self) -> bool {
+        // Fast path: structural tree equality.
+        if eq_structural(self, other) {
+            return true;
+        }
+        // Inside an enclosing simplification triggered by a previous
+        // second-chance call, fall back to structural equality only.
+        if EQ_GUARD.with(|g| g.get()) {
+            return false;
+        }
+        // Skip second-chance when either side is a leaf (`Val` or `Sym`).
+        // For `Val(c)` vs anything non-`Val`: if they were semantically
+        // equal, the simplifier should already have folded the other side
+        // to `Val(c)`; running a diff-and-simplify here just risks
+        // arithmetic overflow on extreme constants (e.g. the simplifier
+        // filters against `Val(i64::MAX)`/`Val(i64::MIN)` sentinels).
+        // For `Sym(x)` leaves, assertion-driven equality belongs in
+        // `simplify`, not in `eq`.
+        if matches!(self, Val(_) | Sym(_)) || matches!(other, Val(_) | Sym(_)) {
+            return false;
+        }
+        // Second chance: prove the difference simplifies to zero.  Two
+        // algebraically equal TDims often arrive at different canonical
+        // forms via different construction paths (e.g. `1 + (7S+3)/4` and
+        // `((S+1)*7)/4` after blockify substitutes T → k·S in encoder
+        // shapes).  Subtracting and simplifying lets the existing
+        // simplifier rules cancel them out.
+        EQ_GUARD.with(|g| g.set(true));
+        let diff = (self.clone() - other.clone()).simplify();
+        EQ_GUARD.with(|g| g.set(false));
+        matches!(diff, Val(0))
+    }
+}
+
 fn tdim_lexi_order(a: &TDim, b: &TDim) -> Ordering {
     match (a, b) {
         (Sym(a), Sym(b)) => a.cmp(b),
@@ -88,6 +160,59 @@ fn tdim_lexi_order(a: &TDim, b: &TDim) -> Ordering {
     }
 }
 
+/// `Div(Add(terms), q)` — try to extract every `MulInt(c, X)` where `c % q == 0`
+/// out of the Div, leaving only a constant remainder in `[0, q)`.
+///
+/// Returns `Some(simplified)` when the residual constant is in `[0, q)` and
+/// every extracted symbolic factor `X` is provably non-negative — both
+/// conditions are required for soundness under tract's truncating
+/// division (`Rust i64 /`):
+///
+/// * the constant being in `[0, q)` makes `c/q_trunc = 0`;
+/// * `X ≥ 0` makes the identity `(k·X + c)/k_trunc = X` hold (it fails
+///   at e.g. `X = -1, k = 2, c = 0` because truncation rounds toward zero).
+///
+/// The `Val` arm above already handles constants outside `[0, q)`, so by
+/// the time we get here `terms` contains at most one `Val` and any number
+/// of `MulInt(c, X)` / other shapes.
+fn try_divide_multiple_plus_remainder(
+    terms: &[TDim],
+    q: u64,
+    scope: &SymbolScopeData,
+    extra: &[Assertion],
+) -> Option<TDim> {
+    let mut quotients: Vec<TDim> = vec![];
+    let mut const_rem: i64 = 0;
+    let mut any_extracted = false;
+    for term in terms {
+        match term {
+            MulInt(c, x) if *c != 0 && c.rem_euclid(q as i64) == 0 => {
+                if !scope.prove_positive_or_zero_with_extra(x, extra) {
+                    return None;
+                }
+                let new_coeff = c / (q as i64);
+                quotients.push(if new_coeff == 1 {
+                    (**x).clone()
+                } else if new_coeff == -1 {
+                    MulInt(-1, x.clone())
+                } else {
+                    MulInt(new_coeff, x.clone())
+                });
+                any_extracted = true;
+            }
+            Val(v) => const_rem += v,
+            _ => return None,
+        }
+    }
+    if !any_extracted {
+        return None;
+    }
+    if !(0..q as i64).contains(&const_rem) {
+        return None;
+    }
+    Some(if quotients.len() == 1 { quotients.remove(0) } else { Add(quotients) })
+}
+
 impl fmt::Display for TDim {
     fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
         match &self {
@@ -95,7 +220,9 @@ impl fmt::Display for TDim {
             Val(it) => write!(fmt, "{it}"),
             Add(it) => write!(fmt, "{}", it.iter().map(|x| format!("{x}")).join("+")),
             Mul(it) => write!(fmt, "{}", it.iter().map(|x| format!("({x})")).join("*")),
-            Broadcast(it) => write!(fmt, "{}", it.iter().map(|x| format!("({x})")).join("#")),
+            Broadcast(it) => {
+                write!(fmt, "broadcast({})", it.iter().map(|x| format!("({x})")).join(", "))
+            }
             Min(it) => write!(fmt, "min({})", it.iter().map(|x| format!("{x}")).join(",")),
             Max(it) => write!(fmt, "max({})", it.iter().map(|x| format!("{x}")).join(",")),
             MulInt(a, b) => write!(fmt, "{a}*{b}"),
@@ -326,11 +453,24 @@ impl TDim {
                     if let Add(terms) = &num {
                         let (integer, non_integer): (Vec<_>, Vec<_>) =
                             terms.iter().cloned().partition(|a| a.gcd() % q == 0);
-                        let mut new_terms = integer.iter().map(|i| i.div(*q)).collect::<Vec<_>>();
-                        if non_integer.len() > 0 {
-                            new_terms.push(Div(b!(Add(non_integer)), *q));
+                        // Skip when the non-integer bucket holds a constant:
+                        // under tract's truncating `/`, splitting (k·X+c)/k →
+                        // X + c/k is unsound for negative X (X=-1, k=2, c=1:
+                        // (-1)/2 = 0 ≠ X). The sound version, gated on
+                        // prove_positive_or_zero, lives in simplify_rec::Div
+                        // via try_divide_multiple_plus_remainder. Cases where
+                        // the remainder is purely symbolic (e.g. A%2 → /2
+                        // lowers to (A − 2·(A/2))/2, non_integer=[A]) stay
+                        // here: the emitted Div(non_integer, q) cancels with
+                        // the extracted quotient and reduces to zero.
+                        if !non_integer.iter().any(|t| matches!(t, Val(_))) {
+                            let mut new_terms =
+                                integer.iter().map(|i| i.div(*q)).collect::<Vec<_>>();
+                            if non_integer.len() > 0 {
+                                new_terms.push(Div(b!(Add(non_integer)), *q));
+                            }
+                            forms.push(Add(new_terms))
                         }
-                        forms.push(Add(new_terms))
                     }
                     forms.push(Div(b!(num), *q))
                 }
@@ -355,6 +495,58 @@ impl TDim {
         Self::find_any_sym(self).and_then(|s| s.scope().clone())
     }
 
+    /// Fully distribute every `Mul` of `Add`s in `self` into a flat sum of
+    /// products, then `simplify`.  Used to compare two algebraically equal
+    /// but differently-factored TDims for equality (e.g. Reshape volume
+    /// checks on graphs where the same dimension is built two ways).
+    ///
+    /// Cost can blow up combinatorially on very deeply factored expressions
+    /// — call this only at boundaries where structural equality is needed,
+    /// not as a general-purpose simplifier.
+    pub fn expand_polynomial(self) -> TDim {
+        use self::TDim::*;
+        match self {
+            Mul(terms) => {
+                let terms: Vec<TDim> = terms.into_iter().map(Self::expand_polynomial).collect();
+                if let Some(add_idx) = terms.iter().position(|t| matches!(t, Add(_))) {
+                    let Add(add_terms) = terms[add_idx].clone() else { unreachable!() };
+                    let others: Vec<TDim> = terms
+                        .iter()
+                        .enumerate()
+                        .filter(|(i, _)| *i != add_idx)
+                        .map(|(_, t)| t.clone())
+                        .collect();
+                    Add(add_terms
+                        .into_iter()
+                        .map(|t| {
+                            let mut product = others.clone();
+                            product.push(t);
+                            Mul(product).expand_polynomial()
+                        })
+                        .collect())
+                    .simplify()
+                } else {
+                    Mul(terms).simplify()
+                }
+            }
+            MulInt(c, inner) => MulInt(c, Box::new(inner.expand_polynomial())).simplify(),
+            Add(terms) => Add(terms.into_iter().map(Self::expand_polynomial).collect()).simplify(),
+            Div(a, q) => Div(Box::new(a.expand_polynomial()), q).simplify(),
+            Min(terms) => Min(terms.into_iter().map(Self::expand_polynomial).collect()).simplify(),
+            Max(terms) => Max(terms.into_iter().map(Self::expand_polynomial).collect()).simplify(),
+            Broadcast(terms) => {
+                Broadcast(terms.into_iter().map(Self::expand_polynomial).collect()).simplify()
+            }
+            Ge(a, b) => {
+                Ge(Box::new(a.expand_polynomial()), Box::new(b.expand_polynomial())).simplify()
+            }
+            Eq(a, b) => {
+                Eq(Box::new(a.expand_polynomial()), Box::new(b.expand_polynomial())).simplify()
+            }
+            it @ (Sym(_) | Val(_)) => it,
+        }
+    }
+
     pub fn simplify(self) -> TDim {
         use self::TDim::*;
         if let Ok(v) = self.eval_to_i64(&SymbolValues::default()) {
@@ -442,16 +634,56 @@ impl TDim {
                     }
                 }
 
+                // Pull the integer GCD of all term coefficients out as a
+                // common factor: e.g. Add([Val(6), MulInt(14, S)]) becomes
+                // MulInt(2, Add([Val(3), MulInt(7, S)])).  The downstream
+                // Div(MulInt(p, a), q) arm then cancels (p, q) gcd, so
+                // (6 + 14·S) / 8 reduces to (3 + 7·S) / 4 with no special
+                // Div-over-Add rule needed.
+                //
+                // Only consider entries with non-zero counts — zero-count
+                // entries (canceled-out factors) get filtered later, but
+                // would otherwise drag the gcd to spurious values.  Only
+                // factor when at least one surviving entry has a
+                // non-constant key, otherwise the Add reduces to a single
+                // `Val` and wrapping it in `MulInt(g, Val(c/g))` is a
+                // strict regression in canonical form.
+                let has_non_const =
+                    simplified_terms.iter().any(|(k, &c)| c != 0 && !matches!(k, Val(_)));
+                let coef_gcd = if has_non_const {
+                    simplified_terms
+                        .values()
+                        .filter(|&&c| c != 0)
+                        .map(|c| c.unsigned_abs() as i64)
+                        .reduce(|a, b| a.gcd(&b))
+                        .unwrap_or(0)
+                } else {
+                    0
+                };
+                let outer_factor = if coef_gcd > 1 {
+                    for v in simplified_terms.values_mut() {
+                        *v /= coef_gcd;
+                    }
+                    Some(coef_gcd)
+                } else {
+                    None
+                };
+
                 let mut members: Vec<TDim> = simplified_terms
                     .into_iter()
                     .filter_map(|(term, count)| evaluate_count(term, count))
                     .collect();
                 members.sort_by(tdim_lexi_order);
 
-                match members.len() {
+                let inner = match members.len() {
                     0 => TDim::Val(0),
                     1 => members.into_iter().next().unwrap(),
                     _ => TDim::Add(members),
+                };
+                match outer_factor {
+                    None => inner,
+                    Some(_) if matches!(inner, TDim::Val(0)) => TDim::Val(0),
+                    Some(g) => TDim::MulInt(g, Box::new(inner)),
                 }
             }
             Mul(terms) => {
@@ -459,6 +691,14 @@ impl TDim {
                 // expand Mul([a, Add([b, c])]) => Add([Mul([a, b]), Mul([a, c])]).
                 // This lets (T+1)*P simplify to T*P + P, which is needed for
                 // cancellation in expressions like (T+1)*P - T*P.
+                //
+                // Multi-Add Muls are *not* eagerly expanded here — keeping them
+                // factored matters for `maybe_div`'s bag-of-factors path, which
+                // cancels common Add factors symbolically (e.g. dividing
+                // 16B·(1+Y)² by 8B·(1+Y) needs the (1+Y)s to be visible as
+                // factors, not melted into a polynomial sum).  Use
+                // `expand_polynomial` if you need a fully distributed canonical
+                // form for equality comparisons (see Reshape volume check).
                 {
                     let add_indices: Vec<usize> = terms
                         .iter()
@@ -598,22 +838,61 @@ impl TDim {
                                 q
                             )),
                         )
-                    } else if let Some(v) =
-                        terms.iter().find_map(|t| if let Val(v) = t { Some(*v) } else { None })
+                    } else if let Some(val) = terms
+                        .iter()
+                        .find_map(|t| if let Val(v) = t { Some(*v) } else { None })
+                        .and_then(|v| {
+                            if v >= q as i64 {
+                                Some(v / q as i64)
+                            } else if v < 0 {
+                                Some(-Integer::div_ceil(&-v, &(q as i64)))
+                            } else {
+                                None
+                            }
+                        })
                     {
-                        let offset = if v >= q as i64 {
-                            Some(v / q as i64)
-                        } else if v < 0 {
-                            Some(-Integer::div_ceil(&-v, &(q as i64)))
-                        } else {
-                            None
+                        terms.push(Val(-val * q as i64));
+                        // simplify_rec the inner Div too so that follow-up rules
+                        // (e.g. divide-multiple-plus-remainder below) can collapse
+                        // it once the Val extraction has tidied up the residual.
+                        let inner = Div(b!(Add(terms).simplify_rec(scope, scenario, extra)), q)
+                            .simplify_rec(scope, scenario, extra);
+                        Add(vec![Val(val), inner])
+                    } else if let Some(simplified) =
+                        try_divide_multiple_plus_remainder(&terms, q, scope, extra)
+                    {
+                        // Match `Div(Add([k·X, …, c]), k)` where:
+                        //   - one or more terms have a coefficient that is a multiple of q,
+                        //   - the rest sum to a constant in [0, q),
+                        //   - every extracted X is provably non-negative.
+                        // Then `(k·X + c)/k = X + 0 = X` under tract's truncating
+                        // division.  This is sound for X ≥ 0 only — at X = -1
+                        // the truncation rounds toward zero, not floor, breaking
+                        // the identity.  We use prove_positive_or_zero to gate.
+                        simplified.simplify_rec(scope, scenario, extra)
+                    } else if let Some(found_idx) = terms.iter().position(|term| {
+                        // Rule: (Y − q·(Y/q)) / q = 0  [i.e. (Y mod q) / q = 0]
+                        // Always sound: |Y mod q| < q so (Y mod q)/q = 0 under
+                        // truncating division regardless of sign of Y.
+                        matches!(term, MulInt(p, inner)
+                            if *p == -(q as i64)
+                            && matches!(inner.as_ref(), Div(_, q2) if *q2 == q))
+                    }) {
+                        let MulInt(_, inner) = &terms[found_idx] else { unreachable!() };
+                        let Div(y, _) = inner.as_ref() else { unreachable!() };
+                        let remaining: Vec<TDim> = terms
+                            .iter()
+                            .enumerate()
+                            .filter(|&(i, _)| i != found_idx)
+                            .map(|(_, t)| t.clone())
+                            .collect();
+                        let remaining_sum = match remaining.len() {
+                            0 => Val(0),
+                            1 => remaining.into_iter().next().unwrap(),
+                            _ => Add(remaining),
                         };
-                        if let Some(val) = offset {
-                            terms.push(Val(-val * q as i64));
-                            Add(vec![
-                                Val(val),
-                                Div(b!(Add(terms).simplify_rec(scope, scenario, extra)), q),
-                            ])
+                        if eq_structural(&remaining_sum, y) {
+                            Val(0)
                         } else {
                             Div(b!(Add(terms)), q)
                         }
@@ -959,6 +1238,25 @@ impl TDim {
         self.clone().neg().prove_strict_positive()
     }
 
+    /// Least common multiple of two `TDim`s when both reduce to positive
+    /// integers.
+    ///
+    /// Returns `Val(0)` if either operand is `0`, and `None` if either is
+    /// symbolic, negative, or if the LCM would overflow `i64`. Callers
+    /// that need a safe answer for symbolic operands should fall back at
+    /// the call site.
+    pub fn lcm(&self, other: &TDim) -> Option<TDim> {
+        match (self.as_i64(), other.as_i64()) {
+            (Some(a), Some(b)) if a > 0 && b > 0 => {
+                let g = (a as u64).gcd(&(b as u64));
+                let l = (a as u64 / g).saturating_mul(b as u64);
+                if l > i64::MAX as u64 { None } else { Some(TDim::Val(l as i64)) }
+            }
+            (Some(0), _) | (_, Some(0)) => Some(TDim::Val(0)),
+            _ => None,
+        }
+    }
+
     pub fn gcd(&self) -> u64 {
         use self::TDim::*;
         match self {
@@ -1014,7 +1312,7 @@ impl TDim {
         TDim::Div(Box::new(Add(vec![self, Val(rhs as i64 - 1)])), rhs).reduce()
     }
 
-    pub(super) fn guess_slope(&self, sym: &Symbol) -> (i64, u64) {
+    pub fn guess_slope(&self, sym: &Symbol) -> (i64, u64) {
         fn slope_rec(d: &TDim, sym: &Symbol) -> (i64, i64) {
             match d {
                 Val(_) => (0, 1),
@@ -1380,6 +1678,16 @@ mod tests {
         assert_eq!(add(&A.to_dim(), &neg(&A.to_dim())).reduce(), Val(0))
     }
 
+    #[test]
+    fn lcm_basic() {
+        assert_eq!(Val(16).lcm(&Val(32)), Some(Val(32)));
+        assert_eq!(Val(32).lcm(&Val(16)), Some(Val(32)));
+        assert_eq!(Val(6).lcm(&Val(8)), Some(Val(24)));
+        assert_eq!(Val(7).lcm(&Val(7)), Some(Val(7)));
+        // Symbolic: not computable; callers fall back.
+        assert_eq!(Val(16).lcm(&A.to_dim()), None);
+    }
+
     #[test]
     fn reduce_neg_mul() {
         assert_eq!(neg(&mul(2, &A.to_dim())).reduce(), mul(-2, &A.to_dim()))
@@ -1502,6 +1810,59 @@ mod tests {
         assert_eq!(e1, e2);
     }
 
+    #[test]
+    fn divide_multiple_plus_remainder() {
+        // (k·X + r)/k → X under truncating division when 0 ≤ r < k AND X ≥ 0.
+        let scope = SymbolScope::default().with_assertion("S>=0").unwrap();
+        let s = scope.sym("S");
+
+        // (2S+1)/2 → S
+        let e: TDim = (s.to_dim() * 2 + 1) / 2;
+        assert_eq!(e.simplify(), s.to_dim());
+
+        // -1 + (2S+1)/2 → S - 1
+        let e: TDim = (s.to_dim() * 2 + 1) / 2 - 1;
+        assert_eq!(e.simplify(), s.to_dim() - 1);
+
+        // (2S-1)/2 → S - 1   (Val rule extracts -1 first, then our rule)
+        let e: TDim = (s.to_dim() * 2 - 1) / 2;
+        assert_eq!(e.simplify(), s.to_dim() - 1);
+
+        // (4S+3)/2 → 2S + 1   (Val rule extracts 1 = 3/2, then our rule on (4S+1)/2 → 2S)
+        let e: TDim = (s.to_dim() * 4 + 3) / 2;
+        assert_eq!(e.simplify(), s.to_dim() * 2 + 1);
+    }
+
+    #[test]
+    fn divide_multiple_plus_remainder_no_assertion() {
+        // Without an X≥0 assertion the (k·X+c)/k → X identity does NOT hold
+        // (X=-1, k=2, c=1 gives -1/2=0 ≠ X under truncating division). The
+        // wiggle Div arm used to emit that variant unconditionally; reduce()
+        // would then pick it on cost. Now wiggle skips the variant when the
+        // remainder bucket contains a Val, leaving only the sound rule
+        // gated on prove_positive_or_zero in simplify_rec.
+        let scope = SymbolScope::default();
+        let s = scope.sym("S");
+        let e: TDim = (s.to_dim() * 2 + 1) / 2;
+        assert_ne!(e.simplify(), s.to_dim());
+    }
+
+    #[test]
+    fn modulo_div_is_zero() {
+        // (Y − q·(Y/q)) / q = 0 for any Y and any q — the modulo remainder
+        // divided by the modulus is always zero under truncating division.
+        let scope = SymbolScope::default();
+        let s = scope.sym("S");
+        // Simple case: (S - 2*(S/2)) / 2 = (S mod 2) / 2 = 0
+        let e: TDim = (s.to_dim() - s.to_dim() / 2 * 2) / 2;
+        assert_eq!(e.simplify(), TDim::Val(0));
+        // Composite case: ((S+1) - 2*((S+1)/2)) / 2 = 0
+        // This is the exact pattern from SameUpper conv padding.
+        let a = s.to_dim() + 1;
+        let e2: TDim = (a.clone() - a.clone() / 2 * 2) / 2;
+        assert_eq!(e2.simplify(), TDim::Val(0));
+    }
+
     #[test]
     fn reduce_div_bug_3() {
         let e1: TDim = (A.to_dim() / 2) * -4;
@@ -1515,6 +1876,19 @@ mod tests {
         assert_eq!(e, A.to_dim());
     }
 
+    #[test]
+    fn expand_polynomial_two_add_factors() {
+        // (a + 2*a*b) * (1 + b)  ==poly==  a * (1 + b) * (1 + 2*b)
+        // Both fully expand to a + 3*a*b + 2*a*b*b.  We don't auto-expand in
+        // simplify (it would block maybe_div on factored-form denominators),
+        // but expand_polynomial does, and Reshape uses it for volume checks.
+        let a = A.to_dim();
+        let b = B.to_dim();
+        let lhs = (a.clone() + a.clone() * &b * 2) * (TDim::from(1) + &b);
+        let rhs = a.clone() * (TDim::from(1) + &b) * (TDim::from(1) + b.clone() * 2);
+        assert_eq!(lhs.expand_polynomial(), rhs.expand_polynomial());
+    }
+
     #[test]
     fn reduce_div_mul() {
         let e: TDim = A.to_dim() / 2 * 2;
@@ -1901,4 +2275,51 @@ mod tests {
         let c_s = c.simplify();
         assert_eq!(a_s, c_s, "8*(-1*B) should simplify the same as -8*B");
     }
+
+    /// Encoder-pulse case: (6 + 14·S) / 8 == (3 + 7·S) / 4.
+    /// Both Add terms (Val(6) and MulInt(14, S)) share factor 2 with
+    /// divisor 8, so the simplifier should reduce both sides by 2.
+    #[test]
+    fn reduce_div_by_common_factor_with_divisor() {
+        let lhs = (A.to_dim() * 14 + 6) / 8;
+        let rhs = (A.to_dim() * 7 + 3) / 4;
+        assert_eq!(lhs, rhs);
+    }
+
+    /// Common factor that fully divides the divisor → drop the divisor.
+    /// (4·a + 8) / 4  ==  a + 2.
+    #[test]
+    fn reduce_div_when_factor_equals_divisor() {
+        let lhs = (A.to_dim() * 4 + 8) / 4;
+        let rhs = A.to_dim() + 2;
+        assert_eq!(lhs, rhs);
+    }
+
+    /// No common factor → no reduction (identity check).
+    /// (3 + 7·a) / 4 stays as-is (gcd(3, 7, 4) = 1).
+    #[test]
+    fn no_reduce_when_terms_coprime_with_divisor() {
+        let e = (A.to_dim() * 7 + 3) / 4;
+        // We just check it didn't reduce to something weird; the
+        // canonical form is `Div(Add(...), 4)`.
+        match &e {
+            Div(_, q) => assert_eq!(*q, 4),
+            other => panic!("expected Div(_, 4), got {other:?}"),
+        }
+    }
+
+    /// Sym without an explicit `MulInt` wrapper has implicit coefficient
+    /// 1.  Any common factor gcd including 1 collapses to 1, so the
+    /// reduction does nothing — the rule must not silently drop the Sym.
+    #[test]
+    fn no_reduce_when_sym_has_implicit_unit_coefficient() {
+        // (a + 4) / 2 must stay non-trivial — gcd(1, 4, 2) = 1.
+        let e = (A.to_dim() + 4) / 2;
+        // It can simplify to other forms but it should still depend on `a`.
+        // Eval at a=2 → (2+4)/2 = 3.  Eval at a=4 → (4+4)/2 = 4.
+        let sv2 = SymbolValues::default().with(&A, 2);
+        let sv4 = SymbolValues::default().with(&A, 4);
+        assert_eq!(e.eval_to_i64(&sv2).unwrap(), 3);
+        assert_eq!(e.eval_to_i64(&sv4).unwrap(), 4);
+    }
 }
diff --git a/doc/README.md b/doc/README.md
index 0cfc361b1b..27295d70f1 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -1,7 +1,18 @@
-# Tract internals documentation
+# tract internals documentation
 
-This kind of documentation does not tend to age well. Use with caution.
+Internal notes about tract. Start from [`AGENTS.md`](../AGENTS.md) for the
+operational quick reference (crate map, build/test, model rewriting,
+streaming, CLI inspection); the documents here cover conceptual material
+that is harder to derive from reading the source.
 
-* a [tract crates introduction](intro.md)
-* a [tract command line cookbook](cli-recipe.md)
-* [graph, model, node, op, facts](graph.md)
+* [`intro.md`](intro.md) — what tract is and the tract-OPL design
+* [`pipeline.md`](pipeline.md) — load → optimise → run, and the `Runtime` trait
+* [`symbolic-shapes.md`](symbolic-shapes.md) — `TDim`, `Symbol`, and how to bind them
+* [`graph.md`](graph.md) — Graph, Node, Outlet, Fact, model pipeline
+* [`op.md`](op.md) — anatomy of an Op (`Op` / `EvalOp` / `TypedOp` / `InferenceOp`)
+* [`cli-recipe.md`](cli-recipe.md) — `tract` command-line cookbook
+* [`kernel-notes.md`](kernel-notes.md) — tract-linalg kernels and debugging
+* [`nnef/`](nnef/) — reference schemas for the `tract_*` NNEF extensions
+
+Documentation drifts faster than code. If something here disagrees with the
+source, trust the source — and consider patching the doc.
diff --git a/doc/cli-recipe.md b/doc/cli-recipe.md
index be03013c76..c6239154a9 100644
--- a/doc/cli-recipe.md
+++ b/doc/cli-recipe.md
@@ -77,6 +77,32 @@ Several other intermediate network "stages" can be reached by using `--pass XXX`
 `--pass load` and `--pass analyse` are interesting as they can dump a network for which inputs are
 unknown (maybe to try and figure out what they could be).
 
+For machine-readable output, pass `--audit-json` to `dump`:
+
+```bash
+tract -O mobilenetv2-7.onnx -i 1,3,224,224,f32 dump --audit-json | jq '.nodes[0]'
+```
+
+The default `dump` text output is meant for humans and is awkward to parse;
+prefer `--audit-json` from scripts. (Same advice applies to `--profile`,
+see below.)
+
+## NNEF round-trip
+
+To convert a loaded network into the tract-OPL (NNEF) form on disk:
+
+```bash
+tract mobilenetv2-7.onnx -i 1,3,224,224,f32 dump --nnef model.nnef.tgz
+```
+
+To load it back, pass `--nnef-tract-core` (and `--nnef-tract-onnx` if the
+network uses ONNX-only extensions) so the parser registers the right
+operator set:
+
+```bash
+tract --nnef-tract-core model.nnef.tgz dump
+```
+
 ## Benching a network
 
 We can get a reading of tract performance on a model by running the `bench` or`criterion`
@@ -90,6 +116,16 @@ tract -O mobilenetv2-7.onnx -i 1,3,224,224,f32 criterion
 The first one is a simple bench runner customized for tract specific needs, the second one
 uses the [criterion](https://docs.rs/criterion) crate.
 
+**`-O` is required for any meaningful number.** Without it, `bench` runs the
+decluttered-but-not-optimised graph: generic `Scan` instead of `OptScan`,
+`EinSum` / `Conv` instead of lowered `OptMatMul`, no codegen. The result
+runs and is numerically correct, but it is several times slower than what
+you would actually ship — the dump's op histogram is the tell. The library
+equivalent is `model.into_optimized()`; calling `into_runnable()` on a
+decluttered model is also valid but measures the same un-optimised graph.
+See [`pipeline.md`](pipeline.md) for the full stage breakdown and the
+per-runtime variations (`DefaultRuntime` / `MetalRuntime` / `CudaRuntime`).
+
 ## Profiling a network
 
 Getting a raw performance number is a first step, but tract can also profile a network execution.
@@ -110,6 +146,64 @@ to double tract Flops number before comparing with, say, BLAS implementations.
 Please do not parse this output. At least use the `--json` output. We do not commit on its stability
 but it's less susceptible to changes.
 
+**`--profile` finds hot ops within one graph; it is not a valid A/B between
+two graph shapes.** Per-node timing accrues per-node dispatch overhead
+(~tens of ns per op on commodity hardware): a path of many small nodes
+pays it many times; a single fused op pays it once. Summing per-node
+times to compare a fused rewrite against the unfused original
+systematically over-credits the fused side. Use `bench` wall-clock for
+between-graph comparisons.
+
+## Common timing pitfalls
+
+- **Thermal bias on sustained workloads.** Apple Silicon throttles after
+  a few minutes of continuous bench load. Alternating OFF / ON runs
+  systematically bias the second half. Batch N OFF runs then N ON runs
+  (or insert cooldowns) before trusting a 1-2% delta.
+- **WASM benches don't transfer between engines.** Wasmtime/Cranelift
+  and V8/TurboFan disagree at the 10-20% level on the same SIMD kernel.
+  Measure in the engine you ship to.
+- **WASM tier-up.** V8 runs the Liftoff baseline JIT first, then re-JITs
+  hot code with TurboFan. First-pass numbers can be 2-4× off steady state;
+  warm generously and read steady-state.
+
+## Environment variables
+
+A small set of `TRACT_*` env vars override defaults that are normally fine
+out of the box. Most are codegen or CPU-detection knobs you only reach for
+when chasing a perf regression or working around an exotic platform; they
+apply equally to the library and the CLI.
+
+| Variable | Effect |
+|---|---|
+| `TRACT_LOG` | `env_logger` filter (e.g. `tract=debug`, `cli=info,tract=warn`). The CLI also derives a default level from `-v` / `-vv`. |
+| `TRACT_LAZY_IM2COL_MIN_KERNEL` | Minimum convolution kernel volume before lazy im2col is preferred over eager. Default: 6. Lower it to experiment on memory-constrained targets. |
+| `TRACT_LAZY_IM2COL_MAX_EAGER_BYTES` | Scratch-buffer ceiling above which `Conv` switches from eager to lazy im2col. Per-family default (~1 MiB on WASM, ~4 MiB on native). Key knob for the canary-model regression gate. |
+| `TRACT_CPU_AARCH64_KIND` | Force aarch64 CPU family detection (`a53`, `a55`, `a72`, `applem`, `generic`, …). Useful for QEMU runs that misreport. |
+| `TRACT_CPU_AARCH64_OVERRIDE_CPU_PART` | Force the raw CPU part hex (`0xd03`, …) before the kind-lookup table runs. Lower-level escape hatch when `TRACT_CPU_AARCH64_KIND` doesn't cover the target. |
+| `TRACT_CPU_ARM32_NEON` | Force armv7 NEON detection on/off (`true`/`1` or `false`/`0`). |
+| `TRACT_CPU_EXPECT_ARM32_NEON` | Used by the test suite to assert the detection result matches what the platform should expose. CI-only. |
+
+The two `LAZY_IM2COL_*` knobs are documented inline next to the constants in
+`core/src/ops/cnn/conv/conv.rs`; the CPU-detect knobs in `linalg/src/arm32.rs`
+and `linalg/src/arm64.rs`. See [`kernel-notes.md`](kernel-notes.md) for
+context on the kernel selection that `LAZY_IM2COL_*` is steering.
+
+## Pulsified networks
+
+The CLI can turn a streaming-friendly network into a pulsified one and run
+the assertion path against a batch reference (see also AGENTS.md §Streaming
+and pulsification):
+
+```bash
+tract --nnef-tract-core model.nnef.tgz --pulse 'T=2' run \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+```
+
+The CLI accounts for the accumulated `pulse.delay` when comparing against
+the batch reference. Synthetic test cases under `harness/pulse-multi-axis/`
+follow this pattern via a `runme.sh` driver.
+
 ## Running a test case
 
 `tract` command line can also be use to build test-case, either for non-regression insurance
@@ -143,3 +237,22 @@ The log displays "Checked output #0, ok." (among other information).
 
 [generate_io.py here](/onnx/test_cases/transformer-mlm/generate_io.py) contains an example building a
 testcase for a BERT model from huggingface for inspiration.
+
+## Saving outputs
+
+The `--save-outputs` (long form `--save-outputs-npz`) flag on the `run` subcommand writes the
+model outputs to an `.npz` file after execution. This is the easiest way to capture a reference
+output for a given input, which can then be used with `--assert-output-bundle` in a later run.
+
+```sh
+# capture outputs from a first run
+tract -O model.onnx run --input-from-bundle inputs.npz --save-outputs reference.npz
+
+# replay and assert on a subsequent run (e.g. after a code change)
+tract -O model.onnx run --input-from-bundle inputs.npz --assert-output-bundle reference.npz
+```
+
+Output tensors are keyed by their model output name (or `output_N` if unnamed).
+
+There is also `--save-outputs-nnef` which writes each output tensor as a separate `.dat` file
+in a folder, in NNEF layout — useful for inspecting individual tensors with external tools.
diff --git a/doc/graph.md b/doc/graph.md
index 70a89d9252..5c39a7a158 100644
--- a/doc/graph.md
+++ b/doc/graph.md
@@ -210,10 +210,14 @@ trait InferenceOp /* [...] */ {
         &mut self,
         inputs: TVec<&InferenceFact>,
         outputs: TVec<&InferenceFact>,
-    ) -> TractResult<(Vec<InferenceFact>, Vec<InferenceFact>)> {
+        observed: TVec<&InferenceFact>,
+    ) -> TractResult<(TVec<InferenceFact>, TVec<InferenceFact>, TVec<InferenceFact>)>;
 }
 ```
 
+The third `observed` parameter and return slot is used by control-flow ops
+(e.g. scan) — ignore it for normal ops.
+
 Here, the analyser will call the `infer` method providing all the known
 information accumulated, and the op must do its best to return more determined
 facts. Note that in the case of `TypedOp`, the typing information strictly goes
@@ -231,7 +235,7 @@ already computed, handing them over to the successor ops.
 
 ```rust
 pub trait EvalOp {
-    fn eval(&self, inputs: Vec<Arc<Tensor>>) -> TractResult<Vec<Arc<Tensor>>>;
+    fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>>;
     /* .. */
 }
 ```
diff --git a/doc/intro.md b/doc/intro.md
index bb9350fda4..0010fb85d3 100644
--- a/doc/intro.md
+++ b/doc/intro.md
@@ -1,157 +1,61 @@
 # Tract
 
-tract is a neural network inference library. It takes trained networks from higher-level
-frameworks (Tensorflow, PyTorch, etc.), converts them to an intermediate representation
-and runs them on the end-user data. It is designed to be very portable and embedding
-friendly. We believe in running Neural Network Inference on the Edge, on a
-browser or a small embeddable CPU.
-
-## How to use tract ?
-
-* tract-onnx is a Rust library that can load and run an ONNX network. About 85%
-    of ONNX operators are supported.
-* tract-tensorflow is a Rust library that can load and run a TensorFlow 1
-    network. Because of the huge size of TensorFlow, a smaller portion of the
-    operator set is supported.
-* tract-nnef is a Rust lbrary that can load and run NNEF networks. Most of
-    NNEF is supported (missing deconv, ROI operations and quantization).
-* tract is the main command line interface (can be installed with "cargo install").
-    It can load network in any of the previously listed formats, dump them in a
-    user friendly form, bench and profile a network.
-    Additionaly, the tract command line can be used to convert a network to
-    NNEF (with some extensions). tract-nnef is significanly smaller and
-    lighter to start than tract-onnx or tract-tensorflow, so this conversion
-    is useful for embedded situations.
-
-## Crates
-
-### tract-data
-
-Contains the Tensor struct, DatumType enum, and TDim (symbolic dimension
-value type).
-
-### tract-linalg
-
-It is bit of a misnomer: this crate contains the low-level optimised
-routines for fast computation (actually not restricted to LINear ALGebra).
-Beyond Intel, we payed specific attention to the ARM 6, 7 and 8 use
-platforms. It is not meant to be used directly.
-
-### tract-core
-
-The heart of tract. It contains
-
-    * the network graph representation manipulation (Graph, Node)
-    * the "core" operator set of tract
-    * most of the network optimisation logic.
-
-tract-core depends on tract-linalg only, and is usually not used directly.
-
-### tract-nnef
-
-It support for NNEF format and maps its operator set to tract-core operators,
-It also contains some tract-core proprietary extension to NNEF.
-This crate depends on tract-core (thus tract-linalg transitively).
-It is the entry-point for embedded situations where NNEF is preferred to ONNX
-or tensorflow formats (requiring model translation to NNEF before hand).
-
-### tract-hir
-
-Python-based training frameworks (TensorFlow or ONNX) have to support lots of
-"python-isms" or "numpy-isms". While they are helpful at model design time,
-they can be a burden at inference time. As a consequence, we try to have most
-of them translated before getting into tract-core. This allow us to comply with
-ONNX or TensorFlow semantics while keeping tract-core complexity more
-manageable.
-
-Examples of such patterns are: negative indexing, negative rank indexing, rank
-broadcasting.
-
-It features the InferenceModel, InferenceFact (and friends), along with the
-"analyser" that can work from the partial types and shapes included in the
-training frameworks formats, to the stricter expectations of tract-core.
-
-It also contains translation to tract-core logic for operators which have
-close enough semantics between TensorFlow and ONNX.
-
-This crate is not meant to be used directly.
-
-### tract-onnx and tract-onnx-opl
-
-Support for ONNX protobuf format and mapping of ONNX operators to tract-hir,
-tract-core or ad-hoc operators.
-
-tract-onnx-opl depends only on tract-core and tract-nnef. It contains
-operators implementation from ONNX operators which do not have an equivalent
-in tract-core, including dumping to / loading from OPL.
-
-tract-onnx is the library to use to load and run an ONNX network. It uses
-tract-hir for type inference and translate ONNX operators to operators from
-tract-core and tract-onnx-opl.
-
-### tract-tensorflow
-
-Support for TensorFlow 1 frozen model format, similar to the ONNX crates.
-
-NB: The split between tract-tensorflow (tensorflow parser, tensorflow operators
-mapping to core) and tract-tensorflow-opl (ad-hoc implementation of operators)
-has not been done yet.
-
-### tract-pulse and tract-pulse-opl
-
-Implements translation of streaming networks to pulsing network (tract-pulse)
-including runtime support (ad-hoc operatrs in tract-pulse-opl).
-
-### tract-kaldi
-
-Partial support for kaldi framework model. Consider it very experimental, it
-may disappear at any time.
-
-### tract
-
-In the `cli/` sub-directory, implements the command line tool.
+tract is a neural network inference library. It takes trained networks from
+higher-level frameworks (TensorFlow, PyTorch, etc.), converts them to an
+intermediate representation, and runs them on end-user data. It is designed
+to be portable and embedding-friendly, with a focus on inference on the
+edge, in the browser, or on small embeddable CPUs.
+
+For an overview of the codebase (crates, traits, model rewriting, streaming,
+CLI inspection) see [`AGENTS.md`](../AGENTS.md). The notes in this directory
+cover material that is harder to derive from reading the source:
+
+* this file — the tract-OPL philosophy and the translate-time / runtime split
+* [`pipeline.md`](pipeline.md) — load → optimise → run, and the `Runtime` trait
+* [`symbolic-shapes.md`](symbolic-shapes.md) — `TDim`, `Symbol`, and how to bind them
+* [`graph.md`](graph.md) — Graph, Node, Fact, Op concepts
+* [`op.md`](op.md) — anatomy of an Op (Op / EvalOp / TypedOp / InferenceOp)
+* [`cli-recipe.md`](cli-recipe.md) — `tract` command-line cookbook
+* [`kernel-notes.md`](kernel-notes.md) — tract-linalg kernels and debugging
+* [`nnef/`](nnef/) — reference schemas for tract NNEF extensions
+
+## Public API
+
+Client code (applications, examples, language bindings) should use the
+`api/rs` crate. The authoritative surface is `api/rs/src/lib.rs`. The
+internal crates (`core`, `nnef`, `onnx`, ...) are not stable API.
 
 ## tract-OPL
 
-Tract OPL (for Operation Programming Language) is an intermediate
-representation of a Neural Network. It is based on NNEF. NNEF is a
-specification aiming to be for *inference* applications what ONNX is
-to *training* frameworks. As it turns out, inference implementations and
-training frameworks have widely divergent objectives.
-
-Tract can be used as a monolithic library, accepting an ONNX or
-TensorFlow model, loading it and optimising it on the fly (using
-tract-onnx API).
-
-We have recently added support for (most of) NNEF. As this format is
-designed for inference, translating it to tract-core operator set is
-very straightforward.
-
-We have built tract OPL on top the tract NNEF support: we have extended
-NNEF to support tract operators that are not present in NNEF. The same
-extension mechanism can be used to extend NNEF with operators belonging
-to ONNX that we chose not to include in tract-core. That way it is
-possible to reduce runtime footprint and startup time:
-    * tract command line includes tract-onnx. It can be used to translate
-an onnx network to a tract-core-plus-extensions model in memory, then dump
-this network in NNEF form. This is done once, right after training.
-    * At runtime we only to tract-core, tract-nnef (for the
-format parser) and optionaly tract-onnx-opl if the network used one of the
-handful of ONNX operations that are not supported natively by tract-core.
-
-The split between translation time and runtime have also been done for the
-streaming (aka pulse) capabilities. We only need tract-pulse to preprocess
-the network (which we can do with the command line) but only ship
-`tract-pulse-opl`.
-
-It could (and should) be done with tract-tensorflow too.
-
-Note that tract-OPL format is machine independant. We still need to call
-into_optimized() on the loaded NNEF network to get the most efficient network
-possible, but this operation is actually much lighter than the "decluttering"
-of the network from the training formats to the tract-core/NNEF semantics.
-
-We are playing with the idea of adding another similar split (tract R for
-tract Runtime). The machine optimized network form would be stored at this
-time, shedding most of the optimisation code from tract-core and making
-networks even faster to load.
+Tract OPL (Operation Programming Language) is an NNEF-based intermediate
+representation of a neural network. NNEF aims to be for *inference* what
+ONNX is for *training* frameworks — inference engines and training
+frameworks have widely divergent requirements, so OPL keeps the operator
+surface narrow and machine-independent.
+
+We extend NNEF with fragments for tract-core operators that NNEF does not
+cover and for ONNX/TF operators we chose to keep out of tract-core. This
+lets us split the operator surface across crates and shrink the runtime
+footprint:
+
+* The `tract` command line includes `tract-onnx`. It can translate an ONNX
+  network to a tract-core-plus-extensions model in memory and dump it as
+  NNEF. This is normally done once, right after training.
+* At runtime, you ship only `tract-core`, `tract-nnef` (the parser), and
+  optionally `tract-onnx-opl` for the handful of ONNX-only operators.
+
+The same translate-time / runtime split applies to streaming: `tract-pulse`
+turns a regular model into a pulsified one (via the command line if needed),
+and only the much smaller `tract-pulse-opl` is needed at runtime.
+
+The tract-OPL format is machine-independent. Calling `into_optimized()` on a
+loaded NNEF network produces the most efficient form for the current
+machine; this is much cheaper than the full decluttering pass that runs
+when loading directly from a training format. For the full pipeline
+mechanics — what `into_optimized` actually runs, how the `Runtime` trait
+fits in, and what `MetalRuntime`/`CudaRuntime` do differently — see
+[`pipeline.md`](pipeline.md).
+
+The NNEF extensions are documented as reference schemas in
+[`nnef/`](nnef/) — `tract_core`, `tract_onnx`, `tract_pulse`, and
+`tract_resource`.
diff --git a/doc/kernel-notes.md b/doc/kernel-notes.md
index bba29c458b..895ea6b314 100644
--- a/doc/kernel-notes.md
+++ b/doc/kernel-notes.md
@@ -22,3 +22,13 @@ If one needs to debug a kernel a useful workflow is to simply insert a
 `mov rNN, [0]` at the appropriate point, and configure GDB with
 `handle SIGSEGV stop nopass`. This'll pause in GDB but not send the
 signal to the program.
+
+## Tuning knobs
+
+A handful of `TRACT_*` env vars steer kernel selection and CPU detection
+without recompiling — most usefully `TRACT_LAZY_IM2COL_MIN_KERNEL` /
+`TRACT_LAZY_IM2COL_MAX_EAGER_BYTES` for the `Conv` codegen crossover, and
+`TRACT_CPU_AARCH64_KIND` / `TRACT_CPU_ARM32_NEON` for forcing detection on
+emulated or misreporting targets. See
+[`cli-recipe.md` § Environment variables](cli-recipe.md#environment-variables)
+for the full list.
diff --git a/doc/nnef/tract-onnx.nnef b/doc/nnef/tract-onnx.nnef
index 24632c780f..91b1d4cfb2 100644
--- a/doc/nnef/tract-onnx.nnef
+++ b/doc/nnef/tract-onnx.nnef
@@ -8,7 +8,9 @@ fragment tract_onnx_isinf(
     input: tensor<scalar>,
     detect_positive: logical = true,
     detect_negative: logical = true
-) -> (output: tensor<?>)fragment tract_onnx_lrn(
+) -> (output: tensor<?>);
+
+fragment tract_onnx_lrn(
     input: tensor<scalar>,
     alpha: scalar = 0.0001,
     beta: scalar = 0.75,
diff --git a/doc/nnef/tract-pulse.nnef b/doc/nnef/tract-pulse.nnef
index cfcc462ae7..3909dd6e0b 100644
--- a/doc/nnef/tract-pulse.nnef
+++ b/doc/nnef/tract-pulse.nnef
@@ -1,6 +1,6 @@
-# Extension `tract_resource` extends NNEF with operators
+# Extension `tract_pulse` extends NNEF with operators
 # for pulsified networks.
-# 
+#
 # Add `extension tract_pulse` to `graph.nnef`
 
 
diff --git a/doc/op.md b/doc/op.md
index 8b8fbbfca3..136127c00e 100644
--- a/doc/op.md
+++ b/doc/op.md
@@ -73,13 +73,13 @@ business side of things.
 
 ```rust
 pub trait EvalOp {
-     fn eval(&self, inputs: TVec<Arc<Tensor>>) -> TractResult<TVec<Arc<Tensor>>> {
+     fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
          bail!("stateless evaluation not implemented")
      }
 
      fn state(
          &self,
-         session: &mut SessionState,
+         session: &mut TurnState,
          node_id: usize,
      ) -> TractResult<Option<Box<dyn OpState>>> {
          Ok(None)
@@ -89,21 +89,23 @@ pub trait EvalOp {
 }
 ```
 
-The EvalOp realize the actual computation the Operator is supposed to perform. It
-supports both *stateful* and *stateless* operators. Most of them are stateless:
-they should just implement `eval` method and say so in `is_stateless()`. The
-handful of stateful operators will implements `state()` instead and return
-`false` is is_stateless: the framework will call `state()` during the network
-initialization, then will call `eval()` on the obtained `OpState` instead:
+The EvalOp realises the actual computation the Operator is supposed to
+perform. It supports both *stateful* and *stateless* operators. Most are
+stateless: they implement `eval` and return `true` from `is_stateless()`.
+The handful of stateful operators implement `state()` instead and return
+`false` from `is_stateless()`; the framework calls `state()` during
+network initialisation, then calls `eval()` on the obtained `OpState`
+instead:
 
 ```rust
-pub trait OpState: fmt::Debug + Send + dyn_clone::DynClone {
+pub trait OpState: fmt::Debug + dyn_clone::DynClone + OpStateFreeze + Downcast {
     fn eval(
         &mut self,
-        session: &mut SessionState,
+        session: &mut TurnState,
         op: &dyn Op,
-        inputs: TVec<Arc<Tensor>>,
-    ) -> TractResult<TVec<Arc<Tensor>>>;
+        inputs: TVec<TValue>,
+    ) -> TractResult<TVec<TValue>>;
+    /* [...] */
 }
 ```
 
@@ -112,6 +114,53 @@ required, or access the `SessionState`.
 
 But most operators are stateless anyway.
 
+## Working with a Tensor's data
+
+A `Tensor` is fully dynamically typed: shape, datum type, and storage
+layout are all carried at runtime, none of them in the Rust type. Every
+typed access goes through a turbofish (`::<T>()`) and a runtime dtype
+check (or an `_unchecked` variant that skips the check).
+
+Inside `eval`, you receive `TVec<TValue>` (each `TValue` is essentially
+an `Arc<Tensor>`). The inventory of `Tensor` access methods:
+
+Reading the data:
+
+- `t.to_plain_array_view::<T>()?` / `_mut` — safe `ndarray::ArrayView`
+  with dtype + storage checks; call `.as_slice()` on it for a slice.
+- `t.as_ptr::<T>()?` / `as_ptr_mut::<T>()?` — safe, dtype-checked raw
+  pointer (useful at FFI boundaries).
+- For rank-0 tensors: `t.try_as_plain()?.to_scalar::<T>()?` for a safe
+  read, `t.to_scalar_mut::<T>()?` for the mutable side.
+- `t.as_bytes()` / `as_bytes_mut()` — typeless byte view.
+- `t.as_slice_unchecked::<T>()` / `as_slice_mut_unchecked::<T>()` — the
+  hot-path slice. `unsafe` only because nothing re-checks that the
+  dtype is `T`; once `output_facts` has asserted it, the call is
+  effectively safe.
+
+Constructors:
+
+- `tensor0(x)` … `tensor4(&[[[[…]; N]; M]; T])` — rank-0..rank-4 tensor
+  literals from Rust array/slice literals. `tensor0` is the canonical
+  way to spell a scalar constant; the higher-rank ones are the
+  convenient option for unit-test fixtures. `rctensor0` … `rctensor4`
+  for an `Arc<Tensor>` instead.
+- `Tensor::from_shape::<T>(&shape, &data)?` — copies the slice in.
+- `Tensor::zero::<T>(&shape)?` / `zero_dt(dt, &shape)?` — zero-initialised.
+- `unsafe { Tensor::uninitialized::<T>(&shape)? }` — for the
+  fill-it-yourself output path; pair with `as_slice_mut_unchecked` to
+  write.
+
+Conversions:
+
+- `t.cast_to::<T>()?` / `cast_to_dt(dt)?` — element-wise cast, returns
+  a `Cow<Tensor>` (no-op if already the right dtype).
+- `t.into_shape(&shape)?` — reshape, consumes self.
+
+There is no safe `as_slice::<T>()` shortcut: either drop into
+`_unchecked` once the dtype is established, or go through
+`to_plain_array_view`.
+
 ## tract-core TypedOp trait
 
 `Op` is metadata, `EvalOp` is runtime, `TypedOp` is about reasoning on the
@@ -310,4 +359,3 @@ representation of the Op. The callback can add NNEF fragments (NNEF lingo for
 functions) to the NNEF document but its main responsibility is to translate 
 the node and its op to some NNEF ast nodes.
 
-## Expansions, and rules wrapper
diff --git a/doc/pipeline.md b/doc/pipeline.md
new file mode 100644
index 0000000000..d7c3dcd02f
--- /dev/null
+++ b/doc/pipeline.md
@@ -0,0 +1,242 @@
+# Load → optimise → run
+
+This is the canonical pipeline a model goes through between disk and a
+`Runnable` you can call `.run()` on. Each stage is a separate method, so
+you can stop at any point to inspect or serialise the intermediate.
+
+## Public API
+
+The pipeline shows up in the public API as a chain of typed wrappers, one
+per stage. The happy path on CPU is two method calls between loading and
+running:
+
+```rust
+use tract::prelude::*;
+
+let model = tract::onnx()?
+    .load("model.onnx")?            // InferenceModel
+    .into_model()?                  // Model
+    .into_runnable()?;              // Runnable
+
+let outputs = model.run(tvec!(input.into()))?;
+```
+
+What each call buys you:
+
+- `.load(path)` — opens a framework file and returns an `InferenceModel`,
+  the partial-shape/type form. Bind partial shapes here with
+  `set_input_fact(ix, "1,3,224,224,f32")`.
+- `.into_model()` — resolves all shapes and types and hands back a
+  `Model` in tract-OPL form. This is the portable, NNEF-serialisable
+  representation; you can `.transform("f32_to_f16")` it or write it to
+  disk before going any further.
+- `.into_runnable()` — produces a `Runnable` via the **`default`**
+  runtime, which is the in-process CPU implementation. Codegen and
+  kernel selection happen here; if you skipped this call and serialised
+  the `Model` instead, you would have something that loads on any
+  machine.
+
+Loading from NNEF is one step shorter: `tract::nnef()?.load(path)?` hands
+back a `Model` directly. NNEF already carries fully-resolved shapes and
+types, so there is nothing for the `InferenceModel` stage to resolve.
+This is the recommended deployment shape — translate to NNEF once (with
+the CLI or once at startup), ship that, and skip the framework loaders
+at runtime.
+
+For a non-CPU runtime, pick one explicitly and prepare against it:
+
+```rust
+let rt = tract::runtime_for_name("metal")?;     // or "cuda", "default", ...
+let runnable = rt.prepare(model)?;
+```
+
+`runtime_for_name` looks up the runtime by name in the `inventory`-based
+registry; whichever runtime crates are linked into the binary contribute
+to the pool (`tract-metal`, `tract-cuda`, ...). `.into_runnable()` is
+exactly the shortcut for `runtime_for_name("default")?.prepare(model)?`
+— same code path, the CPU runtime just happens to be registered under
+that name.
+
+## CLI and internals
+
+This is the *what is `.into_runnable()` actually doing* view: the
+pipeline stage by stage, the `Runtime` trait that owns the second half,
+and how the CLI's flags surface each piece.
+
+### The stages
+
+| Stage | Method | Lives in | Output |
+|---|---|---|---|
+| Load | `tract::onnx().load(path)` / `tract::nnef().model_for_path(path)` | `tract-onnx` / `tract-nnef` | `InferenceModel` |
+| Analyse | `InferenceModel::into_typed()` | `tract-hir` | `TypedModel` (full shapes/types) |
+| Declutter | `TypedModel::into_decluttered()` | `tract-core` | portable, NNEF-serialisable tract-OPL |
+| Optimise | `TypedModel::optimize()` | `tract-core` | target-specific LIR (codegen, kernel selection) |
+| Plan | `TypedSimplePlan::new_with_options(model, options)` | `tract-core` | `Runnable` |
+| Spawn / run | `Runnable::spawn() -> State` ; `State::run(inputs)` | `tract-core` | tensors |
+
+Two convenience wrappers stitch the middle stages:
+
+- `TypedModel::into_decluttered()` = declutter only.
+- `TypedModel::into_optimized()` = declutter + optimise.
+
+### Decluttered vs optimised
+
+**Decluttered** is the portable form: training artefacts are gone, obvious
+patterns are fused (`RmsNorm`, `Silu`, ...), but the operator set is still
+the high-level tract-OPL one — `EinSum`, `Conv`, generic `Scan`, etc. This
+is what you serialise to NNEF, what `tract dump` shows by default, and
+what crosses a CPU/GPU split unchanged.
+
+**Optimised** is target-specific: `EinSum`/`Conv` get lowered to
+`OptMatMul` over the platform's micro-kernels (`avx512_mmm_f32_*`,
+`arm64simd_mmm_f32_*`, ...), `Scan` becomes `OptScan`, and per-machine
+codegen patches are applied. This is what runs; you don't serialise it
+because it's only valid for the machine that produced it.
+
+Two consequences worth knowing:
+
+- **Optimisation isn't optional for timing.** Running a decluttered model
+  is numerically correct but several times slower than what you'd ship
+  (see [`cli-recipe.md`](cli-recipe.md) § Benching for the `-O` callout).
+- **"Optimised" means different things to different runtimes.** The CPU
+  runtime's `into_optimized()` differs from what Metal or CUDA does
+  before they hand off to their own kernels — see the next section.
+
+### The Runtime trait
+
+`tract_core::runtime::Runtime` owns the "from typed model to runnable"
+half of the pipeline. A runtime's `prepare(model: TypedModel)` method
+applies whatever transform / codegen its target needs, then wraps the
+result in a `TypedSimplePlan`:
+
+```rust
+pub trait Runtime: Debug + Send + Sync + 'static {
+    fn name(&self) -> StaticName;
+    fn prepare(&self, model: TypedModel) -> TractResult<Box<dyn Runnable>>;
+    /* ... */
+}
+```
+
+Concrete runtimes shipped in-tree:
+
+| Runtime | Crate | What `prepare` does |
+|---|---|---|
+| `DefaultRuntime` | `tract-core` | `model.into_optimized()` then `SimplePlan` |
+| `MetalRuntime` | `tract-metal` | `MetalTransform::default()` then `into_optimized()` then `SimplePlan` |
+| `CudaRuntime` | `tract-cuda` | `CudaTransform` then `optimize()` then `SimplePlan` |
+| `UnoptimizedRuntime` | `tract-cli` (CLI only) | `SimplePlan` straight over the typed model — no optimise |
+
+A runtime registers itself via `register_runtime!` (an `inventory`-based
+registry), so any crate linked into the binary contributes to the runtime
+pool. The CLI's `--runtime` flag and the library's `runtime_for_name(s)`
+lookup go through that registry.
+
+Single-threaded ARM CPU loads are the primary production target; x86_64,
+Apple Silicon (CPU and Metal), CUDA, and WASM are all supported.
+
+`UnoptimizedRuntime` only exists for the CLI — it lets `tract --pass …
+run` execute the intermediate form (decluttered, before-optimise, etc.)
+for inspection or `--assert-output-bundle` round-trips. It is not what
+you want for performance numbers; mistaking it for `DefaultRuntime` is
+the source of the "tract is silently slow" trap.
+
+### How the CLI maps to runtimes
+
+- **`tract … run`** (no `-O`) — runs `UnoptimizedRuntime` over whatever
+  stage `--pass` last produced (default: `before-optimize`). Correct,
+  slow.
+- **`tract -O … run`** — runs `DefaultRuntime`, which calls
+  `into_optimized()`. Production path; same as the library's
+  `.into_runnable()`.
+- **`tract -O --metal … run`** / **`--cuda …`** / **`--runtime <name>`**
+  — picks a specific runtime by name. Each runtime applies its own
+  pre-plan transform (e.g. `MetalTransform` inserts Metal-side dispatch
+  ops, then standard `into_optimized` lowers what's left of the CPU
+  part).
+
+## Example rewrites
+
+Two distinct pipeline stages rewrite the graph: **declutter** is
+target-independent (same regardless of which `Runtime` is going to
+run the result) and **lowering** is the per-target `codegen` step that
+turns high-level ops into the primitives a specific machine actually
+runs. The subsections below give a small, non-exhaustive sampling of
+each; the source of truth is the `*::declutter` and `*::codegen`
+methods on the relevant ops.
+
+### Declutter
+
+Decluttering goes in two directions at once: it **decomposes** some
+high-level ops into primitives, and it **fuses** recognisable chains
+of primitives back into one high-level op. The op set after declutter
+is different from the framework's source op set in both directions —
+worth knowing when reading a `dump`, because *"my model had N
+`LayerNorm` and I see zero of them"* is not a bug.
+
+Which way a given op goes is a pragmatic call driven by optimisation
+opportunity and operator prevalence. A pattern is fused when it is
+common enough to be worth its own kernel (`RmsNorm`, `Silu`,
+`GeluApproximate` — high prevalence in transformers, concentrated
+payoff from a dedicated implementation). A high-level op is decomposed
+when the smaller pieces compose better with neighbouring ops or expose
+optimisations the wrapping op would have hidden. The same
+`LayerNormalization` runs both ways: the inner RmsNorm gets its
+dedicated kernel, and the surrounding mean-subtract and γ/β affine are
+left as primitives that downstream rewrites can pick up.
+
+Decompositions (one upstream op → several primitives):
+
+- `MatMul` and `Gemm` → `EinSum`. tract has no first-class matmul op;
+  every framework-side matrix-multiply lands as an `EinSum` with an
+  axes spec (carried since `to_typed`, so by the time you see the
+  typed graph there is no `MatMul` node left).
+- `LayerNormalization` → `Sub(mean)` + `RmsNorm` (the inner
+  `rsqrt(mean(x²) + ε)` chain) + `Mul(γ)` + `Add(β)`. A transformer
+  with N `LayerNorm` shows *0 `LayerNorm` + N `RmsNorm`* in the dump,
+  with the surrounding `Sub` / `Mul` / `Add` ops still present — that
+  histogram is correct.
+- `AveragePool` → `SumPool(normalize = true)`.
+- `HardSigmoid` → a `Clip` / `Min` / `Max` chain.
+- `Resize` (nearest, integer scales) → `Reshape` → `AddAxis` →
+  `MultiBroadcastTo` → `Reshape` tile chain.
+- `Conv` on unit-batch input → the batch dim is peeled, inner convs
+  end up rank-3 `CHW` rather than rank-4 `NCHW`.
+
+Fusions (a primitive chain → one high-level op):
+
+- `Mul(rsqrt(reduce_mean(x²) + ε))` → `RmsNorm` — fires in
+  `Reduce::declutter`.
+- `x · sigmoid(x)` → `Silu` — fires in `Sigmoid::declutter`.
+- The GELU-approximate polynomial → `GeluApproximate` — chained
+  `declutter_pow` in `Pow::declutter`.
+
+### Lowering
+
+These run during `optimize()` and are per-target: the active `Runtime`
+decides what fires. The examples below are what `DefaultRuntime`
+(CPU) produces. Not material you usually audit from `dump`, but
+useful context when reading a profile or chasing a perf surprise.
+
+- `EinSum` lowers to one or more `OptMatMul`s once axis identities
+  resolve to concrete `M, K, N` patterns; the same `EinSum` op can
+  surface as several different shape signatures depending on the
+  surrounding declutter. This is the path every framework-side
+  matmul-shaped op (`MatMul`, `Gemm`, the linear inside `Conv`'s
+  im2col branches) eventually goes through.
+- `Conv` codegen (`core/src/ops/cnn/conv/conv.rs::codegen`) tries
+  four lowerings in order:
+  1. quantised im2col + matmul, if the op has `q_params`;
+  2. lazy im2col + matmul, if the input shape is concrete and the
+     kernel volume / scratch ceiling favour it (see
+     `TRACT_LAZY_IM2COL_*`);
+  3. direct depthwise, if `group != 1 && group == in_channels == out_channels`;
+  4. eager im2col + matmul otherwise.
+  Branches 1, 2, 4 all produce an `OptMatMul` over an im2col buffer;
+  branch 3 is the only non-im2col convolution.
+- `Scan` → `OptScan` keeps a persistent inner-body plan
+  (`model_state: TypedSimpleState`) across iterations, but each
+  iteration is a full `model_state.run(inputs)`: `set_inputs` →
+  `resolve_symbols_with_states` → `exec_plan` → `outputs` →
+  `reset_turn`. `reset_turn` clears `resolved_symbols`, so any
+  loop-invariant symbol is re-resolved on every step. Worth knowing
+  when profiling tight RNN / decoder loops.
diff --git a/doc/symbolic-shapes.md b/doc/symbolic-shapes.md
new file mode 100644
index 0000000000..b933a80864
--- /dev/null
+++ b/doc/symbolic-shapes.md
@@ -0,0 +1,200 @@
+# Symbolic shapes and `TDim`
+
+A `TypedFact`'s shape is a `Vec<TDim>`, not a `Vec<usize>`: dimensions
+that depend on a runtime input (batch size, sequence length, image
+side) live in the graph as symbolic expressions until something binds
+them to concrete values. This page is about that machinery.
+
+## What `TDim` is
+
+`TDim` (in `tract_data::dim`) is the algebraic data type tract uses
+for any dimension that might not be known at graph-build time. It is
+an enum of:
+
+- `Val(i64)` — a known integer.
+- `Sym(Symbol)` — a named symbolic atom (`B`, `S`, `T`, ...).
+- `Add(Vec<TDim>)` / `Mul(Vec<TDim>)` / `MulInt(i64, Box<TDim>)` /
+  `Div(Box<TDim>, u64)` — arithmetic.
+- `Min(Vec<TDim>)` / `Max(Vec<TDim>)` / `Broadcast(Vec<TDim>)` —
+  shape-aware reductions (see below).
+- `Ge(...)` / `Eq(...)` — comparison terms that evaluate to `0` or `1`,
+  used as boolean indicator terms inside larger expressions.
+
+So `(S+1)/2`, `B*C`, `max(L, 1)` all sit in the graph as `TDim` trees,
+and `output_facts` on every op produces output shapes in this algebra.
+The optimiser does proper algebra on it (proving `S >= 0`, recognising
+that `4*(S/4) + (S%4) == S`, etc.) so rewrites can fire even when the
+exact values aren't known.
+
+### Variants worth a closer look
+
+A few of the variants aren't what they look like, and getting their
+semantics wrong leads to either over-fitted optimisations or shape
+inference failures that look mysterious.
+
+- **`Broadcast(Vec<TDim>)`** is the dimension-wise broadcast rule from
+  NumPy / ONNX, lifted to symbolic shapes. `Broadcast(1, X) = X`,
+  `Broadcast(X, X) = X`, and `Broadcast(X, Y)` with both proven `> 1`
+  must have `X == Y` or the model is inconsistent. The simplifier
+  reduces what it can; what it can't goes into the optimised graph
+  as a literal `Broadcast` term. This is **not** the same as
+  `Max(X, Y)` — broadcast asserts compatibility, `Max` does not.
+  Op shape inference for elementwise binops produces `Broadcast`
+  per axis, never `Max`. In the TDim textual syntax (the format
+  `parse_tdim` and CLI `--set` accept) the binary operator is `#`, so
+  `S#1` parses to `Broadcast([S, 1])` (which simplifies to `S`). It
+  pretty-prints back as `broadcast((S), (1))`.
+
+- **`MulInt(i64, Box<TDim>)`** is the canonical form for "integer
+  scalar times a symbolic dimension". It is structurally distinct
+  from `Mul(Vec<TDim>)` even though `MulInt(2, x)` and
+  `Mul([Val(2), x])` denote the same value; the simplifier
+  canonicalises into `MulInt` whenever one factor is a known
+  integer, which makes pattern matching cheap.
+
+- **`Div(Box<TDim>, u64)`** is integer division by a known positive
+  divisor. The denominator must be a `u64` constant so the rest of
+  the simplifier can reason about it ("is this multiple of 4
+  recoverable from a `(_+3)/4` ceiling?"). There is no symbolic /
+  symbolic division.
+
+- **`Min` / `Max`** are honest min/max, used for clipped indexing and
+  for shapes that get clamped (`Slice` with bounds that may exceed
+  the dim, `MaxPool` with `ceil_mode = false`, etc.). Unlike
+  `Broadcast`, they don't assert anything about the inputs.
+
+- **`Ge(a, b)` / `Eq(a, b)`** evaluate to `Val(1)` if the relation
+  holds, `Val(0)` otherwise. They show up as multiplicative gates
+  in larger expressions — `Ge(end, start) * (end - start)` gives a
+  clamped non-negative length. The simplifier proves and discharges
+  many of them; the rest sit in the graph and resolve at run time.
+
+## Symbols and the scope
+
+A `Symbol` is an interned name. It lives in a `SymbolScope`, and every
+`TypedModel` carries its own:
+
+```rust
+let scope = &model.symbols;
+let s = scope.sym("S");                  // get-or-create
+let expr = scope.parse_tdim("4 * S + 1")?;
+```
+
+Two symbols with the same name from different scopes are different
+symbols. When you compare or substitute, the scope identity matters.
+`model.symbols` is the only one you should reach for in normal use.
+
+## Where symbols come from
+
+The model loaders create symbols on your behalf:
+
+- **ONNX**: a dynamic dim with a `dim_param` (e.g. `"batch_size"`) is
+  parsed into a symbol of the same name in the model's scope. A
+  dim_param of `"?"` or `"unk__N"` becomes an unknown without a name —
+  use `set_input_fact` to constrain it.
+- **NNEF / tract-OPL**: `dim_param` fragments in the textual graph
+  become symbols at load time. The OPL serialiser writes them back
+  out, so a round-tripped model keeps the same symbolic shape it had
+  before.
+- **Programmatic**: any code touching a model can call
+  `model.symbols.sym(name)` to create one and use the resulting
+  `Symbol` inside a `TDim` it builds.
+
+## Binding them from the library
+
+Two patterns, depending on whether the binding is fixed for the
+deployment or varies per call.
+
+**Bake the values in at build time** — replace symbols with constants
+(or with other symbols) and re-run the optimiser against the new
+shapes. The right call when the dim is a knob you set once at
+deployment time (input resolution, fixed batch):
+
+```rust
+use std::collections::HashMap;
+
+let s = model.symbols.sym("S");
+let b = model.symbols.sym("B");
+let mut subs = HashMap::new();
+subs.insert(s, 224.into());           // S = 224
+subs.insert(b, 1.into());             // B = 1
+let model = model.substitute_symbols(&subs)?;
+```
+
+After this, the shapes are pure `Val(_)` and downstream passes treat
+the model as fully concrete.
+
+**Let the runtime resolve at call time** — keep the symbols in the
+graph and just feed concrete-shaped tensors. The runtime matches each
+input tensor's actual shape against the symbolic input fact and
+records the symbol bindings for the run:
+
+```rust
+// Input fact is (B, 3, S, S, f32); inputs[0] is shape (1, 3, 224, 224).
+// The plan infers B=1, S=224 from the input, propagates through every
+// dependent shape, and runs.
+let outputs = runnable.run(tvec!(input.into()))?;
+```
+
+This is the right choice when the binding varies per call (dynamic
+batch, variable sequence length). The model's optimised form stays
+shared; only the per-call shape-resolution table changes.
+
+The two paths are not exclusive: bake the dimensions that never change
+(e.g. a known batch size in a server with fixed concurrency) and leave
+the rest symbolic for the runtime to bind.
+
+## Setting input facts on an `InferenceModel` (ONNX-only)
+
+This step lives on the `InferenceModel`, not on `Model`. It exists
+because ONNX expresses shape and type partially: a dim can be a known
+int, a named `dim_param`, **or simply absent** (`?` / `unk__N`), and
+element types can be missing from `graph.input` annotations too. NNEF
+does not have that problem — a NNEF model arrives already fully typed,
+so `tract::nnef().load(path)` returns a `Model` directly and there is
+nothing to set.
+
+For ONNX, pin whatever the loader left unresolved before
+`.into_model()`:
+
+```rust
+let mut m = tract::onnx()?.load(path)?;
+m.set_input_fact(0, "B,3,224,224,f32")?;     // names a symbol B
+let model = m.into_model()?;
+```
+
+The string form is parsed against `m`'s symbol scope, so `B` here is
+the same symbol the rest of the graph already references (whatever the
+ONNX `dim_param` named it). `into_model()` then runs analysis with the
+pinned shape and produces a typed `Model` in the same form a NNEF load
+would have given you directly.
+
+## CLI counterparts
+
+The CLI does the same things via flags:
+
+- `-i N,3,224,224,f32` — set an input fact (per-input).
+- `--set S=224 --set B=1` — bind symbols to constants. Equivalent to
+  `substitute_symbols` on the library side.
+- `--input-from-bundle io.npz` — derive concrete input shapes from
+  the tensors actually present in the bundle; runs both "set input
+  facts" and "concretise symbols" in one step. The library equivalent
+  is the `set_input_fact` + `substitute_symbols` pair above.
+
+## ONNX gotchas
+
+Some ONNX exports carry `value_info` annotations that contradict
+post-rewrite shapes, or output type annotations that disagree with
+what tract infers. Three escape hatches on the `Onnx` loader:
+
+```rust
+let onnx = tract::onnx()?
+    .with_ignore_output_shapes(true)?     // drop graph.output shape annotations
+    .with_ignore_output_types(true)?      // drop graph.output type annotations
+    .with_ignore_value_info(true)?;       // drop intermediate value_info
+```
+
+Which one bites depends on the exporter (and its version);
+`with_ignore_output_shapes(true)` is the most frequently useful when
+an export's `graph.output` shape annotations have not been kept in
+sync with the actual graph.
diff --git a/examples/causal_llm/Cargo.toml b/examples/causal_llm/Cargo.toml
index 3edb77c8c6..a952927546 100644
--- a/examples/causal_llm/Cargo.toml
+++ b/examples/causal_llm/Cargo.toml
@@ -27,6 +27,6 @@ rand.workspace = true
 tokenizers.workspace = true
 
 [target.'cfg(target_arch = "wasm32")'.dependencies]
-tokenizers = { version = "0.22", default-features = false, features = [
+tokenizers = { version = "0.23", default-features = false, features = [
   "unstable_wasm",
 ] }
diff --git a/examples/nemo-nemotron-asr/src/main.rs b/examples/nemo-nemotron-asr/src/main.rs
index defa022726..9de861874c 100644
--- a/examples/nemo-nemotron-asr/src/main.rs
+++ b/examples/nemo-nemotron-asr/src/main.rs
@@ -30,7 +30,7 @@ fn main() -> anyhow::Result<()> {
     let vocab = config.pointer("/joint/vocabulary").unwrap().as_array().unwrap();
     let vocab: Vec<&str> = vocab.iter().map(|v| v.as_str().unwrap()).collect();
 
-    let nnef = tract::nnef()?.with_tract_core()?.with_tract_transformers()?;
+    let nnef = tract::nnef()?.with_tract_transformers()?;
     let gpu = ["cuda", "metal", "default"]
         .iter()
         .find_map(|rt| tract::runtime_for_name(rt).ok())
diff --git a/examples/nemo-nemotron-streaming-asr/Cargo.toml b/examples/nemo-nemotron-streaming-asr/Cargo.toml
new file mode 100644
index 0000000000..d5835e35d1
--- /dev/null
+++ b/examples/nemo-nemotron-streaming-asr/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "nemo-nemotron-streaming-asr"
+version = "0.1.0"
+edition = "2024"
+
+[features]
+live = ["cpal"]
+
+[dependencies]
+anyhow.workspace = true
+clap.workspace = true
+cpal = { version = "0.17", optional = true }
+float-ord.workspace = true
+hound = "3.5.1"
+itertools.workspace = true
+ndarray.workspace = true
+serde_json.workspace = true
+tract.workspace = true
diff --git a/examples/nemo-nemotron-streaming-asr/ci.sh b/examples/nemo-nemotron-streaming-asr/ci.sh
new file mode 100755
index 0000000000..0ff1680fd6
--- /dev/null
+++ b/examples/nemo-nemotron-streaming-asr/ci.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -x
+
+[ -e .venv ] || python3 -m venv .venv
+source .venv/bin/activate
+
+pip install "nemo-toolkit[asr]" "torch_to_nnef[nemo_tract]"
+
+mkdir -p assets
+wget -qN https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -O assets/2086-149220-0033.wav
+rm -rf assets/model
+t2n_export_nemo -s nvidia/nemotron-speech-streaming-en-0.6b -e assets/model -tt skip --split-joint-decoder
+
+# Inject missing upper bound assertion into encoder model (~6.7min at 100Hz).
+# This is needed so that position-table bounds checks resolve during pulsification.
+enc_tgz=assets/model/encoder.nnef.tgz
+p1_tgz=assets/model/encoder.p1.nnef.tgz
+tmpdir=$(mktemp -d)
+tar xzf "$enc_tgz" -C "$tmpdir"
+sed -i '/^extension tract_symbol AUDIO_SIGNAL__TIME;/a extension tract_assert AUDIO_SIGNAL__TIME<=39993;' "$tmpdir/graph.nnef"
+tar czf "$p1_tgz" -C "$tmpdir" .
+rm -rf "$tmpdir"
+
+cargo run --release
+rm -rf assets
diff --git a/examples/nemo-nemotron-streaming-asr/src/main.rs b/examples/nemo-nemotron-streaming-asr/src/main.rs
new file mode 100644
index 0000000000..8f07c22a5d
--- /dev/null
+++ b/examples/nemo-nemotron-streaming-asr/src/main.rs
@@ -0,0 +1,540 @@
+use std::fs::File;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::sync::mpsc;
+use std::time::{Duration, Instant};
+
+use anyhow::*;
+use clap::Parser;
+#[cfg(feature = "live")]
+use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+use float_ord::FloatOrd;
+use itertools::Itertools;
+use ndarray::prelude::*;
+use tract::prelude::*;
+
+tract::impl_ndarray_interop!();
+
+/// Streaming ASR demo using nvidia/nemotron-speech-streaming-en-0.6b.
+///
+/// Transcribes audio incrementally using pulsified preprocessor + encoder
+/// on GPU with RNNT greedy decoding.
+///
+/// By default, reads from a WAV file with simulated real-time playback.
+/// Use --live to capture from the system microphone.
+#[derive(Parser)]
+struct Config {
+    /// Path to the model assets directory.
+    #[arg(long, default_value = "assets")]
+    assets: PathBuf,
+
+    /// WAV file to transcribe (16kHz mono PCM). Ignored if --live is set.
+    #[arg(long, default_value = "assets/2086-149220-0033.wav")]
+    wav: PathBuf,
+
+    /// Capture from the system microphone instead of a WAV file.
+    /// Requires the `live` feature: cargo run --features live -- --live
+    #[arg(long)]
+    live: bool,
+
+    /// Preprocessor pulse in audio samples. ~100ms = 1600 samples.
+    #[arg(long, default_value_t = 1600)]
+    preproc_pulse: usize,
+
+    /// Encoder pulse in feature frames. 14 token chunks * 8x subsampling = 112.
+    #[arg(long, default_value_t = 112)]
+    encoder_pulse: usize,
+
+    /// Do not simulate real-time playback (WAV mode only: process as fast as possible).
+    #[arg(long)]
+    no_realtime: bool,
+}
+
+fn argmax(slice: &[f32]) -> Option<usize> {
+    slice.into_iter().position_max_by_key(|x| FloatOrd(**x))
+}
+
+fn fact_shape(f: &Fact) -> anyhow::Result<Vec<usize>> {
+    (0..f.rank()?).map(|a| f.dim(a).and_then(|d| d.to_int64()).map(|v| v as usize)).collect()
+}
+
+// ─── Shared read-only model context ─────────────────────────────────────────
+
+struct NemotronModels {
+    config: Config,
+    preprocessor: Runnable,
+    encoder: Runnable,
+    decoder: Runnable,
+    joint: Runnable,
+    vocab: Vec<String>,
+    blank_id: usize,
+    pp_delay: usize,
+    pp_out_axis: usize,
+    pp_out_pulse: usize,
+    pp_input_shape: Vec<usize>,
+    enc_delay: usize,
+    enc_output_axis: usize,
+    enc_output_pulse: usize,
+    enc_input_shape: Vec<usize>,
+}
+
+impl NemotronModels {
+    fn load(config: Config) -> anyhow::Result<(Arc<Self>, Duration)> {
+        let t0 = Instant::now();
+        let assets = config.assets.display();
+
+        let model_config: serde_json::Value =
+            serde_json::from_reader(File::open(format!("{assets}/model/model_config.json"))?)?;
+        let blank_id =
+            model_config.pointer("/decoder/vocab_size").unwrap().as_i64().unwrap() as usize;
+        let vocab: Vec<String> = model_config
+            .pointer("/joint/vocabulary")
+            .unwrap()
+            .as_array()
+            .unwrap()
+            .iter()
+            .map(|v| v.as_str().unwrap().to_owned())
+            .collect();
+
+        let nnef = tract::nnef()?.with_tract_transformers()?;
+        let runtime = ["cuda", "metal", "default"]
+            .iter()
+            .find_map(|rt| tract::runtime_for_name(rt).ok())
+            .unwrap();
+
+        eprint!("Loading preprocessor to {}...", runtime.name()?);
+        let mut pp = nnef.load(format!("{assets}/model/preprocessor.nnef.tgz"))?;
+        pp.transform(ConcretizeSymbols::new().value("BATCH", 1))?;
+        pp.transform(
+            r#"{"name":"patch","body":"length = tract_core_shape_of(input_signal)[1];"}"#,
+        )?;
+        pp.transform(r#"{"name":"select_outputs","outputs":["processed_signal"]}"#)?;
+        pp.transform(Pulse::new(config.preproc_pulse.to_string()).symbol("INPUT_SIGNAL__TIME"))?;
+        let pp_delay = pp.property("pulse.delay")?.as_slice::<i64>()?[0].to_owned() as usize;
+        let pp_out_axis =
+            pp.property("pulse.output_axes")?.as_slice::<i64>()?[0].to_owned() as usize;
+        let pp_out_pulse = pp.output_fact(0)?.dim(pp_out_axis)?.to_int64()? as usize;
+        let pp_input_shape = fact_shape(&pp.input_fact(0)?)?;
+        let preprocessor = runtime.prepare(pp)?;
+        eprintln!(" done.");
+
+        eprint!("Loading encoder to {}...", runtime.name()?);
+        let mut enc = nnef.load(format!("{assets}/model/encoder.p1.nnef.tgz"))?;
+        enc.transform(ConcretizeSymbols::new().value("BATCH", 1))?;
+        enc.transform("transformers_detect_all")?;
+        enc.transform(
+            r#"{"name":"patch","body":"length = tract_core_shape_of(audio_signal)[2];"}"#,
+        )?;
+        enc.transform(r#"{"name":"select_outputs","outputs":["outputs"]}"#)?;
+        enc.transform(Pulse::new(config.encoder_pulse.to_string()).symbol("AUDIO_SIGNAL__TIME"))?;
+        let enc_delay = enc.property("pulse.delay")?.as_slice::<i64>()?[0].to_owned() as usize;
+        let enc_output_axis =
+            enc.property("pulse.output_axes")?.as_slice::<i64>()?[0].to_owned() as usize;
+        let enc_output_pulse = enc.output_fact(0)?.dim(enc_output_axis)?.to_int64()? as usize;
+        let enc_input_shape = fact_shape(&enc.input_fact(0)?)?;
+        let encoder = runtime.prepare(enc)?;
+        eprintln!(" done.");
+
+        eprint!("Loading decoder to {}...", runtime.name()?);
+        let mut dec = nnef.load(format!("{assets}/model/decoder.nnef.tgz"))?;
+        dec.transform(ConcretizeSymbols::new().value("BATCH", 1).value("TARGETS__TIME", 1))?;
+        let decoder = runtime.prepare(dec)?;
+        eprintln!(" done.");
+
+        eprint!("Loading joint to {}...", runtime.name()?);
+        let mut jnt = nnef.load(format!("{assets}/model/joint.nnef.tgz"))?;
+        jnt.transform(
+            ConcretizeSymbols::new()
+                .value("BATCH", 1)
+                .value("ENCODER_OUTPUTS__TIME", 1)
+                .value("DECODER_OUTPUTS__TIME", 1),
+        )?;
+        let joint = runtime.prepare(jnt)?;
+        eprintln!(" done.");
+
+        let load_time = t0.elapsed();
+        eprintln!("Ready ({:.1}s)", load_time.as_secs_f64());
+
+        Ok((
+            Arc::new(Self {
+                config,
+                preprocessor,
+                encoder,
+                decoder,
+                joint,
+                vocab,
+                blank_id,
+                pp_delay,
+                pp_out_axis,
+                pp_out_pulse,
+                pp_input_shape,
+                enc_delay,
+                enc_output_axis,
+                enc_output_pulse,
+                enc_input_shape,
+            }),
+            load_time,
+        ))
+    }
+
+    fn spawn(self: &Arc<Self>) -> anyhow::Result<StreamState> {
+        StreamState::new(Arc::clone(self))
+    }
+}
+
+// ─── Mutable streaming state ────────────────────────────────────────────────
+
+struct StreamState {
+    models: Arc<NemotronModels>,
+    preproc: State,
+    encoder: State,
+    dec_token: Tensor,
+    dec_state_0: Tensor,
+    dec_state_1: Tensor,
+    audio_buf: Vec<f32>,
+    audio_consumed: usize,
+    feat_buf: Vec<ArrayD<f32>>,
+    feat_buf_frames: usize,
+    pp_delay_remaining: usize,
+    enc_delay_remaining: usize,
+    hyp: Vec<usize>,
+    pulse_count: usize,
+    total_preproc: Duration,
+    total_encoder: Duration,
+    total_joint: Duration,
+    total_decoder: Duration,
+    n_preproc: usize,
+    n_encoder: usize,
+    n_joint: usize,
+    n_decoder: usize,
+}
+
+impl StreamState {
+    fn new(models: Arc<NemotronModels>) -> anyhow::Result<Self> {
+        let preproc = models.preprocessor.spawn_state()?;
+        let encoder = models.encoder.spawn_state()?;
+
+        let blank_tok = Tensor::from_slice(&[1, 1], &[models.blank_id as i32])?;
+        let s0 = Array3::<f32>::zeros([2, 1, 640]).tract()?;
+        let s1 = Array3::<f32>::zeros([2, 1, 640]).tract()?;
+        let [_out, s0, s1] = models.decoder.run([blank_tok.clone(), s0, s1])?.try_into().unwrap();
+        let [dec_token, dec_state_0, dec_state_1] =
+            models.decoder.run([blank_tok, s0, s1])?.try_into().unwrap();
+
+        Ok(Self {
+            pp_delay_remaining: models.pp_delay,
+            enc_delay_remaining: models.enc_delay,
+            models,
+            preproc,
+            encoder,
+            dec_token,
+            dec_state_0,
+            dec_state_1,
+            audio_buf: Vec::new(),
+            audio_consumed: 0,
+            feat_buf: Vec::new(),
+            feat_buf_frames: 0,
+            hyp: Vec::new(),
+            pulse_count: 0,
+            total_preproc: Duration::ZERO,
+            total_encoder: Duration::ZERO,
+            total_joint: Duration::ZERO,
+            total_decoder: Duration::ZERO,
+            n_preproc: 0,
+            n_encoder: 0,
+            n_joint: 0,
+            n_decoder: 0,
+        })
+    }
+
+    fn show(&self, label: &str) {
+        let vocab: Vec<&str> = self.models.vocab.iter().map(|s| s.as_str()).collect();
+        let text: String = self.hyp.iter().map(|&t| vocab[t]).join("");
+        let display = text.replace('▁', " ");
+        eprint!("\r{} {label}          ", display.trim_start());
+    }
+
+    fn push_audio(&mut self, samples: &[f32]) -> anyhow::Result<()> {
+        self.audio_buf.extend_from_slice(samples);
+        self.show("");
+        let preproc_pulse = self.models.config.preproc_pulse;
+        while self.audio_consumed + preproc_pulse <= self.audio_buf.len() {
+            let start = self.audio_consumed;
+            let end = start + preproc_pulse;
+            let pp_input =
+                Tensor::from_slice(&self.models.pp_input_shape, &self.audio_buf[start..end])?;
+            self.audio_consumed = end;
+            self.run_preproc(pp_input)?;
+        }
+        Ok(())
+    }
+
+    fn flush(&mut self) -> anyhow::Result<()> {
+        let remaining = self.audio_buf.len() - self.audio_consumed;
+        if remaining > 0 {
+            let mut data = vec![0.0f32; self.models.pp_input_shape.iter().product()];
+            data[..remaining].copy_from_slice(&self.audio_buf[self.audio_consumed..]);
+            let pp_input = Tensor::from_slice(&self.models.pp_input_shape, &data)?;
+            self.run_preproc(pp_input)?;
+        }
+        if self.feat_buf_frames > 0 {
+            let refs: Vec<_> = self.feat_buf.iter().map(|a| a.view()).collect();
+            let leftover = ndarray::concatenate(Axis(self.models.pp_out_axis), &refs)?;
+            let s = &self.models.enc_input_shape;
+            let mut enc_input = Array3::<f32>::zeros((s[0], s[1], s[2]));
+            let n = leftover.shape()[self.models.pp_out_axis].min(s[2]);
+            enc_input.slice_mut(s![.., .., ..n]).assign(&leftover.slice(s![.., .., ..n]));
+            self.run_encoder_pulse(enc_input.into_dyn())?;
+        }
+        Ok(())
+    }
+
+    fn transcript(&self) -> String {
+        let vocab: Vec<&str> = self.models.vocab.iter().map(|s| s.as_str()).collect();
+        self.hyp.iter().map(|&t| vocab[t]).join("")
+    }
+
+    fn run_preproc(&mut self, input: Tensor) -> anyhow::Result<()> {
+        self.show("[pre]");
+        let t = Instant::now();
+        let results = self.preproc.run([input])?;
+        self.total_preproc += t.elapsed();
+        self.n_preproc += 1;
+        let features: ArrayD<f32> = results[0].ndarray()?.into_owned();
+        self.feed_features(features)
+    }
+
+    fn feed_features(&mut self, features: ArrayD<f32>) -> anyhow::Result<()> {
+        let pp_out_pulse = self.models.pp_out_pulse;
+        let pp_out_axis = self.models.pp_out_axis;
+        let encoder_pulse = self.models.config.encoder_pulse;
+        let usable_start = self.pp_delay_remaining.min(pp_out_pulse);
+        self.pp_delay_remaining = self.pp_delay_remaining.saturating_sub(pp_out_pulse);
+        if usable_start >= pp_out_pulse {
+            return Ok(());
+        }
+        let usable = features.slice_axis(Axis(pp_out_axis), (usable_start..pp_out_pulse).into());
+        self.feat_buf_frames += usable.shape()[pp_out_axis];
+        self.feat_buf.push(usable.to_owned());
+
+        while self.feat_buf_frames >= encoder_pulse {
+            let refs: Vec<_> = self.feat_buf.iter().map(|a| a.view()).collect();
+            let all = ndarray::concatenate(Axis(pp_out_axis), &refs)?;
+            let enc_feat = all.slice_axis(Axis(pp_out_axis), (..encoder_pulse).into()).to_owned();
+            self.run_encoder_pulse(enc_feat)?;
+            let leftover = all.slice_axis(Axis(pp_out_axis), (encoder_pulse..).into());
+            self.feat_buf_frames -= encoder_pulse;
+            self.feat_buf.clear();
+            if self.feat_buf_frames > 0 {
+                self.feat_buf.push(leftover.to_owned());
+            }
+        }
+        Ok(())
+    }
+
+    fn run_encoder_pulse(&mut self, features: ArrayD<f32>) -> anyhow::Result<()> {
+        self.show("[enc]");
+        let t = Instant::now();
+        let pulse_tensor: Tensor = features.tract()?;
+        let results = self.encoder.run([pulse_tensor])?;
+        self.total_encoder += t.elapsed();
+        self.n_encoder += 1;
+        let enc_out: ArrayD<f32> = results[0].ndarray()?.into_owned();
+        self.pulse_count += 1;
+        for f in 0..self.models.enc_output_pulse {
+            if self.enc_delay_remaining > 0 {
+                self.enc_delay_remaining -= 1;
+                continue;
+            }
+            let frame: Tensor =
+                enc_out.slice_axis(Axis(self.models.enc_output_axis), (f..f + 1).into()).tract()?;
+            self.decode_frame(frame)?;
+        }
+        Ok(())
+    }
+
+    fn decode_frame(&mut self, frame: Tensor) -> anyhow::Result<()> {
+        let mut tokens_this_frame = 0usize;
+        loop {
+            self.show("[jnt]");
+            let t = Instant::now();
+            let [logits] =
+                self.models.joint.run([frame.clone(), self.dec_token.clone()])?.try_into().unwrap();
+            self.total_joint += t.elapsed();
+            self.n_joint += 1;
+            let logits_slice = logits.as_slice::<f32>()?;
+            let token_id = argmax(logits_slice).unwrap();
+            if token_id == self.models.blank_id {
+                break;
+            }
+            self.hyp.push(token_id);
+            tokens_this_frame += 1;
+            self.show("[dec]");
+            let t = Instant::now();
+            let tok = Tensor::from_slice(&[1, 1], &[token_id as i32])?;
+            [self.dec_token, self.dec_state_0, self.dec_state_1] = self
+                .models
+                .decoder
+                .run([tok, self.dec_state_0.clone(), self.dec_state_1.clone()])?
+                .try_into()
+                .unwrap();
+            self.total_decoder += t.elapsed();
+            self.n_decoder += 1;
+            if tokens_this_frame >= 10 {
+                break;
+            }
+        }
+        Ok(())
+    }
+
+    fn print_stats(&self, load_time: Duration, stream_time: Duration, audio_duration: f64) {
+        eprintln!();
+        eprintln!("--- stats ---");
+        eprintln!("model load:    {:.1}s", load_time.as_secs_f64());
+        eprintln!("audio:         {:.2}s", audio_duration);
+        eprintln!(
+            "stream wall:   {:.2}s ({:.1}x real-time)",
+            stream_time.as_secs_f64(),
+            audio_duration / stream_time.as_secs_f64()
+        );
+        eprintln!("pulses:        {}", self.pulse_count);
+        let stats = [
+            ("preprocessor", self.total_preproc, self.n_preproc),
+            ("encoder", self.total_encoder, self.n_encoder),
+            ("joint", self.total_joint, self.n_joint),
+            ("decoder", self.total_decoder, self.n_decoder),
+        ];
+        for (name, total, n) in &stats {
+            if *n > 0 {
+                eprintln!(
+                    "{name:14} {:.1}ms total, {:.2}ms/call ({n} calls)",
+                    total.as_secs_f64() * 1000.0,
+                    total.as_secs_f64() * 1000.0 / *n as f64,
+                );
+            }
+        }
+        let compute =
+            self.total_preproc + self.total_encoder + self.total_joint + self.total_decoder;
+        eprintln!(
+            "compute total: {:.1}ms ({:.1}x real-time)",
+            compute.as_secs_f64() * 1000.0,
+            audio_duration / compute.as_secs_f64()
+        );
+    }
+}
+
+// ─── Main ───────────────────────────────────────────────────────────────────
+
+/// Start a WAV file audio source: reads samples in chunks with optional real-time pacing.
+fn start_wav_source(
+    path: &PathBuf,
+    realtime: bool,
+) -> anyhow::Result<(mpsc::Receiver<Vec<f32>>, std::thread::JoinHandle<()>, f64)> {
+    let wav: Vec<f32> = hound::WavReader::open(path)?
+        .samples::<i16>()
+        .map(|x| x.unwrap() as f32 / 32768.0)
+        .collect();
+    let audio_duration = wav.len() as f64 / 16000.0;
+    let total_samples = wav.len();
+    let audio_chunk = 80; // 5ms chunks
+
+    let (tx, rx) = mpsc::sync_channel::<Vec<f32>>(4);
+    let handle = std::thread::Builder::new().name("wav".into()).spawn(move || {
+        let mut offset = 0;
+        while offset < total_samples {
+            let end = (offset + audio_chunk).min(total_samples);
+            if tx.send(wav[offset..end].to_vec()).is_err() {
+                break;
+            }
+            offset = end;
+            if realtime {
+                std::thread::sleep(Duration::from_secs_f64(audio_chunk as f64 / 16000.0));
+            }
+        }
+    })?;
+    Ok((rx, handle, audio_duration))
+}
+
+/// Start a live microphone audio source via cpal.
+#[cfg(feature = "live")]
+fn start_live_source() -> anyhow::Result<(mpsc::Receiver<Vec<f32>>, cpal::Stream)> {
+    let host = cpal::default_host();
+    let device = host.default_input_device().context("no input device")?;
+    eprintln!("Microphone: {}", device.name()?);
+
+    let target_config = cpal::StreamConfig {
+        channels: 1,
+        sample_rate: cpal::SampleRate(16000),
+        buffer_size: cpal::BufferSize::Default,
+    };
+
+    let (tx, rx) = mpsc::sync_channel::<Vec<f32>>(8);
+    let stream = device.build_input_stream(
+        &target_config,
+        move |data: &[f32], _: &cpal::InputCallbackInfo| {
+            let _ = tx.send(data.to_vec());
+        },
+        |err| eprintln!("audio error: {err}"),
+        None,
+    )?;
+    stream.play()?;
+    Ok((rx, stream))
+}
+
+fn main() -> anyhow::Result<()> {
+    let config = Config::parse();
+    let live = config.live;
+    let wav_path = config.wav.clone();
+    let no_realtime = config.no_realtime;
+
+    let (models, load_time) = NemotronModels::load(config)?;
+    let mut state = models.spawn()?;
+
+    if live {
+        #[cfg(not(feature = "live"))]
+        anyhow::bail!("--live requires the `live` feature: cargo run --features live -- --live");
+
+        #[cfg(feature = "live")]
+        {
+            let (audio_rx, _stream) = start_live_source()?;
+            eprintln!("Listening... (press Ctrl-C to stop)\n");
+
+            let stream_start = Instant::now();
+            for chunk in audio_rx {
+                state.push_audio(&chunk)?;
+            }
+            state.flush()?;
+            let stream_time = stream_start.elapsed();
+            let audio_duration = stream_time.as_secs_f64();
+
+            let transcript = state.transcript();
+            let display = transcript.replace('▁', " ");
+            eprint!("\r{}                    \n", display.trim_start());
+            state.print_stats(load_time, stream_time, audio_duration);
+        }
+    } else {
+        // ── WAV file mode ───────────────────────────────────────────
+        let (audio_rx, mic_handle, audio_duration) = start_wav_source(&wav_path, !no_realtime)?;
+
+        let stream_start = Instant::now();
+        for chunk in audio_rx {
+            state.push_audio(&chunk)?;
+        }
+        state.flush()?;
+        let stream_time = stream_start.elapsed();
+
+        mic_handle.join().unwrap();
+
+        let transcript = state.transcript();
+        let display = transcript.replace('▁', " ");
+        eprint!("\r{}                    \n", display.trim_start());
+
+        state.print_stats(load_time, stream_time, audio_duration);
+
+        let expected = "▁well▁I▁don't▁wish▁to▁see▁it▁any▁more▁observed▁Phoebe,▁turning▁away▁her▁eyes.▁It▁is▁certainly▁very▁like▁the▁old▁portrait";
+        if transcript != expected {
+            eprintln!("\nNOTE: streaming transcript differs slightly from batch reference");
+        }
+    }
+    Ok(())
+}
diff --git a/examples/nemo-parakeet-asr/src/main.rs b/examples/nemo-parakeet-asr/src/main.rs
index 435ca2f9a3..f152f059ad 100644
--- a/examples/nemo-parakeet-asr/src/main.rs
+++ b/examples/nemo-parakeet-asr/src/main.rs
@@ -19,7 +19,7 @@ fn main() -> anyhow::Result<()> {
     let vocab = config.pointer("/joint/vocabulary").unwrap().as_array().unwrap();
     let vocab: Vec<&str> = vocab.iter().map(|v| v.as_str().unwrap()).collect();
 
-    let nnef = tract::nnef()?.with_tract_core()?.with_tract_transformers()?;
+    let nnef = tract::nnef()?.with_tract_transformers()?;
     let gpu = ["cuda", "metal", "default"]
         .iter()
         .find_map(|rt| tract::runtime_for_name(rt).ok())
diff --git a/examples/nnef-dump-mobilenet-v2/README.md b/examples/nnef-dump-mobilenet-v2/README.md
index 387066cc8d..dc48942424 100644
--- a/examples/nnef-dump-mobilenet-v2/README.md
+++ b/examples/nnef-dump-mobilenet-v2/README.md
@@ -54,7 +54,7 @@ use tract::prelude::*;
 tract::impl_ndarray_interop!();
 
 fn main() -> Result<()> {
-    let model = tract::nnef()?.with_tract_core()?.load("mobilenet.nnef.tgz")?.into_runnable()?;
+    let model = tract::nnef()?.load("mobilenet.nnef.tgz")?.into_runnable()?;
 
     // open image, resize it and make a Tensor out of it
     let image = image::open("grace_hopper.jpg").unwrap().to_rgb8();
diff --git a/examples/nnef-dump-mobilenet-v2/src/main.rs b/examples/nnef-dump-mobilenet-v2/src/main.rs
index 724852bb1e..14c4efe1d8 100644
--- a/examples/nnef-dump-mobilenet-v2/src/main.rs
+++ b/examples/nnef-dump-mobilenet-v2/src/main.rs
@@ -4,7 +4,7 @@ use tract::prelude::*;
 tract::impl_ndarray_interop!();
 
 fn main() -> Result<()> {
-    let model = tract::nnef()?.with_tract_core()?.load("mobilenet.nnef.tgz")?.into_runnable()?;
+    let model = tract::nnef()?.load("mobilenet.nnef.tgz")?.into_runnable()?;
 
     // open image, resize it and make a Tensor out of it
     let image = image::open("grace_hopper.jpg").unwrap().to_rgb8();
diff --git a/examples/stable-diffusion-3/Cargo.toml b/examples/stable-diffusion-3/Cargo.toml
index fdbb148d15..fb178c7ac0 100644
--- a/examples/stable-diffusion-3/Cargo.toml
+++ b/examples/stable-diffusion-3/Cargo.toml
@@ -11,5 +11,5 @@ kdam = "0.6"
 ndarray.workspace = true
 rand = "0.10"
 rand_distr = "0.6"
-tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
+tokenizers = { version = "0.23", default-features = false, features = ["onig"] }
 tract.workspace = true
diff --git a/examples/stable-diffusion-xl/Cargo.toml b/examples/stable-diffusion-xl/Cargo.toml
index ab993e74ac..454afa6fee 100644
--- a/examples/stable-diffusion-xl/Cargo.toml
+++ b/examples/stable-diffusion-xl/Cargo.toml
@@ -11,5 +11,5 @@ kdam = "0.6"
 ndarray.workspace = true
 rand = "0.10"
 rand_distr = "0.6"
-tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
+tokenizers = { version = "0.23", default-features = false, features = ["onig"] }
 tract.workspace = true
diff --git a/examples/stable-diffusion/Cargo.toml b/examples/stable-diffusion/Cargo.toml
index e8460dc615..d11973b108 100644
--- a/examples/stable-diffusion/Cargo.toml
+++ b/examples/stable-diffusion/Cargo.toml
@@ -12,5 +12,5 @@ ndarray.workspace = true
 ndarray-npy.workspace = true
 rand = "0.10"
 rand_distr = "0.6"
-tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
+tokenizers = { version = "0.23", default-features = false, features = ["onig"] }
 tract.workspace = true
diff --git a/examples/wasm-model-bench/Cargo.toml b/examples/wasm-model-bench/Cargo.toml
new file mode 100644
index 0000000000..0a05e96a64
--- /dev/null
+++ b/examples/wasm-model-bench/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "wasm-model-bench"
+version = "0.1.0"
+license = "MIT OR Apache-2.0"
+edition = "2024"
+
+[lib]
+path = "src/lib.rs"
+
+[[bin]]
+name = "bench-onnx"
+path = "src/bench_onnx.rs"
+
+[[bin]]
+name = "bench-nnef"
+path = "src/bench_nnef.rs"
+
+[dependencies]
+anyhow.workspace = true
+tract-nnef.workspace = true
+tract-onnx.workspace = true
diff --git a/examples/wasm-model-bench/MMM_MACRO_ATTRIBUTION.pdf b/examples/wasm-model-bench/MMM_MACRO_ATTRIBUTION.pdf
new file mode 100644
index 0000000000..5d98322303
Binary files /dev/null and b/examples/wasm-model-bench/MMM_MACRO_ATTRIBUTION.pdf differ
diff --git a/examples/wasm-model-bench/src/bench_nnef.rs b/examples/wasm-model-bench/src/bench_nnef.rs
new file mode 100644
index 0000000000..c42a1ad871
--- /dev/null
+++ b/examples/wasm-model-bench/src/bench_nnef.rs
@@ -0,0 +1,29 @@
+//! Bench an NNEF model. Loads via tract-nnef, runs N timed reps, reports
+//! min/median/max/spread.
+
+use anyhow::Result;
+use tract_nnef::prelude::*;
+
+fn main() -> Result<()> {
+    let mut args = std::env::args();
+    let _prog = args.next();
+    let model_path = args.next().ok_or_else(|| {
+        anyhow::anyhow!("usage: bench-nnef <model.nnef.tgz> [warmup] [timed] [reps]")
+    })?;
+    let warmup_iters: usize = args.next().map(|s| s.parse()).transpose()?.unwrap_or(20);
+    let timed_iters: usize = args.next().map(|s| s.parse()).transpose()?.unwrap_or(50);
+    let repetitions: usize = args.next().map(|s| s.parse()).transpose()?.unwrap_or(10);
+
+    let model =
+        tract_nnef::nnef().model_for_path(&model_path)?.into_optimized()?.into_runnable()?;
+
+    let inputs = wasm_model_bench::build_zero_inputs(&model)?;
+
+    let samples =
+        wasm_model_bench::run_bench(&model, &inputs, warmup_iters, timed_iters, repetitions)?;
+    wasm_model_bench::print_stats(&model_path, &samples);
+    if std::env::var("TRACT_BENCH_QUALITY").ok().as_deref() == Some("1") {
+        wasm_model_bench::run_quality_check(&model, &inputs)?;
+    }
+    Ok(())
+}
diff --git a/examples/wasm-model-bench/src/bench_onnx.rs b/examples/wasm-model-bench/src/bench_onnx.rs
new file mode 100644
index 0000000000..906d35154b
--- /dev/null
+++ b/examples/wasm-model-bench/src/bench_onnx.rs
@@ -0,0 +1,87 @@
+//! Bench an ONNX model.
+//!
+//! Usage: bench-onnx <model.onnx> [<shape_spec>|-] [warmup] [timed] [reps]
+//!
+//!   <shape_spec> can be a single per-input fact like "1,3,224,224,f32" or
+//!   multi-input with ";" separators: "1,1,100,32,f32;1,2,100,96,f32".
+//!   Use "-" or empty to skip override for that input.
+//!
+//! Optional env var TRACT_BENCH_SYMBOLS="S=100,BATCH=1" applies global
+//! symbol concretization for models whose internal nodes still reference
+//! symbols after input facts are set.
+
+use anyhow::{Result, anyhow};
+use tract_onnx::prelude::*;
+
+fn parse_fact_spec(spec: &str) -> Result<InferenceFact> {
+    let parts: Vec<&str> = spec.split(',').collect();
+    if parts.len() < 2 {
+        anyhow::bail!("shape spec needs at least 'shape,dtype': {spec}");
+    }
+    let dt_str = parts.last().unwrap();
+    let dt = match *dt_str {
+        "f32" => DatumType::F32,
+        "f16" => DatumType::F16,
+        "i64" => DatumType::I64,
+        "i32" => DatumType::I32,
+        "u8" => DatumType::U8,
+        s => return Err(anyhow!("unsupported dtype: {s}")),
+    };
+    let shape: Vec<usize> = parts[..parts.len() - 1]
+        .iter()
+        .map(|s| s.parse::<usize>().map_err(anyhow::Error::from))
+        .collect::<Result<_>>()?;
+    Ok(InferenceFact::dt_shape(dt, shape))
+}
+
+fn main() -> Result<()> {
+    let mut args = std::env::args();
+    let _prog = args.next();
+    let model_path = args.next().ok_or_else(|| {
+        anyhow!("usage: bench-onnx <model.onnx> [<shape_spec>|-] [warmup] [timed] [reps]")
+    })?;
+    let shape_spec = args.next();
+    let warmup_iters: usize = args.next().map(|s| s.parse()).transpose()?.unwrap_or(20);
+    let timed_iters: usize = args.next().map(|s| s.parse()).transpose()?.unwrap_or(50);
+    let repetitions: usize = args.next().map(|s| s.parse()).transpose()?.unwrap_or(10);
+
+    let mut model = tract_onnx::onnx().model_for_path(&model_path)?;
+    let symbols_env = std::env::var("TRACT_BENCH_SYMBOLS").ok().filter(|s| !s.is_empty());
+
+    // When symbols are provided, the model is symbolic-shaped and we go straight
+    // to TypedModel → substitute_symbols, ignoring shape_spec (input shapes will
+    // be derived from the symbol substitution). When no symbols, we use the
+    // shape_spec to pin input facts on the InferenceModel before into_typed.
+    let typed = if let Some(symbols_str) = symbols_env {
+        let typed = model.into_typed()?;
+        let mut subs = std::collections::HashMap::new();
+        for kv in symbols_str.split(',') {
+            let (k, v) = kv.split_once('=').ok_or_else(|| anyhow!("bad symbol: {kv}"))?;
+            let sym = typed.symbols.sym(k);
+            subs.insert(sym, TDim::Val(v.parse::<i64>()?));
+        }
+        typed.substitute_symbols(&subs)?
+    } else {
+        if let Some(spec) = shape_spec.as_deref().filter(|s| *s != "-") {
+            for (i, one) in spec.split(';').enumerate() {
+                if one.is_empty() || one == "-" {
+                    continue;
+                }
+                let fact = parse_fact_spec(one)?;
+                model.set_input_fact(i, fact)?;
+            }
+        }
+        model.into_typed()?
+    };
+    let model = typed.into_optimized()?.into_runnable()?;
+
+    let inputs = wasm_model_bench::build_zero_inputs(&model)?;
+
+    let samples =
+        wasm_model_bench::run_bench(&model, &inputs, warmup_iters, timed_iters, repetitions)?;
+    wasm_model_bench::print_stats(&model_path, &samples);
+    if std::env::var("TRACT_BENCH_QUALITY").ok().as_deref() == Some("1") {
+        wasm_model_bench::run_quality_check(&model, &inputs)?;
+    }
+    Ok(())
+}
diff --git a/examples/wasm-model-bench/src/lib.rs b/examples/wasm-model-bench/src/lib.rs
new file mode 100644
index 0000000000..9500c2329e
--- /dev/null
+++ b/examples/wasm-model-bench/src/lib.rs
@@ -0,0 +1,99 @@
+//! Shared bench harness for WASM E2E model timing.
+
+use anyhow::Result;
+use std::sync::Arc;
+use std::time::Instant;
+use tract_nnef::internal::DimLike;
+use tract_nnef::prelude::*;
+
+pub type Runnable = Arc<TypedSimplePlan>;
+
+pub fn build_zero_inputs(model: &Runnable) -> Result<TVec<TValue>> {
+    let mut inputs = tvec![];
+    let typed = model.model();
+    for &outlet in typed.input_outlets()?.iter() {
+        let fact = typed.outlet_fact(outlet)?;
+        let shape: Vec<usize> =
+            fact.shape.iter().map(|d| d.to_usize()).collect::<TractResult<Vec<_>>>()?;
+        let dt = fact.datum_type;
+        let tensor = Tensor::zero_dt(dt, &shape)?;
+        inputs.push(tensor.into_tvalue());
+    }
+    Ok(inputs)
+}
+
+pub fn run_bench(
+    model: &Runnable,
+    inputs: &TVec<TValue>,
+    warmup_iters: usize,
+    timed_iters: usize,
+    repetitions: usize,
+) -> Result<Vec<f64>> {
+    for _ in 0..warmup_iters {
+        let _ = model.run(inputs.clone())?;
+    }
+
+    let mut samples = Vec::with_capacity(repetitions);
+    for _ in 0..repetitions {
+        let t0 = Instant::now();
+        for _ in 0..timed_iters {
+            let _ = model.run(inputs.clone())?;
+        }
+        let elapsed = t0.elapsed();
+        let ns_per_call = elapsed.as_secs_f64() / timed_iters as f64 * 1e9;
+        samples.push(ns_per_call);
+    }
+    samples.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    Ok(samples)
+}
+
+/// Print quality metrics for the model's output(s). Use after bench. With
+/// fixed-shape models and a deterministic input (zeros), running this with
+/// baseline vs relaxed-simd builds and comparing outputs is the quality
+/// regression check (FMA gives ~1 ulp drift; mul+add is bit-stable).
+pub fn run_quality_check(model: &Runnable, inputs: &TVec<TValue>) -> Result<()> {
+    let outputs = model.run(inputs.clone())?;
+    for (i, out) in outputs.iter().enumerate() {
+        let dt = out.datum_type();
+        let shape = out.shape();
+        if dt == DatumType::F32 {
+            let tensor: &Tensor = &*out;
+            let slice: &[f32] = unsafe { tensor.as_slice_unchecked::<f32>() };
+            let n = slice.len();
+            let l2: f64 = slice.iter().map(|x| (*x as f64) * (*x as f64)).sum::<f64>().sqrt();
+            let mean: f64 = slice.iter().map(|x| *x as f64).sum::<f64>() / n as f64;
+            let preview: Vec<f32> = slice.iter().take(5).copied().collect();
+            let last5: Vec<f32> =
+                slice.iter().rev().take(5).copied().collect::<Vec<_>>().into_iter().rev().collect();
+            // Cheap deterministic checksum: XOR of all bit-patterns
+            let mut xor: u32 = 0;
+            for x in slice {
+                xor ^= x.to_bits();
+            }
+            eprintln!(
+                "  output[{i}] shape={shape:?} dt=F32 n={n} L2={l2:.6e} mean={mean:.6e} xor=0x{xor:08x} first5={preview:?} last5={last5:?}"
+            );
+        } else {
+            eprintln!("  output[{i}] shape={shape:?} dt={dt:?} (skip non-F32 stats)");
+        }
+    }
+    Ok(())
+}
+
+pub fn print_stats(label: &str, samples: &[f64]) {
+    let min = samples[0];
+    let median = samples[samples.len() / 2];
+    let max = samples[samples.len() - 1];
+    let pct_spread = (max - min) / min * 100.0;
+    let target = if cfg!(target_feature = "relaxed-simd") {
+        "+relaxed-simd (FMA)"
+    } else if cfg!(target_family = "wasm") {
+        "+simd128 only (mul+add)"
+    } else {
+        "native"
+    };
+    eprintln!(
+        "[{target}] {label}: min={min:.0} median={median:.0} max={max:.0} ns/inference (spread {pct_spread:.0}%, n={})",
+        samples.len()
+    );
+}
diff --git a/extra/src/lib.rs b/extra/src/lib.rs
index dbe46874b3..35427c2b91 100644
--- a/extra/src/lib.rs
+++ b/extra/src/lib.rs
@@ -9,7 +9,6 @@ pub trait WithTractExtra {
 
 impl WithTractExtra for tract_nnef::framework::Nnef {
     fn enable_tract_extra(&mut self) {
-        self.enable_tract_core();
         self.registries.push(tract_extra_registry());
     }
 
diff --git a/gpu/src/ops/change_axes.rs b/gpu/src/ops/change_axes.rs
index b0e926489a..847cbe7dfe 100644
--- a/gpu/src/ops/change_axes.rs
+++ b/gpu/src/ops/change_axes.rs
@@ -162,19 +162,19 @@ impl TypedOp for GpuAxisOp {
         self.inner.axes_mapping(&ref_inputs, &ref_outputs)
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
         let inner = if let AxisOp::Reshape(axis, from, to) = &self.inner {
             AxisOp::Reshape(
                 *axis,
-                from.iter().map(|d| d.eval(values)).collect(),
-                to.iter().map(|d| d.eval(values)).collect(),
+                from.iter().map(|d| d.substitute_all(subs)).collect::<TractResult<_>>()?,
+                to.iter().map(|d| d.substitute_all(subs)).collect::<TractResult<_>>()?,
             )
         } else {
             self.inner.clone()
diff --git a/gpu/src/ops/diag_gather.rs b/gpu/src/ops/diag_gather.rs
new file mode 100644
index 0000000000..a2fa28b44a
--- /dev/null
+++ b/gpu/src/ops/diag_gather.rs
@@ -0,0 +1,98 @@
+use crate::tensor::{DeviceTensor, DeviceTensorExt};
+use derive_new::new;
+use tract_core::internal::*;
+
+/// `out[..., i, k] = in[..., i, offset + k - i]`, with zero-fill on
+/// out-of-bounds reads. Mirrors `tract_transformers::ops::diag_gather::DiagGather`.
+///
+/// `offset` and `out_len` are TDim because the rel-pos table width and the
+/// query-axis length may both be symbolic upstream; both are resolved against
+/// `session.resolved_symbols` at eval time.
+pub type DispatchDiagGatherFn =
+    fn(input: &DeviceTensor, offset: i64, out_len: usize, output: &DeviceTensor) -> TractResult<()>;
+
+#[derive(Clone, new)]
+pub struct GpuDiagGather {
+    pub offset: TDim,
+    pub out_len: TDim,
+    pub backend_name: &'static str,
+    pub dispatch: DispatchDiagGatherFn,
+}
+
+impl std::fmt::Debug for GpuDiagGather {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}DiagGather", self.backend_name)
+    }
+}
+
+impl PartialEq for GpuDiagGather {
+    fn eq(&self, other: &Self) -> bool {
+        self.backend_name == other.backend_name
+            && self.offset == other.offset
+            && self.out_len == other.out_len
+    }
+}
+impl Eq for GpuDiagGather {}
+
+impl std::hash::Hash for GpuDiagGather {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.backend_name.hash(state);
+        self.offset.hash(state);
+        self.out_len.hash(state);
+    }
+}
+
+impl Op for GpuDiagGather {
+    fn name(&self) -> StaticName {
+        format!("{}DiagGather", self.backend_name).into()
+    }
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!("offset={}, out_len={}", self.offset, self.out_len)])
+    }
+    op_as_typed_op!();
+}
+
+impl EvalOp for GpuDiagGather {
+    fn is_stateless(&self) -> bool {
+        true
+    }
+
+    fn eval_with_session(
+        &self,
+        node_id: usize,
+        session: &TurnState,
+        inputs: TVec<TValue>,
+    ) -> TractResult<TVec<TValue>> {
+        let input_val = args_1!(inputs);
+        let input = input_val.to_device_tensor()?;
+        let offset = self.offset.eval(&session.resolved_symbols).to_i64()?;
+        let out_len = self.out_len.eval(&session.resolved_symbols).to_usize()?;
+        let mut out_shape: TVec<usize> = input.shape().into();
+        let rank = out_shape.len();
+        ensure!(rank >= 2);
+        out_shape[rank - 1] = out_len;
+        let output = crate::session_handler::make_tensor_for_node(
+            session,
+            node_id,
+            input.datum_type(),
+            &out_shape,
+        )?;
+        (self.dispatch)(input, offset, out_len, &output)?;
+        Ok(tvec!(output.into_tensor().into_tvalue()))
+    }
+}
+
+impl TypedOp for GpuDiagGather {
+    fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
+        crate::utils::facts_to_device_facts(inputs, |facts| {
+            ensure!(facts.len() == 1);
+            let mut shape: TVec<TDim> = facts[0].shape.to_tvec();
+            ensure!(shape.len() >= 2);
+            let rank = shape.len();
+            shape[rank - 1] = self.out_len.clone();
+            Ok(tvec!(facts[0].datum_type.fact(&shape)))
+        })
+        .with_context(|| format!("Error while computing facts for {:?}", self.name()))
+    }
+    as_op!();
+}
diff --git a/gpu/src/ops/gather.rs b/gpu/src/ops/gather.rs
new file mode 100644
index 0000000000..a2e85461d6
--- /dev/null
+++ b/gpu/src/ops/gather.rs
@@ -0,0 +1,108 @@
+use crate::tensor::{DeviceTensor, DeviceTensorExt};
+use derive_new::new;
+use tract_core::internal::*;
+
+/// `output = data.gather(axis, indices)`, i.e.
+/// `output[..., i, ...] = data[..., indices[i], ...]` along `axis`.
+/// Negative indices wrap (matches the CPU op).
+///
+/// First implementation supports the plain-tensor path only (no block-quant,
+/// no packed-matrix storage); the translator's `rule_if` guards the rest out.
+pub type DispatchGatherFn = fn(
+    data: &DeviceTensor,
+    indices: &DeviceTensor,
+    axis: usize,
+    output: &DeviceTensor,
+) -> TractResult<()>;
+
+#[derive(Clone, new)]
+pub struct GpuGather {
+    pub axis: usize,
+    pub backend_name: &'static str,
+    pub dispatch: DispatchGatherFn,
+}
+
+impl std::fmt::Debug for GpuGather {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}Gather", self.backend_name)
+    }
+}
+
+impl PartialEq for GpuGather {
+    fn eq(&self, other: &Self) -> bool {
+        self.backend_name == other.backend_name && self.axis == other.axis
+    }
+}
+impl Eq for GpuGather {}
+
+impl std::hash::Hash for GpuGather {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.backend_name.hash(state);
+        self.axis.hash(state);
+    }
+}
+
+impl Op for GpuGather {
+    fn name(&self) -> StaticName {
+        format!("{}Gather", self.backend_name).into()
+    }
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!("axis={}", self.axis)])
+    }
+    op_as_typed_op!();
+}
+
+impl EvalOp for GpuGather {
+    fn is_stateless(&self) -> bool {
+        true
+    }
+
+    fn eval_with_session(
+        &self,
+        node_id: usize,
+        session: &TurnState,
+        inputs: TVec<TValue>,
+    ) -> TractResult<TVec<TValue>> {
+        let (data_val, indices_val) = args_2!(inputs);
+        let data = data_val.to_device_tensor()?;
+        let indices = indices_val.to_device_tensor()?;
+        let out_shape = compute_output_shape(self.axis, data.shape(), indices.shape())?;
+        let output = crate::session_handler::make_tensor_for_node(
+            session,
+            node_id,
+            data.datum_type(),
+            &out_shape,
+        )?;
+        (self.dispatch)(data, indices, self.axis, &output)?;
+        Ok(tvec!(output.into_tensor().into_tvalue()))
+    }
+}
+
+impl TypedOp for GpuGather {
+    fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
+        crate::utils::facts_to_device_facts(inputs, |facts| {
+            ensure!(facts.len() == 2);
+            ensure!(facts[1].datum_type == i64::datum_type());
+            ensure!(facts[0].rank() > self.axis);
+            let dt = facts[0].datum_type;
+            let mut shape: TVec<TDim> = facts[0].shape.iter().take(self.axis).cloned().collect();
+            shape.extend(facts[1].shape.iter().cloned());
+            shape.extend(facts[0].shape.iter().skip(self.axis + 1).cloned());
+            Ok(tvec!(dt.fact(&shape)))
+        })
+        .with_context(|| format!("Error while computing facts for {:?}", self.name()))
+    }
+    as_op!();
+}
+
+fn compute_output_shape(
+    axis: usize,
+    data: &[usize],
+    indices: &[usize],
+) -> TractResult<TVec<usize>> {
+    ensure!(data.len() > axis);
+    let mut out: TVec<usize> = data[..axis].into();
+    out.extend(indices.iter().copied());
+    out.extend(data[axis + 1..].iter().copied());
+    Ok(out)
+}
diff --git a/gpu/src/ops/mod.rs b/gpu/src/ops/mod.rs
index 95e84edf89..b424f50ef9 100644
--- a/gpu/src/ops/mod.rs
+++ b/gpu/src/ops/mod.rs
@@ -5,8 +5,10 @@ pub mod cast;
 pub mod change_axes;
 pub mod concat;
 pub mod copy_based;
+pub mod diag_gather;
 pub mod dyn_kv_cache;
 pub mod element_wise;
+pub mod gather;
 pub mod gelu_approximate;
 pub mod iff;
 pub mod leaky_relu;
diff --git a/gpu/src/ops/pulse.rs b/gpu/src/ops/pulse.rs
index 745ebc63da..d8b2e279a9 100644
--- a/gpu/src/ops/pulse.rs
+++ b/gpu/src/ops/pulse.rs
@@ -301,11 +301,13 @@ impl GpuPulsePadState {
             self.save_frame(&*ctx, op, input, latest_valid_frame)?;
         }
 
-        // Start with a copy of input
+        // Start with a copy of input.  The fused-axis-op chain may have
+        // installed a non-contiguous view (Move only permutes strides,
+        // never materialises), so a flat memcpy would read the buffer in
+        // pre-Move order; copy_nd honours `input.strides()` instead.
         let mut output =
             make_tensor_for_node(session, self.node_id, input.datum_type(), input.shape())?;
-        let flat_len = input.len() * input.datum_type().size_of();
-        ctx.flat_copy(input, 0, &output, 0, flat_len)?;
+        ctx.copy_nd(input, 0, input.strides(), &output, 0, input.shape(), output.strides())?;
 
         // Quick return if entirely in valid or invalid range
         if (pulse_begin >= op.begin_input && pulse_end <= end_input)
diff --git a/gpu/src/ops/scaled_masked_softmax.rs b/gpu/src/ops/scaled_masked_softmax.rs
index 2e6faaefe1..8aae92b261 100644
--- a/gpu/src/ops/scaled_masked_softmax.rs
+++ b/gpu/src/ops/scaled_masked_softmax.rs
@@ -2,14 +2,25 @@ use crate::tensor::{DeviceTensor, DeviceTensorExt};
 use derive_new::new;
 use tract_core::internal::*;
 
-/// A = SOFTMAX(INPUT * SCALE + MASK, AXIS=2)
-/// Only input of rank of 3 is supported
-pub type DispatchScaledMaskedSoftmaxFn =
-    fn(&DeviceTensor, &Tensor, &DeviceTensor, &DeviceTensor) -> TractResult<()>;
+/// Fused scale + mask + softmax over the last axis.  When the mask is float
+/// it is added in log-space (`out = softmax(x*scale + mask)`); when it is
+/// bool, masked positions are substituted with `-inf` before softmax.
+///
+/// If `post_softmax_mask` is true (bool mask only), fully-masked rows — whose
+/// softmax would otherwise be NaN — are written as `0` instead.  Partially-
+/// masked rows are unaffected.
+pub type DispatchScaledMaskedSoftmaxFn = fn(
+    input: &DeviceTensor,
+    scale: &Tensor,
+    mask: &DeviceTensor,
+    post_softmax_mask: bool,
+    output: &DeviceTensor,
+) -> TractResult<()>;
 
 #[derive(Clone, new)]
 pub struct GpuScaledMaskedSoftmax {
     pub scale: Arc<Tensor>,
+    pub post_softmax_mask: bool,
     pub backend_name: &'static str,
     pub dispatch: DispatchScaledMaskedSoftmaxFn,
 }
@@ -22,7 +33,9 @@ impl std::fmt::Debug for GpuScaledMaskedSoftmax {
 
 impl PartialEq for GpuScaledMaskedSoftmax {
     fn eq(&self, other: &Self) -> bool {
-        self.backend_name == other.backend_name && self.scale == other.scale
+        self.backend_name == other.backend_name
+            && self.scale == other.scale
+            && self.post_softmax_mask == other.post_softmax_mask
     }
 }
 impl Eq for GpuScaledMaskedSoftmax {}
@@ -31,6 +44,7 @@ impl std::hash::Hash for GpuScaledMaskedSoftmax {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
         self.backend_name.hash(state);
         self.scale.hash(state);
+        self.post_softmax_mask.hash(state);
     }
 }
 
@@ -61,7 +75,7 @@ impl EvalOp for GpuScaledMaskedSoftmax {
             input.datum_type(),
             input.shape(),
         )?;
-        (self.dispatch)(input, &self.scale, mask, &output)?;
+        (self.dispatch)(input, &self.scale, mask, self.post_softmax_mask, &output)?;
         Ok(tvec!(output.into_tensor().into_tvalue()))
     }
 }
@@ -71,7 +85,10 @@ impl TypedOp for GpuScaledMaskedSoftmax {
         crate::utils::facts_to_device_facts(inputs, |facts| {
             ensure!(facts.len() == 2);
             let dt = facts[0].datum_type;
-            ensure!(dt == facts[1].datum_type);
+            let mask_dt = facts[1].datum_type;
+            ensure!(mask_dt == dt || mask_dt == bool::datum_type());
+            // post_softmax_mask is bool-mask-only per the CPU contract.
+            ensure!(!self.post_softmax_mask || mask_dt == bool::datum_type());
             ensure!(facts[0].rank() <= 5);
             ensure!(facts[0].rank() >= 2);
             ensure!(facts[0].rank() == facts[1].rank());
diff --git a/gpu/src/ops/slice.rs b/gpu/src/ops/slice.rs
index f0cfd76321..925bc4bf42 100644
--- a/gpu/src/ops/slice.rs
+++ b/gpu/src/ops/slice.rs
@@ -94,19 +94,19 @@ impl TypedOp for GpuSlice {
             .with_context(|| format!("Error while computing facts for {:?}", self.name()))
     }
 
-    fn concretize_dims(
+    fn substitute_symbols(
         &self,
         _source: &TypedModel,
         node: &TypedNode,
         target: &mut TypedModel,
         mapping: &HashMap<OutletId, OutletId>,
-        values: &SymbolValues,
+        subs: &HashMap<Symbol, TDim>,
     ) -> TractResult<TVec<OutletId>> {
         let op = GpuSlice {
             inner: Slice {
                 axis: self.inner.axis,
-                start: self.inner.start.eval(values),
-                end: self.inner.end.eval(values),
+                start: self.inner.start.substitute_all(subs)?,
+                end: self.inner.end.substitute_all(subs)?,
             },
         };
         let inputs = node.inputs.iter().map(|i| mapping[i]).collect::<TVec<_>>();
diff --git a/gpu/src/rewrite_rules/rewire_syncs.rs b/gpu/src/rewrite_rules/rewire_syncs.rs
index 710d882f8c..945b74c044 100644
--- a/gpu/src/rewrite_rules/rewire_syncs.rs
+++ b/gpu/src/rewrite_rules/rewire_syncs.rs
@@ -24,12 +24,8 @@ pub fn rewire_back_and_forth_sync(
     rule_ensure!(op.kind == DeviceSyncKind::ToDevice);
 
     // Identify precessor ToHost
-    let Some(sync_to_host_prec) = model.single_prec(node.id)? else {
-        return Ok(None);
-    };
-    let Some(sync_to_host_prec_op) = sync_to_host_prec.op_as::<DeviceSync>() else {
-        return Ok(None);
-    };
+    rule_if_some!(sync_to_host_prec = model.single_prec(node.id)?);
+    rule_if_some!(sync_to_host_prec_op = sync_to_host_prec.op_as::<DeviceSync>());
     rule_ensure!(sync_to_host_prec_op.kind == DeviceSyncKind::ToHost);
 
     let patch =
@@ -48,23 +44,17 @@ pub fn rewire_sync_after_const(
 ) -> TractResult<Option<TypedModelPatch>> {
     // Search pattern => Const => ToHost
 
-    let Some(device_const) = op.val().as_device_tensor() else {
-        return Ok(None);
-    };
+    rule_if_some!(device_const = op.val().as_device_tensor());
 
     // Identify successors ToHost
-    let Some(next_nodes) = model.all_succ(node.id)? else {
-        return Ok(None);
-    };
+    rule_if_some!(next_nodes = model.all_succ(node.id)?);
 
     let sync_to_hosts = next_nodes
         .into_iter()
         .filter(|n| n.op_as::<DeviceSync>().is_some_and(|sync| sync.kind == DeviceSyncKind::ToHost))
         .collect_vec();
 
-    if sync_to_hosts.is_empty() {
-        return Ok(None);
-    };
+    rule_if!(!sync_to_hosts.is_empty());
 
     let host_const = device_const.to_host()?;
     let exotic_fact: Option<Box<dyn ExoticFact>> =
diff --git a/gpu/src/rewrite_rules/rms_norm.rs b/gpu/src/rewrite_rules/rms_norm.rs
index a319ab12e8..95089e65ee 100644
--- a/gpu/src/rewrite_rules/rms_norm.rs
+++ b/gpu/src/rewrite_rules/rms_norm.rs
@@ -11,26 +11,34 @@ pub fn remove_rms_norm_cast(
     op: &RmsNorm,
 ) -> TractResult<Option<TypedModelPatch>> {
     // Identify Cast from F16 To F32
-    let Some(cast_in_node) = model
-        .single_prec(node.id)?
-        .and_then(|n| n.op_as::<Cast>().and_then(|cast| (cast.to == DatumType::F32).then_some(n)))
-        .filter(|n| {
-            model.node_input_facts(n.id).map(|i| i[0].datum_type == DatumType::F16).unwrap_or(false)
-        })
-    else {
-        return Ok(None);
-    };
+    rule_if_some!(
+        cast_in_node = model
+            .single_prec(node.id)?
+            .and_then(|n| n
+                .op_as::<Cast>()
+                .and_then(|cast| (cast.to == DatumType::F32).then_some(n)))
+            .filter(|n| {
+                model
+                    .node_input_facts(n.id)
+                    .map(|i| i[0].datum_type == DatumType::F16)
+                    .unwrap_or(false)
+            })
+    );
 
     // Identify Cast from F32 To F16
-    let Some(cast_out_node) = model
-        .single_succ(node.id)?
-        .and_then(|n| n.op_as::<Cast>().and_then(|cast| (cast.to == DatumType::F16).then_some(n)))
-        .filter(|n| {
-            model.node_input_facts(n.id).map(|i| i[0].datum_type == DatumType::F32).unwrap_or(false)
-        })
-    else {
-        return Ok(None);
-    };
+    rule_if_some!(
+        cast_out_node = model
+            .single_succ(node.id)?
+            .and_then(|n| n
+                .op_as::<Cast>()
+                .and_then(|cast| (cast.to == DatumType::F16).then_some(n)))
+            .filter(|n| {
+                model
+                    .node_input_facts(n.id)
+                    .map(|i| i[0].datum_type == DatumType::F32)
+                    .unwrap_or(false)
+            })
+    );
 
     let mut patch = TypedModelPatch::default();
     let rsm_input = patch.taps(model, &cast_in_node.inputs)?;
diff --git a/harness/core-proptest-pulse/Cargo.toml b/harness/core-proptest-pulse/Cargo.toml
index 669815987b..9e0c880b55 100644
--- a/harness/core-proptest-pulse/Cargo.toml
+++ b/harness/core-proptest-pulse/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2024"
 
 [dependencies]
 tract-core.workspace = true
+tract-nnef.workspace = true
 tract-pulse.workspace = true
 
 [dev-dependencies]
diff --git a/harness/core-proptest-pulse/src/deconv.rs b/harness/core-proptest-pulse/src/deconv.rs
index e823d581c7..51aacd5d69 100644
--- a/harness/core-proptest-pulse/src/deconv.rs
+++ b/harness/core-proptest-pulse/src/deconv.rs
@@ -300,3 +300,78 @@ fn deconv2d() {
         .for_each(|(ix, x)| *x = ix as f32);
     proptest_regular_against_pulse(model, 1, input.into_plain_array().unwrap(), 2).unwrap()
 }
+
+// Issue #2203: pulse-mode Deconv with non-zero bias and kernel > stride
+// double-counts the bias in the overlap region. Bulk adds bias once per
+// output slot; the per-pulse Deconv also adds bias to its full
+// ``P*S + (K-1)`` output, and the DeconvDelay overlap-add then sums
+// pulse N's bias-included tail into pulse N+1's bias-included head.
+// Surfaced by Pocket-TTS / Mimi (depthwise ConvTranspose1d, K=32, S=16).
+// Existing ``proptest`` and ``example_*`` cases all use ``bias=0`` (see
+// ``DeconvOp::chain``), which masks the bug.
+
+fn run_issue_2203(group: usize, output_channels: usize, bias: tract_ndarray::Array1<f32>) {
+    let ker_len = 32usize;
+    let stride = 16usize;
+    let pulse = 2usize;
+    let input_len = 8usize;
+    let in_channels_per_group = output_channels / group;
+
+    let mut model = TypedModel::default();
+    let mut fact = f32::fact(&[1, output_channels, input_len]);
+    let s = model.symbols.sym("S");
+    fact.shape.set(2, s.to_dim());
+    let input = model.add_source("a", fact).unwrap();
+    let kernel = tract_ndarray::Array3::from_shape_vec(
+        (output_channels, in_channels_per_group, ker_len),
+        (0..output_channels * in_channels_per_group * ker_len)
+            .map(|i| 0.001_f32 * (i as f32 + 1.0))
+            .collect(),
+    )
+    .unwrap();
+    let deconv = tract_core::ops::cnn::Deconv {
+        pool_spec: PoolSpec {
+            data_format: DataFormat::NCHW,
+            kernel_shape: tvec!(ker_len),
+            padding: PaddingSpec::Explicit(tvec!(0), tvec!(0)),
+            strides: Some(tvec!(stride)),
+            dilations: None,
+            input_channels: output_channels,
+            output_channels,
+        },
+        kernel_format: tract_core::ops::cnn::KernelFormat::OIHW,
+        adjustments: tvec!(0),
+        group,
+    };
+    let kernel_node = model.add_const("kernel", kernel).unwrap();
+    let bias_node = model.add_const("bias", bias).unwrap();
+    let id = model.wire_node("deconv1", deconv, &[input, kernel_node, bias_node]).unwrap()[0];
+    model.select_output_outlets(&[id]).unwrap();
+
+    let input = tract_ndarray::Array3::from_shape_vec(
+        (1, output_channels, input_len),
+        (0..output_channels * input_len).map(|i| 0.1_f32 * (i as f32 + 1.0)).collect(),
+    )
+    .unwrap();
+    proptest_regular_against_pulse(model, pulse as _, input.into_dyn(), 2).unwrap()
+}
+
+#[test]
+fn issue_2203_dense_with_bias_kernel_32_stride_16() {
+    run_issue_2203(1, 8, tract_ndarray::Array1::<f32>::from_elem((8,), 0.5_f32))
+}
+
+#[test]
+fn issue_2203_depthwise_with_bias_kernel_32_stride_16() {
+    // Mimi's actual configuration: depthwise (groups = output_channels),
+    // kernel (G, 1, K) in OIHW.
+    run_issue_2203(
+        8,
+        8,
+        tract_ndarray::Array1::<f32>::from_shape_vec(
+            (8,),
+            (0..8).map(|i| 0.001_f32 * (i as f32 + 1.0)).collect(),
+        )
+        .unwrap(),
+    )
+}
diff --git a/harness/core-proptest-pulse/src/lib.rs b/harness/core-proptest-pulse/src/lib.rs
index a5f360696f..faaccb07ae 100644
--- a/harness/core-proptest-pulse/src/lib.rs
+++ b/harness/core-proptest-pulse/src/lib.rs
@@ -44,9 +44,10 @@ fn proptest_regular_against_pulse(
     let model = model.into_decluttered().unwrap();
     // dbg!(&model);
     let s = model.symbols.sym("S");
-    let symbols = SymbolValues::default().with(&s, len as i64);
+    let subs =
+        std::collections::HashMap::from([(s.clone(), tract_data::prelude::TDim::Val(len as i64))]);
 
-    let concrete = model.clone().concretize_dims(&symbols).unwrap();
+    let concrete = model.clone().substitute_symbols(&subs).unwrap();
     if concrete.nodes.iter().any(|n| n.outputs.iter().any(|o| o.fact.shape.volume().is_zero())) {
         return Err(TestCaseError::reject("too short input"));
     }
@@ -62,6 +63,7 @@ fn proptest_regular_against_pulse(
     let output_fact = pulsed.output_fact(0).unwrap().clone();
 
     let stream_info = output_fact.stream.as_ref().unwrap();
+    let symbols = SymbolValues::default().with(&s, len as i64);
     prop_assert!(stream_info.dim.eval(&symbols) == outputs[0].shape()[stream_info.axis].to_dim());
     let output_stream_axis = stream_info.axis;
     let delay = stream_info.delay;
diff --git a/harness/nemotron-speech-streaming-en-0.6b/ci.sh b/harness/nemotron-speech-streaming-en-0.6b/ci.sh
index 15431a17af..5b3697e60b 100755
--- a/harness/nemotron-speech-streaming-en-0.6b/ci.sh
+++ b/harness/nemotron-speech-streaming-en-0.6b/ci.sh
@@ -12,8 +12,8 @@ for rt in $TRACT_RUNTIMES
 do
 	gpu_assert=""
 	case "$rt" in
-		--cuda) gpu_assert="--assert-op-only Cuda*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,IsNan,Add,Range,Cast,Eq,Div,Sub,Scan,Gather";;
-		--metal) gpu_assert="--assert-op-only Metal*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,IsNan,Add,Range,Cast,Eq,Div,Sub,Scan,Gather,Reduce*";;
+		--cuda) gpu_assert="--assert-op-only Cuda*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,Add,Range,Cast,Eq,Div,Sub";;
+		--metal) gpu_assert="--assert-op-only Metal*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,Add,Range,Cast,Eq,Div,Sub";;
 	esac
 
 	for m in preprocessor encoder decoder joint
@@ -24,11 +24,18 @@ do
 		else
 			nnef_file=$MODEL.$m.nnef.tgz
 		fi
+		# Decoder is stepped one token per call by the caller (external state
+		# carry); force the Scan op into single-iter inlining so the LSTM body
+		# lands on the GPU instead of bouncing through CPU each step.
+		extra_transforms=""
+		if [ "$m" = "decoder" ]; then
+			extra_transforms="-t force_scan_external_state"
+		fi
 		$CACHE_FILE \
 			$S3DIR/$nnef_file \
 			$S3DIR/$MODEL.$m.io.npz
 
-		$TRACT_RUN $MODELS/$S3DIR/$nnef_file $rt --nnef-tract-transformers -t transformers_detect_all run \
+		$TRACT_RUN $MODELS/$S3DIR/$nnef_file $rt --nnef-tract-transformers -t transformers_detect_all $extra_transforms run \
 			--input-from-bundle $MODELS/$S3DIR/$MODEL.$m.io.npz --assert-output-bundle $MODELS/$S3DIR/$MODEL.$m.io.npz \
 			--approx very $gpu_assert
 	done
@@ -53,12 +60,48 @@ $TRACT_RUN $model_prefix.preprocessor.nnef.tgz \
 	-t 'pulse(symbol: Some("INPUT_SIGNAL__TIME"), pulse: "4800")' \
 	dump -q
 
+# Check that pulsified preprocessor and encoder translate cleanly on each GPU
+# runtime (the GPU translator must fall back to CPU for ops it can't lower, not
+# abort the whole transform).  Allowlist what currently falls back so a
+# regression spilling another op to CPU fails CI.  Runtime numeric checks are
+# deferred; only the translation is asserted here.
+for rt in $TRACT_RUNTIMES
+do
+	case "$rt" in
+		--cuda)
+			pp_assert="--assert-op-only Cuda*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,PulsedSameAxisConcat,OptMulByScalar,OptSubUnicast"
+			enc_assert="--assert-op-only Cuda*,Gpu*,DeviceSync*,Const,Source,AffineChunkTrim,PulsedRange"
+			;;
+		--metal)
+			pp_assert="--assert-op-only Metal*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,PulsedSameAxisConcat,OptMulByScalar,OptSubUnicast"
+			enc_assert="--assert-op-only Metal*,Gpu*,DeviceSync*,Const,Source,AffineChunkTrim,PulsedRange"
+			;;
+		*) continue;;
+	esac
+	$TRACT_RUN $model_prefix.preprocessor.nnef.tgz $rt \
+		-t 'concretize_symbols(values: {"BATCH": 1})' \
+		-t 'patch(body: "length = tract_core_shape_of(input_signal)[1];")' \
+		-t 'select_outputs(outputs: ["processed_signal"])' \
+		-t 'pulse(symbol: Some("INPUT_SIGNAL__TIME"), pulse: "4800")' \
+		dump -q $pp_assert
+	$TRACT_RUN $model_prefix.encoder.p1.nnef.tgz $rt \
+		--nnef-tract-transformers \
+		-t 'concretize_symbols(values: {"BATCH": 1})' \
+		-t 'patch(body: "length = tract_core_shape_of(audio_signal)[2];")' \
+		-t 'select_outputs(outputs: ["outputs"])' \
+		-t 'pulse(symbol: Some("AUDIO_SIGNAL__TIME"), pulse: "112")' \
+		dump -q $enc_assert
+done
+
 # Check that the encoder can be pulsified.
 # The encoder subsamples by 8x (three stride-2 convolutions) before the transformer.
 # The chunk-window mask has P=14 transformer tokens per chunk, so the input pulse
 # must be 14 * 8 = 112 audio frames.
 $TRACT_RUN $model_prefix.encoder.p1.nnef.tgz \
 	--nnef-tract-transformers \
+	-t 'concretize_symbols(values: {"BATCH": 1})' \
+	-t 'patch(body: "length = tract_core_shape_of(audio_signal)[2];")' \
+	-t 'select_outputs(outputs: ["outputs"])' \
 	-t 'pulse(symbol: Some("AUDIO_SIGNAL__TIME"), pulse: "112")' \
 	dump -q
 
diff --git a/harness/nnef-test-cases/cmp-operators/graph.nnef b/harness/nnef-test-cases/cmp-operators/graph.nnef
new file mode 100644
index 0000000000..cab5b96d7b
--- /dev/null
+++ b/harness/nnef-test-cases/cmp-operators/graph.nnef
@@ -0,0 +1,12 @@
+version 1.0;
+
+graph cmp_operators(input) -> (lt, le, gt, ge, eq, ne)
+{
+    input = external<scalar>(shape = [4]);
+    lt = input < 0.5;
+    le = input <= 0.5;
+    gt = input > 0.5;
+    ge = input >= 0.5;
+    eq = input == 0.5;
+    ne = input != 0.5;
+}
diff --git a/harness/nnef-test-cases/cmp-operators/runme.sh b/harness/nnef-test-cases/cmp-operators/runme.sh
new file mode 100755
index 0000000000..2dfa26aa64
--- /dev/null
+++ b/harness/nnef-test-cases/cmp-operators/runme.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+$TRACT_RUN . run --allow-random-input
diff --git a/harness/nnef-test-cases/copy-identity/graph.nnef b/harness/nnef-test-cases/copy-identity/graph.nnef
new file mode 100644
index 0000000000..2b65956523
--- /dev/null
+++ b/harness/nnef-test-cases/copy-identity/graph.nnef
@@ -0,0 +1,7 @@
+version 1.0;
+
+graph main(input) -> (output)
+{
+    input = external<scalar>(shape = [1, 4]);
+    output = copy(input);
+}
diff --git a/harness/nnef-test-cases/copy-identity/runme.sh b/harness/nnef-test-cases/copy-identity/runme.sh
new file mode 100755
index 0000000000..5394182ce5
--- /dev/null
+++ b/harness/nnef-test-cases/copy-identity/runme.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# No graph.quant: copy is pure identity, no Cast node should appear.
+$TRACT_RUN --nnef-tract-core . dump --assert-op-count Cast 0
+$TRACT_RUN --nnef-tract-core . run --allow-random-input
diff --git a/harness/nnef-test-cases/copy-requant/graph.nnef b/harness/nnef-test-cases/copy-requant/graph.nnef
new file mode 100644
index 0000000000..2b65956523
--- /dev/null
+++ b/harness/nnef-test-cases/copy-requant/graph.nnef
@@ -0,0 +1,7 @@
+version 1.0;
+
+graph main(input) -> (output)
+{
+    input = external<scalar>(shape = [1, 4]);
+    output = copy(input);
+}
diff --git a/harness/nnef-test-cases/copy-requant/graph.quant b/harness/nnef-test-cases/copy-requant/graph.quant
new file mode 100644
index 0000000000..c045185b36
--- /dev/null
+++ b/harness/nnef-test-cases/copy-requant/graph.quant
@@ -0,0 +1,2 @@
+"input": zero_point_linear_quantize(zero_point = 0, scale = 0.003921568859368563, bits = 8, signed = false, symmetric = false);
+"output": zero_point_linear_quantize(zero_point = -128, scale = 0.003921568859368563, bits = 8, signed = true, symmetric = false);
diff --git a/harness/nnef-test-cases/copy-requant/runme.sh b/harness/nnef-test-cases/copy-requant/runme.sh
new file mode 100755
index 0000000000..1f8cd2980d
--- /dev/null
+++ b/harness/nnef-test-cases/copy-requant/runme.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# u8 input → i8 output, both with the same scale, zero-point shifted by 128.
+# Cast must subtract 128 from each byte to land the same real-valued range
+# in the signed representation.
+$TRACT_RUN --nnef-tract-core . dump --assert-op-count Cast 1
+$TRACT_RUN --nnef-tract-core . run --allow-random-input
diff --git a/harness/parakeet-tdt-600m-v3/ci.sh b/harness/parakeet-tdt-600m-v3/ci.sh
index 225be86d42..b891a0dbbb 100755
--- a/harness/parakeet-tdt-600m-v3/ci.sh
+++ b/harness/parakeet-tdt-600m-v3/ci.sh
@@ -9,8 +9,8 @@ for rt in $TRACT_RUNTIMES
 do
 	gpu_assert=""
 	case "$rt" in
-		--cuda) gpu_assert="--assert-op-only Cuda*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,IsNan,Add,Range,Cast,Eq,Div,Sub,Scan,Gather";;
-		--metal) gpu_assert="--assert-op-only Metal*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,IsNan,Add,Range,Cast,Eq,Div,Sub,Scan,Gather,Reduce*";;
+		--cuda) gpu_assert="--assert-op-only Cuda*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,IsNan,Add,Range,Cast,Eq,Div,Sub,Scan,Gather,DiagGather";;
+		--metal) gpu_assert="--assert-op-only Metal*,Gpu*,DeviceSync*,Const,Source,STFT,Pad,IsNan,Add,Range,Cast,Eq,Div,Sub,Scan,Gather,Reduce*,DiagGather";;
 	esac
 
 	for m in preprocessor encoder decoder joint
@@ -21,14 +21,24 @@ do
 		else
 			nnef_file=nvidia--parakeet-tdt-0.6b-v3-f32f32.$m.nnef.tgz
 		fi
+		# decoder LSTM is externally state-managed (caller plumbs state_0/
+		# state_1 every run): force the external_state flag on Scans
+		# pre-optimisation so declutter_single_loop inlines them, and assert
+		# no Scan survives. Cached NNEF predates the flag — see #2157.
+		extra_transform=""
+		extra_assert=""
+		if [ "$m" = "decoder" ]; then
+			extra_transform="-t force_scan_external_state"
+			extra_assert="--assert-op-count Scan 0"
+		fi
 		$CACHE_FILE \
 			asr/608/nvidia--parakeet-tdt-0.6b-v3-f32f32/$nnef_file \
 			asr/608/nvidia--parakeet-tdt-0.6b-v3-f32f32/nvidia--parakeet-tdt-0.6b-v3-f32f32.$m.io.npz
 
 		$TRACT_RUN $MODELS/asr/608/nvidia--parakeet-tdt-0.6b-v3-f32f32/$nnef_file $rt \
-			--nnef-tract-transformers -t transformers_detect_all run \
+			--nnef-tract-transformers -t transformers_detect_all $extra_transform run \
 			--input-from-bundle $MODELS/asr/608/nvidia--parakeet-tdt-0.6b-v3-f32f32/nvidia--parakeet-tdt-0.6b-v3-f32f32.$m.io.npz \
 			--assert-output-bundle $MODELS/asr/608/nvidia--parakeet-tdt-0.6b-v3-f32f32/nvidia--parakeet-tdt-0.6b-v3-f32f32.$m.io.npz \
-			--approx very $gpu_assert
+			--approx very $gpu_assert $extra_assert
 	 done
 done
diff --git a/harness/pre-optimized-graphes/hey_snips_v4_model17/expected b/harness/pre-optimized-graphes/hey_snips_v4_model17/expected
index 3306036659..f2a349ccb4 100644
--- a/harness/pre-optimized-graphes/hey_snips_v4_model17/expected
+++ b/harness/pre-optimized-graphes/hey_snips_v4_model17/expected
@@ -7,7 +7,7 @@ extension tract_symbol S;
 fragment tract_core_properties(
 ) -> (properties: (string, tensor<scalar>)[])
 {
-  properties = [("pulse.delay", tract_core_cast([182], to = "i64")), ("pulse.input_axes", tract_core_cast([0], to = "i64")), ("pulse.output_axes", tract_core_cast([0], to = "i64")), ("tract_nnef_ser_version", "0.19.3-pre"), ("tract_nnef_format_version", "beta1")];
+  properties = [("pulse.delay", tract_core_cast([182], to = "i64")), ("pulse.input_axes", tract_core_cast([0], to = "i64")), ("pulse.output_axes", tract_core_cast([0], to = "i64")), ("pulse.streaming_symbol", ["S"]), ("tract_nnef_ser_version", "0.19.3-pre"), ("tract_nnef_format_version", "beta1")];
 }
 
 graph network(input_node) -> (i"wavenet_2/post_proc_2-1x1_conv-conv1d/convolution/Conv2D") {
diff --git a/harness/pre-optimized-graphes/mdl-en-2019-Q3-librispeech/expected b/harness/pre-optimized-graphes/mdl-en-2019-Q3-librispeech/expected
index ec8a6edd74..7ed0723b58 100644
--- a/harness/pre-optimized-graphes/mdl-en-2019-Q3-librispeech/expected
+++ b/harness/pre-optimized-graphes/mdl-en-2019-Q3-librispeech/expected
@@ -123,7 +123,7 @@ fragment scan_body_1(
 fragment tract_core_properties(
 ) -> (properties: (string, tensor<scalar>)[])
 {
-  properties = [("pulse.delay", tract_core_cast([6], to = "i64")), ("pulse.input_axes", tract_core_cast([0], to = "i64")), ("pulse.output_axes", tract_core_cast([0], to = "i64")), ("tract_nnef_ser_version", "0.19.3-pre"), ("tract_nnef_format_version", "beta1")];
+  properties = [("pulse.delay", tract_core_cast([6], to = "i64")), ("pulse.input_axes", tract_core_cast([0], to = "i64")), ("pulse.output_axes", tract_core_cast([0], to = "i64")), ("pulse.streaming_symbol", ["S"]), ("tract_nnef_ser_version", "0.19.3-pre"), ("tract_nnef_format_version", "beta1")];
 }
 
 graph network(input) -> (output) {
@@ -204,7 +204,7 @@ graph network(input) -> (output) {
   i"fastlstm1.peephole0.mul.fix-rank" = variable<scalar>(label = "fastlstm1.peephole0.mul.fix-rank", shape = [1, 256]);
   i"fastlstm1.peephole1.mul.fix-rank" = variable<scalar>(label = "fastlstm1.peephole1.mul.fix-rank", shape = [1, 256]);
   i"fastlstm1.peephole2.mul.fix-rank" = variable<scalar>(label = "fastlstm1.peephole2.mul.fix-rank", shape = [1, 256]);
-  i"fastlstm1.c_final" = tract_core_scan(body = "scan_body_0", scan = [("fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.0..256.concat-einsum-k.0..256", i"fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.0..256.concat-einsum-k.0..256", 0, 1), ("fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.512..768.concat-einsum-k.0..256", i"fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.512..768.concat-einsum-k.0..256.fix_c.0", 0, 1), ("fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.256..512.concat-einsum-k.0..256", i"fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.256..512.concat-einsum-k.0..256.fix_c.0", 0, 1), ("fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.768..1024.concat-einsum-k.0..256", i"fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.768..1024.concat-einsum-k.0..256.fix_c.0", 0, 1)], full = [("fastlstm1.four_parts.W.split-over-1.0..256.concat-einsum-slice-k.1.256..384", i"fastlstm1.four_parts.W.split-over-1.0..256.concat-einsum-slice-k.1.256..384"), ("fastlstm1.four_parts.W.split-over-1.256..512.concat-einsum-slice-k.1.256..384", i"fastlstm1.four_parts.W.split-over-1.256..512.concat-einsum-slice-k.1.256..384"), ("fastlstm1.four_parts.W.split-over-1.512..768.concat-einsum-slice-k.1.256..384", i"fastlstm1.four_parts.W.split-over-1.512..768.concat-einsum-slice-k.1.256..384"), ("fastlstm1.four_parts.W.split-over-1.768..1024.concat-einsum-slice-k.1.256..384", i"fastlstm1.four_parts.W.split-over-1.768..1024.concat-einsum-slice-k.1.256..384"), ("fastlstm1.four_parts.split-1-over-1.0..256.slice", i"fastlstm1.four_parts.split-1-over-1.0..256.slice"), ("fastlstm1.four_parts.split-1-over-1.256..512.slice", i"fastlstm1.four_parts.split-1-over-1.256..512.slice"), ("fastlstm1.four_parts.split-1-over-1.512..768.slice", i"fastlstm1.four_parts.split-1-over-1.512..768.slice"), ("fastlstm1.four_parts.split-1-over-1.768..1024.slice", i"fastlstm1.four_parts.split-1-over-1.768..1024.slice"), ("fastlstm1.h_new.W.split-1-over-1.0..128.slice", i"fastlstm1.h_new.W.split-1-over-1.0..128.slice"), ("fastlstm1.h_new.split-1-over-1.0..128.slice", i"fastlstm1.h_new.split-1-over-1.0..128.slice"), ("fastlstm1.peephole0.mul.fix-rank", i"fastlstm1.peephole0.mul.fix-rank"), ("fastlstm1.peephole1.mul.fix-rank", i"fastlstm1.peephole1.mul.fix-rank"), ("fastlstm1.peephole2.mul.fix-rank", i"fastlstm1.peephole2.mul.fix-rank")], state = [("fastlstm1.c", i"tap.tap.fastlstm1.c_init.0-35/0-104/0", "fastlstm1.c_new"), ("fastlstm1.r", i"tap.tap.tap.fastlstm1.r_init.0-36/0-110/0-164/0", "fastlstm1.r_new")], output = [("fastlstm1.h_new.W.prop_axis.a.input_0", "full", 0, 1)], skip = 2, reset_every_turn = false);
+  i"fastlstm1.c_final" = tract_core_scan(body = "scan_body_0", scan = [("fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.0..256.concat-einsum-k.0..256", i"fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.0..256.concat-einsum-k.0..256", 0, 1), ("fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.512..768.concat-einsum-k.0..256", i"fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.512..768.concat-einsum-k.0..256.fix_c.0", 0, 1), ("fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.256..512.concat-einsum-k.0..256", i"fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.256..512.concat-einsum-k.0..256.fix_c.0", 0, 1), ("fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.768..1024.concat-einsum-k.0..256", i"fastlstm1.c_final.extracted.fastlstm1.four_parts.W.split-over-1.768..1024.concat-einsum-k.0..256.fix_c.0", 0, 1)], full = [("fastlstm1.four_parts.W.split-over-1.0..256.concat-einsum-slice-k.1.256..384", i"fastlstm1.four_parts.W.split-over-1.0..256.concat-einsum-slice-k.1.256..384"), ("fastlstm1.four_parts.W.split-over-1.256..512.concat-einsum-slice-k.1.256..384", i"fastlstm1.four_parts.W.split-over-1.256..512.concat-einsum-slice-k.1.256..384"), ("fastlstm1.four_parts.W.split-over-1.512..768.concat-einsum-slice-k.1.256..384", i"fastlstm1.four_parts.W.split-over-1.512..768.concat-einsum-slice-k.1.256..384"), ("fastlstm1.four_parts.W.split-over-1.768..1024.concat-einsum-slice-k.1.256..384", i"fastlstm1.four_parts.W.split-over-1.768..1024.concat-einsum-slice-k.1.256..384"), ("fastlstm1.four_parts.split-1-over-1.0..256.slice", i"fastlstm1.four_parts.split-1-over-1.0..256.slice"), ("fastlstm1.four_parts.split-1-over-1.256..512.slice", i"fastlstm1.four_parts.split-1-over-1.256..512.slice"), ("fastlstm1.four_parts.split-1-over-1.512..768.slice", i"fastlstm1.four_parts.split-1-over-1.512..768.slice"), ("fastlstm1.four_parts.split-1-over-1.768..1024.slice", i"fastlstm1.four_parts.split-1-over-1.768..1024.slice"), ("fastlstm1.h_new.W.split-1-over-1.0..128.slice", i"fastlstm1.h_new.W.split-1-over-1.0..128.slice"), ("fastlstm1.h_new.split-1-over-1.0..128.slice", i"fastlstm1.h_new.split-1-over-1.0..128.slice"), ("fastlstm1.peephole0.mul.fix-rank", i"fastlstm1.peephole0.mul.fix-rank"), ("fastlstm1.peephole1.mul.fix-rank", i"fastlstm1.peephole1.mul.fix-rank"), ("fastlstm1.peephole2.mul.fix-rank", i"fastlstm1.peephole2.mul.fix-rank")], state = [("fastlstm1.c", i"tap.tap.fastlstm1.c_init.0-35/0-104/0", "fastlstm1.c_new"), ("fastlstm1.r", i"tap.tap.tap.fastlstm1.r_init.0-36/0-110/0-164/0", "fastlstm1.r_new")], output = [("fastlstm1.h_new.W.prop_axis.a.input_0", "full", 0, 1)], skip = 2, reset_every_turn = false, external_state = false);
   i"fastlstm1.h_new.W.fix_a" = transpose(i"fastlstm1.c_final", axes = [1, 0, 2]);
   i"fastlstm1.h_new.W.fix_b" = variable<scalar>(label = "fastlstm1.h_new.W.fix_b", shape = [1, 256, 256]);
   i"fastlstm1.h_new.W" = matmul(i"fastlstm1.h_new.W.fix_a", i"fastlstm1.h_new.W.fix_b", transposeA = false, transposeB = false);
@@ -271,7 +271,7 @@ graph network(input) -> (output) {
   i"fastlstm2.peephole0.mul.fix-rank" = variable<scalar>(label = "fastlstm2.peephole0.mul.fix-rank", shape = [1, 256]);
   i"fastlstm2.peephole1.mul.fix-rank" = variable<scalar>(label = "fastlstm2.peephole1.mul.fix-rank", shape = [1, 256]);
   i"fastlstm2.peephole2.mul.fix-rank" = variable<scalar>(label = "fastlstm2.peephole2.mul.fix-rank", shape = [1, 256]);
-  i"fastlstm2.c_final" = tract_core_scan(body = "scan_body_1", scan = [("fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.0..256.concat-einsum-k.0..256", i"fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.0..256.concat-einsum-k.0..256", 0, 1), ("fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.512..768.concat-einsum-k.0..256", i"fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.512..768.concat-einsum-k.0..256.fix_c.0", 0, 1), ("fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.256..512.concat-einsum-k.0..256", i"fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.256..512.concat-einsum-k.0..256.fix_c.0", 0, 1), ("fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.768..1024.concat-einsum-k.0..256", i"fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.768..1024.concat-einsum-k.0..256.fix_c.0", 0, 1)], full = [("fastlstm2.four_parts.W.split-over-1.0..256.concat-einsum-slice-k.1.256..384", i"fastlstm2.four_parts.W.split-over-1.0..256.concat-einsum-slice-k.1.256..384"), ("fastlstm2.four_parts.W.split-over-1.256..512.concat-einsum-slice-k.1.256..384", i"fastlstm2.four_parts.W.split-over-1.256..512.concat-einsum-slice-k.1.256..384"), ("fastlstm2.four_parts.W.split-over-1.512..768.concat-einsum-slice-k.1.256..384", i"fastlstm2.four_parts.W.split-over-1.512..768.concat-einsum-slice-k.1.256..384"), ("fastlstm2.four_parts.W.split-over-1.768..1024.concat-einsum-slice-k.1.256..384", i"fastlstm2.four_parts.W.split-over-1.768..1024.concat-einsum-slice-k.1.256..384"), ("fastlstm2.four_parts.split-1-over-1.0..256.slice", i"fastlstm2.four_parts.split-1-over-1.0..256.slice"), ("fastlstm2.four_parts.split-1-over-1.256..512.slice", i"fastlstm2.four_parts.split-1-over-1.256..512.slice"), ("fastlstm2.four_parts.split-1-over-1.512..768.slice", i"fastlstm2.four_parts.split-1-over-1.512..768.slice"), ("fastlstm2.four_parts.split-1-over-1.768..1024.slice", i"fastlstm2.four_parts.split-1-over-1.768..1024.slice"), ("fastlstm2.h_new.W.split-1-over-1.0..128.slice", i"fastlstm2.h_new.W.split-1-over-1.0..128.slice"), ("fastlstm2.h_new.split-1-over-1.0..128.slice", i"fastlstm2.h_new.split-1-over-1.0..128.slice"), ("fastlstm2.peephole0.mul.fix-rank", i"fastlstm2.peephole0.mul.fix-rank"), ("fastlstm2.peephole1.mul.fix-rank", i"fastlstm2.peephole1.mul.fix-rank"), ("fastlstm2.peephole2.mul.fix-rank", i"fastlstm2.peephole2.mul.fix-rank")], state = [("fastlstm2.c", i"tap.tap.fastlstm1.c_init.0-35/0.1-106/0", "fastlstm2.c_new"), ("fastlstm2.r", i"tap.tap.tap.fastlstm1.r_init.0-36/0.1-112/0-166/0", "fastlstm2.r_new")], output = [("fastlstm2.h_new.W.prop_axis.a.input_0", "full", 0, 1)], skip = 6, reset_every_turn = false);
+  i"fastlstm2.c_final" = tract_core_scan(body = "scan_body_1", scan = [("fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.0..256.concat-einsum-k.0..256", i"fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.0..256.concat-einsum-k.0..256", 0, 1), ("fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.512..768.concat-einsum-k.0..256", i"fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.512..768.concat-einsum-k.0..256.fix_c.0", 0, 1), ("fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.256..512.concat-einsum-k.0..256", i"fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.256..512.concat-einsum-k.0..256.fix_c.0", 0, 1), ("fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.768..1024.concat-einsum-k.0..256", i"fastlstm2.c_final.extracted.fastlstm2.four_parts.W.split-over-1.768..1024.concat-einsum-k.0..256.fix_c.0", 0, 1)], full = [("fastlstm2.four_parts.W.split-over-1.0..256.concat-einsum-slice-k.1.256..384", i"fastlstm2.four_parts.W.split-over-1.0..256.concat-einsum-slice-k.1.256..384"), ("fastlstm2.four_parts.W.split-over-1.256..512.concat-einsum-slice-k.1.256..384", i"fastlstm2.four_parts.W.split-over-1.256..512.concat-einsum-slice-k.1.256..384"), ("fastlstm2.four_parts.W.split-over-1.512..768.concat-einsum-slice-k.1.256..384", i"fastlstm2.four_parts.W.split-over-1.512..768.concat-einsum-slice-k.1.256..384"), ("fastlstm2.four_parts.W.split-over-1.768..1024.concat-einsum-slice-k.1.256..384", i"fastlstm2.four_parts.W.split-over-1.768..1024.concat-einsum-slice-k.1.256..384"), ("fastlstm2.four_parts.split-1-over-1.0..256.slice", i"fastlstm2.four_parts.split-1-over-1.0..256.slice"), ("fastlstm2.four_parts.split-1-over-1.256..512.slice", i"fastlstm2.four_parts.split-1-over-1.256..512.slice"), ("fastlstm2.four_parts.split-1-over-1.512..768.slice", i"fastlstm2.four_parts.split-1-over-1.512..768.slice"), ("fastlstm2.four_parts.split-1-over-1.768..1024.slice", i"fastlstm2.four_parts.split-1-over-1.768..1024.slice"), ("fastlstm2.h_new.W.split-1-over-1.0..128.slice", i"fastlstm2.h_new.W.split-1-over-1.0..128.slice"), ("fastlstm2.h_new.split-1-over-1.0..128.slice", i"fastlstm2.h_new.split-1-over-1.0..128.slice"), ("fastlstm2.peephole0.mul.fix-rank", i"fastlstm2.peephole0.mul.fix-rank"), ("fastlstm2.peephole1.mul.fix-rank", i"fastlstm2.peephole1.mul.fix-rank"), ("fastlstm2.peephole2.mul.fix-rank", i"fastlstm2.peephole2.mul.fix-rank")], state = [("fastlstm2.c", i"tap.tap.fastlstm1.c_init.0-35/0.1-106/0", "fastlstm2.c_new"), ("fastlstm2.r", i"tap.tap.tap.fastlstm1.r_init.0-36/0.1-112/0-166/0", "fastlstm2.r_new")], output = [("fastlstm2.h_new.W.prop_axis.a.input_0", "full", 0, 1)], skip = 6, reset_every_turn = false, external_state = false);
   i"fastlstm2.h_new.W.fix_a" = transpose(i"fastlstm2.c_final", axes = [1, 0, 2]);
   i"fastlstm2.h_new.W.fix_b" = variable<scalar>(label = "fastlstm2.h_new.W.fix_b", shape = [1, 256, 256]);
   i"fastlstm2.h_new.W" = matmul(i"fastlstm2.h_new.W.fix_b", i"fastlstm2.h_new.W.fix_a", transposeA = true, transposeB = true);
diff --git a/harness/pulse-multi-axis/README.md b/harness/pulse-multi-axis/README.md
new file mode 100644
index 0000000000..9a1dfcc7c6
--- /dev/null
+++ b/harness/pulse-multi-axis/README.md
@@ -0,0 +1,105 @@
+# pulse-multi-axis harness
+
+Synthetic test cases driving the Blockify graph-rewrite pass for pulse v1.
+
+These are deliberately written *without* attention-specific framing
+(no Q/K/V naming, no softmax, no value tensor, no pre-chunked input shape)
+so the pulsifier has to discover any per-pulse window structure from the
+graph alone.
+
+Each example pairs a `graph.nnef`, a `gen-inputs.py` that emits an
+`io.npz` (batch reference), and a `runme.sh` that runs batch and
+pulsified via the CLI and asserts both against `io.npz`.  The CLI's
+`--assert-output-bundle` path automatically skips `pulse.delay` warmup
+tokens, so cases with non-zero output delay (ex03 future-window) align
+their streamed output to the batch reference without bespoke Rust glue.
+
+## ex01-block-diag-reduce
+
+Two streams `a, b` of shape `[T, D]`.  Pairwise dot product into a `[T, T]`
+score matrix.  Block-diagonal mask (`mask[i,j] = (i/P == j/P)`, P=2)
+multiplied in.  Sum-reduce on axis 0 → output `[T]`.
+
+The score matrix wire has streams on **both** of its axes simultaneously.
+The block-diagonal mask annihilates everything except the current diagonal
+P×P block, which is the structural information that drives Blockify's
+rewrite into single-streaming-axis chunked form.
+
+Streaming-axis: 0 on both inputs and on the output.  Pulse size P=2.
+
+`blockified/` contains a hand-written reference of what the post-Blockify
+typed graph should look like — model interface preserved (inputs `[2*S, 4]`,
+output `[2*S]`), internal reshape factors the streaming axis into chunks.
+
+## ex02-block-diag-bilinear
+
+Same block-diagonal structure as ex01, but the row-axis Reduce<Sum> is
+replaced by a second EinSum against a third stream `c [T, D]`:
+
+    output[i, d] = sum_j masked[i, j] * c[j, d]            # [T, D]
+
+This is the SDPA structure (Q·Kᵀ → mask → attn·V) without softmax.
+Smallest synthetic that exercises a downstream second EinSum after the
+masked score matrix.
+
+Blockify recognises this pattern (the recogniser matches Mul-by-mask
+followed by either `Reduce<Sum>` or a contracting `EinSum`) and rewrites
+the second EinSum the same way as the first, with the chunk batch axis
+prepended to its subscripts.  Numerical match is verified end-to-end in
+`harness/core-proptest-pulse/tests/blockify_ex01.rs`.
+
+## ex03-banded-reduce
+
+Same shape as ex01, but with an **asymmetric** mask — every row at chunk
+`c` attends to chunks `{c, c-1, ..., c-L}` (here `L = 1`).  The mask
+matrix is a P-block lower bidiagonal, mimicking the geometry of
+multi-chunk attention with left-context.
+
+The structural justification for chunked pulsification is identical to
+ex01 (per-pulse work bounded by `(L+1)·P` past samples plus the current
+P-block).  Only the mask predicate differs: `eq(chunk_row, chunk_col)`
+in ex01 becomes `0 ≤ chunk_row - chunk_col ≤ L` here.
+
+Blockify recognises the banded form `(diff >= L1) && (diff <= L2)` —
+parametrised by `MaskForm::Banded { lower, upper, k }` — and rewrites
+the contracted-axis input by wrapping it with `WindowOnAxis(W)` (where
+`W = upper − lower + 1`) followed by a flatten reshape, so the chunked
+einsum's contracted axis carries `W·k` elements per chunk instead of
+`k`.  Pulsification is non-causal: the streamed output is delayed by
+`L = upper − lower` chunks (the future-lookahead the band requires),
+flushed by feeding zero-chunks at the end of the stream.  Numerical
+match is verified end-to-end in
+`harness/core-proptest-pulse/tests/blockify_ex01.rs::ex03_*`.
+
+## ex04-banded-causal
+
+Same shape as ex03 but with a *causal* mask: every row at chunk `c`
+attends only to chunks `{c, c-1, ..., c-L}` (no future lookahead).
+Mask form: `-L ≤ diff ≤ 0` with `diff = chunk(i) - chunk(j)`.  For
+`L = 1, P = 2` the mask is the upper P-block bidiagonal (mirror of
+ex03).
+
+Blockify dispatches the same banded recogniser, but `WindowOnAxis` is
+parameterised with `start = lower < 0` (past-window flavour).  The
+pulsifier wires `Delay(0, W-1) → PulsePad(before = -start) →
+PulsedExposeWindow`: the `PulsePad` zero-fills the leading `-start`
+chunks of the post-Delay buffer (matching the out-of-stream zero
+semantics of the batch reference) and shifts `stream.delay` back so
+the final output has `stream.delay = 0` — fully causal, no trailing
+flush.  Numerical match in `ex04_*`.
+
+## ex05-banded-bilinear
+
+ex02 + ex03 mask: the SDPA-without-softmax shape (Q·Kᵀ → mask → attn·V)
+with a banded mask `0 ≤ chunk(i) - chunk(j) ≤ 1`.  The terminator
+EinSum `"ij,jd→id"` contracts `j` (= `mask.axis_b`), which means the
+windowed input is on the *j-side*: `b` (initiator input 1) AND `c`
+(terminator auxiliary).  Both get `WindowOnAxis(W, start = -upper)`
+followed by a flatten reshape, so the contracted axis on the score
+matrix and on `c` both carry `W·k` elements per chunk.  Output
+`stream.delay = 0` (causal in i: per output i-chunk, the j-window
+covers `[chunk(i) - 1, chunk(i)]` — past+current).
+
+This exercises the contracted-axis-detection logic in
+`detect_contracted_score_axis` and the auxiliary-input windowing in
+`wire_terminator_einsum` — paths that ex01–ex04 don't reach.
diff --git a/harness/pulse-multi-axis/ex01-block-diag-reduce/gen-inputs.py b/harness/pulse-multi-axis/ex01-block-diag-reduce/gen-inputs.py
new file mode 100644
index 0000000000..f45be2f255
--- /dev/null
+++ b/harness/pulse-multi-axis/ex01-block-diag-reduce/gen-inputs.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the block-diagonal-reduce synthetic.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+C = 3 (chunks)
+T = C * P = 6 (total stream length)
+D = 4 (per-token feature dim)
+"""
+
+import numpy as np
+
+P, C, D = 2, 3, 4
+T = C * P
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", a, b)                   # [T, T]
+
+idx       = np.arange(T)
+chunk_id  = idx // P
+mask      = (chunk_id[:, None] == chunk_id[None, :]).astype(np.float32)
+
+masked = scores * mask
+output = masked.sum(axis=0).astype(np.float32)          # [T]
+
+np.savez("io.npz", a=a, b=b, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex01-block-diag-reduce/graph.nnef b/harness/pulse-multi-axis/ex01-block-diag-reduce/graph.nnef
new file mode 100644
index 0000000000..cef91edd82
--- /dev/null
+++ b/harness/pulse-multi-axis/ex01-block-diag-reduce/graph.nnef
@@ -0,0 +1,44 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Block-diagonal-by-P pairwise interaction between two streams.
+#
+# Inputs:  a, b  both [T, 4]
+#
+# scores[i, j] = sum_d a[i, d] * b[j, d]                      [T, T]
+# mask[i, j]   = (floor(i / P) == floor(j / P))   P = 2       [T, T] f32
+# masked       = scores * mask                                [T, T]
+# output[j]    = sum_i masked[i, j]                           [T]
+#
+# Streaming axis: 0 on both inputs and on the output.
+# Pulse size P = 2.  Per pulse: P new samples each on a, b -> P new
+# samples on output.  The block-diagonal mask annihilates everything
+# except the current diagonal P×P block, so the per-pulse computation
+# is bounded.
+#
+# Deliberately avoids attention-specific shape: no pre-chunking in the
+# input, no softmax, no value tensor, generic input names.
+
+graph network(a, b) -> (output)
+{
+    a = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    scores = tract_core_einsum([a, b], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(a)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row  = unsqueeze(chunk_id, axes = [1]);
+    chunk_col  = unsqueeze(chunk_id, axes = [0]);
+    same_chunk = eq(chunk_row, chunk_col);
+
+    mask_f32 = tract_core_cast(same_chunk, to = 'f32');
+    masked   = mul(scores, mask_f32);
+
+    reduced = sum_reduce(masked, axes = [0]);
+    output  = squeeze(reduced, axes = [0]);
+}
diff --git a/harness/pulse-multi-axis/ex01-block-diag-reduce/io.npz b/harness/pulse-multi-axis/ex01-block-diag-reduce/io.npz
new file mode 100644
index 0000000000..85d600c91d
Binary files /dev/null and b/harness/pulse-multi-axis/ex01-block-diag-reduce/io.npz differ
diff --git a/harness/pulse-multi-axis/ex01-block-diag-reduce/runme.sh b/harness/pulse-multi-axis/ex01-block-diag-reduce/runme.sh
new file mode 100755
index 0000000000..8e9071d53d
--- /dev/null
+++ b/harness/pulse-multi-axis/ex01-block-diag-reduce/runme.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch — T concrete, full reference.
+$TRACT_RUN --nnef-tract-core --set T=6 . run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — Blockify recognises the block-diagonal mask, rewrites the
+# section, and pulse fires on the chunk axis with pulse 2 tokens.  The
+# block-diagonal mask has zero output delay, so the streamed output
+# matches the batch reference 1:1.
+$TRACT_RUN --nnef-tract-core . --pulse 'T=2' run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex01-blockified/gen-inputs.py b/harness/pulse-multi-axis/ex01-blockified/gen-inputs.py
new file mode 100644
index 0000000000..c66a487d48
--- /dev/null
+++ b/harness/pulse-multi-axis/ex01-blockified/gen-inputs.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the blockified (post-rewrite) form, with the
+original [T] model interface preserved.
+
+Reuses the same numerical inputs as the parent ex01-block-diag-reduce
+(seed=42), and the same output shape [T] — the chunking happens inside
+the graph, not at the boundary.
+"""
+
+import numpy as np
+
+P, S, D = 2, 3, 4
+T = S * P
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+
+a_blk = a.reshape(S, P, D)
+b_blk = b.reshape(S, P, D)
+
+block_scores = np.einsum("spd,sqd->spq", a_blk, b_blk)         # [S, P, P]
+output       = block_scores.sum(axis=1).reshape(T).astype(np.float32)  # [T]
+
+np.savez("io.npz", a=a, b=b, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex01-blockified/graph.nnef b/harness/pulse-multi-axis/ex01-blockified/graph.nnef
new file mode 100644
index 0000000000..840eaf2725
--- /dev/null
+++ b/harness/pulse-multi-axis/ex01-blockified/graph.nnef
@@ -0,0 +1,28 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_symbol S;
+extension tract_assert S>=0;
+
+# Hand-written "post-Blockify" form of ex01-block-diag-reduce.
+# Phase A POC reference: model interface preserved (inputs [2*S, 4],
+# output [2*S]).  Internally the graph factors the streaming axis into
+# [S, P], does the chunk-batched EinSum, reduces, and re-flattens.
+# The block-diagonal mask is gone — folded into the chunk batch axis 's'
+# on the EinSum.
+
+graph network(a, b) -> (output)
+{
+    a = tract_core_external(shape = [2*S, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [2*S, 4], datum_type = 'f32');
+
+    a_blk = reshape(a, shape = [S, 2, 4]);
+    b_blk = reshape(b, shape = [S, 2, 4]);
+
+    block_scores = tract_core_einsum([a_blk, b_blk], expr = "spd,sqd->spq", acc = "f32");
+
+    reduced     = sum_reduce(block_scores, axes = [1]);
+    reduced_2d  = squeeze(reduced, axes = [1]);
+
+    output = reshape(reduced_2d, shape = [2*S]);
+}
diff --git a/harness/pulse-multi-axis/ex01-blockified/io.npz b/harness/pulse-multi-axis/ex01-blockified/io.npz
new file mode 100644
index 0000000000..85d600c91d
Binary files /dev/null and b/harness/pulse-multi-axis/ex01-blockified/io.npz differ
diff --git a/harness/pulse-multi-axis/ex02-block-diag-bilinear/gen-inputs.py b/harness/pulse-multi-axis/ex02-block-diag-bilinear/gen-inputs.py
new file mode 100644
index 0000000000..1496961255
--- /dev/null
+++ b/harness/pulse-multi-axis/ex02-block-diag-bilinear/gen-inputs.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the block-diagonal-bilinear synthetic.
+
+Q·Kᵀ → block-diagonal mask → attn·V, no softmax.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+S = 3 (chunks)
+T = S * P = 6
+D = 4
+"""
+
+import numpy as np
+
+P, S, D = 2, 3, 4
+T = S * P
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+c = rng.standard_normal((T, D)).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", a, b)                   # [T, T]
+idx       = np.arange(T)
+chunk_id  = idx // P
+mask      = (chunk_id[:, None] == chunk_id[None, :]).astype(np.float32)
+masked    = scores * mask
+output    = np.einsum("ij,jd->id", masked, c).astype(np.float32)  # [T, D]
+
+np.savez("io.npz", a=a, b=b, c=c, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} c={c.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex02-block-diag-bilinear/graph.nnef b/harness/pulse-multi-axis/ex02-block-diag-bilinear/graph.nnef
new file mode 100644
index 0000000000..2cadf01044
--- /dev/null
+++ b/harness/pulse-multi-axis/ex02-block-diag-bilinear/graph.nnef
@@ -0,0 +1,43 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Block-diagonal-by-P pairwise interaction with a value tensor.
+#
+# This is the SDPA structure Q·Kᵀ → mask → attn·V, *without* softmax —
+# the simplest synthetic that exercises a downstream second EinSum
+# instead of a Reduce<Sum> after the masked score matrix.
+#
+# Inputs:  a, b, c  all [T, 4]
+#
+# scores[i, j] = sum_d a[i, d] * b[j, d]                      [T, T]
+# mask[i, j]   = (floor(i / P) == floor(j / P))   P = 2       [T, T] f32
+# masked       = scores * mask                                [T, T]
+# output[i, d] = sum_j masked[i, j] * c[j, d]                 [T, 4]
+#
+# The block-diagonal mask zeroes out cross-chunk score contributions,
+# so output[i in chunk c] only depends on c[j in chunk c].  Per pulse:
+# P new samples on each input -> P new output rows of size D.
+
+graph network(a, b, c) -> (output)
+{
+    a = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    c = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    scores = tract_core_einsum([a, b], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(a)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row  = unsqueeze(chunk_id, axes = [1]);
+    chunk_col  = unsqueeze(chunk_id, axes = [0]);
+    same_chunk = eq(chunk_row, chunk_col);
+
+    mask_f32 = tract_core_cast(same_chunk, to = 'f32');
+    masked   = mul(scores, mask_f32);
+
+    output = tract_core_einsum([masked, c], expr = "ij,jd->id", acc = "f32");
+}
diff --git a/harness/pulse-multi-axis/ex02-block-diag-bilinear/io.npz b/harness/pulse-multi-axis/ex02-block-diag-bilinear/io.npz
new file mode 100644
index 0000000000..be446d05d0
Binary files /dev/null and b/harness/pulse-multi-axis/ex02-block-diag-bilinear/io.npz differ
diff --git a/harness/pulse-multi-axis/ex02-block-diag-bilinear/runme.sh b/harness/pulse-multi-axis/ex02-block-diag-bilinear/runme.sh
new file mode 100755
index 0000000000..440b0f918c
--- /dev/null
+++ b/harness/pulse-multi-axis/ex02-block-diag-bilinear/runme.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --set T=6 . run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — block-diagonal mask + EinSum terminator, no output delay.
+$TRACT_RUN --nnef-tract-core . --pulse 'T=2' run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex03-banded-reduce/gen-inputs.py b/harness/pulse-multi-axis/ex03-banded-reduce/gen-inputs.py
new file mode 100644
index 0000000000..6a9fefcad6
--- /dev/null
+++ b/harness/pulse-multi-axis/ex03-banded-reduce/gen-inputs.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the banded-reduce synthetic.
+
+Banded mask geometry: every row at chunk c attends to chunks
+{c, c-1, ..., c-L}.  For L = 1 and P = 2 this is a P-block lower
+bidiagonal.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+C = 3 (chunks)
+T = C * P = 6 (total stream length)
+D = 4 (per-token feature dim)
+L = 1 (left-context, in chunks)
+"""
+
+import numpy as np
+
+P, C, D, L = 2, 3, 4, 1
+T = C * P
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", a, b)                   # [T, T]
+
+idx       = np.arange(T)
+chunk_id  = idx // P
+diff      = chunk_id[:, None] - chunk_id[None, :]
+mask      = ((diff >= 0) & (diff <= L)).astype(np.float32)
+
+masked = scores * mask
+output = masked.sum(axis=0).astype(np.float32)          # [T]
+
+np.savez("io.npz", a=a, b=b, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex03-banded-reduce/graph.nnef b/harness/pulse-multi-axis/ex03-banded-reduce/graph.nnef
new file mode 100644
index 0000000000..2dddf078ae
--- /dev/null
+++ b/harness/pulse-multi-axis/ex03-banded-reduce/graph.nnef
@@ -0,0 +1,63 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Banded-by-P pairwise interaction between two streams.
+#
+# Mimics the geometry of multi-chunk attention with left-context L:
+# every row at chunk c attends to chunks {c, c-1, ..., c-L}.  For
+# L = 1 and P = 2 the mask is a P-block lower bidiagonal:
+#
+#       j-chunk:    0   1   2
+#                 +---+---+---+
+#       i-chunk 0 | 1 | 0 | 0 |
+#               1 | 1 | 1 | 0 |
+#               2 | 0 | 1 | 1 |
+#                 +---+---+---+
+#
+# Inputs:  a, b  both [T, 4]
+#
+# scores[i, j] = sum_d a[i, d] * b[j, d]                      [T, T]
+# diff[i, j]   = floor(i / P) - floor(j / P)                   P = 2
+# mask[i, j]   = (0 <= diff[i, j]) and (diff[i, j] <= L)       L = 1
+# masked       = scores * mask                                [T, T]
+# output[j]    = sum_i masked[i, j]                           [T]
+#
+# Streaming axis: 0 on both inputs and on the output.
+# Pulse size P = 2.  Per pulse: P new samples each on a, b -> P new
+# samples on output.  The banded mask annihilates everything outside
+# the (L+1) diagonal P×P band, so the per-pulse computation is
+# bounded by (L+1)·P past samples plus the current P-block.
+#
+# Blockify's current recogniser only matches the symmetric block-
+# diagonal form (eq(chunk_row, chunk_col)).  The banded form here is
+# the next target: the structural justification (per-pulse bounded
+# work after a chunk reshape) is the same, only the mask predicate
+# differs.
+
+graph network(a, b) -> (output)
+{
+    a = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    scores = tract_core_einsum([a, b], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(a)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row = unsqueeze(chunk_id, axes = [1]);
+    chunk_col = unsqueeze(chunk_id, axes = [0]);
+    diff      = chunk_row - chunk_col;
+
+    left_ok  = ge(diff, 0);
+    right_ok = le(diff, 1);
+    in_band  = and(left_ok, right_ok);
+
+    mask_f32 = tract_core_cast(in_band, to = 'f32');
+    masked   = mul(scores, mask_f32);
+
+    reduced = sum_reduce(masked, axes = [0]);
+    output  = squeeze(reduced, axes = [0]);
+}
diff --git a/harness/pulse-multi-axis/ex03-banded-reduce/io.npz b/harness/pulse-multi-axis/ex03-banded-reduce/io.npz
new file mode 100644
index 0000000000..c867efea85
Binary files /dev/null and b/harness/pulse-multi-axis/ex03-banded-reduce/io.npz differ
diff --git a/harness/pulse-multi-axis/ex03-banded-reduce/runme.sh b/harness/pulse-multi-axis/ex03-banded-reduce/runme.sh
new file mode 100755
index 0000000000..6c94f19a4e
--- /dev/null
+++ b/harness/pulse-multi-axis/ex03-banded-reduce/runme.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --set T=6 . run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — future-window banded mask, output stream.delay = 2 tokens
+# (= L*P with L=1, P=2).  The CLI assertion path uses `pulse.delay`
+# (now correctly rescaled through the boundary merge reshape) to skip
+# the warmup tokens before comparing against the batch reference.
+$TRACT_RUN --nnef-tract-core . --pulse 'T=2' run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex04-banded-causal/gen-inputs.py b/harness/pulse-multi-axis/ex04-banded-causal/gen-inputs.py
new file mode 100644
index 0000000000..3ee27bec13
--- /dev/null
+++ b/harness/pulse-multi-axis/ex04-banded-causal/gen-inputs.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the banded-causal synthetic.
+
+Causal banded mask: row at chunk c attends to chunks {c, c-1, ..., c-L}
+(past-only).  For L = 1 and P = 2 this is a P-block upper bidiagonal.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+C = 3 (chunks)
+T = C * P = 6 (total stream length)
+D = 4 (per-token feature dim)
+L = 1 (left-context, in chunks)
+"""
+
+import numpy as np
+
+P, C, D, L = 2, 3, 4, 1
+T = C * P
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", a, b)                   # [T, T]
+
+idx       = np.arange(T)
+chunk_id  = idx // P
+diff      = chunk_id[:, None] - chunk_id[None, :]
+mask      = ((diff >= -L) & (diff <= 0)).astype(np.float32)
+
+masked = scores * mask
+output = masked.sum(axis=0).astype(np.float32)          # [T]
+
+np.savez("io.npz", a=a, b=b, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex04-banded-causal/graph.nnef b/harness/pulse-multi-axis/ex04-banded-causal/graph.nnef
new file mode 100644
index 0000000000..8b9a1a5696
--- /dev/null
+++ b/harness/pulse-multi-axis/ex04-banded-causal/graph.nnef
@@ -0,0 +1,60 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Banded-causal pairwise interaction between two streams.
+#
+# Same shape as ex03 but with a *causal* mask: every row at chunk c
+# attends only to chunks {c, c-1, ..., c-L} (no future lookahead).
+# For L = 1 and P = 2 the mask is the upper bidiagonal P-block:
+#
+#       j-chunk:    0   1   2
+#                 +---+---+---+
+#       i-chunk 0 | 1 | 1 | 0 |
+#               1 | 0 | 1 | 1 |
+#               2 | 0 | 0 | 1 |
+#                 +---+---+---+
+#
+# Inputs:  a, b  both [T, 4]
+#
+# scores[i, j] = sum_d a[i, d] * b[j, d]                      [T, T]
+# diff[i, j]   = floor(i / P) - floor(j / P)                   P = 2
+# mask[i, j]   = (-L <= diff[i, j]) and (diff[i, j] <= 0)      L = 1
+# masked       = scores * mask                                [T, T]
+# output[j]    = sum_i masked[i, j]                           [T]
+#
+# Streaming axis: 0 on both inputs and on the output.
+# Pulse size P = 2.  Per pulse: P new samples each on a, b -> P new
+# samples on output.  No output delay (the band is causal: at j-time
+# c_j we only need a-chunks {c_j, c_j-1, ..., c_j-L} which are all in
+# the past or current).  Blockify recognises the banded form via the
+# `Mul([Ge(0, diff), Ge(diff, -1)])` AST, wraps the contracted-axis
+# input with `WindowOnAxis(W=2, start=-1)` (past-window flavour), and
+# the resulting graph pulsifies on the chunk axis.
+
+graph network(a, b) -> (output)
+{
+    a = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    scores = tract_core_einsum([a, b], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(a)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row = unsqueeze(chunk_id, axes = [1]);
+    chunk_col = unsqueeze(chunk_id, axes = [0]);
+    diff      = chunk_row - chunk_col;
+
+    left_ok  = ge(diff, -1);
+    right_ok = le(diff, 0);
+    in_band  = and(left_ok, right_ok);
+
+    mask_f32 = tract_core_cast(in_band, to = 'f32');
+    masked   = mul(scores, mask_f32);
+
+    reduced = sum_reduce(masked, axes = [0]);
+    output  = squeeze(reduced, axes = [0]);
+}
diff --git a/harness/pulse-multi-axis/ex04-banded-causal/io.npz b/harness/pulse-multi-axis/ex04-banded-causal/io.npz
new file mode 100644
index 0000000000..bb2f0fa0b2
Binary files /dev/null and b/harness/pulse-multi-axis/ex04-banded-causal/io.npz differ
diff --git a/harness/pulse-multi-axis/ex04-banded-causal/runme.sh b/harness/pulse-multi-axis/ex04-banded-causal/runme.sh
new file mode 100755
index 0000000000..2a9a31bf0a
--- /dev/null
+++ b/harness/pulse-multi-axis/ex04-banded-causal/runme.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --set T=6 . run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — causal banded mask (-1 ≤ diff ≤ 0).  PulsePad zero-fills
+# the leading past-window slot so the streamed output's stream.delay
+# is 0 and the output matches batch 1:1 from index 0.
+$TRACT_RUN --nnef-tract-core . --pulse 'T=2' run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex05-banded-bilinear/gen-inputs.py b/harness/pulse-multi-axis/ex05-banded-bilinear/gen-inputs.py
new file mode 100644
index 0000000000..8495163e4a
--- /dev/null
+++ b/harness/pulse-multi-axis/ex05-banded-bilinear/gen-inputs.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the banded-bilinear synthetic.
+
+ex02-style Q·Kᵀ → mask → attn·V structure with ex03's banded mask.
+Mask `0 ≤ chunk(i) - chunk(j) ≤ 1` combined with the bilinear's
+contraction over j gives, per output i-chunk c_i, a sum over j-chunks
+in [c_i - 1, c_i] — past+current.  Streaming output is causal in i.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+S = 3 (chunks)
+T = S * P = 6
+D = 4
+L = 1 (band width on diff: 0..L)
+"""
+
+import numpy as np
+
+P, S, D, L = 2, 3, 4, 1
+T = S * P
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+c = rng.standard_normal((T, D)).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", a, b)                   # [T, T]
+idx       = np.arange(T)
+chunk_id  = idx // P
+diff      = chunk_id[:, None] - chunk_id[None, :]
+mask      = ((diff >= 0) & (diff <= L)).astype(np.float32)
+masked    = scores * mask
+output    = np.einsum("ij,jd->id", masked, c).astype(np.float32)  # [T, D]
+
+np.savez("io.npz", a=a, b=b, c=c, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} c={c.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex05-banded-bilinear/graph.nnef b/harness/pulse-multi-axis/ex05-banded-bilinear/graph.nnef
new file mode 100644
index 0000000000..28b296122f
--- /dev/null
+++ b/harness/pulse-multi-axis/ex05-banded-bilinear/graph.nnef
@@ -0,0 +1,53 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Banded pairwise interaction with a value tensor (ex02 + ex03 mask).
+#
+# Q·Kᵀ → banded mask → attn·V, no softmax.  Same SDPA-without-softmax
+# shape as ex02, but with a banded mask `0 ≤ chunk(i) - chunk(j) ≤ 1`
+# instead of a block-diagonal one.  Combined with the ex02-style EinSum
+# terminator (which contracts j), the per-output-row dependency is on
+# j-chunks `[chunk(i) - 1, chunk(i)]` — a causal-in-i past+current
+# window.
+#
+# Inputs:  a, b, c  all [T, 4]
+#
+# scores[i, j] = sum_d a[i, d] * b[j, d]                      [T, T]
+# diff[i, j]   = floor(i / P) - floor(j / P)                   P = 2
+# mask[i, j]   = (0 <= diff[i, j]) and (diff[i, j] <= 1)
+# masked       = scores * mask                                [T, T]
+# output[i, d] = sum_j masked[i, j] * c[j, d]                 [T, 4]
+#
+# Streaming axis: 0 on all inputs and on the output.
+# Pulse size P = 2.  Per pulse: P new samples each on a, b, c -> P new
+# output rows of size D.  Output stream.delay = 0 (causal in i: per
+# i-chunk we only need j-chunks at chunk(i) - 1 and chunk(i), both of
+# which have already arrived).
+
+graph network(a, b, c) -> (output)
+{
+    a = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    c = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    scores = tract_core_einsum([a, b], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(a)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row = unsqueeze(chunk_id, axes = [1]);
+    chunk_col = unsqueeze(chunk_id, axes = [0]);
+    diff      = chunk_row - chunk_col;
+
+    left_ok  = ge(diff, 0);
+    right_ok = le(diff, 1);
+    in_band  = and(left_ok, right_ok);
+
+    mask_f32 = tract_core_cast(in_band, to = 'f32');
+    masked   = mul(scores, mask_f32);
+
+    output = tract_core_einsum([masked, c], expr = "ij,jd->id", acc = "f32");
+}
diff --git a/harness/pulse-multi-axis/ex05-banded-bilinear/io.npz b/harness/pulse-multi-axis/ex05-banded-bilinear/io.npz
new file mode 100644
index 0000000000..d8d797782d
Binary files /dev/null and b/harness/pulse-multi-axis/ex05-banded-bilinear/io.npz differ
diff --git a/harness/pulse-multi-axis/ex05-banded-bilinear/runme.sh b/harness/pulse-multi-axis/ex05-banded-bilinear/runme.sh
new file mode 100755
index 0000000000..aa7a1f9e4b
--- /dev/null
+++ b/harness/pulse-multi-axis/ex05-banded-bilinear/runme.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --set T=6 . run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — banded mask + EinSum terminator (ex02 + ex03 mask).
+# Terminator contracts axis_b (j); Blockify windows the j-side input
+# (b) at the initiator AND the j-side auxiliary (c) at the terminator,
+# both with `start = -upper` (past+current relative to kept axis i).
+# Output stream.delay = 0 (causal in i).
+$TRACT_RUN --nnef-tract-core . --pulse 'T=2' run --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex06-block-diag-sdpa/gen-inputs.py b/harness/pulse-multi-axis/ex06-block-diag-sdpa/gen-inputs.py
new file mode 100644
index 0000000000..fab83dad09
--- /dev/null
+++ b/harness/pulse-multi-axis/ex06-block-diag-sdpa/gen-inputs.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the block-diagonal SDPA synthetic.
+
+Q·Kᵀ → ScaledMaskedSoftmax(scale=0.5, bool block-diag mask) → attn·V.
+
+The bool-mask SMS semantics:
+    pre  = where(mask, scores * scale, -inf)
+    attn = softmax(pre, axis=-1)
+    out  = einsum("ij,jd->id", attn, V)
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+S = 3 (chunks)
+T = S * P = 6
+D = 4
+"""
+
+import numpy as np
+
+P, S, D = 2, 3, 4
+T = S * P
+SCALE = 0.5
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+c = rng.standard_normal((T, D)).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", a, b)              # [T, T]
+
+idx       = np.arange(T)
+chunk_id  = idx // P
+in_block  = (chunk_id[:, None] == chunk_id[None, :])  # bool [T, T]
+
+scaled    = scores * SCALE
+pre       = np.where(in_block, scaled, -np.inf)
+attn      = np.exp(pre - pre.max(axis=-1, keepdims=True))
+attn      = attn / attn.sum(axis=-1, keepdims=True)
+attn      = attn.astype(np.float32)
+
+output    = np.einsum("ij,jd->id", attn, c).astype(np.float32)  # [T, D]
+
+np.savez("io.npz", a=a, b=b, c=c, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} c={c.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex06-block-diag-sdpa/graph.nnef b/harness/pulse-multi-axis/ex06-block-diag-sdpa/graph.nnef
new file mode 100644
index 0000000000..71d546f392
--- /dev/null
+++ b/harness/pulse-multi-axis/ex06-block-diag-sdpa/graph.nnef
@@ -0,0 +1,43 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_registry tract_transformers;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Block-diagonal SDPA without softmax-free shortcut: the real shape.
+#
+# Q·Kᵀ → ScaledMaskedSoftmax(scale, bool block-diag mask) → attn·V.
+#
+# The body op (ScaledMaskedSoftmax) is non-trivial — unlike ex02's
+# `Mul(scores, mask_f32)` which collapses to identity in the chunked
+# form, this op actually does compute work (scale + softmax over the
+# last axis).  When the mask is block-diagonal, the chunked equivalent
+# is `softmax(scores * scale, axis=-1)` over the per-chunk [k, k] score
+# matrix — equivalent to `ScaledMaskedSoftmax` with an all-true bool
+# mask, since within each chunk every (i, j) pair is in-band.
+#
+# Inputs:  a (queries), b (keys), c (values)  all [T, 4]
+# Output: [T, 4]
+#
+# Streaming axis: 0 on all inputs and on the output.  Pulse size P = 2.
+
+graph network(a, b, c) -> (output)
+{
+    a = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    c = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    scores = tract_core_einsum([a, b], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(a)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row = unsqueeze(chunk_id, axes = [1]);
+    chunk_col = unsqueeze(chunk_id, axes = [0]);
+    in_block  = eq(chunk_row, chunk_col);
+
+    attn = tract_transformers_scaled_masked_softmax(scores, in_block, scale = 0.5, post_softmax_mask = false);
+
+    output = tract_core_einsum([attn, c], expr = "ij,jd->id", acc = "f32");
+}
diff --git a/harness/pulse-multi-axis/ex06-block-diag-sdpa/io.npz b/harness/pulse-multi-axis/ex06-block-diag-sdpa/io.npz
new file mode 100644
index 0000000000..08f37215f8
Binary files /dev/null and b/harness/pulse-multi-axis/ex06-block-diag-sdpa/io.npz differ
diff --git a/harness/pulse-multi-axis/ex06-block-diag-sdpa/runme.sh b/harness/pulse-multi-axis/ex06-block-diag-sdpa/runme.sh
new file mode 100755
index 0000000000..d30806fc22
--- /dev/null
+++ b/harness/pulse-multi-axis/ex06-block-diag-sdpa/runme.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers --set T=6 . run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — block-diag SDPA: Q·Kᵀ → ScaledMaskedSoftmax → attn·V.
+# The body op ScaledMaskedSoftmax has to participate in the chunked
+# rewrite (unlike ex02's Mul-by-mask which collapses to identity).
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers . --pulse 'T=2' run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex07-banded-causal-sdpa/gen-inputs.py b/harness/pulse-multi-axis/ex07-banded-causal-sdpa/gen-inputs.py
new file mode 100644
index 0000000000..20aa6225aa
--- /dev/null
+++ b/harness/pulse-multi-axis/ex07-banded-causal-sdpa/gen-inputs.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the banded causal SDPA synthetic.
+
+Q·Kᵀ → ScaledMaskedSoftmax(scale=0.5, banded bool mask) → attn·V, with
+mask `0 ≤ chunk(i) - chunk(j) ≤ 1` (causal in i with 1-chunk left
+context).
+
+Per output i-chunk c_i, attention runs over j with chunk(j) ∈ [c_i - 1,
+c_i].  At c_i = 0 the past j-chunk doesn't exist; the softmax shrinks
+to chunk 0's k positions only.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+S = 3 (chunks)
+T = S * P = 6
+D = 4
+L = 1 (band width: 0..L)
+"""
+
+import numpy as np
+
+P, S, D, L = 2, 3, 4, 1
+T = S * P
+SCALE = 0.5
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+c = rng.standard_normal((T, D)).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", a, b)              # [T, T]
+idx       = np.arange(T)
+chunk_id  = idx // P
+diff      = chunk_id[:, None] - chunk_id[None, :]
+mask      = (diff >= 0) & (diff <= L)              # bool [T, T]
+
+scaled    = scores * SCALE
+pre       = np.where(mask, scaled, -np.inf)
+attn      = np.exp(pre - pre.max(axis=-1, keepdims=True))
+attn      = attn / attn.sum(axis=-1, keepdims=True)
+attn      = attn.astype(np.float32)
+
+output    = np.einsum("ij,jd->id", attn, c).astype(np.float32)  # [T, D]
+
+np.savez("io.npz", a=a, b=b, c=c, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} c={c.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex07-banded-causal-sdpa/graph.nnef b/harness/pulse-multi-axis/ex07-banded-causal-sdpa/graph.nnef
new file mode 100644
index 0000000000..17cd02d272
--- /dev/null
+++ b/harness/pulse-multi-axis/ex07-banded-causal-sdpa/graph.nnef
@@ -0,0 +1,47 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_registry tract_transformers;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Banded causal SDPA — the conformer/whisper-style attention shape.
+#
+# Q·Kᵀ → ScaledMaskedSoftmax(banded mask, scale=0.5) → attn·V.
+#
+# Mask: `0 ≤ chunk(i) - chunk(j) ≤ 1`.  Combined with the terminator
+# EinSum contracting `j`, the per-output-i-chunk dependency is on
+# j-chunks `[chunk(i) - 1, chunk(i)]` — causal in `i` with one chunk
+# of left context.
+#
+# Boundary case: at chunk(i) = 0 the past j-chunk doesn't exist; the
+# softmax should normalise only over chunk(j) = 0.  Mul-by-mask +
+# Reduce/EinSum bodies (ex01-ex05) shrug this off because zero-data
+# annihilates the masked-out positions naturally.  SMS does NOT —
+# `exp(0·scale) = 1` leaks into the denominator.  Today's blockify
+# substitutes the mask with a constant all-true block, which is the
+# wrong answer at this boundary.  This synthetic surfaces the bug.
+
+graph network(a, b, c) -> (output)
+{
+    a = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    c = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    scores = tract_core_einsum([a, b], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(a)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row = unsqueeze(chunk_id, axes = [1]);
+    chunk_col = unsqueeze(chunk_id, axes = [0]);
+    diff      = chunk_row - chunk_col;
+
+    left_ok  = ge(diff, 0);
+    right_ok = le(diff, 1);
+    in_band  = and(left_ok, right_ok);
+
+    attn = tract_transformers_scaled_masked_softmax(scores, in_band, scale = 0.5, post_softmax_mask = false);
+
+    output = tract_core_einsum([attn, c], expr = "ij,jd->id", acc = "f32");
+}
diff --git a/harness/pulse-multi-axis/ex07-banded-causal-sdpa/io.npz b/harness/pulse-multi-axis/ex07-banded-causal-sdpa/io.npz
new file mode 100644
index 0000000000..b8c3a68bdd
Binary files /dev/null and b/harness/pulse-multi-axis/ex07-banded-causal-sdpa/io.npz differ
diff --git a/harness/pulse-multi-axis/ex07-banded-causal-sdpa/runme.sh b/harness/pulse-multi-axis/ex07-banded-causal-sdpa/runme.sh
new file mode 100755
index 0000000000..a73c12d701
--- /dev/null
+++ b/harness/pulse-multi-axis/ex07-banded-causal-sdpa/runme.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers --set T=6 . run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — Q·Kᵀ → SMS(banded mask) → attn·V.  At chunk(i) = 0 the
+# past j-chunk doesn't exist.  Blockify chunkifies the mask construction
+# faithfully (Sub/Ge/Le/And replayed in chunked form, with WindowOnAxis
+# on the contracted side using a sentinel pad value so the band predicate
+# evaluates "out of band" on boundary slots), so SMS sees a chunked mask
+# that agrees with the batch reference at every chunk including chunk 0.
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers . --pulse 'T=2' run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/gen-inputs.py b/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/gen-inputs.py
new file mode 100644
index 0000000000..06cbf1eba1
--- /dev/null
+++ b/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/gen-inputs.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""Generate io.npz for the block-diag select+softmax SDPA synthetic.
+
+Same semantics as ex06 but with explicit `select(mask, scores, -inf)`
+instead of `ScaledMaskedSoftmax` — see graph.nnef for why.  No scale.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+S = 3 (chunks)
+T = S * P = 6
+D = 4
+"""
+
+import numpy as np
+
+P, S, D = 2, 3, 4
+T = S * P
+
+rng = np.random.default_rng(42)
+
+a = rng.standard_normal((T, D)).astype(np.float32)
+b = rng.standard_normal((T, D)).astype(np.float32)
+c = rng.standard_normal((T, D)).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", a, b)              # [T, T]
+
+idx       = np.arange(T)
+chunk_id  = idx // P
+in_block  = (chunk_id[:, None] == chunk_id[None, :])  # bool [T, T]
+
+pre       = np.where(in_block, scores, -np.inf)
+attn      = np.exp(pre - pre.max(axis=-1, keepdims=True))
+attn      = attn / attn.sum(axis=-1, keepdims=True)
+attn      = attn.astype(np.float32)
+
+output    = np.einsum("ij,jd->id", attn, c).astype(np.float32)  # [T, D]
+
+np.savez("io.npz", a=a, b=b, c=c, output=output)
+print(f"Saved io.npz  a={a.shape} b={b.shape} c={c.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/graph.nnef b/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/graph.nnef
new file mode 100644
index 0000000000..8cfaffb1ad
--- /dev/null
+++ b/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/graph.nnef
@@ -0,0 +1,46 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_registry tract_transformers;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Block-diagonal SDPA via `select(mask, scores, -inf)` + softmax (the
+# explicit batch form of `ScaledMaskedSoftmax`).  Same semantics as
+# ex06, but the masking step is `select` instead of SMS.
+#
+# Why this matters: encoder-style attention graphs (e.g. the Nemotron
+# encoder, harness/sdpa-pulse/ex04+) write the mask-apply step as
+# `masked = select(bool_mask, scores, scores * 0.0 + -inf)`.  Declutter
+# folds `scores * 0.0 + -inf` into a `MultiBroadcastTo` of a scalar
+# `-inf` constant up to `scores`'s `[T, T]` shape, which lands inside
+# the multi-T-axis section as a non-EinSum, non-uniform_tdim initiator
+# — a third initiator flavour Blockify needs to handle.
+#
+# Inputs:  a (queries), b (keys), c (values)  all [T, 4]
+# Output: [T, 4]
+#
+# Streaming axis: 0 on all inputs and on the output.  Pulse size P = 2.
+
+graph network(a, b, c) -> (output)
+{
+    a = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    b = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    c = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    scores = tract_core_einsum([a, b], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(a)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row = unsqueeze(chunk_id, axes = [1]);
+    chunk_col = unsqueeze(chunk_id, axes = [0]);
+    in_block  = eq(chunk_row, chunk_col);
+
+    # Apply mask: keep scores where mask=true, -inf elsewhere.
+    # `scores * 0.0 + -inf` declutters to a const broadcast (-inf, [T,T]).
+    masked_scores = select(in_block, scores, scores * 0.0 + -inf);
+
+    attn   = softmax(masked_scores, axes = [1]);
+    output = tract_core_einsum([attn, c], expr = "ij,jd->id", acc = "f32");
+}
diff --git a/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/io.npz b/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/io.npz
new file mode 100644
index 0000000000..8679cf3901
Binary files /dev/null and b/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/io.npz differ
diff --git a/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/runme.sh b/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/runme.sh
new file mode 100755
index 0000000000..f265b443a3
--- /dev/null
+++ b/harness/pulse-multi-axis/ex08-block-diag-select-sdpa/runme.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers --set T=6 . run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — Q·Kᵀ → select(block-diag mask, scores, -inf) → softmax →
+# attn·V.  This exercises the `MultiBroadcastTo` initiator path in
+# Blockify: declutter folds `scores * 0.0 + -inf` to a scalar-(-inf)
+# broadcast to `[T, T]`, which lands as a non-data initiator inside
+# the multi-T-axis section.
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers . --pulse 'T=2' run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex09-two-chained-sdpa/gen-inputs.py b/harness/pulse-multi-axis/ex09-two-chained-sdpa/gen-inputs.py
new file mode 100644
index 0000000000..585194c1fd
--- /dev/null
+++ b/harness/pulse-multi-axis/ex09-two-chained-sdpa/gen-inputs.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Generate io.npz for two chained block-diag SDPA layers.
+
+Layer 1: SDPA(q,  k1, v1)        with block-diag mask, scale=0.5.
+Layer 2: SDPA(L1, k2, v2)        same mask, scale=0.5.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+S = 3 (chunks)
+T = S * P = 6
+D = 4
+"""
+
+import numpy as np
+
+P, S, D = 2, 3, 4
+T = S * P
+SCALE = 0.5
+
+rng = np.random.default_rng(42)
+
+q  = rng.standard_normal((T, D)).astype(np.float32)
+k1 = rng.standard_normal((T, D)).astype(np.float32)
+v1 = rng.standard_normal((T, D)).astype(np.float32)
+k2 = rng.standard_normal((T, D)).astype(np.float32)
+v2 = rng.standard_normal((T, D)).astype(np.float32)
+
+
+def block_diag_sdpa(qx, kx, vx):
+    scores = np.einsum("id,jd->ij", qx, kx)
+    idx = np.arange(T)
+    chunk_id = idx // P
+    in_block = (chunk_id[:, None] == chunk_id[None, :])
+    scaled = scores * SCALE
+    pre = np.where(in_block, scaled, -np.inf)
+    attn = np.exp(pre - pre.max(axis=-1, keepdims=True))
+    attn = attn / attn.sum(axis=-1, keepdims=True)
+    return np.einsum("ij,jd->id", attn.astype(np.float32), vx).astype(np.float32)
+
+
+layer1 = block_diag_sdpa(q, k1, v1)
+output = block_diag_sdpa(layer1, k2, v2)
+
+np.savez("io.npz", q=q, k1=k1, v1=v1, k2=k2, v2=v2, output=output)
+print(f"Saved io.npz  q={q.shape} k1={k1.shape} v1={v1.shape} k2={k2.shape} v2={v2.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex09-two-chained-sdpa/graph.nnef b/harness/pulse-multi-axis/ex09-two-chained-sdpa/graph.nnef
new file mode 100644
index 0000000000..c2be520bb3
--- /dev/null
+++ b/harness/pulse-multi-axis/ex09-two-chained-sdpa/graph.nnef
@@ -0,0 +1,46 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_registry tract_transformers;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# Two chained block-diagonal SDPA blocks.  The first block produces
+# `[T, D]`, which feeds as the queries of the second.  Each block has
+# its own (K, V) inputs.  Same chunk axis and mask construction across
+# both blocks (single chunk_id wire, shared between the two Eq nodes).
+#
+# Why this matters: the recogniser detects independent quadratic
+# sections via connected components of multi-T-axis nodes; this test
+# exercises *two* sections in the same model, each ChunkSize = 2,
+# block-diagonal mask.  Each section is rewritten by its own
+# TypedModelPatch and applied in sequence.
+#
+# Streaming axis: 0 on all inputs and on the output.  Pulse size P = 2.
+
+graph network(q, k1, v1, k2, v2) -> (output)
+{
+    q  = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    k1 = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    v1 = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    k2 = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    v2 = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    # Shared chunk-id wire (T → chunk_id = pos / 2).
+    pos      = tract_core_range(0, tract_core_shape_of(q)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row = unsqueeze(chunk_id, axes = [1]);
+    chunk_col = unsqueeze(chunk_id, axes = [0]);
+    in_block  = eq(chunk_row, chunk_col);
+
+    # Block 1: SDPA(q, k1, v1).
+    scores1 = tract_core_einsum([q, k1], expr = "id,jd->ij", acc = "f32");
+    attn1   = tract_transformers_scaled_masked_softmax(scores1, in_block, scale = 0.5, post_softmax_mask = false);
+    block1  = tract_core_einsum([attn1, v1], expr = "ij,jd->id", acc = "f32");
+
+    # Block 2: SDPA(block1, k2, v2) — block1 acts as the queries here.
+    scores2 = tract_core_einsum([block1, k2], expr = "id,jd->ij", acc = "f32");
+    attn2   = tract_transformers_scaled_masked_softmax(scores2, in_block, scale = 0.5, post_softmax_mask = false);
+    output  = tract_core_einsum([attn2, v2], expr = "ij,jd->id", acc = "f32");
+}
diff --git a/harness/pulse-multi-axis/ex09-two-chained-sdpa/io.npz b/harness/pulse-multi-axis/ex09-two-chained-sdpa/io.npz
new file mode 100644
index 0000000000..50b624e5f2
Binary files /dev/null and b/harness/pulse-multi-axis/ex09-two-chained-sdpa/io.npz differ
diff --git a/harness/pulse-multi-axis/ex09-two-chained-sdpa/runme.sh b/harness/pulse-multi-axis/ex09-two-chained-sdpa/runme.sh
new file mode 100755
index 0000000000..0e355d7092
--- /dev/null
+++ b/harness/pulse-multi-axis/ex09-two-chained-sdpa/runme.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers --set T=6 . run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — two chained block-diagonal SDPA blocks.  The recogniser
+# finds two quadratic sections (one per block); each is rewritten by
+# its own TypedModelPatch.  The shared chunk-id wire is preserved
+# (faithful chunkification of the mask chain in both sections).
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers . --pulse 'T=2' run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex10-conv-then-sdpa/gen-inputs.py b/harness/pulse-multi-axis/ex10-conv-then-sdpa/gen-inputs.py
new file mode 100644
index 0000000000..744f8b4803
--- /dev/null
+++ b/harness/pulse-multi-axis/ex10-conv-then-sdpa/gen-inputs.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+"""Generate io.npz for max-pool then block-diag SDPA.
+
+Pre-pool the queries via a 3-tap max pool (kernel=3, padding=1) on the
+streaming axis, then block-diag SDPA(q_pooled, k, v) with scale=0.5.
+
+Parameters
+----------
+P = 2 (pulse / chunk size)
+S = 3 (chunks)
+T = S * P = 6
+D = 4
+"""
+
+import numpy as np
+
+P, S, D = 2, 3, 4
+T = S * P
+SCALE = 0.5
+
+rng = np.random.default_rng(42)
+
+q = rng.standard_normal((T, D)).astype(np.float32)
+k = rng.standard_normal((T, D)).astype(np.float32)
+v = rng.standard_normal((T, D)).astype(np.float32)
+
+# Max-pool kernel=5, padding=2, stride=1 along the time axis.
+# Delay = (kernel - 1) / 2 = 2, exactly one chunk at P=2.  tract's
+# max_pool with border='constant' ignores padded positions
+# (equivalent to filling with -inf in the max).
+q_padded = np.pad(q, ((2, 2), (0, 0)), mode="constant", constant_values=-np.inf)
+q_pre = np.stack(
+    [q_padded[i : i + T] for i in range(5)], axis=0
+).max(axis=0).astype(np.float32)
+
+scores = np.einsum("id,jd->ij", q_pre, k)
+idx = np.arange(T)
+chunk_id = idx // P
+in_block = (chunk_id[:, None] == chunk_id[None, :])
+scaled = scores * SCALE
+pre = np.where(in_block, scaled, -np.inf)
+attn = np.exp(pre - pre.max(axis=-1, keepdims=True))
+attn = (attn / attn.sum(axis=-1, keepdims=True)).astype(np.float32)
+output = np.einsum("ij,jd->id", attn, v).astype(np.float32)
+
+np.savez("io.npz", q=q, k=k, v=v, output=output)
+print(f"Saved io.npz  q={q.shape} k={k.shape} v={v.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex10-conv-then-sdpa/graph.nnef b/harness/pulse-multi-axis/ex10-conv-then-sdpa/graph.nnef
new file mode 100644
index 0000000000..cb633247f2
--- /dev/null
+++ b/harness/pulse-multi-axis/ex10-conv-then-sdpa/graph.nnef
@@ -0,0 +1,57 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_registry tract_transformers;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# 1D convolution (kernel=3, padding=1) on the queries, then block-
+# diagonal SDPA.  The conv adds a streaming-axis delay of 1 per pulse;
+# the SDPA section sits downstream and must chunk-rewrite while
+# carrying that delay through.
+#
+# Layout: queries are reshaped to [1, 1, T, 4] (NCHW with H=time so the
+# conv operates on H) — kernel size [1, 1, 3, 1] with padding
+# [(0,0), (0,0), (1,1), (0,0)] keeps the time dim at T.  Reshape back
+# to [T, 4] to feed the SDPA.
+#
+# Streaming axis: 0 on `q`/`k`/`v` and on `output`.  Pulse size P = 2.
+
+graph network(q, k, v) -> (output)
+{
+    q = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    k = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    v = tract_core_external(shape = [T, 4], datum_type = 'f32');
+
+    # Lift to NCHW [1, 1, T, 4] for the pool, then back.  Max-pool of
+    # size 5 with symmetric padding 2 keeps the time dim at T and
+    # introduces a streaming-axis delay of 2 — exactly one chunk at
+    # P=2.  (A kernel-3 / pad-1 pool would give a 1-element delay,
+    # which doesn't divide the chunk size and can't be carried through
+    # the chunkifying Reshape.  REVISIT: extend `PulsedReshape`'s
+    # delay-rescaling to handle non-multiple cases via stream-axis
+    # alignment padding.)
+    q_3d   = unsqueeze(q, axes = [0]);            # [1, T, 4]
+    q_4d   = unsqueeze(q_3d, axes = [0]);         # [1, 1, T, 4]
+    pooled = max_pool(q_4d,
+                      size    = [1, 1, 5, 1],
+                      padding = [(0, 0), (0, 0), (2, 2), (0, 0)],
+                      stride  = [1, 1, 1, 1],
+                      dilation = [1, 1, 1, 1],
+                      border  = 'constant');
+    pooled_3d = squeeze(pooled, axes = [0]);      # [1, T, 4]
+    q_pre     = squeeze(pooled_3d, axes = [0]);   # [T, 4]
+
+    # Block-diagonal SDPA on the pooled queries.
+    scores = tract_core_einsum([q_pre, k], expr = "id,jd->ij", acc = "f32");
+
+    pos      = tract_core_range(0, tract_core_shape_of(q)[0], step = 1);
+    chunk_id = pos / 2;
+
+    chunk_row = unsqueeze(chunk_id, axes = [1]);
+    chunk_col = unsqueeze(chunk_id, axes = [0]);
+    in_block  = eq(chunk_row, chunk_col);
+
+    attn   = tract_transformers_scaled_masked_softmax(scores, in_block, scale = 0.5, post_softmax_mask = false);
+    output = tract_core_einsum([attn, v], expr = "ij,jd->id", acc = "f32");
+}
diff --git a/harness/pulse-multi-axis/ex10-conv-then-sdpa/io.npz b/harness/pulse-multi-axis/ex10-conv-then-sdpa/io.npz
new file mode 100644
index 0000000000..0afcda9a8c
Binary files /dev/null and b/harness/pulse-multi-axis/ex10-conv-then-sdpa/io.npz differ
diff --git a/harness/pulse-multi-axis/ex10-conv-then-sdpa/runme.sh b/harness/pulse-multi-axis/ex10-conv-then-sdpa/runme.sh
new file mode 100755
index 0000000000..c8c6bee94c
--- /dev/null
+++ b/harness/pulse-multi-axis/ex10-conv-then-sdpa/runme.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+cd `dirname $0`
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+# Batch
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers --set T=6 . run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified — max_pool(kernel=3, padding=1) on the queries, then a
+# block-diagonal SDPA section.  At pulse time the pool emits a
+# streaming-axis Delay of 1; the downstream SDPA section is rewritten
+# in chunked form by Blockify and inherits that delay through the
+# normal pulse machinery.
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers . --pulse 'T=2' run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/gen-inputs.py b/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/gen-inputs.py
new file mode 100644
index 0000000000..03d8ff944e
--- /dev/null
+++ b/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/gen-inputs.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""ex13-padmask-broadcast-transpose — io.npz generator.
+
+T = 4, chunk size P = 2 → 2 chunks of 2 tokens each.
+The pad mask marks the last frame as padding (invalid).
+
+Reference computation:
+    scores = q · kᵀ                        # [T, T]
+    pad_2d[i, j] = pad[i] AND pad[j]       # [T, T]
+    in_block[i, j] = (i // P == j // P)    # [T, T]
+    mask     = in_block AND pad_2d         # [T, T]
+    masked   = where(mask, scores, -inf)
+    attn     = softmax(masked, axis=-1)    # rows where every key is masked → NaN
+    output   = attn · c                    # [T, D]
+"""
+import numpy as np
+
+T, D, P = 4, 4, 2
+rng = np.random.default_rng(13)
+
+q   = rng.standard_normal((T, D)).astype(np.float32)
+k   = rng.standard_normal((T, D)).astype(np.float32)
+c   = rng.standard_normal((T, D)).astype(np.float32)
+# All frames valid in this minimal version.  The pad-mask construction
+# pattern (broadcast → transpose → AND) reproduces regardless of which
+# bits are set; using all-True keeps the reference output finite (a
+# padded row would be fully masked under block-diag and produce NaN
+# from softmax).
+pad = np.ones(T, dtype=bool)
+
+scores  = q @ k.T                                         # [T, T]
+pad_2d  = pad[:, None] & pad[None, :]                     # [T, T]
+
+idx       = np.arange(T)
+chunk_id  = idx // P
+diff      = chunk_id[:, None] - chunk_id[None, :]
+in_block  = (diff >= 0) & (diff <= 1)                     # banded-causal [T, T]
+
+mask    = in_block & pad_2d
+masked  = np.where(mask, scores, -np.inf)
+
+# Softmax row-by-row, with a guard against fully-masked rows producing NaN.
+m       = masked.max(axis=-1, keepdims=True)
+exp_s   = np.exp(masked - m)
+denom   = exp_s.sum(axis=-1, keepdims=True)
+attn    = np.where(denom > 0, exp_s / denom, 0.0).astype(np.float32)
+
+output  = (attn @ c).astype(np.float32)
+
+np.savez("io.npz", q=q, k=k, c=c, pad=pad, output=output)
+print(f"Saved io.npz  q={q.shape} pad={pad.shape} output={output.shape}")
diff --git a/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/graph.nnef b/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/graph.nnef
new file mode 100644
index 0000000000..78aee08fb0
--- /dev/null
+++ b/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/graph.nnef
@@ -0,0 +1,69 @@
+version 1.0;
+
+extension tract_registry tract_core;
+extension tract_registry tract_transformers;
+extension tract_symbol T;
+extension tract_assert T>=0;
+
+# ex13-padmask-broadcast-transpose — minimal repro of encoder.p1's
+# blockify failure on the pad-mask construction.
+#
+# The encoder builds its 2D pad-mask from a 1D per-frame validity mask
+# `pad [T]` via:
+#
+#     pad_2d_in   = unsqueeze(pad, [0])              # [1, T]
+#     pad_repeat  = MultiBroadcastTo(pad_2d_in,
+#                                    shape = [T, T]) # [T, T]
+#     pad_T       = transpose(pad_repeat, [1, 0])    # [T, T]
+#     pad_2d      = and(pad_repeat, pad_T)           # [T, T]
+#
+# Semantically: pad_2d[i, j] = pad[i] AND pad[j].
+#
+# Blockify treats `pad_repeat` as a streaming-MultiBroadcastTo
+# initiator (broadcast-from-1 slot becomes the within-chunk axis,
+# streaming axis becomes [chunks, k] split with windowing).  The body
+# transpose then has to swap two within-chunk axes that were chunked
+# differently (broadcast-from-1 → k, vs. windowed-streaming → W = L·k).
+# These have different sizes, so the AND of (chunked, transposed)
+# can't broadcast — same shape mismatch as encoder.p1's
+# `padMaskForAttMask_1` body op, at the smallest possible scale.
+
+graph network(q, k, c, pad) -> (output)
+{
+    q   = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    k   = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    c   = tract_core_external(shape = [T, 4], datum_type = 'f32');
+    pad = tract_core_external(shape = [T],    datum_type = 'bool');
+
+    # Content scores.
+    scores = matmul(q, k, transposeB = true);          # [T, T]
+
+    # Pad-mask 2D construction (encoder pattern).
+    pad_2d_in  = unsqueeze(pad, axes = [0]);           # [1, T]
+    pad_repeat = tract_core_broadcast(pad_2d_in, shape = [T, T]);
+    pad_T      = transpose(pad_repeat, axes = [1, 0]); # [T, T]
+    pad_2d     = and(pad_repeat, pad_T);               # [T, T]
+
+    # Banded-causal mask (P = 2 tokens per chunk, look-back L = 1):
+    # at chunk c we attend to chunks {c-1, c}.  W = 2·k = 4 keys.
+    # This makes the within-chunk T_q axis (size k=2, broadcast-from-1
+    # in the pad construction) and the within-chunk T_k axis (size W=4,
+    # windowed) different sizes — so the body AND between
+    # `pad_repeat` and `pad_T` shape-mismatches in the chunked frame.
+    pos      = tract_core_range(0, T, step = 1);
+    chunk_id = pos / 2;
+    chunk_r  = unsqueeze(chunk_id, axes = [1]);
+    chunk_c  = unsqueeze(chunk_id, axes = [0]);
+    diff     = chunk_r - chunk_c;
+    left_ok  = ge(diff, 0);
+    right_ok = le(diff, 1);
+    in_block = and(left_ok, right_ok);
+
+    # Combined mask: in-chunk AND both frames valid.
+    full_mask = and(in_block, pad_2d);
+
+    masked = select(full_mask, scores, scores * 0.0 + -inf);
+    attn   = softmax(masked, axes = [1]);
+
+    output = matmul(attn, c);
+}
diff --git a/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/io.npz b/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/io.npz
new file mode 100644
index 0000000000..e324400a15
Binary files /dev/null and b/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/io.npz differ
diff --git a/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/runme.sh b/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/runme.sh
new file mode 100755
index 0000000000..a94bfb9d48
--- /dev/null
+++ b/harness/pulse-multi-axis/ex13-padmask-broadcast-transpose/runme.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+# ex13-padmask-broadcast-transpose — minimal repro of the pad-mask
+# outer-AND pattern that encoder exports build for the 2D validity mask.
+#
+# Source layout: a 1D per-frame validity mask `pad [T]` is unsqueezed,
+# broadcast to `[T, T]`, transposed, then AND'd against the original
+# broadcast — yielding `pad_2d[i, j] = pad[i] AND pad[j]`.  Combined with
+# a banded-causal block mask (P=2, L=1 → W=4), the AND lifts a 1D
+# streaming source into a multi-T-axis score-shape wire, making it the
+# section's initiator.
+#
+# Two passes handle this end-to-end:
+# 1. `core/array/broadcast` declutter swaps each `Broadcast → AxisOp`
+#    branch through under fan-out, then the existing single-successor
+#    `Broadcast → TypedBinOp` elimination subsumes both broadcasts —
+#    the AND ends up consuming `[1, T]` and `[T, 1]` directly.
+# 2. Blockify's generic chunked-`TypedBinOp` section-initiator handler
+#    chunk-splits each AND input on its streaming axis, windows the one
+#    whose axis lands on the contracted (K) side, and wires the chunked
+#    AND with implicit broadcasting at the chunked rank.
+#
+# Batch and pulsified both pass.
+
+cd "$(dirname "$0")"
+set -ex
+
+: ${TRACT_RUN:=cargo run -p tract-cli $CARGO_OPTS --}
+
+python3 gen-inputs.py
+
+# Batch.
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers --set T=4 . run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
+
+# Pulsified.
+$TRACT_RUN --nnef-tract-core --nnef-tract-transformers . --pulse 'T=2' run \
+    --approx approximate \
+    --input-from-bundle io.npz --assert-output-bundle io.npz
diff --git a/hir/src/infer/factoid.rs b/hir/src/infer/factoid.rs
index a9ecc1c1e0..e4083e403d 100644
--- a/hir/src/infer/factoid.rs
+++ b/hir/src/infer/factoid.rs
@@ -205,9 +205,7 @@ impl ShapeFactoid {
     }
 
     pub fn as_concrete_finite(&self) -> TractResult<Option<TVec<usize>>> {
-        if self.open {
-            return Ok(None);
-        }
+        rule_if!(!self.open);
         Ok(self.dims.iter().map(|d| d.concretize().and_then(|d| d.to_usize().ok())).collect())
     }
 
diff --git a/hir/src/ops/array/strided_slice.rs b/hir/src/ops/array/strided_slice.rs
index 11f980cd9b..d895db30b1 100644
--- a/hir/src/ops/array/strided_slice.rs
+++ b/hir/src/ops/array/strided_slice.rs
@@ -76,6 +76,12 @@ impl InferenceRulesOp for StridedSlice {
                 let mut output_shape = input_shape.clone();
                 let mut shrink = vec![];
                 for (ix, axis) in axes.into_iter().enumerate() {
+                    ensure!(
+                        axis < input_shape.len(),
+                        "StridedSlice: axis {} out of range for input of rank {}",
+                        axis,
+                        input_shape.len()
+                    );
                     let preped =
                         self.prepare_one_dim(ix, &input_shape[axis], begin, end, &strides)?;
                     output_shape[axis] = preped.soft_len()?;
diff --git a/libcli/src/tensor.rs b/libcli/src/tensor.rs
index cc6c1a4037..d2fc9528ad 100644
--- a/libcli/src/tensor.rs
+++ b/libcli/src/tensor.rs
@@ -306,6 +306,13 @@ pub struct RunParams {
 
 pub struct RunTensors {
     pub sources: Vec<TVec<TValue>>,
+    /// In pulse mode, the *real* input length on the streaming axis
+    /// (i.e. before the trailing turns of zero-padding that
+    /// `get_or_make_tensors` adds to flush the pipeline).  Used by the
+    /// runner to bind the streaming symbol so PulsePad and friends
+    /// resolve `end_input` correctly at end-of-stream.  `None` for
+    /// non-pulse runs.
+    pub streaming_input_len: Option<usize>,
 }
 
 #[cfg(feature = "transformers")]
@@ -402,6 +409,7 @@ fn get_or_make_tensors(
     name: &str,
     input_idx: usize,
     target: &mut TVec<Vec<TValue>>,
+    streaming_input_len: &mut Option<usize>,
 ) -> TractResult<()> {
     if let Some(mut value) = params
         .tensors_values
@@ -468,6 +476,14 @@ fn get_or_make_tensors(
                     input_pulse_axis
                 );
             }
+            // Record the real streaming-axis length on the *first* input we
+            // see; downstream uses it to bind the streaming symbol so that
+            // PulsePad's `end_input` resolves to the correct end-of-stream.
+            // (All inputs share the same streaming dim, so picking the first
+            // is fine.)
+            if streaming_input_len.is_none() {
+                *streaming_input_len = Some(input_len);
+            }
 
             // how many pulses do we need to push full result out ?
             // guess by looking at len and delay of the first output
@@ -540,10 +556,19 @@ fn get_or_make_tensors(
 pub fn get_or_make_inputs(tract: &Arc<dyn Model>, params: &RunParams) -> TractResult<RunTensors> {
     // Resolve source inputs
     let mut tmp_inputs = tvec![];
+    let mut streaming_input_len = None;
     for (ix, input) in tract.input_outlets().iter().enumerate() {
         let fact = tract.outlet_typedfact(*input)?;
         let name = tract.node_name(input.node);
-        get_or_make_tensors(tract, params, fact, name, ix, &mut tmp_inputs)?;
+        get_or_make_tensors(
+            tract,
+            params,
+            fact,
+            name,
+            ix,
+            &mut tmp_inputs,
+            &mut streaming_input_len,
+        )?;
     }
 
     let n_turns = tmp_inputs.iter().map(|t| t.len()).max().unwrap_or(0);
@@ -556,7 +581,7 @@ pub fn get_or_make_inputs(tract: &Arc<dyn Model>, params: &RunParams) -> TractRe
         })
         .collect::<Vec<_>>();
 
-    Ok(RunTensors { sources })
+    Ok(RunTensors { sources, streaming_input_len })
 }
 
 fn make_inputs(values: &[impl std::borrow::Borrow<TypedFact>]) -> TractResult<TVec<TValue>> {
diff --git a/linalg/AMX_BENCH_RESULTS.md b/linalg/AMX_BENCH_RESULTS.md
new file mode 100644
index 0000000000..b1261e61d3
--- /dev/null
+++ b/linalg/AMX_BENCH_RESULTS.md
@@ -0,0 +1,85 @@
+# AMX validation & benchmark results
+
+Run of `linalg/AMX_BENCH_RUNBOOK.md` on real Intel AMX hardware.
+
+- **Host:** `Intel(R) Xeon(R) Processor @ 2.10GHz` (Sapphire/Emerald Rapids-class), 4 vCPU
+- **ISA:** `amx_tile amx_int8 amx_bf16` + AVX-512-VNNI; kernel `6.18.5` (≥5.16); binutils `2.42`; rustc `1.94.1`
+- **Branch:** `claude/zealous-galileo-fEQ3d` @ `7a23812`
+- **Method:** `cargo bench`, default criterion sampling, pinned to core 2 (`taskset -c 2`), idle box (load ≈ 1.0)
+- **Date:** 2026-06-02
+
+## 1. AMX live confirmation ✅
+
+Gate-check (`amx_i32` bench) produced `avx512amx_8x8`/`avx512amx_16x16` columns with real `thrpt:` numbers — **neither** "tract not built with AMX" (build probe) **nor** "AMX not available, skipping" (runtime CPUID + `arch_prctl` XTILEDATA gate) appeared. AMX is genuinely exercised.
+
+## 2. Correctness
+
+| Suite | Result |
+|---|---|
+| `cargo test -p tract-linalg --lib avx512amx` | **297 passed; 3 failed** |
+| `cargo test -p tract-linalg --lib x86_64_fma::mmm` | **1833 passed; 3 failed** |
+
+**Bugfix `99eb75b9d` VALIDATED on silicon** ✅ — every `scalar_sub` / `per_row_sub` / `per_col_sub` (+`_f`) test passed for **both** `avx512amx_mmm_i32_16x16` and `avx512amx_mmm_f32_16x16`.
+
+**3 failures — all in the AMX bf16 path** (`avx512amx_mmm_f32_16x16::f32f32_bf16`): `fuse::prop`, `frame::prop`, `fuse::packed_packed_bug_3`.
+
+**Root cause = test-harness tolerance, NOT a kernel defect.** `packed_packed.rs:367` selects the comparison tolerance from the **accumulator** dtype:
+```rust
+let app = if K::Acc::datum_type() == f16::datum_type()
+    { Approximation::SuperApproximate } else { Approximation::Approximate };
+```
+This kernel accumulates in **f32** (TDPBF16PS: bf16×bf16→f32), so it gets `Approximate` = `(atol 1e-4, rtol 5e-4, 0 outliers)` — but the `f32f32_bf16` packing truncates inputs to bf16 (~2⁻⁸ ≈ 0.39% rel). bf16-grade error is checked against an f32-grade bar with zero tolerated outliers ⇒ guaranteed failure. `SuperApproximate` `(atol 0.1, rtol 0.05, 1e-4 outliers)` would pass. The structurally identical int8 16×16 kernel passes 100%.
+
+**Proposed fix:** in `check()`, pick `SuperApproximate` when the packing is bf16-based, not only when `K::Acc == f16`.
+
+**Empirically verified (on the AMX host):** the kernel was run on 7 cases (including the exact `bug_3` input) and compared against an independent **bf16-truncated** reference — built with the project's own `f32_to_bf16_rne` — judged by the *same* tight `Approximate` bar: **0 outliers across ~335k output elements** (max abs err ≤ 1.3e-5), versus **282,788 outliers** against a pure-f32 reference. The kernel reproduces "truncate inputs→bf16, accumulate→f32" exactly; the 3 red tests are 100% the f32 oracle, with no kernel defect.
+
+## 3. Benchmarks — throughput (Gelem/s, point estimate)
+
+### `amx_i32` — int8 GEMM
+| M×K×N | avx2 | avx512vnni (8×8) | avx512amx_8×8 | avx512amx_16×16 |
+|---|---:|---:|---:|---:|
+| 64×256×64 | 0.41 | 11.21 | 68.41 | **233.64** |
+| 256×256×256 | 0.41 | 11.31 | 68.47 | **237.29** |
+| 512×512×512 | 0.39 | 8.94 † | 112.86 | **228.15** |
+| 1024×1024×64 | 0.41 | 34.84 | 178.42 | **279.51** |
+
+### `amx_f32` — bf16→f32 GEMM
+| M×K×N | fma_16×6 | avx512_16×12 | avx512amx_bf16_16×16 |
+|---|---:|---:|---:|
+| 64×256×64 | 37.12 | 64.31 | **207.35** |
+| 256×256×256 | 37.90 | 71.90 | **225.74** |
+| 512×512×512 | 39.37 | 64.69 | **348.38** |
+| 1024×1024×64 | 36.85 | 59.22 | **318.36** |
+
+### `vnni_i32` — int8 GEMM (new 16×16 in isolation)
+| M×K×N | avx2 | avx512vnni (8×8) | avx512vnni_16×16 |
+|---|---:|---:|---:|
+| 64×256×64 | 0.41 | 10.90 | **135.74** |
+| 256×256×256 | 0.40 | 10.78 | **134.92** |
+| 512×512×512 | 0.40 | 20.53 | **154.39** |
+| 1024×1024×64 | 0.41 | 34.77 | **161.27** |
+
+† `avx512vnni`@512³ read 8.94 here vs 20.53 in `vnni_i32` (same kernel/shape). Treat **20.53** as the credible value (it fits the size trend 11.3→20.5→34.8); 8.94 was an outlier. A higher-sampling re-measure was attempted but could not complete — see §6.
+
+## 4. Head-to-head ratios
+
+| Comparison | 64×256×64 | 256×256×256 | 512×512×512 | 1024×1024×64 |
+|---|---:|---:|---:|---:|
+| **AMX 16×16 ÷ VNNI 16×16** (int8, same CPU) | 1.72× | 1.76× | 1.48× | 1.73× |
+| **AMX 16×16 ÷ AMX 8×8** (int8) | 3.42× | 3.47× | 2.02× | 1.57× |
+| **VNNI 16×16 ÷ VNNI 8×8** (int8) | 12.45× | 12.51× | 7.52× | 4.64× |
+| **AMX bf16 16×16 ÷ AVX-512 f32 16×12** | 3.22× | 3.14× | 5.39× | 5.38× |
+| *(bonus) AMX bf16 ÷ FMA f32 16×6* | 5.59× | 5.96× | 8.85× | 8.64× |
+
+## 5. Findings
+
+1. **AMX int8 16×16 wins everywhere — justifies `boost(100)` > VNNI `boost(50)`.** 1.48–1.76× over the new VNNI 16×16 on the *same* silicon. Dispatch ordering is correct.
+2. **AMX 16×16 vs 8×8: 1.57–3.47×.** 16×16 leads on all tested shapes; the 4×-work/instr advantage is largest on compact shapes (3.4× @ 64×256×64) and narrowest on tall-skinny 1024×1024×64 (1.57×, N=64). No tested shape favors 8×8 — any crossover lives below this suite (smaller M or N<16). `qmmm_i32` defaulting to 16×16 here is sound.
+3. **VNNI 16×16 vs 8×8: 4.64–12.5× — far above the dev box's 1.3–2.1×.** Likely the 8×8 kernel's ymm (256-bit) accumulators vs the new kernel's zmm (512-bit), amplified on Sapphire Rapids (no AVX-512 license downclock that Cascade Lake suffers). Strongly validates the new kernel; the magnitude warrants one sanity re-check (see #4).
+4. **Data-quality flag (resolved by inspection):** `avx512vnni` 8×8 @ 512³ read 8.94 (in `amx_i32`) vs 20.53 (in `vnni_i32`) — a 2.3× swing on the same kernel/shape. **20.53 is the credible figure** (it continues the monotone size trend 11.3 @ 256³ → 20.5 @ 512³ → 34.8 @ 1024×1024×64; 8.94 breaks it). A `--sample-size 200` re-measure was launched but the AMX host was reclaimed before it could run (see §6); the ratio table already uses the consistent 20.53 pairing. AMX columns were stable across runs.
+5. **AMX bf16 is 3.1–5.4× the AVX-512 f32 kernel** (5.6–8.9× over FMA), scaling up on larger shapes (348 Gelem/s @ 512³) — with the documented bf16 precision trade (see §2 and `X86_64_INT8_GEMM.md`).
+
+## 6. Reproducibility note
+
+Numbers were collected **2026-06-02** on an AMX-capable `Intel(R) Xeon(R) @ 2.10GHz` (`amx_tile/int8/bf16` + AVX-512-VNNI, kernel 6.18.5). The ephemeral session container was subsequently reclaimed and re-provisioned onto a different `Intel(R) Xeon(R) @ 2.80GHz` with **neither AMX nor AVX-512-VNNI** (only `avx512f`), on which `amx_i32`/`vnni_i32` both short-circuit and skip — so the one outstanding re-measure (VNNI-8×8 @ 512³) could not be completed in this session. To reproduce or extend, run on an AMX host (Sapphire Rapids / Emerald Rapids / Granite Rapids Xeon, or Xeon Max) following `linalg/AMX_BENCH_RUNBOOK.md`.
diff --git a/linalg/AMX_BENCH_RUNBOOK.md b/linalg/AMX_BENCH_RUNBOOK.md
new file mode 100644
index 0000000000..c5fde39c07
--- /dev/null
+++ b/linalg/AMX_BENCH_RUNBOOK.md
@@ -0,0 +1,211 @@
+# AMX validation & benchmark runbook
+
+**For: a Claude Code session (or human) on an x86_64 CPU that has Intel AMX.**
+
+The kernel work on branch `claude/zealous-galileo-fEQ3d` was developed on a
+Cascade Lake-class container (AVX-512-VNNI, **no AMX**). Everything that can run
+without AMX is already validated there. This runbook covers the two things that
+box **could not** do and that need a real AMX CPU.
+
+## Your task
+
+**Benchmark every int8 / bf16 GEMM kernel in this tree on this AMX CPU — all the
+AMX kernels *and* the AVX-512-VNNI kernels we just improved — and run the AMX
+correctness suite.** Full kernel inventory to cover:
+
+| Kernel | ISA | Covered by bench |
+|---|---|---|
+| `avx512amx_mmm_i32_8x8` | AMX int8 (`tdpbssd`) | `amx_i32` |
+| `avx512amx_mmm_i32_16x16` | AMX int8 (`tdpbssd`) | `amx_i32` |
+| `avx512amx_mmm_f32_16x16` | AMX bf16→f32 (`tdpbf16ps`) | `amx_f32` |
+| `avx512vnni_mmm_i32_8x8` | AVX-512-VNNI (`vpdpbusd`) | `vnni_i32`, `amx_i32` |
+| **`avx512vnni_mmm_i32_16x16`** ← new | AVX-512-VNNI (`vpdpbusd`, zmm) | `vnni_i32` |
+| `avx2_mmm_i32_8x8` (baseline) | AVX2 | both i32 benches |
+
+Running the three benches in Step 4 covers all of the above. Yes — bench the VNNI
+kernels here too: an AMX CPU (Sapphire Rapids+) also has AVX-512-VNNI, so it's the
+one place you can measure AMX 16×16 and VNNI 16×16 **on the same silicon** and see
+how much AMX actually wins.
+
+In addition, this AMX CPU is the only place that can:
+
+1. **Correctness-test the AMX kernels** — including a recent bugfix to the AMX
+   16×16 `sub` fused-op handlers that was invisible on non-AMX hardware.
+2. **Benchmark** the AMX int8 / bf16 kernels and the new AVX-512-VNNI 16×16
+   kernel head-to-head.
+
+> ⚠️ **Most important caveat:** every AMX kernel test short-circuits to "ok" when
+> the host can't run AMX (`is_supported_here()` is false). So a green
+> `cargo test` on the wrong box proves **nothing**. You must first confirm AMX is
+> actually live (Step 2). The **benchmarks are the authoritative gate-check** —
+> they print an explicit "AMX … not available, skipping" message and emit no AMX
+> columns if the gate is closed.
+
+---
+
+## 0. Prerequisites
+
+| Requirement | Why | Check |
+|---|---|---|
+| AMX-capable CPU (Sapphire Rapids / Emerald Rapids / Granite Rapids Xeon, or Xeon Max) | `tdpbssd` / `tdpbf16ps` | `grep -o 'amx[_a-z]*' /proc/cpuinfo \| sort -u` → expect `amx_bf16 amx_int8 amx_tile` |
+| Linux kernel ≥ 5.16 | AMX tile-data XSAVE permission via `arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)` | `uname -r` |
+| binutils/gas ≥ 2.34 (≥ 2.36 ideal) | assembles AMX mnemonics (and `{vex}` for AVX-VNNI) | `as --version` |
+| Rust stable (dev used 1.94–1.96) | build | `cargo --version` |
+
+If `/proc/cpuinfo` shows no `amx_*` flags, this is the wrong machine — stop here.
+
+---
+
+## 1. Get the code
+
+**Fresh clone (preferred):**
+```sh
+git clone https://github.com/czoli1976/tract.git
+cd tract
+git checkout claude/zealous-galileo-fEQ3d
+```
+
+**Existing checkout:**
+```sh
+git fetch origin claude/zealous-galileo-fEQ3d
+git checkout claude/zealous-galileo-fEQ3d && git pull
+# IMPORTANT when pulling into a checkout that was built before: the new kernel
+# template (avx512vnni_mmm_i32_16x16.S.j2) may not trigger a build-script rerun
+# (build.rs emits per-file rerun-if-changed). Force it once:
+touch linalg/build.rs
+```
+(A fresh clone needs no `touch` — it renders every template on first build.)
+
+---
+
+## 2. Confirm AMX is actually live (do this first)
+
+The AMX kernels are gated by CPUID **and** the kernel granting tile-data XSAVE
+permission. The benchmark is the cleanest runtime probe — if AMX is unavailable
+it prints a skip line instead of numbers:
+
+```sh
+cargo bench -p tract-linalg --bench amx_i32 -- --warm-up-time 0.2 --measurement-time 0.5 --sample-size 10 2>&1 | head -20
+```
+
+- ✅ **Good:** you see `avx512amx_8x8` and `avx512amx_16x16` lines with `thrpt:`.
+- ❌ **Bad:** `AMX int8 not available (CPUID + arch_prctl gate failed), skipping`
+  → AMX isn't usable (check kernel ≥ 5.16, not in a VM that masks AMX, XSAVE
+  permission not blocked by a seccomp/container policy). Don't proceed — the
+  correctness tests would silently no-op.
+
+Optional: `RUST_LOG=info cargo test -p tract-linalg --lib avx512amx_mmm_i32_16x16 -- --nocapture 2>&1 | grep -i activated`
+should log `qmmm_i32: x86_64/avx512amx_int8 (16x16 + 8x8 adaptive) activated`.
+
+---
+
+## 3. Correctness validation (the priority)
+
+Only meaningful once Step 2 confirms AMX is live.
+
+```sh
+# All three AMX kernel suites: int8 8x8, int8 16x16, bf16 16x16.
+cargo test -p tract-linalg --lib avx512amx 2>&1 | tail -30
+
+# Full x86_64 mmm suite (AMX + VNNI + AVX2 + FMA + AVX-512), for completeness.
+cargo test -p tract-linalg --lib x86_64_fma::mmm 2>&1 | tail -5
+```
+
+**Expected:** `test result: ok. <N> passed; 0 failed`.
+
+**What this specifically proves (and the dev box couldn't):** the
+`scalar_sub` / `per_row_sub` / `per_col_sub` (+ `_flipped`) fused-op tests for
+`test_avx512amx_mmm_i32_16x16` and `test_avx512amx_mmm_f32_16x16` **actually
+execute**. Those guard commit `99eb75b9d`, which fixed swapped operands in the
+AMX `sub` handlers (they were computing `acc − operand` instead of
+`operand − acc`, i.e. negated results). This fix is currently only
+build-verified — **this run is what confirms it on real silicon.**
+
+---
+
+## 4. Benchmarks
+
+On real hardware use default sampling (drop the reduced flags) and pin a core for
+stable numbers. Idle box, turbo/frequency-scaling fixed if you can.
+
+```sh
+# int8: AVX2 vs VNNI 8x8 vs AMX 8x8 vs AMX 16x16
+taskset -c 2 cargo bench -p tract-linalg --bench amx_i32
+
+# f32 via bf16: FMA 16x6 vs AVX-512 16x12 vs AMX-BF16 16x16
+taskset -c 2 cargo bench -p tract-linalg --bench amx_f32
+
+# the new kernel in isolation: AVX2 vs VNNI 8x8 vs VNNI 16x16
+taskset -c 2 cargo bench -p tract-linalg --bench vnni_i32
+```
+
+Bench layout (group `… /packed_packed`, shapes `64x256x64`, `256x256x256`,
+`512x512x512`, `1024x1024x64`, throughput in `Gelem/s`):
+
+| Bench | Columns |
+|---|---|
+| `amx_i32` | `avx2`, `avx512vnni`, `avx512amx_8x8`, `avx512amx_16x16` |
+| `amx_f32` | `fma_16x6`, `avx512_16x12`, `avx512amx_bf16_16x16` |
+| `vnni_i32` | `avx2`, `avx512vnni` (8×8), `avx512vnni_16x16` |
+
+Criterion writes HTML reports under `target/criterion/`.
+
+---
+
+## 5. What to report back
+
+**Correctness**
+- Confirm AMX was live (Step 2 showed AMX columns / cpuinfo has `amx_int8`).
+- `cargo test … avx512amx` result line (`N passed; 0 failed`), confirming the
+  AMX `*_sub` fused-op tests passed → bugfix `99eb75b9d` validated on hardware.
+
+**Performance** — the `thrpt:` (Gelem/s) per shape per column for all three
+benches, plus these head-to-head reads:
+
+1. **AMX 16×16 vs VNNI 16×16** (compare `amx_i32`'s `avx512amx_16x16` against
+   `vnni_i32`'s `avx512vnni_16x16`, same shapes). AMX should win — that justifies
+   the dispatch ordering (`boost(100)` for AMX 16×16 > `boost(50)` for VNNI
+   16×16). Report the ratio.
+2. **AMX 16×16 vs AMX 8×8** — the 4×-work-per-instruction claim and where 8×8
+   wins on small shapes (informs the `qmmm_i32` 16/8 crossover).
+3. **VNNI 16×16 vs 8×8** — does the ~1.3–2.1× measured on Cascade Lake hold on
+   this CPU too?
+4. **AMX-BF16 16×16 vs AVX-512 f32 16×12** — bf16 throughput win (with the bf16
+   precision trade-off noted in `linalg/X86_64_INT8_GEMM.md`).
+
+---
+
+## Appendix A — one-shot script
+
+```sh
+set -e
+echo "## CPU AMX flags:"; grep -o 'amx[_a-z]*' /proc/cpuinfo | sort -u || true
+echo "## kernel:"; uname -r
+echo "## gate check (expect AMX columns, not a skip message):"
+cargo bench -p tract-linalg --bench amx_i32 -- --warm-up-time 0.2 --measurement-time 0.5 --sample-size 10 2>&1 | grep -iE "amx|skipping|thrpt" | head
+echo "## correctness:"
+cargo test -p tract-linalg --lib avx512amx 2>&1 | tail -3
+cargo test -p tract-linalg --lib x86_64_fma::mmm 2>&1 | tail -3
+echo "## full benches:"
+taskset -c 2 cargo bench -p tract-linalg --bench amx_i32
+taskset -c 2 cargo bench -p tract-linalg --bench amx_f32
+taskset -c 2 cargo bench -p tract-linalg --bench vnni_i32
+```
+
+## Appendix B — what's on this branch
+
+Three commits on top of the prior AMX/VNNI work:
+
+| Commit | Summary |
+|---|---|
+| `9e8f1c5aa` | doc: `linalg/X86_64_INT8_GEMM.md` — the full int8 GEMM kernel cascade |
+| `26726db8e` | **feat**: `avx512vnni_mmm_i32_16x16` — zmm-wide int8 VNNI kernel (1.3–2.1× over 8×8 on Cascade Lake) |
+| `99eb75b9d` | **fix**: swapped operands in AMX 16×16 `sub` fused-op handlers (int8 + bf16) — **needs AMX to validate** |
+
+Background and the kernel-selection/dispatch model: see
+`linalg/X86_64_INT8_GEMM.md`.
+
+> Note on Intel SDE: SDE *can* emulate AMX for **functional/correctness** checks
+> on a non-AMX box (`sde64 -spr -- <test-binary>`), but it is **not** a
+> performance model — timings under SDE are meaningless. Use it only if no AMX
+> hardware is available, and never for the benchmark numbers above.
diff --git a/linalg/Cargo.toml b/linalg/Cargo.toml
index 1eff18cc4d..19393f11c3 100644
--- a/linalg/Cargo.toml
+++ b/linalg/Cargo.toml
@@ -83,6 +83,18 @@ harness = false
 name = "mm_for_asr_am"
 harness = false
 
+[[bench]]
+name = "hardswish"
+harness = false
+
+[[bench]]
+name = "silu"
+harness = false
+
+[[bench]]
+name = "gelu"
+harness = false
+
 [[bench]]
 name = "sigmoid"
 harness = false
@@ -119,3 +131,27 @@ harness = false
 bench = false
 name = "leaky_relu"
 harness = false
+
+[[bench]]
+name = "avx512_zombies"
+harness = false
+
+[[bench]]
+name = "wasm"
+harness = false
+
+[[bench]]
+name = "vnni_i32"
+harness = false
+
+[[bench]]
+name = "amx_i32"
+harness = false
+
+[[bench]]
+name = "amx_f32"
+harness = false
+
+[[bench]]
+name = "avxvnni_i32"
+harness = false
diff --git a/linalg/MULTITHREAD_BENCHMARKS.md b/linalg/MULTITHREAD_BENCHMARKS.md
new file mode 100644
index 0000000000..6849b94d2c
--- /dev/null
+++ b/linalg/MULTITHREAD_BENCHMARKS.md
@@ -0,0 +1,139 @@
+# Multithreaded MMM benchmarks
+
+Validation data for the `multithread-mm` rayon path with this PR's
+`chunked_dispatch_rayon` + `THREADING_PANEL_THRESHOLD` + `RayonGlobal`
+changes.
+
+## Setup
+
+- **Base**: tract `main` (commit `41b7b02`), with the merged WASM kernel
+  kit (PRs `#2164` + `#2173`).
+- **Vanilla baseline**: same commit, MMM dispatch unchanged
+  (1D `into_par_iter` over single panel axis).
+- **Patched**: this PR applied — chunked 2D dispatch, threshold, RayonGlobal.
+- **Both binaries built identically**: same compiler, same kernel kit, same
+  `+atomics +bulk-memory +mutable-globals +simd128` target features for
+  WASM.
+- **Driver**: Playwright headless, real browser engines. Median of 60
+  iterations after 3-iter warmup.
+- **Output verification**: FNV-1a hash of result tensor. All 60 cells
+  produce identical hash (`20ea4579c427f925` for DFN3,
+  shape-deterministic for synthetic) — bit-equal output preserved.
+
+## Synthetic dense matmul (the parallelism-bound case)
+
+| Shape | Engine | Vanilla (1 thread) | Patched, 4 threads | Speedup |
+|---|---|---|---|---|
+| **1024×1024×1024** (transformer FFN) | Chromium | 55.4 ms | 16.6 ms | **3.34×** |
+|  | WebKit | 79.8 ms | 25.6 ms | **3.12×** |
+|  | Firefox | 1107 ms | 325 ms | **3.40×** |
+| **512×768×768** (BERT FFN) | Chromium | 15.7 ms | 5.1 ms | **3.07×** |
+|  | WebKit | 22.6 ms | 6.8 ms | **3.30×** |
+|  | Firefox | 311 ms | 91 ms | **3.42×** |
+| **256×256×128** | Chromium | 0.47 ms | 0.22 ms | **2.19×** |
+|  | WebKit | 0.66 ms | 0.20 ms | **3.30×** |
+|  | Firefox | 9.0 ms | 2.6 ms | **3.41×** |
+| 64×256×64 (DFN-like small) | Chromium | 0.07 ms | 0.07 ms | 1.0× (within noise) |
+|  | WebKit | 0.08 ms | 0.04 ms | 2.00× |
+|  | Firefox | 1.18 ms | 0.38 ms | **3.11×** |
+| 32×32×32 (tiny) | All | sub-ms | sub-ms | **1.0× (threshold gates)** |
+
+The threshold correctly gates the smallest shape; threading kicks in once
+panel count clears the gate, and scales near-linearly with thread count
+on all three engines.
+
+## Real model (DeepFilterNet 3, full streaming inference)
+
+5-frame chunks at 48 kHz (50 ms of audio per iteration).
+
+| Engine | Vanilla mono | Patched, 4 threads | RTF (vanilla → patched) | Speedup |
+|---|---|---|---|---|
+| Chromium | 3.31 ms | 3.17 ms | 0.066 → 0.063 | 1.04× (within noise) |
+| WebKit | 4.32 ms | 4.00 ms | 0.086 → 0.080 | 1.08× |
+| Firefox | 34.20 ms | 33.34 ms | 0.684 → 0.667 | 1.03× |
+
+DFN3 is Amdahl-bound: only ~25% of runtime is in MMMs that clear the
+threshold; the rest is FFT, complex multiplication, and small RNN-internal
+matmuls that the threshold deliberately keeps single-threaded. The
+threshold's role here is to **prevent regression**, not deliver speedup.
+This is correct behavior — DFN3-class workloads should not pay rayon
+overhead on tiny ops.
+
+## Native (macOS aarch64, generic kernels)
+
+Spot-check on the rayon path before/after. Tract's existing rayon path
+already worked well on native; the change is mostly a refactor.
+
+| Shape | Vanilla 1D | Patched 2D | Change |
+|---|---|---|---|
+| 256×256, 4 threads | 2.13 ms | 2.12 ms | net-neutral |
+| 512×512, 4 threads | 9.93 ms | 9.80 ms | +1% |
+| 64×256, 4 threads | 615 µs | 625 µs | −2% |
+
+Within noise on common shapes. The 2D dispatch shows a latent benefit on
+shapes 1D parallelism handles poorly (e.g. m=8 n=2048, where 1D over m
+can only feed 2 threads); not yet measured directly on native but the
+dispatch math is the same on both targets.
+
+## Determinism
+
+Across all measured cells (synthetic + DFN3, 3 engines, 1/2/3/4 threads):
+
+- **WASM**: 60 cells, all produce identical hash per `(shape, mode)` pair.
+- **Native**: existing tract proptests (3524 lib tests) pass with this
+  PR's `multithread-mm` enabled.
+
+## Tuning the threshold
+
+The default `THREADING_PANEL_THRESHOLD` is `64` panels (m_panels ×
+n_panels). Adjust at runtime via:
+
+```rust
+use tract_linalg::multithread::set_threading_panel_threshold;
+
+set_threading_panel_threshold(0);    // thread every size, no gate
+set_threading_panel_threshold(256);  // gate harder — transformer-only
+set_threading_panel_threshold(64);   // default
+```
+
+Useful when profiling or specialising the build for a known workload class:
+
+| Workload class | Suggested threshold |
+|---|---|
+| Streaming RNN / mobile vision (many small MMMs) | 64 (default) or higher |
+| Mid-size dense (BERT-class) | 32–64 |
+| Large dense only (transformer FFN, LLM) | 16 or lower |
+
+The constant lives in `linalg/src/multithread.rs`; readers go through
+`current_threading_panel_threshold()` (`AtomicUsize::Relaxed`, no lock on
+the dispatch hot path).
+
+## Reproduction
+
+The harness uses [Vonage's libDF fork](https://github.com/czoli1976/DeepFilterNet)
+(branch `dfn3-wasm-opt-tract-022-kernel-kit`) migrated to tract main, with
+a `wasm-bindgen-rayon`-based threading bootstrap. Build:
+
+```bash
+RUSTFLAGS="-C target-feature=+atomics,+bulk-memory,+mutable-globals,+simd128" \
+wasm-pack build --target web --release \
+    --no-default-features --features wasm-mt -- \
+    -Z build-std=std,panic_abort
+```
+
+JS-side:
+
+```javascript
+import init, { initThreadPool, df_set_thread_count } from './pkg/df.js';
+await init();
+await initThreadPool(navigator.hardwareConcurrency);  // wasm-bindgen-rayon
+df_set_thread_count(4);  // sets Executor::RayonGlobal in tract-linalg
+```
+
+Without `Executor::RayonGlobal` (this PR), `df_set_thread_count` would
+need to construct an `Arc<rayon::ThreadPool>` — which fails on
+`wasm32-unknown-unknown` because `rayon::ThreadPoolBuilder::new().build()`
+internally calls `std::thread::spawn` (unsupported there). That's the
+crux of why this enabling change is needed in tract-linalg itself: any
+browser threading via wasm-bindgen-rayon would otherwise silently fall
+back to single-threaded.
diff --git a/linalg/SME_PHASE1_BENCH.xlsx b/linalg/SME_PHASE1_BENCH.xlsx
new file mode 100644
index 0000000000..b90cc5fc6c
Binary files /dev/null and b/linalg/SME_PHASE1_BENCH.xlsx differ
diff --git a/linalg/WASM_RELAXED_SIMD.md b/linalg/WASM_RELAXED_SIMD.md
new file mode 100644
index 0000000000..3797a0a3d9
--- /dev/null
+++ b/linalg/WASM_RELAXED_SIMD.md
@@ -0,0 +1,105 @@
+# tract-linalg on `wasm32` — relaxed-simd FMA
+
+The WASM MMM kernels (`wasm_f32_4x4`, `4x1`, `8x1`, `16x1`, `32x1`, `8x8`)
+and the WASM sigmoid/tanh activations all flip between two emit modes at
+compile time, gated on `cfg(target_feature = "relaxed-simd")`:
+
+- **Without** `+relaxed-simd`: pure `f32x4_add(_, f32x4_mul(_, _))` (mul+add).
+  Runs on any WASM runtime that supports `simd128`.
+- **With** `+relaxed-simd`: `f32x4_relaxed_madd(_, _, _)`. Fused, single-rounded
+  multiply-add on hosts whose CPU has hardware FMA (all ARM64, x86_64 + FMA3).
+  Universal browser/runtime support since 2023 (Chrome 114+, Firefox 120+,
+  Safari 17+, wasmtime 16+).
+
+The speedup of the relaxed path over the baseline is typically **1.40–1.55× at
+the kernel level** and **1.08–1.46× end-to-end** across vision CNNs,
+transformer attention and RNN audio models. Bit-pattern drift versus the
+mul+add path is bounded at one ulp (FMA single-rounding); within
+`Approximation::Close` (1e-4).
+
+## Build flags
+
+```sh
+# Baseline (any wasm32 runtime supporting simd128)
+RUSTFLAGS='-C target-feature=+simd128' \
+  cargo build --release --target wasm32-wasip1 -p tract-linalg
+
+# Relaxed (requires host support for relaxed-simd; ~1.40× faster on FMA-capable hosts)
+RUSTFLAGS='-C target-feature=+simd128,+relaxed-simd' \
+  cargo build --release --target wasm32-wasip1 -p tract-linalg
+```
+
+Same on `wasm32-unknown-unknown` if shipping for the browser.
+
+## Why two binaries (and not in-process runtime dispatch)
+
+WASM validates the entire module at instantiation, before any code runs.
+A binary containing `f32x4.relaxed_madd` fails to instantiate on hosts without
+relaxed-simd — `LinkError` / `CompileError`, not a runtime trap. So the
+x86/ARM pattern (one binary, both paths in source, runtime CPU detection picks
+at execution time) cannot be replicated in-binary on WASM: the FMA opcodes are
+either present (and host support is required) or absent.
+
+Runtime dispatch happens one layer up — at the host runtime / consumer layer
+— by selecting the correct binary at module-load time.
+
+## Consumer-side dispatch
+
+### Browser / `WebAssembly.validate`
+
+```js
+async function loadTract(baseUrl) {
+    const candidate = await fetch(`${baseUrl}/tract-relaxed.wasm`);
+    const bytes = await candidate.arrayBuffer();
+
+    const wantRelaxed = WebAssembly.validate(bytes, {
+        builtins: ['relaxed_simd'],
+    });
+
+    const url = wantRelaxed
+        ? `${baseUrl}/tract-relaxed.wasm`
+        : `${baseUrl}/tract.wasm`;
+
+    const final = await fetch(url);
+    return WebAssembly.instantiateStreaming(final);
+}
+```
+
+Fallback for hosts without the `WebAssembly.validate(bytes, { ... })`
+options-arg: try-instantiate the relaxed binary, catch `LinkError` /
+`CompileError`, retry with the baseline.
+
+### `wasmtime` (server / native)
+
+```rust
+use wasmtime::{Config, Engine};
+
+let mut config = Config::new();
+config.wasm_relaxed_simd(true);     // gate on host-CPU detection if needed
+let engine = Engine::new(&config)?;
+
+let bytes = std::fs::read(if relaxed_supported {
+    "tract-relaxed.wasm"
+} else {
+    "tract.wasm"
+})?;
+let module = wasmtime::Module::new(&engine, &bytes)?;
+```
+
+`wasmtime::Engine`'s `wasm_relaxed_simd` configures the runtime; a separate
+`wasmtime::Module::validate()` call against the engine is the equivalent of
+the browser's `WebAssembly.validate` for picking which binary to load.
+
+## Quality
+
+The two binaries are **not bit-identical**. FMA's single-rounding produces
+≤1 ulp drift from explicit mul+add. Verified end-to-end on Inception v3 and
+DFN3 sub-models:
+
+| model        | output shape       | baseline L2  | relaxed L2   |
+|--------------|--------------------|-------------:|-------------:|
+| Inception v3 | [1, 1001]          | 6.477089e-2  | 6.477089e-2  |
+| DFN3 df_dec  | [1, 100, 96, 10]   | 1.080686e-2  | 1.080686e-2  |
+
+L2 norms are bit-identical to 7 sig figs; per-element values diverge in the
+7th–8th decimal place. Within tract's `Approximation::Close` (1e-4).
diff --git a/linalg/X86_64_INT8_GEMM.md b/linalg/X86_64_INT8_GEMM.md
new file mode 100644
index 0000000000..c960b5f8e5
--- /dev/null
+++ b/linalg/X86_64_INT8_GEMM.md
@@ -0,0 +1,135 @@
+# x86_64 int8 GEMM kernels
+
+This note documents the int8 (i32-accumulator) matrix-multiply kernel family for
+x86_64, for maintainers touching `linalg/src/x86_64_fma/mmm.rs` (Rust
+registration + dispatch) and `linalg/x86_64/fma/*.S.j2` (assembly templates).
+
+The kernels form a throughput cascade from the portable AVX2 emulation up to
+Intel AMX, with AVX-512-VNNI in between. The right kernel is chosen at runtime
+from CPUID + (for selection among ties) the einsum kernel scorer.
+
+## Kernel family
+
+| Kernel | ISA | Tile M×N | Matmul instr | A packing | B packing | Build gate |
+|---|---|---|---|---|---|---|
+| `avx2_mmm_i32_8x8` | AVX2 | 8×8 (ymm) | `vpmaddubsw` emulation | `PackedFormat` i8 | `PackedFormat` i8 | always |
+| `avx512vnni_mmm_i32_8x8` | AVX-512-VNNI | 8×8 (ymm) | `vpdpbusd` | `PackedI8K4(8)` | `PackedI8K4(8)` | always |
+| `avx512vnni_mmm_i32_16x16` | AVX-512-VNNI | 16×16 (zmm) | `vpdpbusd` ×16 rows | `PackedI8K4(16)` | `PackedI8K4(16)` | always |
+| `avxvnni_mmm_i32_8x8` | AVX-VNNI (VEX) | 8×8 (ymm) | `{vex} vpdpbusd` | `PackedI8K4(8)` | `PackedI8K4(8)` | `tract_avxvnni` |
+| `avx512amx_mmm_i32_8x8` | AMX-INT8 | 8×8 | `tdpbssd` | `PackedAmxA(8)` | `PackedI8K4(8)` | `tract_amx_int8` |
+| `avx512amx_mmm_i32_16x16` | AMX-INT8 | 16×16 | `tdpbssd` (16384 MACs) | `PackedAmxA(16)` | `PackedI8K4(16)` | `tract_amx_int8` |
+| `avx512amx_mmm_f32_16x16` (f32) | AMX-BF16 | 16×16 | `tdpbf16ps` | `PackedAmxBf16A(16)` | `PackedBf16K2(16)` | `tract_amx_bf16` |
+
+The two AVX-512-VNNI kernels and the AVX2 one are always compiled (their
+mnemonics are in every supported binutils); the AMX and AVX-VNNI kernels are
+behind assembler-probe cfgs (see below).
+
+## The u8×s8 `+128` bias trick (VNNI / AVX-VNNI)
+
+`vpdpbusd` is **u8 × s8** (unsigned first operand). To compute the s8×s8 product
+we need, the kernel offsets the A bytes by `+128` (modular `vpaddb`, making them
+u8 in `[0,255]`) and then removes the resulting per-column bias
+`128 * sum_k(B[n])` after the K loop. The bias is accumulated cheaply during the
+loop with a `vpdpbusd` against an all-`0x01` u8 vector.
+
+- **8×8 (ymm)** accumulators are *column-major* (`ymm{n}` = column n), so the
+  bias is computed per column and splatted back with `vpermd`.
+- **16×16 (zmm)** accumulators are *row-major* (`zmm{m}` = row m, 16 columns in
+  the 16 lanes). The per-column bias is then a single lane-aligned vector, so the
+  correction is one `vpsubd` per row — cleaner and cheaper than the 8×8 path.
+
+AMX `tdpbssd` is **s8 × s8**, so the AMX int8 kernels need no `+128` trick; their
+i32 accumulators are bit-identical to the AVX2 / VNNI reference.
+
+## Packing formats (see `linalg/src/frame/pack.rs`)
+
+- **`PackedI8K4(r)`** — K=4-inner. Per K=4 block, `r` elements × 4 K-bytes (= `4r`
+  bytes); element `e` sits at byte offset `e*4` holding `[e, 4kb..4kb+3]`. K is
+  zero-padded to a multiple of 4, so kernels read `ceil(k/4)` full blocks safely.
+- **`PackedAmxA(r)`** — AMX A layout: per panel of `r` M-rows, row-major within
+  the panel, K-bytes contiguous, K padded to a multiple of 64 (one `tdpbssd` step
+  consumes 64 K-bytes).
+- **`PackedAmxBf16A` / `PackedBf16K2`** — f32 inputs truncated to bf16 at pack
+  time (round-to-nearest-even, matching `VCVTNEPS2BF16`) for the AMX-BF16 f32 path.
+
+## Build-time cfg gating (`linalg/build.rs`)
+
+Some mnemonics are too new for old toolchains, so each is guarded by an
+**assembler probe** that tries to compile a tiny dummy `.S`. The probe sets a cfg
+that gates *both* compiling the kernel template and referencing its Rust symbol:
+
+| cfg | enables | requires |
+|---|---|---|
+| `tract_amx_int8` | AMX int8 kernels (`tdpbssd`) | gas ≥ 2.34 |
+| `tract_amx_bf16` | AMX bf16 kernel (`tdpbf16ps`) | gas ≥ 2.34 |
+| `tract_avxvnni` | AVX-VNNI ymm kernel (`{vex}` prefix) | binutils ≥ 2.36 |
+
+Kernel `.S.j2` templates are sorted by filename prefix in `build.rs`:
+`avx512amx_*_i32_*` and `*_f32_*` are pulled into their own gated compiles;
+`avxvnni_*` likewise; everything else (including `avx512vnni_*`) stays in the
+generic `-mfma` bulk compile. **A new `avx512vnni_*` kernel needs no `build.rs`
+change** — but note that adding a brand-new template file may not trigger a
+`build.rs` re-run on an incremental build (it emits per-file `rerun-if-changed`),
+so `touch linalg/build.rs` after creating one.
+
+These cfgs reflect **assembler** capability, not the host CPU. A kernel can be
+*compiled* (assembler supports the mnemonic) yet never *run* (CPU lacks the
+feature) — which matters for tests (below).
+
+## Dispatch
+
+`plug()` installs kernels in nested feature order, richest ISA last:
+
+```
+avx2 → [avxvnni] → fma → avx512f → avx512vnni → [amx_int8]      (int8 path)
+                                  → [amx_bf16 overlay]           (f32 path)
+```
+
+Each `plug_*` pushes kernels into `ops.mmm_impls` and may set the explicit int8
+picker `ops.qmmm_i32`. Because later plugs overwrite `qmmm_i32`, the best
+available ISA wins. The pickers are **shape-adaptive**: the 16×16 tile is the
+throughput champion when both M and N fill at least one tile; the 8×8 kernel has
+lower per-call setup and wins on small problems. (AMX additionally requires
+K ≥ 64; VNNI has no K gate since one `vpdpbusd` step is just 4 K-bytes.)
+
+For paths that don't go through `qmmm_i32` (symbolic / unknown shapes via the
+einsum kernel scorer), selection among equal-quality kernels uses
+`-quality_cost*1000 + boost`. All `ManuallyOptimized` kernels tie on quality, so
+`boost` breaks the tie:
+
+| Kernel | boost |
+|---|---|
+| `avx512amx_mmm_i32_16x16`, `avx512amx_mmm_f32_16x16` | 100 |
+| `avx512vnni_mmm_i32_16x16` | 50 |
+| all 8×8 kernels | 0 |
+
+So for unknown shapes: AMX 16×16 ≻ VNNI 16×16 ≻ {VNNI/AMX 8×8}. When AMX is
+absent, VNNI 16×16 is the int8 champion.
+
+## Testing and a cautionary tale
+
+`MMMExternKernel!` auto-generates a `#[cfg(test)] mod test_<kernel>` with
+packed-packed (per packing), fused-op frame, quant-rounding, store, and proptest
+coverage. The harness **skips a kernel when `ker.is_supported_here()` is false**
+(runtime CPUID). Consequently **AMX kernel tests only run on AMX hardware.**
+
+The usual dev/CI host is Cascade Lake-class (AVX-512-VNNI, no AMX), so the AMX
+tests are skipped there. That let a swapped-operand bug in the AMX 16×16 `sub`
+fused-op handlers (`scalar_sub` / `per_row_sub` / `per_col_sub` and their
+`_flipped` twins computed `acc - operand` instead of the correct `operand - acc`)
+go unnoticed — until `avx512vnni_mmm_i32_16x16`, which **reuses the same zmm
+row-major epilogue** and *does* run on VNNI hardware, exposed it (negated
+results). Takeaway: a VNNI kernel that shares an AMX kernel's epilogue effectively
+becomes the on-hardware test for that shared epilogue. The convention for the
+non-commutative `sub` lives in `linalg/x86_64/fma/fma_mmm_ymm_ops.j2`
+(`scalar` / `per_row` / `per_col` macros, `flipped` flag).
+
+## Possible follow-ups
+
+- A dispatch integration test asserting `qmmm_i32` selects the 16×16 kernel for
+  large M,N and the 8×8 for small (no precedent for kernel-selection asserts
+  in-tree yet; would need a small helper to read back the chosen `MatMatMul`).
+- On Sapphire Rapids+ hardware: validate the AMX `sub` fix end-to-end, benchmark
+  the AMX kernels, and re-check the 16×16/8×8 crossover and the `boost` values.
+- A wider AVX-512-BF16 (`vdpbf16ps`) f32 kernel for Cooper Lake-class cores, and
+  a Q4_0/Q8_0 → s8 packer feeding the AMX/VNNI 16×16 path directly.
diff --git a/linalg/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.S.j2 b/linalg/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.S.j2
index 0448683816..e2f6377cac 100644
--- a/linalg/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.S.j2
+++ b/linalg/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.S.j2
@@ -49,11 +49,11 @@ armv7neon_mmm_f32_8x1_{{core}}_{{suffix}}:
 {% set mr = 8 %}{% set from = 8 %}{% set to = 9 %}{% include "armv7neon_mmm_f32_per_cols.j2" %}
 
 .add_unicast:
-    {% for reg in range(0, 16) %}
+    {% for reg in range(0, 4) %}
         vld1.f32    d{{reg}}[0], [ r3 ], r4
         vld1.f32    d{{reg}}[1], [ r3 ], r4
     {% endfor %}
-    {% for reg in range(0, 8) %}
+    {% for reg in range(0, 2) %}
         vadd.f32 q{{ reg + 8 }}, q{{ reg + 8 }}, q{{reg}}
     {% endfor %}
 
diff --git a/linalg/arm64/arm64fp16/arm64fp16_mmm_8h_ops.j2 b/linalg/arm64/arm64fp16/arm64fp16_mmm_8h_ops.j2
index 76e7d17679..b9146123cd 100644
--- a/linalg/arm64/arm64fp16/arm64fp16_mmm_8h_ops.j2
+++ b/linalg/arm64/arm64fp16/arm64fp16_mmm_8h_ops.j2
@@ -60,7 +60,7 @@ b           .non_linear_loop
 {% elif cols == 4 %}
         ldr         d0, [ x2 ]
 {% elif cols == 6 %}
-        ld1         {v0.d}[0], [ x2 ], #8
+        ldr         d0, [ x2 ], #8
         ld1         {v0.s}[2], [ x2 ]
 {% else %}
     {% for reg in range(1, loads + 1) %}
diff --git a/linalg/arm64/arm64simd/arm64simd_mmm_4s_ops.j2 b/linalg/arm64/arm64simd/arm64simd_mmm_4s_ops.j2
index c4da6e1238..5770b783f1 100644
--- a/linalg/arm64/arm64simd/arm64simd_mmm_4s_ops.j2
+++ b/linalg/arm64/arm64simd/arm64simd_mmm_4s_ops.j2
@@ -55,7 +55,7 @@ b           .non_linear_loop
 {%if cols == 1 %}
         ld1         {v0.s}[0], [ x2 ]
 {% elif cols == 3 %}
-        ld1         {v0.d}[0], [ x2 ], #8
+        ldr         d0, [ x2 ], #8
         ld1         {v0.s}[2], [ x2 ]
 {% else %}
     {% for reg in range(1, loads + 1) %}
diff --git a/linalg/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.S.raw b/linalg/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.S.raw
index a380bcccf8..0ea9472a0e 100644
--- a/linalg/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.S.raw
+++ b/linalg/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.S.raw
@@ -24,11 +24,17 @@ fmla        v18.4s, v1.4s, v4.s[3]
 fmla        v19.4s, v2.4s, v4.s[3]
 
 fmla        v20.4s, v0.4s, v5.s[0]
+    prfm        pldl1keep, [x1, #256]
 fmla        v21.4s, v1.4s, v5.s[0]
+    prfm        pldl1keep, [x1, #320]
 fmla        v22.4s, v2.4s, v5.s[0]
+    prfm        pldl1keep, [x1, #384]
 fmla        v23.4s, v0.4s, v5.s[1]
+    prfm        pldl1keep, [x1, #448]
 fmla        v24.4s, v1.4s, v5.s[1]
+    prfm        pldl1keep, [x2, #256]
 fmla        v25.4s, v2.4s, v5.s[1]
+    prfm        pldl1keep, [x2, #320]
 
 fmla        v26.4s, v0.4s, v5.s[2]
 fmla        v27.4s, v1.4s, v5.s[2]
diff --git a/linalg/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.S.raw b/linalg/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.S.raw
index 2ae7a54eba..7c2225fc99 100644
--- a/linalg/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.S.raw
+++ b/linalg/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.S.raw
@@ -26,10 +26,11 @@ add         x2, x2, #16
 fmla        v28.4s, v0.4s, v4.s[3]
 prfm        pldl1keep, [x1, #256]
 fmla        v29.4s, v1.4s, v4.s[3]
-prfm        pldl1keep, [x2, #256]
+prfm        pldl1keep, [x1, #320]
 fmla        v30.4s, v2.4s, v4.s[3]
-prfm        pldl1keep, [x1, #256]
+prfm        pldl1keep, [x2, #256]
 fmla        v31.4s, v3.4s, v4.s[3]
+prfm        pldl1keep, [x2, #320]
 
 ins         v0.d[0], x5
 ins         v2.d[0], x9
diff --git a/linalg/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.S.raw b/linalg/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.S.raw
index 6742473ea2..b77e405dbb 100644
--- a/linalg/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.S.raw
+++ b/linalg/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.S.raw
@@ -41,10 +41,15 @@ fmla        v25.4s, v5.4s, v7.s[2]
 fmla        v26.4s, v0.4s, v7.s[3]
     prfm        pldl1keep, [x1, #320]
 fmla        v27.4s, v1.4s, v7.s[3]
+    prfm        pldl1keep, [x1, #384]
 fmla        v28.4s, v2.4s, v7.s[3]
+    prfm        pldl1keep, [x1, #448]
 fmla        v29.4s, v3.4s, v7.s[3]
+    prfm        pldl1keep, [x2, #320]
 fmla        v30.4s, v4.4s, v7.s[3]
+    prfm        pldl1keep, [x2, #384]
 fmla        v31.4s, v5.4s, v7.s[3]
+    prfm        pldl1keep, [x2, #448]
 
 ins         v0.d[0], x4
 ins         v1.d[0], x6
diff --git a/linalg/arm64/sme/dispatcher.j2 b/linalg/arm64/sme/dispatcher.j2
new file mode 100644
index 0000000000..5663fd2382
--- /dev/null
+++ b/linalg/arm64/sme/dispatcher.j2
@@ -0,0 +1,37 @@
+// vim: ft=arm
+
+.non_linear:
+    sub         x0, x0, 40
+
+.non_linear_loop:
+    add         x0, x0, 40
+    ldr         x2, [x0]
+
+    mov         x4, #{{ jump_table | length }}
+
+    cmp         x2, #{{ jump_table | length }}
+    csel        x2, x2, x4, lt
+    cmp         x2, #0
+    csel        x2, x4, x2, lt
+
+    adr         x3, .jmp_table
+    add         x3, x3, x2, LSL#2
+    br          x3
+
+.jmp_table:
+{% for j in jump_table %}
+    b   .{{j}}
+{% endfor %}
+    b   .unsupported
+
+    add x0, x2, #4000
+    b .return
+
+.unsupported:
+    mov         x0, #1
+    b           .return
+
+.done:
+    mov         x0, 0
+    b           .return
+
diff --git a/linalg/arm64/sme/dummy_sme.S b/linalg/arm64/sme/dummy_sme.S
new file mode 100644
index 0000000000..c35f9c0bce
--- /dev/null
+++ b/linalg/arm64/sme/dummy_sme.S
@@ -0,0 +1,14 @@
+// Build-time capability probe for the assembler, used by build.rs
+// (assembler_supports_sme). Older binutils — notably the Debian stretch
+// aarch64 cross-toolchain in CI — predate SME and cannot assemble these
+// mnemonics even with `.arch armv9-a+sme2`. If this file fails to assemble,
+// build.rs skips the SME kernels and the `tract_sme` cfg, and the runtime
+// falls back to the portable path. Not linked into anything.
+.arch armv9-a+sme2
+.text
+.globl tract_sme_probe
+tract_sme_probe:
+    smstart
+    zero    {za}
+    smstop
+    ret
diff --git a/linalg/arm64/sme/sme_mmm_f32_32x32.S.j2 b/linalg/arm64/sme/sme_mmm_f32_32x32.S.j2
new file mode 100644
index 0000000000..5396f76ed5
--- /dev/null
+++ b/linalg/arm64/sme/sme_mmm_f32_32x32.S.j2
@@ -0,0 +1,474 @@
+// vim: ft=arm
+//
+// SME f32 32x32 matmul kernel.
+//
+// ZA tile layout (4 .S tiles, 16x16 each, indexed left/right x top/bottom):
+//
+//   ZA0.S : C[0..16,  0..16]   (top-left)
+//   ZA1.S : C[0..16,  16..32]  (top-right)
+//   ZA2.S : C[16..32, 0..16]   (bottom-left)
+//   ZA3.S : C[16..32, 16..32]  (bottom-right)
+//
+// Inner K-step: load 32 f32 of A (split z0+z2) and 32 of B (split z1+z3),
+// issue 4 FMOPAs (one per tile). All 4 tiles are independent → SME unit
+// reaches 1 fmopa/cycle = ~2 TFLOPS on M4.
+//
+// Calling convention (extern "C", AAPCS64):
+//   x0 = const *FusedKerSpec<f32>, advanced 40 B per dispatcher iteration.
+//   x1 = stack-resident 4 KiB scratch buffer for tile spills (Phase 1B+).
+//
+// Streaming mode: PSTATE.SM=1 from prologue smstart to epilogue smstop.
+// V0..V31 (low 128 bits = Z0..Z31 low) are destroyed by the smstart/smstop
+// pair; v8..v15 are saved/restored to stack across the streaming region per
+// AAPCS callee-save rules.
+
+.arch armv9-a+sme2
+.text
+.align 4
+
+.global {{G}}sme_mmm_f32_32x32_{{suffix}}
+{{G}}sme_mmm_f32_32x32_{{suffix}}:
+
+    // Save callee-saved q8..q15 (AAPCS preserves low 64 bits of v8..v15;
+    // we save the full 128-bit Q to keep the stack layout simple).
+    stp     q8,  q9,  [sp, #-128]!
+    stp     q10, q11, [sp, #32]
+    stp     q12, q13, [sp, #64]
+    stp     q14, q15, [sp, #96]
+
+    // Allocate 4 KiB tile-spill scratch (kept live across the whole call).
+    sub     sp, sp, #4096
+    mov     x1, sp
+
+    smstart
+    ptrue   p0.b
+
+{% include "dispatcher.j2" %}
+
+// -------- supported fuse ops ---------------------------------------------
+
+.add_mat_mul:
+    ldr     x2, [x0, #24]       // b
+    ldp     x3, x4, [x0, #8]    // k, a
+
+    cmp     x3, #0
+    b.eq    .non_linear_loop
+
+.Lmatmul_loop:
+    ld1w    {z0.s}, p0/z, [x4]
+    ld1w    {z2.s}, p0/z, [x4, #1, mul vl]
+    ld1w    {z1.s}, p0/z, [x2]
+    ld1w    {z3.s}, p0/z, [x2, #1, mul vl]
+    add     x4, x4, #128
+    add     x2, x2, #128
+
+    fmopa   za0.s, p0/m, p0/m, z0.s, z1.s    // C[0..16,  0..16]
+    fmopa   za1.s, p0/m, p0/m, z0.s, z3.s    // C[0..16,  16..32]
+    fmopa   za2.s, p0/m, p0/m, z2.s, z1.s    // C[16..32, 0..16]
+    fmopa   za3.s, p0/m, p0/m, z2.s, z3.s    // C[16..32, 16..32]
+
+    subs    x3, x3, #1
+    b.ne    .Lmatmul_loop
+    b       .non_linear_loop
+
+.clear:
+    zero    {za}
+    b       .non_linear_loop
+
+.store:
+    // FusedKerSpec::Store(OutputStoreKer { ptr, row_byte_stride,
+    //                                      col_byte_stride, item_size })
+    //   [x0, #8]  = ptr           [x0, #16] = row_byte_stride
+    //   [x0, #24] = col_byte_stride  [x0, #32] = item_size
+    ldp     x5, x6, [x0, #8]    // x5 = ptr, x6 = row_byte_stride
+    ldp     x7, x8, [x0, #24]   // x7 = col_byte_stride, x8 = item_size
+
+    // Fast path: contiguous f32 columns (col_stride == 4) → direct ZA→user.
+    // st1w-from-ZA does not accept "[Xn, #imm, MUL VL]" offsets, so we keep
+    // two parallel base pointers for the left and right halves of each row.
+    cmp     x7, #4
+    b.ne    .Lstore_generic
+    cmp     x8, #4
+    b.ne    .Lstore_generic
+
+    add     x4, x5, #64         // right-half base
+    mov     w12, #0
+.Lstore_top:
+    st1w    {za0h.s[w12, 0]}, p0, [x5]
+    st1w    {za1h.s[w12, 0]}, p0, [x4]
+    add     x5, x5, x6
+    add     x4, x4, x6
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Lstore_top
+    mov     w12, #0
+.Lstore_bot:
+    st1w    {za2h.s[w12, 0]}, p0, [x5]
+    st1w    {za3h.s[w12, 0]}, p0, [x4]
+    add     x5, x5, x6
+    add     x4, x4, x6
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Lstore_bot
+    b       .non_linear_loop
+
+.Lstore_generic:
+    // Slow path: spill ZA → x1 (scratch buffer, 32x32 row-major, 128 B/row)
+    // using two parallel pointers, then per-element strided scatter.
+    mov     x4, x1              // left-half pointer
+    add     x9, x1, #64         // right-half pointer
+    mov     w12, #0
+.Lstore_spill_top:
+    st1w    {za0h.s[w12, 0]}, p0, [x4]
+    st1w    {za1h.s[w12, 0]}, p0, [x9]
+    add     x4, x4, #128
+    add     x9, x9, #128
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Lstore_spill_top
+    mov     w12, #0
+.Lstore_spill_bot:
+    st1w    {za2h.s[w12, 0]}, p0, [x4]
+    st1w    {za3h.s[w12, 0]}, p0, [x9]
+    add     x4, x4, #128
+    add     x9, x9, #128
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Lstore_spill_bot
+
+    // Strided f32 scatter: 32 rows × 32 cols.
+    mov     x3, #0
+.Lstore_row:
+    mov     x4, x5
+    mov     x10, #0
+    lsl     x9, x3, #7          // row*128 byte offset in scratch
+    add     x11, x1, x9
+.Lstore_col:
+    ldr     w9, [x11], #4
+    str     w9, [x4]
+    add     x4, x4, x7
+    add     x10, x10, #1
+    cmp     x10, #32
+    b.lt    .Lstore_col
+    add     x5, x5, x6
+    add     x3, x3, #1
+    cmp     x3, #32
+    b.lt    .Lstore_row
+    b       .non_linear_loop
+
+// -------- scalar ops ------------------------------------------------------
+//
+// FusedKerSpec::Scalar{Add,Mul,Sub,SubF,Min,Max}(TI) — broadcast scalar from
+// [x0, #8] to all lanes, apply elementwise across the 4-tile 32x32 grid.
+//
+// Sub vs SubF semantics (matching apple_amx + tests/fuse.rs):
+//   ScalarSub   → result = scalar - z   (mnemonic fsubr)
+//   ScalarSubF  → result = z - scalar   (mnemonic fsub)
+//
+// Slice-op loop: for each slice index w12, extract ZA tile slice → Z reg,
+// op with broadcast-scalar in z4, insert Z back. Two halves × 4 tiles total.
+
+{% macro scalar_op(label, op) %}
+{{label}}:
+    ldr     w2, [x0, #8]
+    dup     z4.s, w2
+    mov     w12, #0
+.L{{label|replace('.', '')}}_top:
+    mov     z6.s, p0/m, za0h.s[w12, 0]
+    mov     z7.s, p0/m, za1h.s[w12, 0]
+    {{op}}  z6.s, p0/m, z6.s, z4.s
+    {{op}}  z7.s, p0/m, z7.s, z4.s
+    mov     za0h.s[w12, 0], p0/m, z6.s
+    mov     za1h.s[w12, 0], p0/m, z7.s
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .L{{label|replace('.', '')}}_top
+    mov     w12, #0
+.L{{label|replace('.', '')}}_bot:
+    mov     z6.s, p0/m, za2h.s[w12, 0]
+    mov     z7.s, p0/m, za3h.s[w12, 0]
+    {{op}}  z6.s, p0/m, z6.s, z4.s
+    {{op}}  z7.s, p0/m, z7.s, z4.s
+    mov     za2h.s[w12, 0], p0/m, z6.s
+    mov     za3h.s[w12, 0], p0/m, z7.s
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .L{{label|replace('.', '')}}_bot
+    b       .non_linear_loop
+{% endmacro %}
+
+{{ scalar_op('.scalar_add', 'fadd') }}
+{{ scalar_op('.scalar_mul', 'fmul') }}
+{{ scalar_op('.scalar_sub', 'fsubr') }}
+{{ scalar_op('.scalar_sub_flipped', 'fsub') }}
+{{ scalar_op('.scalar_min', 'fmin') }}
+{{ scalar_op('.scalar_max', 'fmax') }}
+
+// -------- per-col ops -----------------------------------------------------
+//
+// 32-element column vector → z4 (cols 0-15) + z5 (cols 16-31).
+// Same z4/z5 is applied to every row across the 4-tile grid.
+
+{% macro per_col_op(label, op) %}
+{{label}}:
+    ldr     x2, [x0, #8]
+    ld1w    {z4.s}, p0/z, [x2]
+    ld1w    {z5.s}, p0/z, [x2, #1, mul vl]
+    mov     w12, #0
+.L{{label|replace('.', '')}}_top:
+    mov     z6.s, p0/m, za0h.s[w12, 0]
+    mov     z7.s, p0/m, za1h.s[w12, 0]
+    {{op}}  z6.s, p0/m, z6.s, z4.s
+    {{op}}  z7.s, p0/m, z7.s, z5.s
+    mov     za0h.s[w12, 0], p0/m, z6.s
+    mov     za1h.s[w12, 0], p0/m, z7.s
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .L{{label|replace('.', '')}}_top
+    mov     w12, #0
+.L{{label|replace('.', '')}}_bot:
+    mov     z6.s, p0/m, za2h.s[w12, 0]
+    mov     z7.s, p0/m, za3h.s[w12, 0]
+    {{op}}  z6.s, p0/m, z6.s, z4.s
+    {{op}}  z7.s, p0/m, z7.s, z5.s
+    mov     za2h.s[w12, 0], p0/m, z6.s
+    mov     za3h.s[w12, 0], p0/m, z7.s
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .L{{label|replace('.', '')}}_bot
+    b       .non_linear_loop
+{% endmacro %}
+
+{{ per_col_op('.per_col_add', 'fadd') }}
+{{ per_col_op('.per_col_mul', 'fmul') }}
+{{ per_col_op('.per_col_sub', 'fsubr') }}
+{{ per_col_op('.per_col_sub_flipped', 'fsub') }}
+{{ per_col_op('.per_col_min', 'fmin') }}
+{{ per_col_op('.per_col_max', 'fmax') }}
+
+// -------- per-row ops -----------------------------------------------------
+//
+// 32-element row vector at x2 (top 16 rows) and x2+64 (bottom 16 rows).
+// Load one f32 per iteration and broadcast (no SVE indexed-broadcast for
+// arbitrary i across 16 lanes, so we just walk the bias pointer).
+
+{% macro per_row_op(label, op) %}
+{{label}}:
+    ldr     x2, [x0, #8]
+    add     x3, x2, #64
+    mov     w12, #0
+.L{{label|replace('.', '')}}_top:
+    ldr     w4, [x2], #4
+    dup     z4.s, w4
+    mov     z6.s, p0/m, za0h.s[w12, 0]
+    mov     z7.s, p0/m, za1h.s[w12, 0]
+    {{op}}  z6.s, p0/m, z6.s, z4.s
+    {{op}}  z7.s, p0/m, z7.s, z4.s
+    mov     za0h.s[w12, 0], p0/m, z6.s
+    mov     za1h.s[w12, 0], p0/m, z7.s
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .L{{label|replace('.', '')}}_top
+    mov     w12, #0
+.L{{label|replace('.', '')}}_bot:
+    ldr     w4, [x3], #4
+    dup     z4.s, w4
+    mov     z6.s, p0/m, za2h.s[w12, 0]
+    mov     z7.s, p0/m, za3h.s[w12, 0]
+    {{op}}  z6.s, p0/m, z6.s, z4.s
+    {{op}}  z7.s, p0/m, z7.s, z4.s
+    mov     za2h.s[w12, 0], p0/m, z6.s
+    mov     za3h.s[w12, 0], p0/m, z7.s
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .L{{label|replace('.', '')}}_bot
+    b       .non_linear_loop
+{% endmacro %}
+
+{{ per_row_op('.per_row_add', 'fadd') }}
+{{ per_row_op('.per_row_mul', 'fmul') }}
+{{ per_row_op('.per_row_sub', 'fsubr') }}
+{{ per_row_op('.per_row_sub_flipped', 'fsub') }}
+{{ per_row_op('.per_row_min', 'fmin') }}
+{{ per_row_op('.per_row_max', 'fmax') }}
+
+// -------- AddRowColProducts: ZA += rows ⊗ cols (rank-1 update) ------------
+//
+// Same shape as a K=1 matmul step: load 32 f32 of rows + 32 of cols, four
+// FMOPAs into the 2x2 ZA grid.
+
+.add_row_col_products:
+    ldp     x2, x3, [x0, #8]            // rows ptr, cols ptr
+    ld1w    {z0.s}, p0/z, [x2]
+    ld1w    {z2.s}, p0/z, [x2, #1, mul vl]
+    ld1w    {z1.s}, p0/z, [x3]
+    ld1w    {z3.s}, p0/z, [x3, #1, mul vl]
+    fmopa   za0.s, p0/m, p0/m, z0.s, z1.s
+    fmopa   za1.s, p0/m, p0/m, z0.s, z3.s
+    fmopa   za2.s, p0/m, p0/m, z2.s, z1.s
+    fmopa   za3.s, p0/m, p0/m, z2.s, z3.s
+    b       .non_linear_loop
+
+// -------- AddUnicast: ZA += C[i][j] from strided buffer -------------------
+//
+// FusedKerSpec::AddUnicast(OutputStoreKer { ptr, row_byte_stride,
+//                                           col_byte_stride, item_size })
+// Fast path: contiguous f32 cols (col_stride == 4) — load each row via
+// ld1w then in-place fadd to ZA slice.
+
+.add_unicast:
+    ldp     x5, x6, [x0, #8]    // ptr, row_byte_stride
+    ldp     x7, x8, [x0, #24]   // col_byte_stride, item_size
+
+    cmp     x7, #4
+    b.ne    .Laddu_generic
+    cmp     x8, #4
+    b.ne    .Laddu_generic
+
+    add     x4, x5, #64
+    mov     w12, #0
+.Laddu_top:
+    ld1w    {z8.s}, p0/z, [x5]
+    ld1w    {z9.s}, p0/z, [x4]
+    mov     z6.s, p0/m, za0h.s[w12, 0]
+    mov     z7.s, p0/m, za1h.s[w12, 0]
+    fadd    z6.s, p0/m, z6.s, z8.s
+    fadd    z7.s, p0/m, z7.s, z9.s
+    mov     za0h.s[w12, 0], p0/m, z6.s
+    mov     za1h.s[w12, 0], p0/m, z7.s
+    add     x5, x5, x6
+    add     x4, x4, x6
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Laddu_top
+    mov     w12, #0
+.Laddu_bot:
+    ld1w    {z8.s}, p0/z, [x5]
+    ld1w    {z9.s}, p0/z, [x4]
+    mov     z6.s, p0/m, za2h.s[w12, 0]
+    mov     z7.s, p0/m, za3h.s[w12, 0]
+    fadd    z6.s, p0/m, z6.s, z8.s
+    fadd    z7.s, p0/m, z7.s, z9.s
+    mov     za2h.s[w12, 0], p0/m, z6.s
+    mov     za3h.s[w12, 0], p0/m, z7.s
+    add     x5, x5, x6
+    add     x4, x4, x6
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Laddu_bot
+    b       .non_linear_loop
+
+.Laddu_generic:
+    // Generic strided gather: walk 32 rows × 32 cols, accumulate one
+    // element at a time into the spill scratch, then re-load each slice
+    // into ZA and fadd.
+    //
+    // Phase 1B keeps this slow but correct — it triggers only for non-
+    // contiguous AddUnicast which auto-tests don't exercise.
+    mov     x3, #0                  // row idx
+    mov     x9, x1                  // scratch ptr
+.Laddu_gen_row:
+    mov     x10, #0
+    mov     x11, x5
+.Laddu_gen_col:
+    ldr     w4, [x11]
+    str     w4, [x9], #4
+    add     x11, x11, x7
+    add     x10, x10, #1
+    cmp     x10, #32
+    b.lt    .Laddu_gen_col
+    add     x5, x5, x6
+    add     x3, x3, #1
+    cmp     x3, #32
+    b.lt    .Laddu_gen_row
+
+    // Now scratch holds 32x32 f32 row-major. Same loop as fast path but
+    // reading from scratch (contiguous).
+    mov     x9, x1
+    add     x4, x9, #64
+    mov     w12, #0
+.Laddu_gen_apply_top:
+    ld1w    {z8.s}, p0/z, [x9]
+    ld1w    {z10.s}, p0/z, [x4]
+    mov     z6.s, p0/m, za0h.s[w12, 0]
+    mov     z7.s, p0/m, za1h.s[w12, 0]
+    fadd    z6.s, p0/m, z6.s, z8.s
+    fadd    z7.s, p0/m, z7.s, z10.s
+    mov     za0h.s[w12, 0], p0/m, z6.s
+    mov     za1h.s[w12, 0], p0/m, z7.s
+    add     x9, x9, #128
+    add     x4, x4, #128
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Laddu_gen_apply_top
+    mov     w12, #0
+.Laddu_gen_apply_bot:
+    ld1w    {z8.s}, p0/z, [x9]
+    ld1w    {z10.s}, p0/z, [x4]
+    mov     z6.s, p0/m, za2h.s[w12, 0]
+    mov     z7.s, p0/m, za3h.s[w12, 0]
+    fadd    z6.s, p0/m, z6.s, z8.s
+    fadd    z7.s, p0/m, z7.s, z10.s
+    mov     za2h.s[w12, 0], p0/m, z6.s
+    mov     za3h.s[w12, 0], p0/m, z7.s
+    add     x9, x9, #128
+    add     x4, x4, #128
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Laddu_gen_apply_bot
+    b       .non_linear_loop
+
+// -------- LoadTile: ZA := row-major tile from memory ----------------------
+//
+// FusedKerSpec::LoadTile(col_major_ptr, row_major_ptr):
+//   [x0, #8]  = col-major ptr (unused; AMX prefers this for its layout)
+//   [x0, #16] = row-major ptr (32x32 f32, 128 B per row)
+//
+// We use the row-major pointer because the ZA H-tile store path is itself
+// row-major and matches naturally.
+
+.load_tile:
+    ldr     x2, [x0, #16]
+    add     x4, x2, #64
+    mov     w12, #0
+.Lloadtile_top:
+    ld1w    {z6.s}, p0/z, [x2]
+    ld1w    {z7.s}, p0/z, [x4]
+    mov     za0h.s[w12, 0], p0/m, z6.s
+    mov     za1h.s[w12, 0], p0/m, z7.s
+    add     x2, x2, #128
+    add     x4, x4, #128
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Lloadtile_top
+    mov     w12, #0
+.Lloadtile_bot:
+    ld1w    {z6.s}, p0/z, [x2]
+    ld1w    {z7.s}, p0/z, [x4]
+    mov     za2h.s[w12, 0], p0/m, z6.s
+    mov     za3h.s[w12, 0], p0/m, z7.s
+    add     x2, x2, #128
+    add     x4, x4, #128
+    add     w12, w12, #1
+    cmp     w12, #16
+    b.lt    .Lloadtile_bot
+    b       .non_linear_loop
+
+// -------- still not implemented (low priority for Phase 1) ----------------
+
+.leaky_relu:
+.q_scale:
+.q_shl:
+.q_shr:
+    b       .unsupported
+
+// -------- epilogue --------------------------------------------------------
+
+.return:
+    smstop
+    add     sp, sp, #4096
+    ldp     q14, q15, [sp, #96]
+    ldp     q12, q13, [sp, #64]
+    ldp     q10, q11, [sp, #32]
+    ldp     q8,  q9,  [sp], #128
+    ret
diff --git a/linalg/arm64/sme/sme_mmv_f32_64x1.S.j2 b/linalg/arm64/sme/sme_mmv_f32_64x1.S.j2
new file mode 100644
index 0000000000..2b1fddd3f9
--- /dev/null
+++ b/linalg/arm64/sme/sme_mmv_f32_64x1.S.j2
@@ -0,0 +1,268 @@
+// vim: ft=arm
+//
+// SME2 f32 64x1 GEMV kernel.
+//
+// Accumulator layout: ZA tile slot rows 0..3 (one vgx4 group starting at
+// w8=0). The 64-element output column maps to these 4 slots × 16 f32 each.
+//
+// Inner K-step: load 64 f32 of A's column into {z0.s-z3.s} via one SME2
+// multi-vec LD1W, broadcast B[k] into z4 with LD1RW, issue ONE multi-vec
+// vgx4 FMLA-into-ZA. Measured peak on M4: ~125 GFLOPS (ZA-write port).
+// Plain SME single-vec predicated FMLA into Z regs caps at ~31 GFLOPS so
+// is not used. Detection therefore gates on FEAT_SME2, not just FEAT_SME.
+//
+// Calling convention (extern "C", AAPCS64):
+//   x0 = const *FusedKerSpec<f32>, advanced 40 B per dispatcher iteration.
+//   x1 = stack-resident 256 B scratch buffer for the strided-store path.
+//   w8 = 0 throughout (vgx-group base index, set in prologue).
+//
+// Streaming-mode rules: PSTATE.SM=1 from prologue smstart to epilogue
+// smstop. v8..v15 saved/restored across the streaming region.
+
+.arch armv9-a+sme2
+.text
+.align 4
+
+.global {{G}}sme_mmv_f32_64x1_{{suffix}}
+{{G}}sme_mmv_f32_64x1_{{suffix}}:
+
+    stp     q8,  q9,  [sp, #-128]!
+    stp     q10, q11, [sp, #32]
+    stp     q12, q13, [sp, #64]
+    stp     q14, q15, [sp, #96]
+
+    // 256 B = 64 f32 spill buffer for the strided-store / AddUnicast paths.
+    sub     sp, sp, #256
+    mov     x1, sp
+
+    smstart
+    ptrue   p0.b
+    ptrue   pn8.b
+    mov     w8, #0
+
+{% include "dispatcher.j2" %}
+
+// -------- supported fuse ops -----------------------------------------------
+
+.add_mat_mul:
+    ldr     x2, [x0, #24]       // b ptr
+    ldp     x3, x4, [x0, #8]    // k, a ptr
+    cmp     x3, #0
+    b.eq    .non_linear_loop
+
+.Lmmv_loop:
+    ld1w    {z0.s-z3.s}, pn8/z, [x4]
+    add     x4, x4, #256
+    ld1rw   {z4.s}, p0/z, [x2]
+    add     x2, x2, #4
+    fmla    za.s[w8, 0, vgx4], {z0.s-z3.s}, z4.s[0]
+    subs    x3, x3, #1
+    b.ne    .Lmmv_loop
+    b       .non_linear_loop
+
+.clear:
+    zero    {za}
+    b       .non_linear_loop
+
+.store:
+    // FusedKerSpec::Store(OutputStoreKer { ptr, row_byte_stride,
+    //                                      col_byte_stride, item_size })
+    //   [x0, #8]  = ptr           [x0, #16] = row_byte_stride
+    //   [x0, #24] = col_byte_stride  [x0, #32] = item_size
+    // x8 must NOT be touched (it's the vgx-base index, set to 0 in prologue).
+    ldp     x5, x6, [x0, #8]    // ptr, row_byte_stride
+    ldp     x7, x9, [x0, #24]   // col_byte_stride, item_size
+
+    // At NR=1 the output column is one element per row; the fast path
+    // triggers when (row_byte_stride==4 AND item_size==4) i.e. the 64
+    // outputs are contiguous in memory.
+    cmp     x6, #4
+    b.ne    .Lstore_generic
+    cmp     x9, #4
+    b.ne    .Lstore_generic
+
+    mov     {z0.s-z3.s}, za.s[w8, 0, vgx4]
+    st1w    {z0.s}, p0, [x5]
+    st1w    {z1.s}, p0, [x5, #1, mul vl]
+    st1w    {z2.s}, p0, [x5, #2, mul vl]
+    st1w    {z3.s}, p0, [x5, #3, mul vl]
+    b       .non_linear_loop
+
+.Lstore_generic:
+    // Spill ZA → 256 B scratch buffer x1, then per-element strided write.
+    mov     {z0.s-z3.s}, za.s[w8, 0, vgx4]
+    st1w    {z0.s}, p0, [x1]
+    st1w    {z1.s}, p0, [x1, #1, mul vl]
+    st1w    {z2.s}, p0, [x1, #2, mul vl]
+    st1w    {z3.s}, p0, [x1, #3, mul vl]
+
+    mov     x3, #0
+    mov     x9, x1
+.Lstore_scatter:
+    ldr     w10, [x9], #4
+    str     w10, [x5]
+    add     x5, x5, x6
+    add     x3, x3, #1
+    cmp     x3, #64
+    b.lt    .Lstore_scatter
+    b       .non_linear_loop
+
+// -------- LoadTile: ZA := tile from row-major source -----------------------
+//
+// FusedKerSpec::LoadTile(col_major_ptr, row_major_ptr) — same as Phase 1's
+// 32x32 LoadTile. NR=1 collapses both pointers to the same 64-element vec;
+// we use the row-major form at [x0, #16].
+
+.load_tile:
+    ldr     x2, [x0, #16]
+    ld1w    {z0.s-z3.s}, pn8/z, [x2]
+    mov     za.s[w8, 0, vgx4], {z0.s-z3.s}
+    b       .non_linear_loop
+
+// -------- AddRowColProducts: ZA += rows ⊗ cols (rank-1 K=1) ---------------
+//
+// NR=1: cols is a single f32, rows is a 64-element vector. Effectively one
+// K-step of add_mat_mul with K=1.
+
+.add_row_col_products:
+    ldp     x2, x3, [x0, #8]           // rows ptr, cols ptr
+    ld1w    {z0.s-z3.s}, pn8/z, [x2]
+    ld1rw   {z4.s}, p0/z, [x3]
+    fmla    za.s[w8, 0, vgx4], {z0.s-z3.s}, z4.s[0]
+    b       .non_linear_loop
+
+// -------- AddUnicast: ZA += C from strided buffer --------------------------
+//
+// NR=1 implies a 64-element column vec layout. Fast path = contiguous f32
+// rows (row_stride == 4); generic path gathers strided.
+
+.add_unicast:
+    ldp     x5, x6, [x0, #8]    // ptr, row_byte_stride
+    ldp     x7, x9, [x0, #24]   // col_byte_stride, item_size
+
+    cmp     x6, #4
+    b.ne    .Laddu_generic
+    cmp     x9, #4
+    b.ne    .Laddu_generic
+
+    // Fast path: contiguous load via 4-vec LD1W.
+    ld1w    {z16.s-z19.s}, pn8/z, [x5]
+    mov     {z0.s-z3.s}, za.s[w8, 0, vgx4]
+    fadd    z0.s, p0/m, z0.s, z16.s
+    fadd    z1.s, p0/m, z1.s, z17.s
+    fadd    z2.s, p0/m, z2.s, z18.s
+    fadd    z3.s, p0/m, z3.s, z19.s
+    mov     za.s[w8, 0, vgx4], {z0.s-z3.s}
+    b       .non_linear_loop
+
+.Laddu_generic:
+    // Per-element strided gather into scratch, then contiguous accumulate.
+    mov     x3, #0
+    mov     x9, x1
+.Laddu_gather:
+    ldr     w10, [x5]
+    str     w10, [x9], #4
+    add     x5, x5, x6
+    add     x3, x3, #1
+    cmp     x3, #64
+    b.lt    .Laddu_gather
+    ld1w    {z16.s-z19.s}, pn8/z, [x1]
+    mov     {z0.s-z3.s}, za.s[w8, 0, vgx4]
+    fadd    z0.s, p0/m, z0.s, z16.s
+    fadd    z1.s, p0/m, z1.s, z17.s
+    fadd    z2.s, p0/m, z2.s, z18.s
+    fadd    z3.s, p0/m, z3.s, z19.s
+    mov     za.s[w8, 0, vgx4], {z0.s-z3.s}
+    b       .non_linear_loop
+
+// -------- scalar / per_col ops (degenerate at NR=1; per_col == scalar) -----
+//
+// Per Phase 1's mapping:
+//   ScalarSub   → result = scalar - z   (fsubr)
+//   ScalarSubF  → result = z - scalar   (fsub)
+// Same convention applies to PerCol*.
+
+{% macro scalar_op(label, op) %}
+{{label}}:
+    ldr     w2, [x0, #8]
+    dup     z4.s, w2
+    mov     {z0.s-z3.s}, za.s[w8, 0, vgx4]
+    {{op}}  z0.s, p0/m, z0.s, z4.s
+    {{op}}  z1.s, p0/m, z1.s, z4.s
+    {{op}}  z2.s, p0/m, z2.s, z4.s
+    {{op}}  z3.s, p0/m, z3.s, z4.s
+    mov     za.s[w8, 0, vgx4], {z0.s-z3.s}
+    b       .non_linear_loop
+{% endmacro %}
+
+{{ scalar_op('.scalar_add', 'fadd') }}
+{{ scalar_op('.scalar_mul', 'fmul') }}
+{{ scalar_op('.scalar_sub', 'fsubr') }}
+{{ scalar_op('.scalar_sub_flipped', 'fsub') }}
+{{ scalar_op('.scalar_min', 'fmin') }}
+{{ scalar_op('.scalar_max', 'fmax') }}
+
+// per_col at NR=1 takes a *pointer* to 1 f32 at [x0, #8]; dereference
+// and broadcast. Result is functionally identical to scalar but the
+// load path differs.
+
+{% macro per_col_op(label, op) %}
+{{label}}:
+    ldr     x2, [x0, #8]
+    ld1rw   {z4.s}, p0/z, [x2]
+    mov     {z0.s-z3.s}, za.s[w8, 0, vgx4]
+    {{op}}  z0.s, p0/m, z0.s, z4.s
+    {{op}}  z1.s, p0/m, z1.s, z4.s
+    {{op}}  z2.s, p0/m, z2.s, z4.s
+    {{op}}  z3.s, p0/m, z3.s, z4.s
+    mov     za.s[w8, 0, vgx4], {z0.s-z3.s}
+    b       .non_linear_loop
+{% endmacro %}
+
+{{ per_col_op('.per_col_add', 'fadd') }}
+{{ per_col_op('.per_col_mul', 'fmul') }}
+{{ per_col_op('.per_col_sub', 'fsubr') }}
+{{ per_col_op('.per_col_sub_flipped', 'fsub') }}
+{{ per_col_op('.per_col_min', 'fmin') }}
+{{ per_col_op('.per_col_max', 'fmax') }}
+
+// -------- per_row ops: 64-element bias, lane-wise op against accumulator --
+
+{% macro per_row_op(label, op) %}
+{{label}}:
+    ldr     x2, [x0, #8]
+    ld1w    {z16.s-z19.s}, pn8/z, [x2]
+    mov     {z0.s-z3.s}, za.s[w8, 0, vgx4]
+    {{op}}  z0.s, p0/m, z0.s, z16.s
+    {{op}}  z1.s, p0/m, z1.s, z17.s
+    {{op}}  z2.s, p0/m, z2.s, z18.s
+    {{op}}  z3.s, p0/m, z3.s, z19.s
+    mov     za.s[w8, 0, vgx4], {z0.s-z3.s}
+    b       .non_linear_loop
+{% endmacro %}
+
+{{ per_row_op('.per_row_add', 'fadd') }}
+{{ per_row_op('.per_row_mul', 'fmul') }}
+{{ per_row_op('.per_row_sub', 'fsubr') }}
+{{ per_row_op('.per_row_sub_flipped', 'fsub') }}
+{{ per_row_op('.per_row_min', 'fmin') }}
+{{ per_row_op('.per_row_max', 'fmax') }}
+
+// -------- not yet implemented ----------------------------------------------
+
+.leaky_relu:
+.q_scale:
+.q_shl:
+.q_shr:
+    b       .unsupported
+
+// -------- epilogue ---------------------------------------------------------
+
+.return:
+    smstop
+    add     sp, sp, #256
+    ldp     q14, q15, [sp, #96]
+    ldp     q12, q13, [sp, #64]
+    ldp     q10, q11, [sp, #32]
+    ldp     q8,  q9,  [sp], #128
+    ret
diff --git a/linalg/arm64/sve/sve_mmm_f16.c b/linalg/arm64/sve/sve_mmm_f16.c
new file mode 100644
index 0000000000..55bb28bb98
--- /dev/null
+++ b/linalg/arm64/sve/sve_mmm_f16.c
@@ -0,0 +1,153 @@
+// SVE f16 GEMM kernel for tract's MMM framework (the mmm_f16 slot).
+//
+// Tile MR=8 x NR=8 with native f16 accumulation, the f16 sibling of
+// sve_mmm_f32.c. The hot AddMatMul is the same vector-length-agnostic
+// broadcast-A rank-1 update, but over f16 lanes: the NR columns are walked in
+// svcnth() chunks with whilelt predication and folded with svmla_n_f16 (native
+// f16 fused multiply-add, as the NEON arm64fp16 kernels do), so one binary is
+// correct and full-width at any SVE VL 128..2048-bit.
+//
+// Gated on FEAT_SVE2 AND FEAT_FP16 (Rust side). Built with +fp16. Consumes
+// tract's native f16 K-major packing. Fuse ops act on the MRxNR tile in memory
+// (scalar; not the hot path); as with the f32 kernel, LeakyRelu and the i32
+// quantization ops are excluded by CAN_FUSE. Returns 0 on success, 1 otherwise.
+
+#include <arm_sve.h>
+#include <stdint.h>
+#include <string.h>
+
+#define MR 8
+#define NR 8
+
+enum {
+    DONE = 0, CLEAR, LOAD_TILE,
+    SCALAR_MIN, SCALAR_MAX, SCALAR_ADD, SCALAR_MUL, SCALAR_SUB, SCALAR_SUBF,
+    LEAKY_RELU,
+    PER_ROW_MIN, PER_ROW_MAX, PER_ROW_ADD, PER_ROW_MUL, PER_ROW_SUB, PER_ROW_SUBF,
+    PER_COL_MIN, PER_COL_MAX, PER_COL_ADD, PER_COL_MUL, PER_COL_SUB, PER_COL_SUBF,
+    Q_SCALE, Q_SHR, Q_SHL,
+    ADD_UNICAST, ADD_ROW_COL_PRODUCTS, STORE, ADD_MAT_MUL
+};
+
+typedef struct {
+    uint64_t disc;
+    uint64_t f0, f1, f2, f3;
+} spec_t;
+
+static inline __fp16 f16_of(uint64_t bits) {
+    __fp16 f;
+    uint16_t lo = (uint16_t)bits;
+    memcpy(&f, &lo, 2);
+    return f;
+}
+
+// AddMatMul: ab[m][n] += sum_k pa[k*MR+m] * pb[k*NR+n]. VLA over NR (f16 lanes).
+static void add_mat_mul(__fp16 ab[MR][NR], const __fp16 *pa, const __fp16 *pb, long k) {
+    for (long n0 = 0; n0 < NR; n0 += svcnth()) {
+        svbool_t pg = svwhilelt_b16((uint64_t)n0, (uint64_t)NR);
+        svfloat16_t a0 = svld1_f16(pg, &ab[0][n0]), a1 = svld1_f16(pg, &ab[1][n0]);
+        svfloat16_t a2 = svld1_f16(pg, &ab[2][n0]), a3 = svld1_f16(pg, &ab[3][n0]);
+        svfloat16_t a4 = svld1_f16(pg, &ab[4][n0]), a5 = svld1_f16(pg, &ab[5][n0]);
+        svfloat16_t a6 = svld1_f16(pg, &ab[6][n0]), a7 = svld1_f16(pg, &ab[7][n0]);
+        for (long kk = 0; kk < k; kk++) {
+            svfloat16_t b = svld1_f16(pg, &pb[kk * NR + n0]);
+            const __fp16 *arow = &pa[kk * MR];
+            a0 = svmla_n_f16_x(pg, a0, b, arow[0]);
+            a1 = svmla_n_f16_x(pg, a1, b, arow[1]);
+            a2 = svmla_n_f16_x(pg, a2, b, arow[2]);
+            a3 = svmla_n_f16_x(pg, a3, b, arow[3]);
+            a4 = svmla_n_f16_x(pg, a4, b, arow[4]);
+            a5 = svmla_n_f16_x(pg, a5, b, arow[5]);
+            a6 = svmla_n_f16_x(pg, a6, b, arow[6]);
+            a7 = svmla_n_f16_x(pg, a7, b, arow[7]);
+        }
+        svst1_f16(pg, &ab[0][n0], a0); svst1_f16(pg, &ab[1][n0], a1);
+        svst1_f16(pg, &ab[2][n0], a2); svst1_f16(pg, &ab[3][n0], a3);
+        svst1_f16(pg, &ab[4][n0], a4); svst1_f16(pg, &ab[5][n0], a5);
+        svst1_f16(pg, &ab[6][n0], a6); svst1_f16(pg, &ab[7][n0], a7);
+    }
+}
+
+// Store the MRxNR f16 tile with arbitrary row/col byte strides.
+static void store_tile(__fp16 ab[MR][NR], const spec_t *s) {
+    uint8_t *ptr = (uint8_t *)s->f0;
+    long rstride = (long)s->f1, cstride = (long)s->f2, isz = (long)s->f3;
+    for (long i = 0; i < MR; i++)
+        for (long j = 0; j < NR; j++) {
+            uint8_t *p = ptr + i * rstride + j * cstride;
+            if (isz == 2)
+                *(__fp16 *)p = ab[i][j];
+            else if (isz == 4)
+                *(float *)p = (float)ab[i][j];
+            else
+                memcpy(p, &ab[i][j], isz);
+        }
+}
+
+intptr_t sve_mmm_f16_kernel(const spec_t *ops) {
+    __fp16 ab[MR][NR];
+    memset(ab, 0, sizeof(ab));
+    for (const spec_t *s = ops;; s++) {
+        switch (s->disc) {
+            case DONE:
+                return 0;
+            case CLEAR:
+                memset(ab, 0, sizeof(ab));
+                break;
+            case ADD_MAT_MUL: {
+                long k = (long)s->f0;
+                add_mat_mul(ab, (const __fp16 *)s->f1, (const __fp16 *)s->f2, k);
+                break;
+            }
+            case STORE:
+                store_tile(ab, s);
+                break;
+            case LOAD_TILE: {
+                const __fp16 *src = (const __fp16 *)s->f1;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] = src[i * NR + j];
+                break;
+            }
+            case ADD_UNICAST: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1, cstride = (long)s->f2, isz = (long)s->f3;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) {
+                        const uint8_t *p = ptr + i * rstride + j * cstride;
+                        if (isz == 2)
+                            ab[i][j] += *(const __fp16 *)p;
+                        else
+                            ab[i][j] += (__fp16) * (const float *)p;
+                    }
+                break;
+            }
+            case ADD_ROW_COL_PRODUCTS: {
+                const __fp16 *rows = (const __fp16 *)s->f0;
+                const __fp16 *cols = (const __fp16 *)s->f1;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] += rows[i] * cols[j];
+                break;
+            }
+            case SCALAR_MIN: { __fp16 v=f16_of(s->f0); for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<v?ab[i][j]:v; break; }
+            case SCALAR_MAX: { __fp16 v=f16_of(s->f0); for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>v?ab[i][j]:v; break; }
+            case SCALAR_ADD: { __fp16 v=f16_of(s->f0); for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]+=v; break; }
+            case SCALAR_MUL: { __fp16 v=f16_of(s->f0); for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]*=v; break; }
+            case SCALAR_SUB: { __fp16 v=f16_of(s->f0); for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=v-ab[i][j]; break; }
+            case SCALAR_SUBF:{ __fp16 v=f16_of(s->f0); for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-v; break; }
+            case PER_ROW_MIN: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<m[i]?ab[i][j]:m[i]; break; }
+            case PER_ROW_MAX: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>m[i]?ab[i][j]:m[i]; break; }
+            case PER_ROW_ADD: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]+=m[i]; break; }
+            case PER_ROW_MUL: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]*=m[i]; break; }
+            case PER_ROW_SUB: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=m[i]-ab[i][j]; break; }
+            case PER_ROW_SUBF:{ const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-m[i]; break; }
+            case PER_COL_MIN: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<m[j]?ab[i][j]:m[j]; break; }
+            case PER_COL_MAX: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>m[j]?ab[i][j]:m[j]; break; }
+            case PER_COL_ADD: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]+=m[j]; break; }
+            case PER_COL_MUL: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]*=m[j]; break; }
+            case PER_COL_SUB: { const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=m[j]-ab[i][j]; break; }
+            case PER_COL_SUBF:{ const __fp16*m=(const __fp16*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-m[j]; break; }
+            default:
+                return 1;
+        }
+    }
+}
diff --git a/linalg/arm64/sve/sve_mmm_f32.c b/linalg/arm64/sve/sve_mmm_f32.c
new file mode 100644
index 0000000000..7a36929bfc
--- /dev/null
+++ b/linalg/arm64/sve/sve_mmm_f32.c
@@ -0,0 +1,159 @@
+// SVE f32 GEMM kernel for tract's MMM framework.
+//
+// Tile MR=8 x NR=8. The hot AddMatMul uses the vector-length-agnostic
+// broadcast-A rank-1 update: the NR columns are walked in svcntw() chunks with
+// whilelt predication, so the SAME code runs correctly (and uses the full
+// vector) at any SVE vector length — 128 to 2048-bit. The MR=8 accumulators are
+// held in SVE registers across the K loop. Fuse ops operate on an MRxNR tile in
+// memory (scalar C — they are not the hot path), mirroring the generic kernel.
+//
+// ABI: the kernel walks a *const FusedKerSpec<f32> array (40 bytes / entry,
+// discriminant u64 at offset 0, fields at 8/16/24/32) until Done, exactly like
+// the asm dispatcher. f32 GEMM consumes tract's native K-major packing
+// (pa[k*MR+m], pb[k*NR+n]) so no custom packing format is required.
+//
+// Returns 0 on success, 1 if asked to do an unsupported fused op.
+
+#include <arm_sve.h>
+#include <stdint.h>
+#include <string.h>
+
+#define MR 8
+#define NR 8
+
+// FusedKerSpec discriminants (must match frame/mmm/fuse.rs enum order).
+enum {
+    DONE = 0, CLEAR, LOAD_TILE,
+    SCALAR_MIN, SCALAR_MAX, SCALAR_ADD, SCALAR_MUL, SCALAR_SUB, SCALAR_SUBF,
+    LEAKY_RELU,
+    PER_ROW_MIN, PER_ROW_MAX, PER_ROW_ADD, PER_ROW_MUL, PER_ROW_SUB, PER_ROW_SUBF,
+    PER_COL_MIN, PER_COL_MAX, PER_COL_ADD, PER_COL_MUL, PER_COL_SUB, PER_COL_SUBF,
+    Q_SCALE, Q_SHR, Q_SHL,
+    ADD_UNICAST, ADD_ROW_COL_PRODUCTS, STORE, ADD_MAT_MUL
+};
+
+typedef struct {
+    uint64_t disc;
+    uint64_t f0, f1, f2, f3; // fields at byte offsets 8, 16, 24, 32
+} spec_t;
+
+// AddMatMul: ab[m][n] += sum_k pa[k*MR+m] * pb[k*NR+n]. VLA over NR.
+static void add_mat_mul(float ab[MR][NR], const float *pa, const float *pb, long k) {
+    for (long n0 = 0; n0 < NR; n0 += svcntw()) {
+        svbool_t pg = svwhilelt_b32((uint64_t)n0, (uint64_t)NR);
+        svfloat32_t a0 = svld1_f32(pg, &ab[0][n0]), a1 = svld1_f32(pg, &ab[1][n0]);
+        svfloat32_t a2 = svld1_f32(pg, &ab[2][n0]), a3 = svld1_f32(pg, &ab[3][n0]);
+        svfloat32_t a4 = svld1_f32(pg, &ab[4][n0]), a5 = svld1_f32(pg, &ab[5][n0]);
+        svfloat32_t a6 = svld1_f32(pg, &ab[6][n0]), a7 = svld1_f32(pg, &ab[7][n0]);
+        for (long kk = 0; kk < k; kk++) {
+            svfloat32_t b = svld1_f32(pg, &pb[kk * NR + n0]);
+            const float *arow = &pa[kk * MR];
+            a0 = svmla_n_f32_x(pg, a0, b, arow[0]);
+            a1 = svmla_n_f32_x(pg, a1, b, arow[1]);
+            a2 = svmla_n_f32_x(pg, a2, b, arow[2]);
+            a3 = svmla_n_f32_x(pg, a3, b, arow[3]);
+            a4 = svmla_n_f32_x(pg, a4, b, arow[4]);
+            a5 = svmla_n_f32_x(pg, a5, b, arow[5]);
+            a6 = svmla_n_f32_x(pg, a6, b, arow[6]);
+            a7 = svmla_n_f32_x(pg, a7, b, arow[7]);
+        }
+        svst1_f32(pg, &ab[0][n0], a0); svst1_f32(pg, &ab[1][n0], a1);
+        svst1_f32(pg, &ab[2][n0], a2); svst1_f32(pg, &ab[3][n0], a3);
+        svst1_f32(pg, &ab[4][n0], a4); svst1_f32(pg, &ab[5][n0], a5);
+        svst1_f32(pg, &ab[6][n0], a6); svst1_f32(pg, &ab[7][n0], a7);
+    }
+}
+
+static inline float f32_of(uint64_t bits) {
+    float f;
+    uint32_t lo = (uint32_t)bits;
+    memcpy(&f, &lo, 4);
+    return f;
+}
+
+// Store the MRxNR tile to memory with arbitrary row/col byte strides.
+static void store_tile(float ab[MR][NR], const spec_t *s) {
+    uint8_t *ptr = (uint8_t *)s->f0;
+    long rstride = (long)s->f1, cstride = (long)s->f2, isz = (long)s->f3;
+    for (long i = 0; i < MR; i++)
+        for (long j = 0; j < NR; j++) {
+            uint8_t *p = ptr + i * rstride + j * cstride;
+            if (isz == 4)
+                *(float *)p = ab[i][j];
+            else
+                memcpy(p, &ab[i][j], isz);
+        }
+}
+
+// Returns isize (64-bit) to match tract's kernel ABI — NOT int (would leave the
+// upper 32 bits of x0 undefined).
+intptr_t sve_mmm_f32_kernel(const spec_t *ops) {
+    float ab[MR][NR];
+    memset(ab, 0, sizeof(ab));
+    for (const spec_t *s = ops;; s++) {
+        switch (s->disc) {
+            case DONE:
+                return 0;
+            case CLEAR:
+                memset(ab, 0, sizeof(ab));
+                break;
+            case ADD_MAT_MUL: {
+                long k = (long)s->f0;
+                const float *pa = (const float *)s->f1;
+                const float *pb = (const float *)s->f2;
+                add_mat_mul(ab, pa, pb, k);
+                break;
+            }
+            case STORE:
+                store_tile(ab, s);
+                break;
+            case LOAD_TILE: {
+                // LoadTile(col_major_ptr, row_major_ptr); use the row-major one.
+                const float *src = (const float *)s->f1;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] = src[i * NR + j];
+                break;
+            }
+            case ADD_UNICAST: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1, cstride = (long)s->f2;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++)
+                        ab[i][j] += *(const float *)(ptr + i * rstride + j * cstride);
+                break;
+            }
+            case ADD_ROW_COL_PRODUCTS: {
+                const float *rows = (const float *)s->f0;
+                const float *cols = (const float *)s->f1;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] += rows[i] * cols[j];
+                break;
+            }
+            // ---- scalar fuse ops ----
+            case SCALAR_MIN: { float v = f32_of(s->f0); for (long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<v?ab[i][j]:v; break; }
+            case SCALAR_MAX: { float v = f32_of(s->f0); for (long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>v?ab[i][j]:v; break; }
+            case SCALAR_ADD: { float v = f32_of(s->f0); for (long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]+=v; break; }
+            case SCALAR_MUL: { float v = f32_of(s->f0); for (long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]*=v; break; }
+            case SCALAR_SUB: { float v = f32_of(s->f0); for (long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=v-ab[i][j]; break; }
+            case SCALAR_SUBF:{ float v = f32_of(s->f0); for (long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-v; break; }
+            // ---- per-row fuse ops (one value per m-row) ----
+            case PER_ROW_MIN: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<m[i]?ab[i][j]:m[i]; break; }
+            case PER_ROW_MAX: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>m[i]?ab[i][j]:m[i]; break; }
+            case PER_ROW_ADD: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]+=m[i]; break; }
+            case PER_ROW_MUL: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]*=m[i]; break; }
+            case PER_ROW_SUB: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=m[i]-ab[i][j]; break; }
+            case PER_ROW_SUBF:{ const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-m[i]; break; }
+            // ---- per-col fuse ops (one value per n-col) ----
+            case PER_COL_MIN: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<m[j]?ab[i][j]:m[j]; break; }
+            case PER_COL_MAX: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>m[j]?ab[i][j]:m[j]; break; }
+            case PER_COL_ADD: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]+=m[j]; break; }
+            case PER_COL_MUL: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]*=m[j]; break; }
+            case PER_COL_SUB: { const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=m[j]-ab[i][j]; break; }
+            case PER_COL_SUBF:{ const float*m=(const float*)s->f0; for(long i=0;i<MR;i++) for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-m[j]; break; }
+            default:
+                // LeakyRelu / QScale / RoundingShiftRight / ShiftLeft: excluded
+                // by CAN_FUSE for f32 — should never arrive. Anything else: error.
+                return 1;
+        }
+    }
+}
diff --git a/linalg/arm64/sve/sve_mmm_i32.c b/linalg/arm64/sve/sve_mmm_i32.c
new file mode 100644
index 0000000000..85ae61c22c
--- /dev/null
+++ b/linalg/arm64/sve/sve_mmm_i32.c
@@ -0,0 +1,261 @@
+// SVE int8 -> int32 GEMM kernel for tract's MMM framework (the qmmm_i32 slot).
+//
+// Tile MR=8 x NR=8, i32 accumulator. The hot AddMatMul is the vector-length-
+// agnostic *widening* broadcast-A rank-1 update: per K-step it loads NR signed
+// bytes of a B row and sign-extends them to i32 with `svld1sb_s32`, then folds
+// MR `svmla_n_s32` updates with the (sign-extended) scalar from the A column.
+// The NR columns are walked in svcntw() chunks with whilelt predication, so the
+// SAME code is correct and full-width at any SVE vector length (128..2048-bit).
+//
+// Why widening MLA and not SDOT: SDOT reduces 4 K-contiguous i8 per i32 lane, so
+// it needs A and B packed K-contiguous within a lane. tract's PackedFormat is
+// K-major (mn-inner: for each k, r contiguous mn values), which is exactly what
+// the per-k widening update consumes — and what arm64simd's i32 kernel uses via
+// NEON SMLAL. A SDOT/SMMLA path would need a custom interleaved packer; that is
+// a separate (max-throughput) kernel, not this one.
+//
+// int8 inputs arrive via tract's native i8i8 packing (AddMatMul packing == 1).
+// The default i32i32 packing (packing == 0) is also handled (scalar) so the
+// generic auto-test surface (mmm_packed_packed_tests i32i32:0) passes.
+//
+// ABI: identical 40-byte FusedKerSpec<i32> walk as the f32 kernel (discriminant
+// u64 at offset 0, fields at 8/16/24/32). Fuse ops act on the MRxNR i32 tile in
+// memory (scalar C — not the hot path), including the quantization ops
+// q_scale / q_shr (rounding) / q_shl, ported bit-exact from
+// linalg/src/generic/rounding.rs.
+//
+// Returns 0 on success, 1 if asked to do an unsupported fused op / packing.
+
+#include <arm_sve.h>
+#include <stdint.h>
+#include <string.h>
+
+#define MR 8
+#define NR 8
+
+// FusedKerSpec discriminants (must match frame/mmm/fuse.rs enum order).
+enum {
+    DONE = 0, CLEAR, LOAD_TILE,
+    SCALAR_MIN, SCALAR_MAX, SCALAR_ADD, SCALAR_MUL, SCALAR_SUB, SCALAR_SUBF,
+    LEAKY_RELU,
+    PER_ROW_MIN, PER_ROW_MAX, PER_ROW_ADD, PER_ROW_MUL, PER_ROW_SUB, PER_ROW_SUBF,
+    PER_COL_MIN, PER_COL_MAX, PER_COL_ADD, PER_COL_MUL, PER_COL_SUB, PER_COL_SUBF,
+    Q_SCALE, Q_SHR, Q_SHL,
+    ADD_UNICAST, ADD_ROW_COL_PRODUCTS, STORE, ADD_MAT_MUL
+};
+
+// RoundingPolicy is #[repr(usize)] in fuse.rs: Native=0, Zero=1, Away=2,
+// MinusInf=3, PlusInf=4, Even=5, Odd=6.
+enum { RP_NATIVE = 0, RP_ZERO, RP_AWAY, RP_MINUSINF, RP_PLUSINF, RP_EVEN, RP_ODD };
+
+typedef struct {
+    uint64_t disc;
+    uint64_t f0, f1, f2, f3; // fields at byte offsets 8, 16, 24, 32
+} spec_t;
+
+// AddMatMul, i8 x i8 -> i32 (packing 1): ab[m][n] += sum_k pa[k*MR+m]*pb[k*NR+n].
+// VLA widening rank-1 update over NR.
+static void add_mat_mul_i8(int32_t ab[MR][NR], const int8_t *pa, const int8_t *pb, long k) {
+    for (long n0 = 0; n0 < NR; n0 += svcntw()) {
+        svbool_t pg = svwhilelt_b32((uint64_t)n0, (uint64_t)NR);
+        svint32_t a0 = svld1_s32(pg, &ab[0][n0]), a1 = svld1_s32(pg, &ab[1][n0]);
+        svint32_t a2 = svld1_s32(pg, &ab[2][n0]), a3 = svld1_s32(pg, &ab[3][n0]);
+        svint32_t a4 = svld1_s32(pg, &ab[4][n0]), a5 = svld1_s32(pg, &ab[5][n0]);
+        svint32_t a6 = svld1_s32(pg, &ab[6][n0]), a7 = svld1_s32(pg, &ab[7][n0]);
+        for (long kk = 0; kk < k; kk++) {
+            // Load NR int8 of B row kk, sign-extending each lane to i32.
+            svint32_t b = svld1sb_s32(pg, &pb[kk * NR + n0]);
+            const int8_t *arow = &pa[kk * MR];
+            a0 = svmla_n_s32_x(pg, a0, b, (int32_t)arow[0]);
+            a1 = svmla_n_s32_x(pg, a1, b, (int32_t)arow[1]);
+            a2 = svmla_n_s32_x(pg, a2, b, (int32_t)arow[2]);
+            a3 = svmla_n_s32_x(pg, a3, b, (int32_t)arow[3]);
+            a4 = svmla_n_s32_x(pg, a4, b, (int32_t)arow[4]);
+            a5 = svmla_n_s32_x(pg, a5, b, (int32_t)arow[5]);
+            a6 = svmla_n_s32_x(pg, a6, b, (int32_t)arow[6]);
+            a7 = svmla_n_s32_x(pg, a7, b, (int32_t)arow[7]);
+        }
+        svst1_s32(pg, &ab[0][n0], a0); svst1_s32(pg, &ab[1][n0], a1);
+        svst1_s32(pg, &ab[2][n0], a2); svst1_s32(pg, &ab[3][n0], a3);
+        svst1_s32(pg, &ab[4][n0], a4); svst1_s32(pg, &ab[5][n0], a5);
+        svst1_s32(pg, &ab[6][n0], a6); svst1_s32(pg, &ab[7][n0], a7);
+    }
+}
+
+// AddMatMul, i32 x i32 -> i32 (packing 0, default): only used by the auto-test
+// surface, never in production (quantized matmul uses the i8i8 packing). Scalar.
+static void add_mat_mul_i32(int32_t ab[MR][NR], const int32_t *pa, const int32_t *pb, long k) {
+    for (long kk = 0; kk < k; kk++) {
+        const int32_t *arow = &pa[kk * MR], *brow = &pb[kk * NR];
+        for (long i = 0; i < MR; i++)
+            for (long j = 0; j < NR; j++) ab[i][j] += arow[i] * brow[j];
+    }
+}
+
+// ---- Quantization helpers, ported bit-exact from generic/rounding.rs ----
+
+// i32::q_shr(shift, rp): rounding arithmetic shift right.
+static int32_t q_shr_i32(int32_t v, long shift, int rp) {
+    int32_t half = (int32_t)1 << (shift - 1);
+    int32_t a = v < 0 ? -v : v; // abs (test inputs are small; matches Rust .abs())
+    int32_t nudge;
+    switch (rp) {
+        case RP_ZERO:     nudge = -1; break;
+        case RP_MINUSINF: nudge = -(int32_t)(v >= 0); break;
+        case RP_PLUSINF:  nudge = -(int32_t)(v <= 0); break;
+        case RP_AWAY:     nudge = 0; break;
+        case RP_EVEN:     nudge = ((a >> shift) & 0x1) - 1; break;
+        case RP_ODD:      nudge = -((a >> shift) & 0x1); break;
+        default:          nudge = 0; break; // Native: unreachable for q ops
+    }
+    int32_t sign = (v > 0) - (v < 0); // signum: -1 / 0 / 1
+    return sign * ((a + half + nudge) >> shift);
+}
+
+// i32::q_scale(Scaler{mult, shift, policy}) with mult always present (the QScale
+// fused op carries an explicit multiplier). Mirrors `Mul<i32> for Scaler`.
+static int32_t q_scale_i32(int32_t v, long shift_in, int policy, int32_t mult) {
+    int64_t val = (int64_t)mult * (int64_t)v;
+    long shift = shift_in + 31;
+    if (shift > 0) {
+        int64_t half = (int64_t)1 << (shift - 1);
+        int64_t a = val < 0 ? -val : val;
+        int64_t nudge;
+        switch (policy) {
+            case RP_ZERO:     nudge = -1; break;
+            case RP_MINUSINF: nudge = -(int64_t)(val >= 0); break;
+            case RP_PLUSINF:  nudge = -(int64_t)(val <= 0); break;
+            case RP_AWAY:     nudge = 0; break;
+            case RP_EVEN:     nudge = ((a >> shift) & 0x1) - 1; break;
+            case RP_ODD:      nudge = -((a >> shift) & 0x1); break;
+            default:          nudge = 0; break;
+        }
+        int64_t sign = (val > 0) - (val < 0);
+        return (int32_t)(sign * ((a + half + nudge) >> shift));
+    } else {
+        return (int32_t)(val << (-shift));
+    }
+}
+
+// Store the MRxNR i32 tile to memory with arbitrary row/col byte strides,
+// truncating to the destination item size (matches generic store_t semantics
+// for the tested widths 1 and 4).
+static void store_tile(int32_t ab[MR][NR], const spec_t *s) {
+    uint8_t *ptr = (uint8_t *)s->f0;
+    long rstride = (long)s->f1, cstride = (long)s->f2, isz = (long)s->f3;
+    for (long i = 0; i < MR; i++)
+        for (long j = 0; j < NR; j++) {
+            uint8_t *p = ptr + i * rstride + j * cstride;
+            int32_t v = ab[i][j];
+            switch (isz) {
+                case 1: *(uint8_t *)p = (uint8_t)v; break;
+                case 2: *(uint16_t *)p = (uint16_t)v; break;
+                case 4: *(int32_t *)p = v; break;
+                case 8: { int64_t w = v; memcpy(p, &w, 8); break; }
+                default: memcpy(p, &v, isz < 4 ? (size_t)isz : 4); break;
+            }
+        }
+}
+
+// Returns isize (64-bit) to match tract's kernel ABI.
+intptr_t sve_mmm_i32_kernel(const spec_t *ops) {
+    int32_t ab[MR][NR];
+    memset(ab, 0, sizeof(ab));
+    for (const spec_t *s = ops;; s++) {
+        switch (s->disc) {
+            case DONE:
+                return 0;
+            case CLEAR:
+                memset(ab, 0, sizeof(ab));
+                break;
+            case ADD_MAT_MUL: {
+                long k = (long)s->f0;
+                long packing = (long)s->f3;
+                if (packing == 1) {
+                    add_mat_mul_i8(ab, (const int8_t *)s->f1, (const int8_t *)s->f2, k);
+                } else if (packing == 0) {
+                    add_mat_mul_i32(ab, (const int32_t *)s->f1, (const int32_t *)s->f2, k);
+                } else {
+                    return 1;
+                }
+                break;
+            }
+            case STORE:
+                store_tile(ab, s);
+                break;
+            case LOAD_TILE: {
+                // LoadTile(col_major_ptr, row_major_ptr); use the row-major one.
+                const int32_t *src = (const int32_t *)s->f1;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] = src[i * NR + j];
+                break;
+            }
+            case ADD_UNICAST: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1, cstride = (long)s->f2, isz = (long)s->f3;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) {
+                        const uint8_t *p = ptr + i * rstride + j * cstride;
+                        if (isz == 1)
+                            ab[i][j] += *(const int8_t *)p; // sign-extend
+                        else
+                            ab[i][j] += *(const int32_t *)p;
+                    }
+                break;
+            }
+            case ADD_ROW_COL_PRODUCTS: {
+                const int32_t *rows = (const int32_t *)s->f0;
+                const int32_t *cols = (const int32_t *)s->f1;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] += rows[i] * cols[j];
+                break;
+            }
+            // ---- quantization fuse ops ----
+            case Q_SCALE: {
+                long shift = (long)s->f0;
+                int policy = (int)s->f1;
+                int32_t mult = (int32_t)s->f2;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] = q_scale_i32(ab[i][j], shift, policy, mult);
+                break;
+            }
+            case Q_SHR: {
+                long shift = (long)s->f0;
+                int policy = (int)s->f1;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] = q_shr_i32(ab[i][j], shift, policy);
+                break;
+            }
+            case Q_SHL: {
+                long shift = (long)s->f0;
+                for (long i = 0; i < MR; i++)
+                    for (long j = 0; j < NR; j++) ab[i][j] = ab[i][j] << shift;
+                break;
+            }
+            // ---- scalar fuse ops (value is an i32 in the low 32 bits of f0) ----
+            case SCALAR_MIN: { int32_t v=(int32_t)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<v?ab[i][j]:v; break; }
+            case SCALAR_MAX: { int32_t v=(int32_t)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>v?ab[i][j]:v; break; }
+            case SCALAR_ADD: { int32_t v=(int32_t)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]+=v; break; }
+            case SCALAR_MUL: { int32_t v=(int32_t)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]*=v; break; }
+            case SCALAR_SUB: { int32_t v=(int32_t)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=v-ab[i][j]; break; }
+            case SCALAR_SUBF:{ int32_t v=(int32_t)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-v; break; }
+            // ---- per-row fuse ops (one i32 per m-row) ----
+            case PER_ROW_MIN: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<m[i]?ab[i][j]:m[i]; break; }
+            case PER_ROW_MAX: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>m[i]?ab[i][j]:m[i]; break; }
+            case PER_ROW_ADD: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]+=m[i]; break; }
+            case PER_ROW_MUL: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]*=m[i]; break; }
+            case PER_ROW_SUB: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=m[i]-ab[i][j]; break; }
+            case PER_ROW_SUBF:{ const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-m[i]; break; }
+            // ---- per-col fuse ops (one i32 per n-col) ----
+            case PER_COL_MIN: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]<m[j]?ab[i][j]:m[j]; break; }
+            case PER_COL_MAX: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]>m[j]?ab[i][j]:m[j]; break; }
+            case PER_COL_ADD: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]+=m[j]; break; }
+            case PER_COL_MUL: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]*=m[j]; break; }
+            case PER_COL_SUB: { const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=m[j]-ab[i][j]; break; }
+            case PER_COL_SUBF:{ const int32_t*m=(const int32_t*)s->f0; for(long i=0;i<MR;i++)for(long j=0;j<NR;j++) ab[i][j]=ab[i][j]-m[j]; break; }
+            default:
+                // LeakyRelu excluded by CAN_FUSE_I32; anything else is an error.
+                return 1;
+        }
+    }
+}
diff --git a/linalg/arm64/sve/sve_mmm_i32_64x1.c b/linalg/arm64/sve/sve_mmm_i32_64x1.c
new file mode 100644
index 0000000000..aa4c89c502
--- /dev/null
+++ b/linalg/arm64/sve/sve_mmm_i32_64x1.c
@@ -0,0 +1,210 @@
+// SVE int8 -> int32 GEMV kernel for tract's MMM framework (the qmmv_i32 slot,
+// dispatched when N == 1: matrix x int8 column vector).
+//
+// Tile MR=64 x NR=1, i32 accumulator. The hot AddMatMul is the vector-length-
+// agnostic widening update vectorized over M: per K-step it loads MR signed
+// bytes of the A-panel column, sign-extends to i32 (svld1sb_s32), and folds a
+// single svmla_n_s32 with the (sign-extended) scalar B[k]. The MR rows are
+// walked in svcntw() chunks with whilelt predication, so the SAME code is
+// correct and full-width at any SVE vector length (128..2048-bit).
+//
+// Same rationale as the 8x8 kernel: widening MLA (not SDOT) consumes tract's
+// native K-major i8i8 packing directly. int8 inputs arrive via that packing
+// (AddMatMul packing == 1); the default i32i32 packing (packing == 0) is handled
+// scalar for the auto-test surface.
+//
+// ABI: identical 40-byte FusedKerSpec<i32> walk. At NR=1, per_col / scalar fuse
+// ops degenerate to a single broadcast value and per_row is element-wise over
+// the MR outputs. Quantization ops q_scale / q_shr / q_shl are ported bit-exact
+// from linalg/src/generic/rounding.rs. Returns 0 on success, 1 on an
+// unsupported fused op / packing.
+
+#include <arm_sve.h>
+#include <stdint.h>
+#include <string.h>
+
+#define MR 64
+#define NR 1
+
+enum {
+    DONE = 0, CLEAR, LOAD_TILE,
+    SCALAR_MIN, SCALAR_MAX, SCALAR_ADD, SCALAR_MUL, SCALAR_SUB, SCALAR_SUBF,
+    LEAKY_RELU,
+    PER_ROW_MIN, PER_ROW_MAX, PER_ROW_ADD, PER_ROW_MUL, PER_ROW_SUB, PER_ROW_SUBF,
+    PER_COL_MIN, PER_COL_MAX, PER_COL_ADD, PER_COL_MUL, PER_COL_SUB, PER_COL_SUBF,
+    Q_SCALE, Q_SHR, Q_SHL,
+    ADD_UNICAST, ADD_ROW_COL_PRODUCTS, STORE, ADD_MAT_MUL
+};
+
+enum { RP_NATIVE = 0, RP_ZERO, RP_AWAY, RP_MINUSINF, RP_PLUSINF, RP_EVEN, RP_ODD };
+
+typedef struct {
+    uint64_t disc;
+    uint64_t f0, f1, f2, f3;
+} spec_t;
+
+// AddMatMul, i8 x i8 -> i32 (packing 1): ab[m] += sum_k pa[k*MR+m]*pb[k].
+// VLA widening update over MR.
+static void add_mat_mul_i8(int32_t ab[MR], const int8_t *pa, const int8_t *pb, long k) {
+    for (long m0 = 0; m0 < MR; m0 += svcntw()) {
+        svbool_t pg = svwhilelt_b32((uint64_t)m0, (uint64_t)MR);
+        svint32_t acc = svld1_s32(pg, &ab[m0]);
+        for (long kk = 0; kk < k; kk++) {
+            svint32_t a = svld1sb_s32(pg, &pa[kk * MR + m0]); // load i8 col, sign-extend
+            acc = svmla_n_s32_x(pg, acc, a, (int32_t)pb[kk]);
+        }
+        svst1_s32(pg, &ab[m0], acc);
+    }
+}
+
+// AddMatMul, i32 x i32 -> i32 (packing 0, default): auto-test surface only.
+static void add_mat_mul_i32(int32_t ab[MR], const int32_t *pa, const int32_t *pb, long k) {
+    for (long kk = 0; kk < k; kk++) {
+        int32_t b = pb[kk];
+        const int32_t *acol = &pa[kk * MR];
+        for (long m = 0; m < MR; m++) ab[m] += acol[m] * b;
+    }
+}
+
+// ---- quantization helpers, ported bit-exact from generic/rounding.rs ----
+
+static int32_t q_shr_i32(int32_t v, long shift, int rp) {
+    int32_t half = (int32_t)1 << (shift - 1);
+    int32_t a = v < 0 ? -v : v;
+    int32_t nudge;
+    switch (rp) {
+        case RP_ZERO:     nudge = -1; break;
+        case RP_MINUSINF: nudge = -(int32_t)(v >= 0); break;
+        case RP_PLUSINF:  nudge = -(int32_t)(v <= 0); break;
+        case RP_AWAY:     nudge = 0; break;
+        case RP_EVEN:     nudge = ((a >> shift) & 0x1) - 1; break;
+        case RP_ODD:      nudge = -((a >> shift) & 0x1); break;
+        default:          nudge = 0; break;
+    }
+    int32_t sign = (v > 0) - (v < 0);
+    return sign * ((a + half + nudge) >> shift);
+}
+
+static int32_t q_scale_i32(int32_t v, long shift_in, int policy, int32_t mult) {
+    int64_t val = (int64_t)mult * (int64_t)v;
+    long shift = shift_in + 31;
+    if (shift > 0) {
+        int64_t half = (int64_t)1 << (shift - 1);
+        int64_t a = val < 0 ? -val : val;
+        int64_t nudge;
+        switch (policy) {
+            case RP_ZERO:     nudge = -1; break;
+            case RP_MINUSINF: nudge = -(int64_t)(val >= 0); break;
+            case RP_PLUSINF:  nudge = -(int64_t)(val <= 0); break;
+            case RP_AWAY:     nudge = 0; break;
+            case RP_EVEN:     nudge = ((a >> shift) & 0x1) - 1; break;
+            case RP_ODD:      nudge = -((a >> shift) & 0x1); break;
+            default:          nudge = 0; break;
+        }
+        int64_t sign = (val > 0) - (val < 0);
+        return (int32_t)(sign * ((a + half + nudge) >> shift));
+    } else {
+        return (int32_t)(val << (-shift));
+    }
+}
+
+intptr_t sve_mmm_i32_64x1_kernel(const spec_t *ops) {
+    int32_t ab[MR];
+    memset(ab, 0, sizeof(ab));
+    for (const spec_t *s = ops;; s++) {
+        switch (s->disc) {
+            case DONE:
+                return 0;
+            case CLEAR:
+                memset(ab, 0, sizeof(ab));
+                break;
+            case ADD_MAT_MUL: {
+                long k = (long)s->f0, packing = (long)s->f3;
+                if (packing == 1)
+                    add_mat_mul_i8(ab, (const int8_t *)s->f1, (const int8_t *)s->f2, k);
+                else if (packing == 0)
+                    add_mat_mul_i32(ab, (const int32_t *)s->f1, (const int32_t *)s->f2, k);
+                else
+                    return 1;
+                break;
+            }
+            case STORE: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1, isz = (long)s->f3;
+                for (long m = 0; m < MR; m++) {
+                    uint8_t *p = ptr + m * rstride;
+                    int32_t v = ab[m];
+                    switch (isz) {
+                        case 1: *(uint8_t *)p = (uint8_t)v; break;
+                        case 2: *(uint16_t *)p = (uint16_t)v; break;
+                        case 4: *(int32_t *)p = v; break;
+                        case 8: { int64_t w = v; memcpy(p, &w, 8); break; }
+                        default: memcpy(p, &v, isz < 4 ? (size_t)isz : 4); break;
+                    }
+                }
+                break;
+            }
+            case LOAD_TILE: {
+                const int32_t *src = (const int32_t *)s->f1; // row-major MR values
+                for (long m = 0; m < MR; m++) ab[m] = src[m];
+                break;
+            }
+            case ADD_UNICAST: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1, isz = (long)s->f3;
+                for (long m = 0; m < MR; m++) {
+                    const uint8_t *p = ptr + m * rstride;
+                    if (isz == 1)
+                        ab[m] += *(const int8_t *)p;
+                    else
+                        ab[m] += *(const int32_t *)p;
+                }
+                break;
+            }
+            case ADD_ROW_COL_PRODUCTS: {
+                const int32_t *rows = (const int32_t *)s->f0;
+                const int32_t *cols = (const int32_t *)s->f1;
+                for (long m = 0; m < MR; m++) ab[m] += rows[m] * cols[0];
+                break;
+            }
+            case Q_SCALE: {
+                long shift = (long)s->f0; int policy = (int)s->f1; int32_t mult = (int32_t)s->f2;
+                for (long m = 0; m < MR; m++) ab[m] = q_scale_i32(ab[m], shift, policy, mult);
+                break;
+            }
+            case Q_SHR: {
+                long shift = (long)s->f0; int policy = (int)s->f1;
+                for (long m = 0; m < MR; m++) ab[m] = q_shr_i32(ab[m], shift, policy);
+                break;
+            }
+            case Q_SHL: {
+                long shift = (long)s->f0;
+                for (long m = 0; m < MR; m++) ab[m] = ab[m] << shift;
+                break;
+            }
+            // scalar fuse ops (single i32 in low 32 bits of f0)
+            case SCALAR_MIN: { int32_t v=(int32_t)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]<v?ab[m]:v; break; }
+            case SCALAR_MAX: { int32_t v=(int32_t)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]>v?ab[m]:v; break; }
+            case SCALAR_ADD: { int32_t v=(int32_t)s->f0; for(long m=0;m<MR;m++) ab[m]+=v; break; }
+            case SCALAR_MUL: { int32_t v=(int32_t)s->f0; for(long m=0;m<MR;m++) ab[m]*=v; break; }
+            case SCALAR_SUB: { int32_t v=(int32_t)s->f0; for(long m=0;m<MR;m++) ab[m]=v-ab[m]; break; }
+            case SCALAR_SUBF:{ int32_t v=(int32_t)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]-v; break; }
+            // per-row fuse ops (one i32 per m-row)
+            case PER_ROW_MIN: { const int32_t*m_=(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]<m_[m]?ab[m]:m_[m]; break; }
+            case PER_ROW_MAX: { const int32_t*m_=(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]>m_[m]?ab[m]:m_[m]; break; }
+            case PER_ROW_ADD: { const int32_t*m_=(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]+=m_[m]; break; }
+            case PER_ROW_MUL: { const int32_t*m_=(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]*=m_[m]; break; }
+            case PER_ROW_SUB: { const int32_t*m_=(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]=m_[m]-ab[m]; break; }
+            case PER_ROW_SUBF:{ const int32_t*m_=(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]-m_[m]; break; }
+            // per-col fuse ops degenerate to a single broadcast value at NR=1
+            case PER_COL_MIN: { int32_t v=*(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]<v?ab[m]:v; break; }
+            case PER_COL_MAX: { int32_t v=*(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]>v?ab[m]:v; break; }
+            case PER_COL_ADD: { int32_t v=*(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]+=v; break; }
+            case PER_COL_MUL: { int32_t v=*(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]*=v; break; }
+            case PER_COL_SUB: { int32_t v=*(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]=v-ab[m]; break; }
+            case PER_COL_SUBF:{ int32_t v=*(const int32_t*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]-v; break; }
+            default:
+                return 1;
+        }
+    }
+}
diff --git a/linalg/arm64/sve/sve_mmv_f16_64x1.c b/linalg/arm64/sve/sve_mmv_f16_64x1.c
new file mode 100644
index 0000000000..93bf9f4cf1
--- /dev/null
+++ b/linalg/arm64/sve/sve_mmv_f16_64x1.c
@@ -0,0 +1,132 @@
+// SVE f16 GEMV kernel for tract's MMM framework (the mmv_f16 slot, dispatched
+// when N == 1: matrix x f16 column vector).
+//
+// Tile MR=64 x NR=1 with native f16 accumulation, the f16 sibling of
+// sve_mmv_f32_64x1.c. The hot AddMatMul is vectorized over M: per K-step it
+// loads MR f16 of the A-panel column and folds a single svmla_n_f16 (native f16
+// fused multiply-add) with the scalar B[k]. The MR rows are walked in svcnth()
+// chunks with whilelt predication, so one binary is correct and full-width at
+// any SVE VL 128..2048-bit.
+//
+// Gated on FEAT_SVE2 AND FEAT_FP16 (Rust side). Built with +fp16. At NR=1 the
+// per_col/scalar fuse ops degenerate to a broadcast and per_row is element-wise
+// over the MR outputs. LeakyRelu and the i32 quantization ops are excluded by
+// CAN_FUSE. Returns 0 on success, 1 otherwise.
+
+#include <arm_sve.h>
+#include <stdint.h>
+#include <string.h>
+
+#define MR 64
+#define NR 1
+
+enum {
+    DONE = 0, CLEAR, LOAD_TILE,
+    SCALAR_MIN, SCALAR_MAX, SCALAR_ADD, SCALAR_MUL, SCALAR_SUB, SCALAR_SUBF,
+    LEAKY_RELU,
+    PER_ROW_MIN, PER_ROW_MAX, PER_ROW_ADD, PER_ROW_MUL, PER_ROW_SUB, PER_ROW_SUBF,
+    PER_COL_MIN, PER_COL_MAX, PER_COL_ADD, PER_COL_MUL, PER_COL_SUB, PER_COL_SUBF,
+    Q_SCALE, Q_SHR, Q_SHL,
+    ADD_UNICAST, ADD_ROW_COL_PRODUCTS, STORE, ADD_MAT_MUL
+};
+
+typedef struct {
+    uint64_t disc;
+    uint64_t f0, f1, f2, f3;
+} spec_t;
+
+static inline __fp16 f16_of(uint64_t bits) {
+    __fp16 f;
+    uint16_t lo = (uint16_t)bits;
+    memcpy(&f, &lo, 2);
+    return f;
+}
+
+// AddMatMul: ab[m] += sum_k pa[k*MR+m] * pb[k]. VLA over MR (f16 lanes).
+static void add_mat_mul(__fp16 ab[MR], const __fp16 *pa, const __fp16 *pb, long k) {
+    for (long m0 = 0; m0 < MR; m0 += svcnth()) {
+        svbool_t pg = svwhilelt_b16((uint64_t)m0, (uint64_t)MR);
+        svfloat16_t acc = svld1_f16(pg, &ab[m0]);
+        for (long kk = 0; kk < k; kk++) {
+            svfloat16_t a = svld1_f16(pg, &pa[kk * MR + m0]);
+            acc = svmla_n_f16_x(pg, acc, a, pb[kk]);
+        }
+        svst1_f16(pg, &ab[m0], acc);
+    }
+}
+
+intptr_t sve_mmv_f16_64x1_kernel(const spec_t *ops) {
+    __fp16 ab[MR];
+    memset(ab, 0, sizeof(ab));
+    for (const spec_t *s = ops;; s++) {
+        switch (s->disc) {
+            case DONE:
+                return 0;
+            case CLEAR:
+                memset(ab, 0, sizeof(ab));
+                break;
+            case ADD_MAT_MUL: {
+                long k = (long)s->f0;
+                add_mat_mul(ab, (const __fp16 *)s->f1, (const __fp16 *)s->f2, k);
+                break;
+            }
+            case STORE: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1, isz = (long)s->f3;
+                for (long m = 0; m < MR; m++) {
+                    uint8_t *p = ptr + m * rstride;
+                    if (isz == 2)
+                        *(__fp16 *)p = ab[m];
+                    else if (isz == 4)
+                        *(float *)p = (float)ab[m];
+                    else
+                        memcpy(p, &ab[m], isz);
+                }
+                break;
+            }
+            case LOAD_TILE: {
+                const __fp16 *src = (const __fp16 *)s->f1;
+                for (long m = 0; m < MR; m++) ab[m] = src[m];
+                break;
+            }
+            case ADD_UNICAST: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1, isz = (long)s->f3;
+                for (long m = 0; m < MR; m++) {
+                    const uint8_t *p = ptr + m * rstride;
+                    if (isz == 2)
+                        ab[m] += *(const __fp16 *)p;
+                    else
+                        ab[m] += (__fp16) * (const float *)p;
+                }
+                break;
+            }
+            case ADD_ROW_COL_PRODUCTS: {
+                const __fp16 *rows = (const __fp16 *)s->f0;
+                const __fp16 *cols = (const __fp16 *)s->f1;
+                for (long m = 0; m < MR; m++) ab[m] += rows[m] * cols[0];
+                break;
+            }
+            case SCALAR_MIN: { __fp16 v=f16_of(s->f0); for(long m=0;m<MR;m++) ab[m]=ab[m]<v?ab[m]:v; break; }
+            case SCALAR_MAX: { __fp16 v=f16_of(s->f0); for(long m=0;m<MR;m++) ab[m]=ab[m]>v?ab[m]:v; break; }
+            case SCALAR_ADD: { __fp16 v=f16_of(s->f0); for(long m=0;m<MR;m++) ab[m]+=v; break; }
+            case SCALAR_MUL: { __fp16 v=f16_of(s->f0); for(long m=0;m<MR;m++) ab[m]*=v; break; }
+            case SCALAR_SUB: { __fp16 v=f16_of(s->f0); for(long m=0;m<MR;m++) ab[m]=v-ab[m]; break; }
+            case SCALAR_SUBF:{ __fp16 v=f16_of(s->f0); for(long m=0;m<MR;m++) ab[m]=ab[m]-v; break; }
+            case PER_ROW_MIN: { const __fp16*m_=(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]<m_[m]?ab[m]:m_[m]; break; }
+            case PER_ROW_MAX: { const __fp16*m_=(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]>m_[m]?ab[m]:m_[m]; break; }
+            case PER_ROW_ADD: { const __fp16*m_=(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]+=m_[m]; break; }
+            case PER_ROW_MUL: { const __fp16*m_=(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]*=m_[m]; break; }
+            case PER_ROW_SUB: { const __fp16*m_=(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]=m_[m]-ab[m]; break; }
+            case PER_ROW_SUBF:{ const __fp16*m_=(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]-m_[m]; break; }
+            case PER_COL_MIN: { __fp16 v=*(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]<v?ab[m]:v; break; }
+            case PER_COL_MAX: { __fp16 v=*(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]>v?ab[m]:v; break; }
+            case PER_COL_ADD: { __fp16 v=*(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]+=v; break; }
+            case PER_COL_MUL: { __fp16 v=*(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]*=v; break; }
+            case PER_COL_SUB: { __fp16 v=*(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]=v-ab[m]; break; }
+            case PER_COL_SUBF:{ __fp16 v=*(const __fp16*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]-v; break; }
+            default:
+                return 1;
+        }
+    }
+}
diff --git a/linalg/arm64/sve/sve_mmv_f32_64x1.c b/linalg/arm64/sve/sve_mmv_f32_64x1.c
new file mode 100644
index 0000000000..59d1f8932b
--- /dev/null
+++ b/linalg/arm64/sve/sve_mmv_f32_64x1.c
@@ -0,0 +1,128 @@
+// SVE f32 GEMV kernel for tract's MMM framework (the mmv_f32 slot, dispatched
+// when N == 1: matrix x f32 column vector).
+//
+// Tile MR=64 x NR=1. The hot AddMatMul is the vector-length-agnostic update
+// vectorized over M: per K-step it loads MR f32 of the A-panel column and folds
+// a single svmla_n_f32 with the scalar B[k]. The MR rows are walked in svcntw()
+// chunks with whilelt predication, so the SAME code is correct and full-width at
+// any SVE vector length (128..2048-bit).
+//
+// Sibling of sve_mmm_f32.c (the 8x8 GEMM); shares the same FusedKerSpec<f32> ABI
+// and fuse-op surface. At NR=1, per_col / scalar fuse ops degenerate to a single
+// broadcast value and per_row is element-wise over the MR outputs. As with the
+// f32 GEMM kernel, LeakyRelu and the i32 quantization ops are excluded by
+// CAN_FUSE. Returns 0 on success, 1 on an unsupported fused op.
+
+#include <arm_sve.h>
+#include <stdint.h>
+#include <string.h>
+
+#define MR 64
+#define NR 1
+
+enum {
+    DONE = 0, CLEAR, LOAD_TILE,
+    SCALAR_MIN, SCALAR_MAX, SCALAR_ADD, SCALAR_MUL, SCALAR_SUB, SCALAR_SUBF,
+    LEAKY_RELU,
+    PER_ROW_MIN, PER_ROW_MAX, PER_ROW_ADD, PER_ROW_MUL, PER_ROW_SUB, PER_ROW_SUBF,
+    PER_COL_MIN, PER_COL_MAX, PER_COL_ADD, PER_COL_MUL, PER_COL_SUB, PER_COL_SUBF,
+    Q_SCALE, Q_SHR, Q_SHL,
+    ADD_UNICAST, ADD_ROW_COL_PRODUCTS, STORE, ADD_MAT_MUL
+};
+
+typedef struct {
+    uint64_t disc;
+    uint64_t f0, f1, f2, f3;
+} spec_t;
+
+static inline float f32_of(uint64_t bits) {
+    float f;
+    uint32_t lo = (uint32_t)bits;
+    memcpy(&f, &lo, 4);
+    return f;
+}
+
+// AddMatMul: ab[m] += sum_k pa[k*MR+m] * pb[k]. VLA over MR.
+static void add_mat_mul(float ab[MR], const float *pa, const float *pb, long k) {
+    for (long m0 = 0; m0 < MR; m0 += svcntw()) {
+        svbool_t pg = svwhilelt_b32((uint64_t)m0, (uint64_t)MR);
+        svfloat32_t acc = svld1_f32(pg, &ab[m0]);
+        for (long kk = 0; kk < k; kk++) {
+            svfloat32_t a = svld1_f32(pg, &pa[kk * MR + m0]);
+            acc = svmla_n_f32_x(pg, acc, a, pb[kk]);
+        }
+        svst1_f32(pg, &ab[m0], acc);
+    }
+}
+
+intptr_t sve_mmv_f32_64x1_kernel(const spec_t *ops) {
+    float ab[MR];
+    memset(ab, 0, sizeof(ab));
+    for (const spec_t *s = ops;; s++) {
+        switch (s->disc) {
+            case DONE:
+                return 0;
+            case CLEAR:
+                memset(ab, 0, sizeof(ab));
+                break;
+            case ADD_MAT_MUL: {
+                long k = (long)s->f0;
+                add_mat_mul(ab, (const float *)s->f1, (const float *)s->f2, k);
+                break;
+            }
+            case STORE: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1, isz = (long)s->f3;
+                for (long m = 0; m < MR; m++) {
+                    uint8_t *p = ptr + m * rstride;
+                    if (isz == 4)
+                        *(float *)p = ab[m];
+                    else
+                        memcpy(p, &ab[m], isz);
+                }
+                break;
+            }
+            case LOAD_TILE: {
+                const float *src = (const float *)s->f1; // row-major MR values
+                for (long m = 0; m < MR; m++) ab[m] = src[m];
+                break;
+            }
+            case ADD_UNICAST: {
+                uint8_t *ptr = (uint8_t *)s->f0;
+                long rstride = (long)s->f1;
+                for (long m = 0; m < MR; m++) ab[m] += *(const float *)(ptr + m * rstride);
+                break;
+            }
+            case ADD_ROW_COL_PRODUCTS: {
+                const float *rows = (const float *)s->f0;
+                const float *cols = (const float *)s->f1;
+                for (long m = 0; m < MR; m++) ab[m] += rows[m] * cols[0];
+                break;
+            }
+            // scalar fuse ops (f32 bits in low 32 bits of f0)
+            case SCALAR_MIN: { float v=f32_of(s->f0); for(long m=0;m<MR;m++) ab[m]=ab[m]<v?ab[m]:v; break; }
+            case SCALAR_MAX: { float v=f32_of(s->f0); for(long m=0;m<MR;m++) ab[m]=ab[m]>v?ab[m]:v; break; }
+            case SCALAR_ADD: { float v=f32_of(s->f0); for(long m=0;m<MR;m++) ab[m]+=v; break; }
+            case SCALAR_MUL: { float v=f32_of(s->f0); for(long m=0;m<MR;m++) ab[m]*=v; break; }
+            case SCALAR_SUB: { float v=f32_of(s->f0); for(long m=0;m<MR;m++) ab[m]=v-ab[m]; break; }
+            case SCALAR_SUBF:{ float v=f32_of(s->f0); for(long m=0;m<MR;m++) ab[m]=ab[m]-v; break; }
+            // per-row fuse ops (one f32 per m-row)
+            case PER_ROW_MIN: { const float*m_=(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]<m_[m]?ab[m]:m_[m]; break; }
+            case PER_ROW_MAX: { const float*m_=(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]>m_[m]?ab[m]:m_[m]; break; }
+            case PER_ROW_ADD: { const float*m_=(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]+=m_[m]; break; }
+            case PER_ROW_MUL: { const float*m_=(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]*=m_[m]; break; }
+            case PER_ROW_SUB: { const float*m_=(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]=m_[m]-ab[m]; break; }
+            case PER_ROW_SUBF:{ const float*m_=(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]-m_[m]; break; }
+            // per-col fuse ops degenerate to a single broadcast value at NR=1
+            case PER_COL_MIN: { float v=*(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]<v?ab[m]:v; break; }
+            case PER_COL_MAX: { float v=*(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]>v?ab[m]:v; break; }
+            case PER_COL_ADD: { float v=*(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]+=v; break; }
+            case PER_COL_MUL: { float v=*(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]*=v; break; }
+            case PER_COL_SUB: { float v=*(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]=v-ab[m]; break; }
+            case PER_COL_SUBF:{ float v=*(const float*)s->f0; for(long m=0;m<MR;m++) ab[m]=ab[m]-v; break; }
+            default:
+                // LeakyRelu / QScale / RoundingShiftRight / ShiftLeft excluded by CAN_FUSE.
+                return 1;
+        }
+    }
+}
diff --git a/linalg/benches/amx_f32.rs b/linalg/benches/amx_f32.rs
new file mode 100644
index 0000000000..bf59f066cf
--- /dev/null
+++ b/linalg/benches/amx_f32.rs
@@ -0,0 +1,105 @@
+#![allow(dead_code)]
+// Kernel-level benchmark: Intel AMX bf16 GEMM for f32 matmul
+// (avx512amx_mmm_f32_16x16, TDPBF16PS over 16x16 f32 tile with K=32 bf16 inner)
+// vs the AVX-512 f32 16x12 path (avx512_mmm_f32_16x12, FMA) vs the AVX2/FMA
+// f32 16x6 path (fma_mmm_f32_16x6).
+//
+// The AMX path runs the f32f32_bf16 packing (index 1) which truncates f32 to
+// bf16 at pack time (round-to-nearest-even, matching VCVTNEPS2BF16) so the f32
+// accumulators carry the bf16 precision profile -- same trade-off as oneDNN
+// "fast-math" f32 matmul on AMX. The two reference kernels run their default
+// f32 packing (index 0).
+//
+// Skipped at runtime when has_amx_bf16() returns false (= CPUID lacks
+// amx-bf16/tile or the arch_prctl XSAVE permission was denied), and at build
+// time when the tract_amx_bf16 cfg was not emitted.
+use criterion::*;
+use tract_data::internal::*;
+use tract_linalg::mmm::{AsInputValue, FusedSpec, MatMatMul};
+
+fn run_kernel(
+    be: &mut Bencher,
+    mmm: &dyn MatMatMul,
+    packing: usize,
+    m: usize,
+    k: usize,
+    n: usize,
+) {
+    let a = Tensor::zero_dt(DatumType::F32, &[m, k]).unwrap();
+    let b = Tensor::zero_dt(DatumType::F32, &[k, n]).unwrap();
+    let (pack_a, pack_b) = &mmm.packings()[packing];
+    let pa = pack_a.prepare_one(&a, 1, 0).unwrap();
+    let pb = pack_b.prepare_one(&b, 0, 1).unwrap();
+    let mut scratch = unsafe { mmm.allocate_scratch_space() };
+    be.iter_custom(|iters| {
+        let mut dur = std::time::Duration::default();
+        for _ in 0..iters {
+            let t = std::time::Instant::now();
+            unsafe {
+                mmm.run_with_scratch_space(
+                    m,
+                    n,
+                    scratch.as_mut(),
+                    &[FusedSpec::AddMatMul {
+                        a: AsInputValue::Borrowed(&*pa),
+                        b: AsInputValue::Borrowed(&*pb),
+                        packing,
+                    }],
+                )
+                .unwrap()
+            };
+            dur += t.elapsed();
+        }
+        dur
+    });
+}
+
+fn benches(c: &mut Criterion) {
+    #[cfg(tract_amx_bf16)]
+    {
+        use tract_linalg::x86_64_fma::amx_bf16::has_amx_bf16;
+        use tract_linalg::x86_64_fma::mmm::*;
+        if !has_amx_bf16() {
+            eprintln!("AMX bf16 not available (CPUID + arch_prctl gate failed), skipping");
+            return;
+        }
+        // Same shapes as amx_i32 so reviewers can directly compare bf16->f32 vs
+        // i8->i32 throughput at matching M/K/N. K=32 (single tdpbf16ps step)
+        // and K=64 (one i8 tile) are tested via 256 / 256x256 / 512x512x512.
+        for &(m, k, n) in
+            &[(64usize, 256usize, 64usize), (256, 256, 256), (512, 512, 512), (1024, 1024, 64)]
+        {
+            let id = format!("{m}x{k}x{n}");
+            let mut g = c.benchmark_group("amx_f32/packed_packed");
+            g.throughput(Throughput::Elements((m * k * n) as u64));
+            // Reference: FMA f32 16x6 (the kernel mmm_f32 picks for these N).
+            g.bench_with_input(BenchmarkId::new("fma_16x6", &id), &(m, k, n), |b, &(m, k, n)| {
+                run_kernel(b, &*fma_mmm_f32_16x6.mmm(), 0, m, k, n)
+            });
+            if std::is_x86_feature_detected!("avx512f") {
+                // Reference: AVX-512 f32 16x12.
+                g.bench_with_input(
+                    BenchmarkId::new("avx512_16x12", &id),
+                    &(m, k, n),
+                    |b, &(m, k, n)| run_kernel(b, &*avx512_mmm_f32_16x12.mmm(), 0, m, k, n),
+                );
+            }
+            // AMX bf16 path (packing index 1 = f32f32_bf16: pack-time RNE
+            // conversion of f32 -> bf16, then TDPBF16PS in the inner loop).
+            g.bench_with_input(
+                BenchmarkId::new("avx512amx_bf16_16x16", &id),
+                &(m, k, n),
+                |b, &(m, k, n)| run_kernel(b, &*avx512amx_mmm_f32_16x16.mmm(), 1, m, k, n),
+            );
+            g.finish();
+        }
+    }
+    #[cfg(not(tract_amx_bf16))]
+    {
+        eprintln!("tract not built with AMX bf16 support (probe failed at build time)");
+        let _ = c;
+    }
+}
+
+criterion_group!(g, benches);
+criterion_main!(g);
diff --git a/linalg/benches/amx_i32.rs b/linalg/benches/amx_i32.rs
new file mode 100644
index 0000000000..79388ec05f
--- /dev/null
+++ b/linalg/benches/amx_i32.rs
@@ -0,0 +1,84 @@
+#![allow(dead_code)]
+// Kernel-level benchmark: Intel AMX int8 GEMM (avx512amx_mmm_i32_8x8, TDPBSSD over
+// 8x8 i32 tile with K=64 inner) vs the AVX-512 VNNI int8 path (avx512vnni_mmm_i32_8x8,
+// VPDPBUSD over PackedI8K4 with K=4 inner) vs the AVX2 int8 path
+// (avx2_mmm_i32_8x8, vpmaddubsw-style widening). All three run the same i8i8
+// packing index (1) over the same M/K/N so the only difference is the matmul
+// inner loop.
+use criterion::*;
+use tract_data::internal::*;
+use tract_linalg::mmm::{AsInputValue, FusedSpec, MatMatMul};
+
+fn run_kernel(be: &mut Bencher, mmm: &dyn MatMatMul, m: usize, k: usize, n: usize) {
+    let a = Tensor::zero_dt(DatumType::I8, &[m, k]).unwrap();
+    let b = Tensor::zero_dt(DatumType::I8, &[k, n]).unwrap();
+    let (pack_a, pack_b) = &mmm.packings()[1];
+    let pa = pack_a.prepare_one(&a, 1, 0).unwrap();
+    let pb = pack_b.prepare_one(&b, 0, 1).unwrap();
+    let mut scratch = unsafe { mmm.allocate_scratch_space() };
+    be.iter_custom(|iters| {
+        let mut dur = std::time::Duration::default();
+        for _ in 0..iters {
+            let t = std::time::Instant::now();
+            unsafe {
+                mmm.run_with_scratch_space(
+                    m,
+                    n,
+                    scratch.as_mut(),
+                    &[FusedSpec::AddMatMul {
+                        a: AsInputValue::Borrowed(&*pa),
+                        b: AsInputValue::Borrowed(&*pb),
+                        packing: 1,
+                    }],
+                )
+                .unwrap()
+            };
+            dur += t.elapsed();
+        }
+        dur
+    });
+}
+
+fn benches(c: &mut Criterion) {
+    #[cfg(tract_amx_int8)]
+    {
+        use tract_linalg::x86_64_fma::amx::has_amx_int8;
+        use tract_linalg::x86_64_fma::mmm::*;
+        if !has_amx_int8() {
+            eprintln!("AMX int8 not available (CPUID + arch_prctl gate failed), skipping");
+            return;
+        }
+        for &(m, k, n) in
+            &[(64usize, 256usize, 64usize), (256, 256, 256), (512, 512, 512), (1024, 1024, 64)]
+        {
+            let id = format!("{m}x{k}x{n}");
+            let mut g = c.benchmark_group("amx_i32/packed_packed");
+            g.throughput(Throughput::Elements((m * k * n) as u64));
+            g.bench_with_input(BenchmarkId::new("avx2", &id), &(m, k, n), |b, &(m, k, n)| {
+                run_kernel(b, &*avx2_mmm_i32_8x8.mmm(), m, k, n)
+            });
+            if std::is_x86_feature_detected!("avx512vnni") {
+                g.bench_with_input(
+                    BenchmarkId::new("avx512vnni", &id),
+                    &(m, k, n),
+                    |b, &(m, k, n)| run_kernel(b, &*avx512vnni_mmm_i32_8x8.mmm(), m, k, n),
+                );
+            }
+            g.bench_with_input(BenchmarkId::new("avx512amx_8x8", &id), &(m, k, n), |b, &(m, k, n)| {
+                run_kernel(b, &*avx512amx_mmm_i32_8x8.mmm(), m, k, n)
+            });
+            g.bench_with_input(BenchmarkId::new("avx512amx_16x16", &id), &(m, k, n), |b, &(m, k, n)| {
+                run_kernel(b, &*avx512amx_mmm_i32_16x16.mmm(), m, k, n)
+            });
+            g.finish();
+        }
+    }
+    #[cfg(not(tract_amx_int8))]
+    {
+        eprintln!("tract not built with AMX int8 support (probe failed at build time)");
+        let _ = c;
+    }
+}
+
+criterion_group!(g, benches);
+criterion_main!(g);
diff --git a/linalg/benches/avx512_zombies.rs b/linalg/benches/avx512_zombies.rs
new file mode 100644
index 0000000000..b600505f99
--- /dev/null
+++ b/linalg/benches/avx512_zombies.rs
@@ -0,0 +1,68 @@
+#![allow(dead_code)]
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use tract_linalg::mmm::MatMatMul;
+
+#[path = "utils.rs"]
+mod utils;
+use utils::mat_mat_with_mm;
+
+fn run(c: &mut Criterion, name: &str, mmm: &dyn MatMatMul, m: usize, k: usize, n: usize) {
+    let mut group = c.benchmark_group(format!("avx512_zombie/{name}"));
+    let id = format!("{m}x{k}x{n}");
+    group.bench_with_input(
+        criterion::BenchmarkId::new("hot", &id),
+        &(tract_data::prelude::DatumType::F32, m, k, n, false),
+        |b, p| mat_mat_with_mm(b, mmm, p),
+    );
+    group.bench_with_input(
+        criterion::BenchmarkId::new("cold", &id),
+        &(tract_data::prelude::DatumType::F32, m, k, n, true),
+        |b, p| mat_mat_with_mm(b, mmm, p),
+    );
+}
+
+fn benches(c: &mut Criterion) {
+    if !std::is_x86_feature_detected!("avx512f") {
+        eprintln!("avx512f not available, skipping");
+        return;
+    }
+
+    use tract_data::prelude::DatumType::F32;
+    use tract_linalg::x86_64_fma::mmm::*;
+
+    // Representative large-K, square-ish M case.
+    let (m, k) = (64usize, 256usize);
+
+    // N = 5 : zombie was 32x5 vs old 64x3.
+    run(c, "N5_64x3_explicit", &*avx512_mmm_f32_64x3.mmm(), m, k, 5);
+    run(c, "N5_32x5_explicit", &*avx512_mmm_f32_32x5.mmm(), m, k, 5);
+
+    // N = 6 : zombie was 32x6 vs old 64x3.
+    run(c, "N6_64x3_explicit", &*avx512_mmm_f32_64x3.mmm(), m, k, 6);
+    run(c, "N6_32x6_explicit", &*avx512_mmm_f32_32x6.mmm(), m, k, 6);
+
+    // N = 8 : zombie was 16x8 vs old 48x4.
+    run(c, "N8_48x4_explicit", &*avx512_mmm_f32_48x4.mmm(), m, k, 8);
+    run(c, "N8_16x8_explicit", &*avx512_mmm_f32_16x8.mmm(), m, k, 8);
+
+    // What does the live dispatcher pick for these shapes? If the picker
+    // is healthy these match the zombie numbers above, kernel name printed
+    // to stderr at startup.
+    for n in [5usize, 6, 8] {
+        let mmm = tract_linalg::ops().mmm(F32, Some(m), Some(k), Some(n)).unwrap();
+        eprintln!("dispatcher@m={m},k={k},n={n} picked {}", mmm.name());
+        run(c, &format!("N{n}_dispatch"), &*mmm, m, k, n);
+    }
+
+    // Trace-only: a few shapes where M-padding overhead with the old
+    // picker was high. We expect the M-aware picker to pick smaller-mr
+    // kernels here.
+    for (m, n) in [(20usize, 2), (33, 3), (50, 4), (17, 5), (1000, 64)] {
+        let mmm = tract_linalg::ops().mmm(F32, Some(m), Some(k), Some(n)).unwrap();
+        eprintln!("dispatcher@m={m},k={k},n={n} picked {}", mmm.name());
+    }
+}
+
+criterion_group!(g, benches);
+criterion_main!(g);
diff --git a/linalg/benches/avxvnni_i32.rs b/linalg/benches/avxvnni_i32.rs
new file mode 100644
index 0000000000..19deb1f013
--- /dev/null
+++ b/linalg/benches/avxvnni_i32.rs
@@ -0,0 +1,97 @@
+#![allow(dead_code)]
+// Kernel-level benchmark: AVX-VNNI ymm int8 GEMM (avxvnni_mmm_i32_8x8,
+// VEX-encoded VPDPBUSD over PackedI8K4 with K=4 inner) vs the AVX2 emulation
+// path (avx2_mmm_i32_8x8, vpmaddubsw-style widening). Both kernels run the
+// same i8i8 packing index (1) over the same M/K/N so the only difference is
+// the matmul inner loop.
+//
+// Designed for Atom-class hosts that have AVX-VNNI but no AVX-512:
+//
+//   * Alder Lake / Raptor Lake / Meteor Lake E-cores (Gracemont, Crestmont)
+//   * Sierra Forest (Sierra Glen)
+//   * Clearwater Forest (Darkmont)
+//
+// Big cores with both AVX-512-VNNI and AVX-VNNI still run AVX-VNNI here for
+// comparison purposes; in production dispatch the EVEX-encoded
+// avx512vnni_mmm_i32_8x8 wins on those CPUs because it can later be widened
+// to zmm without an ISA-level rewrite.
+use criterion::*;
+use tract_data::internal::*;
+use tract_linalg::mmm::{AsInputValue, FusedSpec, MatMatMul};
+
+fn run_kernel(be: &mut Bencher, mmm: &dyn MatMatMul, m: usize, k: usize, n: usize) {
+    let a = Tensor::zero_dt(DatumType::I8, &[m, k]).unwrap();
+    let b = Tensor::zero_dt(DatumType::I8, &[k, n]).unwrap();
+    let (pack_a, pack_b) = &mmm.packings()[1];
+    let pa = pack_a.prepare_one(&a, 1, 0).unwrap();
+    let pb = pack_b.prepare_one(&b, 0, 1).unwrap();
+    let mut scratch = unsafe { mmm.allocate_scratch_space() };
+    be.iter_custom(|iters| {
+        let mut dur = std::time::Duration::default();
+        for _ in 0..iters {
+            let t = std::time::Instant::now();
+            unsafe {
+                mmm.run_with_scratch_space(
+                    m,
+                    n,
+                    scratch.as_mut(),
+                    &[FusedSpec::AddMatMul {
+                        a: AsInputValue::Borrowed(&*pa),
+                        b: AsInputValue::Borrowed(&*pb),
+                        packing: 1,
+                    }],
+                )
+                .unwrap()
+            };
+            dur += t.elapsed();
+        }
+        dur
+    });
+}
+
+fn benches(c: &mut Criterion) {
+    #[cfg(tract_avxvnni)]
+    {
+        use tract_linalg::x86_64_fma::avxvnni::has_avxvnni;
+        use tract_linalg::x86_64_fma::mmm::*;
+        if !has_avxvnni() {
+            eprintln!("AVX-VNNI not available (CPUID leaf 7.1 EAX.4 unset), skipping");
+            return;
+        }
+        // Same shapes as amx_i32 / vnni_i32 for direct side-by-side comparison.
+        for &(m, k, n) in
+            &[(64usize, 256usize, 64usize), (256, 256, 256), (512, 512, 512), (1024, 1024, 64)]
+        {
+            let id = format!("{m}x{k}x{n}");
+            let mut g = c.benchmark_group("avxvnni_i32/packed_packed");
+            g.throughput(Throughput::Elements((m * k * n) as u64));
+            g.bench_with_input(BenchmarkId::new("avx2", &id), &(m, k, n), |b, &(m, k, n)| {
+                run_kernel(b, &*avx2_mmm_i32_8x8.mmm(), m, k, n)
+            });
+            g.bench_with_input(BenchmarkId::new("avxvnni", &id), &(m, k, n), |b, &(m, k, n)| {
+                run_kernel(b, &*avxvnni_mmm_i32_8x8.mmm(), m, k, n)
+            });
+            // When the same host also reports AVX-512-VNNI, include it as a
+            // reference point: the same kernel body runs as EVEX/zmm-encoded
+            // VPDPBUSD, which should match the AVX-VNNI throughput on Sapphire
+            // Rapids+ but can diverge on Cooper/Cascade Lake where the EVEX
+            // decoder is on the AVX-512 fused unit.
+            if std::is_x86_feature_detected!("avx512vnni") {
+                g.bench_with_input(
+                    BenchmarkId::new("avx512vnni", &id),
+                    &(m, k, n),
+                    |b, &(m, k, n)| run_kernel(b, &*avx512vnni_mmm_i32_8x8.mmm(), m, k, n),
+                );
+            }
+            g.finish();
+        }
+    }
+    #[cfg(not(tract_avxvnni))]
+    {
+        eprintln!("tract not built with AVX-VNNI support (probe failed at build time)");
+        let _ = c;
+    }
+}
+
+criterion_group!(g, benches);
+criterion_main!(g);
diff --git a/linalg/benches/gelu.rs b/linalg/benches/gelu.rs
new file mode 100644
index 0000000000..31b1df8e05
--- /dev/null
+++ b/linalg/benches/gelu.rs
@@ -0,0 +1,44 @@
+use criterion::*;
+use tract_data::prelude::*;
+
+use tract_linalg::element_wise::ElementWiseKer;
+
+fn gelu_f32(c: &mut Criterion) {
+    let mut group = c.benchmark_group("gelu_f32");
+    group.throughput(Throughput::Elements(1024));
+    let mut input = unsafe { Tensor::uninitialized_aligned::<f32>(&[1024], 16).unwrap() };
+    let input = unsafe { input.as_slice_mut_unchecked::<f32>() };
+    for (i, x) in input.iter_mut().enumerate() {
+        *x = (i as f32 / 10.0).sin() * 5.0;
+    }
+    group.bench_function("rust_scalar", |b| b.iter(|| rust_scalar(input)));
+    group.bench_function("linalg", |b| b.iter(|| linalg(input)));
+    #[cfg(target_arch = "aarch64")]
+    group.bench_function("linalg-asm-compose", |b| {
+        b.iter(|| tract_linalg::arm64::arm64simd_gelu_f32_4n::run(input, ()))
+    });
+    #[cfg(target_arch = "aarch64")]
+    group.bench_function("linalg-asm-fused", |b| {
+        b.iter(|| tract_linalg::arm64::arm64simd_gelu_f32_4n_fused::run(input, ()))
+    });
+}
+
+#[inline(never)]
+fn rust_scalar(input: &mut [f32]) {
+    // Match tract's GeluApproximate scalar formula (pow=3).
+    const SQRT_2_OVER_PI: f32 = 0.7978845608028654;
+    const COEF: f32 = 0.044715;
+    for x in input {
+        let v = *x;
+        let inner = SQRT_2_OVER_PI * (v + COEF * v * v * v);
+        *x = 0.5 * v * (1.0 + inner.tanh());
+    }
+}
+
+#[inline(never)]
+fn linalg(input: &mut [f32]) {
+    (tract_linalg::ops().gelu_f32)().run(input).unwrap();
+}
+
+criterion_group!(benches, gelu_f32);
+criterion_main!(benches);
diff --git a/linalg/benches/hardswish.rs b/linalg/benches/hardswish.rs
new file mode 100644
index 0000000000..0f96dc4ec6
--- /dev/null
+++ b/linalg/benches/hardswish.rs
@@ -0,0 +1,34 @@
+use criterion::*;
+use tract_data::prelude::*;
+
+use tract_linalg::element_wise::ElementWiseKer;
+
+fn hardswish_f32(c: &mut Criterion) {
+    let mut group = c.benchmark_group("hardswish_f32");
+    group.throughput(Throughput::Elements(1024));
+    let mut input = unsafe { Tensor::uninitialized_aligned::<f32>(&[1024], 16).unwrap() };
+    let input = unsafe { input.as_slice_mut_unchecked::<f32>() };
+    group.bench_function("rust", |b| b.iter(|| rust_f32(input)));
+    group.bench_function("linalg", |b| b.iter(|| linalg32(input)));
+    #[cfg(target_arch = "aarch64")]
+    group.bench_function("linalg-asm", |b| {
+        b.iter(|| tract_linalg::arm64::arm64simd_hardswish_f32_8n::run(input, ()))
+    });
+}
+
+#[inline(never)]
+fn rust_f32(input: &mut [f32]) {
+    const INV6: f32 = 1.0 / 6.0;
+    for x in input {
+        let relu6 = ((*x + 3.0).min(6.0)).max(0.0);
+        *x = *x * relu6 * INV6;
+    }
+}
+
+#[inline(never)]
+fn linalg32(input: &mut [f32]) {
+    (tract_linalg::ops().hardswish_f32)().run(input).unwrap();
+}
+
+criterion_group!(benches, hardswish_f32);
+criterion_main!(benches);
diff --git a/linalg/benches/sigmoid.rs b/linalg/benches/sigmoid.rs
index c9868b654c..c095b242ab 100644
--- a/linalg/benches/sigmoid.rs
+++ b/linalg/benches/sigmoid.rs
@@ -4,18 +4,30 @@ extern crate tract_linalg;
 use criterion::Criterion;
 
 fn ssigmoid(c: &mut Criterion, n: usize) {
-    c.bench_function(&format!("ssigmoid_{n}"), move |be| {
+    c.bench_function(&format!("ssigmoid_tract_{n}"), move |be| {
         let mut s = (0..n).map(|i| i as f32 / 10.0).collect::<Vec<f32>>();
         let op = &(tract_linalg::ops().sigmoid_f32)();
         be.iter(|| op.run(&mut s));
     });
 }
 
+#[inline(never)]
+fn rust_sigmoid(x: &mut [f32]) {
+    for v in x {
+        *v = 1.0 / (1.0 + (-*v).exp());
+    }
+}
+
+fn ssigmoid_scalar(c: &mut Criterion, n: usize) {
+    c.bench_function(&format!("ssigmoid_scalar_{n}"), move |be| {
+        let mut s = (0..n).map(|i| i as f32 / 10.0).collect::<Vec<f32>>();
+        be.iter(|| rust_sigmoid(&mut s));
+    });
+}
+
 fn bs(c: &mut Criterion) {
-    ssigmoid(c, 4);
-    ssigmoid(c, 8);
-    ssigmoid(c, 128);
     ssigmoid(c, 1024);
+    ssigmoid_scalar(c, 1024);
 }
 
 criterion_group!(benches, bs);
diff --git a/linalg/benches/silu.rs b/linalg/benches/silu.rs
new file mode 100644
index 0000000000..acc44ebe20
--- /dev/null
+++ b/linalg/benches/silu.rs
@@ -0,0 +1,40 @@
+use criterion::*;
+use tract_data::prelude::*;
+
+use tract_linalg::element_wise::ElementWiseKer;
+
+fn silu_f32(c: &mut Criterion) {
+    let mut group = c.benchmark_group("silu_f32");
+    group.throughput(Throughput::Elements(1024));
+    let mut input = unsafe { Tensor::uninitialized_aligned::<f32>(&[1024], 16).unwrap() };
+    let input = unsafe { input.as_slice_mut_unchecked::<f32>() };
+    for (i, x) in input.iter_mut().enumerate() {
+        *x = (i as f32 / 10.0).sin() * 5.0;
+    }
+    group.bench_function("rust_scalar", |b| b.iter(|| rust_scalar(input)));
+    group.bench_function("linalg", |b| b.iter(|| linalg(input)));
+    #[cfg(target_arch = "aarch64")]
+    group.bench_function("linalg-asm-compose", |b| {
+        b.iter(|| tract_linalg::arm64::arm64simd_silu_f32_4n::run(input, ()))
+    });
+    #[cfg(target_arch = "aarch64")]
+    group.bench_function("linalg-asm-fused", |b| {
+        b.iter(|| tract_linalg::arm64::arm64simd_silu_f32_4n_fused::run(input, ()))
+    });
+}
+
+#[inline(never)]
+fn rust_scalar(input: &mut [f32]) {
+    for x in input {
+        let sigmoid = 1.0 / (1.0 + (-*x).exp());
+        *x = *x * sigmoid;
+    }
+}
+
+#[inline(never)]
+fn linalg(input: &mut [f32]) {
+    (tract_linalg::ops().silu_f32)().run(input).unwrap();
+}
+
+criterion_group!(benches, silu_f32);
+criterion_main!(benches);
diff --git a/linalg/benches/vnni_i32.rs b/linalg/benches/vnni_i32.rs
new file mode 100644
index 0000000000..6901427f4a
--- /dev/null
+++ b/linalg/benches/vnni_i32.rs
@@ -0,0 +1,70 @@
+#![allow(dead_code)]
+// Kernel-level benchmark: AVX-512 VNNI int8 GEMM over the K=4-inner PackedI8K4
+// layout (VPDPBUSD) vs the AVX2 int8 path (avx2_mmm_i32_8x8, vpmaddubsw-style
+// widening). Three columns: the AVX2 baseline, the 8x8 ymm VNNI kernel, and the
+// 16x16 zmm VNNI kernel (twice the columns per accumulator). All run the i8i8
+// packing (index 1) over the same M/K/N so the only difference is the matmul
+// inner loop and tile geometry.
+use criterion::*;
+use tract_data::internal::*;
+use tract_linalg::mmm::{AsInputValue, FusedSpec, MatMatMul};
+
+fn run_kernel(be: &mut Bencher, mmm: &dyn MatMatMul, m: usize, k: usize, n: usize) {
+    let a = Tensor::zero_dt(DatumType::I8, &[m, k]).unwrap();
+    let b = Tensor::zero_dt(DatumType::I8, &[k, n]).unwrap();
+    let (pack_a, pack_b) = &mmm.packings()[1];
+    let pa = pack_a.prepare_one(&a, 1, 0).unwrap();
+    let pb = pack_b.prepare_one(&b, 0, 1).unwrap();
+    let mut scratch = unsafe { mmm.allocate_scratch_space() };
+    be.iter_custom(|iters| {
+        let mut dur = std::time::Duration::default();
+        for _ in 0..iters {
+            let t = std::time::Instant::now();
+            unsafe {
+                mmm.run_with_scratch_space(
+                    m,
+                    n,
+                    scratch.as_mut(),
+                    &[FusedSpec::AddMatMul {
+                        a: AsInputValue::Borrowed(&*pa),
+                        b: AsInputValue::Borrowed(&*pb),
+                        packing: 1,
+                    }],
+                )
+                .unwrap()
+            };
+            dur += t.elapsed();
+        }
+        dur
+    });
+}
+
+fn benches(c: &mut Criterion) {
+    if !std::is_x86_feature_detected!("avx512vnni") {
+        eprintln!("avx512vnni not available, skipping");
+        return;
+    }
+    use tract_linalg::x86_64_fma::mmm::*;
+    for &(m, k, n) in
+        &[(64usize, 256usize, 64usize), (256, 256, 256), (512, 512, 512), (1024, 1024, 64)]
+    {
+        let id = format!("{m}x{k}x{n}");
+        let mut g = c.benchmark_group("vnni_i32/packed_packed");
+        g.throughput(Throughput::Elements((m * k * n) as u64));
+        g.bench_with_input(BenchmarkId::new("avx2", &id), &(m, k, n), |b, &(m, k, n)| {
+            run_kernel(b, &*avx2_mmm_i32_8x8.mmm(), m, k, n)
+        });
+        g.bench_with_input(BenchmarkId::new("avx512vnni", &id), &(m, k, n), |b, &(m, k, n)| {
+            run_kernel(b, &*avx512vnni_mmm_i32_8x8.mmm(), m, k, n)
+        });
+        g.bench_with_input(
+            BenchmarkId::new("avx512vnni_16x16", &id),
+            &(m, k, n),
+            |b, &(m, k, n)| run_kernel(b, &*avx512vnni_mmm_i32_16x16.mmm(), m, k, n),
+        );
+        g.finish();
+    }
+}
+
+criterion_group!(g, benches);
+criterion_main!(g);
diff --git a/linalg/benches/wasm.rs b/linalg/benches/wasm.rs
new file mode 100644
index 0000000000..a87cd3aa41
--- /dev/null
+++ b/linalg/benches/wasm.rs
@@ -0,0 +1,684 @@
+//! WASM kernel microbenches. Run on wasm32 only.
+//!
+//!   RUSTFLAGS='-C target-feature=+simd128' \
+//!     CARGO_TARGET_WASM32_WASIP1_RUNNER='wasmtime --env RUST_TEST_NOCAPTURE=1 --' \
+//!     cargo bench --release --target wasm32-wasip1 -p tract-linalg --bench wasm
+//!
+//! Re-run with `+simd128,+relaxed-simd` to compare baseline mul+add against
+//! the FMA emit driven by the `madd_f32x4!` macro in `linalg/src/wasm.rs`.
+
+#[cfg(not(target_arch = "wasm32"))]
+fn main() {
+    eprintln!("this bench only runs on wasm32 targets — skipping on host");
+}
+
+#[cfg(target_arch = "wasm32")]
+fn main() {
+    let target = if cfg!(target_feature = "relaxed-simd") {
+        "+simd128,+relaxed-simd (FMA)"
+    } else {
+        "+simd128 only (mul+add)"
+    };
+
+    eprintln!("=== WASM 8x8 GEMM microbench ({target}) ===");
+    bench_8x8::run();
+
+    eprintln!();
+    eprintln!("=== Isolated 32x1 GEMV microbench ({target}) ===");
+    bench_32x1::run();
+
+    eprintln!();
+    eprintln!("=== Isolated 16x1 GEMV microbench ({target}) ===");
+    bench_16x1::run();
+
+    eprintln!();
+    eprintln!("=== int8 (i8->i32) 4x4 GEMM: wasm SIMD vs generic scalar ({target}) ===");
+    bench_i8_4x4::run();
+
+    #[cfg(target_feature = "relaxed-simd")]
+    {
+        eprintln!();
+        eprintln!("=== int8 relaxed-dot prototype: relaxed_dot vs widening (4x4 tile) ===");
+        bench_relaxed_dot::run();
+    }
+    #[cfg(not(target_feature = "relaxed-simd"))]
+    eprintln!("\n(int8 relaxed-dot prototype skipped — rebuild with +relaxed-simd)");
+}
+
+#[cfg(target_arch = "wasm32")]
+mod bench_8x8 {
+    //! Microbench: time `wasm_f32_8x8` (the GEMM kernel for N>=2) at shapes
+    //! relevant to DFN3, transformer FFN, and CNN→GEMM workloads.
+
+    use std::time::Instant;
+    use tract_data::internal::*;
+    use tract_linalg::mmm::{AsInputValue, FusedSpec};
+
+    fn run_one(
+        kernel: &dyn tract_linalg::mmm::MatMatMul,
+        m: usize,
+        k: usize,
+        n: usize,
+        iters: usize,
+    ) -> f64 {
+        let packing = &kernel.packings()[0];
+        let a = Tensor::zero::<f32>(&[m, k]).unwrap();
+        let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+        let b = Tensor::zero::<f32>(&[k, n]).unwrap();
+        let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+        let mut c = Tensor::zero::<f32>(&[m, n]).unwrap();
+
+        for _ in 0..50 {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        n,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(1)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        n,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(1)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+        let elapsed = t0.elapsed();
+        elapsed.as_secs_f64() / iters as f64 * 1e9
+    }
+
+    fn pick(name: &str) -> Box<dyn tract_linalg::mmm::MatMatMul> {
+        let mut ops = tract_linalg::generic();
+        tract_linalg::wasm::plug(&mut ops);
+        for impl_ in ops.mmm_impls() {
+            if impl_.name() == name {
+                return impl_.clone();
+            }
+        }
+        panic!("kernel {name} not registered")
+    }
+
+    fn bench_shape(label: &str, m: usize, k: usize, n: usize, iters: usize) {
+        let k88 = pick("wasm_f32_8x8");
+        let ns = run_one(&*k88, m, k, n, iters);
+        let m_tiles = m.div_ceil(8);
+        let n_tiles = n.div_ceil(8);
+        let total_tiles = m_tiles * n_tiles;
+        let per_tile_ns = ns / total_tiles as f64;
+        eprintln!(
+            "{label} (m={m} k={k} n={n}, iters={iters}): {ns:.0} ns/call \
+             ({total_tiles} 8x8 tiles, {per_tile_ns:.1} ns/tile)"
+        );
+    }
+
+    pub fn run() {
+        // DFN3 N>1 GEMM case (the primary 8x8 hit on DFN3).
+        bench_shape("DFN3-style m=64 k=64 n=8", 64, 64, 8, 50_000);
+        // Larger N — typical batched/transformer GEMM.
+        bench_shape("m=64 k=64 n=64", 64, 64, 64, 10_000);
+        bench_shape("m=128 k=128 n=8", 128, 128, 8, 20_000);
+        bench_shape("m=128 k=128 n=64", 128, 128, 64, 5_000);
+        bench_shape("m=256 k=256 n=8", 256, 256, 8, 5_000);
+        bench_shape("m=256 k=256 n=64", 256, 256, 64, 1_000);
+        // Whisper-tiny FFN-ish (large K, small N).
+        bench_shape("m=384 k=1536 n=8", 384, 1536, 8, 1_000);
+    }
+}
+
+#[cfg(target_arch = "wasm32")]
+mod bench_32x1 {
+    //! Isolated, statistics-aware microbench for `wasm_f32_32x1` to investigate
+    //! the apparent regression at M=100/256 in `microbench_dispatch_gemv`. That
+    //! bench loops all 4 GEMV kernels back-to-back at every shape, biasing the
+    //! later-running kernel (32x1) with cache contention and thermal buildup.
+    //! This module benches 32x1 alone, with min-of-N reporting across
+    //! repetitions to expose variance honestly.
+
+    use std::time::Instant;
+    use tract_data::internal::*;
+    use tract_linalg::mmm::{AsInputValue, FusedSpec};
+
+    fn run_one(kernel: &dyn tract_linalg::mmm::MatMatMul, m: usize, k: usize, iters: usize) -> f64 {
+        let packing = &kernel.packings()[0];
+        let a = Tensor::zero::<f32>(&[m, k]).unwrap();
+        let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+        let b = Tensor::zero::<f32>(&[k, 1]).unwrap();
+        let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+        let mut c = Tensor::zero::<f32>(&[m, 1]).unwrap();
+
+        // Generous warmup — 200 calls primes the JIT and hot caches.
+        for _ in 0..200 {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+        let elapsed = t0.elapsed();
+        elapsed.as_secs_f64() / iters as f64 * 1e9
+    }
+
+    fn pick(name: &str) -> Box<dyn tract_linalg::mmm::MatMatMul> {
+        let mut ops = tract_linalg::generic();
+        tract_linalg::wasm::plug(&mut ops);
+        for impl_ in ops.mmm_impls() {
+            if impl_.name() == name {
+                return impl_.clone();
+            }
+        }
+        panic!("kernel {name} not registered")
+    }
+
+    fn bench_min_of_n(label: &str, m: usize, k: usize, iters: usize, repetitions: usize) {
+        let kernel = pick("wasm_f32_32x1");
+        let mut samples: Vec<f64> = Vec::with_capacity(repetitions);
+        for _ in 0..repetitions {
+            samples.push(run_one(&*kernel, m, k, iters));
+        }
+        samples.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        let min = samples[0];
+        let median = samples[samples.len() / 2];
+        let max = samples[samples.len() - 1];
+        let pct_spread = (max - min) / min * 100.0;
+        eprintln!(
+            "{label} (m={m} k={k}, {iters} iters × {repetitions} reps): \
+             min={min:.0} median={median:.0} max={max:.0} ns/call (spread {pct_spread:.0}%)"
+        );
+    }
+
+    pub fn run() {
+        // Suspect shapes from microbench_dispatch_gemv (apparent regression):
+        bench_min_of_n("M=100 k=256", 100, 256, 10_000, 10);
+        bench_min_of_n("M=256 k=256", 256, 256, 5_000, 10);
+        bench_min_of_n("M=256 k=512", 256, 512, 2_000, 10);
+        // Reference shapes (showed clean speedup before):
+        bench_min_of_n("M=24 k=256", 24, 256, 30_000, 10);
+        bench_min_of_n("M=64 k=96", 64, 96, 20_000, 10);
+    }
+}
+
+#[cfg(target_arch = "wasm32")]
+mod bench_16x1 {
+    //! Isolated 16x1 GEMV microbench — same methodology as bench_32x1.
+    //! 16x1 has 4 SIMD accumulators per K-step, which under +relaxed-simd
+    //! exposes the destructive-fmla accumulator recurrence (4-cycle latency
+    //! throttling throughput to 1 FMA/cycle even though Apple Silicon pipes
+    //! can do 4). Used to validate that the fix in linalg/src/wasm.rs (which
+    //! routes 16x1 through `madd_f32x4_nofma!` to use separate mul+add)
+    //! recovers the regression PR #2199 missed.
+    use std::time::Instant;
+    use tract_data::internal::*;
+    use tract_linalg::mmm::{AsInputValue, FusedSpec};
+
+    fn run_one(kernel: &dyn tract_linalg::mmm::MatMatMul, m: usize, k: usize, iters: usize) -> f64 {
+        let packing = &kernel.packings()[0];
+        let a = Tensor::zero::<f32>(&[m, k]).unwrap();
+        let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+        let b = Tensor::zero::<f32>(&[k, 1]).unwrap();
+        let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+        let mut c = Tensor::zero::<f32>(&[m, 1]).unwrap();
+
+        for _ in 0..200 {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+        let elapsed = t0.elapsed();
+        elapsed.as_secs_f64() / iters as f64 * 1e9
+    }
+
+    fn pick(name: &str) -> Box<dyn tract_linalg::mmm::MatMatMul> {
+        let mut ops = tract_linalg::generic();
+        tract_linalg::wasm::plug(&mut ops);
+        for impl_ in ops.mmm_impls() {
+            if impl_.name() == name {
+                return impl_.clone();
+            }
+        }
+        panic!("kernel {name} not registered")
+    }
+
+    fn bench_min_of_n(label: &str, m: usize, k: usize, iters: usize, repetitions: usize) {
+        let kernel = pick("wasm_f32_16x1");
+        let mut samples: Vec<f64> = Vec::with_capacity(repetitions);
+        for _ in 0..repetitions {
+            samples.push(run_one(&*kernel, m, k, iters));
+        }
+        samples.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        let min = samples[0];
+        let median = samples[samples.len() / 2];
+        let max = samples[samples.len() - 1];
+        let pct_spread = (max - min) / min * 100.0;
+        eprintln!(
+            "{label} (m={m} k={k}, {iters} iters × {repetitions} reps): \
+             min={min:.0} median={median:.0} max={max:.0} ns/call (spread {pct_spread:.0}%)"
+        );
+    }
+
+    pub fn run() {
+        // 16x1's natural band per plug()'s mmv_f32 closure: M ∈ 9..=16
+        bench_min_of_n("M=9 k=256", 9, 256, 30_000, 10);
+        bench_min_of_n("M=12 k=256", 12, 256, 30_000, 10);
+        bench_min_of_n("M=16 k=96", 16, 96, 30_000, 10);
+        bench_min_of_n("M=16 k=256", 16, 256, 20_000, 10);
+        bench_min_of_n("M=16 k=512", 16, 512, 10_000, 10);
+        bench_min_of_n("M=16 k=1024", 16, 1024, 5_000, 10);
+    }
+}
+
+#[cfg(target_arch = "wasm32")]
+mod bench_i8_4x4 {
+    //! int8 (i8->i32) GEMM microbench: the new SIMD `wasm_i32_4x4` vs the scalar
+    //! `generic_i32_4x4` fallback. Both kernels expose the *identical* i8i8
+    //! PackedI8K4 packing (packing index 1), the same 4x4 tile and i32
+    //! accumulator — so the ratio is a clean read on what the SIMD
+    //! widening-extmul AddMatMul buys over the generic scalar loop. min-of-N
+    //! reporting per kernel to keep the variance honest.
+
+    use std::time::Instant;
+    use tract_data::internal::*;
+    use tract_linalg::mmm::{AsInputValue, FusedSpec, MatMatMul};
+
+    // i8i8 packing slot is index 1 on both generic_i32_4x4 and wasm_i32_4x4.
+    const I8I8: usize = 1;
+
+    fn run_one(kernel: &dyn MatMatMul, m: usize, k: usize, n: usize, iters: usize) -> f64 {
+        let packing = &kernel.packings()[I8I8];
+        let a = Tensor::zero::<i8>(&[m, k]).unwrap();
+        let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+        let b = Tensor::zero::<i8>(&[k, n]).unwrap();
+        let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+        let mut c = Tensor::zero::<i32>(&[m, n]).unwrap();
+
+        // Warmup: prime the JIT and hot caches.
+        for _ in 0..50 {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        n,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: I8I8,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(1)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        n,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: I8I8,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(1)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+        let elapsed = t0.elapsed();
+        elapsed.as_secs_f64() / iters as f64 * 1e9
+    }
+
+    fn pick(name: &str) -> Box<dyn MatMatMul> {
+        let mut ops = tract_linalg::generic();
+        tract_linalg::wasm::plug(&mut ops);
+        for impl_ in ops.mmm_impls() {
+            if impl_.name() == name {
+                return impl_.clone();
+            }
+        }
+        panic!("kernel {name} not registered")
+    }
+
+    fn min_of_n(
+        kernel: &dyn MatMatMul,
+        m: usize,
+        k: usize,
+        n: usize,
+        iters: usize,
+        reps: usize,
+    ) -> f64 {
+        let mut samples: Vec<f64> = (0..reps).map(|_| run_one(kernel, m, k, n, iters)).collect();
+        samples.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        samples[0]
+    }
+
+    fn bench(label: &str, m: usize, k: usize, n: usize, iters: usize, reps: usize) {
+        let wasm = pick("wasm_i32_4x4");
+        let generic = pick("generic_i32_4x4");
+        let w = min_of_n(&*wasm, m, k, n, iters, reps);
+        let g = min_of_n(&*generic, m, k, n, iters, reps);
+        let tiles = m.div_ceil(4) * n.div_ceil(4);
+        eprintln!(
+            "{label} (m={m} k={k} n={n}, {iters} iters × {reps} reps): \
+             wasm={w:.0} generic={g:.0} ns/call  speedup={:.2}x  \
+             ({tiles} 4x4 tiles, wasm {:.1} ns/tile)",
+            g / w,
+            w / tiles as f64
+        );
+    }
+
+    pub fn run() {
+        // Square GEMMs across sizes (compute-bound, the SIMD path's home turf).
+        bench("square m=64 k=64 n=64", 64, 64, 64, 5_000, 8);
+        bench("square m=128 k=128 n=128", 128, 128, 128, 1_000, 8);
+        bench("square m=256 k=256 n=256", 256, 256, 256, 200, 8);
+        // Transformer-ish: large K, moderate M/N (MiniLM/FFN projections).
+        bench("m=128 k=384 n=384", 128, 384, 384, 500, 8);
+        bench("m=64 k=1536 n=64", 64, 1536, 64, 1_000, 8);
+        // CNN→GEMM (InceptionV1-style im2col), small N.
+        bench("m=256 k=256 n=16", 256, 256, 16, 2_000, 8);
+    }
+}
+
+// Prototype: int8 4x4 tile via `i32x4_relaxed_dot_i8x16_i7x16_add` (SDOT-analog,
+// 4 i8 MACs/lane, no widening) vs the deterministic widening path. Only compiles
+// under +relaxed-simd. Isolates a single cache-resident 4x4 tile so the ratio is
+// a pure instruction-density read. Includes a bit-exactness check on wasmtime.
+#[cfg(all(target_arch = "wasm32", target_feature = "relaxed-simd"))]
+mod bench_relaxed_dot {
+    use std::arch::wasm32::*;
+    use std::hint::black_box;
+    use std::time::Instant;
+
+    // Logical A is [4][k] row-major (a[m*k + ik]); logical B is [k][4] row-major
+    // (b[ik*4 + n]). Reference 4x4 = sum_ik A[m][ik] * B[ik][n].
+    fn reference_tile(a: &[i8], b: &[i8], k: usize) -> [i32; 16] {
+        let mut c = [0i32; 16];
+        for ik in 0..k {
+            for m in 0..4 {
+                for n in 0..4 {
+                    c[m * 4 + n] += a[m * k + ik] as i32 * b[ik * 4 + n] as i32;
+                }
+            }
+        }
+        c
+    }
+
+    // K-major A: out[ik*4 + m] = A[m][ik] (m inner) — what the widening kernel reads.
+    fn pack_a_kmajor(a: &[i8], k: usize) -> Vec<i8> {
+        let mut o = vec![0i8; k * 4];
+        for ik in 0..k {
+            for m in 0..4 {
+                o[ik * 4 + m] = a[m * k + ik];
+            }
+        }
+        o
+    }
+    // K-major B is exactly the logical [ik*4 + n] layout already.
+
+    // M-major A, K contiguous, K padded to mult of 4: out[m*kp + ik] = A[m][ik].
+    fn pack_a_mmajor(a: &[i8], k: usize) -> (Vec<i8>, usize) {
+        let kp = k.div_ceil(4) * 4;
+        let mut o = vec![0i8; 4 * kp];
+        for m in 0..4 {
+            for ik in 0..k {
+                o[m * kp + ik] = a[m * k + ik];
+            }
+        }
+        (o, kp)
+    }
+    // K=4-inner B: out[kb*16 + n*4 + kr] = B[4kb+kr][n] — the relaxed-dot layout.
+    fn pack_b_k4(b: &[i8], k: usize) -> Vec<i8> {
+        let kp = k.div_ceil(4) * 4;
+        let mut o = vec![0i8; kp * 4];
+        for kb in 0..kp / 4 {
+            for kr in 0..4 {
+                let kk = 4 * kb + kr;
+                if kk >= k {
+                    continue;
+                }
+                for n in 0..4 {
+                    o[kb * 16 + n * 4 + kr] = b[kk * 4 + n];
+                }
+            }
+        }
+        o
+    }
+
+    // Current deterministic approach: widen B to i32x4 per k, splat A, mul+add.
+    unsafe fn widening_tile(a_km: *const i8, b_km: *const i8, k: usize) -> [i32; 16] {
+        unsafe {
+            let mut acc = [i32x4_splat(0); 4];
+            for ik in 0..k {
+                let bw = v128_load32_zero(b_km.add(4 * ik) as *const u32);
+                let bw = i16x8_extend_low_i8x16(bw);
+                let bw = i32x4_extend_low_i16x8(bw);
+                let ar = a_km.add(4 * ik);
+                acc[0] = i32x4_add(acc[0], i32x4_mul(i32x4_splat(*ar.add(0) as i32), bw));
+                acc[1] = i32x4_add(acc[1], i32x4_mul(i32x4_splat(*ar.add(1) as i32), bw));
+                acc[2] = i32x4_add(acc[2], i32x4_mul(i32x4_splat(*ar.add(2) as i32), bw));
+                acc[3] = i32x4_add(acc[3], i32x4_mul(i32x4_splat(*ar.add(3) as i32), bw));
+            }
+            let mut c = [0i32; 16];
+            for m in 0..4 {
+                v128_store(c[m * 4..].as_mut_ptr() as *mut v128, acc[m]);
+            }
+            c
+        }
+    }
+
+    // Relaxed-dot: per 4-K block, one v128 B-load shared across 4 rows; each row
+    // broadcasts its 4 K-bytes and issues one relaxed_dot. 64 MACs in 4 dots.
+    unsafe fn relaxed_tile(apk: *const i8, bpk: *const i8, kp: usize) -> [i32; 16] {
+        unsafe {
+            let mut acc = [i32x4_splat(0); 4];
+            for kb in 0..kp / 4 {
+                let b_all = v128_load(bpk.add(kb * 16) as *const v128);
+                for m in 0..4 {
+                    let a4 = (apk.add(m * kp + kb * 4) as *const i32).read_unaligned();
+                    let a_m = i32x4_splat(a4);
+                    acc[m] = i32x4_relaxed_dot_i8x16_i7x16_add(a_m, b_all, acc[m]);
+                }
+            }
+            let mut c = [0i32; 16];
+            for m in 0..4 {
+                v128_store(c[m * 4..].as_mut_ptr() as *mut v128, acc[m]);
+            }
+            c
+        }
+    }
+
+    fn gen_data(k: usize, seed: i32, bits7: bool) -> Vec<i8> {
+        (0..k * 4)
+            .map(|i| {
+                let v = ((i as i32).wrapping_mul(97).wrapping_add(seed).wrapping_mul(31)) & 0xff;
+                let v = (v - 128) as i8; // full i8 range
+                if bits7 { (v as i32).clamp(-63, 63) as i8 } else { v }
+            })
+            .collect()
+    }
+
+    fn check(label: &str, k: usize, b_bits7: bool) {
+        let a = gen_data(k, 1, false);
+        let b = gen_data(k, 7, b_bits7);
+        let reference = reference_tile(&a, &b, k);
+
+        let a_km = pack_a_kmajor(&a, k);
+        let w = unsafe { widening_tile(a_km.as_ptr(), b.as_ptr(), k) };
+        assert_eq!(w, reference, "widening_tile mismatch ({label})");
+
+        let (a_mm, kp) = pack_a_mmajor(&a, k);
+        let b_k4 = pack_b_k4(&b, k);
+        let r = unsafe { relaxed_tile(a_mm.as_ptr(), b_k4.as_ptr(), kp) };
+        let exact = r == reference;
+        eprintln!(
+            "  correctness {label} (k={k}, B={}): widening=exact  relaxed={}",
+            if b_bits7 { "7-bit" } else { "full-i8" },
+            if exact { "EXACT" } else { "DIFFERS (non-deterministic intermediate)" }
+        );
+        if b_bits7 {
+            assert!(exact, "relaxed_dot must be exact when B is 7-bit ({label})");
+        }
+    }
+
+    fn time_relaxed(apk: &[i8], bpk: &[i8], kp: usize, iters: usize) -> f64 {
+        let mut sink = 0i32;
+        for _ in 0..50 {
+            sink ^= unsafe { relaxed_tile(apk.as_ptr(), bpk.as_ptr(), kp) }[0];
+        }
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            let c = unsafe {
+                relaxed_tile(black_box(apk).as_ptr(), black_box(bpk).as_ptr(), black_box(kp))
+            };
+            sink ^= c[5];
+        }
+        black_box(sink);
+        t0.elapsed().as_secs_f64() / iters as f64 * 1e9
+    }
+
+    fn time_widening(a_km: &[i8], b_km: &[i8], k: usize, iters: usize) -> f64 {
+        let mut sink = 0i32;
+        for _ in 0..50 {
+            sink ^= unsafe { widening_tile(a_km.as_ptr(), b_km.as_ptr(), k) }[0];
+        }
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            let c = unsafe {
+                widening_tile(black_box(a_km).as_ptr(), black_box(b_km).as_ptr(), black_box(k))
+            };
+            sink ^= c[5];
+        }
+        black_box(sink);
+        t0.elapsed().as_secs_f64() / iters as f64 * 1e9
+    }
+
+    fn min_of_n(f: &mut dyn FnMut() -> f64, reps: usize) -> f64 {
+        let mut s: Vec<f64> = (0..reps).map(|_| f()).collect();
+        s.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        s[0]
+    }
+
+    fn bench(k: usize, iters: usize, reps: usize) {
+        let a = gen_data(k, 1, false);
+        let b = gen_data(k, 7, false);
+        let a_km = pack_a_kmajor(&a, k);
+        let (a_mm, kp) = pack_a_mmajor(&a, k);
+        let b_k4 = pack_b_k4(&b, k);
+
+        let w = min_of_n(&mut || time_widening(&a_km, &b, k, iters), reps);
+        let r = min_of_n(&mut || time_relaxed(&a_mm, &b_k4, kp, iters), reps);
+        eprintln!(
+            "  4x4 tile k={k} ({iters} iters × {reps} reps): \
+             widening={w:.1} relaxed={r:.1} ns/call  speedup={:.2}x",
+            w / r
+        );
+    }
+
+    pub fn run() {
+        // Bit-exactness on wasmtime: full-i8 (engine-dependent intermediate) and
+        // 7-bit B (guaranteed no i16 overflow → deterministic on any engine).
+        check("k=64", 64, false);
+        check("k=64", 64, true);
+        check("k=260-padded", 260, false);
+        check("k=260-padded", 260, true);
+        eprintln!();
+        // Throughput: single cache-resident 4x4 tile across K depths.
+        bench(64, 200_000, 8);
+        bench(256, 50_000, 8);
+        bench(1024, 10_000, 8);
+        bench(1536, 8_000, 8);
+    }
+}
diff --git a/linalg/build.rs b/linalg/build.rs
index f57de0162e..69f6124912 100644
--- a/linalg/build.rs
+++ b/linalg/build.rs
@@ -15,6 +15,101 @@ fn include_amx() -> bool {
         || (env::var("CARGO_FEATURE_APPLE_AMX_IOS").is_ok() && os == "ios" && arch == "aarch64")
 }
 
+fn include_sme() -> bool {
+    let arch = var("CARGO_CFG_TARGET_ARCH");
+    let os = var("CARGO_CFG_TARGET_OS");
+    arch == "aarch64" && (os == "macos" || os == "linux")
+}
+
+// Probe whether the target assembler can actually assemble SME instructions.
+// Old binutils (e.g. the Debian stretch aarch64 cross-toolchain used in CI)
+// predate SME and reject the mnemonics even with `.arch armv9-a+sme2`, which
+// breaks the build. When the probe fails we skip the SME kernels entirely;
+// the matching `tract_sme` cfg keeps the Rust side from referencing the
+// (now absent) kernel symbols, and dispatch falls back to the portable path.
+fn assembler_supports_sme() -> bool {
+    cc::Build::new()
+        .file("arm64/sme/dummy_sme.S")
+        .cargo_metadata(false)
+        .cargo_warnings(false)
+        .warnings(false)
+        .try_compile("tract_sme_probe")
+        .is_ok()
+}
+
+// Probe whether the target assembler can actually assemble Intel AMX int8
+// instructions (`ldtilecfg`, `tilezero`, `tdpbusd`, `tilerelease`). Older
+// binutils (e.g. Debian stretch's gas 2.28) predate AMX and reject these
+// mnemonics outright, which would break the x86_64 build for users on those
+// toolchains. When the probe fails we skip the AMX kernel entirely; the
+// matching `tract_amx_int8` cfg keeps the Rust side from referencing the
+// (absent) kernel symbol, and `qmmm_i32` dispatch falls back to VNNI (or
+// AVX2 when VNNI is itself unavailable).
+fn assembler_supports_amx_int8() -> bool {
+    cc::Build::new()
+        .file("x86_64/avx512amx/dummy.S")
+        .cargo_metadata(false)
+        .cargo_warnings(false)
+        .warnings(false)
+        .try_compile("tract_amx_int8_probe")
+        .is_ok()
+}
+
+// Probe whether the assembler accepts the `{vex}` prefix on VPDPBUSD --
+// needed to force the AVX-VNNI (VEX) form instead of the AVX-512-VNNI
+// (EVEX) form gas defaults to. `{vex}` / `{evex}` instruction prefixes
+// were added in binutils 2.36; older toolchains reject them. When the
+// probe fails the avxvnni_mmm_i32_8x8 kernel is skipped and dispatch
+// falls back to the AVX2 emulation kernel on AVX-VNNI-only hardware.
+fn assembler_supports_avxvnni() -> bool {
+    cc::Build::new()
+        .file("x86_64/avx512amx/dummy_avxvnni.S")
+        .cargo_metadata(false)
+        .cargo_warnings(false)
+        .warnings(false)
+        .try_compile("tract_avxvnni_probe")
+        .is_ok()
+}
+
+// Probe whether the target assembler can assemble AMX bf16 instructions
+// (`tdpbf16ps`). Both int8 and bf16 AMX mnemonics require binutils >= 2.34,
+// so in practice this probe succeeds whenever `assembler_supports_amx_int8`
+// does. Provided separately so the two cfgs are independently controlled
+// and users on exotic toolchains can opt-out of just the bf16 kernel.
+fn assembler_supports_amx_bf16() -> bool {
+    cc::Build::new()
+        .file("x86_64/avx512amx/dummy_bf16.S")
+        .cargo_metadata(false)
+        .cargo_warnings(false)
+        .warnings(false)
+        .try_compile("tract_amx_bf16_probe")
+        .is_ok()
+}
+
+fn include_sve() -> bool {
+    // SVE/SVE2 lives on ARMv9 server/mobile cores (Neoverse V1+/N2+, Cortex-X2+,
+    // Graviton 3/4) — Linux aarch64. No Apple silicon has SVE.
+    var("CARGO_CFG_TARGET_ARCH") == "aarch64" && var("CARGO_CFG_TARGET_OS") == "linux"
+}
+
+// Probe whether the C compiler supports SVE intrinsics (arm_sve.h + `+sve`).
+// Old toolchains (e.g. the Debian stretch cross-gcc) lack them; when the probe
+// fails we skip the SVE kernels and the `tract_sve` cfg, so the Rust side never
+// references the (absent) symbols and dispatch falls back to NEON.
+fn compiler_supports_sve() -> bool {
+    let out_dir = path::PathBuf::from(var("OUT_DIR"));
+    let probe = out_dir.join("sve_probe.c");
+    fs::write(&probe, "#include <arm_sve.h>\nint p(void){ return (int)svcntw(); }\n").unwrap();
+    cc::Build::new()
+        .file(&probe)
+        .flag("-march=armv8.2-a+sve")
+        .cargo_metadata(false)
+        .cargo_warnings(false)
+        .warnings(false)
+        .try_compile("tract_sve_probe")
+        .is_ok()
+}
+
 fn jump_table() -> Vec<String> {
     println!("cargo:rerun-if-changed=src/frame/mmm/fuse.rs");
     std::fs::read_to_string("src/frame/mmm/fuse.rs")
@@ -79,11 +174,73 @@ fn main() {
     let suffix = env!("CARGO_PKG_VERSION").replace(['-', '.'], "_");
     make_extern_kernel_decl_macro(&out_dir, &suffix);
 
+    // `tract_sme` is set below only when both include_sme() and the assembler
+    // SME probe succeed; declare it so rustc's unexpected-cfg lint stays quiet.
+    println!("cargo:rustc-check-cfg=cfg(tract_sme)");
+    // Set below only when include_sve() and the SVE compiler probe both pass.
+    println!("cargo:rustc-check-cfg=cfg(tract_sve)");
+    // Set below only when the x86_64 assembler accepts AMX int8 mnemonics
+    // (avoids breaking the build on toolchains predating AMX).
+    println!("cargo:rustc-check-cfg=cfg(tract_amx_int8)");
+    // Set below only when the assembler accepts AMX bf16 mnemonics (tdpbf16ps).
+    println!("cargo:rustc-check-cfg=cfg(tract_amx_bf16)");
+    // Set below only when the assembler accepts the `{vex}` prefix on
+    // VPDPBUSD (binutils >= 2.36) -- needed for the AVX-VNNI ymm kernel.
+    println!("cargo:rustc-check-cfg=cfg(tract_avxvnni)");
+
     match arch.as_ref() {
         "x86_64" => {
             let mut files = preprocess_files("x86_64/fma", &[], &suffix, false);
             files.extend(preprocess_files("x86_64/avx512", &[], &suffix, false));
 
+            // Pull the AMX kernel templates out of the generic fma bulk-compile
+            // so they can be gated behind assembler probes below. All AMX
+            // mnemonics require gas >= 2.34; old toolchains (Debian stretch's
+            // binutils 2.28) would otherwise fail the whole build.
+            //
+            // Split by accumulator type:
+            //   avx512amx_*_i32_* → tdpbssd   → gated on tract_amx_int8
+            //   avx512amx_*_f32_* → tdpbf16ps → gated on tract_amx_bf16
+            let amx_int8_files: Vec<path::PathBuf> = files
+                .iter()
+                .filter(|f| {
+                    f.file_name()
+                        .and_then(|n| n.to_str())
+                        .map(|n| n.starts_with("avx512amx_") && n.contains("_i32_"))
+                        .unwrap_or(false)
+                })
+                .cloned()
+                .collect();
+            let amx_bf16_files: Vec<path::PathBuf> = files
+                .iter()
+                .filter(|f| {
+                    f.file_name()
+                        .and_then(|n| n.to_str())
+                        .map(|n| n.starts_with("avx512amx_") && n.contains("_f32_"))
+                        .unwrap_or(false)
+                })
+                .cloned()
+                .collect();
+            // AVX-VNNI ymm kernel: gas requires the `{vex}` instruction prefix
+            // (binutils 2.36+) -- pulled aside so the bulk -mfma compile, which
+            // is fine on older binutils, isn't broken when the AVX-VNNI cfg is
+            // disabled.
+            let avxvnni_files: Vec<path::PathBuf> = files
+                .iter()
+                .filter(|f| {
+                    f.file_name()
+                        .and_then(|n| n.to_str())
+                        .map(|n| n.starts_with("avxvnni_"))
+                        .unwrap_or(false)
+                })
+                .cloned()
+                .collect();
+            files.retain(|f| {
+                !amx_int8_files.contains(f)
+                    && !amx_bf16_files.contains(f)
+                    && !avxvnni_files.contains(f)
+            });
+
             if os == "windows" {
                 if use_masm() {
                     let mut lib_exe = cc::windows_registry::find(&target, "lib.exe")
@@ -132,6 +289,53 @@ fn main() {
             } else {
                 cc::Build::new().files(files).flag("-mfma").compile("x86_64_fma");
             }
+
+            // AMX int8 kernel: compile only when the assembler accepts the
+            // mnemonics, and the kernel template was actually pulled aside
+            // above. Unix only for now (the .S uses the GAS intel-syntax
+            // path). The `tract_amx_int8` cfg gates the Rust-side symbol
+            // reference: when the probe fails on old toolchains (e.g. Debian
+            // stretch's binutils 2.28), the kernel is omitted and `qmmm_i32`
+            // dispatch falls back to VNNI or AVX2 with no build error.
+            if os != "windows"
+                && !amx_int8_files.is_empty()
+                && assembler_supports_amx_int8()
+            {
+                cc::Build::new()
+                    .files(&amx_int8_files)
+                    .compile("x86_64_avx512amx");
+                println!("cargo:rustc-cfg=tract_amx_int8");
+            }
+
+            // AMX bf16 kernel for f32 matmul (tdpbf16ps). Same toolchain
+            // requirement and Unix-only constraint as the int8 path. When the
+            // probe fails, the `tract_amx_bf16` cfg stays unset and
+            // `plug_avx512amx_bf16` is compiled out — `mmm_f32` then falls
+            // back to AVX-512 / FMA without any build error.
+            if os != "windows"
+                && !amx_bf16_files.is_empty()
+                && assembler_supports_amx_bf16()
+            {
+                cc::Build::new()
+                    .files(&amx_bf16_files)
+                    .compile("x86_64_avx512amx_bf16");
+                println!("cargo:rustc-cfg=tract_amx_bf16");
+            }
+
+            // AVX-VNNI ymm int8 kernel. Independent of the AMX gates: this
+            // kernel ships VPDPBUSD-accelerated i8 GEMM to Atom-class cores
+            // (Alder Lake-E, Sierra Forest, Clearwater Forest / Darkmont)
+            // that have AVX-VNNI but no AVX-512, falling back to AVX2
+            // emulation when the runtime CPUID detection misses.
+            if os != "windows"
+                && !avxvnni_files.is_empty()
+                && assembler_supports_avxvnni()
+            {
+                cc::Build::new()
+                    .files(&avxvnni_files)
+                    .compile("x86_64_avxvnni");
+                println!("cargo:rustc-cfg=tract_avxvnni");
+            }
         }
         "arm" | "armv7" => {
             let files = preprocess_files("arm32/armvfpv2", &[], &suffix, false);
@@ -156,6 +360,30 @@ fn main() {
                 let files = preprocess_files("arm64/apple_amx", &[], &suffix, false);
                 cc::Build::new().files(files).compile("appleamx");
             }
+            if include_sme() && assembler_supports_sme() {
+                let files = preprocess_files("arm64/sme", &[], &suffix, false);
+                cc::Build::new().files(files).compile("sme");
+                println!("cargo:rustc-cfg=tract_sme");
+            }
+            if include_sve() && compiler_supports_sve() {
+                // VLA SVE kernels (C intrinsics, fixed symbols — not suffix-templated).
+                cc::Build::new()
+                    .file("arm64/sve/sve_mmm_f32.c")
+                    .file("arm64/sve/sve_mmv_f32_64x1.c")
+                    .file("arm64/sve/sve_mmm_i32.c")
+                    .file("arm64/sve/sve_mmm_i32_64x1.c")
+                    .flag("-march=armv8.2-a+sve")
+                    .compile("tract_sve_kernels");
+                // f16 kernels need native FP16 arithmetic (+fp16); compiled
+                // separately so the +sve-only kernels above never gain fp16
+                // codegen. Runtime-gated on has_fp16() as well as SVE2.
+                cc::Build::new()
+                    .file("arm64/sve/sve_mmm_f16.c")
+                    .file("arm64/sve/sve_mmv_f16_64x1.c")
+                    .flag("-march=armv8.2-a+sve+fp16")
+                    .compile("tract_sve_f16_kernels");
+                println!("cargo:rustc-cfg=tract_sve");
+            }
             if std::env::var("CARGO_FEATURE_NO_FP16").is_err() {
                 let config =
                     ConfigForHalf::probe().expect("No configuration found for fp16 support");
diff --git a/linalg/matmul-bench/Cargo.toml b/linalg/matmul-bench/Cargo.toml
index 45ec71c866..471dc451b7 100644
--- a/linalg/matmul-bench/Cargo.toml
+++ b/linalg/matmul-bench/Cargo.toml
@@ -8,20 +8,10 @@ edition = "2024"
 members = []
 
 [dependencies]
-cblas = { version = "0.3", optional = true }
-accelerate-src = { version = "0.3", optional = true }
-blis-src = { version = "0.2", features = ["static"], optional = true }
 matrixmultiply = "*"
 tract-data.workspace = true
 tract-linalg.workspace = true
 
-
-[features]
-default = []
-blas = ["cblas"]
-blis = ["blis-src", "blas"]
-accelerate = ["accelerate-src", "blas"]
-
 [build-dependencies]
 cc = "1.0"
 
diff --git a/linalg/matmul-bench/benches/matmul.rs b/linalg/matmul-bench/benches/matmul.rs
index 3359b6f658..a6a6e422ac 100644
--- a/linalg/matmul-bench/benches/matmul.rs
+++ b/linalg/matmul-bench/benches/matmul.rs
@@ -1,10 +1,4 @@
 #![allow(non_snake_case)]
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-#[cfg(feature = "blis")]
-extern crate blis_src;
-#[cfg(feature = "blis")]
-extern crate cblas;
 
 use criterion::measurement::WallTime;
 use criterion::*;
@@ -34,7 +28,6 @@ b!(tile_8x8);
 b!(ctile_8x8);
 b!(cpacked_tile_8x8);
 b!(matrixmultiply);
-b!(cblas);
 b!(tract);
 
 pub fn tract_blaslike(
@@ -93,7 +86,6 @@ fn matmul(c: &mut Criterion, m: usize, k: usize, n: usize) {
     ctile_8x8(&mut c, m, k, n);
     cpacked_tile_8x8(&mut c, m, k, n);
     matrixmultiply(&mut c, m, k, n);
-    cblas(&mut c, m, k, n);
     tract(&mut c, m, k, n);
     tract_blaslike(&mut c, m, k, n, f32::datum_type());
     tract_blaslike(&mut c, m, k, n, f16::datum_type());
diff --git a/linalg/matmul-bench/src/lib.rs b/linalg/matmul-bench/src/lib.rs
index e389222102..392fbb0650 100644
--- a/linalg/matmul-bench/src/lib.rs
+++ b/linalg/matmul-bench/src/lib.rs
@@ -1,10 +1,4 @@
 #![allow(non_snake_case)]
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-#[cfg(feature = "blis")]
-extern crate blis_src;
-#[cfg(feature = "blis")]
-extern crate cblas;
 
 pub fn naive(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32]) {
     for row in 0..m {
@@ -379,29 +373,6 @@ pub fn matrixmultiply(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mu
     }
 }
 
-#[allow(unused_variables, unused_mut)]
-pub fn cblas(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32]) {
-    #[cfg(feature = "blas")]
-    unsafe {
-        cblas::sgemm(
-            cblas::Layout::RowMajor,
-            cblas::Transpose::None,
-            cblas::Transpose::None,
-            m as _,
-            n as _,
-            k as _,
-            1.0,
-            &a,
-            k as _,
-            &b,
-            n as _,
-            0.0,
-            c,
-            n as _,
-        )
-    }
-}
-
 pub fn tract(m: usize, k: usize, n: usize, a: &[f32], b: &[f32], c: &mut [f32]) {
     use tract_data::internal::*;
     use tract_linalg::frame::mmm::FusedSpec;
diff --git a/linalg/src/arm64.rs b/linalg/src/arm64.rs
index 82359f2104..47f7cd4ccf 100644
--- a/linalg/src/arm64.rs
+++ b/linalg/src/arm64.rs
@@ -4,6 +4,11 @@ mod apple_amx;
 mod arm64simd;
 pub mod cortex_a53;
 mod cortex_a55;
+// `tract_sme` is set by build.rs only when the assembler can assemble SME
+// (gates out e.g. the old Debian stretch aarch64 toolchain).
+#[cfg(all(any(target_os = "macos", target_os = "linux"), tract_sme))]
+mod sme;
+mod sve;
 //mod cortex_a72;
 //mod cortex_a73;
 pub use arm64simd::*;
@@ -403,6 +408,9 @@ pub fn plug(ops: &mut Ops) {
         }
     }
     ops.leaky_relu_f32 = Box::new(|| arm64simd_leaky_relu_f32_8n::ew());
+    ops.hardswish_f32 = Box::new(|| arm64simd_hardswish_f32_8n::ew());
+    ops.silu_f32 = Box::new(|| arm64simd_silu_f32_4n_fused::ew());
+    ops.gelu_f32 = Box::new(|| arm64simd_gelu_f32_4n_fused::ew());
     ops.sigmoid_f32 = Box::new(|| arm64simd_sigmoid_f32_4n::ew());
     ops.tanh_f32 = Box::new(|| arm64simd_tanh_f32_4n::ew());
     ops.max_f32 = Box::new(|| arm64simd_max_f32_16n::red());
@@ -425,4 +433,9 @@ pub fn plug(ops: &mut Ops) {
     {
         apple_amx::plug(ops);
     }
+    #[cfg(all(any(target_os = "macos", target_os = "linux"), tract_sme))]
+    {
+        sme::plug(ops);
+    }
+    sve::plug(ops);
 }
diff --git a/linalg/src/arm64/apple_amx.rs b/linalg/src/arm64/apple_amx.rs
index 68f48c79ce..bae28d5452 100644
--- a/linalg/src/arm64/apple_amx.rs
+++ b/linalg/src/arm64/apple_amx.rs
@@ -4,6 +4,7 @@ use crate::mmm::*;
 use tract_data::prelude::*;
 
 use super::has_amx;
+use super::{arm64fp16_mmm_f16_16x8_gen, arm64simd_mmm_f32_8x8_gen, arm64simd_mmm_f32_64x1_gen};
 
 const AMX: fn() -> bool = crate::arm64::has_amx;
 const CAN_FUSE: fn(&FusedSpec) -> bool = |f| !matches!(f, &FusedSpec::LeakyRelu(_));
@@ -15,11 +16,48 @@ MMMExternKernel!(apple_amx_mmm_f16_64x1<f16>(64, 1)@(128, 128) where(AMX) can_fu
 
 pub fn plug(ops: &mut Ops) {
     if has_amx() {
-        log::info!("AMX optimisation activated");
-        ops.mmm_f16 = Box::new(|_, _, _| apple_amx_mmm_f16_64x32.mmm());
-        ops.mmm_f32 = Box::new(|_, _, _| apple_amx_mmm_f32_32x32.mmm());
+        log::info!(
+            "AMX optimisation activated (A7v2: AMX only for f32 mmm with M>=32 AND N>=32; \
+             smaller shapes + all f32 mmv route to NEON kernels)"
+        );
+        // ----- A7v2 dispatch logic (data-driven) -----
+        //
+        // Empirical finding from /tmp/amx_vs_neon.md microbench (Apple M1 Pro):
+        // the AMX 32x32 kernel beats NEON 8x8 only when BOTH M and N are at
+        // least 32 — the AMX tile dimensions. At smaller shapes the per-tile
+        // padding waste + AMX dispatch overhead make NEON faster.
+        //
+        // Predicate validation: 88.3% accuracy on 512-shape sweep.
+        //
+        // Canary impact (measured 2026-05-13, see notes/tract-amx-low-m-investigation.md):
+        // turning AMX off entirely yielded:
+        //   df_dec       1.55× faster      mobilenetv2  1.59× faster
+        //   erb_dec      1.49×             squeezenet   1.22×
+        //   enc          1.17×             yolov8n      1.15× SLOWER
+        //   inception_v3 1.43× SLOWER      sam2_tiny    1.54× SLOWER
+        // The shape-aware predicate keeps the AMX wins for the heavy models
+        // (Inception, YOLO, SAM2) while routing small shapes to NEON.
+        ops.mmm_f32 = Box::new(|m, _, n| {
+            let big_enough = m.is_some_and(|m| m >= 32) && n.is_some_and(|n| n >= 32);
+            if big_enough { apple_amx_mmm_f32_32x32.mmm() } else { arm64simd_mmm_f32_8x8_gen.mmm() }
+        });
+        // mmv (n=1) f32: AMX 32x1 is dominated by NEON 64x1 across the entire
+        // shape sweep — confirmed by canary deltas on DFN3 (which is mmv-heavy).
+        // Always use NEON.
+        ops.mmv_f32 = Box::new(|_, _| arm64simd_mmm_f32_64x1_gen.mmm());
+
+        // ----- f16 paths kept conservative for now -----
+        //
+        // We didn't run the f16 microbench yet, so retain the original logic
+        // and the previous low-M-routes-to-NEON heuristic.
+        ops.mmm_f16 = Box::new(|m, _, _| {
+            if m.is_some_and(|m| m <= 16) {
+                arm64fp16_mmm_f16_16x8_gen.mmm()
+            } else {
+                apple_amx_mmm_f16_64x32.mmm()
+            }
+        });
         ops.mmv_f16 = Box::new(|_, _| apple_amx_mmm_f16_64x1.mmm());
-        ops.mmv_f32 = Box::new(|_, _| apple_amx_mmm_f32_32x1.mmm());
         ops.mmm_impls.extend_from_slice(&[
             apple_amx_mmm_f32_32x32.mmm(),
             apple_amx_mmm_f32_32x1.mmm(),
diff --git a/linalg/src/arm64/arm64simd.rs b/linalg/src/arm64/arm64simd.rs
index 646e7d1d6f..f90d92926f 100644
--- a/linalg/src/arm64/arm64simd.rs
+++ b/linalg/src/arm64/arm64simd.rs
@@ -1,14 +1,24 @@
 mod by_scalar;
+mod gelu;
+mod gelu_fused;
+mod hardswish;
 mod leaky_relu;
 mod max;
 mod panel_extract;
+mod silu;
+mod silu_fused;
 mod softmax;
 mod sum;
 mod unicast;
 
 pub use by_scalar::*;
+pub use gelu::arm64simd_gelu_f32_4n;
+pub use gelu_fused::arm64simd_gelu_f32_4n_fused;
+pub use hardswish::arm64simd_hardswish_f32_8n;
 pub use leaky_relu::arm64simd_leaky_relu_f32_8n;
 pub use max::arm64simd_max_f32_16n;
+pub use silu::arm64simd_silu_f32_4n;
+pub use silu_fused::arm64simd_silu_f32_4n_fused;
 pub use softmax::arm64simd_softmax2_fastcompact_f32_16n;
 pub use sum::arm64simd_sum_f32_16n;
 pub use unicast::*;
diff --git a/linalg/src/arm64/arm64simd/gelu.rs b/linalg/src/arm64/arm64simd/gelu.rs
new file mode 100644
index 0000000000..751bd143eb
--- /dev/null
+++ b/linalg/src/arm64/arm64simd/gelu.rs
@@ -0,0 +1,45 @@
+// Tanh-form GELU (pow=3) matching tract's GeluApproximate:
+//   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+//
+// Composed at the kernel level: save the original x, compute the tanh
+// argument, call tract's NEON tanh kernel in place, then finish with the
+// 0.5 * x * (1 + tanh) multiply. Chunked to keep the scratch buffer L1-resident.
+
+ew_impl_wrap!(
+    f32,
+    arm64simd_gelu_f32_4n,
+    4,
+    4,
+    (),
+    #[inline(never)]
+    fn run(buf: &mut [f32], _: ()) {
+        const SQRT_2_OVER_PI: f32 = 0.7978845608028654;
+        const COEF: f32 = 0.044715;
+        const CHUNK: usize = 256;
+        let mut scratch = [0f32; CHUNK];
+        let mut start = 0;
+        while start < buf.len() {
+            let end = (start + CHUNK).min(buf.len());
+            let chunk = &mut buf[start..end];
+            let n = chunk.len();
+            // Save original x and pre-compute the tanh argument in place.
+            for i in 0..n {
+                let x = chunk[i];
+                scratch[i] = x;
+                chunk[i] = SQRT_2_OVER_PI * (x + COEF * x * x * x);
+            }
+            super::arm64simd_tanh_f32_4n::run(chunk, ());
+            // chunk now holds tanh(arg). Combine with saved x.
+            for i in 0..n {
+                chunk[i] = 0.5 * scratch[i] * (1.0 + chunk[i]);
+            }
+            start = end;
+        }
+    }
+);
+
+#[cfg(test)]
+pub mod test_arm64simd_gelu_f32_4n {
+    use super::*;
+    gelu_frame_tests!(true, f32, arm64simd_gelu_f32_4n);
+}
diff --git a/linalg/src/arm64/arm64simd/gelu_fused.rs b/linalg/src/arm64/arm64simd/gelu_fused.rs
new file mode 100644
index 0000000000..10bff88cb0
--- /dev/null
+++ b/linalg/src/arm64/arm64simd/gelu_fused.rs
@@ -0,0 +1,292 @@
+// Fused GELU (tanh-form, pow=3):
+//   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+//
+// loop4 (16 lanes per iter) + loop1 (4-lane tail). Clones the tanh Padé
+// polynomial from arm64simd_tanh_f32_4n.S.j2, with pre-tanh argument
+// computation up front and the final 0.5*x*(1+tanh) combined via fmla.
+// Single memory pass (load + store), no scratch buffer.
+
+ew_impl_wrap!(
+    f32,
+    arm64simd_gelu_f32_4n_fused,
+    4,
+    4,
+    (),
+    #[inline(never)]
+    fn run(buf: &mut [f32], _: ()) {
+        // Tanh Padé coefficients (matches arm64simd_tanh_f32_4n.S.j2) +
+        // 3 GELU constants packed into the last vector lanes:
+        //   index 13: 0.5
+        //   index 14: sqrt(2/pi)              ≈ 0.7978846
+        //   index 15: 0.044715 * sqrt(2/pi)   ≈ 0.0356774
+        static COEFFS: [f32; 16] = [
+            -8.9,
+            8.9,
+            -8.488492677e-14,
+            5.277853000e-11,
+            -2.022500419e-8,
+            0.00001115424833,
+            0.003103950131,
+            0.1308400453,
+            0.9999999934,
+            0.0002546136580,
+            0.02449515379,
+            0.4641733162,
+            1.0,
+            0.5,
+            0.7978845608028654,
+            0.03567739613,
+        ];
+
+        assert!(buf.len() % 4 == 0);
+        if buf.is_empty() {
+            return;
+        }
+
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_mut_ptr();
+            let coef_ptr = COEFFS.as_ptr();
+
+            // Register layout (loop4):
+            //   v0-v3: coefficients
+            //   v4:    sqrt(2/pi)  (broadcast of v3.s[2])
+            //   v5:    tanh clamp low (-8.9, dup v0.s[0])
+            //   v6:    tanh clamp high (8.9, dup v0.s[1])
+            //   v7:    0.5 (broadcast of v3.s[1])
+            //   v8-v11: 0.5 * original x (saved after load)
+            //   v16-v19: working (load -> pre_tanh -> clamped -> numerator)
+            //   v20-v23: x² for tanh polynomial
+            //   v24-v27: polynomial intermediates (denominator at end)
+            //   v28-v31: polynomial intermediates (also x³ temp before tanh)
+            std::arch::asm!("
+                ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [{coef}]
+                dup v5.4s, v0.s[0]
+                dup v6.4s, v0.s[1]
+                dup v7.4s, v3.s[1]
+                dup v4.4s, v3.s[2]
+
+                cmp {len}, #16
+                blt 9f
+
+                1:
+                    ld1 {{ v16.4s, v17.4s, v18.4s, v19.4s }}, [{ptr}]
+
+                    // Save 0.5 * original x into v8-v11
+                    fmul v8.4s, v16.4s, v7.4s
+                    fmul v9.4s, v17.4s, v7.4s
+                    fmul v10.4s, v18.4s, v7.4s
+                    fmul v11.4s, v19.4s, v7.4s
+
+                    // Compute x^3 into v28-v31
+                    fmul v28.4s, v16.4s, v16.4s
+                    fmul v29.4s, v17.4s, v17.4s
+                    fmul v30.4s, v18.4s, v18.4s
+                    fmul v31.4s, v19.4s, v19.4s
+
+                    fmul v28.4s, v28.4s, v16.4s
+                    fmul v29.4s, v29.4s, v17.4s
+                    fmul v30.4s, v30.4s, v18.4s
+                    fmul v31.4s, v31.4s, v19.4s
+
+                    // pre_tanh = sqrt(2/pi)*x + 0.0356774 * x^3
+                    fmul v16.4s, v16.4s, v4.4s
+                    fmul v17.4s, v17.4s, v4.4s
+                    fmul v18.4s, v18.4s, v4.4s
+                    fmul v19.4s, v19.4s, v4.4s
+
+                    fmla v16.4s, v28.4s, v3.s[3]
+                    fmla v17.4s, v29.4s, v3.s[3]
+                    fmla v18.4s, v30.4s, v3.s[3]
+                    fmla v19.4s, v31.4s, v3.s[3]
+
+                    // Clamp pre_tanh argument
+                    fmax v16.4s, v16.4s, v5.4s
+                    fmax v17.4s, v17.4s, v5.4s
+                    fmax v18.4s, v18.4s, v5.4s
+                    fmax v19.4s, v19.4s, v5.4s
+
+                    fmin v16.4s, v16.4s, v6.4s
+                    fmin v17.4s, v17.4s, v6.4s
+                    fmin v18.4s, v18.4s, v6.4s
+                    fmin v19.4s, v19.4s, v6.4s
+
+                    // Tanh Padé polynomial (cloned from arm64simd_tanh_f32_4n.S.j2)
+                    fmul v20.4s, v16.4s, v16.4s
+                    fmul v21.4s, v17.4s, v17.4s
+                    fmul v22.4s, v18.4s, v18.4s
+                    fmul v23.4s, v19.4s, v19.4s
+
+                    dup v24.4s, v0.s[3]
+                    fmla v24.4s, v20.4s, v0.s[2]
+                    dup v25.4s, v0.s[3]
+                    fmla v25.4s, v21.4s, v0.s[2]
+                    dup v26.4s, v0.s[3]
+                    fmla v26.4s, v22.4s, v0.s[2]
+                    dup v27.4s, v0.s[3]
+                    fmla v27.4s, v23.4s, v0.s[2]
+
+                    dup v28.4s, v1.s[0]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v29.4s, v1.s[0]
+                    fmla v29.4s, v21.4s, v25.4s
+                    dup v30.4s, v1.s[0]
+                    fmla v30.4s, v22.4s, v26.4s
+                    dup v31.4s, v1.s[0]
+                    fmla v31.4s, v23.4s, v27.4s
+
+                    dup v24.4s, v1.s[1]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v25.4s, v1.s[1]
+                    fmla v25.4s, v21.4s, v29.4s
+                    dup v26.4s, v1.s[1]
+                    fmla v26.4s, v22.4s, v30.4s
+                    dup v27.4s, v1.s[1]
+                    fmla v27.4s, v23.4s, v31.4s
+
+                    dup v28.4s, v1.s[2]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v29.4s, v1.s[2]
+                    fmla v29.4s, v21.4s, v25.4s
+                    dup v30.4s, v1.s[2]
+                    fmla v30.4s, v22.4s, v26.4s
+                    dup v31.4s, v1.s[2]
+                    fmla v31.4s, v23.4s, v27.4s
+
+                    dup v24.4s, v1.s[3]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v25.4s, v1.s[3]
+                    fmla v25.4s, v21.4s, v29.4s
+                    dup v26.4s, v1.s[3]
+                    fmla v26.4s, v22.4s, v30.4s
+                    dup v27.4s, v1.s[3]
+                    fmla v27.4s, v23.4s, v31.4s
+
+                    dup v28.4s, v2.s[0]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v29.4s, v2.s[0]
+                    fmla v29.4s, v21.4s, v25.4s
+                    dup v30.4s, v2.s[0]
+                    fmla v30.4s, v22.4s, v26.4s
+                    dup v31.4s, v2.s[0]
+                    fmla v31.4s, v23.4s, v27.4s
+
+                    fmul v16.4s, v16.4s, v28.4s
+                    fmul v17.4s, v17.4s, v29.4s
+                    fmul v18.4s, v18.4s, v30.4s
+                    fmul v19.4s, v19.4s, v31.4s
+
+                    dup v24.4s, v2.s[2]
+                    fmla v24.4s, v20.4s, v2.s[1]
+                    dup v25.4s, v2.s[2]
+                    fmla v25.4s, v21.4s, v2.s[1]
+                    dup v26.4s, v2.s[2]
+                    fmla v26.4s, v22.4s, v2.s[1]
+                    dup v27.4s, v2.s[2]
+                    fmla v27.4s, v23.4s, v2.s[1]
+
+                    dup v28.4s, v2.s[3]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v29.4s, v2.s[3]
+                    fmla v29.4s, v21.4s, v25.4s
+                    dup v30.4s, v2.s[3]
+                    fmla v30.4s, v22.4s, v26.4s
+                    dup v31.4s, v2.s[3]
+                    fmla v31.4s, v23.4s, v27.4s
+
+                    dup v24.4s, v3.s[0]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v25.4s, v3.s[0]
+                    fmla v25.4s, v21.4s, v29.4s
+                    dup v26.4s, v3.s[0]
+                    fmla v26.4s, v22.4s, v30.4s
+                    dup v27.4s, v3.s[0]
+                    fmla v27.4s, v23.4s, v31.4s
+
+                    // tanh(pre_arg) = num/denom
+                    fdiv v16.4s, v16.4s, v24.4s
+                    fdiv v17.4s, v17.4s, v25.4s
+                    fdiv v18.4s, v18.4s, v26.4s
+                    fdiv v19.4s, v19.4s, v27.4s
+
+                    // result = 0.5*x * (1 + tanh) = (0.5*x) + (0.5*x) * tanh
+                    fmla v8.4s, v8.4s, v16.4s
+                    fmla v9.4s, v9.4s, v17.4s
+                    fmla v10.4s, v10.4s, v18.4s
+                    fmla v11.4s, v11.4s, v19.4s
+
+                    st1 {{ v8.4s, v9.4s, v10.4s, v11.4s }}, [{ptr}], #64
+                    sub {len}, {len}, #16
+                    cmp {len}, #16
+                    bge 1b
+
+                9:
+                cbz {len}, 3f
+
+                2:
+                    ld1 {{ v16.4s }}, [{ptr}]
+                    fmul v8.4s, v16.4s, v7.4s
+
+                    fmul v28.4s, v16.4s, v16.4s
+                    fmul v28.4s, v28.4s, v16.4s
+
+                    fmul v16.4s, v16.4s, v4.4s
+                    fmla v16.4s, v28.4s, v3.s[3]
+
+                    fmax v16.4s, v16.4s, v5.4s
+                    fmin v16.4s, v16.4s, v6.4s
+
+                    fmul v20.4s, v16.4s, v16.4s
+
+                    dup v24.4s, v0.s[3]
+                    fmla v24.4s, v20.4s, v0.s[2]
+                    dup v28.4s, v1.s[0]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v24.4s, v1.s[1]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v28.4s, v1.s[2]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v24.4s, v1.s[3]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v28.4s, v2.s[0]
+                    fmla v28.4s, v20.4s, v24.4s
+                    fmul v16.4s, v16.4s, v28.4s
+
+                    dup v24.4s, v2.s[2]
+                    fmla v24.4s, v20.4s, v2.s[1]
+                    dup v28.4s, v2.s[3]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v24.4s, v3.s[0]
+                    fmla v24.4s, v20.4s, v28.4s
+
+                    fdiv v16.4s, v16.4s, v24.4s
+
+                    fmla v8.4s, v8.4s, v16.4s
+
+                    st1 {{ v8.4s }}, [{ptr}], #16
+                    subs {len}, {len}, 4
+                    bne 2b
+
+                3:
+            ",
+            coef = in(reg) coef_ptr,
+            ptr = inout(reg) ptr => _,
+            len = inout(reg) len => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+            out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+            out("v16") _, out("v17") _, out("v18") _, out("v19") _,
+            out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+            out("v24") _, out("v25") _, out("v26") _, out("v27") _,
+            out("v28") _, out("v29") _, out("v30") _, out("v31") _,
+            options(nostack),
+            );
+        }
+    }
+);
+
+#[cfg(test)]
+pub mod test_arm64simd_gelu_f32_4n_fused {
+    use super::*;
+    gelu_frame_tests!(true, f32, arm64simd_gelu_f32_4n_fused);
+}
diff --git a/linalg/src/arm64/arm64simd/hardswish.rs b/linalg/src/arm64/arm64simd/hardswish.rs
new file mode 100644
index 0000000000..ad25ba6949
--- /dev/null
+++ b/linalg/src/arm64/arm64simd/hardswish.rs
@@ -0,0 +1,63 @@
+ew_impl_wrap!(
+    f32,
+    arm64simd_hardswish_f32_8n,
+    8,
+    4,
+    (),
+    #[inline(never)]
+    fn run(buf: &mut [f32], _: ()) {
+        assert!(buf.len() % 8 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            std::arch::asm!("
+                dup v0.4s, {three:v}.s[0]
+                dup v1.4s, {six:v}.s[0]
+                dup v2.4s, {inv6:v}.s[0]
+                movi v3.4s, #0
+                2:
+                    ldp q4, q5, [{ptr}]
+
+                    fadd v6.4s, v4.4s, v0.4s
+                    fadd v7.4s, v5.4s, v0.4s
+
+                    fmin v6.4s, v6.4s, v1.4s
+                    fmin v7.4s, v7.4s, v1.4s
+
+                    fmax v6.4s, v6.4s, v3.4s
+                    fmax v7.4s, v7.4s, v3.4s
+
+                    fmul v6.4s, v6.4s, v4.4s
+                    fmul v7.4s, v7.4s, v5.4s
+
+                    fmul v6.4s, v6.4s, v2.4s
+                    fmul v7.4s, v7.4s, v2.4s
+
+                    stp q6, q7, [{ptr}], #32
+                    subs {len}, {len}, 8
+                    bne 2b
+            ",
+            three = in(vreg) 3.0f32,
+            six = in(vreg) 6.0f32,
+            inv6 = in(vreg) 1.0f32 / 6.0f32,
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            out("v0") _,
+            out("v1") _,
+            out("v2") _,
+            out("v3") _,
+            out("q4") _,
+            out("q5") _,
+            out("q6") _,
+            out("q7") _,
+            );
+        }
+    }
+);
+
+#[cfg(test)]
+pub mod test_arm64simd_hardswish_f32_8n {
+    use super::*;
+    hardswish_frame_tests!(true, f32, arm64simd_hardswish_f32_8n);
+}
diff --git a/linalg/src/arm64/arm64simd/silu.rs b/linalg/src/arm64/arm64simd/silu.rs
new file mode 100644
index 0000000000..c07322c79e
--- /dev/null
+++ b/linalg/src/arm64/arm64simd/silu.rs
@@ -0,0 +1,34 @@
+ew_impl_wrap!(
+    f32,
+    arm64simd_silu_f32_4n,
+    4,
+    4,
+    (),
+    #[inline(never)]
+    fn run(buf: &mut [f32], _: ()) {
+        // SiLU(x) = x * sigmoid(x). Compose by saving the input chunk to a
+        // stack scratch buffer, running tract's NEON sigmoid kernel in place,
+        // then multiplying back by the saved original. Multiply loop
+        // auto-vectorises on aarch64.
+        const CHUNK: usize = 256;
+        let mut scratch = [0f32; CHUNK];
+        let mut start = 0;
+        while start < buf.len() {
+            let end = (start + CHUNK).min(buf.len());
+            let chunk = &mut buf[start..end];
+            let n = chunk.len();
+            scratch[..n].copy_from_slice(chunk);
+            super::arm64simd_sigmoid_f32_4n::run(chunk, ());
+            for i in 0..n {
+                chunk[i] *= scratch[i];
+            }
+            start = end;
+        }
+    }
+);
+
+#[cfg(test)]
+pub mod test_arm64simd_silu_f32_4n {
+    use super::*;
+    silu_frame_tests!(true, f32, arm64simd_silu_f32_4n);
+}
diff --git a/linalg/src/arm64/arm64simd/silu_fused.rs b/linalg/src/arm64/arm64simd/silu_fused.rs
new file mode 100644
index 0000000000..b45c33df5b
--- /dev/null
+++ b/linalg/src/arm64/arm64simd/silu_fused.rs
@@ -0,0 +1,246 @@
+// Fused SiLU: x * sigmoid(x).
+// loop4 (16 lanes per iter) + loop1 (4-lane tail).
+// Clones the sigmoid Padé polynomial from arm64simd_sigmoid_f32_4n.S.j2,
+// with the input saved before clamp (in v8-v11) and multiplied back at the
+// end. Single memory pass (load + store), no scratch buffer.
+
+ew_impl_wrap!(
+    f32,
+    arm64simd_silu_f32_4n_fused,
+    4,
+    4,
+    (),
+    #[inline(never)]
+    fn run(buf: &mut [f32], _: ()) {
+        // Sigmoid Padé coefficients (matches arm64simd_sigmoid_f32_4n.S.j2).
+        static COEFFS: [f32; 16] = [
+            -18.6,
+            18.6,
+            -4.433153405e-18,
+            1.169974371e-14,
+            -1.875289645e-11,
+            4.257889523e-8,
+            0.00004811817576,
+            0.008163842030,
+            0.2499999971,
+            3.922935744e-6,
+            0.001524872358,
+            0.1159886749,
+            1.0,
+            0.5,
+            0.0,
+            0.0,
+        ];
+
+        assert!(buf.len() % 4 == 0);
+        if buf.is_empty() {
+            return;
+        }
+
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_mut_ptr();
+            let coef_ptr = COEFFS.as_ptr();
+
+            std::arch::asm!("
+                ld1 {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [{coef}]
+                dup v5.4s, v0.s[0]
+                dup v6.4s, v0.s[1]
+                dup v7.4s, v3.s[1]
+
+                cmp {len}, #16
+                blt 9f
+
+                1:
+                    ld1 {{ v16.4s, v17.4s, v18.4s, v19.4s }}, [{ptr}]
+
+                    mov v8.16b, v16.16b
+                    mov v9.16b, v17.16b
+                    mov v10.16b, v18.16b
+                    mov v11.16b, v19.16b
+
+                    fmax v16.4s, v16.4s, v5.4s
+                    fmax v17.4s, v17.4s, v5.4s
+                    fmax v18.4s, v18.4s, v5.4s
+                    fmax v19.4s, v19.4s, v5.4s
+
+                    fmin v16.4s, v16.4s, v6.4s
+                    fmin v17.4s, v17.4s, v6.4s
+                    fmin v18.4s, v18.4s, v6.4s
+                    fmin v19.4s, v19.4s, v6.4s
+
+                    fmul v20.4s, v16.4s, v16.4s
+                    fmul v21.4s, v17.4s, v17.4s
+                    fmul v22.4s, v18.4s, v18.4s
+                    fmul v23.4s, v19.4s, v19.4s
+
+                    dup v24.4s, v0.s[3]
+                    fmla v24.4s, v20.4s, v0.s[2]
+                    dup v25.4s, v0.s[3]
+                    fmla v25.4s, v21.4s, v0.s[2]
+                    dup v26.4s, v0.s[3]
+                    fmla v26.4s, v22.4s, v0.s[2]
+                    dup v27.4s, v0.s[3]
+                    fmla v27.4s, v23.4s, v0.s[2]
+
+                    dup v28.4s, v1.s[0]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v29.4s, v1.s[0]
+                    fmla v29.4s, v21.4s, v25.4s
+                    dup v30.4s, v1.s[0]
+                    fmla v30.4s, v22.4s, v26.4s
+                    dup v31.4s, v1.s[0]
+                    fmla v31.4s, v23.4s, v27.4s
+
+                    dup v24.4s, v1.s[1]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v25.4s, v1.s[1]
+                    fmla v25.4s, v21.4s, v29.4s
+                    dup v26.4s, v1.s[1]
+                    fmla v26.4s, v22.4s, v30.4s
+                    dup v27.4s, v1.s[1]
+                    fmla v27.4s, v23.4s, v31.4s
+
+                    dup v28.4s, v1.s[2]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v29.4s, v1.s[2]
+                    fmla v29.4s, v21.4s, v25.4s
+                    dup v30.4s, v1.s[2]
+                    fmla v30.4s, v22.4s, v26.4s
+                    dup v31.4s, v1.s[2]
+                    fmla v31.4s, v23.4s, v27.4s
+
+                    dup v24.4s, v1.s[3]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v25.4s, v1.s[3]
+                    fmla v25.4s, v21.4s, v29.4s
+                    dup v26.4s, v1.s[3]
+                    fmla v26.4s, v22.4s, v30.4s
+                    dup v27.4s, v1.s[3]
+                    fmla v27.4s, v23.4s, v31.4s
+
+                    dup v28.4s, v2.s[0]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v29.4s, v2.s[0]
+                    fmla v29.4s, v21.4s, v25.4s
+                    dup v30.4s, v2.s[0]
+                    fmla v30.4s, v22.4s, v26.4s
+                    dup v31.4s, v2.s[0]
+                    fmla v31.4s, v23.4s, v27.4s
+
+                    fmul v16.4s, v16.4s, v28.4s
+                    fmul v17.4s, v17.4s, v29.4s
+                    fmul v18.4s, v18.4s, v30.4s
+                    fmul v19.4s, v19.4s, v31.4s
+
+                    dup v24.4s, v2.s[2]
+                    fmla v24.4s, v20.4s, v2.s[1]
+                    dup v25.4s, v2.s[2]
+                    fmla v25.4s, v21.4s, v2.s[1]
+                    dup v26.4s, v2.s[2]
+                    fmla v26.4s, v22.4s, v2.s[1]
+                    dup v27.4s, v2.s[2]
+                    fmla v27.4s, v23.4s, v2.s[1]
+
+                    dup v28.4s, v2.s[3]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v29.4s, v2.s[3]
+                    fmla v29.4s, v21.4s, v25.4s
+                    dup v30.4s, v2.s[3]
+                    fmla v30.4s, v22.4s, v26.4s
+                    dup v31.4s, v2.s[3]
+                    fmla v31.4s, v23.4s, v27.4s
+
+                    dup v24.4s, v3.s[0]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v25.4s, v3.s[0]
+                    fmla v25.4s, v21.4s, v29.4s
+                    dup v26.4s, v3.s[0]
+                    fmla v26.4s, v22.4s, v30.4s
+                    dup v27.4s, v3.s[0]
+                    fmla v27.4s, v23.4s, v31.4s
+
+                    fdiv v16.4s, v16.4s, v24.4s
+                    fdiv v17.4s, v17.4s, v25.4s
+                    fdiv v18.4s, v18.4s, v26.4s
+                    fdiv v19.4s, v19.4s, v27.4s
+
+                    fadd v16.4s, v16.4s, v7.4s
+                    fadd v17.4s, v17.4s, v7.4s
+                    fadd v18.4s, v18.4s, v7.4s
+                    fadd v19.4s, v19.4s, v7.4s
+
+                    fmul v16.4s, v16.4s, v8.4s
+                    fmul v17.4s, v17.4s, v9.4s
+                    fmul v18.4s, v18.4s, v10.4s
+                    fmul v19.4s, v19.4s, v11.4s
+
+                    st1 {{ v16.4s, v17.4s, v18.4s, v19.4s }}, [{ptr}], #64
+                    sub {len}, {len}, #16
+                    cmp {len}, #16
+                    bge 1b
+
+                9:
+                cbz {len}, 3f
+
+                2:
+                    ld1 {{ v16.4s }}, [{ptr}]
+                    mov v8.16b, v16.16b
+
+                    fmax v16.4s, v16.4s, v5.4s
+                    fmin v16.4s, v16.4s, v6.4s
+                    fmul v20.4s, v16.4s, v16.4s
+
+                    dup v24.4s, v0.s[3]
+                    fmla v24.4s, v20.4s, v0.s[2]
+                    dup v28.4s, v1.s[0]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v24.4s, v1.s[1]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v28.4s, v1.s[2]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v24.4s, v1.s[3]
+                    fmla v24.4s, v20.4s, v28.4s
+                    dup v28.4s, v2.s[0]
+                    fmla v28.4s, v20.4s, v24.4s
+                    fmul v16.4s, v16.4s, v28.4s
+
+                    dup v24.4s, v2.s[2]
+                    fmla v24.4s, v20.4s, v2.s[1]
+                    dup v28.4s, v2.s[3]
+                    fmla v28.4s, v20.4s, v24.4s
+                    dup v24.4s, v3.s[0]
+                    fmla v24.4s, v20.4s, v28.4s
+
+                    fdiv v16.4s, v16.4s, v24.4s
+                    fadd v16.4s, v16.4s, v7.4s
+
+                    fmul v16.4s, v16.4s, v8.4s
+
+                    st1 {{ v16.4s }}, [{ptr}], #16
+                    subs {len}, {len}, 4
+                    bne 2b
+
+                3:
+            ",
+            coef = in(reg) coef_ptr,
+            ptr = inout(reg) ptr => _,
+            len = inout(reg) len => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+            out("v5") _, out("v6") _, out("v7") _,
+            out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+            out("v16") _, out("v17") _, out("v18") _, out("v19") _,
+            out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+            out("v24") _, out("v25") _, out("v26") _, out("v27") _,
+            out("v28") _, out("v29") _, out("v30") _, out("v31") _,
+            options(nostack),
+            );
+        }
+    }
+);
+
+#[cfg(test)]
+pub mod test_arm64simd_silu_f32_4n_fused {
+    use super::*;
+    silu_frame_tests!(true, f32, arm64simd_silu_f32_4n_fused);
+}
diff --git a/linalg/src/arm64/sme.rs b/linalg/src/arm64/sme.rs
new file mode 100644
index 0000000000..5b8b194d6c
--- /dev/null
+++ b/linalg/src/arm64/sme.rs
@@ -0,0 +1,286 @@
+use crate::Ops;
+use crate::frame::mmm::ImplementationQuality::ManuallyOptimized;
+use crate::mmm::*;
+
+// CAN_FUSE: everything except LeakyRelu / QScale / RoundingShiftRight /
+// ShiftLeft. LoadTile, AddUnicast, AddRowColProducts, per-row/col/scalar
+// arithmetic, Clear, Store, AddMatMul are all in. (Matches AMX
+// `apple_amx.rs` CAN_FUSE, minus the i32-only quantization ops.)
+const CAN_FUSE: fn(&FusedSpec) -> bool = |f| {
+    !matches!(
+        f,
+        FusedSpec::LeakyRelu(_)
+            | FusedSpec::QScale(_, _, _)
+            | FusedSpec::RoundingShiftRight(_, _)
+            | FusedSpec::ShiftLeft(_)
+    )
+};
+
+const SME: fn() -> bool = has_sme;
+const SME2: fn() -> bool = has_sme2;
+
+// Streaming vector length in bytes, read via `RDSVL x0, #1` (encoding
+// 0x04bf5820). RDSVL is legal in non-streaming mode, but is UNDEFINED
+// unless FEAT_SME is implemented — callers MUST confirm FEAT_SME first
+// (sysctl on macOS, HWCAP2 on Linux) or this SIGILLs.
+#[cfg(any(target_os = "macos", target_os = "linux"))]
+unsafe fn streaming_vector_bytes() -> u64 {
+    let svl: u64;
+    unsafe {
+        std::arch::asm!(
+            ".inst 0x04bf5820", // rdsvl x0, #1
+            out("x0") svl,
+            options(nomem, nostack, preserves_flags),
+        );
+    }
+    svl
+}
+
+// Our SME kernels hardcode a 512-bit streaming vector length (16 f32 lanes
+// per ZA.S slice — the 32x32 and 64x1 tile geometries depend on it). A host
+// that advertises FEAT_SME with a different SVL would run the kernels with
+// mismatched geometry and produce silently-wrong results. The prime offender
+// is qemu-aarch64 user-mode emulation, which sets HWCAP2_SME / HWCAP2_SME2
+// but uses a non-512 SVL — that is exactly what makes the cross-compiled
+// aarch64 CI jobs (run under QEMU) fail. Reject any non-512 SVL here so we
+// fall back to the portable path. MUST only be called once FEAT_SME is known
+// present.
+#[cfg(any(target_os = "macos", target_os = "linux"))]
+fn sme_geometry_supported() -> bool {
+    // SVL = 512 bits = 64 bytes.
+    unsafe { streaming_vector_bytes() == 64 }
+}
+
+MMMExternKernel!(
+    sme_mmm_f32_32x32<f32>(32, 32)@(128, 128)
+    where(SME)
+    can_fuse(CAN_FUSE)
+    quality(ManuallyOptimized)
+);
+
+MMMExternKernel!(
+    sme_mmv_f32_64x1<f32>(64, 1)@(128, 128)
+    where(SME2)
+    can_fuse(CAN_FUSE)
+    quality(ManuallyOptimized)
+);
+
+#[cfg(target_os = "macos")]
+pub fn has_sme() -> bool {
+    // TRACT_SME_DISABLE=1 forces the SME path off so callers can A/B
+    // against the AMX path on the same binary.
+    if std::env::var_os("TRACT_SME_DISABLE").is_some() {
+        return false;
+    }
+    // hw.optional.arm.FEAT_SME is an INTEGER sysctl, not a string. The
+    // generic apple_get_syscall reads bytes-as-C-string which fails here
+    // (`\x01\x00\x00\x00` would compare against the ASCII "1"), so we
+    // read it as a u64 directly.
+    use std::ffi::{CString, c_char, c_int, c_void};
+    use std::ptr::null_mut;
+    unsafe extern "C" {
+        fn sysctlbyname(
+            name: *const c_char,
+            oldp: *mut c_void,
+            oldlenp: *mut usize,
+            newp: *mut c_void,
+            newlen: usize,
+        ) -> c_int;
+    }
+    let Ok(name) = CString::new("hw.optional.arm.FEAT_SME") else {
+        return false;
+    };
+    let mut value: u64 = 0;
+    let mut len: usize = std::mem::size_of::<u64>();
+    unsafe {
+        if sysctlbyname(name.as_ptr(), &mut value as *mut _ as *mut c_void, &mut len, null_mut(), 0)
+            != 0
+        {
+            return false;
+        }
+    }
+    // FEAT_SME present AND the streaming vector length matches our kernels'
+    // hardcoded 512-bit geometry.
+    value != 0 && sme_geometry_supported()
+}
+
+#[cfg(target_os = "linux")]
+pub fn has_sme() -> bool {
+    // HWCAP2_SME = 1 << 23 on aarch64 (kernel ABI).
+    const HWCAP2_SME: u64 = 1 << 23;
+    unsafe extern "C" {
+        fn getauxval(t: u64) -> u64;
+    }
+    const AT_HWCAP2: u64 = 26;
+    let feat = unsafe { (getauxval(AT_HWCAP2) & HWCAP2_SME) != 0 };
+    // FEAT_SME present AND the streaming vector length matches our kernels'
+    // hardcoded 512-bit geometry (rejects qemu-user, which advertises SME
+    // with a non-512 SVL — the cause of the cross-compiled CI failures).
+    feat && sme_geometry_supported()
+}
+
+#[cfg(not(any(target_os = "macos", target_os = "linux")))]
+pub fn has_sme() -> bool {
+    false
+}
+
+#[cfg(target_os = "macos")]
+pub fn has_sme2() -> bool {
+    // TRACT_SME_DISABLE=1 disables both SME and SME2 dispatch on the same
+    // binary so end users can A/B the entire SME backend.
+    if std::env::var_os("TRACT_SME_DISABLE").is_some() {
+        return false;
+    }
+    use std::ffi::{CString, c_char, c_int, c_void};
+    use std::ptr::null_mut;
+    unsafe extern "C" {
+        fn sysctlbyname(
+            name: *const c_char,
+            oldp: *mut c_void,
+            oldlenp: *mut usize,
+            newp: *mut c_void,
+            newlen: usize,
+        ) -> c_int;
+    }
+    let Ok(name) = CString::new("hw.optional.arm.FEAT_SME2") else {
+        return false;
+    };
+    let mut value: u64 = 0;
+    let mut len: usize = std::mem::size_of::<u64>();
+    unsafe {
+        if sysctlbyname(name.as_ptr(), &mut value as *mut _ as *mut c_void, &mut len, null_mut(), 0)
+            != 0
+        {
+            return false;
+        }
+    }
+    // FEAT_SME2 present AND the streaming vector length matches our kernels'
+    // hardcoded 512-bit geometry.
+    value != 0 && sme_geometry_supported()
+}
+
+#[cfg(target_os = "linux")]
+pub fn has_sme2() -> bool {
+    // HWCAP2_SME2 = 1 << 37 on aarch64 (kernel ABI).
+    const HWCAP2_SME2: u64 = 1 << 37;
+    unsafe extern "C" {
+        fn getauxval(t: u64) -> u64;
+    }
+    const AT_HWCAP2: u64 = 26;
+    let feat = unsafe { (getauxval(AT_HWCAP2) & HWCAP2_SME2) != 0 };
+    // FEAT_SME2 present AND the streaming vector length matches our kernels'
+    // hardcoded 512-bit geometry (rejects qemu-user, which advertises SME2
+    // with a non-512 SVL — the cause of the cross-compiled CI failures).
+    feat && sme_geometry_supported()
+}
+
+#[cfg(not(any(target_os = "macos", target_os = "linux")))]
+pub fn has_sme2() -> bool {
+    false
+}
+
+pub fn plug(ops: &mut Ops) {
+    if has_sme() {
+        log::info!("SME optimisation activated");
+        ops.mmm_f32 = Box::new(|_, _, _| sme_mmm_f32_32x32.mmm());
+        ops.mmm_impls.extend_from_slice(&[sme_mmm_f32_32x32.mmm()]);
+    }
+    if has_sme2() {
+        log::info!("SME2 GEMV optimisation activated");
+        ops.mmv_f32 = Box::new(|_, _| sme_mmv_f32_64x1.mmm());
+        ops.mmm_impls.extend_from_slice(&[sme_mmv_f32_64x1.mmm()]);
+    }
+    if !has_sme() && !has_sme2() {
+        log::info!("No SME optimisation");
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::frame::mmm::tests::packed_packed::PackedPackedProblem;
+    use tract_data::internal::Approximation;
+
+    // Phase 1A correctness: AddMatMul + Clear + Store + Done on a few
+    // shapes. Bypasses auto-tests (SME_OFF) by calling run/reference
+    // directly. Skipped if hardware lacks SME.
+    fn check_shape(m_tile: usize, k: usize, n_tile: usize) {
+        const MR: usize = 32;
+        const NR: usize = 32;
+        let m = m_tile * MR;
+        let n = n_tile * NR;
+        let a: Vec<f32> = (0..m * k).map(|i| (i as f32 * 0.013) - 1.5).collect();
+        let b: Vec<f32> = (0..k * n).map(|i| (i as f32 * 0.017) + 0.25).collect();
+        let pb = PackedPackedProblem::kernel(&*sme_mmm_f32_32x32, 0, a, b);
+        let expected = pb.reference().expect("scalar reference");
+        let found = pb.run().expect("SME kernel run");
+        found
+            .close_enough(&expected, Approximation::Approximate)
+            .unwrap_or_else(|e| panic!("SME mmm mismatch at k={k}: {e}"));
+    }
+
+    #[test]
+    fn sme_mmm_f32_32x32_k1() {
+        if !has_sme() {
+            eprintln!("SME not present, skipping");
+            return;
+        }
+        check_shape(1, 1, 1);
+    }
+
+    #[test]
+    fn sme_mmm_f32_32x32_k8() {
+        if !has_sme() {
+            return;
+        }
+        check_shape(1, 8, 1);
+    }
+
+    #[test]
+    fn sme_mmm_f32_32x32_k128() {
+        if !has_sme() {
+            return;
+        }
+        check_shape(1, 128, 1);
+    }
+
+    #[test]
+    fn sme_mmm_f32_32x32_multi_tile() {
+        if !has_sme() {
+            return;
+        }
+        // 64x64 output (2x2 tiles), K=64 — exercises the framework
+        // iterating across multiple kernel calls.
+        check_shape(2, 64, 2);
+    }
+
+    // Strided store path: hand-built Clear + Store chain with non-contig C.
+    #[test]
+    fn sme_store_non_contiguous() {
+        if !has_sme() {
+            return;
+        }
+        use crate::frame::mmm::{FusedKerSpec, OutputStoreKer};
+        const MR: usize = 32;
+        const NR: usize = 32;
+        let mut v: Vec<f32> = vec![f32::MAX; MR * 5 * NR * 3];
+        let c = OutputStoreKer {
+            ptr: v.as_mut_ptr() as _,
+            row_byte_stride: (4 * 3 * NR * 5) as isize,
+            col_byte_stride: 4 * 3,
+            item_size: 4,
+        };
+        let non_linear = [FusedKerSpec::<f32>::Clear, FusedKerSpec::Store(c), FusedKerSpec::Done];
+        let err = unsafe { (sme_mmm_f32_32x32.kernel)(&non_linear) };
+        assert_eq!(err, 0, "kernel returned non-zero error code");
+        let mut expected = vec![f32::MAX; v.len()];
+        for col in 0..NR {
+            for row in 0..MR {
+                expected[col * 3 + row * 3 * 5 * NR] = 0.0;
+            }
+        }
+        for (i, (got, exp)) in v.iter().zip(expected.iter()).enumerate() {
+            assert_eq!(got, exp, "mismatch at idx {i}: got {got} expected {exp}");
+        }
+    }
+}
diff --git a/linalg/src/arm64/sve.rs b/linalg/src/arm64/sve.rs
new file mode 100644
index 0000000000..a312a8b205
--- /dev/null
+++ b/linalg/src/arm64/sve.rs
@@ -0,0 +1,229 @@
+use crate::Ops;
+
+// `tract_sve` is set by build.rs only on aarch64-linux when the C compiler
+// supports SVE intrinsics. The kernel registration + extern live behind it so
+// non-SVE builds never reference the (absent) C symbol.
+#[cfg(tract_sve)]
+use crate::frame::mmm::ImplementationQuality::ManuallyOptimized;
+#[cfg(tract_sve)]
+use crate::mmm::*;
+#[cfg(tract_sve)]
+use crate::pack::PackedFormat;
+// Explicit import so `f16` is tract's half::f16 (LADatum), not rustc's builtin
+// primitive f16 — a glob import would not shadow the primitive.
+#[cfg(tract_sve)]
+use tract_data::prelude::f16;
+
+// f32 SVE kernel can't do LeakyRelu or the i32 quantization ops (matches the
+// arm64simd / SME f32 CAN_FUSE).
+#[cfg(tract_sve)]
+const CAN_FUSE: fn(&FusedSpec) -> bool = |f| {
+    !matches!(
+        f,
+        FusedSpec::LeakyRelu(_)
+            | FusedSpec::QScale(_, _, _)
+            | FusedSpec::RoundingShiftRight(_, _)
+            | FusedSpec::ShiftLeft(_)
+    )
+};
+
+// The i32 quantized kernel keeps the quantization fuse ops (QScale /
+// RoundingShiftRight / ShiftLeft) — they are the whole point of a quantized
+// kernel — and excludes only LeakyRelu (matches arm64simd's i32 surface; i32
+// LeakyRelu has no practical use and the C kernel does not implement it).
+#[cfg(tract_sve)]
+const CAN_FUSE_I32: fn(&FusedSpec) -> bool = |f| !matches!(f, FusedSpec::LeakyRelu(_));
+
+#[cfg(tract_sve)]
+const SVE2: fn() -> bool = has_sve2;
+
+// The f16 kernels need FEAT_SVE2 AND FEAT_FP16 (native f16 arithmetic).
+#[cfg(tract_sve)]
+const SVE2_FP16: fn() -> bool = || has_sve2() && crate::arm64::has_fp16();
+
+// The VLA SVE f32 GEMM kernel, implemented in C (arm64/sve/sve_mmm_f32.c) since
+// Rust has no stable SVE intrinsics. Broadcast-A rank-1 update, N-tile walked in
+// svcntw() chunks → correct and full-width at any VL.
+#[cfg(tract_sve)]
+mod sve_sys {
+    use crate::frame::mmm::FusedKerSpec;
+    use tract_data::prelude::f16;
+    unsafe extern "C" {
+        pub fn sve_mmm_f32_kernel(ops: *const FusedKerSpec<f32>) -> isize;
+        pub fn sve_mmv_f32_64x1_kernel(ops: *const FusedKerSpec<f32>) -> isize;
+        pub fn sve_mmm_i32_kernel(ops: *const FusedKerSpec<i32>) -> isize;
+        pub fn sve_mmm_i32_64x1_kernel(ops: *const FusedKerSpec<i32>) -> isize;
+        pub fn sve_mmm_f16_kernel(ops: *const FusedKerSpec<f16>) -> isize;
+        pub fn sve_mmv_f16_64x1_kernel(ops: *const FusedKerSpec<f16>) -> isize;
+    }
+}
+
+#[cfg(tract_sve)]
+MMMRustKernel!(sve_sys::sve_mmm_f32_kernel => sve_mmm_f32_8x8<f32>(8, 8)
+    where(SVE2)
+    can_fuse(CAN_FUSE)
+    quality(ManuallyOptimized)
+);
+
+// The VLA SVE f32 GEMV kernel (arm64/sve/sve_mmv_f32_64x1.c), MR=64 NR=1,
+// dispatched when N == 1 (matrix x f32 column vector). Wired to ops.mmv_f32.
+#[cfg(tract_sve)]
+MMMRustKernel!(sve_sys::sve_mmv_f32_64x1_kernel => sve_mmv_f32_64x1<f32>(64, 1)
+    where(SVE2)
+    can_fuse(CAN_FUSE)
+    quality(ManuallyOptimized)
+);
+
+// The VLA SVE int8 -> int32 GEMM kernel (arm64/sve/sve_mmm_i32.c). Consumes
+// tract's native i8i8 K-major packing via the widening rank-1 update (svld1sb +
+// svmla), and supports the i32 quantization fuse ops. Wired to ops.qmmm_i32.
+#[cfg(tract_sve)]
+MMMRustKernel!(sve_sys::sve_mmm_i32_kernel => sve_mmm_i32_8x8<i32>(8, 8)
+    where(SVE2)
+    can_fuse(CAN_FUSE_I32)
+    packing[1] = i8i8 => |k| k.with_packing(
+        PackedFormat::new(DatumType::I8, 8, 16),
+        PackedFormat::new(DatumType::I8, 8, 16),
+    );
+    quality(ManuallyOptimized)
+    store(i8)
+);
+
+// The VLA SVE int8 -> int32 GEMV kernel (arm64/sve/sve_mmm_i32_64x1.c), MR=64
+// NR=1, dispatched when N == 1. Same widening update vectorized over M. Wired to
+// ops.qmmv_i32.
+#[cfg(tract_sve)]
+MMMRustKernel!(sve_sys::sve_mmm_i32_64x1_kernel => sve_mmm_i32_64x1<i32>(64, 1)
+    where(SVE2)
+    can_fuse(CAN_FUSE_I32)
+    packing[1] = i8i8 => |k| k.with_packing(
+        PackedFormat::new(DatumType::I8, 64, 16),
+        PackedFormat::new(DatumType::I8, 1, 1),
+    );
+    quality(ManuallyOptimized)
+    store(i8)
+);
+
+// The VLA SVE f16 GEMM kernel (arm64/sve/sve_mmm_f16.c), native f16 FMA, gated on
+// SVE2 + FP16. Wired to ops.mmm_f16 when has_fp16().
+#[cfg(tract_sve)]
+MMMRustKernel!(sve_sys::sve_mmm_f16_kernel => sve_mmm_f16_8x8<f16>(8, 8)
+    where(SVE2_FP16)
+    can_fuse(CAN_FUSE)
+    quality(ManuallyOptimized)
+);
+
+// The VLA SVE f16 GEMV kernel (arm64/sve/sve_mmv_f16_64x1.c), MR=64 NR=1,
+// dispatched when N == 1. Wired to ops.mmv_f16 when has_fp16().
+#[cfg(tract_sve)]
+MMMRustKernel!(sve_sys::sve_mmv_f16_64x1_kernel => sve_mmv_f16_64x1<f16>(64, 1)
+    where(SVE2_FP16)
+    can_fuse(CAN_FUSE)
+    quality(ManuallyOptimized)
+);
+
+// SVE / SVE2 backend.
+//
+// Unlike SME (Apple M4) and AMX (Apple), SVE/SVE2 is NOT present on any Apple
+// silicon — it lives on ARMv9 server/mobile cores (Neoverse V1+/N2+, Cortex-X2+
+// / A510+, Graviton 3/4). So detection is Linux-only in practice; macOS always
+// returns false.
+//
+// The kernels are vector-length-agnostic (VLA): they read the vector width at
+// runtime via `whilelt` predication and `svcntw()`, so a single kernel is
+// correct at every VL (128..2048-bit). That means — unlike the SME kernels,
+// which hardcoded SVL=512 and needed an RDSVL gate — the SVE kernels need NO
+// vector-length gate for correctness. `rdvl_bytes()` is provided only for
+// optional VL-matched dispatch (selecting a wider-tiled variant when the
+// hardware VL is large), not for correctness.
+
+#[cfg(target_os = "linux")]
+pub fn has_sve() -> bool {
+    if std::env::var_os("TRACT_SVE_DISABLE").is_some() {
+        return false;
+    }
+    // HWCAP_SVE = 1 << 22 on aarch64 (kernel ABI).
+    const HWCAP_SVE: u64 = 1 << 22;
+    unsafe extern "C" {
+        fn getauxval(t: u64) -> u64;
+    }
+    const AT_HWCAP: u64 = 16;
+    unsafe { (getauxval(AT_HWCAP) & HWCAP_SVE) != 0 }
+}
+
+#[cfg(not(target_os = "linux"))]
+pub fn has_sve() -> bool {
+    // No Apple silicon implements SVE; no SVE on non-Linux targets we support.
+    false
+}
+
+#[cfg(target_os = "linux")]
+pub fn has_sve2() -> bool {
+    if std::env::var_os("TRACT_SVE_DISABLE").is_some() {
+        return false;
+    }
+    // HWCAP2_SVE2 = 1 << 1 on aarch64 (kernel ABI).
+    const HWCAP2_SVE2: u64 = 1 << 1;
+    unsafe extern "C" {
+        fn getauxval(t: u64) -> u64;
+    }
+    const AT_HWCAP2: u64 = 26;
+    unsafe { (getauxval(AT_HWCAP2) & HWCAP2_SVE2) != 0 }
+}
+
+#[cfg(not(target_os = "linux"))]
+pub fn has_sve2() -> bool {
+    false
+}
+
+/// SVE vector length in bytes, via `RDVL x0, #1` (encoding 0x04bf5020).
+/// Legal whenever FEAT_SVE is implemented; callers MUST confirm `has_sve()`
+/// first (RDVL is UNDEFINED without SVE and would SIGILL). Used only for
+/// optional VL-matched kernel selection — VLA kernels do not need it.
+#[cfg(target_os = "linux")]
+#[allow(dead_code)]
+pub fn rdvl_bytes() -> u64 {
+    let vl: u64;
+    unsafe {
+        std::arch::asm!(
+            ".inst 0x04bf5020", // rdvl x0, #1
+            out("x0") vl,
+            options(nomem, nostack, preserves_flags),
+        );
+    }
+    vl
+}
+
+pub fn plug(ops: &mut Ops) {
+    let _ = ops;
+    if has_sve2() {
+        #[cfg(target_os = "linux")]
+        log::info!("SVE2 optimisation available (VL = {} bytes)", rdvl_bytes());
+        #[cfg(tract_sve)]
+        {
+            // Force the SVE kernels for f32 mmm and i32 quantized mmm (mirrors the
+            // SME backend) and also register them as candidates. TRACT_SVE_DISABLE=1
+            // already turns the whole thing off via has_sve2().
+            ops.mmm_f32 = Box::new(|_, _, _| sve_mmm_f32_8x8.mmm());
+            ops.mmv_f32 = Box::new(|_, _| sve_mmv_f32_64x1.mmm());
+            ops.qmmm_i32 = Box::new(|_, _, _| sve_mmm_i32_8x8.mmm());
+            ops.qmmv_i32 = Box::new(|_, _| sve_mmm_i32_64x1.mmm());
+            ops.mmm_impls.extend_from_slice(&[
+                sve_mmm_f32_8x8.mmm(),
+                sve_mmv_f32_64x1.mmm(),
+                sve_mmm_i32_8x8.mmm(),
+                sve_mmm_i32_64x1.mmm(),
+            ]);
+            // f16 kernels additionally require FEAT_FP16.
+            if crate::arm64::has_fp16() {
+                ops.mmm_f16 = Box::new(|_, _, _| sve_mmm_f16_8x8.mmm());
+                ops.mmv_f16 = Box::new(|_, _| sve_mmv_f16_64x1.mmm());
+                ops.mmm_impls.extend_from_slice(&[sve_mmm_f16_8x8.mmm(), sve_mmv_f16_64x1.mmm()]);
+            }
+        }
+    } else if has_sve() {
+        log::info!("SVE (v1) present; SVE2 kernels not enabled");
+    } else {
+        log::info!("No SVE optimisation");
+    }
+}
diff --git a/linalg/src/frame/gelu.rs b/linalg/src/frame/gelu.rs
new file mode 100644
index 0000000000..364efcbb56
--- /dev/null
+++ b/linalg/src/frame/gelu.rs
@@ -0,0 +1,61 @@
+#[allow(unused_macros)]
+macro_rules! gelu_impl {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => {
+        ew_impl!($ti, $func, $nr, $alignment_items);
+        #[cfg(test)]
+        paste! {
+            mod [<test_ $func>] {
+                use super::*;
+                gelu_frame_tests!($cond, $ti, $func);
+            }
+        }
+    };
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::LADatum;
+    use crate::frame::element_wise::*;
+    use num_traits::{AsPrimitive, Float};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! gelu_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn prop(xs in proptest::collection::vec(-10f32..10.0, 0..100)) {
+                    if $cond {
+                        $crate::frame::gelu::test::test_gelu::<$ker, $t>(&*xs).unwrap()
+                    }
+                }
+            }
+            #[test]
+            fn trivial() {
+                if $cond {
+                    $crate::frame::gelu::test::test_gelu::<$ker, $t>(&[-5f32, -1.0, 0.0, 1.0, 5.0])
+                        .unwrap();
+                }
+            }
+        };
+    }
+
+    pub fn test_gelu<K: ElementWiseKer<T>, T: LADatum + Float>(values: &[f32]) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        let data = tract_data::prelude::tensor1(values);
+        let data = data.cast_to::<T>().unwrap();
+        let data = data.try_as_plain().unwrap().as_slice::<T>().unwrap();
+        // Tanh-form GELU (pow=3): 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+        crate::frame::element_wise::test::test_element_wise::<K, T, _>(data, |x: T| {
+            let half: T = 0.5f32.as_();
+            let one: T = 1f32.as_();
+            let coef: T = 0.044715f32.as_();
+            let sqrt_2_over_pi: T = 0.7978845608028654f32.as_();
+            let inner = sqrt_2_over_pi * (x + coef * x * x * x);
+            half * x * (one + inner.tanh())
+        })
+    }
+}
diff --git a/linalg/src/frame/hardswish.rs b/linalg/src/frame/hardswish.rs
new file mode 100644
index 0000000000..f65b9957b1
--- /dev/null
+++ b/linalg/src/frame/hardswish.rs
@@ -0,0 +1,64 @@
+#[allow(unused_macros)]
+macro_rules! hardswish_impl {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => {
+        ew_impl!($ti, $func, $nr, $alignment_items);
+        #[cfg(test)]
+        paste! {
+            mod [<test_ $func>] {
+                use super::*;
+                hardswish_frame_tests!($cond, $ti, $func);
+            }
+        }
+    };
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::LADatum;
+    use crate::frame::element_wise::*;
+    use num_traits::{AsPrimitive, Float, Zero};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! hardswish_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn prop(xs in proptest::collection::vec(-25f32..25.0, 0..100)) {
+                    if $cond {
+                        $crate::frame::hardswish::test::test_hardswish::<$ker, $t>(&*xs).unwrap()
+                    }
+                }
+            }
+            #[test]
+            fn trivial() {
+                if $cond {
+                    $crate::frame::hardswish::test::test_hardswish::<$ker, $t>(&[
+                        -10f32, -3.0, -1.0, 0.0, 1.0, 3.0, 6.0, 10.0,
+                    ])
+                    .unwrap();
+                }
+            }
+        };
+    }
+
+    pub fn test_hardswish<K: ElementWiseKer<T>, T: LADatum + Float>(
+        values: &[f32],
+    ) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        let data = tract_data::prelude::tensor1(values);
+        let data = data.cast_to::<T>().unwrap();
+        let data = data.try_as_plain().unwrap().as_slice::<T>().unwrap();
+        crate::frame::element_wise::test::test_element_wise::<K, T, _>(data, |x: T| {
+            let three: T = 3f32.as_();
+            let six: T = 6f32.as_();
+            let zero: T = T::zero();
+            let inv6: T = (1f32 / 6f32).as_();
+            let relu6 = ((x + three).min(six)).max(zero);
+            x * relu6 * inv6
+        })
+    }
+}
diff --git a/linalg/src/frame/mmm/mod.rs b/linalg/src/frame/mmm/mod.rs
index a03d15b1c4..bbe84cf8b9 100644
--- a/linalg/src/frame/mmm/mod.rs
+++ b/linalg/src/frame/mmm/mod.rs
@@ -16,8 +16,6 @@ mod storage;
 pub mod tests;
 
 use crate::multithread::Executor;
-#[cfg(feature = "multithread-mm")]
-use rayon::prelude::*;
 use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::fmt::Debug;
@@ -78,6 +76,10 @@ pub trait MatMatMul: Debug + dyn_clone::DynClone + Send + Sync + std::any::Any {
     fn quality(&self) -> ImplementationQuality;
     fn dynamic_boost(&self) -> isize;
 
+    /// Whether this kernel is runnable on the current CPU (platform feature
+    /// gate, e.g. FEAT_DotProd for the SDOT i8 kernel).
+    fn is_supported_here(&self) -> bool;
+
     #[allow(clippy::type_complexity)]
     fn packings(&self) -> &[(Box<dyn MMMInputFormat>, Box<dyn MMMInputFormat>)];
 
@@ -147,6 +149,10 @@ impl<K: MatMatMulKer> MatMatMul for K {
         MatMatMulKer::dynamic_boost(self)
     }
 
+    fn is_supported_here(&self) -> bool {
+        MatMatMulKer::is_supported_here(self)
+    }
+
     fn packings(&self) -> &[(Box<dyn MMMInputFormat>, Box<dyn MMMInputFormat>)] {
         self.packings()
     }
@@ -229,18 +235,37 @@ unsafe fn run_with_scratch_space_vec<K: MatMatMulKer>(
 ) -> TractResult<()> {
     unsafe {
         match crate::multithread::current_tract_executor() {
-            Executor::SingleThread => {
+            Executor::SingleThread => scratch.run_in_tls_scope(|scratch, tls| {
                 for ia in 0..m.divceil(ker.mr()) {
-                    scratch.run(ker, non_linear, ia, 0)?;
+                    scratch.run_one_tile(ker, non_linear, tls, ia, 0)?;
                 }
-                Ok(())
-            }
-            #[cfg(feature = "multithread-mm")]
-            Executor::MultiThread(pool) => pool.install(|| {
-                (0..m.div_ceil(ker.mr()))
-                    .into_par_iter()
-                    .try_for_each(|ia| scratch.run(ker, non_linear, ia, 0))
+                TractResult::Ok(())
             }),
+            #[cfg(feature = "multithread-mm")]
+            Executor::MultiThread(pool) => chunked_dispatch_rayon(
+                Some(&pool),
+                m.divceil(ker.mr()),
+                1,
+                |ia_start, ia_end, _, _| {
+                    scratch.run_in_tls_scope(|scratch, tls| {
+                        for ia in ia_start..ia_end {
+                            scratch.run_one_tile(ker, non_linear, tls, ia, 0)?;
+                        }
+                        TractResult::Ok(())
+                    })
+                },
+            ),
+            #[cfg(feature = "multithread-mm")]
+            Executor::RayonGlobal => {
+                chunked_dispatch_rayon(None, m.divceil(ker.mr()), 1, |ia_start, ia_end, _, _| {
+                    scratch.run_in_tls_scope(|scratch, tls| {
+                        for ia in ia_start..ia_end {
+                            scratch.run_one_tile(ker, non_linear, tls, ia, 0)?;
+                        }
+                        TractResult::Ok(())
+                    })
+                })
+            }
         }
     }
 }
@@ -254,23 +279,46 @@ unsafe fn run_with_scratch_space_col_outer<K: MatMatMulKer>(
 ) -> TractResult<()> {
     unsafe {
         match crate::multithread::current_tract_executor() {
-            Executor::SingleThread => {
+            Executor::SingleThread => scratch.run_in_tls_scope(|scratch, tls| {
                 for ib in 0..n.divceil(ker.nr()) {
                     for ia in 0..m.divceil(ker.mr()) {
-                        scratch.run(ker, non_linear, ia, ib)?;
+                        scratch.run_one_tile(ker, non_linear, tls, ia, ib)?;
                     }
                 }
-                Ok(())
-            }
-            #[cfg(feature = "multithread-mm")]
-            Executor::MultiThread(pool) => pool.install(|| {
-                (0..n.div_ceil(ker.nr())).into_par_iter().try_for_each(|ib| {
-                    for ia in 0..m.divceil(ker.mr()) {
-                        scratch.run(ker, non_linear, ia, ib)?;
-                    }
-                    Ok(())
-                })
+                TractResult::Ok(())
             }),
+            #[cfg(feature = "multithread-mm")]
+            Executor::MultiThread(pool) => chunked_dispatch_rayon(
+                Some(&pool),
+                m.divceil(ker.mr()),
+                n.divceil(ker.nr()),
+                |ia_start, ia_end, ib_start, ib_end| {
+                    scratch.run_in_tls_scope(|scratch, tls| {
+                        for ib in ib_start..ib_end {
+                            for ia in ia_start..ia_end {
+                                scratch.run_one_tile(ker, non_linear, tls, ia, ib)?;
+                            }
+                        }
+                        TractResult::Ok(())
+                    })
+                },
+            ),
+            #[cfg(feature = "multithread-mm")]
+            Executor::RayonGlobal => chunked_dispatch_rayon(
+                None,
+                m.divceil(ker.mr()),
+                n.divceil(ker.nr()),
+                |ia_start, ia_end, ib_start, ib_end| {
+                    scratch.run_in_tls_scope(|scratch, tls| {
+                        for ib in ib_start..ib_end {
+                            for ia in ia_start..ia_end {
+                                scratch.run_one_tile(ker, non_linear, tls, ia, ib)?;
+                            }
+                        }
+                        TractResult::Ok(())
+                    })
+                },
+            ),
         }
     }
 }
@@ -284,25 +332,130 @@ unsafe fn run_with_scratch_space_row_outer<K: MatMatMulKer>(
 ) -> TractResult<()> {
     unsafe {
         match crate::multithread::current_tract_executor() {
-            Executor::SingleThread => {
+            Executor::SingleThread => scratch.run_in_tls_scope(|scratch, tls| {
                 for ia in 0..m.divceil(ker.mr()) {
                     for ib in 0..n.divceil(ker.nr()) {
-                        scratch.run(ker, non_linear, ia, ib)?;
+                        scratch.run_one_tile(ker, non_linear, tls, ia, ib)?;
                     }
                 }
-                Ok(())
-            }
+                TractResult::Ok(())
+            }),
             #[cfg(feature = "multithread-mm")]
-            Executor::MultiThread(pool) => pool.install(|| {
-                pool.install(|| {
-                    (0..m.div_ceil(ker.mr())).into_par_iter().try_for_each(|ia| {
-                        for ib in 0..n.divceil(ker.nr()) {
-                            scratch.run(ker, non_linear, ia, ib)?;
+            Executor::MultiThread(pool) => chunked_dispatch_rayon(
+                Some(&pool),
+                m.divceil(ker.mr()),
+                n.divceil(ker.nr()),
+                |ia_start, ia_end, ib_start, ib_end| {
+                    scratch.run_in_tls_scope(|scratch, tls| {
+                        for ia in ia_start..ia_end {
+                            for ib in ib_start..ib_end {
+                                scratch.run_one_tile(ker, non_linear, tls, ia, ib)?;
+                            }
                         }
-                        Ok(())
+                        TractResult::Ok(())
                     })
-                })
-            }),
+                },
+            ),
+            #[cfg(feature = "multithread-mm")]
+            Executor::RayonGlobal => chunked_dispatch_rayon(
+                None,
+                m.divceil(ker.mr()),
+                n.divceil(ker.nr()),
+                |ia_start, ia_end, ib_start, ib_end| {
+                    scratch.run_in_tls_scope(|scratch, tls| {
+                        for ia in ia_start..ia_end {
+                            for ib in ib_start..ib_end {
+                                scratch.run_one_tile(ker, non_linear, tls, ia, ib)?;
+                            }
+                        }
+                        TractResult::Ok(())
+                    })
+                },
+            ),
         }
     }
 }
+
+/// Chunk grid for the 2D dispatch.
+///
+/// Mirrors ggml's `mul_mat` heuristic (`ggml/src/ggml-cpu/ggml-cpu.c:1378-1398`):
+///  * 16-tile panel chunks by default;
+///  * 64-tile chunks when one dimension is 1 (vec / vec-mat);
+///  * fallback to "block-per-thread along the longer axis" when the natural
+///    grid would have fewer than `4·nth` chunks.
+///
+/// Returns `(nchunks_m, nchunks_n, dr_m, dr_n)`.
+#[cfg(feature = "multithread-mm")]
+fn chunk_grid(n_panels_m: usize, n_panels_n: usize, nth: usize) -> (usize, usize, usize, usize) {
+    let chunk_size = if n_panels_m == 1 || n_panels_n == 1 { 64 } else { 16 };
+    let mut nchunks_m = n_panels_m.div_ceil(chunk_size);
+    let mut nchunks_n = n_panels_n.div_ceil(chunk_size);
+    if nchunks_m * nchunks_n < 4 * nth {
+        if n_panels_m > n_panels_n {
+            nchunks_m = nth;
+            nchunks_n = 1;
+        } else {
+            nchunks_m = 1;
+            nchunks_n = nth;
+        }
+    }
+    let dr_m = n_panels_m.div_ceil(nchunks_m).max(1);
+    let dr_n = n_panels_n.div_ceil(nchunks_n).max(1);
+    (nchunks_m, nchunks_n, dr_m, dr_n)
+}
+
+/// 2D chunked dispatcher across the (m_panels × n_panels) grid for the
+/// rayon path. Replaces a 1D `into_par_iter` over a single panel axis.
+/// Better-utilises threads on small/skewed shapes where one dimension has
+/// fewer panels than there are workers.
+///
+/// The closure receives **chunk bounds** (`ia_start, ia_end, ib_start, ib_end`),
+/// not per-tile indices. This lets the caller amortise per-worker setup
+/// (e.g. `ScratchSpaceImpl::run_in_tls_scope`) across all tiles in the
+/// chunk, mirroring #2206 for the multi-threaded path. The closure is
+/// invoked exactly once per rayon work item (and once total when the
+/// small-graph fallback path is taken).
+///
+/// `pool`:
+///   * `Some(p)` with `p.current_num_threads() > 1` → scoped via `p.install`
+///     (native, custom pool path).
+///   * `Some(p)` with single-thread pool, or `None` → dispatched via
+///     `into_par_iter` directly, which uses rayon's GLOBAL pool. This is
+///     the only working path on `wasm32-unknown-unknown` via
+///     `wasm_bindgen_rayon::init_thread_pool`.
+#[cfg(feature = "multithread-mm")]
+unsafe fn chunked_dispatch_rayon<F>(
+    pool: Option<&rayon::ThreadPool>,
+    n_panels_m: usize,
+    n_panels_n: usize,
+    run_chunk: F,
+) -> TractResult<()>
+where
+    F: Fn(usize, usize, usize, usize) -> TractResult<()> + Sync,
+{
+    use rayon::prelude::*;
+    if n_panels_m == 0 || n_panels_n == 0 {
+        return Ok(());
+    }
+    if n_panels_m * n_panels_n < crate::multithread::current_threading_panel_threshold() {
+        // Below the threading threshold: run the whole grid as a single chunk
+        // on the calling thread. Closure handles its own TLS scope.
+        return run_chunk(0, n_panels_m, 0, n_panels_n);
+    }
+    let use_global = pool.is_none_or(|p| p.current_num_threads() <= 1);
+    let body = || {
+        let nth = rayon::current_num_threads();
+        let (nchunks_m, nchunks_n, dr_m, dr_n) = chunk_grid(n_panels_m, n_panels_n, nth);
+        let total = nchunks_m * nchunks_n;
+        (0..total).into_par_iter().try_for_each(|idx| {
+            let im = idx % nchunks_m;
+            let in_ = idx / nchunks_m;
+            let ia_start = im * dr_m;
+            let ia_end = (ia_start + dr_m).min(n_panels_m);
+            let ib_start = in_ * dr_n;
+            let ib_end = (ib_start + dr_n).min(n_panels_n);
+            run_chunk(ia_start, ia_end, ib_start, ib_end)
+        })
+    };
+    if use_global { body() } else { pool.unwrap().install(body) }
+}
diff --git a/linalg/src/frame/mmm/scratch.rs b/linalg/src/frame/mmm/scratch.rs
index 8aa3193197..29190f5c8c 100644
--- a/linalg/src/frame/mmm/scratch.rs
+++ b/linalg/src/frame/mmm/scratch.rs
@@ -14,7 +14,7 @@ thread_local! {
 }
 
 #[derive(Default, Debug)]
-struct TLSScratch {
+pub(crate) struct TLSScratch {
     generation: usize,
     blob: Blob,
     ker_specs_16: Vec<FusedKerSpec<f16>>,
@@ -204,33 +204,55 @@ impl<TI: LADatum> ScratchSpaceImpl<TI> {
         down: usize,
         right: usize,
     ) -> TractResult<()> {
+        // Per-tile entry: enter the TLS scope (does sync once) then run a single
+        // tile. Single-threaded callers should prefer `run_in_tls_scope`+
+        // `run_one_tile` to amortise the TLS borrow + sync over many tiles.
         unsafe {
-            TLS.with_borrow_mut(|tls| {
-                tls.sync(self);
-                if down < self.valid_down_tiles && right < self.valid_right_tiles {
-                    self.for_valid_tile(ker, specs, tls, down, right)?;
-                    let err = ker.kernel(tls.ker_specs());
-                    debug_assert_eq!(err, 0, "Kernel return error {err}");
-                } else {
-                    let remnant_down =
-                        if down < self.valid_down_tiles { ker.mr() } else { self.remnant_down };
-                    let remnant_right =
-                        if right < self.valid_right_tiles { ker.nr() } else { self.remnant_right };
-                    self.for_border_tile(
-                        ker,
-                        specs,
-                        tls,
-                        down,
-                        right,
-                        remnant_down,
-                        remnant_right,
-                    )?;
-                    let err = ker.kernel(tls.ker_specs());
-                    debug_assert_eq!(err, 0, "Kernel return error {err}");
-                    self.postprocess_tile(specs, tls, down, right, remnant_down, remnant_right)?;
-                }
-                Ok(())
-            })
+            self.run_in_tls_scope(|this, tls| this.run_one_tile(ker, specs, tls, down, right))
+        }
+    }
+
+    /// Borrow the per-thread scratch blob for a single MMM call and `sync` it
+    /// once. The closure is invoked once with a mutable reference to the TLS
+    /// scratch and to `self`. Used by single-threaded matmul drivers to avoid
+    /// re-entering TLS / re-running `sync` per tile.
+    pub(crate) unsafe fn run_in_tls_scope<F, R>(&self, f: F) -> R
+    where
+        F: FnOnce(&Self, &mut TLSScratch) -> R,
+    {
+        TLS.with_borrow_mut(|tls| {
+            tls.sync(self);
+            f(self, tls)
+        })
+    }
+
+    /// Run a single tile against an already-borrowed TLS scratch. Caller is
+    /// responsible for entering `run_in_tls_scope` first (so `sync` has run).
+    #[inline(always)]
+    pub(crate) unsafe fn run_one_tile(
+        &self,
+        ker: &impl MatMatMulKer<Acc = TI>,
+        specs: &[FusedSpec],
+        tls: &mut TLSScratch,
+        down: usize,
+        right: usize,
+    ) -> TractResult<()> {
+        unsafe {
+            if down < self.valid_down_tiles && right < self.valid_right_tiles {
+                self.for_valid_tile(ker, specs, tls, down, right)?;
+                let err = ker.kernel(tls.ker_specs());
+                debug_assert_eq!(err, 0, "Kernel return error {err}");
+            } else {
+                let remnant_down =
+                    if down < self.valid_down_tiles { ker.mr() } else { self.remnant_down };
+                let remnant_right =
+                    if right < self.valid_right_tiles { ker.nr() } else { self.remnant_right };
+                self.for_border_tile(ker, specs, tls, down, right, remnant_down, remnant_right)?;
+                let err = ker.kernel(tls.ker_specs());
+                debug_assert_eq!(err, 0, "Kernel return error {err}");
+                self.postprocess_tile(specs, tls, down, right, remnant_down, remnant_right)?;
+            }
+            Ok(())
         }
     }
 
diff --git a/linalg/src/frame/mmm/tests/fuse.rs b/linalg/src/frame/mmm/tests/fuse.rs
index 135eaa413b..1cdb3a8d60 100644
--- a/linalg/src/frame/mmm/tests/fuse.rs
+++ b/linalg/src/frame/mmm/tests/fuse.rs
@@ -29,6 +29,11 @@ macro_rules! mmm_kernel_fuse_tests {
             fn store_non_contiguous() {
                 test::store_non_contiguous::<_, $tc, $ti>($ker)
             }
+
+            #[test]
+            fn add_unicast_non_contiguous() {
+                test::add_unicast_non_contiguous::<_, $ti>($ker)
+            }
             proptest::proptest! {
                 #[test]
                 fn return_c_prop(c in tile::<_, $ti>($ker)) {
@@ -151,6 +156,57 @@ where
     assert_eq!(v, expected);
 }
 
+/// `Clear` + `AddUnicast(strided)` + `Store(contiguous)` and check the
+/// source pattern reaches the destination. Counterpart of
+/// `store_non_contiguous` on the read side; `return_c_plus_d` uses
+/// `mmm_stride_storage` (tightly packed) and so doesn't exercise this.
+pub fn add_unicast_non_contiguous<K, TI>(ker: &K)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum + AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    if !ker.is_supported_here() {
+        return;
+    }
+    let item = std::mem::size_of::<TI>();
+    let row_stride_items = 3 * ker.nr() * 5;
+    let col_stride_items = 3;
+    // Source: a non-contiguous buffer with distinct values at the used
+    // (r, c) cells and sentinel garbage everywhere else.
+    let mut src: Vec<TI> = vec![TI::max_value(); ker.mr() * row_stride_items];
+    for r in 0..ker.mr() {
+        for c in 0..ker.nr() {
+            src[r * row_stride_items + c * col_stride_items] = (1 + c + r * ker.nr()).as_();
+        }
+    }
+    let src_store = OutputStoreKer {
+        ptr: src.as_ptr() as _,
+        row_byte_stride: (item * row_stride_items) as isize,
+        col_byte_stride: (item * col_stride_items) as isize,
+        item_size: item,
+    };
+    // Destination: tightly-packed output for easy comparison.
+    let mut dst: Vec<TI> = vec![TI::min_value(); ker.mr() * ker.nr()];
+    let dst_store = OutputStoreKer {
+        ptr: dst.as_ptr() as _,
+        row_byte_stride: (item * ker.nr()) as isize,
+        col_byte_stride: item as isize,
+        item_size: item,
+    };
+    let non_linear = tvec![
+        FusedKerSpec::Clear,
+        FusedKerSpec::AddUnicast(src_store),
+        FusedKerSpec::Store(dst_store),
+        FusedKerSpec::Done,
+    ];
+    let err = ker.kernel(&non_linear);
+    assert_eq!(err, 0);
+    let expected: Vec<TI> = (0..ker.mr() * ker.nr()).map(|i| (1 + i).as_()).collect();
+    display_error(&dst, &expected, ker.mr(), ker.nr());
+    assert_eq!(dst, expected);
+}
+
 pub fn fused_ops<K, TI, E>(ker: &K, c: &[TI], ops: &[FusedKerSpec<TI>], expect: E)
 where
     K: MatMatMulKer<Acc = TI>,
diff --git a/linalg/src/frame/mmm/tests/packed_packed.rs b/linalg/src/frame/mmm/tests/packed_packed.rs
index 63248d2ed2..8437990c72 100644
--- a/linalg/src/frame/mmm/tests/packed_packed.rs
+++ b/linalg/src/frame/mmm/tests/packed_packed.rs
@@ -1,7 +1,7 @@
+use crate::WeightType;
 use crate::block_quant::PackedBlockQuantFormat;
 use crate::mmm::tests::display_error;
 use crate::mmm::{AsInputValue, FusedKerSpec, FusedSpec, MatMatMul, MatMatMulKer, OutputStoreKer};
-use crate::pack::PackedFormat;
 use proptest::collection::vec;
 use proptest::prelude::*;
 use std::fmt::Debug;
@@ -255,9 +255,8 @@ impl<K: MatMatMulKer> PackedPackedProblem<K> {
 
     pub fn padded_inputs(&self) -> TractResult<(Tensor, Tensor)> {
         let (pack_a, pack_b) = &self.ker.packings()[self.packing];
-        assert!(pack_b.k_alignment() == 1);
         let (m, k, n) = self.mkn();
-        let k_aligned = k.next_multiple_of(pack_a.k_alignment());
+        let k_aligned = k.next_multiple_of(pack_a.k_alignment().max(pack_b.k_alignment()));
 
         let mut a = Tensor::zero::<f32>(&[m, k_aligned])?;
         for row in 0..m {
@@ -265,8 +264,8 @@ impl<K: MatMatMulKer> PackedPackedProblem<K> {
                 a.try_as_plain_mut()?.to_array_view_mut()?[[row, col]] = self.a[col + k * row];
             }
         }
-        if let Some(pf) = pack_a.downcast_ref::<PackedFormat>() {
-            a = a.cast_to_dt(pf.dt)?.into_owned();
+        if let WeightType::Plain(dt) = pack_a.precursor() {
+            a = a.cast_to_dt(dt)?.into_owned();
         }
         let mut b = Tensor::zero::<f32>(&[k_aligned, n])?;
         for row in 0..k {
@@ -274,8 +273,8 @@ impl<K: MatMatMulKer> PackedPackedProblem<K> {
                 b.try_as_plain_mut()?.to_array_view_mut()?[[row, col]] = self.b[col + n * row];
             }
         }
-        if let Some(pf) = pack_b.downcast_ref::<PackedFormat>() {
-            b = b.cast_to_dt(pf.dt)?.into_owned();
+        if let WeightType::Plain(dt) = pack_b.precursor() {
+            b = b.cast_to_dt(dt)?.into_owned();
         }
 
         Ok((a, b))
@@ -283,9 +282,9 @@ impl<K: MatMatMulKer> PackedPackedProblem<K> {
 
     pub fn reference(&self) -> TractResult<Tensor> {
         let (m, k, n) = self.mkn();
-        let pack_a = &self.ker.packings()[self.packing].0;
+        let (pack_a, pack_b) = &self.ker.packings()[self.packing];
         let (mut a, b) = self.padded_inputs()?;
-        let k_aligned = k.next_multiple_of(pack_a.k_alignment());
+        let k_aligned = k.next_multiple_of(pack_a.k_alignment().max(pack_b.k_alignment()));
         if let Some(pbqf) = pack_a.downcast_ref::<PackedBlockQuantFormat>() {
             a = pbqf.simulate_precision_loss(a, 1)?;
         };
@@ -312,8 +311,7 @@ impl<K: MatMatMulKer> PackedPackedProblem<K> {
     pub fn run(&self) -> TractResult<Tensor> {
         let (m, k, n) = self.mkn();
         let (pack_a, pack_b) = &self.ker.packings()[self.packing];
-        assert!(pack_b.k_alignment() == 1);
-        let k_aligned = k.next_multiple_of(pack_a.k_alignment());
+        let k_aligned = k.next_multiple_of(pack_a.k_alignment().max(pack_b.k_alignment()));
 
         let (a, b) = self.padded_inputs()?;
         let pa = pack_a.prepare_one(&a, 1, 0)?;
diff --git a/linalg/src/frame/mod.rs b/linalg/src/frame/mod.rs
index 8528eea350..01128bff3c 100644
--- a/linalg/src/frame/mod.rs
+++ b/linalg/src/frame/mod.rs
@@ -8,6 +8,10 @@ pub mod unicast;
 #[macro_use]
 pub mod by_scalar;
 #[macro_use]
+pub mod gelu;
+#[macro_use]
+pub mod hardswish;
+#[macro_use]
 pub mod leaky_relu;
 #[macro_use]
 pub mod lut;
@@ -20,6 +24,8 @@ pub mod reduce;
 #[macro_use]
 pub mod sigmoid;
 #[macro_use]
+pub mod silu;
+#[macro_use]
 pub mod tanh;
 #[macro_use]
 pub mod weights;
diff --git a/linalg/src/frame/pack.rs b/linalg/src/frame/pack.rs
index a41adf1ced..631d725633 100644
--- a/linalg/src/frame/pack.rs
+++ b/linalg/src/frame/pack.rs
@@ -311,6 +311,8 @@ impl PackedFormat {
                 32 => pack_mn_major::<[u8; 32]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
                 48 => pack_mn_major::<[u8; 48]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
                 64 => pack_mn_major::<[u8; 64]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
+                96 => pack_mn_major::<[u8; 96]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
+                128 => pack_mn_major::<[u8; 128]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
                 _ => {
                     let mut packer = self.write_with_k_outer(pb, k_range.len(), mn_range.len());
                     for k in k_range {
@@ -403,6 +405,19 @@ impl PackedFormat {
 
 pub trait PackingWriter<T: Copy> {
     fn write(&mut self, t: T);
+
+    /// Write a contiguous slice of values. The default implementation falls
+    /// back to per-element `write`; concrete writers may override with a
+    /// `memcpy`-class fast path when the destination layout permits it.
+    ///
+    /// The output produced by `write_slice(s)` must be byte-identical to
+    /// `for &t in s { self.write(t); }` for any input.
+    #[inline]
+    fn write_slice(&mut self, ts: &[T]) {
+        for t in ts {
+            self.write(*t);
+        }
+    }
 }
 
 #[derive(Debug)]
@@ -434,6 +449,17 @@ where
             self.ptr = self.ptr.offset(1);
         }
     }
+
+    #[inline]
+    fn write_slice(&mut self, ts: &[T]) {
+        // KOutSinglePanelWriter writes elements consecutively with no panel
+        // boundaries. A direct `copy_nonoverlapping` is byte-identical to the
+        // per-element loop.
+        unsafe {
+            std::ptr::copy_nonoverlapping(ts.as_ptr(), self.ptr, ts.len());
+            self.ptr = self.ptr.add(ts.len());
+        }
+    }
 }
 
 #[derive(Debug)]
@@ -506,6 +532,57 @@ where
             }
         }
     }
+
+    #[inline]
+    fn write_slice(&mut self, ts: &[T]) {
+        // Fast path: the slice fits entirely within the current panel. Writes
+        // are then guaranteed to be `ts.len()` consecutive memory locations
+        // followed by the same panel/lane bookkeeping the per-element path
+        // performs. This produces byte-identical output to a per-element loop.
+        //
+        // When the slice would cross a panel boundary, fall back to the
+        // per-element path so all transition logic stays in one place.
+        let n = ts.len();
+        if n == 0 {
+            return;
+        }
+        if n < self.remain {
+            // Strictly inside the current panel: bulk copy, then advance.
+            unsafe {
+                std::ptr::copy_nonoverlapping(ts.as_ptr(), self.ptr, n);
+                self.ptr = self.ptr.add(n);
+            }
+            self.remain -= n;
+        } else if n == self.remain {
+            // Exactly fills the current panel: bulk copy, then run the same
+            // panel-transition bookkeeping that `write` does on its final
+            // element. The transition is performed unconditionally here
+            // (rather than calling `write` for the last element) to keep the
+            // semantics identical even when the trait is inlined separately.
+            unsafe {
+                std::ptr::copy_nonoverlapping(ts.as_ptr(), self.ptr, n);
+                self.ptr = self.ptr.add(n);
+                self.current_panel += 1;
+                if self.current_panel == self.panels {
+                    self.ptr = self.ptr.offset(self.next_lane);
+                    self.current_panel = 0;
+                } else {
+                    self.ptr = self.ptr.offset(self.next_panel);
+                }
+                if self.current_panel == self.panels - 1 {
+                    self.remain = self.last_panel_width;
+                } else {
+                    self.remain = self.panel_width;
+                }
+            }
+        } else {
+            // Spans a panel boundary. Fall back to per-element writes so the
+            // panel-transition state machine handles every step.
+            for t in ts {
+                self.write(*t);
+            }
+        }
+    }
 }
 
 #[derive(Debug)]
@@ -615,6 +692,235 @@ unsafe fn pack_mn_major<Chunk: Copy>(
     }
 }
 
+// K=4-inner packing writer (PackedI8K4 layout), fed in K-OUTER order (same feed
+// as KOutWriter, used by the im2col patchers): for each k, all mn. Within a panel,
+// element (k, local_mn) lands at (k/4)*r*4 + local_mn*4 + (k%4), so consecutive mn
+// for a fixed k are stride-4 stores.
+#[derive(Debug)]
+pub struct KOut4Writer<'p, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    base: *mut T,
+    r4: usize,        // r * 4
+    panel_len: usize, // k_aligned * r
+    panels: usize,
+    panel_width: usize,
+    last_panel_width: usize,
+    kb: usize, // k / 4
+    kr: usize, // k % 4
+    panel: usize,
+    local_mn: usize,
+    _phantom: PhantomData<&'p T>,
+}
+
+impl<'p, T> KOut4Writer<'p, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    pub fn new(base: *mut T, r: usize, panel_len: usize, mn: usize) -> KOut4Writer<'p, T> {
+        let panels = mn.divceil(r).max(1);
+        let last_panel_width = mn - (panels - 1) * r;
+        KOut4Writer {
+            base,
+            r4: r * 4,
+            panel_len,
+            panels,
+            panel_width: r,
+            last_panel_width,
+            kb: 0,
+            kr: 0,
+            panel: 0,
+            local_mn: 0,
+            _phantom: PhantomData,
+        }
+    }
+    #[inline(always)]
+    fn panel_width(&self) -> usize {
+        if self.panel == self.panels - 1 { self.last_panel_width } else { self.panel_width }
+    }
+    #[inline(always)]
+    fn advance(&mut self, by: usize) {
+        self.local_mn += by;
+        if self.local_mn >= self.panel_width() {
+            self.local_mn = 0;
+            self.panel += 1;
+            if self.panel == self.panels {
+                self.panel = 0;
+                self.kr += 1;
+                if self.kr == 4 {
+                    self.kr = 0;
+                    self.kb += 1;
+                }
+            }
+        }
+    }
+}
+
+impl<T> PackingWriter<T> for KOut4Writer<'_, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    #[inline(always)]
+    fn write(&mut self, t: T) {
+        unsafe {
+            let off = self.panel * self.panel_len + self.kb * self.r4 + self.local_mn * 4 + self.kr;
+            *self.base.add(off) = t;
+        }
+        self.advance(1);
+    }
+
+    #[inline]
+    fn write_slice(&mut self, ts: &[T]) {
+        let n = ts.len();
+        if n == 0 {
+            return;
+        }
+        let pw = self.panel_width();
+        if self.local_mn + n <= pw {
+            // Whole slice stays inside the current (panel, k): tight stride-4 store.
+            unsafe {
+                let mut d = self.base.add(
+                    self.panel * self.panel_len + self.kb * self.r4 + self.local_mn * 4 + self.kr,
+                );
+                for &t in ts {
+                    *d = t;
+                    d = d.add(4);
+                }
+            }
+            self.advance(n);
+        } else {
+            for &t in ts {
+                self.write(t);
+            }
+        }
+    }
+}
+
+// K=4-inner packing for SDOT/relaxed-dot int8 matmul: 4 contiguous K per mn-lane.
+// Layout: out[(k/4)*r*4 + m*4 + (k%4)] = src[m,k]. k_alignment=4. Matmul path uses
+// pack_view; the conv im2col patchers feed write_with_k_outer in K-outer order.
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub struct PackedI8K4 {
+    pub r: usize,
+    pub align: usize,
+}
+impl PackedI8K4 {
+    pub fn new(r: usize) -> Self {
+        PackedI8K4 { r, align: 16 }
+    }
+    fn panel(&self, k: usize) -> usize {
+        (k.div_ceil(4) * 4) * self.r
+    }
+    pub fn single_panel_len(&self, k: usize) -> usize {
+        self.panel(k)
+    }
+    pub fn len(&self, k: usize, mn: usize) -> usize {
+        mn.divceil(self.r) * self.panel(k)
+    }
+    pub fn alignment(&self) -> usize {
+        self.align
+    }
+    // One-pass K-outer writer for the conv im2col patchers (fed: for each k, all mn).
+    pub fn write_with_k_outer<'p, T: Copy + std::fmt::Debug>(
+        &self,
+        pb: *mut T,
+        k: usize,
+        mn: usize,
+    ) -> KOut4Writer<'p, T> {
+        KOut4Writer::new(pb, self.r, self.panel(k), mn)
+    }
+    // K=4-inner pack from a (possibly strided) view: out[(k/4)*r*4 + m*4 + (k%4)] = src[m,k].
+    pub fn pack_view(
+        &self,
+        t: &TensorView,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        let k = t.shape()[k_axis];
+        let mn = t.shape()[mn_axis];
+        let kp = k.div_ceil(4) * 4;
+        let pl = kp * self.r;
+        let panels = mn.div_ceil(self.r);
+        let st = t.strides();
+        let mut blob = unsafe { Blob::new_for_size_and_align(panels * pl, self.align) };
+        blob.as_bytes_mut().fill(0);
+        let (ks, ms) = (st[k_axis], st[mn_axis]);
+        let kblocks = kp / 4;
+        unsafe {
+            let src = t.as_ptr_unchecked::<i8>();
+            let dst = blob.as_mut_ptr() as *mut i8;
+            for p in 0..panels {
+                let pw = self.r.min(mn - p * self.r);
+                let panel = dst.add(p * pl);
+                let mn0 = (p * self.r) as isize;
+                for kb in 0..kblocks {
+                    for kr in 0..4 {
+                        let kk = kb * 4 + kr;
+                        if kk >= k {
+                            break;
+                        }
+                        let srow = src.offset(kk as isize * ks + mn0 * ms);
+                        let dcol = panel.add(kb * self.r * 4 + kr);
+                        for lm in 0..pw {
+                            *dcol.add(lm * 4) = *srow.offset(lm as isize * ms);
+                        }
+                    }
+                }
+            }
+        }
+        Ok(Box::new(EagerPackedInput {
+            fact: PackedExoticFact { format: Box::new(self.clone()), mn: mn.to_dim(), k },
+            packed: blob.into(),
+            panel_bytes: pl,
+            mn,
+        }))
+    }
+}
+impl std::fmt::Display for PackedI8K4 {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "I8K4[{}]", self.r)
+    }
+}
+impl MMMInputFormat for PackedI8K4 {
+    fn prepare_tensor(&self, t: &Tensor, k_axis: usize, mn_axis: usize) -> TractResult<Tensor> {
+        Ok(PackedMatrixStorage::new(self.prepare_one(t, k_axis, mn_axis)?)
+            .into_tensor(t.datum_type()))
+    }
+    fn prepare_one(
+        &self,
+        t: &Tensor,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        self.pack_view(&t.view(), k_axis, mn_axis)
+    }
+    fn precursor(&self) -> WeightType {
+        WeightType::Plain(i8::datum_type())
+    }
+    fn r(&self) -> usize {
+        self.r
+    }
+    fn k_alignment(&self) -> usize {
+        4
+    }
+    fn merge_with<'o, 'a: 'o, 'b: 'o>(
+        &'a self,
+        o: &'b dyn MMMInputFormat,
+    ) -> Option<&'o dyn MMMInputFormat> {
+        o.downcast_ref::<PackedI8K4>().filter(|x| x.r == self.r).map(|_| self as _)
+    }
+    fn mem_size(&self, k: TDim, mn: TDim) -> TDim {
+        mn.divceil(self.r) * self.panel(k.to_usize().unwrap_or(0))
+    }
+    fn extract_at_mn_f16(&self, _: &EagerPackedInput, _: usize, _: &mut [f16]) -> TractResult<()> {
+        bail!("no f16 extract")
+    }
+    fn extract_at_mn_f32(&self, _: &EagerPackedInput, _: usize, _: &mut [f32]) -> TractResult<()> {
+        bail!("no f32 extract")
+    }
+}
+
 pub trait Packing {
     fn packing(r: usize) -> PackedFormat;
 }
@@ -761,6 +1067,188 @@ mod test {
         }
 
     }
+    // ---- PackedI8K4 (K=4-inner SMOPA/SDOT layout) dedicated tests ----------
+    //
+    // PackedI8K4 has two independent producers that MUST agree byte-for-byte:
+    //   * `pack_view`           — the matmul path, reads a (possibly strided)
+    //                             TensorView and packs in one shot.
+    //   * `write_with_k_outer`  — the conv/im2col path, fed element-by-element
+    //                             in K-OUTER order (for each k, all mn).
+    // Both must equal the canonical layout
+    //     out[panel*pl + (k/4)*r*4 + local_mn*4 + (k%4)] = src[k, panel*r+local_mn]
+    // with pl = ceil(K/4)*4 * r, and every padding byte (K%4 tail, partial last
+    // mn panel) left at zero.
+    #[derive(Debug, Clone)]
+    struct PackI8K4Problem {
+        k: usize,
+        mn: usize,
+        r: usize,
+        // false: input tensor is [k, mn] (k_axis=0, mn_axis=1) — contiguous read.
+        // true : input tensor is [mn, k] (k_axis=1, mn_axis=0) — strided read,
+        //        mirroring how the "A" operand is fed.
+        is_a: bool,
+    }
+
+    impl PackI8K4Problem {
+        // Canonical logical matrix, always indexed [k, mn].
+        fn logical(&self) -> Array2<i8> {
+            Array2::from_shape_fn((self.k, self.mn), |(kk, m)| {
+                (kk.wrapping_mul(31).wrapping_add(m.wrapping_mul(17)).wrapping_add(1)) as i8
+            })
+        }
+
+        fn panel_len(&self) -> usize {
+            (self.k.div_ceil(4) * 4) * self.r
+        }
+
+        // The layout every producer must reproduce.
+        fn reference(&self) -> Vec<i8> {
+            let logical = self.logical();
+            let r = self.r;
+            let pl = self.panel_len();
+            let panels = self.mn.div_ceil(r);
+            let mut out = vec![0i8; panels * pl];
+            for p in 0..panels {
+                let pw = r.min(self.mn - p * r);
+                for kk in 0..self.k {
+                    for lm in 0..pw {
+                        let m = p * r + lm;
+                        let off = p * pl + (kk / 4) * r * 4 + lm * 4 + (kk % 4);
+                        out[off] = logical[[kk, m]];
+                    }
+                }
+            }
+            out
+        }
+
+        // The matmul path: pack a TensorView, then read it back panel by panel.
+        fn pack_view_bytes(&self) -> Vec<i8> {
+            let logical = self.logical();
+            let packer = super::PackedI8K4::new(self.r);
+            let (tensor, k_axis, mn_axis) = if self.is_a {
+                // [mn, k] with entry [m, kk] == logical[kk, m]; reads are strided.
+                let a = Array2::from_shape_fn((self.mn, self.k), |(m, kk)| logical[[kk, m]]);
+                (a.into_tensor(), 1usize, 0usize)
+            } else {
+                (logical.clone().into_tensor(), 0usize, 1usize)
+            };
+            let packed = packer.pack_view(&tensor.view(), k_axis, mn_axis).unwrap();
+            let pl = self.panel_len();
+            let panels = self.mn.div_ceil(self.r);
+            assert_eq!(packed.panels_count(), panels);
+            assert_eq!(packed.k(), self.k);
+            assert_eq!(packed.mn(), self.mn);
+            let mut out = vec![0i8; panels * pl];
+            unsafe {
+                for p in 0..panels {
+                    let ptr = packed.panel_bytes(p, None).unwrap() as *const i8;
+                    std::ptr::copy_nonoverlapping(ptr, out.as_mut_ptr().add(p * pl), pl);
+                }
+            }
+            out
+        }
+
+        // The conv path: feed the writer in K-outer order (for each k, all mn).
+        fn writer_bytes(&self) -> Vec<i8> {
+            let logical = self.logical();
+            let packer = super::PackedI8K4::new(self.r);
+            let total = packer.len(self.k, self.mn);
+            assert_eq!(total, self.mn.div_ceil(self.r) * self.panel_len());
+            let mut buf = vec![0i8; total];
+            {
+                let mut w = packer.write_with_k_outer(buf.as_mut_ptr(), self.k, self.mn);
+                for kk in 0..self.k {
+                    for m in 0..self.mn {
+                        super::PackingWriter::write(&mut w, logical[[kk, m]]);
+                    }
+                }
+            }
+            buf
+        }
+
+        fn check(&self) {
+            let reference = self.reference();
+            assert_eq!(
+                self.pack_view_bytes(),
+                reference,
+                "pack_view disagrees with reference for {self:?}"
+            );
+            assert_eq!(
+                self.writer_bytes(),
+                reference,
+                "write_with_k_outer disagrees with reference for {self:?}"
+            );
+        }
+    }
+
+    impl Arbitrary for PackI8K4Problem {
+        type Parameters = ();
+        type Strategy = BoxedStrategy<PackI8K4Problem>;
+        fn arbitrary_with(_: ()) -> Self::Strategy {
+            // r is the tile width used by the int8 kernels (SMOPA 32, SDOT 8, ...).
+            (any::<bool>(), prop::sample::select(vec![4usize, 8, 16, 32]), 1usize..40, 1usize..40)
+                .prop_map(|(is_a, r, k, mn)| PackI8K4Problem { k, mn, r, is_a })
+                .boxed()
+        }
+    }
+
+    proptest::proptest! {
+        #[test]
+        fn pack_i8k4_prop(pb in any::<PackI8K4Problem>()) {
+            pb.check();
+        }
+    }
+
+    fn k4(k: usize, mn: usize, r: usize, is_a: bool) -> PackI8K4Problem {
+        PackI8K4Problem { k, mn, r, is_a }
+    }
+
+    #[test]
+    fn i8k4_smallest() {
+        k4(1, 1, 4, false).check();
+        k4(1, 1, 4, true).check();
+    }
+
+    #[test]
+    fn i8k4_exact_tile() {
+        // K and mn land exactly on the 4 / r boundaries: no padding anywhere.
+        k4(4, 4, 4, false).check();
+        k4(8, 32, 32, false).check();
+        k4(8, 32, 32, true).check();
+    }
+
+    #[test]
+    fn i8k4_k_not_multiple_of_4() {
+        // K%4 tail must be zero-padded inside each panel.
+        for k in [1, 2, 3, 5, 6, 7, 9] {
+            k4(k, 4, 4, false).check();
+            k4(k, 7, 8, true).check();
+        }
+    }
+
+    #[test]
+    fn i8k4_partial_last_panel() {
+        // mn not a multiple of r: last panel is narrower, tail lanes are zero.
+        k4(5, 7, 4, false).check();
+        k4(5, 7, 4, true).check();
+        k4(4, 33, 32, false).check();
+        k4(4, 33, 32, true).check();
+        k4(3, 1, 32, false).check();
+    }
+
+    #[test]
+    fn i8k4_single_wide_tile() {
+        // One narrow panel inside a wide (r=32) tile.
+        k4(7, 1, 32, false).check();
+        k4(7, 5, 16, true).check();
+    }
+
+    #[test]
+    fn i8k4_many_panels() {
+        k4(13, 100, 8, false).check();
+        k4(13, 100, 8, true).check();
+        k4(17, 65, 16, false).check();
+    }
 
     #[test]
     fn simple_b_1() {
diff --git a/linalg/src/frame/silu.rs b/linalg/src/frame/silu.rs
new file mode 100644
index 0000000000..5d51154149
--- /dev/null
+++ b/linalg/src/frame/silu.rs
@@ -0,0 +1,58 @@
+#[allow(unused_macros)]
+macro_rules! silu_impl {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => {
+        ew_impl!($ti, $func, $nr, $alignment_items);
+        #[cfg(test)]
+        paste! {
+            mod [<test_ $func>] {
+                use super::*;
+                silu_frame_tests!($cond, $ti, $func);
+            }
+        }
+    };
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::LADatum;
+    use crate::frame::element_wise::*;
+    use num_traits::{AsPrimitive, Float};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! silu_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn prop(xs in proptest::collection::vec(-10f32..10.0, 0..100)) {
+                    if $cond {
+                        $crate::frame::silu::test::test_silu::<$ker, $t>(&*xs).unwrap()
+                    }
+                }
+            }
+            #[test]
+            fn trivial() {
+                if $cond {
+                    $crate::frame::silu::test::test_silu::<$ker, $t>(&[-5f32, -1.0, 0.0, 1.0, 5.0])
+                        .unwrap();
+                }
+            }
+        };
+    }
+
+    pub fn test_silu<K: ElementWiseKer<T>, T: LADatum + Float>(values: &[f32]) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        let data = tract_data::prelude::tensor1(values);
+        let data = data.cast_to::<T>().unwrap();
+        let data = data.try_as_plain().unwrap().as_slice::<T>().unwrap();
+        crate::frame::element_wise::test::test_element_wise::<K, T, _>(data, |x: T| {
+            let one: T = 1f32.as_();
+            let neg_x = T::zero() - x;
+            let sigmoid = one / (one + neg_x.exp());
+            x * sigmoid
+        })
+    }
+}
diff --git a/linalg/src/generic.rs b/linalg/src/generic.rs
index f2030ff0bd..bf9a2fcb65 100644
--- a/linalg/src/generic.rs
+++ b/linalg/src/generic.rs
@@ -1,11 +1,14 @@
 pub mod by_scalar;
 pub mod erf;
+pub mod gelu;
+pub mod hardswish;
 pub mod leaky_relu;
 pub mod lut;
 pub mod mmm;
 pub mod reduce;
 pub mod rounding;
 pub mod sigmoid;
+pub mod silu;
 pub mod tanh;
 pub mod unicast;
 
@@ -17,11 +20,14 @@ use crate::{BinOp, LinalgRegistry};
 
 pub use self::by_scalar::{HMulByScalar8, SMulByScalar4};
 pub use self::erf::SErf4;
+pub use self::gelu::{HGelu8, SGelu4};
+pub use self::hardswish::{HHardSwish8, SHardSwish4};
 pub use self::leaky_relu::{HLeakyRelu8, SLeakyRelu4};
 pub use self::lut::GenericLut8;
 pub use self::reduce::softmax_l2::SSoftMaxL2;
 pub use self::rounding::{ScaleShiftAndRound, Scaler};
 pub use self::sigmoid::{HSigmoid8, SSigmoid4};
+pub use self::silu::{HSiLU8, SSiLU4};
 pub use self::tanh::{HTanh8, STanh4};
 
 pub(crate) fn register_all_unicast(registry: &mut LinalgRegistry) {
diff --git a/linalg/src/generic/gelu.rs b/linalg/src/generic/gelu.rs
new file mode 100644
index 0000000000..d62d09dd46
--- /dev/null
+++ b/linalg/src/generic/gelu.rs
@@ -0,0 +1,88 @@
+#![allow(clippy::excessive_precision)]
+use crate::frame::element_wise::ElementWiseKer;
+use tract_data::internal::*;
+
+// Tanh-form GELU approximation matching tract's GeluApproximate (pow=3, the
+// canonical Hendrycks-Gimpel/Open-AI form):
+//
+//     gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+//
+// The fast variant (pow=2) is not exposed here; the graph op falls back to
+// scalar when fast_impl=true.
+
+const SQRT_2_OVER_PI: f32 = 0.7978845608028654;
+const COEF: f32 = 0.044715;
+
+#[derive(Clone, Debug)]
+pub struct SGelu4;
+
+impl ElementWiseKer<f32> for SGelu4 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(x: &mut [f32], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| {
+            let v = *px;
+            let inner = SQRT_2_OVER_PI * (v + COEF * v * v * v);
+            *px = 0.5 * v * (1.0 + inner.tanh());
+        });
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct HGelu8;
+
+impl ElementWiseKer<f16> for HGelu8 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        8
+    }
+
+    fn run(x: &mut [f16], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| {
+            let v = px.to_f32();
+            let inner = SQRT_2_OVER_PI * (v + COEF * v * v * v);
+            *px = f16::from_f32(0.5 * v * (1.0 + inner.tanh()));
+        });
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod s {
+    gelu_frame_tests!(true, f32, crate::generic::gelu::SGelu4);
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod h {
+    gelu_frame_tests!(true, tract_data::internal::f16, crate::generic::gelu::HGelu8);
+}
diff --git a/linalg/src/generic/hardswish.rs b/linalg/src/generic/hardswish.rs
new file mode 100644
index 0000000000..79612b72bd
--- /dev/null
+++ b/linalg/src/generic/hardswish.rs
@@ -0,0 +1,80 @@
+#![allow(clippy::excessive_precision)]
+use crate::frame::element_wise::ElementWiseKer;
+use tract_data::internal::*;
+use tract_num_traits::Zero;
+
+#[derive(Clone, Debug)]
+pub struct SHardSwish4;
+
+impl ElementWiseKer<f32> for SHardSwish4 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(x: &mut [f32], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        const INV6: f32 = 1.0 / 6.0;
+        x.iter_mut().for_each(|px| {
+            let relu6 = ((*px + 3.0).min(6.0)).max(0.0);
+            *px = *px * relu6 * INV6;
+        });
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct HHardSwish8;
+
+impl ElementWiseKer<f16> for HHardSwish8 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        8
+    }
+
+    fn run(x: &mut [f16], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        let three = f16::from_f32(3.0);
+        let six = f16::from_f32(6.0);
+        let inv6 = f16::from_f32(1.0 / 6.0);
+        x.iter_mut().for_each(|px| {
+            let relu6 = ((*px + three).min(six)).max(f16::zero());
+            *px = *px * relu6 * inv6;
+        });
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod s {
+    hardswish_frame_tests!(true, f32, crate::generic::hardswish::SHardSwish4);
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod h {
+    hardswish_frame_tests!(true, tract_data::internal::f16, crate::generic::hardswish::HHardSwish8);
+}
diff --git a/linalg/src/generic/silu.rs b/linalg/src/generic/silu.rs
new file mode 100644
index 0000000000..189921db08
--- /dev/null
+++ b/linalg/src/generic/silu.rs
@@ -0,0 +1,76 @@
+#![allow(clippy::excessive_precision)]
+use crate::frame::element_wise::ElementWiseKer;
+use tract_data::internal::*;
+
+#[derive(Clone, Debug)]
+pub struct SSiLU4;
+
+impl ElementWiseKer<f32> for SSiLU4 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(x: &mut [f32], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| {
+            let sigmoid = 1.0 / (1.0 + (-*px).exp());
+            *px = *px * sigmoid;
+        });
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct HSiLU8;
+
+impl ElementWiseKer<f16> for HSiLU8 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        8
+    }
+
+    fn run(x: &mut [f16], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| {
+            let x_f32 = px.to_f32();
+            let sigmoid = 1.0 / (1.0 + (-x_f32).exp());
+            *px = f16::from_f32(x_f32 * sigmoid);
+        });
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod s {
+    silu_frame_tests!(true, f32, crate::generic::silu::SSiLU4);
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod h {
+    silu_frame_tests!(true, tract_data::internal::f16, crate::generic::silu::HSiLU8);
+}
diff --git a/linalg/src/lib.rs b/linalg/src/lib.rs
index ec1528b3ca..f64488bd8e 100644
--- a/linalg/src/lib.rs
+++ b/linalg/src/lib.rs
@@ -89,6 +89,12 @@ pub struct Ops {
     pub tanh_f16: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f16>> + Send + Sync>,
     pub tanh_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32>> + Send + Sync>,
     pub erf_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32>> + Send + Sync>,
+    pub hardswish_f16: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f16>> + Send + Sync>,
+    pub hardswish_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32>> + Send + Sync>,
+    pub silu_f16: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f16>> + Send + Sync>,
+    pub silu_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32>> + Send + Sync>,
+    pub gelu_f16: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f16>> + Send + Sync>,
+    pub gelu_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32>> + Send + Sync>,
     pub lut_u8: Box<dyn Fn(&[u8]) -> Box<dyn lut::Lut> + Send + Sync>,
 
     pub max_f16: Box<dyn Fn() -> Box<dyn reduce::Reduce<f16>> + Send + Sync>,
@@ -221,6 +227,12 @@ pub fn generic() -> Ops {
         tanh_f16: Box::new(|| generic::HTanh8::ew()),
         tanh_f32: Box::new(|| generic::STanh4::ew()),
         erf_f32: Box::new(|| generic::SErf4::ew()),
+        hardswish_f16: Box::new(|| generic::HHardSwish8::ew()),
+        hardswish_f32: Box::new(|| generic::SHardSwish4::ew()),
+        silu_f16: Box::new(|| generic::HSiLU8::ew()),
+        silu_f32: Box::new(|| generic::SSiLU4::ew()),
+        gelu_f16: Box::new(|| generic::HGelu8::ew()),
+        gelu_f32: Box::new(|| generic::SGelu4::ew()),
         lut_u8: Box::new(|table: &[u8]| Box::new(lut::LutImpl::<generic::GenericLut8>::new(table))),
         max_f16: Box::new(|| generic::reduce::max::HMax8::red()),
         max_f32: Box::new(|| generic::reduce::max::SMax4::red()),
diff --git a/linalg/src/multithread.rs b/linalg/src/multithread.rs
index 51f2f07c0a..ecf7b56708 100644
--- a/linalg/src/multithread.rs
+++ b/linalg/src/multithread.rs
@@ -1,4 +1,6 @@
 use std::cell::RefCell;
+#[cfg(feature = "multithread-mm")]
+use std::sync::atomic::{AtomicUsize, Ordering};
 #[allow(unused_imports)]
 use std::sync::{Arc, Mutex};
 
@@ -11,6 +13,16 @@ pub enum Executor {
     SingleThread,
     #[cfg(feature = "multithread-mm")]
     MultiThread(Arc<ThreadPool>),
+    /// Use rayon's GLOBAL thread pool — the one set up by
+    /// `wasm_bindgen_rayon::init_thread_pool` on `wasm32-unknown-unknown`,
+    /// or rayon's auto-initialised default on native.
+    ///
+    /// Exists because `Arc<rayon::ThreadPool>` cannot be constructed on
+    /// `wasm32-unknown-unknown`: rayon's default `spawn_handler` calls
+    /// `std::thread::spawn`, which is unsupported there. The only working
+    /// route is rayon's global pool, accessed via `into_par_iter` directly.
+    #[cfg(feature = "multithread-mm")]
+    RayonGlobal,
 }
 
 impl Executor {
@@ -55,3 +67,27 @@ pub fn multithread_tract_scope<R, F: FnOnce() -> R>(pool: Executor, f: F) -> R {
     TLS_EXECUTOR_OVERRIDE.set(previous);
     result
 }
+
+/// Threshold (in panels) below which the rayon MMM dispatcher skips
+/// parallelism and runs inline single-threaded. Below this size,
+/// per-call dispatch overhead (~5 µs native, ~50 µs wasm-bindgen-rayon
+/// worker) exceeds the parallel speedup.
+///
+/// Default `64`. Tune higher for many-small-MMM workloads (mobile vision,
+/// streaming RNN) or lower for transformer-class workloads where every MMM
+/// is large. `0` disables the gate entirely (always thread).
+#[cfg(feature = "multithread-mm")]
+static THREADING_PANEL_THRESHOLD: AtomicUsize = AtomicUsize::new(64);
+
+/// Read the current MMM panel-count threshold for the rayon path.
+#[cfg(feature = "multithread-mm")]
+pub fn current_threading_panel_threshold() -> usize {
+    THREADING_PANEL_THRESHOLD.load(Ordering::Relaxed)
+}
+
+/// Set the MMM panel-count threshold for the rayon path. Default is `64`.
+/// Pass `0` to thread regardless of size.
+#[cfg(feature = "multithread-mm")]
+pub fn set_threading_panel_threshold(panels: usize) {
+    THREADING_PANEL_THRESHOLD.store(panels, Ordering::Relaxed);
+}
diff --git a/linalg/src/wasm.rs b/linalg/src/wasm.rs
index fc6b027029..d6e2af9e50 100644
--- a/linalg/src/wasm.rs
+++ b/linalg/src/wasm.rs
@@ -11,9 +11,100 @@ use crate::mmm::FusedKerSpec;
 use crate::mmm::ImplementationQuality;
 use crate::{Ops, Scaler};
 
+#[cfg(target_feature = "relaxed-simd")]
+use crate::frame::element_wise::ElementWiseKer;
+
+// f32x4 mul+add → relaxed FMA when the build has +relaxed-simd, else explicit
+// mul+add. Lets the MMM kernels emit f32x4.relaxed_madd without duplicating
+// kernel source. Per PR #2199: LLVM does not auto-emit relaxed_madd from
+// f32x4_add(f32x4_mul(...)) even with +relaxed-simd — hand emission is needed.
+//
+// Caller must have `use std::arch::wasm32::*;` in scope (every kernel does).
+// Args are passed (acc, a, b); evaluation order differs between the two arms
+// (acc-first in baseline, acc-last in FMA), so callers must pass simple
+// variable names rather than expressions with side effects.
+#[cfg(target_feature = "relaxed-simd")]
+macro_rules! madd_f32x4 {
+    ($acc:expr, $a:expr, $b:expr) => {
+        f32x4_relaxed_madd($a, $b, $acc)
+    };
+}
+
+#[cfg(not(target_feature = "relaxed-simd"))]
+macro_rules! madd_f32x4 {
+    ($acc:expr, $a:expr, $b:expr) => {
+        f32x4_add($acc, f32x4_mul($a, $b))
+    };
+}
+
+// Always-non-fused madd. Used by kernels with ≤4 SIMD accumulators per K-step
+// (wasm_f32_4x1, _8x1, _16x1, _4x4), where the destructive `fmla.4s`
+// emitted by +relaxed-simd creates a 4-cycle accumulator RAW recurrence
+// that throttles throughput to 1 FMA/cycle even though Apple-class ARM64
+// pipes can do 4. The separate `fmul.4s; fadd.4s` form gives each multiply
+// a fresh destination register, letting the OoO renamer overlap the next
+// iteration's multiply with the in-flight add. Measured: under
+// +simd128,+relaxed-simd these kernels are 19-28% slower than under
+// +simd128 when using the fused form on Apple M1 — both wasmtime
+// (Cranelift) and Node 20 (V8) reproduce identically. Wider kernels
+// (wasm_f32_32x1 with 8 accs, wasm_f32_8x8 with 16) keep the fused form
+// because their pipe is saturated and FMA's 1-instruction-per-madd wins.
+//
+// Cross-check: XNNPACK only ships wasmrelaxedsimd-fma GEMM kernels at
+// NR=8 (i.e. ≥8 accumulator-equivalents), independently arriving at the
+// same threshold without writing it down.
+macro_rules! madd_f32x4_nofma {
+    ($acc:expr, $a:expr, $b:expr) => {
+        f32x4_add($acc, f32x4_mul($a, $b))
+    };
+}
+
 pub fn plug(ops: &mut Ops) {
     ops.mmm_impls.push(wasm_f32_4x4.mmm());
-    ops.mmm_f32 = Box::new(|_m, _k, _n| wasm_f32_4x4.mmm());
+    ops.mmm_impls.push(wasm_f32_4x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x1.mmm());
+    ops.mmm_impls.push(wasm_f32_16x1.mmm());
+    ops.mmm_impls.push(wasm_f32_32x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x8.mmm());
+    // int8 -> i32 matmul: SIMD kernel (was generic scalar). ManuallyOptimized so
+    // strategize's retain() keeps it over generic_i32_4x4 for i8 packing.
+    ops.mmm_impls.push(wasm_i32_4x4.mmm());
+    ops.qmmm_i32 = Box::new(|_, _, _| wasm_i32_4x4.mmm());
+    // Selection paths. Both rely on kernel_selection::strategize honouring
+    // the mmm_f32 / mmv_f32 callback, which it only does when the callback's
+    // kernel is tagged ManuallyOptimized. Otherwise strategize falls through
+    // to list_impls, whose retain() keeps only the top ImplementationQuality
+    // and drops every TargetOptimized kernel.
+    //   - N>1 (GEMM): mmm_f32 returns 8x8, so 8x8 MUST be ManuallyOptimized.
+    //     If it were TargetOptimized it would be dropped by retain(), and the
+    //     N>1 branch's max(nr*mr) over the surviving (ManuallyOptimized) GEMV
+    //     kernels would pick wasm_f32_32x1 — a matrix×vector kernel — for
+    //     every GEMM.
+    //   - N=1 (GEMV): mmv_f32 routes by M-band to the kernel whose MR fits.
+    //     The four GEMV kernels are ManuallyOptimized for the same reason —
+    //     without the tag strategize discards the callback and picks
+    //     max(mr)=32x1 for every M, leaving up to ~37% on the table for
+    //     small-M GEMV.
+    ops.mmm_f32 = Box::new(|_m, _k, _n| wasm_f32_8x8.mmm());
+    // Bands derived from microbench_dispatch_gemv. At each band edge, using
+    // the next-larger kernel beats halving outer iterations of the smaller
+    // one (1 outer with ILP-absorbed padding > 2 outer with kernel preamble
+    // doubled). M=4/8/16 are exact tile fits at the lower edges; M=17/9/5
+    // are the first values where the next-larger kernel wins.
+    ops.mmv_f32 = Box::new(|m, _k| match m.unwrap_or(0) {
+        0..=4 => wasm_f32_4x1.mmm(),
+        5..=8 => wasm_f32_8x1.mmm(),
+        9..=16 => wasm_f32_16x1.mmm(),
+        _ => wasm_f32_32x1.mmm(),
+    });
+    // Relaxed-SIMD activation kernels (FMA path). Only installed when the
+    // build has `+relaxed-simd`; otherwise the slots stay at the generic
+    // scalar polynomial.
+    #[cfg(target_feature = "relaxed-simd")]
+    {
+        ops.sigmoid_f32 = Box::new(|| WasmSigmoid4Relaxed::ew());
+        ops.tanh_f32 = Box::new(|| WasmTanh4Relaxed::ew());
+    }
 }
 
 unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec<f32>) -> isize {
@@ -240,10 +331,10 @@ unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec<f32>) -> isize {
                 }
                 FusedKerSpec::AddRowColProducts(rows, cols) => {
                     let cols = v128_load(cols as *const v128);
-                    ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), cols));
-                    ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), cols));
-                    ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), cols));
-                    ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), cols));
+                    ab0 = madd_f32x4_nofma!(ab0, f32x4_splat(*rows.add(0)), cols);
+                    ab1 = madd_f32x4_nofma!(ab1, f32x4_splat(*rows.add(1)), cols);
+                    ab2 = madd_f32x4_nofma!(ab2, f32x4_splat(*rows.add(2)), cols);
+                    ab3 = madd_f32x4_nofma!(ab3, f32x4_splat(*rows.add(3)), cols);
                 }
                 FusedKerSpec::Store(tile) => {
                     let mut ptr: *mut u8 = tile.ptr;
@@ -285,17 +376,2784 @@ unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec<f32>) -> isize {
                     for i in 0..k {
                         let a = std::slice::from_raw_parts(a.offset(4 * i as isize), 4);
                         let b = v128_load(b.offset(i as isize));
-                        ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(a[0]), b));
-                        ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(a[1]), b));
-                        ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(a[2]), b));
-                        ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(a[3]), b));
+                        ab0 = madd_f32x4_nofma!(ab0, f32x4_splat(a[0]), b);
+                        ab1 = madd_f32x4_nofma!(ab1, f32x4_splat(a[1]), b);
+                        ab2 = madd_f32x4_nofma!(ab2, f32x4_splat(a[2]), b);
+                        ab3 = madd_f32x4_nofma!(ab3, f32x4_splat(a[3]), b);
                     }
                 }
             }
             pnl = pnl.add(1);
         }
+        0
     }
-    0
 }
 
 MMMRustKernel!(kernel_f32_4x4 => wasm_f32_4x4<f32>(4,4)@(4,4) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 4x1 kernel — GEMV-shaped variant for matrix-vector products
+/// (single-column outputs, e.g., streaming-RNN inference where each frame's
+/// activation is a single column). Mirrors the 4x4 kernel's FusedKerSpec
+/// match arms but collapses the column dimension from 4 to 1: a single
+/// f32x4 accumulator holds 4 output rows × 1 output column packed as
+/// [ab[0], ab[1], ab[2], ab[3]].
+///
+/// Selection: tract-core's einsum kernel_selection::strategize() prefers
+/// kernels with nr() == 1 when op.n.is_one(), so this kernel is
+/// automatically picked for N=1 cases once registered.
+unsafe fn kernel_f32_4x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Single accumulator: 4 rows × 1 col, packed into one f32x4.
+        // lane[i] holds ab[i] = the output value for row i (col 0).
+        let mut ab = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    ab = f32x4_splat(0.0);
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // Tile is 4 rows × 1 col = 4 contiguous f32s = 1 v128
+                    ab = v128_load(rows as *const v128);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    ab = f32x4_min(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    ab = f32x4_max(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    ab = f32x4_add(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    ab = f32x4_mul(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    ab = f32x4_sub(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    ab = f32x4_sub(ab, f32x4_splat(a));
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let zero = f32x4_splat(0.0);
+                    let mask = f32x4_gt(ab, zero);
+                    ab = v128_bitselect(ab, f32x4_mul(f32x4_splat(a), ab), mask);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    // 4 row values, applied to ab's 4 lanes in order
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_min(r, ab);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_max(r, ab);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_add(r, ab);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_mul(r, ab);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_sub(r, ab);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_sub(ab, r);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    // Single col value broadcast to all 4 rows
+                    ab = f32x4_min(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    ab = f32x4_max(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    ab = f32x4_add(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    ab = f32x4_mul(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    ab = f32x4_sub(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    ab = f32x4_sub(ab, f32x4_splat(*cols));
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    ab = f32x4_mul(f32x4_splat(scaler.scale), ab);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab = f32x4_mul(s, ab);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab = f32x4_mul(s, ab);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 4 rows × 1 col, with row_byte_stride between rows (col_stride irrelevant for N=1)
+                    let mut ptr: *const u8 = tile.ptr;
+                    let m0 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m1 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m2 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m3 = *(ptr as *const f32);
+                    ab = f32x4_add(ab, f32x4(m0, m1, m2, m3));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    // ab[i] += rows[i] * cols[0]  (cols[0] is the single col)
+                    let r = v128_load(rows as *const v128);
+                    let c = f32x4_splat(*cols);
+                    ab = madd_f32x4_nofma!(ab, r, c);
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 4 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab);
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A is packed [k][MR=4]: each k iter loads 4 contiguous f32s = 1 v128.
+                    // B is packed [k][NR=1]: each k iter loads 1 scalar f32, broadcast.
+                    // ab[i] += a[i] * b for all i in 0..4 → SIMD: ab += a_vec * b_splat
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a_vec = v128_load(a.offset(i as isize));
+                        let b_splat = f32x4_splat(*b.offset(i as isize));
+                        ab = madd_f32x4_nofma!(ab, a_vec, b_splat);
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+// ManuallyOptimized so kernel_selection::strategize honours the M-band
+// dispatch in mmv_f32 below. See module-level comment on plug().
+MMMRustKernel!(kernel_f32_4x1 => wasm_f32_4x1<f32>(4,1)@(4,1) quality(ImplementationQuality::ManuallyOptimized));
+
+/// WASM SIMD f32 8x1 kernel — wider GEMV variant for matrix-vector products
+/// on large M. Uses TWO independent f32x4 accumulators (rows 0-3 in ab_top,
+/// rows 4-7 in ab_bot), enabling 2-way ILP within each k-iteration:
+/// the inner loop issues two independent f32x4_add(f32x4_mul(...)) ops per
+/// k-step, breaking the data-dependency chain depth from K to ~K/2 at the
+/// hardware pipeline level.
+///
+/// Compared to wasm_f32_4x1 (1 accumulator, k-serial dep chain), this is
+/// targeted at GEMV ops where M is a multiple of 8 (or close to it). For
+/// M=256 GRU gate matmuls (the dominant GEMV in DFN3), this should yield
+/// ~2x speedup on the inner loop on hardware where SIMD FMLA throughput
+/// exceeds 1 op/cycle.
+///
+/// Selection: `kernel_selection::strategize()` prefers max mr() for n=1
+/// cases, so this kernel automatically wins over wasm_f32_4x1 for all N=1
+/// ops once registered (including small-M cases where it slightly wastes
+/// rows — for M=1 lsnr_fc-style ops, that's 7-of-8 row waste, but those
+/// ops are <1% of frame so the regression is noise).
+unsafe fn kernel_f32_8x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Two accumulators: 8 rows × 1 col packed as [ab_top, ab_bot]
+        // ab_top.lane[i] holds row i (i in 0..4); ab_bot.lane[i] holds row i+4
+        let mut ab_top = f32x4_splat(0.0);
+        let mut ab_bot = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    ab_top = f32x4_splat(0.0);
+                    ab_bot = f32x4_splat(0.0);
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // 8 rows × 1 col = 8 contiguous f32 = 2 v128
+                    let p = rows as *const v128;
+                    ab_top = *p;
+                    ab_bot = *p.add(1);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_min(s, ab_top);
+                    ab_bot = f32x4_min(s, ab_bot);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_max(s, ab_top);
+                    ab_bot = f32x4_max(s, ab_bot);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_add(s, ab_top);
+                    ab_bot = f32x4_add(s, ab_bot);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_sub(s, ab_top);
+                    ab_bot = f32x4_sub(s, ab_bot);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_sub(ab_top, s);
+                    ab_bot = f32x4_sub(ab_bot, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let mask_t = f32x4_gt(ab_top, zero);
+                    let mask_b = f32x4_gt(ab_bot, zero);
+                    ab_top = v128_bitselect(ab_top, f32x4_mul(s, ab_top), mask_t);
+                    ab_bot = v128_bitselect(ab_bot, f32x4_mul(s, ab_bot), mask_b);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_min(r_t, ab_top);
+                    ab_bot = f32x4_min(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_max(r_t, ab_top);
+                    ab_bot = f32x4_max(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_add(r_t, ab_top);
+                    ab_bot = f32x4_add(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_mul(r_t, ab_top);
+                    ab_bot = f32x4_mul(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_sub(r_t, ab_top);
+                    ab_bot = f32x4_sub(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_sub(ab_top, r_t);
+                    ab_bot = f32x4_sub(ab_bot, r_b);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_min(c, ab_top);
+                    ab_bot = f32x4_min(c, ab_bot);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_max(c, ab_top);
+                    ab_bot = f32x4_max(c, ab_bot);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_add(c, ab_top);
+                    ab_bot = f32x4_add(c, ab_bot);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_mul(c, ab_top);
+                    ab_bot = f32x4_mul(c, ab_bot);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_sub(c, ab_top);
+                    ab_bot = f32x4_sub(c, ab_bot);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_sub(ab_top, c);
+                    ab_bot = f32x4_sub(ab_bot, c);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 8 rows × 1 col, stride is row_byte_stride between rows
+                    let mut ptr: *const u8 = tile.ptr;
+                    let m0 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m1 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m2 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m3 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m4 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m5 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m6 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m7 = *(ptr as *const f32);
+                    ab_top = f32x4_add(ab_top, f32x4(m0, m1, m2, m3));
+                    ab_bot = f32x4_add(ab_bot, f32x4(m4, m5, m6, m7));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = rows as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    let c = f32x4_splat(*cols);
+                    ab_top = madd_f32x4_nofma!(ab_top, r_t, c);
+                    ab_bot = madd_f32x4_nofma!(ab_bot, r_b, c);
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 8 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_bot);
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=8] = each k iter loads 8 f32 = 2 v128
+                    // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                    // The two fmadd ops on (ab_top, ab_bot) are independent — 2-way ILP per iter.
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a_t = v128_load(a.offset((2 * i) as isize));
+                        let a_b = v128_load(a.offset((2 * i + 1) as isize));
+                        let b_splat = f32x4_splat(*b.offset(i as isize));
+                        ab_top = madd_f32x4_nofma!(ab_top, a_t, b_splat);
+                        ab_bot = madd_f32x4_nofma!(ab_bot, a_b, b_splat);
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_8x1 => wasm_f32_8x1<f32>(8,1)@(8,1) quality(ImplementationQuality::ManuallyOptimized));
+
+/// WASM SIMD f32 16x1 kernel — wider GEMV variant for matrix-vector products
+/// on very large M. Uses FOUR independent f32x4 accumulators (rows 0-3,
+/// 4-7, 8-11, 12-15), enabling 4-way ILP within each k-iteration.
+///
+/// Compared to wasm_f32_8x1 (2 accumulators, 2-way ILP), this exposes more
+/// parallel work to the SIMD pipelines, beneficial on hardware with 3+
+/// SIMD execution units (most modern ARM and x86).
+unsafe fn kernel_f32_16x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Four accumulators: 16 rows × 1 col packed as [ab_q0, ab_q1, ab_q2, ab_q3]
+        // ab_q0 = rows 0-3, ab_q1 = rows 4-7, ab_q2 = rows 8-11, ab_q3 = rows 12-15
+        let mut ab_q0 = f32x4_splat(0.0);
+        let mut ab_q1 = f32x4_splat(0.0);
+        let mut ab_q2 = f32x4_splat(0.0);
+        let mut ab_q3 = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let z = f32x4_splat(0.0);
+                    ab_q0 = z;
+                    ab_q1 = z;
+                    ab_q2 = z;
+                    ab_q3 = z;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    let p = rows as *const v128;
+                    ab_q0 = *p;
+                    ab_q1 = *p.add(1);
+                    ab_q2 = *p.add(2);
+                    ab_q3 = *p.add(3);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_min(s, ab_q0);
+                    ab_q1 = f32x4_min(s, ab_q1);
+                    ab_q2 = f32x4_min(s, ab_q2);
+                    ab_q3 = f32x4_min(s, ab_q3);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_max(s, ab_q0);
+                    ab_q1 = f32x4_max(s, ab_q1);
+                    ab_q2 = f32x4_max(s, ab_q2);
+                    ab_q3 = f32x4_max(s, ab_q3);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_add(s, ab_q0);
+                    ab_q1 = f32x4_add(s, ab_q1);
+                    ab_q2 = f32x4_add(s, ab_q2);
+                    ab_q3 = f32x4_add(s, ab_q3);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_sub(s, ab_q0);
+                    ab_q1 = f32x4_sub(s, ab_q1);
+                    ab_q2 = f32x4_sub(s, ab_q2);
+                    ab_q3 = f32x4_sub(s, ab_q3);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_sub(ab_q0, s);
+                    ab_q1 = f32x4_sub(ab_q1, s);
+                    ab_q2 = f32x4_sub(ab_q2, s);
+                    ab_q3 = f32x4_sub(ab_q3, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let m0 = f32x4_gt(ab_q0, zero);
+                    ab_q0 = v128_bitselect(ab_q0, f32x4_mul(s, ab_q0), m0);
+                    let m1 = f32x4_gt(ab_q1, zero);
+                    ab_q1 = v128_bitselect(ab_q1, f32x4_mul(s, ab_q1), m1);
+                    let m2 = f32x4_gt(ab_q2, zero);
+                    ab_q2 = v128_bitselect(ab_q2, f32x4_mul(s, ab_q2), m2);
+                    let m3 = f32x4_gt(ab_q3, zero);
+                    ab_q3 = v128_bitselect(ab_q3, f32x4_mul(s, ab_q3), m3);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_min(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_min(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_min(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_min(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_max(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_max(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_max(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_max(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_add(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_add(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_add(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_add(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_mul(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_mul(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_mul(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_mul(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_sub(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_sub(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_sub(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_sub(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_sub(ab_q0, v128_load(p));
+                    ab_q1 = f32x4_sub(ab_q1, v128_load(p.add(1)));
+                    ab_q2 = f32x4_sub(ab_q2, v128_load(p.add(2)));
+                    ab_q3 = f32x4_sub(ab_q3, v128_load(p.add(3)));
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_min(c, ab_q0);
+                    ab_q1 = f32x4_min(c, ab_q1);
+                    ab_q2 = f32x4_min(c, ab_q2);
+                    ab_q3 = f32x4_min(c, ab_q3);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_max(c, ab_q0);
+                    ab_q1 = f32x4_max(c, ab_q1);
+                    ab_q2 = f32x4_max(c, ab_q2);
+                    ab_q3 = f32x4_max(c, ab_q3);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_add(c, ab_q0);
+                    ab_q1 = f32x4_add(c, ab_q1);
+                    ab_q2 = f32x4_add(c, ab_q2);
+                    ab_q3 = f32x4_add(c, ab_q3);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_mul(c, ab_q0);
+                    ab_q1 = f32x4_mul(c, ab_q1);
+                    ab_q2 = f32x4_mul(c, ab_q2);
+                    ab_q3 = f32x4_mul(c, ab_q3);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_sub(c, ab_q0);
+                    ab_q1 = f32x4_sub(c, ab_q1);
+                    ab_q2 = f32x4_sub(c, ab_q2);
+                    ab_q3 = f32x4_sub(c, ab_q3);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_sub(ab_q0, c);
+                    ab_q1 = f32x4_sub(ab_q1, c);
+                    ab_q2 = f32x4_sub(ab_q2, c);
+                    ab_q3 = f32x4_sub(ab_q3, c);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 16 rows × 1 col, with row_byte_stride between rows
+                    let mut ptr: *const u8 = tile.ptr;
+                    let mut ms = [0f32; 16];
+                    for i in 0..16 {
+                        ms[i] = *(ptr as *const f32);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                    ab_q0 = f32x4_add(ab_q0, f32x4(ms[0], ms[1], ms[2], ms[3]));
+                    ab_q1 = f32x4_add(ab_q1, f32x4(ms[4], ms[5], ms[6], ms[7]));
+                    ab_q2 = f32x4_add(ab_q2, f32x4(ms[8], ms[9], ms[10], ms[11]));
+                    ab_q3 = f32x4_add(ab_q3, f32x4(ms[12], ms[13], ms[14], ms[15]));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = rows as *const v128;
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = madd_f32x4_nofma!(ab_q0, v128_load(p), c);
+                    ab_q1 = madd_f32x4_nofma!(ab_q1, v128_load(p.add(1)), c);
+                    ab_q2 = madd_f32x4_nofma!(ab_q2, v128_load(p.add(2)), c);
+                    ab_q3 = madd_f32x4_nofma!(ab_q3, v128_load(p.add(3)), c);
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 16 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    for ab in [ab_q0, ab_q1, ab_q2, ab_q3].iter() {
+                        *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<1>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<2>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<3>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=16] = each k iter loads 16 f32 = 4 v128
+                    // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                    // 4 INDEPENDENT fmadds per k-iter — 4-way ILP
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a0 = v128_load(a.offset((4 * i) as isize));
+                        let a1 = v128_load(a.offset((4 * i + 1) as isize));
+                        let a2 = v128_load(a.offset((4 * i + 2) as isize));
+                        let a3 = v128_load(a.offset((4 * i + 3) as isize));
+                        let bs = f32x4_splat(*b.offset(i as isize));
+                        ab_q0 = madd_f32x4_nofma!(ab_q0, a0, bs);
+                        ab_q1 = madd_f32x4_nofma!(ab_q1, a1, bs);
+                        ab_q2 = madd_f32x4_nofma!(ab_q2, a2, bs);
+                        ab_q3 = madd_f32x4_nofma!(ab_q3, a3, bs);
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_16x1 => wasm_f32_16x1<f32>(16,1)@(16,1) quality(ImplementationQuality::ManuallyOptimized));
+
+/// WASM SIMD f32 32x1 kernel — widest GEMV variant for matrix-vector products
+/// on very large M. Uses EIGHT independent f32x4 accumulators (rows 0-3, 4-7,
+/// 8-11, 12-15, 16-19, 20-23, 24-27, 28-31), enabling 8-way ILP within each
+/// k-iteration.
+///
+/// Compared to wasm_f32_16x1 (4 accumulators, 4-way ILP), this halves the
+/// per-call dispatch overhead for M=256 GRU gates (8 calls instead of 16),
+/// and exposes 8 independent fmadd dependency chains. On hosts with 16+
+/// physical SIMD registers (x86_64 has 16 xmm, ARM64 has 32 NEON), the 8
+/// accumulators fit without spilling. Mirrors `apple_amx_mmm_f32_32x1` MR.
+///
+/// Selection: `kernel_selection::strategize()` prefers max mr() for n=1
+/// cases, so this kernel automatically wins over wasm_f32_16x1 for M >= 32.
+unsafe fn kernel_f32_32x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Eight accumulators: 32 rows × 1 col packed as [ab_q0..ab_q7]
+        // ab_q0 = rows 0-3, ab_q1 = rows 4-7, ..., ab_q7 = rows 28-31
+        let mut ab_q0 = f32x4_splat(0.0);
+        let mut ab_q1 = f32x4_splat(0.0);
+        let mut ab_q2 = f32x4_splat(0.0);
+        let mut ab_q3 = f32x4_splat(0.0);
+        let mut ab_q4 = f32x4_splat(0.0);
+        let mut ab_q5 = f32x4_splat(0.0);
+        let mut ab_q6 = f32x4_splat(0.0);
+        let mut ab_q7 = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let z = f32x4_splat(0.0);
+                    ab_q0 = z;
+                    ab_q1 = z;
+                    ab_q2 = z;
+                    ab_q3 = z;
+                    ab_q4 = z;
+                    ab_q5 = z;
+                    ab_q6 = z;
+                    ab_q7 = z;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    let p = rows as *const v128;
+                    ab_q0 = *p;
+                    ab_q1 = *p.add(1);
+                    ab_q2 = *p.add(2);
+                    ab_q3 = *p.add(3);
+                    ab_q4 = *p.add(4);
+                    ab_q5 = *p.add(5);
+                    ab_q6 = *p.add(6);
+                    ab_q7 = *p.add(7);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_min(s, ab_q0);
+                    ab_q1 = f32x4_min(s, ab_q1);
+                    ab_q2 = f32x4_min(s, ab_q2);
+                    ab_q3 = f32x4_min(s, ab_q3);
+                    ab_q4 = f32x4_min(s, ab_q4);
+                    ab_q5 = f32x4_min(s, ab_q5);
+                    ab_q6 = f32x4_min(s, ab_q6);
+                    ab_q7 = f32x4_min(s, ab_q7);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_max(s, ab_q0);
+                    ab_q1 = f32x4_max(s, ab_q1);
+                    ab_q2 = f32x4_max(s, ab_q2);
+                    ab_q3 = f32x4_max(s, ab_q3);
+                    ab_q4 = f32x4_max(s, ab_q4);
+                    ab_q5 = f32x4_max(s, ab_q5);
+                    ab_q6 = f32x4_max(s, ab_q6);
+                    ab_q7 = f32x4_max(s, ab_q7);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_add(s, ab_q0);
+                    ab_q1 = f32x4_add(s, ab_q1);
+                    ab_q2 = f32x4_add(s, ab_q2);
+                    ab_q3 = f32x4_add(s, ab_q3);
+                    ab_q4 = f32x4_add(s, ab_q4);
+                    ab_q5 = f32x4_add(s, ab_q5);
+                    ab_q6 = f32x4_add(s, ab_q6);
+                    ab_q7 = f32x4_add(s, ab_q7);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                    ab_q4 = f32x4_mul(s, ab_q4);
+                    ab_q5 = f32x4_mul(s, ab_q5);
+                    ab_q6 = f32x4_mul(s, ab_q6);
+                    ab_q7 = f32x4_mul(s, ab_q7);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_sub(s, ab_q0);
+                    ab_q1 = f32x4_sub(s, ab_q1);
+                    ab_q2 = f32x4_sub(s, ab_q2);
+                    ab_q3 = f32x4_sub(s, ab_q3);
+                    ab_q4 = f32x4_sub(s, ab_q4);
+                    ab_q5 = f32x4_sub(s, ab_q5);
+                    ab_q6 = f32x4_sub(s, ab_q6);
+                    ab_q7 = f32x4_sub(s, ab_q7);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_sub(ab_q0, s);
+                    ab_q1 = f32x4_sub(ab_q1, s);
+                    ab_q2 = f32x4_sub(ab_q2, s);
+                    ab_q3 = f32x4_sub(ab_q3, s);
+                    ab_q4 = f32x4_sub(ab_q4, s);
+                    ab_q5 = f32x4_sub(ab_q5, s);
+                    ab_q6 = f32x4_sub(ab_q6, s);
+                    ab_q7 = f32x4_sub(ab_q7, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let m0 = f32x4_gt(ab_q0, zero);
+                    ab_q0 = v128_bitselect(ab_q0, f32x4_mul(s, ab_q0), m0);
+                    let m1 = f32x4_gt(ab_q1, zero);
+                    ab_q1 = v128_bitselect(ab_q1, f32x4_mul(s, ab_q1), m1);
+                    let m2 = f32x4_gt(ab_q2, zero);
+                    ab_q2 = v128_bitselect(ab_q2, f32x4_mul(s, ab_q2), m2);
+                    let m3 = f32x4_gt(ab_q3, zero);
+                    ab_q3 = v128_bitselect(ab_q3, f32x4_mul(s, ab_q3), m3);
+                    let m4 = f32x4_gt(ab_q4, zero);
+                    ab_q4 = v128_bitselect(ab_q4, f32x4_mul(s, ab_q4), m4);
+                    let m5 = f32x4_gt(ab_q5, zero);
+                    ab_q5 = v128_bitselect(ab_q5, f32x4_mul(s, ab_q5), m5);
+                    let m6 = f32x4_gt(ab_q6, zero);
+                    ab_q6 = v128_bitselect(ab_q6, f32x4_mul(s, ab_q6), m6);
+                    let m7 = f32x4_gt(ab_q7, zero);
+                    ab_q7 = v128_bitselect(ab_q7, f32x4_mul(s, ab_q7), m7);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_min(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_min(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_min(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_min(v128_load(p.add(3)), ab_q3);
+                    ab_q4 = f32x4_min(v128_load(p.add(4)), ab_q4);
+                    ab_q5 = f32x4_min(v128_load(p.add(5)), ab_q5);
+                    ab_q6 = f32x4_min(v128_load(p.add(6)), ab_q6);
+                    ab_q7 = f32x4_min(v128_load(p.add(7)), ab_q7);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_max(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_max(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_max(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_max(v128_load(p.add(3)), ab_q3);
+                    ab_q4 = f32x4_max(v128_load(p.add(4)), ab_q4);
+                    ab_q5 = f32x4_max(v128_load(p.add(5)), ab_q5);
+                    ab_q6 = f32x4_max(v128_load(p.add(6)), ab_q6);
+                    ab_q7 = f32x4_max(v128_load(p.add(7)), ab_q7);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_add(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_add(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_add(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_add(v128_load(p.add(3)), ab_q3);
+                    ab_q4 = f32x4_add(v128_load(p.add(4)), ab_q4);
+                    ab_q5 = f32x4_add(v128_load(p.add(5)), ab_q5);
+                    ab_q6 = f32x4_add(v128_load(p.add(6)), ab_q6);
+                    ab_q7 = f32x4_add(v128_load(p.add(7)), ab_q7);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_mul(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_mul(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_mul(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_mul(v128_load(p.add(3)), ab_q3);
+                    ab_q4 = f32x4_mul(v128_load(p.add(4)), ab_q4);
+                    ab_q5 = f32x4_mul(v128_load(p.add(5)), ab_q5);
+                    ab_q6 = f32x4_mul(v128_load(p.add(6)), ab_q6);
+                    ab_q7 = f32x4_mul(v128_load(p.add(7)), ab_q7);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_sub(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_sub(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_sub(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_sub(v128_load(p.add(3)), ab_q3);
+                    ab_q4 = f32x4_sub(v128_load(p.add(4)), ab_q4);
+                    ab_q5 = f32x4_sub(v128_load(p.add(5)), ab_q5);
+                    ab_q6 = f32x4_sub(v128_load(p.add(6)), ab_q6);
+                    ab_q7 = f32x4_sub(v128_load(p.add(7)), ab_q7);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_sub(ab_q0, v128_load(p));
+                    ab_q1 = f32x4_sub(ab_q1, v128_load(p.add(1)));
+                    ab_q2 = f32x4_sub(ab_q2, v128_load(p.add(2)));
+                    ab_q3 = f32x4_sub(ab_q3, v128_load(p.add(3)));
+                    ab_q4 = f32x4_sub(ab_q4, v128_load(p.add(4)));
+                    ab_q5 = f32x4_sub(ab_q5, v128_load(p.add(5)));
+                    ab_q6 = f32x4_sub(ab_q6, v128_load(p.add(6)));
+                    ab_q7 = f32x4_sub(ab_q7, v128_load(p.add(7)));
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_min(c, ab_q0);
+                    ab_q1 = f32x4_min(c, ab_q1);
+                    ab_q2 = f32x4_min(c, ab_q2);
+                    ab_q3 = f32x4_min(c, ab_q3);
+                    ab_q4 = f32x4_min(c, ab_q4);
+                    ab_q5 = f32x4_min(c, ab_q5);
+                    ab_q6 = f32x4_min(c, ab_q6);
+                    ab_q7 = f32x4_min(c, ab_q7);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_max(c, ab_q0);
+                    ab_q1 = f32x4_max(c, ab_q1);
+                    ab_q2 = f32x4_max(c, ab_q2);
+                    ab_q3 = f32x4_max(c, ab_q3);
+                    ab_q4 = f32x4_max(c, ab_q4);
+                    ab_q5 = f32x4_max(c, ab_q5);
+                    ab_q6 = f32x4_max(c, ab_q6);
+                    ab_q7 = f32x4_max(c, ab_q7);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_add(c, ab_q0);
+                    ab_q1 = f32x4_add(c, ab_q1);
+                    ab_q2 = f32x4_add(c, ab_q2);
+                    ab_q3 = f32x4_add(c, ab_q3);
+                    ab_q4 = f32x4_add(c, ab_q4);
+                    ab_q5 = f32x4_add(c, ab_q5);
+                    ab_q6 = f32x4_add(c, ab_q6);
+                    ab_q7 = f32x4_add(c, ab_q7);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_mul(c, ab_q0);
+                    ab_q1 = f32x4_mul(c, ab_q1);
+                    ab_q2 = f32x4_mul(c, ab_q2);
+                    ab_q3 = f32x4_mul(c, ab_q3);
+                    ab_q4 = f32x4_mul(c, ab_q4);
+                    ab_q5 = f32x4_mul(c, ab_q5);
+                    ab_q6 = f32x4_mul(c, ab_q6);
+                    ab_q7 = f32x4_mul(c, ab_q7);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_sub(c, ab_q0);
+                    ab_q1 = f32x4_sub(c, ab_q1);
+                    ab_q2 = f32x4_sub(c, ab_q2);
+                    ab_q3 = f32x4_sub(c, ab_q3);
+                    ab_q4 = f32x4_sub(c, ab_q4);
+                    ab_q5 = f32x4_sub(c, ab_q5);
+                    ab_q6 = f32x4_sub(c, ab_q6);
+                    ab_q7 = f32x4_sub(c, ab_q7);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_sub(ab_q0, c);
+                    ab_q1 = f32x4_sub(ab_q1, c);
+                    ab_q2 = f32x4_sub(ab_q2, c);
+                    ab_q3 = f32x4_sub(ab_q3, c);
+                    ab_q4 = f32x4_sub(ab_q4, c);
+                    ab_q5 = f32x4_sub(ab_q5, c);
+                    ab_q6 = f32x4_sub(ab_q6, c);
+                    ab_q7 = f32x4_sub(ab_q7, c);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                    ab_q4 = f32x4_mul(s, ab_q4);
+                    ab_q5 = f32x4_mul(s, ab_q5);
+                    ab_q6 = f32x4_mul(s, ab_q6);
+                    ab_q7 = f32x4_mul(s, ab_q7);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                    ab_q4 = f32x4_mul(s, ab_q4);
+                    ab_q5 = f32x4_mul(s, ab_q5);
+                    ab_q6 = f32x4_mul(s, ab_q6);
+                    ab_q7 = f32x4_mul(s, ab_q7);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                    ab_q4 = f32x4_mul(s, ab_q4);
+                    ab_q5 = f32x4_mul(s, ab_q5);
+                    ab_q6 = f32x4_mul(s, ab_q6);
+                    ab_q7 = f32x4_mul(s, ab_q7);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 32 rows × 1 col, with row_byte_stride between rows
+                    let mut ptr: *const u8 = tile.ptr;
+                    let mut ms = [0f32; 32];
+                    for i in 0..32 {
+                        ms[i] = *(ptr as *const f32);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                    ab_q0 = f32x4_add(ab_q0, f32x4(ms[0], ms[1], ms[2], ms[3]));
+                    ab_q1 = f32x4_add(ab_q1, f32x4(ms[4], ms[5], ms[6], ms[7]));
+                    ab_q2 = f32x4_add(ab_q2, f32x4(ms[8], ms[9], ms[10], ms[11]));
+                    ab_q3 = f32x4_add(ab_q3, f32x4(ms[12], ms[13], ms[14], ms[15]));
+                    ab_q4 = f32x4_add(ab_q4, f32x4(ms[16], ms[17], ms[18], ms[19]));
+                    ab_q5 = f32x4_add(ab_q5, f32x4(ms[20], ms[21], ms[22], ms[23]));
+                    ab_q6 = f32x4_add(ab_q6, f32x4(ms[24], ms[25], ms[26], ms[27]));
+                    ab_q7 = f32x4_add(ab_q7, f32x4(ms[28], ms[29], ms[30], ms[31]));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = rows as *const v128;
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = madd_f32x4!(ab_q0, v128_load(p), c);
+                    ab_q1 = madd_f32x4!(ab_q1, v128_load(p.add(1)), c);
+                    ab_q2 = madd_f32x4!(ab_q2, v128_load(p.add(2)), c);
+                    ab_q3 = madd_f32x4!(ab_q3, v128_load(p.add(3)), c);
+                    ab_q4 = madd_f32x4!(ab_q4, v128_load(p.add(4)), c);
+                    ab_q5 = madd_f32x4!(ab_q5, v128_load(p.add(5)), c);
+                    ab_q6 = madd_f32x4!(ab_q6, v128_load(p.add(6)), c);
+                    ab_q7 = madd_f32x4!(ab_q7, v128_load(p.add(7)), c);
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 32 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    for ab in [ab_q0, ab_q1, ab_q2, ab_q3, ab_q4, ab_q5, ab_q6, ab_q7].iter() {
+                        *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<1>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<2>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<3>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=32] = each k iter loads 32 f32 = 8 v128
+                    // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                    // 8 INDEPENDENT fmadds per k-iter — 8-way ILP
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a0 = v128_load(a.offset((8 * i) as isize));
+                        let a1 = v128_load(a.offset((8 * i + 1) as isize));
+                        let a2 = v128_load(a.offset((8 * i + 2) as isize));
+                        let a3 = v128_load(a.offset((8 * i + 3) as isize));
+                        let a4 = v128_load(a.offset((8 * i + 4) as isize));
+                        let a5 = v128_load(a.offset((8 * i + 5) as isize));
+                        let a6 = v128_load(a.offset((8 * i + 6) as isize));
+                        let a7 = v128_load(a.offset((8 * i + 7) as isize));
+                        let bs = f32x4_splat(*b.offset(i as isize));
+                        ab_q0 = madd_f32x4!(ab_q0, a0, bs);
+                        ab_q1 = madd_f32x4!(ab_q1, a1, bs);
+                        ab_q2 = madd_f32x4!(ab_q2, a2, bs);
+                        ab_q3 = madd_f32x4!(ab_q3, a3, bs);
+                        ab_q4 = madd_f32x4!(ab_q4, a4, bs);
+                        ab_q5 = madd_f32x4!(ab_q5, a5, bs);
+                        ab_q6 = madd_f32x4!(ab_q6, a6, bs);
+                        ab_q7 = madd_f32x4!(ab_q7, a7, bs);
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_32x1 => wasm_f32_32x1<f32>(32,1)@(32,1) quality(ImplementationQuality::ManuallyOptimized));
+
+/// WASM SIMD f32 8x8 kernel — wide MM tile (8 rows × 8 cols, 16 v128 accumulators).
+/// Each row uses 2 v128: cols 0-3 in `_lo`, cols 4-7 in `_hi`. 16 accumulators
+/// is at the limit of WASM's 16 logical SIMD register slots; this tests the
+/// register-pressure boundary. For DFN3 ops, all M and N are multiples of 8,
+/// so 8x8 fits cleanly with no padding waste.
+unsafe fn kernel_f32_8x8(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // 8 rows × 8 cols = 16 f32x4 accumulators (cols 0-3 in _lo, cols 4-7 in _hi)
+        let mut a0lo = f32x4_splat(0.0);
+        let mut a0hi = f32x4_splat(0.0);
+        let mut a1lo = f32x4_splat(0.0);
+        let mut a1hi = f32x4_splat(0.0);
+        let mut a2lo = f32x4_splat(0.0);
+        let mut a2hi = f32x4_splat(0.0);
+        let mut a3lo = f32x4_splat(0.0);
+        let mut a3hi = f32x4_splat(0.0);
+        let mut a4lo = f32x4_splat(0.0);
+        let mut a4hi = f32x4_splat(0.0);
+        let mut a5lo = f32x4_splat(0.0);
+        let mut a5hi = f32x4_splat(0.0);
+        let mut a6lo = f32x4_splat(0.0);
+        let mut a6hi = f32x4_splat(0.0);
+        let mut a7lo = f32x4_splat(0.0);
+        let mut a7hi = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let z = f32x4_splat(0.0);
+                    a0lo = z;
+                    a0hi = z;
+                    a1lo = z;
+                    a1hi = z;
+                    a2lo = z;
+                    a2hi = z;
+                    a3lo = z;
+                    a3hi = z;
+                    a4lo = z;
+                    a4hi = z;
+                    a5lo = z;
+                    a5hi = z;
+                    a6lo = z;
+                    a6hi = z;
+                    a7lo = z;
+                    a7hi = z;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // 8 rows × 8 cols = 16 v128 (2 per row, contiguous lo+hi)
+                    let p = rows as *const v128;
+                    a0lo = *p.add(0);
+                    a0hi = *p.add(1);
+                    a1lo = *p.add(2);
+                    a1hi = *p.add(3);
+                    a2lo = *p.add(4);
+                    a2hi = *p.add(5);
+                    a3lo = *p.add(6);
+                    a3hi = *p.add(7);
+                    a4lo = *p.add(8);
+                    a4hi = *p.add(9);
+                    a5lo = *p.add(10);
+                    a5hi = *p.add(11);
+                    a6lo = *p.add(12);
+                    a6hi = *p.add(13);
+                    a7lo = *p.add(14);
+                    a7hi = *p.add(15);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_min(s, a0lo);
+                    a0hi = f32x4_min(s, a0hi);
+                    a1lo = f32x4_min(s, a1lo);
+                    a1hi = f32x4_min(s, a1hi);
+                    a2lo = f32x4_min(s, a2lo);
+                    a2hi = f32x4_min(s, a2hi);
+                    a3lo = f32x4_min(s, a3lo);
+                    a3hi = f32x4_min(s, a3hi);
+                    a4lo = f32x4_min(s, a4lo);
+                    a4hi = f32x4_min(s, a4hi);
+                    a5lo = f32x4_min(s, a5lo);
+                    a5hi = f32x4_min(s, a5hi);
+                    a6lo = f32x4_min(s, a6lo);
+                    a6hi = f32x4_min(s, a6hi);
+                    a7lo = f32x4_min(s, a7lo);
+                    a7hi = f32x4_min(s, a7hi);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_max(s, a0lo);
+                    a0hi = f32x4_max(s, a0hi);
+                    a1lo = f32x4_max(s, a1lo);
+                    a1hi = f32x4_max(s, a1hi);
+                    a2lo = f32x4_max(s, a2lo);
+                    a2hi = f32x4_max(s, a2hi);
+                    a3lo = f32x4_max(s, a3lo);
+                    a3hi = f32x4_max(s, a3hi);
+                    a4lo = f32x4_max(s, a4lo);
+                    a4hi = f32x4_max(s, a4hi);
+                    a5lo = f32x4_max(s, a5lo);
+                    a5hi = f32x4_max(s, a5hi);
+                    a6lo = f32x4_max(s, a6lo);
+                    a6hi = f32x4_max(s, a6hi);
+                    a7lo = f32x4_max(s, a7lo);
+                    a7hi = f32x4_max(s, a7hi);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_add(s, a0lo);
+                    a0hi = f32x4_add(s, a0hi);
+                    a1lo = f32x4_add(s, a1lo);
+                    a1hi = f32x4_add(s, a1hi);
+                    a2lo = f32x4_add(s, a2lo);
+                    a2hi = f32x4_add(s, a2hi);
+                    a3lo = f32x4_add(s, a3lo);
+                    a3hi = f32x4_add(s, a3hi);
+                    a4lo = f32x4_add(s, a4lo);
+                    a4hi = f32x4_add(s, a4hi);
+                    a5lo = f32x4_add(s, a5lo);
+                    a5hi = f32x4_add(s, a5hi);
+                    a6lo = f32x4_add(s, a6lo);
+                    a6hi = f32x4_add(s, a6hi);
+                    a7lo = f32x4_add(s, a7lo);
+                    a7hi = f32x4_add(s, a7hi);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_sub(s, a0lo);
+                    a0hi = f32x4_sub(s, a0hi);
+                    a1lo = f32x4_sub(s, a1lo);
+                    a1hi = f32x4_sub(s, a1hi);
+                    a2lo = f32x4_sub(s, a2lo);
+                    a2hi = f32x4_sub(s, a2hi);
+                    a3lo = f32x4_sub(s, a3lo);
+                    a3hi = f32x4_sub(s, a3hi);
+                    a4lo = f32x4_sub(s, a4lo);
+                    a4hi = f32x4_sub(s, a4hi);
+                    a5lo = f32x4_sub(s, a5lo);
+                    a5hi = f32x4_sub(s, a5hi);
+                    a6lo = f32x4_sub(s, a6lo);
+                    a6hi = f32x4_sub(s, a6hi);
+                    a7lo = f32x4_sub(s, a7lo);
+                    a7hi = f32x4_sub(s, a7hi);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_sub(a0lo, s);
+                    a0hi = f32x4_sub(a0hi, s);
+                    a1lo = f32x4_sub(a1lo, s);
+                    a1hi = f32x4_sub(a1hi, s);
+                    a2lo = f32x4_sub(a2lo, s);
+                    a2hi = f32x4_sub(a2hi, s);
+                    a3lo = f32x4_sub(a3lo, s);
+                    a3hi = f32x4_sub(a3hi, s);
+                    a4lo = f32x4_sub(a4lo, s);
+                    a4hi = f32x4_sub(a4hi, s);
+                    a5lo = f32x4_sub(a5lo, s);
+                    a5hi = f32x4_sub(a5hi, s);
+                    a6lo = f32x4_sub(a6lo, s);
+                    a6hi = f32x4_sub(a6hi, s);
+                    a7lo = f32x4_sub(a7lo, s);
+                    a7hi = f32x4_sub(a7hi, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let m0a = f32x4_gt(a0lo, zero);
+                    a0lo = v128_bitselect(a0lo, f32x4_mul(s, a0lo), m0a);
+                    let m0b = f32x4_gt(a0hi, zero);
+                    a0hi = v128_bitselect(a0hi, f32x4_mul(s, a0hi), m0b);
+                    let m1a = f32x4_gt(a1lo, zero);
+                    a1lo = v128_bitselect(a1lo, f32x4_mul(s, a1lo), m1a);
+                    let m1b = f32x4_gt(a1hi, zero);
+                    a1hi = v128_bitselect(a1hi, f32x4_mul(s, a1hi), m1b);
+                    let m2a = f32x4_gt(a2lo, zero);
+                    a2lo = v128_bitselect(a2lo, f32x4_mul(s, a2lo), m2a);
+                    let m2b = f32x4_gt(a2hi, zero);
+                    a2hi = v128_bitselect(a2hi, f32x4_mul(s, a2hi), m2b);
+                    let m3a = f32x4_gt(a3lo, zero);
+                    a3lo = v128_bitselect(a3lo, f32x4_mul(s, a3lo), m3a);
+                    let m3b = f32x4_gt(a3hi, zero);
+                    a3hi = v128_bitselect(a3hi, f32x4_mul(s, a3hi), m3b);
+                    let m4a = f32x4_gt(a4lo, zero);
+                    a4lo = v128_bitselect(a4lo, f32x4_mul(s, a4lo), m4a);
+                    let m4b = f32x4_gt(a4hi, zero);
+                    a4hi = v128_bitselect(a4hi, f32x4_mul(s, a4hi), m4b);
+                    let m5a = f32x4_gt(a5lo, zero);
+                    a5lo = v128_bitselect(a5lo, f32x4_mul(s, a5lo), m5a);
+                    let m5b = f32x4_gt(a5hi, zero);
+                    a5hi = v128_bitselect(a5hi, f32x4_mul(s, a5hi), m5b);
+                    let m6a = f32x4_gt(a6lo, zero);
+                    a6lo = v128_bitselect(a6lo, f32x4_mul(s, a6lo), m6a);
+                    let m6b = f32x4_gt(a6hi, zero);
+                    a6hi = v128_bitselect(a6hi, f32x4_mul(s, a6hi), m6b);
+                    let m7a = f32x4_gt(a7lo, zero);
+                    a7lo = v128_bitselect(a7lo, f32x4_mul(s, a7lo), m7a);
+                    let m7b = f32x4_gt(a7hi, zero);
+                    a7hi = v128_bitselect(a7hi, f32x4_mul(s, a7hi), m7b);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_min(r0, a0lo);
+                    a0hi = f32x4_min(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_min(r1, a1lo);
+                    a1hi = f32x4_min(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_min(r2, a2lo);
+                    a2hi = f32x4_min(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_min(r3, a3lo);
+                    a3hi = f32x4_min(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_min(r4, a4lo);
+                    a4hi = f32x4_min(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_min(r5, a5lo);
+                    a5hi = f32x4_min(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_min(r6, a6lo);
+                    a6hi = f32x4_min(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_min(r7, a7lo);
+                    a7hi = f32x4_min(r7, a7hi);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_max(r0, a0lo);
+                    a0hi = f32x4_max(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_max(r1, a1lo);
+                    a1hi = f32x4_max(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_max(r2, a2lo);
+                    a2hi = f32x4_max(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_max(r3, a3lo);
+                    a3hi = f32x4_max(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_max(r4, a4lo);
+                    a4hi = f32x4_max(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_max(r5, a5lo);
+                    a5hi = f32x4_max(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_max(r6, a6lo);
+                    a6hi = f32x4_max(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_max(r7, a7lo);
+                    a7hi = f32x4_max(r7, a7hi);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_add(r0, a0lo);
+                    a0hi = f32x4_add(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_add(r1, a1lo);
+                    a1hi = f32x4_add(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_add(r2, a2lo);
+                    a2hi = f32x4_add(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_add(r3, a3lo);
+                    a3hi = f32x4_add(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_add(r4, a4lo);
+                    a4hi = f32x4_add(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_add(r5, a5lo);
+                    a5hi = f32x4_add(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_add(r6, a6lo);
+                    a6hi = f32x4_add(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_add(r7, a7lo);
+                    a7hi = f32x4_add(r7, a7hi);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_mul(r0, a0lo);
+                    a0hi = f32x4_mul(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_mul(r1, a1lo);
+                    a1hi = f32x4_mul(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_mul(r2, a2lo);
+                    a2hi = f32x4_mul(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_mul(r3, a3lo);
+                    a3hi = f32x4_mul(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_mul(r4, a4lo);
+                    a4hi = f32x4_mul(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_mul(r5, a5lo);
+                    a5hi = f32x4_mul(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_mul(r6, a6lo);
+                    a6hi = f32x4_mul(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_mul(r7, a7lo);
+                    a7hi = f32x4_mul(r7, a7hi);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_sub(r0, a0lo);
+                    a0hi = f32x4_sub(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_sub(r1, a1lo);
+                    a1hi = f32x4_sub(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_sub(r2, a2lo);
+                    a2hi = f32x4_sub(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_sub(r3, a3lo);
+                    a3hi = f32x4_sub(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_sub(r4, a4lo);
+                    a4hi = f32x4_sub(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_sub(r5, a5lo);
+                    a5hi = f32x4_sub(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_sub(r6, a6lo);
+                    a6hi = f32x4_sub(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_sub(r7, a7lo);
+                    a7hi = f32x4_sub(r7, a7hi);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_sub(a0lo, r0);
+                    a0hi = f32x4_sub(a0hi, r0);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_sub(a1lo, r1);
+                    a1hi = f32x4_sub(a1hi, r1);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_sub(a2lo, r2);
+                    a2hi = f32x4_sub(a2hi, r2);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_sub(a3lo, r3);
+                    a3hi = f32x4_sub(a3hi, r3);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_sub(a4lo, r4);
+                    a4hi = f32x4_sub(a4hi, r4);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_sub(a5lo, r5);
+                    a5hi = f32x4_sub(a5hi, r5);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_sub(a6lo, r6);
+                    a6hi = f32x4_sub(a6hi, r6);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_sub(a7lo, r7);
+                    a7hi = f32x4_sub(a7hi, r7);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_min(clo, a0lo);
+                    a0hi = f32x4_min(chi, a0hi);
+                    a1lo = f32x4_min(clo, a1lo);
+                    a1hi = f32x4_min(chi, a1hi);
+                    a2lo = f32x4_min(clo, a2lo);
+                    a2hi = f32x4_min(chi, a2hi);
+                    a3lo = f32x4_min(clo, a3lo);
+                    a3hi = f32x4_min(chi, a3hi);
+                    a4lo = f32x4_min(clo, a4lo);
+                    a4hi = f32x4_min(chi, a4hi);
+                    a5lo = f32x4_min(clo, a5lo);
+                    a5hi = f32x4_min(chi, a5hi);
+                    a6lo = f32x4_min(clo, a6lo);
+                    a6hi = f32x4_min(chi, a6hi);
+                    a7lo = f32x4_min(clo, a7lo);
+                    a7hi = f32x4_min(chi, a7hi);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_max(clo, a0lo);
+                    a0hi = f32x4_max(chi, a0hi);
+                    a1lo = f32x4_max(clo, a1lo);
+                    a1hi = f32x4_max(chi, a1hi);
+                    a2lo = f32x4_max(clo, a2lo);
+                    a2hi = f32x4_max(chi, a2hi);
+                    a3lo = f32x4_max(clo, a3lo);
+                    a3hi = f32x4_max(chi, a3hi);
+                    a4lo = f32x4_max(clo, a4lo);
+                    a4hi = f32x4_max(chi, a4hi);
+                    a5lo = f32x4_max(clo, a5lo);
+                    a5hi = f32x4_max(chi, a5hi);
+                    a6lo = f32x4_max(clo, a6lo);
+                    a6hi = f32x4_max(chi, a6hi);
+                    a7lo = f32x4_max(clo, a7lo);
+                    a7hi = f32x4_max(chi, a7hi);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_add(clo, a0lo);
+                    a0hi = f32x4_add(chi, a0hi);
+                    a1lo = f32x4_add(clo, a1lo);
+                    a1hi = f32x4_add(chi, a1hi);
+                    a2lo = f32x4_add(clo, a2lo);
+                    a2hi = f32x4_add(chi, a2hi);
+                    a3lo = f32x4_add(clo, a3lo);
+                    a3hi = f32x4_add(chi, a3hi);
+                    a4lo = f32x4_add(clo, a4lo);
+                    a4hi = f32x4_add(chi, a4hi);
+                    a5lo = f32x4_add(clo, a5lo);
+                    a5hi = f32x4_add(chi, a5hi);
+                    a6lo = f32x4_add(clo, a6lo);
+                    a6hi = f32x4_add(chi, a6hi);
+                    a7lo = f32x4_add(clo, a7lo);
+                    a7hi = f32x4_add(chi, a7hi);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_mul(clo, a0lo);
+                    a0hi = f32x4_mul(chi, a0hi);
+                    a1lo = f32x4_mul(clo, a1lo);
+                    a1hi = f32x4_mul(chi, a1hi);
+                    a2lo = f32x4_mul(clo, a2lo);
+                    a2hi = f32x4_mul(chi, a2hi);
+                    a3lo = f32x4_mul(clo, a3lo);
+                    a3hi = f32x4_mul(chi, a3hi);
+                    a4lo = f32x4_mul(clo, a4lo);
+                    a4hi = f32x4_mul(chi, a4hi);
+                    a5lo = f32x4_mul(clo, a5lo);
+                    a5hi = f32x4_mul(chi, a5hi);
+                    a6lo = f32x4_mul(clo, a6lo);
+                    a6hi = f32x4_mul(chi, a6hi);
+                    a7lo = f32x4_mul(clo, a7lo);
+                    a7hi = f32x4_mul(chi, a7hi);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_sub(clo, a0lo);
+                    a0hi = f32x4_sub(chi, a0hi);
+                    a1lo = f32x4_sub(clo, a1lo);
+                    a1hi = f32x4_sub(chi, a1hi);
+                    a2lo = f32x4_sub(clo, a2lo);
+                    a2hi = f32x4_sub(chi, a2hi);
+                    a3lo = f32x4_sub(clo, a3lo);
+                    a3hi = f32x4_sub(chi, a3hi);
+                    a4lo = f32x4_sub(clo, a4lo);
+                    a4hi = f32x4_sub(chi, a4hi);
+                    a5lo = f32x4_sub(clo, a5lo);
+                    a5hi = f32x4_sub(chi, a5hi);
+                    a6lo = f32x4_sub(clo, a6lo);
+                    a6hi = f32x4_sub(chi, a6hi);
+                    a7lo = f32x4_sub(clo, a7lo);
+                    a7hi = f32x4_sub(chi, a7hi);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_sub(a0lo, clo);
+                    a0hi = f32x4_sub(a0hi, chi);
+                    a1lo = f32x4_sub(a1lo, clo);
+                    a1hi = f32x4_sub(a1hi, chi);
+                    a2lo = f32x4_sub(a2lo, clo);
+                    a2hi = f32x4_sub(a2hi, chi);
+                    a3lo = f32x4_sub(a3lo, clo);
+                    a3hi = f32x4_sub(a3hi, chi);
+                    a4lo = f32x4_sub(a4lo, clo);
+                    a4hi = f32x4_sub(a4hi, chi);
+                    a5lo = f32x4_sub(a5lo, clo);
+                    a5hi = f32x4_sub(a5hi, chi);
+                    a6lo = f32x4_sub(a6lo, clo);
+                    a6hi = f32x4_sub(a6hi, chi);
+                    a7lo = f32x4_sub(a7lo, clo);
+                    a7hi = f32x4_sub(a7hi, chi);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 8 rows × 8 cols, each row laid out per col_byte_stride
+                    let mut ptr: *const u8 = tile.ptr;
+                    for ab_pair in [
+                        (&mut a0lo, &mut a0hi),
+                        (&mut a1lo, &mut a1hi),
+                        (&mut a2lo, &mut a2hi),
+                        (&mut a3lo, &mut a3hi),
+                        (&mut a4lo, &mut a4hi),
+                        (&mut a5lo, &mut a5hi),
+                        (&mut a6lo, &mut a6hi),
+                        (&mut a7lo, &mut a7hi),
+                    ]
+                    .iter_mut()
+                    {
+                        let m0 = *(ptr as *const f32);
+                        let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                        let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                        let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                        let m4 = *(ptr.offset(tile.col_byte_stride * 4) as *const f32);
+                        let m5 = *(ptr.offset(tile.col_byte_stride * 5) as *const f32);
+                        let m6 = *(ptr.offset(tile.col_byte_stride * 6) as *const f32);
+                        let m7 = *(ptr.offset(tile.col_byte_stride * 7) as *const f32);
+                        let (lo, hi) = ab_pair;
+                        **lo = f32x4_add(**lo, f32x4(m0, m1, m2, m3));
+                        **hi = f32x4_add(**hi, f32x4(m4, m5, m6, m7));
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    let r0 = f32x4_splat(*rows.add(0));
+                    a0lo = madd_f32x4!(a0lo, r0, clo);
+                    a0hi = madd_f32x4!(a0hi, r0, chi);
+                    let r1 = f32x4_splat(*rows.add(1));
+                    a1lo = madd_f32x4!(a1lo, r1, clo);
+                    a1hi = madd_f32x4!(a1hi, r1, chi);
+                    let r2 = f32x4_splat(*rows.add(2));
+                    a2lo = madd_f32x4!(a2lo, r2, clo);
+                    a2hi = madd_f32x4!(a2hi, r2, chi);
+                    let r3 = f32x4_splat(*rows.add(3));
+                    a3lo = madd_f32x4!(a3lo, r3, clo);
+                    a3hi = madd_f32x4!(a3hi, r3, chi);
+                    let r4 = f32x4_splat(*rows.add(4));
+                    a4lo = madd_f32x4!(a4lo, r4, clo);
+                    a4hi = madd_f32x4!(a4hi, r4, chi);
+                    let r5 = f32x4_splat(*rows.add(5));
+                    a5lo = madd_f32x4!(a5lo, r5, clo);
+                    a5hi = madd_f32x4!(a5hi, r5, chi);
+                    let r6 = f32x4_splat(*rows.add(6));
+                    a6lo = madd_f32x4!(a6lo, r6, clo);
+                    a6hi = madd_f32x4!(a6hi, r6, chi);
+                    let r7 = f32x4_splat(*rows.add(7));
+                    a7lo = madd_f32x4!(a7lo, r7, clo);
+                    a7hi = madd_f32x4!(a7hi, r7, chi);
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 8 rows × 8 cols stores
+                    let mut ptr: *mut u8 = tile.ptr;
+                    for (lo, hi) in [
+                        (a0lo, a0hi),
+                        (a1lo, a1hi),
+                        (a2lo, a2hi),
+                        (a3lo, a3hi),
+                        (a4lo, a4hi),
+                        (a5lo, a5hi),
+                        (a6lo, a6hi),
+                        (a7lo, a7hi),
+                    ]
+                    .iter()
+                    {
+                        *(ptr as *mut f32) = f32x4_extract_lane::<0>(*lo);
+                        *(ptr.offset(tile.col_byte_stride) as *mut f32) =
+                            f32x4_extract_lane::<1>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                            f32x4_extract_lane::<2>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                            f32x4_extract_lane::<3>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 4) as *mut f32) =
+                            f32x4_extract_lane::<0>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 5) as *mut f32) =
+                            f32x4_extract_lane::<1>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 6) as *mut f32) =
+                            f32x4_extract_lane::<2>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 7) as *mut f32) =
+                            f32x4_extract_lane::<3>(*hi);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=8] = each k iter loads 8 row values
+                    // B: packed [k][NR=8] = each k iter loads 8 col values as 2 v128
+                    let a = pa as *const f32;
+                    let b = pb as *const v128;
+                    for i in 0..k {
+                        let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8);
+                        let blo = v128_load(b.offset((2 * i) as isize));
+                        let bhi = v128_load(b.offset((2 * i + 1) as isize));
+                        let s = f32x4_splat(arow[0]);
+                        a0lo = madd_f32x4!(a0lo, s, blo);
+                        a0hi = madd_f32x4!(a0hi, s, bhi);
+                        let s = f32x4_splat(arow[1]);
+                        a1lo = madd_f32x4!(a1lo, s, blo);
+                        a1hi = madd_f32x4!(a1hi, s, bhi);
+                        let s = f32x4_splat(arow[2]);
+                        a2lo = madd_f32x4!(a2lo, s, blo);
+                        a2hi = madd_f32x4!(a2hi, s, bhi);
+                        let s = f32x4_splat(arow[3]);
+                        a3lo = madd_f32x4!(a3lo, s, blo);
+                        a3hi = madd_f32x4!(a3hi, s, bhi);
+                        let s = f32x4_splat(arow[4]);
+                        a4lo = madd_f32x4!(a4lo, s, blo);
+                        a4hi = madd_f32x4!(a4hi, s, bhi);
+                        let s = f32x4_splat(arow[5]);
+                        a5lo = madd_f32x4!(a5lo, s, blo);
+                        a5hi = madd_f32x4!(a5hi, s, bhi);
+                        let s = f32x4_splat(arow[6]);
+                        a6lo = madd_f32x4!(a6lo, s, blo);
+                        a6hi = madd_f32x4!(a6hi, s, bhi);
+                        let s = f32x4_splat(arow[7]);
+                        a7lo = madd_f32x4!(a7lo, s, blo);
+                        a7hi = madd_f32x4!(a7hi, s, bhi);
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+// ManuallyOptimized so kernel_selection::strategize honours the mmm_f32
+// callback that returns it for N>1 GEMM (see the `plug` comment) — otherwise
+// strategize drops it and routes every GEMM onto the 32x1 GEMV kernel.
+MMMRustKernel!(kernel_f32_8x8 => wasm_f32_8x8<f32>(8,8)@(8,8) quality(ImplementationQuality::ManuallyOptimized));
+
+// Wasm SIMD int8 -> i32 matmul kernel (4x4). WASM's only integer dot
+// (i32x4.relaxed_dot_i8x16_i7x16) is non-deterministic for full i8 (its 2nd
+// operand is i7), so for a bit-exact kernel the AddMatMul K-loop uses widening
+// i8->i32 + i32x4 mul/add (an extmul/SMLAL-style outer product). The quant
+// epilogue + fuse ops reuse the bit-exact scalar path (q_scale/q_shr/q_shl),
+// which is O(MR*NR) and negligible vs the O(MR*NR*K) inner loop. Bit-identical
+// to generic_i32_4x4; selected for i8 matmul via its ManuallyOptimized quality
+// (WASM had no int8 matmul kernel — int8 fell back to the generic scalar one).
+#[inline(never)]
+unsafe fn kernel_i32_4x4(mut pnl: *const FusedKerSpec<i32>) -> isize {
+    use crate::ScaleShiftAndRound;
+    use std::arch::wasm32::*;
+    unsafe {
+        let mut ab = [[0i32; 4]; 4];
+        loop {
+            if pnl.is_null() {
+                break;
+            }
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => ab = [[0i32; 4]; 4],
+                FusedKerSpec::LoadTile(col_major, _row_major) => {
+                    for row in 0..4 {
+                        for col in 0..4 {
+                            ab[row][col] = *col_major.add(col * 4 + row);
+                        }
+                    }
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] += a;
+                        }
+                    }
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] *= a;
+                        }
+                    }
+                }
+                FusedKerSpec::ScalarMin(m) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].min(m);
+                        }
+                    }
+                }
+                FusedKerSpec::ScalarMax(m) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].max(m);
+                        }
+                    }
+                }
+                FusedKerSpec::ScalarSub(m) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = m - ab[i][j];
+                        }
+                    }
+                }
+                FusedKerSpec::ScalarSubF(m) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] -= m;
+                        }
+                    }
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = if ab[i][j] > 0 { ab[i][j] } else { a * ab[i][j] };
+                        }
+                    }
+                }
+                FusedKerSpec::PerRowMin(m) => {
+                    for i in 0..4 {
+                        let v = *m.add(i);
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].min(v);
+                        }
+                    }
+                }
+                FusedKerSpec::PerRowMax(m) => {
+                    for i in 0..4 {
+                        let v = *m.add(i);
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].max(v);
+                        }
+                    }
+                }
+                FusedKerSpec::PerRowAdd(m) => {
+                    for i in 0..4 {
+                        let v = *m.add(i);
+                        for j in 0..4 {
+                            ab[i][j] += v;
+                        }
+                    }
+                }
+                FusedKerSpec::PerRowMul(m) => {
+                    for i in 0..4 {
+                        let v = *m.add(i);
+                        for j in 0..4 {
+                            ab[i][j] *= v;
+                        }
+                    }
+                }
+                FusedKerSpec::PerRowSub(m) => {
+                    for i in 0..4 {
+                        let v = *m.add(i);
+                        for j in 0..4 {
+                            ab[i][j] = v - ab[i][j];
+                        }
+                    }
+                }
+                FusedKerSpec::PerRowSubF(m) => {
+                    for i in 0..4 {
+                        let v = *m.add(i);
+                        for j in 0..4 {
+                            ab[i][j] -= v;
+                        }
+                    }
+                }
+                FusedKerSpec::PerColMin(m) => {
+                    let c = std::slice::from_raw_parts(m, 4);
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].min(c[j]);
+                        }
+                    }
+                }
+                FusedKerSpec::PerColMax(m) => {
+                    let c = std::slice::from_raw_parts(m, 4);
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].max(c[j]);
+                        }
+                    }
+                }
+                FusedKerSpec::PerColAdd(m) => {
+                    let c = std::slice::from_raw_parts(m, 4);
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] += c[j];
+                        }
+                    }
+                }
+                FusedKerSpec::PerColMul(m) => {
+                    let c = std::slice::from_raw_parts(m, 4);
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] *= c[j];
+                        }
+                    }
+                }
+                FusedKerSpec::PerColSub(m) => {
+                    let c = std::slice::from_raw_parts(m, 4);
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = c[j] - ab[i][j];
+                        }
+                    }
+                }
+                FusedKerSpec::PerColSubF(m) => {
+                    let c = std::slice::from_raw_parts(m, 4);
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] -= c[j];
+                        }
+                    }
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    for i in 0..4 {
+                        let r = *rows.add(i);
+                        for j in 0..4 {
+                            ab[i][j] += r * *cols.add(j);
+                        }
+                    }
+                }
+                FusedKerSpec::AddUnicast(other) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            let p = other.ptr.offset(
+                                other.row_byte_stride * i as isize
+                                    + other.col_byte_stride * j as isize,
+                            );
+                            let v = match other.item_size {
+                                1 => *(p as *const i8) as i32,
+                                4 => *(p as *const i32),
+                                _ => return 1,
+                            };
+                            ab[i][j] += v;
+                        }
+                    }
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].q_shl(shift);
+                        }
+                    }
+                }
+                FusedKerSpec::RoundingShiftRight(shift, rp) => {
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].q_shr(shift, rp);
+                        }
+                    }
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let s = Scaler::from_fuse_params(shift, rp, mult);
+                    for i in 0..4 {
+                        for j in 0..4 {
+                            ab[i][j] = ab[i][j].q_scale(s);
+                        }
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing } => {
+                    if packing == 1 {
+                        let a = pa as *const i8;
+                        let b = pb as *const i8;
+                        let mut acc = [
+                            v128_load(ab[0].as_ptr() as *const v128),
+                            v128_load(ab[1].as_ptr() as *const v128),
+                            v128_load(ab[2].as_ptr() as *const v128),
+                            v128_load(ab[3].as_ptr() as *const v128),
+                        ];
+                        // PackedI8K4 (K=4-inner): per 4-K block, one B v128 load is
+                        // shared across the 4 rows; each row broadcasts its 4 K bytes
+                        // (a[kb*16 + m*4 ..]) and issues one relaxed_dot (16 MACs).
+                        // b[kb*16 + n*4 + kr]. Tail K (k%4) is zero-padded by the packer.
+                        #[cfg(target_feature = "relaxed-simd")]
+                        for kb in 0..k.div_ceil(4) {
+                            let b_all = v128_load(b.add(kb * 16) as *const v128);
+                            for (m, acc_m) in acc.iter_mut().enumerate() {
+                                let a4 = (a.add(kb * 16 + m * 4) as *const i32).read_unaligned();
+                                *acc_m = i32x4_relaxed_dot_i8x16_i7x16_add(
+                                    i32x4_splat(a4),
+                                    b_all,
+                                    *acc_m,
+                                );
+                            }
+                        }
+                        // Deterministic fallback (no relaxed-simd): standard PackedFormat
+                        // K-major (4 i8 per k), widening outer-product accumulate.
+                        #[cfg(not(target_feature = "relaxed-simd"))]
+                        for ik in 0..k {
+                            let bw = v128_load32_zero(b.add(4 * ik) as *const u32);
+                            let bw = i16x8_extend_low_i8x16(bw);
+                            let bw = i32x4_extend_low_i16x8(bw);
+                            let ar = a.add(4 * ik);
+                            acc[0] =
+                                i32x4_add(acc[0], i32x4_mul(i32x4_splat(*ar.add(0) as i32), bw));
+                            acc[1] =
+                                i32x4_add(acc[1], i32x4_mul(i32x4_splat(*ar.add(1) as i32), bw));
+                            acc[2] =
+                                i32x4_add(acc[2], i32x4_mul(i32x4_splat(*ar.add(2) as i32), bw));
+                            acc[3] =
+                                i32x4_add(acc[3], i32x4_mul(i32x4_splat(*ar.add(3) as i32), bw));
+                        }
+                        v128_store(ab[0].as_mut_ptr() as *mut v128, acc[0]);
+                        v128_store(ab[1].as_mut_ptr() as *mut v128, acc[1]);
+                        v128_store(ab[2].as_mut_ptr() as *mut v128, acc[2]);
+                        v128_store(ab[3].as_mut_ptr() as *mut v128, acc[3]);
+                    } else if packing == 0 {
+                        // i32 x i32, K-major (scalar; rare path).
+                        let a = pa as *const i32;
+                        let b = pb as *const i32;
+                        for ik in 0..k {
+                            for i in 0..4 {
+                                let av = *a.add(4 * ik + i);
+                                for j in 0..4 {
+                                    ab[i][j] += av * *b.add(4 * ik + j);
+                                }
+                            }
+                        }
+                    } else {
+                        return 1;
+                    }
+                }
+                FusedKerSpec::Store(tile) => match tile.item_size {
+                    1 => {
+                        for i in 0..4 {
+                            for j in 0..4 {
+                                let loc = tile.ptr.offset(
+                                    tile.row_byte_stride * i as isize
+                                        + tile.col_byte_stride * j as isize,
+                                ) as *mut u8;
+                                *loc = ab[i][j] as u8;
+                            }
+                        }
+                    }
+                    4 => {
+                        for i in 0..4 {
+                            for j in 0..4 {
+                                let loc = tile.ptr.offset(
+                                    tile.row_byte_stride * i as isize
+                                        + tile.col_byte_stride * j as isize,
+                                ) as *mut i32;
+                                *loc = ab[i][j];
+                            }
+                        }
+                    }
+                    _ => return 1,
+                },
+            };
+            pnl = pnl.add(1);
+        }
+    }
+    0
+}
+
+// i8i8 packing for wasm_i32_4x4. Under +relaxed-simd the kernel uses the
+// `i32x4_relaxed_dot_i8x16_i7x16_add` SDOT-analog, which wants 4 contiguous K per
+// mn-lane → PackedI8K4 (K=4-inner). Without relaxed-simd it uses the widening
+// outer-product, which wants K-major → standard PackedFormat. Both are picked at
+// compile time so the kernel's AddMatMul and the packer always agree.
+#[cfg(target_feature = "relaxed-simd")]
+fn wasm_i8_packing() -> impl crate::mmm::MMMInputFormat {
+    crate::pack::PackedI8K4::new(4)
+}
+#[cfg(not(target_feature = "relaxed-simd"))]
+fn wasm_i8_packing() -> impl crate::mmm::MMMInputFormat {
+    use crate::pack::Packing;
+    i8::packing(4)
+}
+
+MMMRustKernel!(kernel_i32_4x4 => wasm_i32_4x4<i32>(4,4)
+    packing[1] = i8i8 => |k| k.with_packing(wasm_i8_packing(), wasm_i8_packing());
+    quality(ImplementationQuality::ManuallyOptimized)
+    store(i8)
+);
+
+#[cfg(test)]
+mod dispatch_trace {
+    fn trace_one(label: &str, m: Option<usize>, k: Option<usize>, n: Option<usize>) {
+        let mut ops = crate::generic();
+        super::plug(&mut ops);
+        let mmm = ops.mmm(tract_data::prelude::DatumType::F32, m, k, n).unwrap();
+        eprintln!(
+            "DFN3 {} (m={:?} k={:?} n={:?}) => {}  [mr={}, nr={}]",
+            label,
+            m,
+            k,
+            n,
+            mmm.name(),
+            mmm.mr(),
+            mmm.nr()
+        );
+    }
+
+    #[test]
+    fn dfn3_shapes() {
+        // DFN3 N=1 GEMV ops (the dominant matrix-vector cases)
+        trace_one("lsnr_fc-style m=1 k=512", Some(1), Some(512), Some(1));
+        trace_one("small m=16 k=96", Some(16), Some(96), Some(1));
+        trace_one("medium m=32 k=256", Some(32), Some(256), Some(1));
+        trace_one("GRU m=256 k=256", Some(256), Some(256), Some(1));
+        trace_one("post-rnn m=256 k=512", Some(256), Some(512), Some(1));
+        trace_one("frame-encoder m=64 k=96", Some(64), Some(96), Some(1));
+        // N>1 sanity: should hit 8x8
+        trace_one("MM m=64 k=64 n=8", Some(64), Some(64), Some(8));
+    }
+
+    /// Exercise every M-band edge of mmv_f32 to lock in the dispatch.
+    /// Lower edge of each band = perfect-tile size; upper edge = last
+    /// M before crossover to the next kernel.
+    #[test]
+    fn band_edges() {
+        // 4x1 band: M ∈ 0..=4
+        trace_one("band 4x1 lo m=1", Some(1), Some(64), Some(1));
+        trace_one("band 4x1 hi m=4", Some(4), Some(64), Some(1));
+        // 8x1 band: M ∈ 5..=8
+        trace_one("band 8x1 lo m=5", Some(5), Some(64), Some(1));
+        trace_one("band 8x1 hi m=8", Some(8), Some(64), Some(1));
+        // 16x1 band: M ∈ 9..=16
+        trace_one("band 16x1 lo m=9", Some(9), Some(64), Some(1));
+        trace_one("band 16x1 hi m=16", Some(16), Some(64), Some(1));
+        // 32x1 band: M ≥ 17
+        trace_one("band 32x1 lo m=17", Some(17), Some(64), Some(1));
+        trace_one("band 32x1 hi m=512", Some(512), Some(64), Some(1));
+    }
+
+    /// Regression guard for the GEMM/GEMV dispatch.
+    ///
+    /// `kernel_selection::strategize` honours the `mmm_f32` / `mmv_f32`
+    /// callback only when the returned kernel is `ManuallyOptimized`;
+    /// otherwise it falls through to `list_impls`, whose `retain()` drops
+    /// every `TargetOptimized` kernel, and for N>1 then picks `max(nr*mr)`
+    /// over the surviving `ManuallyOptimized` GEMV kernels — i.e.
+    /// `wasm_f32_32x1`, a matrix×vector kernel, for every GEMM. So every
+    /// kernel reachable through the dispatch callbacks must be
+    /// `ManuallyOptimized`.
+    #[test]
+    fn dispatch_kernels_are_manually_optimized() {
+        use crate::mmm::ImplementationQuality::ManuallyOptimized;
+        let mut ops = crate::generic();
+        super::plug(&mut ops);
+        for (label, m, k, n) in [
+            ("GEMM m=64 k=64 n=8", 64, 64, 8),
+            ("GEMM m=256 k=256 n=256", 256, 256, 256),
+            ("GEMM m=1024 k=576 n=10", 1024, 576, 10),
+            ("GEMV m=1 k=512 n=1", 1, 512, 1),
+            ("GEMV m=256 k=256 n=1", 256, 256, 1),
+        ] {
+            let mmm =
+                ops.mmm(tract_data::prelude::DatumType::F32, Some(m), Some(k), Some(n)).unwrap();
+            assert_eq!(
+                mmm.quality(),
+                ManuallyOptimized,
+                "{label}: dispatch returned {} tagged {:?} — strategize would \
+                 discard it and reroute onto a GEMV kernel",
+                mmm.name(),
+                mmm.quality(),
+            );
+        }
+    }
+}
+
+#[cfg(test)]
+mod microbench_32x1 {
+    //! Quick microbench: time per-call cost for the kernel kit's GEMV path
+    //! on DFN3-shaped inputs. Compares 16x1 vs 32x1 head-to-head by
+    //! dispatching the named kernel directly.
+    //!
+    //! Run with:
+    //!   RUSTFLAGS='-C target-feature=+simd128' \
+    //!     CARGO_TARGET_WASM32_WASIP1_RUNNER='wasmtime --env RUST_TEST_NOCAPTURE=1 --' \
+    //!     cargo test --release --target=wasm32-wasip1 -p tract-linalg \
+    //!     wasm::microbench_32x1::microbench -- --nocapture --ignored
+
+    use crate::mmm::{AsInputValue, FusedSpec};
+    use std::time::Instant;
+    use tract_data::internal::*;
+    use tract_data::prelude::*;
+
+    fn run_one(kernel: &dyn crate::mmm::MatMatMul, m: usize, k: usize, iters: usize) -> f64 {
+        // Pack A (m,k) and B (k,1)
+        let packing = &kernel.packings()[0];
+        let a = Tensor::zero::<f32>(&[m, k]).unwrap();
+        let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+        let b = Tensor::zero::<f32>(&[k, 1]).unwrap();
+        let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+        let mut c = Tensor::zero::<f32>(&[m, 1]).unwrap();
+
+        // Warmup
+        for _ in 0..50 {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+
+        // Timed
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+        let elapsed = t0.elapsed();
+        elapsed.as_secs_f64() / iters as f64 * 1e9 // ns/call
+    }
+
+    fn pick(name: &str) -> Box<dyn crate::mmm::MatMatMul> {
+        let mut ops = crate::generic();
+        super::plug(&mut ops);
+        for impl_ in ops.mmm_impls() {
+            if impl_.name() == name {
+                return impl_.clone();
+            }
+        }
+        panic!("kernel {name} not registered")
+    }
+
+    fn bench_shape(label: &str, m: usize, k: usize, iters: usize) {
+        let k16 = pick("wasm_f32_16x1");
+        let k32 = pick("wasm_f32_32x1");
+        let ns16 = run_one(&*k16, m, k, iters);
+        let ns32 = run_one(&*k32, m, k, iters);
+        let calls16 = m.div_ceil(16);
+        let calls32 = m.div_ceil(32);
+        let delta = (ns32 - ns16) / ns16 * 100.0;
+        eprintln!(
+            "{label} (m={m}, k={k}, iters={iters}): 16x1={ns16:.1} ns/call ({calls16} kernel calls); 32x1={ns32:.1} ns/call ({calls32} kernel calls); Δ={delta:+.2}% ; per-frame call ns: 16x1={n16:.1} 32x1={n32:.1} pf-Δ={dpf:+.2}%",
+            n16 = ns16 * calls16 as f64,
+            n32 = ns32 * calls32 as f64,
+            dpf = (ns32 * calls32 as f64 - ns16 * calls16 as f64) / (ns16 * calls16 as f64) * 100.0,
+        );
+    }
+
+    #[test]
+    #[ignore]
+    fn microbench() {
+        eprintln!("=== DFN3 GEMV microbench: 16x1 vs 32x1 ===");
+        // DFN3 GRU gates (highest call count)
+        bench_shape("GRU m=256 k=256", 256, 256, 5_000);
+        // post-RNN
+        bench_shape("post-rnn m=256 k=512", 256, 512, 3_000);
+        // frame encoder
+        bench_shape("frame-encoder m=64 k=96", 64, 96, 20_000);
+        // perfect tile
+        bench_shape("perfect-tile m=32 k=256", 32, 256, 20_000);
+    }
+
+    /// Numerical-equivalence sanity check between 16x1 and 32x1 kernels on a
+    /// real-shape matmul with non-trivial inputs.
+    ///
+    /// Under `+simd128` (no relaxed-simd): both kernels emit
+    /// `f32x4_add(f32x4_mul(...))` via `madd_f32x4!`, so the K-loop order is
+    /// identical and outputs are bit-identical.
+    ///
+    /// Under `+simd128,+relaxed-simd`: 32x1 uses `f32x4.relaxed_madd` (fused
+    /// FMA) via `madd_f32x4!`, while 16x1 uses separate `mul+add` via
+    /// `madd_f32x4_nofma!` to avoid the destructive-accumulator recurrence
+    /// that throttles ≤4-accumulator kernels (see header comment on
+    /// `madd_f32x4_nofma`). Outputs drift by ≤1 ulp per K-step from the
+    /// rounding difference between fused and separate ops. We accept that
+    /// drift with a generous relative tolerance.
+    #[test]
+    fn numerical_consistency_16x1_vs_32x1() {
+        let m = 256usize;
+        let k = 256usize;
+        let mut a_data = vec![0f32; m * k];
+        for (i, x) in a_data.iter_mut().enumerate() {
+            *x = ((i % 13) as f32 - 6.0) * 0.1 + ((i / 17) % 11) as f32 * 0.07;
+        }
+        let mut b_data = vec![0f32; k];
+        for (i, x) in b_data.iter_mut().enumerate() {
+            *x = (i as f32).sin() * 0.5;
+        }
+        let a = Tensor::from_shape(&[m, k], &a_data).unwrap();
+        let b = Tensor::from_shape(&[k, 1], &b_data).unwrap();
+
+        let run = |name: &str| -> Vec<f32> {
+            let kernel = pick(name);
+            let packing = &kernel.packings()[0];
+            let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+            let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+            let mut c = Tensor::zero::<f32>(&[m, 1]).unwrap();
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+            c.try_as_plain().unwrap().as_slice::<f32>().unwrap().to_vec()
+        };
+
+        let c16 = run("wasm_f32_16x1");
+        let c32 = run("wasm_f32_32x1");
+
+        #[cfg(not(target_feature = "relaxed-simd"))]
+        {
+            for (i, (x16, x32)) in c16.iter().zip(c32.iter()).enumerate() {
+                assert!(
+                    x16.to_bits() == x32.to_bits(),
+                    "row {i}: 16x1={x16} (bits 0x{:x}) != 32x1={x32} (bits 0x{:x})",
+                    x16.to_bits(),
+                    x32.to_bits()
+                );
+            }
+            eprintln!("bit-identity OK over m={m} k={k} ({} rows)", m);
+        }
+
+        #[cfg(target_feature = "relaxed-simd")]
+        {
+            // K=256 accumulator drift on fp32 between FMA and separate mul+add
+            // can grow up to roughly K × 0.5 ulp ≈ 128 ulp in the accumulator.
+            // For small-magnitude outputs that translates to ~1e-4 relative.
+            // We use 1e-4 as the tolerance — tight enough to catch real bugs
+            // (typically 1e-2+ drift) but generous for legitimate FMA drift.
+            let mut max_abs = 0.0f32;
+            let mut max_rel = 0.0f32;
+            for (i, (x16, x32)) in c16.iter().zip(c32.iter()).enumerate() {
+                let abs = (x16 - x32).abs();
+                let scale = x16.abs().max(x32.abs()).max(1.0e-9);
+                let rel = abs / scale;
+                assert!(
+                    rel < 1.0e-4,
+                    "row {i}: relative drift {rel:e} too large; 16x1={x16} 32x1={x32}"
+                );
+                if abs > max_abs {
+                    max_abs = abs;
+                }
+                if rel > max_rel {
+                    max_rel = rel;
+                }
+            }
+            eprintln!(
+                "relaxed-simd consistency OK over m={m} k={k}: max abs={max_abs:.3e}, max rel={max_rel:.3e}"
+            );
+        }
+    }
+}
+
+#[cfg(test)]
+mod microbench_dispatch_gemv {
+    //! Microbench: 4x1 vs 8x1 vs 16x1 vs 32x1 GEMV kernels across the M
+    //! range. Drives the dispatch-fix decision — the M-band callback in
+    //! plug() routes small-M to smaller kernels, but only takes effect
+    //! once the kernels are tagged ManuallyOptimized (otherwise
+    //! kernel_selection::strategize bypasses the callback and always
+    //! picks max(mr) = 32x1).
+    //!
+    //! Run with:
+    //!   RUSTFLAGS='-C target-feature=+simd128' \
+    //!     CARGO_TARGET_WASM32_WASIP1_RUNNER='wasmtime --env RUST_TEST_NOCAPTURE=1 --' \
+    //!     cargo test --release --target=wasm32-wasip1 -p tract-linalg \
+    //!     wasm::microbench_dispatch_gemv::microbench -- --nocapture --ignored
+
+    use crate::mmm::{AsInputValue, FusedSpec};
+    use std::time::Instant;
+    use tract_data::internal::*;
+    use tract_data::prelude::*;
+
+    fn run_one(kernel: &dyn crate::mmm::MatMatMul, m: usize, k: usize, iters: usize) -> f64 {
+        let packing = &kernel.packings()[0];
+        let a = Tensor::zero::<f32>(&[m, k]).unwrap();
+        let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+        let b = Tensor::zero::<f32>(&[k, 1]).unwrap();
+        let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+        let mut c = Tensor::zero::<f32>(&[m, 1]).unwrap();
+
+        for _ in 0..50 {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            unsafe {
+                kernel
+                    .run(
+                        m,
+                        1,
+                        &[
+                            FusedSpec::AddMatMul {
+                                a: AsInputValue::Borrowed(&*pa),
+                                b: AsInputValue::Borrowed(&*pb),
+                                packing: 0,
+                            },
+                            FusedSpec::Store(kernel.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                        ],
+                    )
+                    .unwrap();
+            }
+        }
+        let elapsed = t0.elapsed();
+        elapsed.as_secs_f64() / iters as f64 * 1e9
+    }
+
+    fn pick(name: &str) -> Box<dyn crate::mmm::MatMatMul> {
+        let mut ops = crate::generic();
+        super::plug(&mut ops);
+        for impl_ in ops.mmm_impls() {
+            if impl_.name() == name {
+                return impl_.clone();
+            }
+        }
+        panic!("kernel {name} not registered")
+    }
+
+    fn bench_shape(label: &str, m: usize, k: usize, iters: usize) {
+        let k4 = pick("wasm_f32_4x1");
+        let k8 = pick("wasm_f32_8x1");
+        let k16 = pick("wasm_f32_16x1");
+        let k32 = pick("wasm_f32_32x1");
+        let n4 = run_one(&*k4, m, k, iters);
+        let n8 = run_one(&*k8, m, k, iters);
+        let n16 = run_one(&*k16, m, k, iters);
+        let n32 = run_one(&*k32, m, k, iters);
+        let entries = [("4x1", n4), ("8x1", n8), ("16x1", n16), ("32x1", n32)];
+        let winner = entries.iter().min_by(|a, b| a.1.partial_cmp(&b.1).unwrap()).unwrap();
+        let delta_vs_32 = (winner.1 - n32) / n32 * 100.0;
+        eprintln!(
+            "{label} (m={m} k={k}): 4x1={n4:.0} 8x1={n8:.0} 16x1={n16:.0} 32x1={n32:.0} ns; \
+             winner={} ({:.0} ns, Δ vs 32x1: {delta_vs_32:+.1}%)",
+            winner.0, winner.1
+        );
+    }
+
+    #[test]
+    #[ignore]
+    fn microbench() {
+        eprintln!("=== WASM GEMV dispatch microbench: 4x1 vs 8x1 vs 16x1 vs 32x1 ===");
+        // M ≤ 16 — small-M region; the M-band callback's choices win clearly.
+        bench_shape("M=1   k=512", 1, 512, 50_000);
+        bench_shape("M=8   k=64 ", 8, 64, 50_000);
+        bench_shape("M=8   k=512", 8, 512, 20_000);
+        bench_shape("M=12  k=256", 12, 256, 50_000);
+        bench_shape("M=16  k=96 ", 16, 96, 50_000);
+        bench_shape("M=16  k=256", 16, 256, 30_000);
+        // M ≥ 17 — 32x1 wins (16x1 needs 2 outer iters, 32x1 single iter
+        // with ILP absorbing the row padding).
+        bench_shape("M=24  k=256", 24, 256, 30_000);
+        bench_shape("M=32  k=256", 32, 256, 20_000);
+        bench_shape("M=64  k=96 ", 64, 96, 20_000);
+        bench_shape("M=100 k=256", 100, 256, 10_000);
+        bench_shape("M=256 k=256", 256, 256, 5_000);
+    }
+}
+
+// Relaxed-SIMD activation kernels (f32, FMA path).
+//
+// `f32x4_relaxed_madd(a, b, c)` computes `a * b + c`. On hosts with hardware
+// FMA (all ARM64, x86_64 with FMA3) it lowers to a single fused, single-
+// rounded instruction. On hosts without, it falls back to mul+add — hence
+// "relaxed". The result is therefore not bit-deterministic across all hosts,
+// but it is at least as accurate as the separate mul+add (FMA does fewer
+// roundings).
+//
+// For sigmoid/tanh polynomial evaluation, the 14 muladds in the Horner chain
+// fuse cleanly. Measured ~1.65x over the baseline-simd128 explicit kernel and
+// over LLVM auto-vec'd scalar on V8.
+//
+// Gated on `target_feature = "relaxed-simd"` because `f32x4_relaxed_madd`
+// requires the relaxed-simd proposal to be enabled at compile time.
+// ---------------------------------------------------------------------------
+
+#[cfg(target_feature = "relaxed-simd")]
+#[derive(Clone, Debug)]
+pub struct WasmSigmoid4Relaxed;
+
+#[cfg(target_feature = "relaxed-simd")]
+impl ElementWiseKer<f32> for WasmSigmoid4Relaxed {
+    fn name() -> &'static str {
+        "wasm_relaxed_simd"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(buf: &mut [f32], _: ()) {
+        use std::arch::wasm32::*;
+
+        debug_assert!(buf.len() % Self::nr() == 0);
+        debug_assert!(buf.as_ptr() as usize % Self::alignment_bytes() == 0);
+
+        // Coefficients match generic/sigmoid.rs::ssigmoid bit-for-bit.
+        // Output may differ by ≤1 ulp from scalar on FMA hosts (more accurate).
+        const LOW: f32 = -18.6;
+        const HIGH: f32 = -LOW;
+
+        const ALPHA_13: f32 = -4.433153405e-18;
+        const ALPHA_11: f32 = 1.169974371e-14;
+        const ALPHA_9: f32 = -1.875289645e-11;
+        const ALPHA_7: f32 = 4.257889523e-8;
+        const ALPHA_5: f32 = 0.00004811817576;
+        const ALPHA_3: f32 = 0.008163842030;
+        const ALPHA_1: f32 = 0.2499999971;
+
+        const BETA_6: f32 = 3.922935744e-6;
+        const BETA_4: f32 = 0.001524872358;
+        const BETA_2: f32 = 0.1159886749;
+        const BETA_0: f32 = 1.0;
+
+        unsafe {
+            let lo = f32x4_splat(LOW);
+            let hi = f32x4_splat(HIGH);
+
+            let a13 = f32x4_splat(ALPHA_13);
+            let a11 = f32x4_splat(ALPHA_11);
+            let a9 = f32x4_splat(ALPHA_9);
+            let a7 = f32x4_splat(ALPHA_7);
+            let a5 = f32x4_splat(ALPHA_5);
+            let a3 = f32x4_splat(ALPHA_3);
+            let a1 = f32x4_splat(ALPHA_1);
+
+            let b6 = f32x4_splat(BETA_6);
+            let b4 = f32x4_splat(BETA_4);
+            let b2 = f32x4_splat(BETA_2);
+            let b0 = f32x4_splat(BETA_0);
+
+            let half = f32x4_splat(0.5);
+
+            let mut p = buf.as_mut_ptr();
+            let end = p.add(buf.len());
+            while p < end {
+                let v = v128_load(p as *const v128);
+                let x = f32x4_min(hi, f32x4_max(lo, v));
+                let x2 = f32x4_mul(x, x);
+
+                // Horner numerator with FMA: pn = x2 * pn + a_n
+                let pn = a13;
+                let pn = f32x4_relaxed_madd(x2, pn, a11);
+                let pn = f32x4_relaxed_madd(x2, pn, a9);
+                let pn = f32x4_relaxed_madd(x2, pn, a7);
+                let pn = f32x4_relaxed_madd(x2, pn, a5);
+                let pn = f32x4_relaxed_madd(x2, pn, a3);
+                let pn = f32x4_relaxed_madd(x2, pn, a1);
+                let pn = f32x4_mul(pn, x);
+
+                // Horner denominator with FMA
+                let qn = b6;
+                let qn = f32x4_relaxed_madd(x2, qn, b4);
+                let qn = f32x4_relaxed_madd(x2, qn, b2);
+                let qn = f32x4_relaxed_madd(x2, qn, b0);
+
+                let r = f32x4_add(f32x4_div(pn, qn), half);
+                v128_store(p as *mut v128, r);
+                p = p.add(4);
+            }
+        }
+    }
+}
+
+#[cfg(target_feature = "relaxed-simd")]
+#[derive(Clone, Debug)]
+pub struct WasmTanh4Relaxed;
+
+#[cfg(target_feature = "relaxed-simd")]
+impl ElementWiseKer<f32> for WasmTanh4Relaxed {
+    fn name() -> &'static str {
+        "wasm_relaxed_simd"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(buf: &mut [f32], _: ()) {
+        use std::arch::wasm32::*;
+
+        debug_assert!(buf.len() % Self::nr() == 0);
+        debug_assert!(buf.as_ptr() as usize % Self::alignment_bytes() == 0);
+
+        const LOW: f32 = -8.9;
+        const HIGH: f32 = 8.9;
+
+        const ALPHA_13: f32 = -8.488492677e-14;
+        const ALPHA_11: f32 = 5.277853000e-11;
+        const ALPHA_9: f32 = -2.022500419e-8;
+        const ALPHA_7: f32 = 0.00001115424833;
+        const ALPHA_5: f32 = 0.003103950131;
+        const ALPHA_3: f32 = 0.1308400453;
+        const ALPHA_1: f32 = 0.9999999934;
+
+        const BETA_6: f32 = 0.0002546136580;
+        const BETA_4: f32 = 0.02449515379;
+        const BETA_2: f32 = 0.4641733162;
+        const BETA_0: f32 = 1.0;
+
+        unsafe {
+            let lo = f32x4_splat(LOW);
+            let hi = f32x4_splat(HIGH);
+
+            let a13 = f32x4_splat(ALPHA_13);
+            let a11 = f32x4_splat(ALPHA_11);
+            let a9 = f32x4_splat(ALPHA_9);
+            let a7 = f32x4_splat(ALPHA_7);
+            let a5 = f32x4_splat(ALPHA_5);
+            let a3 = f32x4_splat(ALPHA_3);
+            let a1 = f32x4_splat(ALPHA_1);
+
+            let b6 = f32x4_splat(BETA_6);
+            let b4 = f32x4_splat(BETA_4);
+            let b2 = f32x4_splat(BETA_2);
+            let b0 = f32x4_splat(BETA_0);
+
+            let mut p = buf.as_mut_ptr();
+            let end = p.add(buf.len());
+            while p < end {
+                let v = v128_load(p as *const v128);
+                let x = f32x4_min(hi, f32x4_max(lo, v));
+                let x2 = f32x4_mul(x, x);
+
+                let pn = a13;
+                let pn = f32x4_relaxed_madd(x2, pn, a11);
+                let pn = f32x4_relaxed_madd(x2, pn, a9);
+                let pn = f32x4_relaxed_madd(x2, pn, a7);
+                let pn = f32x4_relaxed_madd(x2, pn, a5);
+                let pn = f32x4_relaxed_madd(x2, pn, a3);
+                let pn = f32x4_relaxed_madd(x2, pn, a1);
+                let pn = f32x4_mul(pn, x);
+
+                let qn = b6;
+                let qn = f32x4_relaxed_madd(x2, qn, b4);
+                let qn = f32x4_relaxed_madd(x2, qn, b2);
+                let qn = f32x4_relaxed_madd(x2, qn, b0);
+
+                let r = f32x4_div(pn, qn);
+                v128_store(p as *mut v128, r);
+                p = p.add(4);
+            }
+        }
+    }
+}
+
+#[cfg(all(test, target_feature = "relaxed-simd"))]
+#[macro_use]
+mod test_wasm_sigmoid_relaxed {
+    sigmoid_frame_tests!(true, f32, crate::wasm::WasmSigmoid4Relaxed);
+}
+
+#[cfg(all(test, target_feature = "relaxed-simd"))]
+#[macro_use]
+mod test_wasm_tanh_relaxed {
+    tanh_frame_tests!(true, f32, crate::wasm::WasmTanh4Relaxed);
+}
+
+#[cfg(all(test, target_feature = "relaxed-simd"))]
+mod microbench_activations {
+    //! Microbench: WASM SIMD sigmoid/tanh vs the generic scalar fallback.
+    //! Sizes mirror typical RNN/transformer hidden dims (256, 512, 1024).
+    //!
+    //! Run with:
+    //!   RUSTFLAGS='-C target-feature=+simd128' \
+    //!     CARGO_TARGET_WASM32_WASIP1_RUNNER='wasmtime --env RUST_TEST_NOCAPTURE=1 --' \
+    //!     cargo test --release --target=wasm32-wasip1 -p tract-linalg \
+    //!     wasm::microbench_activations::microbench -- --nocapture --ignored
+    use crate::frame::element_wise::ElementWiseKer;
+    use std::time::Instant;
+
+    fn ns_per_call<K: ElementWiseKer<f32>>(buf: &mut [f32], iters: usize) -> f64 {
+        // Warmup
+        for _ in 0..50 {
+            K::run(buf, ());
+        }
+        let t0 = Instant::now();
+        for _ in 0..iters {
+            K::run(buf, ());
+        }
+        let elapsed = t0.elapsed();
+        elapsed.as_secs_f64() / iters as f64 * 1e9
+    }
+
+    fn bench(label: &str, n: usize, iters: usize) {
+        // Same input for both kernels — rebuild between to avoid post-clamp
+        // saturation mucking up the measurement.
+        let make = || (0..n).map(|i| ((i % 37) as f32 - 18.0) * 0.5).collect::<Vec<f32>>();
+
+        let mut buf = make();
+        let scalar_sig = ns_per_call::<crate::generic::sigmoid::SSigmoid4>(&mut buf, iters);
+        let mut buf = make();
+        let simd_sig = ns_per_call::<crate::wasm::WasmSigmoid4Relaxed>(&mut buf, iters);
+        let mut buf = make();
+        let scalar_tanh = ns_per_call::<crate::generic::tanh::STanh4>(&mut buf, iters);
+        let mut buf = make();
+        let simd_tanh = ns_per_call::<crate::wasm::WasmTanh4Relaxed>(&mut buf, iters);
+
+        eprintln!(
+            "{label} n={n} iters={iters}: \
+             sigmoid scalar={scalar_sig:.0} ns simd={simd_sig:.0} ns ({:.2}x); \
+             tanh scalar={scalar_tanh:.0} ns simd={simd_tanh:.0} ns ({:.2}x)",
+            scalar_sig / simd_sig,
+            scalar_tanh / simd_tanh,
+        );
+    }
+
+    #[test]
+    #[ignore]
+    fn microbench() {
+        eprintln!("=== WASM SIMD activations: scalar vs simd ===");
+        bench("hidden=256", 256, 5_000);
+        bench("hidden=512", 512, 3_000);
+        bench("hidden=1024", 1024, 2_000);
+    }
+}
diff --git a/linalg/src/x86_64_fma.rs b/linalg/src/x86_64_fma.rs
index 733d1e7121..84a4cfc9c8 100644
--- a/linalg/src/x86_64_fma.rs
+++ b/linalg/src/x86_64_fma.rs
@@ -5,6 +5,9 @@ use crate::x86_64_fma::softmax::x86_64_fma_softmax2_fastcompact_f32_32n;
 
 pub mod mmm;
 
+pub mod amx;
+pub mod amx_bf16;
+pub mod avxvnni;
 pub mod by_scalar;
 mod intel;
 pub mod max;
@@ -14,6 +17,7 @@ pub mod softmax;
 const AVX2: fn() -> bool = || is_x86_feature_detected!("avx2");
 const FMA: fn() -> bool = || is_x86_feature_detected!("fma");
 const AVX512F: fn() -> bool = || is_x86_feature_detected!("avx512f");
+const AVX512VNNI: fn() -> bool = || is_x86_feature_detected!("avx512vnni");
 
 tanh_impl!(f32, fma_tanh_f32, 8, 8, is_x86_feature_detected!("fma"));
 sigmoid_impl!(f32, fma_sigmoid_f32, 8, 8, is_x86_feature_detected!("fma"));
diff --git a/linalg/src/x86_64_fma/amx.rs b/linalg/src/x86_64_fma/amx.rs
new file mode 100644
index 0000000000..685a9618c8
--- /dev/null
+++ b/linalg/src/x86_64_fma/amx.rs
@@ -0,0 +1,262 @@
+// Intel AMX int8 support: A packing format and runtime gate.
+//
+// The kernel `avx512amx_mmm_i32_8x8` uses TDPBSSD (signed-signed). Per
+// iteration of its inner loop it consumes one 8x64-byte A tile and one
+// 16x32-byte B tile and updates an 8x8 i32 C tile. The B-side packing
+// matches the existing K=4-inner `PackedI8K4` layout, so it is reused
+// unchanged. The A-side packing is novel: AMX's tile-A semantics require
+// M-major-within-panel row-major bytes, which is incompatible with the
+// K-major-outer `PackedI8K4`. `PackedAmxA` below produces that layout.
+//
+// Runtime gate: CPUID `amx-int8` is necessary but not sufficient on Linux —
+// the kernel must also call `arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)`
+// to receive AMX tile-data XSAVE permission from the kernel before any tile
+// instruction can run. `has_amx_int8()` performs both checks once and caches
+// the result; it returns false on non-Linux even if CPUID reports AMX.
+
+use std::sync::OnceLock;
+
+use tract_data::internal::*;
+
+use crate::WeightType;
+use crate::frame::mmm::{
+    EagerPackedInput, MMMInputFormat, MMMInputValue, PackedExoticFact, PackedMatrixStorage,
+};
+
+/// Per-cache geometry from CPUID leaf 4 deterministic cache parameters
+/// (the mechanism oneDNN's `platform::get_per_core_cache_size` ultimately
+/// reads). Used here for runtime adaptive choices that depend on the
+/// hardware -- e.g. picking `tileloadd` vs `tileloaddt1` based on whether
+/// the matmul working set fits in L1d (oneDNN's `try_load_nt` heuristic).
+#[derive(Clone, Copy, Debug, Default)]
+pub struct CacheSizes {
+    pub l1d_bytes: usize,
+    pub l2_bytes: usize,
+    pub l3_bytes: usize,
+}
+
+/// Probe per-core L1d/L2/L3 cache sizes via CPUID leaf 4 deterministic
+/// cache parameters. Iterates sub-leaves 0, 1, 2, ... until cache type = 0
+/// (no more caches). Each cache is described by:
+///   EAX[4:0]  = cache type (0=null, 1=data, 2=instr, 3=unified)
+///   EAX[7:5]  = cache level (1, 2, 3, ...)
+///   EBX[11:0] = ways - 1
+///   EBX[21:12]= partitions - 1
+///   EBX[31:22]= line_size_bytes - 1
+///   ECX       = sets - 1
+///   cache_bytes = (ways+1) * (partitions+1) * (line_size+1) * (sets+1)
+/// Returns zeros for unknown levels (e.g. on a CPU without an L3, or if
+/// the CPUID interface is unavailable). Memoised; called at most once.
+pub fn cache_sizes() -> CacheSizes {
+    static CACHE: OnceLock<CacheSizes> = OnceLock::new();
+    *CACHE.get_or_init(|| {
+        let mut out = CacheSizes::default();
+        for sub in 0..16 {
+            let r = std::arch::x86_64::__cpuid_count(4, sub);
+            let cache_type = r.eax & 0x1F;
+            if cache_type == 0 {
+                break;
+            }
+            let level = (r.eax >> 5) & 0x7;
+            let ways = ((r.ebx >> 22) & 0x3FF) + 1;
+            let partitions = ((r.ebx >> 12) & 0x3FF) + 1;
+            let line_size = (r.ebx & 0xFFF) + 1;
+            let sets = r.ecx + 1;
+            let bytes = (ways as usize) * (partitions as usize)
+                * (line_size as usize) * (sets as usize);
+            // type=1 (data), type=3 (unified) for L1d / L2 / L3
+            match (level, cache_type) {
+                (1, 1) => out.l1d_bytes = bytes,
+                (2, 1 | 3) => out.l2_bytes = bytes,
+                (3, 1 | 3) => out.l3_bytes = bytes,
+                _ => {}
+            }
+        }
+        out
+    })
+}
+
+/// Detect AMX-INT8 + AMX-TILE via CPUID leaf 7 sub-leaf 0 (EDX bits 24-25).
+/// Stable-Rust friendly: `is_x86_feature_detected!("amx-int8")` is gated on
+/// the nightly `x86_amx_intrinsics` feature, so we read CPUID by hand.
+fn cpu_has_amx_int8() -> bool {
+    if !std::is_x86_feature_detected!("avx512f") {
+        return false;
+    }
+    let r = std::arch::x86_64::__cpuid_count(7, 0);
+    // bit 24 = AMX-TILE, bit 25 = AMX-INT8 in EDX.
+    const AMX_TILE: u32 = 1 << 24;
+    const AMX_INT8: u32 = 1 << 25;
+    (r.edx & AMX_TILE) != 0 && (r.edx & AMX_INT8) != 0
+}
+
+/// Linux only: ask the kernel for permission to use the AMX tile-data XSAVE
+/// state via `arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)`. Returns
+/// true if the kernel grants permission (or if the process already has it).
+/// Exposed via `request_amx_tile_xcomp_perm()` below so the bf16 path can
+/// share the same OS-level gate.
+#[cfg(target_os = "linux")]
+unsafe fn request_amx_xcomp_perm() -> bool {
+    // x86_64 syscall: rax=158 (arch_prctl), rdi=0x1023 (REQ_XCOMP_PERM),
+    // rsi=18 (XFEATURE_XTILEDATA). Returns 0 on success.
+    let rc: i64;
+    unsafe {
+        std::arch::asm!(
+            "syscall",
+            in("rax") 158i64,
+            in("rdi") 0x1023i64,
+            in("rsi") 18i64,
+            lateout("rax") rc,
+            out("rcx") _,
+            out("r11") _,
+            options(nostack),
+        );
+    }
+    rc == 0
+}
+
+/// Memoised wrapper around `request_amx_xcomp_perm` -- arch_prctl has a
+/// process-wide effect and only needs to be called once for the whole
+/// lifetime of the process. Returns true iff the OS has granted permission
+/// for XFEATURE_XTILEDATA (and hence enables both AMX int8 AND AMX bf16
+/// kernels). Returns false on non-Linux.
+pub fn request_amx_tile_xcomp_perm() -> bool {
+    static GATE: OnceLock<bool> = OnceLock::new();
+    *GATE.get_or_init(|| {
+        #[cfg(target_os = "linux")]
+        {
+            unsafe { request_amx_xcomp_perm() }
+        }
+        #[cfg(not(target_os = "linux"))]
+        {
+            false
+        }
+    })
+}
+
+/// Returns true iff Intel AMX int8 is available AND the OS has granted this
+/// process permission to use the AMX tile-data XSAVE state. Result is
+/// memoised — the arch_prctl call has process-wide effect and only needs to
+/// run once.
+pub fn has_amx_int8() -> bool {
+    static GATE: OnceLock<bool> = OnceLock::new();
+    *GATE.get_or_init(|| cpu_has_amx_int8() && request_amx_tile_xcomp_perm())
+}
+
+/// AMX-friendly A packing: per `r`-row panel, M-rows are laid out row-major
+/// across `K_padded = ceil(K / 64) * 64` contiguous bytes per row. AMX's
+/// `tileloadd` with stride = K_padded reads exactly 8 contiguous M-rows of
+/// 64 K-bytes each per call.
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub struct PackedAmxA {
+    pub r: usize,
+    pub align: usize,
+}
+
+impl PackedAmxA {
+    pub fn new(r: usize) -> Self {
+        PackedAmxA { r, align: 64 }
+    }
+    fn k_padded(&self, k: usize) -> usize {
+        k.div_ceil(64) * 64
+    }
+    fn panel(&self, k: usize) -> usize {
+        self.k_padded(k) * self.r
+    }
+    pub fn single_panel_len(&self, k: usize) -> usize {
+        self.panel(k)
+    }
+    pub fn len(&self, k: usize, mn: usize) -> usize {
+        mn.div_ceil(self.r) * self.panel(k)
+    }
+    pub fn alignment(&self) -> usize {
+        self.align
+    }
+
+    pub fn pack_view(
+        &self,
+        t: &TensorView,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        let k = t.shape()[k_axis];
+        let mn = t.shape()[mn_axis];
+        let kp = self.k_padded(k);
+        let pl = kp * self.r;
+        let panels = mn.div_ceil(self.r);
+        let st = t.strides();
+        let (ks, ms) = (st[k_axis], st[mn_axis]);
+        let mut blob = unsafe { Blob::new_for_size_and_align(panels * pl, self.align) };
+        blob.as_bytes_mut().fill(0);
+        unsafe {
+            let src = t.as_ptr_unchecked::<i8>();
+            let dst = blob.as_mut_ptr() as *mut i8;
+            for p in 0..panels {
+                let pw = self.r.min(mn - p * self.r);
+                let panel = dst.add(p * pl);
+                let mn0 = (p * self.r) as isize;
+                for lm in 0..pw {
+                    let drow = panel.add(lm * kp);
+                    let srow_base = src.offset((mn0 + lm as isize) * ms);
+                    for kk in 0..k {
+                        *drow.add(kk) = *srow_base.offset(kk as isize * ks);
+                    }
+                }
+            }
+        }
+        Ok(Box::new(EagerPackedInput {
+            fact: PackedExoticFact { format: Box::new(self.clone()), mn: mn.to_dim(), k },
+            packed: blob.into(),
+            panel_bytes: pl,
+            mn,
+        }))
+    }
+}
+
+impl std::fmt::Display for PackedAmxA {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "AmxA[{}]", self.r)
+    }
+}
+
+impl MMMInputFormat for PackedAmxA {
+    fn prepare_tensor(&self, t: &Tensor, k_axis: usize, mn_axis: usize) -> TractResult<Tensor> {
+        Ok(PackedMatrixStorage::new(self.prepare_one(t, k_axis, mn_axis)?)
+            .into_tensor(t.datum_type()))
+    }
+    fn prepare_one(
+        &self,
+        t: &Tensor,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        self.pack_view(&t.view(), k_axis, mn_axis)
+    }
+    fn precursor(&self) -> WeightType {
+        WeightType::Plain(i8::datum_type())
+    }
+    fn r(&self) -> usize {
+        self.r
+    }
+    fn k_alignment(&self) -> usize {
+        // AMX consumes K=64 bytes per tdpbssd inner iteration; the packer
+        // already pads internally, but expose the alignment so upstream
+        // schedulers can reason about K-blocking.
+        64
+    }
+    fn merge_with<'o, 'a: 'o, 'b: 'o>(
+        &'a self,
+        o: &'b dyn MMMInputFormat,
+    ) -> Option<&'o dyn MMMInputFormat> {
+        o.downcast_ref::<PackedAmxA>().filter(|x| x.r == self.r).map(|_| self as _)
+    }
+    fn mem_size(&self, k: TDim, mn: TDim) -> TDim {
+        mn.divceil(self.r) * self.panel(k.to_usize().unwrap_or(0))
+    }
+    fn extract_at_mn_f16(&self, _: &EagerPackedInput, _: usize, _: &mut [f16]) -> TractResult<()> {
+        bail!("no f16 extract")
+    }
+    fn extract_at_mn_f32(&self, _: &EagerPackedInput, _: usize, _: &mut [f32]) -> TractResult<()> {
+        bail!("no f32 extract")
+    }
+}
diff --git a/linalg/src/x86_64_fma/amx_bf16.rs b/linalg/src/x86_64_fma/amx_bf16.rs
new file mode 100644
index 0000000000..ba64cfa16a
--- /dev/null
+++ b/linalg/src/x86_64_fma/amx_bf16.rs
@@ -0,0 +1,315 @@
+// Intel AMX bf16 support: f32 -> bf16 packers and the AMX bf16 runtime gate.
+//
+// The kernel `avx512amx_mmm_f32_16x16` uses TDPBF16PS (bf16 x bf16 -> f32) to
+// accelerate f32 matmul on Sapphire Rapids+ AMX hardware. The inputs are
+// truncated from f32 to bf16 at pack time (round-to-nearest-even, matching
+// Intel's VCVTNEPS2BF16 semantics); the f32 accumulators are bit-identical
+// to a "scalar bf16 multiply + f32 accumulate" reference but DIFFER from a
+// pure-f32 FMA reference by ~1 / 2^8 relative per multiply (bf16 has 8
+// mantissa bits vs f32's 23). This precision loss is the same as oneDNN
+// "fast-math" f32 matmul on AMX and is acceptable for inference workloads
+// (LLMs, CNNs) that already tolerate bf16.
+//
+// Tile geometry mirrors the i32 16x16 kernel: 16 rows x 64 colsb per tile.
+// Per TDPBF16PS: 16 M-rows x 16 N-cols x 32 K-bf16 = 8192 fma operations
+// per single instruction -- the same throughput as TDPBSSD.
+
+use std::sync::OnceLock;
+
+use tract_data::internal::*;
+
+use crate::WeightType;
+use crate::frame::mmm::{
+    EagerPackedInput, MMMInputFormat, MMMInputValue, PackedExoticFact, PackedMatrixStorage,
+};
+
+/// Detect AMX-BF16 + AMX-TILE via CPUID leaf 7 sub-leaf 0 (EDX bits 22, 24).
+/// AMX-BF16 is the bit-22 capability; AMX-TILE (bit 24) is mandatory for any
+/// AMX use. Returns false unless both are present.
+fn cpu_has_amx_bf16() -> bool {
+    if !std::is_x86_feature_detected!("avx512f") {
+        return false;
+    }
+    let r = std::arch::x86_64::__cpuid_count(7, 0);
+    const AMX_BF16: u32 = 1 << 22;
+    const AMX_TILE: u32 = 1 << 24;
+    (r.edx & AMX_BF16) != 0 && (r.edx & AMX_TILE) != 0
+}
+
+/// Returns true iff Intel AMX bf16 is available AND the OS has granted this
+/// process permission to use the AMX tile-data XSAVE state. Reuses the
+/// arch_prctl XCOMP-perm request mechanism from the int8 path -- the same
+/// XFEATURE_XTILEDATA permission gates both data types.
+pub fn has_amx_bf16() -> bool {
+    static GATE: OnceLock<bool> = OnceLock::new();
+    *GATE.get_or_init(|| cpu_has_amx_bf16() && super::amx::request_amx_tile_xcomp_perm())
+}
+
+/// Convert an f32 to bf16 with round-to-nearest-even (matches Intel's
+/// VCVTNEPS2BF16). NaN inputs are preserved as quiet NaN. Used by the bf16
+/// packers below (scalar; AMX hardware is on Sapphire Rapids+ which has the
+/// AVX-512-BF16 intrinsic for batched conversion, but packing is amortised
+/// over many kernel calls so the scalar path is fine).
+#[inline]
+pub fn f32_to_bf16_rne(x: f32) -> u16 {
+    let bits = x.to_bits();
+    // NaN check: exponent all-ones and mantissa nonzero.
+    if (bits & 0x7F80_0000) == 0x7F80_0000 && (bits & 0x007F_FFFF) != 0 {
+        // Quiet NaN: set the top mantissa bit of the bf16 result.
+        ((bits >> 16) as u16) | 0x0040
+    } else {
+        // round-to-nearest-even: add 0x7FFF + (lsb of bf16) before truncating.
+        let lsb = (bits >> 16) & 1;
+        let rounding = 0x0000_7FFF + lsb;
+        (bits.wrapping_add(rounding) >> 16) as u16
+    }
+}
+
+/// AMX-friendly A packing for f32 matmul via bf16. Per `r`-row panel, the
+/// M-rows are laid out row-major in bf16 across `K_padded` contiguous bf16
+/// per row (K_padded = ceil(K/32)*32, so each row is a whole number of
+/// AMX K-tile widths). Source is f32; conversion happens at pack time.
+///
+///   panel_bytes = r * K_padded * 2  (each bf16 = 2 bytes)
+///
+/// AMX `tileloadd` with stride = K_padded*2 reads exactly 16 M-rows of
+/// 64 bytes (= 32 bf16) per call -- one inner-K iter's worth.
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub struct PackedAmxBf16A {
+    pub r: usize,
+    pub align: usize,
+}
+
+impl PackedAmxBf16A {
+    pub fn new(r: usize) -> Self {
+        PackedAmxBf16A { r, align: 64 }
+    }
+    fn k_padded(&self, k: usize) -> usize {
+        k.div_ceil(32) * 32
+    }
+    fn panel(&self, k: usize) -> usize {
+        self.k_padded(k) * self.r * 2
+    }
+    pub fn single_panel_len(&self, k: usize) -> usize {
+        self.panel(k)
+    }
+    pub fn len(&self, k: usize, mn: usize) -> usize {
+        mn.div_ceil(self.r) * self.panel(k)
+    }
+    pub fn alignment(&self) -> usize {
+        self.align
+    }
+
+    pub fn pack_view(
+        &self,
+        t: &TensorView,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        let k = t.shape()[k_axis];
+        let mn = t.shape()[mn_axis];
+        let kp = self.k_padded(k);
+        let pl = kp * self.r * 2; // bytes per panel
+        let panels = mn.div_ceil(self.r);
+        let st = t.strides();
+        let (ks, ms) = (st[k_axis], st[mn_axis]);
+        let mut blob = unsafe { Blob::new_for_size_and_align(panels * pl, self.align) };
+        blob.as_bytes_mut().fill(0);
+        unsafe {
+            let src = t.as_ptr_unchecked::<f32>();
+            let dst = blob.as_mut_ptr() as *mut u16;
+            for p in 0..panels {
+                let pw = self.r.min(mn - p * self.r);
+                let panel = dst.add(p * (kp * self.r)); // panel_offset in u16 elements
+                let mn0 = (p * self.r) as isize;
+                for lm in 0..pw {
+                    let drow = panel.add(lm * kp);
+                    let srow_base = src.offset((mn0 + lm as isize) * ms);
+                    for kk in 0..k {
+                        let v = *srow_base.offset(kk as isize * ks);
+                        *drow.add(kk) = f32_to_bf16_rne(v);
+                    }
+                }
+            }
+        }
+        Ok(Box::new(EagerPackedInput {
+            fact: PackedExoticFact { format: Box::new(self.clone()), mn: mn.to_dim(), k },
+            packed: blob.into(),
+            panel_bytes: pl,
+            mn,
+        }))
+    }
+}
+
+impl std::fmt::Display for PackedAmxBf16A {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "AmxBf16A[{}]", self.r)
+    }
+}
+
+impl MMMInputFormat for PackedAmxBf16A {
+    fn prepare_tensor(&self, t: &Tensor, k_axis: usize, mn_axis: usize) -> TractResult<Tensor> {
+        Ok(PackedMatrixStorage::new(self.prepare_one(t, k_axis, mn_axis)?)
+            .into_tensor(t.datum_type()))
+    }
+    fn prepare_one(
+        &self,
+        t: &Tensor,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        self.pack_view(&t.view(), k_axis, mn_axis)
+    }
+    fn k_alignment(&self) -> usize {
+        // tdpbf16ps consumes 32 bf16 per K-step.
+        32
+    }
+    fn r(&self) -> usize {
+        self.r
+    }
+    fn precursor(&self) -> WeightType {
+        WeightType::Plain(f32::datum_type())
+    }
+    fn merge_with<'o, 'a: 'o, 'b: 'o>(
+        &'a self,
+        o: &'b dyn MMMInputFormat,
+    ) -> Option<&'o dyn MMMInputFormat> {
+        o.downcast_ref::<PackedAmxBf16A>().filter(|x| x.r == self.r).map(|_| self as _)
+    }
+    fn mem_size(&self, k: TDim, mn: TDim) -> TDim {
+        mn.divceil(self.r) * self.panel(k.to_usize().unwrap_or(0))
+    }
+    fn extract_at_mn_f16(&self, _: &EagerPackedInput, _: usize, _: &mut [f16]) -> TractResult<()> {
+        bail!("no f16 extract")
+    }
+    fn extract_at_mn_f32(&self, _: &EagerPackedInput, _: usize, _: &mut [f32]) -> TractResult<()> {
+        bail!("no f32 extract")
+    }
+}
+
+/// AMX-friendly B packing for f32 matmul via bf16 (analog of PackedI8K4 but
+/// K=2-inner instead of K=4-inner -- tdpbf16ps groups 2 bf16 per K-step).
+///
+/// Per K=2 block: r N-cols x 2 K-bf16 = r * 2 * 2 bytes = 4r bytes.
+/// Block layout: byte (n*4 + ki*2..(n*4 + ki*2 + 2)) = bf16 of B[2kb+ki, n].
+/// For r=16: 64 bytes per K=2 block, 16 blocks per K=32 AMX tile -> 1024 B.
+///
+/// One AMX `tileloadd` with stride = 4r bytes reads 16 K-pair-rows of
+/// r * 4 bytes each = one inner-K iter's worth of B.
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub struct PackedBf16K2 {
+    pub r: usize,
+    pub align: usize,
+}
+
+impl PackedBf16K2 {
+    pub fn new(r: usize) -> Self {
+        PackedBf16K2 { r, align: 64 }
+    }
+    fn k_padded(&self, k: usize) -> usize {
+        k.div_ceil(2) * 2
+    }
+    fn panel(&self, k: usize) -> usize {
+        self.k_padded(k) * self.r * 2
+    }
+    pub fn single_panel_len(&self, k: usize) -> usize {
+        self.panel(k)
+    }
+    pub fn len(&self, k: usize, mn: usize) -> usize {
+        mn.div_ceil(self.r) * self.panel(k)
+    }
+    pub fn alignment(&self) -> usize {
+        self.align
+    }
+    pub fn pack_view(
+        &self,
+        t: &TensorView,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        let k = t.shape()[k_axis];
+        let mn = t.shape()[mn_axis];
+        let kp = self.k_padded(k);
+        let pl = kp * self.r * 2; // bytes per panel
+        let panels = mn.div_ceil(self.r);
+        let st = t.strides();
+        let mut blob = unsafe { Blob::new_for_size_and_align(panels * pl, self.align) };
+        blob.as_bytes_mut().fill(0);
+        let (ks, ms) = (st[k_axis], st[mn_axis]);
+        let kblocks = kp / 2;
+        unsafe {
+            let src = t.as_ptr_unchecked::<f32>();
+            let dst = blob.as_mut_ptr() as *mut u16;
+            for p in 0..panels {
+                let pw = self.r.min(mn - p * self.r);
+                let panel = dst.add(p * (kp * self.r));
+                let mn0 = (p * self.r) as isize;
+                for kb in 0..kblocks {
+                    for ki in 0..2 {
+                        let kk = kb * 2 + ki;
+                        if kk >= k {
+                            break;
+                        }
+                        let srow = src.offset(kk as isize * ks + mn0 * ms);
+                        let dblock = panel.add(kb * self.r * 2 + ki);
+                        for lm in 0..pw {
+                            let v = *srow.offset(lm as isize * ms);
+                            *dblock.add(lm * 2) = f32_to_bf16_rne(v);
+                        }
+                    }
+                }
+            }
+        }
+        Ok(Box::new(EagerPackedInput {
+            fact: PackedExoticFact { format: Box::new(self.clone()), mn: mn.to_dim(), k },
+            packed: blob.into(),
+            panel_bytes: pl,
+            mn,
+        }))
+    }
+}
+
+impl std::fmt::Display for PackedBf16K2 {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "Bf16K2[{}]", self.r)
+    }
+}
+
+impl MMMInputFormat for PackedBf16K2 {
+    fn prepare_tensor(&self, t: &Tensor, k_axis: usize, mn_axis: usize) -> TractResult<Tensor> {
+        Ok(PackedMatrixStorage::new(self.prepare_one(t, k_axis, mn_axis)?)
+            .into_tensor(t.datum_type()))
+    }
+    fn prepare_one(
+        &self,
+        t: &Tensor,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        self.pack_view(&t.view(), k_axis, mn_axis)
+    }
+    fn k_alignment(&self) -> usize {
+        2
+    }
+    fn r(&self) -> usize {
+        self.r
+    }
+    fn precursor(&self) -> WeightType {
+        WeightType::Plain(f32::datum_type())
+    }
+    fn merge_with<'o, 'a: 'o, 'b: 'o>(
+        &'a self,
+        o: &'b dyn MMMInputFormat,
+    ) -> Option<&'o dyn MMMInputFormat> {
+        o.downcast_ref::<PackedBf16K2>().filter(|x| x.r == self.r).map(|_| self as _)
+    }
+    fn mem_size(&self, k: TDim, mn: TDim) -> TDim {
+        mn.divceil(self.r) * self.panel(k.to_usize().unwrap_or(0))
+    }
+    fn extract_at_mn_f16(&self, _: &EagerPackedInput, _: usize, _: &mut [f16]) -> TractResult<()> {
+        bail!("no f16 extract")
+    }
+    fn extract_at_mn_f32(&self, _: &EagerPackedInput, _: usize, _: &mut [f32]) -> TractResult<()> {
+        bail!("no f32 extract")
+    }
+}
diff --git a/linalg/src/x86_64_fma/avxvnni.rs b/linalg/src/x86_64_fma/avxvnni.rs
new file mode 100644
index 0000000000..5d98637865
--- /dev/null
+++ b/linalg/src/x86_64_fma/avxvnni.rs
@@ -0,0 +1,42 @@
+// AVX-VNNI int8 GEMM runtime gate.
+//
+// AVX-VNNI (CPUID leaf 7 sub-leaf 1 EAX bit 4) is the VEX-encoded sibling of
+// AVX-512-VNNI's VPDPBUSD: same i32 += u8 * s8 dot4 semantics, but addressable
+// from VEX (= AVX2-class) decoders. It exists primarily for Atom-class
+// server / E-core SKUs that have AVX2 + AVX-VNNI but no AVX-512:
+//
+//   * Alder Lake / Raptor Lake / Meteor Lake E-cores (Gracemont, Crestmont)
+//   * Sierra Forest (Sierra Glen)
+//   * Clearwater Forest (Darkmont)
+//
+// On a CPU with AVX-512-VNNI (Cascade Lake, Ice Lake, Sapphire Rapids+), this
+// detector still returns true if CPUID leaf 7.1 EAX.4 is set -- some big-core
+// SKUs report AVX-VNNI alongside AVX-512-VNNI -- but the dispatch in mmm.rs
+// prefers the EVEX-encoded avx512vnni kernel in that case (same throughput,
+// 32 zmm registers available for unrolling). The AVX-VNNI kernel is only
+// selected when AVX-512-VNNI is absent.
+
+use std::sync::OnceLock;
+
+/// CPUID leaf 7 sub-leaf 1, EAX bit 4 = AVX-VNNI (Intel SDM Vol 2 Table 1-7).
+/// Sub-leaf 1 is only valid when CPUID.7.0.EAX (the max sub-leaf field) >= 1;
+/// older CPUs return zeroed structures. We check the max-sub-leaf first to
+/// avoid a misleading bit on pre-AVX-VNNI silicon.
+fn cpu_has_avxvnni() -> bool {
+    if !std::is_x86_feature_detected!("avx2") {
+        return false;
+    }
+    let max_sub = std::arch::x86_64::__cpuid_count(7, 0).eax;
+    if max_sub < 1 {
+        return false;
+    }
+    let r = std::arch::x86_64::__cpuid_count(7, 1);
+    (r.eax & (1 << 4)) != 0
+}
+
+/// Returns true iff CPUID reports AVX-VNNI on this CPU. Memoised; no OS
+/// permission gate is required (unlike AMX, AVX-VNNI uses no extended state).
+pub fn has_avxvnni() -> bool {
+    static GATE: OnceLock<bool> = OnceLock::new();
+    *GATE.get_or_init(cpu_has_avxvnni)
+}
diff --git a/linalg/src/x86_64_fma/mmm.rs b/linalg/src/x86_64_fma/mmm.rs
index 5ddd6a8938..b2a45d4c46 100644
--- a/linalg/src/x86_64_fma/mmm.rs
+++ b/linalg/src/x86_64_fma/mmm.rs
@@ -1,10 +1,65 @@
 use crate::Ops;
 use crate::block_quant::*;
 use crate::mmm::ImplementationQuality::ManuallyOptimized;
-use crate::pack::PackedFormat;
+use crate::mmm::MatMatMul;
+use crate::pack::{PackedFormat, PackedI8K4};
 
+use super::amx::{PackedAmxA, has_amx_int8};
+#[cfg(tract_amx_bf16)]
+use super::amx_bf16::{PackedAmxBf16A, PackedBf16K2, has_amx_bf16};
+#[cfg(tract_avxvnni)]
+use super::avxvnni::has_avxvnni;
 use super::*;
 
+#[cfg(tract_amx_int8)]
+const AVX512AMX: fn() -> bool = has_amx_int8;
+#[cfg(tract_amx_bf16)]
+const AVX512AMX_BF16: fn() -> bool = has_amx_bf16;
+#[cfg(tract_avxvnni)]
+const AVXVNNI: fn() -> bool = has_avxvnni;
+
+/// One candidate kernel in a dispatcher's pool, with its tile geometry
+/// and a relative-throughput scale (1.0 = baseline, used to break
+/// near-ties between kernels with similar tile waste).
+#[derive(Clone, Copy)]
+struct KernelChoice {
+    mr: usize,
+    nr: usize,
+    scale: f32,
+    ctor: fn() -> Box<dyn MatMatMul>,
+}
+
+/// Fraction of the M-or-N axis covered by useful work after rounding up
+/// to the kernel's tile size. 1.0 = exact fit; smaller is worse.
+/// Empty axis (d == 0) is treated as "no waste" — no work to misallocate.
+fn tile_util(d: usize, tile: usize) -> f32 {
+    if d == 0 {
+        return 1.0;
+    }
+    let batches = d.div_ceil(tile);
+    d as f32 / (batches * tile) as f32
+}
+
+/// Pick the kernel that maximises `scale * m_util * n_util`. Ties are
+/// broken first in favour of fewer total tile passes (less loop
+/// overhead), then in favour of larger `nr` (more K-loop amortisation
+/// per inner iteration). An unknown M or N is treated as
+/// "large enough" — its utilisation contribution is 1.0.
+fn pick_mmm(candidates: &[KernelChoice], m: Option<usize>, n: Option<usize>) -> Box<dyn MatMatMul> {
+    let key = |c: &KernelChoice| -> (f32, i32, i32) {
+        let m_u = m.map(|m| tile_util(m, c.mr)).unwrap_or(1.0);
+        let n_u = n.map(|n| tile_util(n, c.nr)).unwrap_or(1.0);
+        let m_b = m.map(|m| m.div_ceil(c.mr)).unwrap_or(1) as i32;
+        let n_b = n.map(|n| n.div_ceil(c.nr)).unwrap_or(1) as i32;
+        (c.scale * m_u * n_u, -(m_b * n_b), c.nr as i32)
+    };
+    let best = candidates
+        .iter()
+        .max_by(|a, b| key(a).partial_cmp(&key(b)).unwrap())
+        .expect("non-empty kernel pool");
+    (best.ctor)()
+}
+
 MMMExternKernel!(fma_mmm_f32_8x8 <f32>(8, 8)@(256,4) where(FMA) quality(ManuallyOptimized));
 MMMExternKernel!(fma_mmm_f32_16x6<f32>(16,6)@(256,4) where(FMA) quality(ManuallyOptimized));
 MMMExternKernel!(fma_mmm_f32_16x5<f32>(16,5)@(256,4) where(FMA) quality(ManuallyOptimized));
@@ -48,18 +103,240 @@ MMMExternKernel! { avx2_mmm_i32_8x8<i32>(8,8)@(256,4) where(AVX2)
     store(i8)
 }
 
+// AVX-512 VNNI int8 GEMM: same 8x8 column-accumulator tile and quantization
+// epilogue as avx2_mmm_i32_8x8, but the i8i8 matmul inner loop uses VPDPBUSD
+// (4-way K dot) over the K=4-inner PackedI8K4 layout. VPDPBUSD is u8*s8, so the
+// kernel offsets A by +128 and removes the 128*sum_k(B) bias per column before
+// the epilogue, making the i32 accumulators bit-identical to the AVX2 path.
+MMMExternKernel! { avx512vnni_mmm_i32_8x8<i32>(8,8)@(256,4) where(AVX512VNNI)
+    packing[1] = i8i8 => |k| k.with_packing(PackedI8K4::new(8), PackedI8K4::new(8));
+    quality(ManuallyOptimized)
+    store(i8)
+}
+
+// AVX-512 VNNI int8 GEMM, zmm-wide 16x16 sibling of avx512vnni_mmm_i32_8x8.
+// Accumulators are ROW-MAJOR (zmm{m} = row m of C, 16 columns per zmm), so one
+// VPDPBUSD covers 16 columns x 4 K and the K=4 inner step issues 16 of them
+// (one per row) = 1024 mul-adds/block, 2x the 8x8 ymm kernel's work per
+// iteration. Same +128 A-bias / per-column correction as the 8x8 kernel, and
+// the same PackedI8K4 layout (r=16 for both A and B). This is the int8
+// throughput tier of qmmm_i32 for big cores with AVX-512-VNNI but no AMX
+// (Cascade Lake / Ice Lake / Tiger Lake server + client).
+//
+// boost(50) lifts it above the 8x8 VNNI candidate in the einsum kernel-selection
+// scorer for unknown shapes, while staying below the AMX 16x16 kernels' boost(100)
+// so AMX still wins when both are present.
+MMMExternKernel! { avx512vnni_mmm_i32_16x16<i32>(16,16)@(64,4) where(AVX512VNNI)
+    packing[1] = i8i8 => |k| k.with_packing(PackedI8K4::new(16), PackedI8K4::new(16));
+    quality(ManuallyOptimized)
+    boost(|| 50)
+    store(i8)
+}
+
+// AVX-VNNI ymm int8 GEMM: byte-for-byte the same body as avx512vnni_mmm_i32_8x8
+// (8x8 ymm accumulators, PackedI8K4 inner-K, +128 bias trick), but the
+// VPDPBUSD instructions are forced to the VEX (AVX-VNNI) encoding via the
+// `{vex}` prefix. Runs on Atom-class cores (Alder Lake-E, Sierra Forest,
+// Clearwater Forest / Darkmont) which have AVX-VNNI but no AVX-512. On big
+// cores with both AVX-512-VNNI and AVX-VNNI (Sapphire Rapids+, some Alder
+// Lake P-core SKUs) dispatch prefers the EVEX-encoded kernel above.
+#[cfg(tract_avxvnni)]
+MMMExternKernel! { avxvnni_mmm_i32_8x8<i32>(8,8)@(256,4) where(AVXVNNI)
+    packing[1] = i8i8 => |k| k.with_packing(PackedI8K4::new(8), PackedI8K4::new(8));
+    quality(ManuallyOptimized)
+    store(i8)
+}
+
+// Same epilogue as avx512vnni_mmm_i32_8x8 (8x8 ymm accumulators), but the i8i8
+// matmul inner loop uses TDPBSSD (16-M x 16-N x 64-K mul-acc per instruction)
+// over AMX tiles. A's packing is novel (PackedAmxA, M-major-within-panel,
+// K-padded to multiples of 64); B reuses VNNI's K=4-inner PackedI8K4 layout
+// unchanged. TDPBSSD is s8 x s8 so no +128 bias trick — accumulators are
+// bit-identical to AVX2/VNNI. Gated by `where(AVX512AMX)` (= CPUID amx-int8
+// AND Linux XSAVE permission via arch_prctl).
+#[cfg(tract_amx_int8)]
+MMMExternKernel! { avx512amx_mmm_i32_8x8<i32>(8,8)@(64,4) where(AVX512AMX)
+    packing[1] = i8i8 => |k| k.with_packing(PackedAmxA::new(8), PackedI8K4::new(8));
+    quality(ManuallyOptimized)
+    store(i8)
+}
+
+// 16x16 i32 sibling. One tdpbssd does 16*16*64 = 16384 mul-adds (4x the 8x8).
+// Same A/B packing (PackedAmxA, PackedI8K4) just with r=16. Row-major
+// accumulators (zmm{m} = row m of C) so the hot path (Clear -> AddMatMul ->
+// Store) needs no transpose.
+//
+// boost(100) pushes this kernel above the equally-ManuallyOptimized AVX-512-VNNI
+// and AMX 8x8 candidates in the einsum kernel-selection scorer (which uses
+// `-quality_cost*1000 + boost` per kernel). When more than one dim is symbolic
+// the shape-adaptive `qmmm_i32` picker isn't invoked, so the boost is what
+// causes the optimizer to prefer the 16x16 tile for unknown-shape matmuls.
+#[cfg(tract_amx_int8)]
+MMMExternKernel! { avx512amx_mmm_i32_16x16<i32>(16,16)@(64,4) where(AVX512AMX)
+    packing[1] = i8i8 => |k| k.with_packing(PackedAmxA::new(16), PackedI8K4::new(16));
+    quality(ManuallyOptimized)
+    boost(|| 100)
+    store(i8)
+}
+
+// AMX bf16 16x16 kernel for f32 matmul: uses TDPBF16PS (bf16 x bf16 -> f32).
+// f32 inputs are truncated to bf16 at pack time (round-to-nearest-even, matching
+// Intel VCVTNEPS2BF16). One tdpbf16ps consumes 16M x 16N x 32K bf16 = 8192 fma
+// per instruction. f32 accumulators differ from a pure-f32 reference by ~1/2^8
+// relative per multiply (bf16 = 8 mantissa bits vs f32's 23) -- same precision
+// loss profile as oneDNN "fast-math" f32 matmul on AMX, acceptable for
+// inference workloads (LLMs, CNNs) that already tolerate bf16.
+//
+// Default packing[0] (the framework's PackedFormat<f32>) is retained so the
+// kernel can still be selected for f32 paths even when the BF16 packer
+// isn't a precursor match; packing[1] is the fast bf16-from-f32 path.
+// boost(100) puts this AMX kernel above the AVX-512 f32 / FMA f32 kernels at
+// the same ManuallyOptimized tier so the einsum scorer prefers it whenever
+// supported, mirroring the i32 16x16 behaviour. The bf16 vs f32 precision
+// trade is intentional and amortised over the same call sites that already
+// use bf16-via-`dotbf16ps`-style fast-math elsewhere in the stack.
+#[cfg(tract_amx_bf16)]
+MMMExternKernel! { avx512amx_mmm_f32_16x16<f32>(16,16)@(64,4) where(AVX512AMX_BF16)
+    packing[1] = f32f32_bf16 => |k| k.with_packing(PackedAmxBf16A::new(16), PackedBf16K2::new(16));
+    quality(ManuallyOptimized)
+    boost(|| 100)
+}
+
 pub fn plug(ops: &mut Ops) {
     if is_x86_feature_detected!("avx2") {
         plug_avx2(ops);
+        // AVX-VNNI runs on AVX2-only Atom-class cores (Alder Lake-E, Sierra
+        // Forest, Clearwater Forest / Darkmont). Plug it here so big cores
+        // can overlay AVX-512-VNNI / AMX on top below.
+        #[cfg(tract_avxvnni)]
+        if has_avxvnni() {
+            plug_avxvnni(ops);
+        }
         if is_x86_feature_detected!("fma") {
             plug_fma(ops);
             if is_x86_feature_detected!("avx512f") {
                 plug_avx512f(ops);
+                if is_x86_feature_detected!("avx512vnni") {
+                    plug_avx512vnni(ops);
+                    // AMX int8 preferred over VNNI when both available AND the OS
+                    // has granted XSAVE tile-data permission (see `has_amx_int8`).
+                    #[cfg(tract_amx_int8)]
+                    if has_amx_int8() {
+                        plug_avx512amx_int8(ops);
+                    }
+                }
+                // AMX bf16 for f32 matmul is independent of int8/VNNI gates:
+                // a future Xeon SKU could ship AMX-BF16 without VNNI, and the
+                // permission gate is shared with the int8 path inside has_amx_bf16().
+                #[cfg(tract_amx_bf16)]
+                if has_amx_bf16() {
+                    plug_avx512amx_bf16(ops);
+                }
             }
         }
     }
 }
 
+pub fn plug_avx512vnni(ops: &mut Ops) {
+    ops.mmm_impls.push(avx512vnni_mmm_i32_8x8.mmm());
+    ops.mmm_impls.push(avx512vnni_mmm_i32_16x16.mmm());
+    // Shape-adaptive dispatch mirroring the AMX int8 path: the zmm 16x16 tile is
+    // the throughput champion when each of M and N fills at least one tile; the
+    // 8x8 ymm kernel has lower per-call setup (smaller epilogue, half the
+    // accumulator file) and wins on small problems where the 16x16 tile-padding
+    // overhead dominates. Unknown dims default to the 16x16 champion. (No K gate:
+    // one VPDPBUSD step is only 4 K-bytes, so any K is fine; the choice is about
+    // filling the 16-wide M/N tile.)
+    ops.qmmm_i32 = Box::new(|m, _, n| {
+        let big = |o: Option<usize>, t: usize| o.is_none_or(|v| v >= t);
+        if big(m, 16) && big(n, 16) {
+            avx512vnni_mmm_i32_16x16.mmm()
+        } else {
+            avx512vnni_mmm_i32_8x8.mmm()
+        }
+    });
+    log::info!("qmmm_i32: x86_64/avx512vnni (16x16 + 8x8 adaptive) activated");
+}
+
+#[cfg(tract_avxvnni)]
+pub fn plug_avxvnni(ops: &mut Ops) {
+    ops.mmm_impls.push(avxvnni_mmm_i32_8x8.mmm());
+    // On AVX-VNNI-only cores (no AVX-512) this is the int8 throughput champion;
+    // replace the AVX2 emulation default. On big cores that also have
+    // AVX-512-VNNI, plug_avx512vnni below runs after this and clobbers
+    // qmmm_i32 again with the EVEX kernel.
+    ops.qmmm_i32 = Box::new(|_, _, _| avxvnni_mmm_i32_8x8.mmm());
+    log::info!("qmmm_i32: x86_64/avxvnni (VEX-encoded VPDPBUSD) activated");
+}
+
+#[cfg(tract_amx_bf16)]
+pub fn plug_avx512amx_bf16(ops: &mut Ops) {
+    ops.mmm_impls.push(avx512amx_mmm_f32_16x16.mmm());
+    // Save the previously-installed f32 picker so we can defer to it when
+    // the AMX kernel isn't a good fit (small M/N, or K < 32 -- one TDPBF16PS
+    // consumes 32 bf16 K-lanes so the panel must have at least one full step).
+    let prev: crate::MMMImpl = std::mem::replace(
+        &mut ops.mmm_f32,
+        Box::new(|_, _, _| unreachable!()),
+    );
+    ops.mmm_f32 = Box::new(move |m, k, n| {
+        let big = |o: Option<usize>, t: usize| o.is_none_or(|v| v >= t);
+        // Same dispatch shape as the int8 16x16/8x8 split: hand off to AMX
+        // only when each axis comfortably fills at least one tile. The 32-K
+        // threshold matches PackedAmxBf16A::k_alignment() (one tdpbf16ps =
+        // 32 bf16 K-lanes); below that, the AVX-512 / FMA path's smaller
+        // tiles waste less work.
+        if big(m, 16) && big(n, 16) && big(k, 32) {
+            avx512amx_mmm_f32_16x16.mmm()
+        } else {
+            prev(m, k, n)
+        }
+    });
+    let c = super::amx::cache_sizes();
+    log::info!(
+        "mmm_f32: x86_64/avx512amx_bf16 (16x16) overlay activated; \
+         L1d={} KB, L2={} KB, L3={} KB",
+        c.l1d_bytes / 1024,
+        c.l2_bytes / 1024,
+        c.l3_bytes / 1024,
+    );
+}
+
+#[cfg(tract_amx_int8)]
+pub fn plug_avx512amx_int8(ops: &mut Ops) {
+    ops.mmm_impls.push(avx512amx_mmm_i32_8x8.mmm());
+    ops.mmm_impls.push(avx512amx_mmm_i32_16x16.mmm());
+    // Shape-adaptive dispatch:
+    //   - 16x16 hits the full AMX tile (1024 B/tile, 16384 mul-adds per
+    //     tdpbssd) and is the throughput champion when at least one tile
+    //     of each dim is fully utilised.
+    //   - 8x8 has lower per-call setup cost (1/4 the tile-store scratch,
+    //     half the prefetch budget, smaller epilogue) and beats 16x16 on
+    //     small problems where the framework's tile-padding overhead
+    //     dominates.
+    // The exact crossover should be re-validated on AMX HW; oneDNN uses
+    // similar shape-based MR/NR selection for its BRGEMM ukernel variants.
+    ops.qmmm_i32 = Box::new(|m, k, n| {
+        // m, k, n are Option<usize> -- None means "unknown / streaming dim".
+        // For unknown dims default to the throughput champion (16x16); only
+        // fall back to 8x8 when a static dim is known to be tiny.
+        let big = |o: Option<usize>, t: usize| o.is_none_or(|v| v >= t);
+        if big(m, 16) && big(n, 16) && big(k, 64) {
+            avx512amx_mmm_i32_16x16.mmm()
+        } else {
+            avx512amx_mmm_i32_8x8.mmm()
+        }
+    });
+    let c = super::amx::cache_sizes();
+    log::info!(
+        "qmmm_i32: x86_64/avx512amx_int8 (16x16 + 8x8 adaptive) activated; \
+         L1d={} KB, L2={} KB, L3={} KB",
+        c.l1d_bytes / 1024,
+        c.l2_bytes / 1024,
+        c.l3_bytes / 1024,
+    );
+}
+
 pub fn plug_avx2(ops: &mut Ops) {
     ops.mmm_impls.push(mmm::avx2_mmm_i32_8x8.mmm());
     ops.qmmm_i32 = Box::new(|_, _, _| mmm::avx2_mmm_i32_8x8.mmm());
@@ -79,67 +356,28 @@ pub fn plug_fma(ops: &mut Ops) {
 
     ops.mmv_f32 = Box::new(|_, _| fma_mmm_f32_64x1.mmm());
 
-    ops.mmm_f32 = Box::new(|_, _, n| {
-        if n.is_none() {
-            return fma_mmm_f32_16x6.mmm();
-        }
-
-        let n = n.unwrap();
-
-        match n {
-            1 => unreachable!("should've been mmv"),
-            2 => return fma_mmm_f32_40x2.mmm(),
-            3 => return fma_mmm_f32_32x3.mmm(),
-            4 => return fma_mmm_f32_24x4.mmm(),
-            5 => return fma_mmm_f32_16x5.mmm(),
-            6 => return fma_mmm_f32_16x6.mmm(),
-            8 => return fma_mmm_f32_8x8.mmm(),
-            _ => {}
-        };
-
-        let scaling_baseline = 60.0;
-        let kernel_normalized_perf = [
-            44.0 / scaling_baseline, // 8x8
-            54.0 / scaling_baseline, // 2x6
-            54.0 / scaling_baseline, // 2x5
-            54.0 / scaling_baseline, // 3x4
-            54.0 / scaling_baseline, // 4x3
-            54.0 / scaling_baseline, // 5x2
-        ];
-
-        fn compute_efficiency(n: usize, kernel_width: usize, scale: f32) -> f32 {
-            let kernel_width = kernel_width as f32;
-            let n = n as f32;
-            let batch_count = (n / kernel_width).ceil();
-            let actual_count = batch_count * kernel_width;
-            let multi_batch_penalty = 1.0 - batch_count / 100.0;
-            n / actual_count * scale * multi_batch_penalty
-        }
+    // Hand-tuned for low N; calibration came from past measurements.
+    // For other N, fall back to a generic (M, N)-aware tile-utilisation
+    // picker over the same kernel pool.
+    const FMA_CHOICES: &[KernelChoice] = &[
+        KernelChoice { mr: 8, nr: 8, scale: 44.0 / 60.0, ctor: || fma_mmm_f32_8x8.mmm() },
+        KernelChoice { mr: 16, nr: 6, scale: 54.0 / 60.0, ctor: || fma_mmm_f32_16x6.mmm() },
+        KernelChoice { mr: 16, nr: 5, scale: 54.0 / 60.0, ctor: || fma_mmm_f32_16x5.mmm() },
+        KernelChoice { mr: 24, nr: 4, scale: 54.0 / 60.0, ctor: || fma_mmm_f32_24x4.mmm() },
+        KernelChoice { mr: 32, nr: 3, scale: 54.0 / 60.0, ctor: || fma_mmm_f32_32x3.mmm() },
+        KernelChoice { mr: 40, nr: 2, scale: 54.0 / 60.0, ctor: || fma_mmm_f32_40x2.mmm() },
+    ];
 
-        let efficiencies = [
-            compute_efficiency(n, 8, kernel_normalized_perf[0]),
-            compute_efficiency(n, 6, kernel_normalized_perf[1]),
-            compute_efficiency(n, 5, kernel_normalized_perf[2]),
-            compute_efficiency(n, 4, kernel_normalized_perf[3]),
-            compute_efficiency(n, 3, kernel_normalized_perf[4]),
-            compute_efficiency(n, 2, kernel_normalized_perf[5]),
-        ];
-
-        let best_idx = efficiencies
-            .iter()
-            .copied()
-            .enumerate()
-            .fold((0, 0.0), |max, val| if val.1 > max.1 { val } else { max });
-
-        match best_idx.0 {
-            0 => fma_mmm_f32_8x8.mmm(),
-            1 => fma_mmm_f32_16x6.mmm(),
-            2 => fma_mmm_f32_16x5.mmm(),
-            3 => fma_mmm_f32_24x4.mmm(),
-            4 => fma_mmm_f32_32x3.mmm(),
-            5 => fma_mmm_f32_40x2.mmm(),
-            _ => unreachable!("not a valid index"),
-        }
+    ops.mmm_f32 = Box::new(|m, _, n| match n {
+        None => fma_mmm_f32_16x6.mmm(),
+        Some(1) => unreachable!("should've been mmv"),
+        Some(2) => fma_mmm_f32_40x2.mmm(),
+        Some(3) => fma_mmm_f32_32x3.mmm(),
+        Some(4) => fma_mmm_f32_24x4.mmm(),
+        Some(5) => fma_mmm_f32_16x5.mmm(),
+        Some(6) => fma_mmm_f32_16x6.mmm(),
+        Some(8) => fma_mmm_f32_8x8.mmm(),
+        Some(_) => pick_mmm(FMA_CHOICES, m, n),
     });
     log::info!("mmm_f32, mmv_f32: x86_64/fma activated");
 
@@ -154,19 +392,135 @@ pub fn plug_avx512f(ops: &mut Ops) {
     ops.mmm_impls.push(avx512_mmm_f32_80x2.mmm());
     ops.mmm_impls.push(avx512_mmm_f32_48x4.mmm());
     ops.mmm_impls.push(avx512_mmm_f32_64x3.mmm());
+    ops.mmm_impls.push(avx512_mmm_f32_32x6.mmm());
+    ops.mmm_impls.push(avx512_mmm_f32_32x5.mmm());
     ops.mmm_impls.push(avx512_mmm_f32_16x12.mmm());
+    ops.mmm_impls.push(avx512_mmm_f32_16x8.mmm());
     ops.mmv_f32 = Box::new(|m, _k| match m {
         Some(m) if m < 31 => avx512_mmm_f32_16x1.mmm(),
         _ => avx512_mmm_f32_128x1.mmm(),
     });
 
-    ops.mmm_f32 = Box::new(|m, _, n| match (m, n) {
-        (_, Some(1)) => unreachable!("should've been mmv"),
-        (_, Some(2)) => avx512_mmm_f32_80x2.mmm(),
-        (Some(m), _) if m <= 16 => mmm::avx512_mmm_f32_16x12.mmm(),
-        (_, Some(n)) if n % 4 == 0 && n % 3 != 0 && n < 32 => avx512_mmm_f32_48x4.mmm(),
-        (_, Some(n)) if n < 32 => avx512_mmm_f32_64x3.mmm(),
-        _ => avx512_mmm_f32_16x12.mmm(),
+    // No measured per-kernel scaling on AVX-512 yet; all kernels start
+    // at 1.0 and the picker decides on (M, N) tile waste alone.
+    const AVX512_CHOICES: &[KernelChoice] = &[
+        KernelChoice { mr: 16, nr: 8, scale: 1.0, ctor: || avx512_mmm_f32_16x8.mmm() },
+        KernelChoice { mr: 16, nr: 12, scale: 1.0, ctor: || avx512_mmm_f32_16x12.mmm() },
+        KernelChoice { mr: 32, nr: 5, scale: 1.0, ctor: || avx512_mmm_f32_32x5.mmm() },
+        KernelChoice { mr: 32, nr: 6, scale: 1.0, ctor: || avx512_mmm_f32_32x6.mmm() },
+        KernelChoice { mr: 48, nr: 4, scale: 1.0, ctor: || avx512_mmm_f32_48x4.mmm() },
+        KernelChoice { mr: 64, nr: 3, scale: 1.0, ctor: || avx512_mmm_f32_64x3.mmm() },
+        KernelChoice { mr: 80, nr: 2, scale: 1.0, ctor: || avx512_mmm_f32_80x2.mmm() },
+        KernelChoice { mr: 128, nr: 1, scale: 1.0, ctor: || avx512_mmm_f32_128x1.mmm() },
+    ];
+
+    ops.mmm_f32 = Box::new(|m, _, n| {
+        if let Some(1) = n {
+            unreachable!("should've been mmv");
+        }
+        pick_mmm(AVX512_CHOICES, m, n)
     });
     log::info!("mmm_f32, mmv_f32: x86_64/avx512f activated");
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::frame::mmm::{AsInputValue, FusedSpec};
+    use tract_data::internal::*;
+
+    #[test]
+    fn avx512_128x1_add_unicast_with_strided_c() -> TractResult<()> {
+        if !is_x86_feature_detected!("avx512f") {
+            return Ok(());
+        }
+        let (m, k_each, n) = (1000usize, 256usize, 13usize);
+        let a0: Vec<f32> = (0..m * k_each).map(|i| ((i % 17) as f32 - 8.0) / 16.0).collect();
+        let a1: Vec<f32> = (0..m * k_each).map(|i| ((i % 19) as f32 - 9.0) / 18.0).collect();
+        let b0: Vec<f32> = (0..k_each * n).map(|i| ((i % 13) as f32 - 6.0) / 13.0).collect();
+        let b1: Vec<f32> = (0..k_each * n).map(|i| ((i % 11) as f32 - 5.0) / 10.0).collect();
+
+        let mut expected = vec![0.0f32; m * n];
+        for r in 0..m {
+            for c in 0..n {
+                let mut acc = 0.0f32;
+                for kk in 0..k_each {
+                    acc += a0[r * k_each + kk] * b0[kk * n + c];
+                    acc += a1[r * k_each + kk] * b1[kk * n + c];
+                }
+                expected[r * n + c] = acc;
+            }
+        }
+
+        let ker = avx512_mmm_f32_128x1.mmm();
+        let (pack_a, pack_b) = &ker.packings()[0];
+        let pack_one =
+            |buf: Vec<f32>, rows, cols, m_axis, k_axis, pack: &dyn crate::mmm::MMMInputFormat| {
+                let t =
+                    tract_ndarray::Array2::from_shape_vec((rows, cols), buf).unwrap().into_tensor();
+                pack.prepare_one(&t, k_axis, m_axis).unwrap()
+            };
+        let pa0 = pack_one(a0, m, k_each, 0, 1, &**pack_a);
+        let pa1 = pack_one(a1, m, k_each, 0, 1, &**pack_a);
+        let pb0 = pack_one(b0, k_each, n, 1, 0, &**pack_b);
+        let pb1 = pack_one(b1, k_each, n, 1, 0, &**pack_b);
+
+        // C-buffer layout with row stride > nr*sizeof, matching squeezenet conv10's
+        // (M=1000, spatial=13, N=13) view: M-stride is 169 floats, not nr=1.
+        let spatial = 13usize;
+        let mut c_backing = Tensor::zero::<f32>(&[m, spatial, n])?;
+        let c_spec = unsafe { ker.c_from_data_and_strides(4, (spatial * n) as isize, 1) };
+
+        unsafe {
+            let c_view = c_backing.view_mut();
+            let c = c_spec.wrap(&c_view);
+            let ops: TVec<FusedSpec> = tvec!(
+                FusedSpec::AddMatMul {
+                    a: AsInputValue::Borrowed(&*pa0),
+                    b: AsInputValue::Borrowed(&*pb0),
+                    packing: 0,
+                },
+                FusedSpec::Store(c),
+            );
+            ker.run(m, n, &ops)?;
+        }
+
+        unsafe {
+            let c_view = c_backing.view_mut();
+            let c_for_unicast = c_spec.wrap(&c_view);
+            let c_for_store = c_spec.wrap(&c_view);
+            let ops: TVec<FusedSpec> = tvec!(
+                FusedSpec::AddMatMul {
+                    a: AsInputValue::Borrowed(&*pa1),
+                    b: AsInputValue::Borrowed(&*pb1),
+                    packing: 0,
+                },
+                FusedSpec::AddUnicast(c_for_unicast),
+                FusedSpec::Store(c_for_store),
+            );
+            ker.run(m, n, &ops)?;
+        }
+
+        let c_slice = c_backing.to_plain_array_view::<f32>()?;
+        let mut max_err = 0.0f32;
+        let mut wrong_cells = 0;
+        for r in 0..m {
+            for cc in 0..n {
+                let got = c_slice[[r, 0, cc]];
+                let exp = expected[r * n + cc];
+                let e = (got - exp).abs();
+                if e > 1e-3 {
+                    wrong_cells += 1;
+                }
+                max_err = max_err.max(e);
+            }
+        }
+        assert!(
+            max_err < 1e-3,
+            "avx512_mmm_f32_128x1 wrong output at squeezenet shape: \
+             max_err={max_err}, {wrong_cells}/{} cells off",
+            m * n,
+        );
+        Ok(())
+    }
+}
diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_128x1.S.j2 b/linalg/x86_64/avx512/avx512_mmm_f32_128x1.S.j2
index 9c11793c24..26ff030348 100644
--- a/linalg/x86_64/avx512/avx512_mmm_f32_128x1.S.j2
+++ b/linalg/x86_64/avx512/avx512_mmm_f32_128x1.S.j2
@@ -54,12 +54,53 @@ Windows ABI:
     mov     r10,    [rdi + 8]           // c ptr
     mov     rsi,    [rdi + 16]          // row stride
 
+    cmp     rsi, 4
+    jne     {{L}}add_unicast_generic
+
     {% for row in range(0, 8) %}
         vaddps zmm{{row}}, zmm{{row}}, [ r10 + {{ row * 64 }} ]
     {% endfor %}
 
     jmp    {{L}}non_linear_loop
 
+{{L}}add_unicast_generic:
+    mov     eax,    0
+    {% for i in range(0, 4) %}
+        pinsrd  xmm14, eax, {{i}}
+        add     eax,    esi
+    {% endfor %}
+    {% for i in range(0, 4) %}
+        pinsrd  xmm15, eax, {{i}}
+        add     eax,    esi
+    {% endfor %}
+    {% for i in range(0, 4) %}
+        pinsrd  xmm12, eax, {{i}}
+        add     eax,    esi
+    {% endfor %}
+    {% for i in range(0, 4) %}
+        pinsrd  xmm13, eax, {{i}}
+        add     eax,    esi
+    {% endfor %}
+    vperm2f128      ymm14, ymm14, ymm15, 32
+    vperm2f128      ymm13, ymm12, ymm13, 32
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+    kxnorw          k1, k1, k1
+    vgatherdps      zmm12{k1}, [r10 + zmm14]
+    vaddps          zmm0, zmm0, zmm12
+
+    imul    esi,    16
+    vpbroadcastd    zmm15, esi
+
+    {% for j in range(1, 8) %}
+        vpaddd      zmm14, zmm14, zmm15
+        kxnorw      k1, k1, k1
+        vgatherdps  zmm12{k1}, [r10 + zmm14]
+        vaddps      zmm{{j}}, zmm{{j}}, zmm12
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
 {{L}}add_row_col_products:
     mov             rax, [ rdi + 8 ]
     mov             rbx, [ rdi + 16 ]
diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_16x1.S.j2 b/linalg/x86_64/avx512/avx512_mmm_f32_16x1.S.j2
index 5e8daf2639..48987c3462 100644
--- a/linalg/x86_64/avx512/avx512_mmm_f32_16x1.S.j2
+++ b/linalg/x86_64/avx512/avx512_mmm_f32_16x1.S.j2
@@ -78,27 +78,31 @@ Windows ABI:
     jmp    {{L}}non_linear_loop
 
 {{L}}add_unicast_generic:
-    mov r8, [0]
-//     mov     eax,    0
-// {% for i in range(0, 4) %}
-//     pinsrd  xmm14, eax, {{i}}
-//     add     eax,    esi
-// {% endfor %}
-// {% for i in range(0, 4) %}
-//     pinsrd  xmm15, eax, {{i}}
-//     add     eax,    esi
-// {% endfor %}
-//
-//     vperm2f128      zmm14,  zmm14, zmm15,         32 // zmm14 <- xmm14::xmm15
-//
-// {% for i in range(0, 8) %}
-//     vpcmpeqd        zmm15,  zmm15, zmm15
-//     vgatherdps      zmm12,  [ r10 + zmm14 ], zmm15
-//
-//     vaddps          zmm{{i}},   zmm{{i}},   zmm12
-//     lea             r10, [ r10 + rsi * 8 ]
-// {% endfor %}
-//
+    mov     eax,    0
+    {% for i in range(0, 4) %}
+        pinsrd  xmm14, eax, {{i}}
+        add     eax,    esi
+    {% endfor %}
+    {% for i in range(0, 4) %}
+        pinsrd  xmm15, eax, {{i}}
+        add     eax,    esi
+    {% endfor %}
+    {% for i in range(0, 4) %}
+        pinsrd  xmm12, eax, {{i}}
+        add     eax,    esi
+    {% endfor %}
+    {% for i in range(0, 4) %}
+        pinsrd  xmm13, eax, {{i}}
+        add     eax,    esi
+    {% endfor %}
+    vperm2f128      ymm14, ymm14, ymm15, 32
+    vperm2f128      ymm13, ymm12, ymm13, 32
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+    kxnorw          k1, k1, k1
+    vgatherdps      zmm12{k1}, [r10 + zmm14]
+    vaddps          zmm0, zmm0, zmm12
+
     jmp    {{L}}non_linear_loop
 
 {{L}}add_row_col_products:
diff --git a/linalg/x86_64/avx512amx/dummy.S b/linalg/x86_64/avx512amx/dummy.S
new file mode 100644
index 0000000000..544e9c749f
--- /dev/null
+++ b/linalg/x86_64/avx512amx/dummy.S
@@ -0,0 +1,29 @@
+// Build-time capability probe for the assembler, used by build.rs
+// (assembler_supports_amx_int8). Older binutils — notably the Debian stretch
+// x86_64 cross-toolchain in CI — predate AMX and cannot assemble these
+// mnemonics. If this file fails to assemble, build.rs skips the AMX kernels
+// and the `tract_amx_int8` cfg, and the runtime falls back to VNNI (or AVX2)
+// for `qmmm_i32`. Not linked into anything.
+.intel_syntax noprefix
+.text
+.globl tract_amx_int8_probe
+tract_amx_int8_probe:
+    push rbp
+    mov rbp, rsp
+    sub rsp, 64                  // room for the tilecfg block
+    mov qword ptr [rsp], 0
+    mov qword ptr [rsp+8], 0
+    mov qword ptr [rsp+16], 0
+    mov qword ptr [rsp+24], 0
+    mov qword ptr [rsp+32], 0
+    mov qword ptr [rsp+40], 0
+    mov qword ptr [rsp+48], 0
+    mov qword ptr [rsp+56], 0
+    mov byte ptr [rsp], 1        // palette = 1
+    ldtilecfg [rsp]
+    tilezero tmm0
+    tdpbusd  tmm0, tmm1, tmm2
+    tilerelease
+    mov rsp, rbp
+    pop rbp
+    ret
diff --git a/linalg/x86_64/avx512amx/dummy_avxvnni.S b/linalg/x86_64/avx512amx/dummy_avxvnni.S
new file mode 100644
index 0000000000..0579b2ed84
--- /dev/null
+++ b/linalg/x86_64/avx512amx/dummy_avxvnni.S
@@ -0,0 +1,16 @@
+// Build-time capability probe for the assembler, used by build.rs
+// (assembler_supports_avxvnni). Checks that the assembler accepts the
+// `{vex}` prefix on VPDPBUSD, which forces the AVX-VNNI (VEX-encoded)
+// form instead of the AVX-512-VNNI (EVEX-encoded) form gas defaults to.
+// Requires binutils >= 2.36 (which added `{vex}`/`{evex}` prefixes for
+// explicit encoding selection). When the probe fails the AVX-VNNI kernel
+// is skipped and dispatch falls back to AVX2 emulation on AVX-VNNI-only
+// hardware (Clearwater Forest / Sierra Forest / Alder Lake E-cores).
+// Not linked into anything.
+.intel_syntax noprefix
+.text
+.globl tract_avxvnni_probe
+tract_avxvnni_probe:
+    // AVX-VNNI: u8 x s8 -> i32 dot4 (VEX-encoded)
+    {vex} vpdpbusd ymm0, ymm1, ymm2
+    ret
diff --git a/linalg/x86_64/avx512amx/dummy_bf16.S b/linalg/x86_64/avx512amx/dummy_bf16.S
new file mode 100644
index 0000000000..03f5a8d7a4
--- /dev/null
+++ b/linalg/x86_64/avx512amx/dummy_bf16.S
@@ -0,0 +1,28 @@
+// Build-time capability probe for the assembler, used by build.rs
+// (assembler_supports_amx_bf16). Checks that the assembler accepts the
+// TDPBF16PS mnemonic (AMX bf16 dot-product). Same binutils version requirement
+// as AMX int8 (>= 2.34); provided as a separate probe so the two cfgs can be
+// set independently if needed. Not linked into anything.
+.intel_syntax noprefix
+.text
+.globl tract_amx_bf16_probe
+tract_amx_bf16_probe:
+    push rbp
+    mov rbp, rsp
+    sub rsp, 64
+    mov qword ptr [rsp], 0
+    mov qword ptr [rsp+8], 0
+    mov qword ptr [rsp+16], 0
+    mov qword ptr [rsp+24], 0
+    mov qword ptr [rsp+32], 0
+    mov qword ptr [rsp+40], 0
+    mov qword ptr [rsp+48], 0
+    mov qword ptr [rsp+56], 0
+    mov byte ptr [rsp], 1        // palette = 1
+    ldtilecfg [rsp]
+    tilezero tmm0
+    tdpbf16ps tmm0, tmm1, tmm2   // AMX bf16: the instruction this probe checks
+    tilerelease
+    mov rsp, rbp
+    pop rbp
+    ret
diff --git a/linalg/x86_64/fma/avx512amx_mmm_f32_16x16.S.j2 b/linalg/x86_64/fma/avx512amx_mmm_f32_16x16.S.j2
new file mode 100644
index 0000000000..654cc660cd
--- /dev/null
+++ b/linalg/x86_64/fma/avx512amx_mmm_f32_16x16.S.j2
@@ -0,0 +1,514 @@
+// vim: set syntax=asm :
+//
+// Intel AMX bf16 GEMM kernel, 16 M-rows x 16 N-cols f32 accumulator output.
+//
+// One `tdpbf16ps tmm0, tmm1, tmm2` instruction performs:
+//   tmm0[m, n] += sum_{k=0..31} A[m, k] * B[k, n]   (multiplies in bf16,
+//                                                    accumulates in f32)
+// for m=0..15, n=0..15: 16 * 16 * 32 = 8192 fma per single instruction --
+// the same throughput as TDPBSSD on the same hardware. Accelerates f32
+// matmul on Sapphire Rapids+ at the cost of bf16 input truncation
+// (~1/256 relative error per multiply; sqrt(K) compounded by FMA chain).
+//
+// Tile geometry (palette 1, the maximum-bytes AMX tile shape):
+//   tmm0 = C accumulator: 16 rows x 64 colsb = 16 M-rows x 16 N-cols of f32
+//   tmm1 = A tile:        16 rows x 64 colsb = 16 M-rows x 32 K-bf16 / iter
+//   tmm2 = B tile:        16 rows x 64 colsb = 16 K-pair-rows x 16 N x 2 bf16
+//
+// A is packed via PackedAmxBf16A(16): per panel of 16 M-rows, row-major
+// within the panel, K-bf16 contiguous along the row, K_padded =
+// ceil(K/32)*32 bf16. Source f32 is truncated to bf16 at pack time using
+// round-to-nearest-even (matches VCVTNEPS2BF16 semantics).
+//
+// B is packed via PackedBf16K2(16): per K=2 block, 16 N-cols x 2 K-bf16 =
+// 64 bytes; 16 K-blocks per tmm2 tile. Source f32 -> bf16 same as A.
+//
+// REGISTER LAYOUT (mirrors the i32 16x16 sibling):
+//   zmm0..zmm15 = accumulators, ROW-MAJOR: zmm{m} = row m of C as 16 f32
+//                 lanes [C[m, 0], C[m, 1], ..., C[m, 15]].
+
+{% if msvc %}
+
+_text segment
+avx512amx_mmm_f32_16x16_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avx512amx_mmm_f32_16x16_{{suffix}}
+{{G}}avx512amx_mmm_f32_16x16_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+    // Reserve 64 bytes for the AMX tile-config block, zero it, populate
+    // palette + dims (all three tiles are 16 rows x 64 colsb). Same shape
+    // as the i32 16x16 sibling.
+    sub         rsp, 64
+    vpxor       xmm15, xmm15, xmm15
+    vmovdqu     [rsp     ], xmm15
+    vmovdqu     [rsp + 16], xmm15
+    vmovdqu     [rsp + 32], xmm15
+    vmovdqu     [rsp + 48], xmm15
+    mov         byte ptr  [rsp + 0 ], 1    // palette = 1
+    mov         word ptr  [rsp + 16], 64   // colsb[0] = 64 (tmm0)
+    mov         word ptr  [rsp + 18], 64   // colsb[1] = 64 (tmm1)
+    mov         word ptr  [rsp + 20], 64   // colsb[2] = 64 (tmm2)
+    mov         byte ptr  [rsp + 48], 16   // rows[0]  = 16 (tmm0)
+    mov         byte ptr  [rsp + 49], 16   // rows[1]  = 16 (tmm1)
+    mov         byte ptr  [rsp + 50], 16   // rows[2]  = 16 (tmm2)
+    ldtilecfg   [rsp]
+
+{% include "dispatcher.j2" %}
+
+{{L}}clear:
+    {% for r in range(0, 16) %}
+    vpxorq      zmm{{r}}, zmm{{r}}, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     r12,    [rdi + 32]   // packing
+    mov     rbx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rcx,    [rdi + 8]    // k
+    test    rcx,    rcx
+    jz      {{L}}non_linear_loop
+
+    cmp     r12, 1
+    je      {{L}}main_loop_packed_packed_bf16
+
+{{L}}main_loop_packed_packed:
+    // Generic f32 x f32 fallback path (non-AMX). For row-major
+    // accumulators zmm{m} = row m of C, accumulating C[m, n] += A[m, k] *
+    // B[k, n]: load 16 B values for this K row into zmm16, then for each
+    // m broadcast A[m, k] and FMA add to zmm{m}.
+    vmovups         zmm16, [rbx]               // 16 f32 of B at this K row
+
+    {% for m in range(0, 16) %}
+        vbroadcastss    zmm17, dword ptr [rax + {{m}} * 4]
+        vfmadd231ps     zmm{{m}}, zmm16, zmm17
+    {% endfor %}
+
+    add             rax,    64                 // 16 f32 lanes per K step
+    add             rbx,    64
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed_bf16:
+    // AMX bf16 layout:
+    //   A panel: 16 M-rows x K_padded bf16 ROW-major within the panel
+    //            (PackedAmxBf16A, K_padded = ceil(K/32)*32 bf16 =
+    //            ceil(K/32)*64 bytes per row).
+    //   B panel: PackedBf16K2(16) -- 16 N-cols x 2 K-bf16 per K=2 block,
+    //            with 16 K-blocks per tdpbf16ps iter (16 K-pair-rows x
+    //            64 colsb).
+    //
+    // Per tdpbf16ps: tmm0[m, n] += sum_{k=0..31} A[m, k] * B[k, n]
+    // with multiplies in bf16 and accumulation in f32. Inner loop steps
+    // along K in 32-bf16 chunks (= 64 bytes per A row).
+
+    // r8 <- K_padded_in_bytes = ceil(k/32)*64 = byte-stride between A's
+    // M-rows. (Each bf16 is 2 bytes, so K_padded_bf16 * 2.)
+    mov             r8,     rcx
+    add             r8,     31
+    and             r8,     -32
+    shl             r8,     1                  // *2 (bf16 = 2 bytes)
+
+    // rcx <- ceil(k/32) = number of K=32 AMX inner iterations.
+    add             rcx,    31
+    shr             rcx,    5
+
+    // r9 <- 64 = byte-stride between B's K-pair-rows (16 N-cols * 4 bytes
+    // per K-pair = 16 * 4 = 64).
+    mov             r9,     64
+
+    tilezero        tmm0
+
+{{L}}loop_32k_amx_bf16_16x16:
+    // oneDNN-aligned cache strategy (same as the i32 sibling):
+    //   A -> cached (tileloadd + prefetcht0 to L1), reused across N-tiles.
+    //   B -> non-temporal (tileloaddt1 + prefetcht1 to L2), streams once.
+    // Each iter advances A by 64 bytes and B by 1024 bytes; we prime the
+    // first 6 of next-iter's 16 B cache lines and let the SPR HW stream
+    // prefetcher cover the remaining 10.
+    prefetcht0      [rax + 64]
+    prefetcht1      [rbx + 1024]
+    prefetcht1      [rbx + 1088]
+    prefetcht1      [rbx + 1152]
+    prefetcht1      [rbx + 1216]
+    prefetcht1      [rbx + 1280]
+    prefetcht1      [rbx + 1344]
+    tileloadd       tmm1,   [rax + r8 * 1]     // A tile (cached): stride = K_padded_bytes
+    tileloaddt1     tmm2,   [rbx + r9 * 1]     // B tile (non-temporal): stride = 64
+    tdpbf16ps       tmm0,   tmm1,   tmm2
+    add             rax,    64                 // +32 bf16 in A row 0
+    add             rbx,    1024               // 16 K-pairs * 64 = 1024 B
+    dec             rcx
+    jnz             {{L}}loop_32k_amx_bf16_16x16
+
+    // tmm0 -> stack scratch (16 rows x 64 bytes = 1024 B row-major f32).
+    // Each row's 16 f32 are contiguous, so one 64-byte load per row.
+    sub             rsp,    1024
+    mov             r10,    rsp
+    mov             r11,    64
+    tilestored      [r10 + r11 * 1], tmm0
+
+    {% for m in range(0, 16) %}
+    vmovups         zmm{{m}}, [r10 + {{ m * 64 }}]
+    {% endfor %}
+
+    add             rsp,    1024
+
+    jmp             {{L}}non_linear_loop
+
+// ---- Scalar / per-row / per-col f32 epilogues ----------------------------
+
+{{L}}scalar_min:
+    vbroadcastss    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vminps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_max:
+    vbroadcastss    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vmaxps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_add:
+    vbroadcastss    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vaddps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_mul:
+    vbroadcastss    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vmulps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_sub:
+    // non-flipped sub = operand - acc (matches fma_mmm_ymm_ops.j2 scalar macro)
+    vbroadcastss    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vsubps zmm{{r}}, zmm16, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_sub_flipped:
+    // flipped sub = acc - operand
+    vbroadcastss    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vsubps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}leaky_relu:
+    // C[m, n] = (C[m, n] >= 0) ? C[m, n] : alpha * C[m, n]
+    vbroadcastss    zmm17, dword ptr [rdi + 8]      // alpha
+    vpxorq          zmm16, zmm16, zmm16             // 0.0
+    {% for r in range(0, 16) %}
+    vmulps          zmm18, zmm{{r}}, zmm17          // alpha * x
+    vcmpps          k1, zmm{{r}}, zmm16, 1          // imm 1 = LT (signed): 1 where x < 0
+    vblendmps       zmm{{r}}{k1}, zmm{{r}}, zmm18
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_min:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vbroadcastss    zmm16, dword ptr [rax + {{m * 4}}]
+    vminps          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_max:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vbroadcastss    zmm16, dword ptr [rax + {{m * 4}}]
+    vmaxps          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_add:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vbroadcastss    zmm16, dword ptr [rax + {{m * 4}}]
+    vaddps          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_mul:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vbroadcastss    zmm16, dword ptr [rax + {{m * 4}}]
+    vmulps          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_sub:
+    // non-flipped sub = operand - acc
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vbroadcastss    zmm16, dword ptr [rax + {{m * 4}}]
+    vsubps          zmm{{m}}, zmm16, zmm{{m}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_sub_flipped:
+    // flipped sub = acc - operand
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vbroadcastss    zmm16, dword ptr [rax + {{m * 4}}]
+    vsubps          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_min:
+    mov     rax, [rdi + 8]
+    vmovups         zmm16, [rax]
+    {% for r in range(0, 16) %}vminps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_max:
+    mov     rax, [rdi + 8]
+    vmovups         zmm16, [rax]
+    {% for r in range(0, 16) %}vmaxps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_add:
+    mov     rax, [rdi + 8]
+    vmovups         zmm16, [rax]
+    {% for r in range(0, 16) %}vaddps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_mul:
+    mov     rax, [rdi + 8]
+    vmovups         zmm16, [rax]
+    {% for r in range(0, 16) %}vmulps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_sub:
+    // non-flipped sub = operand - acc
+    mov     rax, [rdi + 8]
+    vmovups         zmm16, [rax]
+    {% for r in range(0, 16) %}vsubps zmm{{r}}, zmm16, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_sub_flipped:
+    // flipped sub = acc - operand
+    mov     rax, [rdi + 8]
+    vmovups         zmm16, [rax]
+    {% for r in range(0, 16) %}vsubps zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}load_tile:
+    // Scratch layout is COL-MAJOR f32 (col_byte_stride = item_size * MR =
+    // 4 * 16 = 64): tile[col][row] at offset col*64 + row*4. Gather row m's
+    // 16 cols at index step 64.
+    mov         r8, [rdi + 8]
+    vmovdqa32   zmm16, [rip + {{L}}lane_offsets_64]
+    {% for m in range(0, 16) %}
+    mov         eax, 0xFFFF
+    kmovw       k1, eax
+    vpgatherdd  zmm{{m}}{k1}, [r8 + zmm16 + {{m * 4}}]
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_unicast:
+    mov     r10,    [rdi + 8]           // c ptr (base)
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+    mov     r8,     [rdi + 32]          // item size (4 for f32)
+
+    cmp     r8, 4
+    jne     {{L}}unsupported            // f32 kernel: only item_size=4
+
+    // i32-strided gather (f32 same bit-width: vpgatherdd is correct).
+    mov             eax, ebx
+    vmovd           xmm16, eax
+    vpbroadcastd    zmm16, xmm16
+    vpmulld         zmm16, zmm16, [rip + {{L}}lane_indices]
+
+    {% for m in range(0, 16) %}
+    mov             eax, 0xFFFF
+    kmovw           k1, eax
+    vpgatherdd      zmm17{k1}, [r10 + zmm16]
+    vaddps          zmm{{m}}, zmm{{m}}, zmm17
+    add             r10, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    // bias[m, n] = row_data[m] * col_data[n], FMA-add to C[m, n].
+    mov             rax, [rdi + 8]
+    mov             rbx, [rdi + 16]
+
+    vmovups         zmm16, [rbx]                          // 16 col_data values
+
+    {% for m in range(0, 16) %}
+    vbroadcastss    zmm17, dword ptr [rax + {{m * 4}}]    // splat row_data[m]
+    vfmadd231ps     zmm{{m}}, zmm17, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+// ---- q_scale / q_shr / q_shl: not meaningful for f32, stub to unsupported.
+{{L}}q_scale:
+{{L}}q_shl:
+{{L}}q_shr:
+    jmp     {{L}}unsupported
+
+// ---- Store ---------------------------------------------------------------
+
+{{L}}store:
+    mov     r8,     [rdi + 8]               // c ptr
+    mov     rsi,    [rdi + 16]              // row stride
+    mov     rdx,    [rdi + 24]              // col stride
+    mov     rcx,    [rdi + 32]              // item size
+
+    cmp     rcx, 4
+    jne     {{L}}unsupported                // f32 kernel: only item_size=4
+
+    cmp     rdx, 4
+    je      {{L}}store_strides_f32_row_contig
+
+    // Generic f32 strided store
+    {% for m in range(0, 16) %}
+    mov     r10, r8
+    vextracti32x4   xmm20, zmm{{m}}, 0
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 1
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 2
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 3
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    add     r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_f32_row_contig:
+    // C is row-major in memory: each row's 16 f32 are contiguous; one
+    // 64-byte vmovups per row.
+    {% for m in range(0, 16) %}
+    vmovups     [r8], zmm{{m}}
+    add         r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}return:
+    tilerelease
+    add         rsp, 64
+
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+// ---- Read-only data (RIP-relative) ---------------------------------------
+
+.p2align 6
+{{L}}lane_offsets_64:
+    .int  0,  64, 128, 192, 256, 320, 384, 448
+    .int  512, 576, 640, 704, 768, 832, 896, 960
+
+.p2align 6
+{{L}}lane_indices:
+    .int  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+{% if msvc %}
+avx512amx_mmm_f32_16x16_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/linalg/x86_64/fma/avx512amx_mmm_i32_16x16.S.j2 b/linalg/x86_64/fma/avx512amx_mmm_i32_16x16.S.j2
new file mode 100644
index 0000000000..5c97cb7c6a
--- /dev/null
+++ b/linalg/x86_64/fma/avx512amx_mmm_i32_16x16.S.j2
@@ -0,0 +1,932 @@
+// vim: set syntax=asm :
+//
+// Intel AMX int8 GEMM kernel, 16 M-rows x 16 N-cols i32 accumulator output.
+//
+// One `tdpbssd tmm0, tmm1, tmm2` instruction performs:
+//   tmm0[m, n] += sum_{k=0..63} A[m, k] * B[k, n]
+// for m=0..15, n=0..15: 16 * 16 * 64 = 16384 mul-adds per single instruction.
+// That's 4x the work-per-instruction of the 8x8 sibling kernel, hitting the
+// full AMX i8 tile geometry (max colsb=64, max rows=16, max bytes=1024).
+//
+// Tile geometry (palette 1):
+//   tmm0 = C accumulator: 16 rows x 64 colsb = 16 M-rows x 16 N-cols of i32
+//   tmm1 = A tile:        16 rows x 64 colsb = 16 M-rows x 64 K-bytes per iter
+//   tmm2 = B tile:        16 rows x 64 colsb = 16 K-pair-rows x 16 N-cols * 4 K
+//
+// `tdpbssd` is signed-signed, so no +128 trick is needed; the i32 accumulators
+// are bit-identical to the AVX2 / VNNI / 8x8-AMX reference paths.
+//
+// A is packed via PackedAmxA(16): per panel of 16 M-rows, row-major within the
+// panel, K-bytes contiguous along the row, K_padded = ceil(K/64)*64.
+// B reuses PackedI8K4(16): per K=4 block, 16 N-cols * 4 K-bytes = 64 bytes;
+// 16 such K-blocks per tmm2 tile = 1024 bytes = one tileloadd.
+//
+// REGISTER LAYOUT
+//   zmm0..zmm15 = accumulators, ROW-MAJOR: zmm{m} holds the 16 i32 lanes
+//                 [C[m, 0], C[m, 1], ..., C[m, 15]] for row m.
+// This matches the row-major i32 layout that `tilestored` writes directly,
+// so the hot path (Clear -> AddMatMul -> Store) needs no transpose.
+
+{% if msvc %}
+
+_text segment
+avx512amx_mmm_i32_16x16_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avx512amx_mmm_i32_16x16_{{suffix}}
+{{G}}avx512amx_mmm_i32_16x16_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+    // Reserve 64 bytes for the AMX tile-config block, zero it, populate
+    // palette + dims (all three tiles are 16 rows x 64 colsb, the maximum
+    // i8 tile geometry on Sapphire Rapids / Emerald Rapids / Granite Rapids).
+    sub         rsp, 64
+    vpxor       xmm15, xmm15, xmm15
+    vmovdqu     [rsp     ], xmm15
+    vmovdqu     [rsp + 16], xmm15
+    vmovdqu     [rsp + 32], xmm15
+    vmovdqu     [rsp + 48], xmm15
+    mov         byte ptr  [rsp + 0 ], 1    // palette = 1
+    mov         word ptr  [rsp + 16], 64   // colsb[0] = 64 (tmm0)
+    mov         word ptr  [rsp + 18], 64   // colsb[1] = 64 (tmm1)
+    mov         word ptr  [rsp + 20], 64   // colsb[2] = 64 (tmm2)
+    mov         byte ptr  [rsp + 48], 16   // rows[0]  = 16 (tmm0)
+    mov         byte ptr  [rsp + 49], 16   // rows[1]  = 16 (tmm1)
+    mov         byte ptr  [rsp + 50], 16   // rows[2]  = 16 (tmm2)
+    ldtilecfg   [rsp]
+
+{% include "dispatcher.j2" %}
+
+{{L}}clear:
+    // vzeroall only zeros lower-256 of zmm0..15; explicitly zero the full
+    // accumulators (zmm0..zmm15) for AMX.
+    {% for r in range(0, 16) %}
+    vpxorq      zmm{{r}}, zmm{{r}}, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     r12,    [rdi + 32]   // packing
+    mov     rbx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rcx,    [rdi + 8]    // k
+    test    rcx,    rcx
+    jz      {{L}}non_linear_loop
+
+    cmp     r12, 1
+    je      {{L}}main_loop_packed_packed_i8i8
+
+{{L}}main_loop_packed_packed:
+    // Generic i32 x i32 fallback path (not AMX). For row-major accumulators
+    // with zmm{m} = row m of C, accumulating C[m, n] += A[m, k] * B[k, n]:
+    //   - load 16 B values for this K row into zmm16 (row of B)
+    //   - for each m: broadcast A[m, k], multiply by zmm16, add to zmm{m}
+    vmovups         zmm16, [rbx]               // 16 i32 of B at this K row
+
+    {% for m in range(0, 16) %}
+        vpbroadcastd    zmm17, dword ptr [rax + {{m}} * 4]
+        vpmulld         zmm18, zmm16, zmm17
+        vpaddd          zmm{{m}}, zmm{{m}}, zmm18
+    {% endfor %}
+
+    add             rax,    64                 // 16 i32 lanes per K step
+    add             rbx,    64
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed_i8i8:
+    // AMX i8 layout:
+    //   A panel: 16 M-rows x K_padded K-bytes ROW-major within the panel
+    //            (PackedAmxA, K_padded = ceil(K/64)*64).
+    //   B panel: PackedI8K4(16) -- 16 N-cols x 4 K-bytes per K=4 block, with
+    //            16 K-blocks per tileloadd (16 K-pair-rows x 64 colsb).
+    //
+    // Per tdpbssd: tmm0[m, n] += sum_{k=0..63} A[m, k] * B[k, n].
+    // Inner loop steps along K in 64-K chunks.
+
+    // r8 <- K_padded = ceil(k/64) * 64 = byte-stride between A's M-rows.
+    mov             r8,     rcx
+    add             r8,     63
+    and             r8,     -64
+
+    // rcx <- ceil(k/64) = number of K=64 AMX inner iterations.
+    add             rcx,    63
+    shr             rcx,    6
+
+    // r9 <- 64 = byte-stride between B's K-pair-rows (each row = 16 N-cols * 4 K).
+    mov             r9,     64
+
+    tilezero        tmm0
+
+{{L}}loop_64k_amx_i8i8_16x16:
+    // Cache strategy follows oneDNN's AMX BRGEMM heuristics (Intel-backed):
+    //   - A is reused across N-tiles in tract's outer matmul loop, so we use
+    //     `tileloadd` (cached, brings into L1) and `prefetcht0` (to L1) for A.
+    //   - B streams through once per kernel call (one B-panel per N-tile), and
+    //     for the AMX-typical large-matmul case the B working set exceeds the
+    //     32 KB L1d on Sapphire Rapids+. We use `tileloaddt1` (non-temporal,
+    //     bypasses L1) and `prefetcht1` (to L2) for B -- the same pattern
+    //     oneDNN picks when its footprint heuristic crosses the L1 threshold.
+    //   - Sapphire Rapids has 16 L1d Fill Buffers (LFBs); each in-flight
+    //     prefetch/load consumes one. The previous version's 17 prefetches +
+    //     2 active tileloadds overflowed the LFB budget. The reduced count
+    //     below leaves headroom and lets the HW streaming prefetcher cover
+    //     the remaining B-panel lines.
+    //
+    // A advances 64 B / iter (one cache line). B advances 1024 B / iter
+    // (16 cache lines). We prime 6 of the next 16 B-lines at +1024..+1344,
+    // then trust the HW stream prefetcher (very aggressive on SPR/EMR/GNR)
+    // to cover lines +1408..+1984.
+    prefetcht0      [rax + 64]                 // next A-row K-block (to L1)
+    prefetcht1      [rbx + 1024]               // next B-panel head (to L2)
+    prefetcht1      [rbx + 1088]
+    prefetcht1      [rbx + 1152]
+    prefetcht1      [rbx + 1216]
+    prefetcht1      [rbx + 1280]
+    prefetcht1      [rbx + 1344]
+    tileloadd       tmm1,   [rax + r8 * 1]     // A tile (cached): stride = K_padded
+    tileloaddt1     tmm2,   [rbx + r9 * 1]     // B tile (non-temporal): stride = 64
+    tdpbssd         tmm0,   tmm1,   tmm2
+    add             rax,    64                 // +64 K-bytes in A row 0
+    add             rbx,    1024               // 16 K-pairs * 64 = 1024 bytes
+    dec             rcx
+    jnz             {{L}}loop_64k_amx_i8i8_16x16
+
+    // tmm0 -> stack scratch (16 rows x 64 bytes = 1024 B row-major i32).
+    // Then load each row into zmm0..zmm15.  Row m's 16 i32 are contiguous
+    // in memory, so each load is a single 64-byte vmovdqu32.
+    sub             rsp,    1024
+    mov             r10,    rsp
+    mov             r11,    64
+    tilestored      [r10 + r11 * 1], tmm0
+
+    {% for m in range(0, 16) %}
+    vmovdqu32       zmm{{m}}, [r10 + {{ m * 64 }}]
+    {% endfor %}
+
+    add             rsp,    1024
+
+    jmp             {{L}}non_linear_loop
+
+// ---- Scalar / per-row / per-col elementwise epilogues -------------------
+
+{{L}}scalar_min:
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpminsd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_max:
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpmaxsd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_add:
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpaddd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_mul:
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpmulld zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_sub:
+    // non-flipped sub = operand - acc (matches fma_mmm_ymm_ops.j2 scalar macro)
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpsubd zmm{{r}}, zmm16, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_sub_flipped:
+    // flipped sub = acc - operand
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpsubd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}leaky_relu:
+    // C[m, n] = (C[m, n] >= 0) ? C[m, n] : alpha * C[m, n]
+    vpbroadcastd    zmm17, dword ptr [rdi + 8]    // alpha as i32 scale factor
+    vpxorq          zmm16, zmm16, zmm16
+    {% for r in range(0, 16) %}
+    vpmulld         zmm18, zmm{{r}}, zmm17
+    vpcmpgtd        k1,    zmm16, zmm{{r}}        // 1 where C < 0
+    vpblendmd       zmm{{r}}{k1}, zmm{{r}}, zmm18
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_min:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpminsd         zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_max:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpmaxsd         zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_add:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpaddd          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_mul:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpmulld         zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_sub:
+    // non-flipped sub = operand - acc
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpsubd          zmm{{m}}, zmm16, zmm{{m}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_sub_flipped:
+    // flipped sub = acc - operand
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpsubd          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_min:
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpminsd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_max:
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpmaxsd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_add:
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpaddd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_mul:
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpmulld zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_sub:
+    // non-flipped sub = operand - acc
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpsubd zmm{{r}}, zmm16, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_sub_flipped:
+    // flipped sub = acc - operand
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpsubd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}load_tile:
+    // Scratch layout is COL-MAJOR i32 from scratch.rs Store/AddUnicast remnant:
+    //   tile[col][row] at offset (col*MR + row)*4 with MR=16
+    //   = offset col*64 + row*4
+    // For row-major accumulators we gather row m's 16 cols at index step 64.
+    mov         r8, [rdi + 8]
+    vmovdqa32   zmm16, [rip + {{L}}lane_offsets_64]    // [0, 64, 128, ..., 15*64]
+    {% for m in range(0, 16) %}
+    mov         eax, 0xFFFF
+    kmovw       k1, eax
+    vpgatherdd  zmm{{m}}{k1}, [r8 + zmm16 + {{m * 4}}]
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_unicast:
+    mov     r10,    [rdi + 8]           // c ptr (base)
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+    mov     r8,     [rdi + 32]          // item size
+
+    cmp     r8,    4
+    je      {{L}}non_linear_addc_i32
+
+    // i8 path: read 16 i8 from [r10 + m*rsi + n*rbx] for n=0..15, sign-extend
+    // to i32, add to zmm{m}. Use a stack scratch buffer (16 bytes per row).
+    sub     rsp, 16
+    {% for m in range(0, 16) %}
+    mov     r8, r10
+    {% for n in range(0, 16) %}
+    mov     al, [r8]
+    mov     byte ptr [rsp + {{n}}], al
+    add     r8, rbx
+    {% endfor %}
+    vpmovsxbd       zmm16, [rsp]
+    vpaddd          zmm{{m}}, zmm{{m}}, zmm16
+    add     r10, rsi
+    {% endfor %}
+    add     rsp, 16
+    jmp     {{L}}non_linear_loop
+
+{{L}}non_linear_addc_i32:
+    // i32 strided read of external (or scratch) tile. Build per-lane index
+    // vector [0, rbx, 2*rbx, ..., 15*rbx] once, then gather row by row.
+    mov             eax, ebx
+    vmovd xmm16, eax
+    vpbroadcastd    zmm16, xmm16
+    vpmulld         zmm16, zmm16, [rip + {{L}}lane_indices]    // [0, rbx, 2*rbx, ...]
+
+    {% for m in range(0, 16) %}
+    mov             eax, 0xFFFF
+    kmovw           k1, eax
+    vpgatherdd      zmm17{k1}, [r10 + zmm16]
+    vpaddd          zmm{{m}}, zmm{{m}}, zmm17
+    add             r10, rsi
+    {% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    // bias[m, n] = row_data[m] * col_data[n], add to C[m, n].
+    // For row-major regs: load 16 col_data values once into zmm16,
+    //                     for each m: broadcast row_data[m], FMA add.
+    mov             rax, [rdi + 8]
+    mov             rbx, [rdi + 16]
+
+    vmovdqu32       zmm16, [rax]                          // 16 row_data values
+    vmovdqu32       zmm17, [rbx]                          // 16 col_data values
+
+    {% for m in range(0, 16) %}
+    vpbroadcastd    zmm18, dword ptr [rax + {{m * 4}}]    // splat row_data[m]
+    vpmulld         zmm19, zmm18, zmm17
+    vpaddd          zmm{{m}}, zmm{{m}}, zmm19
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+// ---- Q-scale (mult-shift with rounding) ---------------------------------
+
+{{L}}q_scale:
+    mov             r8, [rdi + 16]                  // policy
+    vpbroadcastd    zmm16, dword ptr [rdi + 24]     // multi (broadcast i32)
+
+    mov             rax, 1
+    vmovq xmm17, rax
+    vpbroadcastq    zmm17, xmm17                    // zmm17 <- 1 (i64 lanes)
+
+    mov             rax, [rdi + 8]                  // shift
+    add             rax, 31
+    vmovq xmm18, rax
+    vpbroadcastq    zmm18, xmm18                    // zmm18 <- (shift+31) (i64 lanes)
+
+    vpsubq          zmm19, zmm18, zmm17
+    vpsllvq         zmm19, zmm17, zmm19             // zmm19 <- 1 << (shift+31-1)  (i64)
+
+    // Per-lane interleave mask for blending evens / shifted-odds.
+    // bit i = 1 means take from "evens" source in vpblendmd; bit 0,2,4,...,14 set.
+    mov             eax, 0x5555
+    kmovw           k7, eax
+
+    cmp     r8, 1
+    je      {{L}}q_scale_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_scale_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_scale_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_scale_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_scale_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_scale_rounding_odd
+
+    jmp     {{L}}unsupported
+
+{{L}}q_scale_rounding_zero:           // signum * ( (abs + nudge - 1) >> shift )
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16                 // even-lane i32 -> i64 mul
+    vpmuldq     zmm21, zmm21, zmm16                 // odd-lane  i32 -> i64 mul
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsubq      zmm20, zmm20, zmm17
+    vpsubq      zmm21, zmm21, zmm17
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20             // k7=0x5555: evens from zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_away:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_minus_inf:      // nudge by -1 where input was negative
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpxorq      zmm22, zmm22, zmm22
+    vpcmpgtd    k1,    zmm{{i}}, zmm22              // k1: 1 where input > 0 (we want the inverse, see below)
+    knotw       k1,    k1                           // 1 where input <= 0  -- we want "input was negative => subtract 1"
+    // For "<0": use compare against 0 with vpcmpltd
+    vpxorq      zmm22, zmm22, zmm22
+    vpcmpltd    k1,    zmm{{i}}, zmm22              // 1 where input < 0
+    vmovdqa32   zmm23{k1}{z}, [rip + {{L}}all_ones_i32] // (1 << 0) per neg lane, 0 elsewhere
+
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    // Subtract 1 from i64-evens / i64-odds where the original i32 input was < 0.
+    vpsrldq     zmm24, zmm23, 4
+    vpmovsxdq   zmm25, ymm23
+    vpmovsxdq   zmm26, ymm24
+    vpsubq      zmm20, zmm20, zmm25
+    vpsubq      zmm21, zmm21, zmm26
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_plus_inf:       // nudge by +1 where input was non-negative
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpxorq      zmm22, zmm22, zmm22
+    vpcmpled    k1,    zmm22, zmm{{i}}              // 1 where input >= 0
+    vmovdqa32   zmm23{k1}{z}, [rip + {{L}}all_ones_i32]
+
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsrldq     zmm24, zmm23, 4
+    vpmovsxdq   zmm25, ymm23
+    vpmovsxdq   zmm26, ymm24
+    vpsubq      zmm20, zmm20, zmm25
+    vpsubq      zmm21, zmm21, zmm26
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_even:           // banker's: round half to even
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpsrlq      zmm22, zmm20, xmm18
+    vpandq      zmm22, zmm22, zmm17
+    vpaddq      zmm20, zmm20, zmm22
+    vpsubq      zmm20, zmm20, zmm17
+
+    vpsrlq      zmm22, zmm21, xmm18
+    vpandq      zmm22, zmm22, zmm17
+    vpaddq      zmm21, zmm21, zmm22
+    vpsubq      zmm21, zmm21, zmm17
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_odd:            // round half to odd
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpsrlq      zmm22, zmm20, xmm18
+    vpandq      zmm22, zmm22, zmm17
+    vpsubq      zmm20, zmm20, zmm22
+
+    vpsrlq      zmm22, zmm21, xmm18
+    vpandq      zmm22, zmm22, zmm17
+    vpsubq      zmm21, zmm21, zmm22
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shl:
+    mov             eax, [rdi + 8]                  // -shift (count: i32)
+    vmovd xmm16, eax
+    vpbroadcastd    zmm16, xmm16
+    {% for i in range(0, 16) %}vpsllvd zmm{{i}}, zmm{{i}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr:
+    mov             r8,  [rdi + 16]                 // policy
+
+    mov             eax, 1
+    vmovd xmm16, eax
+    vpbroadcastd    zmm16, xmm16                    // zmm16 <- 1 (i32 lanes)
+
+    mov             eax, [rdi + 8]                  // shift
+    vmovd xmm17, eax
+    vpbroadcastd    zmm17, xmm17                    // zmm17 <- shift (i32 lanes)
+
+    mov             ebx, 1
+    mov             cl,  al
+    sub             cl,  1
+    sal             ebx, cl                         // ebx <- 1 << (shift - 1)
+    vmovd xmm18, ebx
+    vpbroadcastd    zmm18, xmm18                    // zmm18 <- "half"
+
+    vpxorq          zmm19, zmm19, zmm19             // zeroes
+
+    cmp     r8, 1
+    je      {{L}}q_shr_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_shr_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_shr_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_shr_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_shr_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_shr_rounding_odd
+
+    jmp     {{L}}unsupported
+
+{{L}}q_shr_rounding_zero:
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsubd      zmm20, zmm20, zmm16
+    vpaddd      zmm20, zmm20, zmm18
+    vpsravd     zmm20, zmm20, zmm17
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_away:
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpaddd      zmm20, zmm20, zmm18
+    vpsravd     zmm20, zmm20, zmm17
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_minus_inf:
+{% for i in range(0, 16) %}
+    vpsubd      zmm{{i}}, zmm{{i}}, zmm16
+    vpaddd      zmm{{i}}, zmm{{i}}, zmm18
+    vpsravd     zmm{{i}}, zmm{{i}}, zmm17
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_plus_inf:
+{% for i in range(0, 16) %}
+    vpaddd      zmm{{i}}, zmm{{i}}, zmm18
+    vpsravd     zmm{{i}}, zmm{{i}}, zmm17
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_even:
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsravd     zmm21, zmm20, zmm17
+    vpandq      zmm21, zmm21, zmm16
+    vpsubd      zmm21, zmm21, zmm16             // nudge = ((abs >>l shift) & 1) - 1
+    vpaddd      zmm20, zmm20, zmm21
+    vpaddd      zmm20, zmm20, zmm18
+    vpsravd     zmm20, zmm20, zmm17
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_odd:
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsravd     zmm21, zmm20, zmm17
+    vpandq      zmm21, zmm21, zmm16
+    vpsubd      zmm21, zmm19, zmm21             // nudge = -((abs >>l shift) & 1)
+    vpaddd      zmm20, zmm20, zmm21
+    vpaddd      zmm20, zmm20, zmm18
+    vpsravd     zmm20, zmm20, zmm17
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+// ---- Store ---------------------------------------------------------------
+
+{{L}}store:
+    mov     r8,     [rdi + 8]               // c ptr
+    mov     rsi,    [rdi + 16]              // row stride
+    mov     rdx,    [rdi + 24]              // col stride
+    mov     rcx,    [rdi + 32]              // item size
+
+    cmp     rcx, 4
+    je      {{L}}store_strides_i32
+    // else: i8 fallthrough
+
+    cmp     rdx, 1
+    je      {{L}}store_strides_i8_row_contig
+
+    // Generic i8 strided store: per row, per lane scalar byte stores
+    {% for m in range(0, 16) %}
+    mov     r10, r8
+    // Extract from each 128-bit slice of zmm{{m}}
+    vextracti32x4   xmm20, zmm{{m}}, 0
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         byte ptr [r10], bl
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 1
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         byte ptr [r10], bl
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 2
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         byte ptr [r10], bl
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 3
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         byte ptr [r10], bl
+    add         r10, rdx
+    {% endfor %}
+    add     r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i8_row_contig:
+    // Each row is 16 i8 contiguous; one vpmovdb per row.
+    {% for m in range(0, 16) %}
+    vpmovdb     [r8], zmm{{m}}
+    add         r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i32:
+    cmp     rdx, 4
+    je      {{L}}store_strides_i32_row_contig
+
+    // Generic i32 strided store
+    {% for m in range(0, 16) %}
+    mov     r10, r8
+    vextracti32x4   xmm20, zmm{{m}}, 0
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 1
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 2
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 3
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    add     r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i32_row_contig:
+    // C is row-major in memory: each row's 16 i32 are contiguous; one
+    // 64-byte aligned-or-unaligned store per row.
+    {% for m in range(0, 16) %}
+    vmovdqu32   [r8], zmm{{m}}
+    add         r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}return:
+    tilerelease
+    add         rsp, 64
+
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+// ---- Read-only data (RIP-relative) ---------------------------------------
+
+.p2align 6
+{{L}}lane_offsets_64:
+    .int  0,  64, 128, 192, 256, 320, 384, 448
+    .int  512, 576, 640, 704, 768, 832, 896, 960
+
+.p2align 6
+{{L}}lane_indices:
+    .int  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+.p2align 6
+{{L}}all_ones_i32:
+    .int  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
+{% if msvc %}
+avx512amx_mmm_i32_16x16_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/linalg/x86_64/fma/avx512amx_mmm_i32_8x8.S.j2 b/linalg/x86_64/fma/avx512amx_mmm_i32_8x8.S.j2
new file mode 100644
index 0000000000..ff65b6484a
--- /dev/null
+++ b/linalg/x86_64/fma/avx512amx_mmm_i32_8x8.S.j2
@@ -0,0 +1,764 @@
+{#
+// vim: set syntax=asm :
+
+/* mmm 8x8:
+
+    ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+#}
+
+{% if msvc %}
+
+_text segment
+avx512amx_mmm_i32_8x8_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avx512amx_mmm_i32_8x8_{{suffix}}
+{{G}}avx512amx_mmm_i32_8x8_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+    // Reserve 64 bytes of stack for the AMX tile-config block, zero it,
+    // populate palette + tile dimensions, then ldtilecfg. The tile config
+    // stays live for the whole function; tilerelease is emitted at return.
+    //
+    // tmm0 = C accumulator: 8 rows x 32 colsb (8 M-rows x 8 N-cols of i32)
+    // tmm1 = A tile: 8 rows x 64 colsb (8 M-rows x 64 K-bytes per inner iter)
+    // tmm2 = B tile: 16 rows x 32 colsb (16 K-pair-rows x 8 N-cols * 4 K-bytes)
+    sub         rsp, 64
+    vpxor       xmm15, xmm15, xmm15
+    vmovdqu     [rsp     ], xmm15
+    vmovdqu     [rsp + 16], xmm15
+    vmovdqu     [rsp + 32], xmm15
+    vmovdqu     [rsp + 48], xmm15
+    mov         byte ptr  [rsp + 0 ], 1   // palette = 1
+    mov         word ptr  [rsp + 16], 32  // colsb[0] = 32 (tmm0)
+    mov         word ptr  [rsp + 18], 64  // colsb[1] = 64 (tmm1)
+    mov         word ptr  [rsp + 20], 32  // colsb[2] = 32 (tmm2)
+    mov         byte ptr  [rsp + 48], 8   // rows[0]  = 8  (tmm0)
+    mov         byte ptr  [rsp + 49], 8   // rows[1]  = 8  (tmm1)
+    mov         byte ptr  [rsp + 50], 16  // rows[2]  = 16 (tmm2)
+    ldtilecfg   [rsp]
+
+{% include "dispatcher.j2" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     r12,    [rdi + 32]   // packing
+    mov     rbx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rcx,    [rdi + 8]    // k
+    test    rcx,    rcx
+    jz      {{L}}non_linear_loop
+
+    cmp     r12, 1
+    je      {{L}}main_loop_packed_packed_i8i8
+
+{{L}}main_loop_packed_packed:
+    vmovaps         ymm12,  [rax]
+
+    {% for i in range(0, 8) %}
+        vbroadcastss    ymm14, dword ptr [rbx + {{i}} * 4]
+        vpmulld         ymm13, ymm12, ymm14
+        vpaddd          ymm{{i}}, ymm{{i}}, ymm13
+    {% endfor %}
+
+    add             rax,    32
+    add             rbx,    32
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed_i8i8:
+    // AMX i8 layout: A panel is 8 M-rows x K_padded K-bytes ROW-major within
+    // each 8-row panel (PackedAmxA8); B panel reuses the existing VNNI K=4-
+    // inner format (8 N-cols x 4 K-bytes per K-block, 16 such blocks per
+    // K=64 AMX tile).  K is padded to a multiple of 64 by the packer.
+    //
+    // tdpbssd is s8 x s8 -> i32 (Sapphire Rapids+), so no +128 trick is needed:
+    // the i32 accumulators are bit-identical to the AVX2 / VNNI paths.
+    //
+    // Per tdpbssd: tmm0[m, n] += sum_{k=0..63} A[m, k] * B[k, n]
+    //   (16 M-rows x 16 N-i32-lanes x 64 K = 16384 mul-acc per instruction)
+
+    // r8 <- K_padded = ceil(k/64) * 64 = byte-stride between A's M-rows.
+    mov             r8,     rcx
+    add             r8,     63
+    and             r8,     -64
+
+    // rcx <- ceil(k/64) = number of K=64 AMX inner iterations.
+    add             rcx,    63
+    shr             rcx,    6
+
+    // r9 <- 32 = byte-stride between B's K-pair-rows.
+    mov             r9,     32
+
+    tilezero        tmm0
+
+{{L}}loop_64k_amx_i8i8:
+    // Prefetch the data we'll need ONE iteration ahead. tileloadd brings
+    // the active tile data into L1 on demand; the prefetcht0 hints below
+    // ask the hardware prefetcher to start the L2->L1 fill for the next
+    // iter's A row (64 B further along the K axis) and the next iter's
+    // B panel (512 B = 8 cache lines further). For the long K loops
+    // (K>=256) the B-side prefetch matters most since each iter consumes
+    // 8 cache lines of B vs 1 cache line of A row 0.
+    prefetcht0      [rax + 64]
+    prefetcht0      [rbx + 512]
+    prefetcht0      [rbx + 576]
+    prefetcht0      [rbx + 640]
+    prefetcht0      [rbx + 704]
+    prefetcht0      [rbx + 768]
+    prefetcht0      [rbx + 832]
+    prefetcht0      [rbx + 896]
+    prefetcht0      [rbx + 960]
+    tileloadd       tmm1,   [rax + r8 * 1]         // A tile: stride r8 = K_padded
+    tileloadd       tmm2,   [rbx + r9 * 1]         // B tile: stride r9 = 32
+    tdpbssd         tmm0,   tmm1,   tmm2
+    add             rax,    64                     // +64 K-bytes in A row 0
+    add             rbx,    512                    // +16 K-pairs * 32 = 512 B bytes
+    dec             rcx
+    jnz             {{L}}loop_64k_amx_i8i8
+
+    // tmm0 -> ymm0..ymm7 via 256-byte stack scratch (8 rows x 32 bytes).
+    // After tilestored, the layout is row-major i32: byte (m*32 + n*4) = C[m, n].
+    // We need ymm{n} = column n of C with 8 i32 lanes (rows m=0..7) — the
+    // dispatcher epilogue convention.  So we (a) load 8 ymms = 8 rows of C,
+    // then (b) transpose 8x8 i32 in place.
+    sub             rsp,    256
+    mov             r10,    rsp
+    mov             r11,    32
+    tilestored      [r10 + r11 * 1], tmm0
+
+    {% for r in range(0, 8) %}
+    vmovdqu         ymm{{r}}, [r10 + {{ r * 32 }}]
+    {% endfor %}
+
+    add             rsp,    256
+
+    // 8x8 i32 transpose: ymm0..ymm7 row-major -> column-major in place.
+    // Stage 1: interleave 32-bit dwords pairwise (ymm0..ymm7 -> ymm8..ymm15).
+    vpunpckldq      ymm8,   ymm0, ymm1     // [r0[0], r1[0], r0[1], r1[1], r0[4], r1[4], r0[5], r1[5]]
+    vpunpckhdq      ymm9,   ymm0, ymm1
+    vpunpckldq      ymm10,  ymm2, ymm3
+    vpunpckhdq      ymm11,  ymm2, ymm3
+    vpunpckldq      ymm12,  ymm4, ymm5
+    vpunpckhdq      ymm13,  ymm4, ymm5
+    vpunpckldq      ymm14,  ymm6, ymm7
+    vpunpckhdq      ymm15,  ymm6, ymm7
+
+    // Stage 2: interleave 64-bit quads (ymm8..ymm15 -> ymm0..ymm7).
+    vpunpcklqdq     ymm0,   ymm8,  ymm10   // [r0[0], r1[0], r2[0], r3[0], r0[4], r1[4], r2[4], r3[4]]
+    vpunpckhqdq     ymm1,   ymm8,  ymm10
+    vpunpcklqdq     ymm2,   ymm9,  ymm11
+    vpunpckhqdq     ymm3,   ymm9,  ymm11
+    vpunpcklqdq     ymm4,   ymm12, ymm14
+    vpunpckhqdq     ymm5,   ymm12, ymm14
+    vpunpcklqdq     ymm6,   ymm13, ymm15
+    vpunpckhqdq     ymm7,   ymm13, ymm15
+
+    // Stage 3: cross-lane permute (128-bit halves). Two phases so we can
+    // overwrite the inputs incrementally without clobbering needed data.
+    vperm2i128      ymm8,   ymm0, ymm4, 0x20    // col 0: low(y0) | low(y4)
+    vperm2i128      ymm9,   ymm1, ymm5, 0x20    // col 1
+    vperm2i128      ymm10,  ymm2, ymm6, 0x20    // col 2
+    vperm2i128      ymm11,  ymm3, ymm7, 0x20    // col 3
+    vperm2i128      ymm12,  ymm0, ymm4, 0x31    // col 4: high(y0) | high(y4)
+    vperm2i128      ymm13,  ymm1, ymm5, 0x31    // col 5
+    vperm2i128      ymm14,  ymm2, ymm6, 0x31    // col 6
+    vperm2i128      ymm15,  ymm3, ymm7, 0x31    // col 7
+
+    vmovdqa         ymm0,   ymm8
+    vmovdqa         ymm1,   ymm9
+    vmovdqa         ymm2,   ymm10
+    vmovdqa         ymm3,   ymm11
+    vmovdqa         ymm4,   ymm12
+    vmovdqa         ymm5,   ymm13
+    vmovdqa         ymm6,   ymm14
+    vmovdqa         ymm7,   ymm15
+
+    jmp             {{L}}non_linear_loop
+
+{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_scalars.j2" %}
+{% set mr = 8 %}{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_per_rows.j2" %}
+{% set mr = 8 %}{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_per_cols.j2" %}
+{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_load_tile.j2" %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+    mov     r8,     [rdi + 32]          // item size
+
+    cmp     r8,    4
+    je      {{L}}non_linear_addc_i32
+
+{#
+// This is not great as vgatherdps reads 32-bits values and goes beyond our buffer. Probably harmless though.
+// Commented and replaced with the "mov al" loop beyond to pacify valgrind.
+// ymm14 and ymm15 are the same as in the non_linear_addc_i32 case (compute them before the test right above here.
+// {% for i in range(0, 8) %}
+//     vpcmpeqd        ymm15, ymm15, ymm15
+//     vgatherdps      ymm12, [ r10 + ymm14 ], ymm15   // 0xxx 1xxx 2xxx 3xxx 4xxx 5xxx 6xxx 7xxx
+//
+//     // we need to go through vpmovsxbd, shuffling naively erases signs
+//     vpshufb         ymm12, ymm12, ymm10             // 0123 0123 0123 0123 4567 4567 4567 4567
+//
+//     vpermd          ymm12, ymm11, ymm12             // 0123 4567
+//     vpmovsxbd       ymm12, xmm12                    // sign extend
+//
+//     vpaddd          ymm{{i}},   ymm{{i}},   ymm12
+//     add             r10, rbx
+// {% endfor %}
+#}
+
+    {% for col in range(0, 8) %}
+        mov r8, r10
+        {% for half in range(0, 2) %}
+            {% for lane in range(0, 4) %}
+                mov al, [ r8 ]
+                add r8, rsi
+                movsx eax, al
+                pinsrd xmm10, eax, {{lane}}
+            {% endfor %}
+            vperm2f128  ymm10,   ymm10,   ymm10,  1
+        {% endfor %}
+        vpaddd ymm{{col}}, ymm{{col}}, ymm10
+        add r10, rbx
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}non_linear_addc_i32:
+
+    mov     eax,    0
+{% for i in range(0, 4) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+    vpermq          ymm14, ymm14, 78 // 0b01001110
+{% for i in range(0, 4) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+    vpermq          ymm14, ymm14, 78 // 0b01001110
+
+
+{% if msvc %}
+    vpbroadcastd    ymm10, dword ptr [ offset byte_shuffle ]
+    vmovups         ymm11, dword ptr [ offset i128_shuffle ]
+{% else %}
+    vpbroadcastd    ymm10, [ rip + {{L}}byte_shuffle ]
+    vmovups         ymm11, [ rip + {{L}}i128_shuffle ]
+{% endif %}
+
+{% for i in range(0, 8) %}
+    vpcmpeqd        ymm15, ymm15, ymm15
+    vgatherdps      ymm12, [ r10 + ymm14 ], ymm15
+    vpaddd          ymm{{i}},   ymm{{i}},   ymm12
+    add             r10, rbx
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{% if msvc %}
+.data
+byte_shuffle dd              201851904 // 0x0c080400
+i128_shuffle dd              0, 4
+.code
+{% else %}
+{{L}}byte_shuffle: .int            201851904 // 0x0c080400
+{{L}}i128_shuffle: .int            0, 4
+{% endif %}
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         ymm12,  [rax]
+
+{% for i in range(0, 8) %}
+    vbroadcastss    ymm14, dword ptr [rbx + {{ i * 4 }} ]
+    vpmulld         ymm15, ymm12, ymm14
+    vpaddd          ymm{{i}}, ymm{{i}}, ymm15
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale:
+    mov             r8, [ rdi + 16 ]        // policy
+    vbroadcastss    ymm8, dword ptr [rdi + 24] // multi
+
+    mov             rax, 1
+    movq            xmm9, rax
+    vpbroadcastq    ymm9, xmm9              // ymm9 <- 1
+
+    mov             rax, [ rdi + 8 ]        // xmm10 <- shift + 31
+    add             rax, 31
+    movq            xmm10, rax
+    vpbroadcastq    ymm10, xmm10
+
+    mov             rax, 1
+    movq            xmm11, rax
+    vpsubq          ymm12, ymm10, ymm9      // shift+31 - 1
+    vpsllq          ymm11, ymm9, xmm12      // ymm11 <- 1 << (shift + 31 - 1)
+
+    cmp     r8, 1
+    je      {{L}}q_scale_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_scale_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_scale_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_scale_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_scale_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_scale_rounding_odd
+
+    jmp    {{L}}unsupported
+
+{{L}}q_scale_rounding_zero:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsubq      ymm14, ymm14, ymm9
+    vpsubq      ymm15, ymm15, ymm9
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_away:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_minus_inf:           // signum * ( (abs << 32 + 1<<30+shift) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    // sign extract for nudging in the right direction
+    vpxor       ymm13, ymm13, ymm13
+    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros)
+    vpsrld      ymm13, ymm13, 31            // then just 0 or 1
+
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_plus_inf:           // signum * ( (abs << 32 + 1<<30+shift) >> shift )
+
+    vpbroadcastd ymm9, xmm9
+
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpxor       ymm13, ymm13, ymm13
+
+    // sign extract for nudging in the right direction
+    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros)
+    vpaddd      ymm13, ymm13, ymm9          // if val >= 0 { 0i32 } else { 1i32 }
+
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_even:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpsrlq      ymm12, ymm14, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpaddq      ymm14, ymm14, ymm12
+    vpsubq      ymm14, ymm14, ymm9
+
+    vpsrlq      ymm12, ymm15, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpaddq      ymm15, ymm15, ymm12
+    vpsubq      ymm15, ymm15, ymm9
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_odd:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpsrlq      ymm12, ymm14, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrlq      ymm12, ymm15, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_shl:
+    mov             eax, [ rdi + 8 ]        // xmm10 <- -shift (8 times)
+    movd            xmm10, eax
+    vpbroadcastd    ymm10, xmm10
+
+{% for i in range(0, 8) %}
+    vpsllvd     ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr:
+    mov             r8, [ rdi + 16 ]        // policy
+
+    mov             eax, 1
+    movd            xmm9, eax
+    vpbroadcastd    ymm9, xmm9              // ymm9 <- 1u32 (8 times)
+
+    mov             eax, [ rdi + 8 ]        // xmm10 <- shift (8 times)
+    movd            xmm10, eax
+    vpbroadcastd    ymm10, xmm10
+
+    mov             ebx, 1
+    mov             cl, al
+    sub             cl, 1                  // rcx <- shift -1
+    sal             ebx, cl                // rbx <- (1 << (shift - 1))
+    movd            xmm11, ebx
+    vpbroadcastd    ymm11, xmm11            // ymm11 <- "half"
+
+    vpxor           ymm12, ymm12, ymm12     // ymm12 <- zeroes
+
+    cmp     r8, 1
+    je      {{L}}q_shr_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_shr_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_shr_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_shr_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_shr_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_shr_rounding_odd
+
+    jmp    {{L}}unsupported
+
+{{L}}q_shr_rounding_zero:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsubd      ymm14, ymm14, ymm9
+    vpaddd      ymm14, ymm14, ymm11
+    vpsravd     ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_away:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpaddd      ymm14, ymm14, ymm11
+    vpsravd     ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_minus_inf:
+{% for i in range(0, 8) %}
+    vpsubd  ymm{{i}}, ymm{{i}}, ymm9
+    vpaddd  ymm{{i}}, ymm{{i}}, ymm11
+    vpsravd ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_plus_inf:
+{% for i in range(0, 8) %}
+    vpaddd  ymm{{i}}, ymm{{i}}, ymm11
+    vpsravd ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_even:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsravd ymm13, ymm14, ymm10
+    vpand   ymm13, ymm13, ymm9
+    vpsubd  ymm13, ymm13, ymm9          // nudge = ((abs >>l shift) & 0x01) - 1
+    vpaddd  ymm14, ymm14, ymm13         // add nudge
+    vpaddd  ymm14, ymm14, ymm11         // add half
+    vpsravd ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_odd:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsravd ymm13, ymm14, ymm10
+    vpand   ymm13, ymm13, ymm9
+    vpsubd  ymm13, ymm12, ymm13          // nudge = - ((abs >>l shift) & 0x01)
+    vpaddd  ymm14, ymm14, ymm13         // add nudge
+    vpaddd  ymm14, ymm14, ymm11         // add half
+    vpsravd ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rdx,    [rdi + 24]          // col stride
+    mov     rcx,    [rdi + 32]          // item size
+
+    cmp     rcx,    4
+    je      {{L}}store_strides_i32
+
+    {% for col in range(0, 8) %}
+        mov r10, r8
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         byte ptr [r10], bl
+            add         r10, rsi
+        {% endfor %}
+        vperm2f128  ymm{{col}},   ymm{{col}},   ymm{{col}},  1
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         byte ptr [r10], bl
+            add         r10, rsi
+        {% endfor %}
+        add r8, rdx
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i32:
+    {% for col in range(0, 8) %}
+        mov r10,    r8
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         dword ptr [r10], ebx
+            add         r10, rsi
+        {% endfor %}
+        vperm2f128  ymm{{col}},   ymm{{col}},   ymm{{col}},  1
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         dword ptr [r10], ebx
+            add         r10, rsi
+        {% endfor %}
+        add r8, rdx
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}return:
+    // Tear down AMX state: release tile registers and reclaim the tile-config
+    // stack space we allocated right after the standard prologue.
+    tilerelease
+    add         rsp, 64
+
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+
+{{L}}one_32bit:
+{% if msvc %}
+    dd      1
+{% else %}
+    .int    1
+{% endif %}
+
+{% if msvc %}
+avx512amx_mmm_i32_8x8_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/linalg/x86_64/fma/avx512vnni_mmm_i32_16x16.S.j2 b/linalg/x86_64/fma/avx512vnni_mmm_i32_16x16.S.j2
new file mode 100644
index 0000000000..4c61169ff9
--- /dev/null
+++ b/linalg/x86_64/fma/avx512vnni_mmm_i32_16x16.S.j2
@@ -0,0 +1,885 @@
+// vim: set syntax=asm :
+//
+// AVX-512 VNNI int8 GEMM kernel, 16 M-rows x 16 N-cols i32 accumulator output.
+//
+// The zmm-wide (512-bit) sibling of avx512vnni_mmm_i32_8x8: where the 8x8
+// kernel accumulates 8 columns per ymm, this one accumulates 16 columns per
+// zmm over 16 rows, so one VPDPBUSD covers a 16-lane x 4-K = 64 mul-add slab
+// and the K=4 inner step issues 16 of them (one per row) -- 1024 mul-adds per
+// K=4 block, 2x the work-per-iteration of the 8x8 ymm kernel. It is the int8
+// throughput tier of qmmm_i32 on big cores that have AVX-512-VNNI but no AMX
+// (Cascade Lake / Ice Lake / Tiger Lake server + client SKUs).
+//
+// VPDPBUSD is u8 x s8, so (like the 8x8 kernel) the A bytes are offset by +128
+// to become u8 and the resulting 128 * sum_k(B[n]) bias is removed per column
+// after the loop; the i32 accumulators are then bit-identical to the AVX2 /
+// VNNI-8x8 / AMX reference paths.
+//
+// A and B both use PackedI8K4(16): per K=4 block, 16 elements x 4 K-bytes = 64
+// bytes, element e at byte offset e*4 holding [e, 4kb..4kb+3]; K is zero-padded
+// to a multiple of 4 by the packer.
+//
+// REGISTER LAYOUT
+//   zmm0..zmm15 = accumulators, ROW-MAJOR: zmm{m} holds the 16 i32 lanes
+//                 [C[m, 0], C[m, 1], ..., C[m, 15]] for row m. Row-major makes
+//                 the per-column +128 bias a single vector subtract and lets
+//                 the Store path write each row with one vmovdqu32.
+//   zmm16 = B K=4 block (lane n = B[n, 4kb..]);  zmm17 = u8 ones (0x01010101);
+//   zmm18 = broadcast A[m, 4kb..] (+128 -> u8);  zmm19 = bias (sum_k B[n]);
+//   zmm20 = 0x80808080 (the +128 byte bias added to A).
+
+{% if msvc %}
+
+_text segment
+avx512vnni_mmm_i32_16x16_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avx512vnni_mmm_i32_16x16_{{suffix}}
+{{G}}avx512vnni_mmm_i32_16x16_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+{% include "dispatcher.j2" %}
+
+{{L}}clear:
+    // vzeroall only zeros lower-256 of zmm0..15; explicitly zero the full
+    // accumulators (zmm0..zmm15) for AMX.
+    {% for r in range(0, 16) %}
+    vpxorq      zmm{{r}}, zmm{{r}}, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     r12,    [rdi + 32]   // packing
+    mov     rbx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rcx,    [rdi + 8]    // k
+    test    rcx,    rcx
+    jz      {{L}}non_linear_loop
+
+    cmp     r12, 1
+    je      {{L}}main_loop_packed_packed_i8i8
+
+{{L}}main_loop_packed_packed:
+    // Generic i32 x i32 fallback path (not AMX). For row-major accumulators
+    // with zmm{m} = row m of C, accumulating C[m, n] += A[m, k] * B[k, n]:
+    //   - load 16 B values for this K row into zmm16 (row of B)
+    //   - for each m: broadcast A[m, k], multiply by zmm16, add to zmm{m}
+    vmovups         zmm16, [rbx]               // 16 i32 of B at this K row
+
+    {% for m in range(0, 16) %}
+        vpbroadcastd    zmm17, dword ptr [rax + {{m}} * 4]
+        vpmulld         zmm18, zmm16, zmm17
+        vpaddd          zmm{{m}}, zmm{{m}}, zmm18
+    {% endfor %}
+
+    add             rax,    64                 // 16 i32 lanes per K step
+    add             rbx,    64
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed_i8i8:
+    // PackedI8K4(16) for both A and B: per K=4 block, 16 elements x 4 K-bytes =
+    // 64 bytes, element e at byte offset e*4 holding [e, 4kb..4kb+3].
+    //   B block  -> zmm16, lane n = B[n, 4kb..]              (the s8 operand)
+    //   A[m] (its 4 K-bytes) broadcast to all 16 lanes, +128 -> the u8 operand
+    // VPDPBUSD zmm{m}, A_bcast(u8), Bblock(s8): lane n += sum_t (A[m,t]+128)*B[n,t]
+    //   = C[m, n] + 128 * sum_t B[n, t]. That 128*sum_k(B[n]) bias is the same
+    // for every row m, so it is accumulated once per column in zmm19 (via a u8
+    // all-ones VPDPBUSD) and subtracted from every accumulator after the loop,
+    // leaving the i32 accumulators bit-identical to the AVX2 / 8x8 paths.
+
+    add             rcx,    3
+    shr             rcx,    2                      // rcx <- ceil(k/4) K=4 blocks
+
+    mov             r8d,    0x01010101
+    vmovd           xmm17,  r8d
+    vpbroadcastd    zmm17,  xmm17                  // zmm17 <- u8 ones (sum of B)
+
+    mov             r8d,    0x80808080
+    vmovd           xmm20,  r8d
+    vpbroadcastd    zmm20,  xmm20                  // zmm20 <- byte 0x80 (A + 128)
+
+    vpxorq          zmm19,  zmm19,  zmm19          // zmm19 <- per-col sum_k B[n]
+
+{{L}}loop_4k_i8i8_16x16:
+    vmovdqu32       zmm16,  [rbx]                  // B block: lane n = B[n, 4kb..]
+    vpdpbusd        zmm19,  zmm17,  zmm16          // sum_k B[n] += sum_t B[n, 4kb+t]
+
+    {% for m in range(0, 16) %}
+    vpbroadcastd    zmm18,  dword ptr [rax + {{ m * 4 }}]
+    vpaddb          zmm18,  zmm18,  zmm20          // s8 -> u8 (+128, modular)
+    vpdpbusd        zmm{{m}}, zmm18, zmm16         // acc[m][n] += sum_t (A[m]+128)*B[n]
+    {% endfor %}
+
+    add             rax,    64                     // next A K=4 block (16 rows * 4 K)
+    add             rbx,    64                     // next B K=4 block (16 cols * 4 K)
+    dec             rcx
+    jnz             {{L}}loop_4k_i8i8_16x16
+
+    // remove the +128 bias added on A: acc[m][n] -= 128 * sum_k B[n] (per column)
+    vpslld          zmm19,  zmm19,  7              // lane n <- 128 * sum_k B[n]
+    {% for m in range(0, 16) %}
+    vpsubd          zmm{{m}}, zmm{{m}}, zmm19
+    {% endfor %}
+
+    jmp             {{L}}non_linear_loop
+
+// ---- Scalar / per-row / per-col elementwise epilogues -------------------
+
+{{L}}scalar_min:
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpminsd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_max:
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpmaxsd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_add:
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpaddd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_mul:
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpmulld zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_sub:
+    // non-flipped sub = operand - acc (matches fma_mmm_ymm_ops.j2 scalar macro)
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpsubd zmm{{r}}, zmm16, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}scalar_sub_flipped:
+    // flipped sub = acc - operand
+    vpbroadcastd    zmm16, dword ptr [rdi + 8]
+    {% for r in range(0, 16) %}vpsubd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}leaky_relu:
+    // C[m, n] = (C[m, n] >= 0) ? C[m, n] : alpha * C[m, n]
+    vpbroadcastd    zmm17, dword ptr [rdi + 8]    // alpha as i32 scale factor
+    vpxorq          zmm16, zmm16, zmm16
+    {% for r in range(0, 16) %}
+    vpmulld         zmm18, zmm{{r}}, zmm17
+    vpcmpgtd        k1,    zmm16, zmm{{r}}        // 1 where C < 0
+    vpblendmd       zmm{{r}}{k1}, zmm{{r}}, zmm18
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_min:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpminsd         zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_max:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpmaxsd         zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_add:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpaddd          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_mul:
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpmulld         zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_sub:
+    // non-flipped sub = operand - acc
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpsubd          zmm{{m}}, zmm16, zmm{{m}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_row_sub_flipped:
+    // flipped sub = acc - operand
+    mov rax, [rdi + 8]
+    {% for m in range(0, 16) %}vpbroadcastd    zmm16, dword ptr [rax + {{m * 4}}]
+    vpsubd          zmm{{m}}, zmm{{m}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_min:
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpminsd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_max:
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpmaxsd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_add:
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpaddd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_mul:
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpmulld zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_sub:
+    // non-flipped sub = operand - acc
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpsubd zmm{{r}}, zmm16, zmm{{r}}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}per_col_sub_flipped:
+    // flipped sub = acc - operand
+    mov     rax, [rdi + 8]
+    vmovdqu32       zmm16, [rax]
+    {% for r in range(0, 16) %}vpsubd zmm{{r}}, zmm{{r}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}load_tile:
+    // Scratch layout is COL-MAJOR i32 from scratch.rs Store/AddUnicast remnant:
+    //   tile[col][row] at offset (col*MR + row)*4 with MR=16
+    //   = offset col*64 + row*4
+    // For row-major accumulators we gather row m's 16 cols at index step 64.
+    mov         r8, [rdi + 8]
+    vmovdqa32   zmm16, [rip + {{L}}lane_offsets_64]    // [0, 64, 128, ..., 15*64]
+    {% for m in range(0, 16) %}
+    mov         eax, 0xFFFF
+    kmovw       k1, eax
+    vpgatherdd  zmm{{m}}{k1}, [r8 + zmm16 + {{m * 4}}]
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_unicast:
+    mov     r10,    [rdi + 8]           // c ptr (base)
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+    mov     r8,     [rdi + 32]          // item size
+
+    cmp     r8,    4
+    je      {{L}}non_linear_addc_i32
+
+    // i8 path: read 16 i8 from [r10 + m*rsi + n*rbx] for n=0..15, sign-extend
+    // to i32, add to zmm{m}. Use a stack scratch buffer (16 bytes per row).
+    sub     rsp, 16
+    {% for m in range(0, 16) %}
+    mov     r8, r10
+    {% for n in range(0, 16) %}
+    mov     al, [r8]
+    mov     byte ptr [rsp + {{n}}], al
+    add     r8, rbx
+    {% endfor %}
+    vpmovsxbd       zmm16, [rsp]
+    vpaddd          zmm{{m}}, zmm{{m}}, zmm16
+    add     r10, rsi
+    {% endfor %}
+    add     rsp, 16
+    jmp     {{L}}non_linear_loop
+
+{{L}}non_linear_addc_i32:
+    // i32 strided read of external (or scratch) tile. Build per-lane index
+    // vector [0, rbx, 2*rbx, ..., 15*rbx] once, then gather row by row.
+    mov             eax, ebx
+    vmovd xmm16, eax
+    vpbroadcastd    zmm16, xmm16
+    vpmulld         zmm16, zmm16, [rip + {{L}}lane_indices]    // [0, rbx, 2*rbx, ...]
+
+    {% for m in range(0, 16) %}
+    mov             eax, 0xFFFF
+    kmovw           k1, eax
+    vpgatherdd      zmm17{k1}, [r10 + zmm16]
+    vpaddd          zmm{{m}}, zmm{{m}}, zmm17
+    add             r10, rsi
+    {% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    // bias[m, n] = row_data[m] * col_data[n], add to C[m, n].
+    // For row-major regs: load 16 col_data values once into zmm16,
+    //                     for each m: broadcast row_data[m], FMA add.
+    mov             rax, [rdi + 8]
+    mov             rbx, [rdi + 16]
+
+    vmovdqu32       zmm16, [rax]                          // 16 row_data values
+    vmovdqu32       zmm17, [rbx]                          // 16 col_data values
+
+    {% for m in range(0, 16) %}
+    vpbroadcastd    zmm18, dword ptr [rax + {{m * 4}}]    // splat row_data[m]
+    vpmulld         zmm19, zmm18, zmm17
+    vpaddd          zmm{{m}}, zmm{{m}}, zmm19
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+// ---- Q-scale (mult-shift with rounding) ---------------------------------
+
+{{L}}q_scale:
+    mov             r8, [rdi + 16]                  // policy
+    vpbroadcastd    zmm16, dword ptr [rdi + 24]     // multi (broadcast i32)
+
+    mov             rax, 1
+    vmovq xmm17, rax
+    vpbroadcastq    zmm17, xmm17                    // zmm17 <- 1 (i64 lanes)
+
+    mov             rax, [rdi + 8]                  // shift
+    add             rax, 31
+    vmovq xmm18, rax
+    vpbroadcastq    zmm18, xmm18                    // zmm18 <- (shift+31) (i64 lanes)
+
+    vpsubq          zmm19, zmm18, zmm17
+    vpsllvq         zmm19, zmm17, zmm19             // zmm19 <- 1 << (shift+31-1)  (i64)
+
+    // Per-lane interleave mask for blending evens / shifted-odds.
+    // bit i = 1 means take from "evens" source in vpblendmd; bit 0,2,4,...,14 set.
+    mov             eax, 0x5555
+    kmovw           k7, eax
+
+    cmp     r8, 1
+    je      {{L}}q_scale_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_scale_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_scale_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_scale_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_scale_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_scale_rounding_odd
+
+    jmp     {{L}}unsupported
+
+{{L}}q_scale_rounding_zero:           // signum * ( (abs + nudge - 1) >> shift )
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16                 // even-lane i32 -> i64 mul
+    vpmuldq     zmm21, zmm21, zmm16                 // odd-lane  i32 -> i64 mul
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsubq      zmm20, zmm20, zmm17
+    vpsubq      zmm21, zmm21, zmm17
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20             // k7=0x5555: evens from zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_away:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_minus_inf:      // nudge by -1 where input was negative
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpxorq      zmm22, zmm22, zmm22
+    vpcmpgtd    k1,    zmm{{i}}, zmm22              // k1: 1 where input > 0 (we want the inverse, see below)
+    knotw       k1,    k1                           // 1 where input <= 0  -- we want "input was negative => subtract 1"
+    // For "<0": use compare against 0 with vpcmpltd
+    vpxorq      zmm22, zmm22, zmm22
+    vpcmpltd    k1,    zmm{{i}}, zmm22              // 1 where input < 0
+    vmovdqa32   zmm23{k1}{z}, [rip + {{L}}all_ones_i32] // (1 << 0) per neg lane, 0 elsewhere
+
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    // Subtract 1 from i64-evens / i64-odds where the original i32 input was < 0.
+    vpsrldq     zmm24, zmm23, 4
+    vpmovsxdq   zmm25, ymm23
+    vpmovsxdq   zmm26, ymm24
+    vpsubq      zmm20, zmm20, zmm25
+    vpsubq      zmm21, zmm21, zmm26
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_plus_inf:       // nudge by +1 where input was non-negative
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpxorq      zmm22, zmm22, zmm22
+    vpcmpled    k1,    zmm22, zmm{{i}}              // 1 where input >= 0
+    vmovdqa32   zmm23{k1}{z}, [rip + {{L}}all_ones_i32]
+
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsrldq     zmm24, zmm23, 4
+    vpmovsxdq   zmm25, ymm23
+    vpmovsxdq   zmm26, ymm24
+    vpsubq      zmm20, zmm20, zmm25
+    vpsubq      zmm21, zmm21, zmm26
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_even:           // banker's: round half to even
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpsrlq      zmm22, zmm20, xmm18
+    vpandq      zmm22, zmm22, zmm17
+    vpaddq      zmm20, zmm20, zmm22
+    vpsubq      zmm20, zmm20, zmm17
+
+    vpsrlq      zmm22, zmm21, xmm18
+    vpandq      zmm22, zmm22, zmm17
+    vpaddq      zmm21, zmm21, zmm22
+    vpsubq      zmm21, zmm21, zmm17
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_odd:            // round half to odd
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsrldq     zmm21, zmm20, 4
+    vpmuldq     zmm20, zmm20, zmm16
+    vpmuldq     zmm21, zmm21, zmm16
+
+    vpsrlq      zmm22, zmm20, xmm18
+    vpandq      zmm22, zmm22, zmm17
+    vpsubq      zmm20, zmm20, zmm22
+
+    vpsrlq      zmm22, zmm21, xmm18
+    vpandq      zmm22, zmm22, zmm17
+    vpsubq      zmm21, zmm21, zmm22
+
+    vpaddq      zmm20, zmm20, zmm19
+    vpaddq      zmm21, zmm21, zmm19
+
+    vpsrlq      zmm20, zmm20, xmm18
+    vpsrlq      zmm21, zmm21, xmm18
+
+    vpslldq     zmm21, zmm21, 4
+    vpblendmd   zmm20{k7}, zmm21, zmm20
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shl:
+    mov             eax, [rdi + 8]                  // -shift (count: i32)
+    vmovd xmm16, eax
+    vpbroadcastd    zmm16, xmm16
+    {% for i in range(0, 16) %}vpsllvd zmm{{i}}, zmm{{i}}, zmm16
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr:
+    mov             r8,  [rdi + 16]                 // policy
+
+    mov             eax, 1
+    vmovd xmm16, eax
+    vpbroadcastd    zmm16, xmm16                    // zmm16 <- 1 (i32 lanes)
+
+    mov             eax, [rdi + 8]                  // shift
+    vmovd xmm17, eax
+    vpbroadcastd    zmm17, xmm17                    // zmm17 <- shift (i32 lanes)
+
+    mov             ebx, 1
+    mov             cl,  al
+    sub             cl,  1
+    sal             ebx, cl                         // ebx <- 1 << (shift - 1)
+    vmovd xmm18, ebx
+    vpbroadcastd    zmm18, xmm18                    // zmm18 <- "half"
+
+    vpxorq          zmm19, zmm19, zmm19             // zeroes
+
+    cmp     r8, 1
+    je      {{L}}q_shr_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_shr_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_shr_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_shr_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_shr_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_shr_rounding_odd
+
+    jmp     {{L}}unsupported
+
+{{L}}q_shr_rounding_zero:
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsubd      zmm20, zmm20, zmm16
+    vpaddd      zmm20, zmm20, zmm18
+    vpsravd     zmm20, zmm20, zmm17
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_away:
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpaddd      zmm20, zmm20, zmm18
+    vpsravd     zmm20, zmm20, zmm17
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_minus_inf:
+{% for i in range(0, 16) %}
+    vpsubd      zmm{{i}}, zmm{{i}}, zmm16
+    vpaddd      zmm{{i}}, zmm{{i}}, zmm18
+    vpsravd     zmm{{i}}, zmm{{i}}, zmm17
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_plus_inf:
+{% for i in range(0, 16) %}
+    vpaddd      zmm{{i}}, zmm{{i}}, zmm18
+    vpsravd     zmm{{i}}, zmm{{i}}, zmm17
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_even:
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsravd     zmm21, zmm20, zmm17
+    vpandq      zmm21, zmm21, zmm16
+    vpsubd      zmm21, zmm21, zmm16             // nudge = ((abs >>l shift) & 1) - 1
+    vpaddd      zmm20, zmm20, zmm21
+    vpaddd      zmm20, zmm20, zmm18
+    vpsravd     zmm20, zmm20, zmm17
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_odd:
+{% for i in range(0, 16) %}
+    vpabsd      zmm20, zmm{{i}}
+    vpsravd     zmm21, zmm20, zmm17
+    vpandq      zmm21, zmm21, zmm16
+    vpsubd      zmm21, zmm19, zmm21             // nudge = -((abs >>l shift) & 1)
+    vpaddd      zmm20, zmm20, zmm21
+    vpaddd      zmm20, zmm20, zmm18
+    vpsravd     zmm20, zmm20, zmm17
+    // emulate AVX2 vpsignd (no AVX-512 form): apply sign of original acc.
+    vpxorq      zmm26, zmm26, zmm26
+    vpcmpgtd    k1, zmm26, zmm{{i}}                 // k1 = 1 where acc < 0
+    vpsubd      zmm27, zmm26, zmm20                 // zmm27 = -zmm20
+    vpblendmd   zmm{{i}}{k1}, zmm20, zmm27
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+// ---- Store ---------------------------------------------------------------
+
+{{L}}store:
+    mov     r8,     [rdi + 8]               // c ptr
+    mov     rsi,    [rdi + 16]              // row stride
+    mov     rdx,    [rdi + 24]              // col stride
+    mov     rcx,    [rdi + 32]              // item size
+
+    cmp     rcx, 4
+    je      {{L}}store_strides_i32
+    // else: i8 fallthrough
+
+    cmp     rdx, 1
+    je      {{L}}store_strides_i8_row_contig
+
+    // Generic i8 strided store: per row, per lane scalar byte stores
+    {% for m in range(0, 16) %}
+    mov     r10, r8
+    // Extract from each 128-bit slice of zmm{{m}}
+    vextracti32x4   xmm20, zmm{{m}}, 0
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         byte ptr [r10], bl
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 1
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         byte ptr [r10], bl
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 2
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         byte ptr [r10], bl
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 3
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         byte ptr [r10], bl
+    add         r10, rdx
+    {% endfor %}
+    add     r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i8_row_contig:
+    // Each row is 16 i8 contiguous; one vpmovdb per row.
+    {% for m in range(0, 16) %}
+    vpmovdb     [r8], zmm{{m}}
+    add         r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i32:
+    cmp     rdx, 4
+    je      {{L}}store_strides_i32_row_contig
+
+    // Generic i32 strided store
+    {% for m in range(0, 16) %}
+    mov     r10, r8
+    vextracti32x4   xmm20, zmm{{m}}, 0
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 1
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 2
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    vextracti32x4   xmm20, zmm{{m}}, 3
+    {% for n in range(0, 4) %}
+    vpextrd     ebx, xmm20, {{n}}
+    mov         dword ptr [r10], ebx
+    add         r10, rdx
+    {% endfor %}
+    add     r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i32_row_contig:
+    // C is row-major in memory: each row's 16 i32 are contiguous; one
+    // 64-byte aligned-or-unaligned store per row.
+    {% for m in range(0, 16) %}
+    vmovdqu32   [r8], zmm{{m}}
+    add         r8, rsi
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}return:
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+// ---- Read-only data (RIP-relative) ---------------------------------------
+
+.p2align 6
+{{L}}lane_offsets_64:
+    .int  0,  64, 128, 192, 256, 320, 384, 448
+    .int  512, 576, 640, 704, 768, 832, 896, 960
+
+.p2align 6
+{{L}}lane_indices:
+    .int  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+.p2align 6
+{{L}}all_ones_i32:
+    .int  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
+{% if msvc %}
+avx512vnni_mmm_i32_16x16_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/linalg/x86_64/fma/avx512vnni_mmm_i32_8x8.S.j2 b/linalg/x86_64/fma/avx512vnni_mmm_i32_8x8.S.j2
new file mode 100644
index 0000000000..ed5d4540af
--- /dev/null
+++ b/linalg/x86_64/fma/avx512vnni_mmm_i32_8x8.S.j2
@@ -0,0 +1,676 @@
+{#
+// vim: set syntax=asm :
+
+/* mmm 8x8:
+
+    ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+#}
+
+{% if msvc %}
+
+_text segment
+avx512vnni_mmm_i32_8x8_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avx512vnni_mmm_i32_8x8_{{suffix}}
+{{G}}avx512vnni_mmm_i32_8x8_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+{% include "dispatcher.j2" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     r12,    [rdi + 32]   // packing
+    mov     rbx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rcx,    [rdi + 8]    // k
+    test    rcx,    rcx
+    jz      {{L}}non_linear_loop
+
+    cmp     r12, 1
+    je      {{L}}main_loop_packed_packed_i8i8
+
+{{L}}main_loop_packed_packed:
+    vmovaps         ymm12,  [rax]
+
+    {% for i in range(0, 8) %}
+        vbroadcastss    ymm14, dword ptr [rbx + {{i}} * 4]
+        vpmulld         ymm13, ymm12, ymm14
+        vpaddd          ymm{{i}}, ymm{{i}}, ymm13
+    {% endfor %}
+
+    add             rax,    32
+    add             rbx,    32
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed_i8i8:
+    // PackedI8K4 layout: per K=4 block, the A panel is 8 rows x 4 K-bytes (32
+    // bytes, lane m = A[m, 4kb..4kb+3]) and the B panel is 8 cols x 4 K-bytes
+    // (lane n = B[n, 4kb..4kb+3]). VPDPBUSD is u8 x s8, so A is offset by +128
+    // (-> u8) and the resulting 128*sum_k(B[n]) bias is removed per column after
+    // the loop, leaving the i32 accumulators identical to the AVX2 path.
+
+    add             rcx,    3
+    shr             rcx,    2                      // rcx <- ceil(k/4) K=4 blocks
+
+    mov             r8d,    0x01010101
+    movd            xmm11,  r8d
+    vpbroadcastd    ymm11,  xmm11                  // ymm11 <- u8 ones (sum of B)
+
+    mov             r8d,    0x80808080
+    movd            xmm12,  r8d
+    vpbroadcastd    ymm12,  xmm12                  // ymm12 <- byte 0x80 (A + 128)
+
+    vpxor           ymm10,  ymm10,  ymm10          // ymm10 <- per-col sum_k B[n]
+
+{{L}}loop_4k_i8i8:
+    vmovdqu         ymm8,   [rax]                  // A block: lane m = A[m,4kb..]
+    vpaddb          ymm8,   ymm8,   ymm12          // s8 -> u8 (+128, modular)
+
+    vmovdqu         ymm9,   [rbx]                  // B block: lane n = B[n,4kb..]
+    vpdpbusd        ymm10,  ymm11,  ymm9           // sum_k B[n] += sum_t B[n,4kb+t]
+
+    {% for n in range(0, 8) %}
+    vpbroadcastd    ymm13,  dword ptr [rbx + {{n}} * 4]
+    vpdpbusd        ymm{{n}}, ymm8, ymm13          // acc[n][m] += sum_t (A[m]+128)*B[n]
+    {% endfor %}
+
+    add             rax,    32
+    add             rbx,    32
+    dec             rcx
+    jnz             {{L}}loop_4k_i8i8
+
+    // remove the +128 bias added on A: acc[n] -= 128 * sum_k B[n]
+    vpslld          ymm10,  ymm10,  7              // lane n <- 128 * sum_k B[n]
+    {% for n in range(0, 8) %}
+    mov             r8d,    {{n}}
+    movd            xmm14,  r8d
+    vpbroadcastd    ymm14,  xmm14                  // index = n in every lane
+    vpermd          ymm15,  ymm14,  ymm10          // splat 128*sum_k B[n]
+    vpsubd          ymm{{n}}, ymm{{n}}, ymm15
+    {% endfor %}
+
+    jmp             {{L}}non_linear_loop
+
+{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_scalars.j2" %}
+{% set mr = 8 %}{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_per_rows.j2" %}
+{% set mr = 8 %}{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_per_cols.j2" %}
+{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_load_tile.j2" %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+    mov     r8,     [rdi + 32]          // item size
+
+    cmp     r8,    4
+    je      {{L}}non_linear_addc_i32
+
+{#
+// This is not great as vgatherdps reads 32-bits values and goes beyond our buffer. Probably harmless though.
+// Commented and replaced with the "mov al" loop beyond to pacify valgrind.
+// ymm14 and ymm15 are the same as in the non_linear_addc_i32 case (compute them before the test right above here.
+// {% for i in range(0, 8) %}
+//     vpcmpeqd        ymm15, ymm15, ymm15
+//     vgatherdps      ymm12, [ r10 + ymm14 ], ymm15   // 0xxx 1xxx 2xxx 3xxx 4xxx 5xxx 6xxx 7xxx
+//
+//     // we need to go through vpmovsxbd, shuffling naively erases signs
+//     vpshufb         ymm12, ymm12, ymm10             // 0123 0123 0123 0123 4567 4567 4567 4567
+//
+//     vpermd          ymm12, ymm11, ymm12             // 0123 4567
+//     vpmovsxbd       ymm12, xmm12                    // sign extend
+//
+//     vpaddd          ymm{{i}},   ymm{{i}},   ymm12
+//     add             r10, rbx
+// {% endfor %}
+#}
+
+    {% for col in range(0, 8) %}
+        mov r8, r10
+        {% for half in range(0, 2) %}
+            {% for lane in range(0, 4) %}
+                mov al, [ r8 ]
+                add r8, rsi
+                movsx eax, al
+                pinsrd xmm10, eax, {{lane}}
+            {% endfor %}
+            vperm2f128  ymm10,   ymm10,   ymm10,  1
+        {% endfor %}
+        vpaddd ymm{{col}}, ymm{{col}}, ymm10
+        add r10, rbx
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}non_linear_addc_i32:
+
+    mov     eax,    0
+{% for i in range(0, 4) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+    vpermq          ymm14, ymm14, 78 // 0b01001110
+{% for i in range(0, 4) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+    vpermq          ymm14, ymm14, 78 // 0b01001110
+
+
+{% if msvc %}
+    vpbroadcastd    ymm10, dword ptr [ offset byte_shuffle ]
+    vmovups         ymm11, dword ptr [ offset i128_shuffle ]
+{% else %}
+    vpbroadcastd    ymm10, [ rip + {{L}}byte_shuffle ]
+    vmovups         ymm11, [ rip + {{L}}i128_shuffle ]
+{% endif %}
+
+{% for i in range(0, 8) %}
+    vpcmpeqd        ymm15, ymm15, ymm15
+    vgatherdps      ymm12, [ r10 + ymm14 ], ymm15
+    vpaddd          ymm{{i}},   ymm{{i}},   ymm12
+    add             r10, rbx
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{% if msvc %}
+.data
+byte_shuffle dd              201851904 // 0x0c080400
+i128_shuffle dd              0, 4
+.code
+{% else %}
+{{L}}byte_shuffle: .int            201851904 // 0x0c080400
+{{L}}i128_shuffle: .int            0, 4
+{% endif %}
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         ymm12,  [rax]
+
+{% for i in range(0, 8) %}
+    vbroadcastss    ymm14, dword ptr [rbx + {{ i * 4 }} ]
+    vpmulld         ymm15, ymm12, ymm14
+    vpaddd          ymm{{i}}, ymm{{i}}, ymm15
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale:
+    mov             r8, [ rdi + 16 ]        // policy
+    vbroadcastss    ymm8, dword ptr [rdi + 24] // multi
+
+    mov             rax, 1
+    movq            xmm9, rax
+    vpbroadcastq    ymm9, xmm9              // ymm9 <- 1
+
+    mov             rax, [ rdi + 8 ]        // xmm10 <- shift + 31
+    add             rax, 31
+    movq            xmm10, rax
+    vpbroadcastq    ymm10, xmm10
+
+    mov             rax, 1
+    movq            xmm11, rax
+    vpsubq          ymm12, ymm10, ymm9      // shift+31 - 1
+    vpsllq          ymm11, ymm9, xmm12      // ymm11 <- 1 << (shift + 31 - 1)
+
+    cmp     r8, 1
+    je      {{L}}q_scale_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_scale_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_scale_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_scale_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_scale_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_scale_rounding_odd
+
+    jmp    {{L}}unsupported
+
+{{L}}q_scale_rounding_zero:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsubq      ymm14, ymm14, ymm9
+    vpsubq      ymm15, ymm15, ymm9
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_away:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_minus_inf:           // signum * ( (abs << 32 + 1<<30+shift) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    // sign extract for nudging in the right direction
+    vpxor       ymm13, ymm13, ymm13
+    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros)
+    vpsrld      ymm13, ymm13, 31            // then just 0 or 1
+
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_plus_inf:           // signum * ( (abs << 32 + 1<<30+shift) >> shift )
+
+    vpbroadcastd ymm9, xmm9
+
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpxor       ymm13, ymm13, ymm13
+
+    // sign extract for nudging in the right direction
+    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros)
+    vpaddd      ymm13, ymm13, ymm9          // if val >= 0 { 0i32 } else { 1i32 }
+
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_even:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpsrlq      ymm12, ymm14, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpaddq      ymm14, ymm14, ymm12
+    vpsubq      ymm14, ymm14, ymm9
+
+    vpsrlq      ymm12, ymm15, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpaddq      ymm15, ymm15, ymm12
+    vpsubq      ymm15, ymm15, ymm9
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_odd:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpsrlq      ymm12, ymm14, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrlq      ymm12, ymm15, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_shl:
+    mov             eax, [ rdi + 8 ]        // xmm10 <- -shift (8 times)
+    movd            xmm10, eax
+    vpbroadcastd    ymm10, xmm10
+
+{% for i in range(0, 8) %}
+    vpsllvd     ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr:
+    mov             r8, [ rdi + 16 ]        // policy
+
+    mov             eax, 1
+    movd            xmm9, eax
+    vpbroadcastd    ymm9, xmm9              // ymm9 <- 1u32 (8 times)
+
+    mov             eax, [ rdi + 8 ]        // xmm10 <- shift (8 times)
+    movd            xmm10, eax
+    vpbroadcastd    ymm10, xmm10
+
+    mov             ebx, 1
+    mov             cl, al
+    sub             cl, 1                  // rcx <- shift -1
+    sal             ebx, cl                // rbx <- (1 << (shift - 1))
+    movd            xmm11, ebx
+    vpbroadcastd    ymm11, xmm11            // ymm11 <- "half"
+
+    vpxor           ymm12, ymm12, ymm12     // ymm12 <- zeroes
+
+    cmp     r8, 1
+    je      {{L}}q_shr_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_shr_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_shr_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_shr_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_shr_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_shr_rounding_odd
+
+    jmp    {{L}}unsupported
+
+{{L}}q_shr_rounding_zero:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsubd      ymm14, ymm14, ymm9
+    vpaddd      ymm14, ymm14, ymm11
+    vpsravd     ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_away:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpaddd      ymm14, ymm14, ymm11
+    vpsravd     ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_minus_inf:
+{% for i in range(0, 8) %}
+    vpsubd  ymm{{i}}, ymm{{i}}, ymm9
+    vpaddd  ymm{{i}}, ymm{{i}}, ymm11
+    vpsravd ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_plus_inf:
+{% for i in range(0, 8) %}
+    vpaddd  ymm{{i}}, ymm{{i}}, ymm11
+    vpsravd ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_even:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsravd ymm13, ymm14, ymm10
+    vpand   ymm13, ymm13, ymm9
+    vpsubd  ymm13, ymm13, ymm9          // nudge = ((abs >>l shift) & 0x01) - 1
+    vpaddd  ymm14, ymm14, ymm13         // add nudge
+    vpaddd  ymm14, ymm14, ymm11         // add half
+    vpsravd ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_odd:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsravd ymm13, ymm14, ymm10
+    vpand   ymm13, ymm13, ymm9
+    vpsubd  ymm13, ymm12, ymm13          // nudge = - ((abs >>l shift) & 0x01)
+    vpaddd  ymm14, ymm14, ymm13         // add nudge
+    vpaddd  ymm14, ymm14, ymm11         // add half
+    vpsravd ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rdx,    [rdi + 24]          // col stride
+    mov     rcx,    [rdi + 32]          // item size
+
+    cmp     rcx,    4
+    je      {{L}}store_strides_i32
+
+    {% for col in range(0, 8) %}
+        mov r10, r8
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         byte ptr [r10], bl
+            add         r10, rsi
+        {% endfor %}
+        vperm2f128  ymm{{col}},   ymm{{col}},   ymm{{col}},  1
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         byte ptr [r10], bl
+            add         r10, rsi
+        {% endfor %}
+        add r8, rdx
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i32:
+    {% for col in range(0, 8) %}
+        mov r10,    r8
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         dword ptr [r10], ebx
+            add         r10, rsi
+        {% endfor %}
+        vperm2f128  ymm{{col}},   ymm{{col}},   ymm{{col}},  1
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         dword ptr [r10], ebx
+            add         r10, rsi
+        {% endfor %}
+        add r8, rdx
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}return:
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+
+{{L}}one_32bit:
+{% if msvc %}
+    dd      1
+{% else %}
+    .int    1
+{% endif %}
+
+{% if msvc %}
+avx512vnni_mmm_i32_8x8_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/linalg/x86_64/fma/avxvnni_mmm_i32_8x8.S.j2 b/linalg/x86_64/fma/avxvnni_mmm_i32_8x8.S.j2
new file mode 100644
index 0000000000..904335c511
--- /dev/null
+++ b/linalg/x86_64/fma/avxvnni_mmm_i32_8x8.S.j2
@@ -0,0 +1,685 @@
+{#
+// vim: set syntax=asm :
+
+/* AVX-VNNI int8 GEMM (mmm 8x8), VEX-encoded VPDPBUSD.
+//
+// Body-identical to avx512vnni_mmm_i32_8x8 (same 8-row x 8-col ymm accumulators,
+// same PackedI8K4 inner-K layout, same +128 bias trick to bridge VPDPBUSD's
+// u8 x s8 into the AVX2 s8 x s8 reference). The only difference is that the
+// two VPDPBUSD instructions are prefixed with {vex} so gas emits the AVX-VNNI
+// (VEX) form instead of the AVX-512-VNNI (EVEX) form it defaults to. The VEX
+// form runs on Atom-class cores (Alder Lake E-cores, Sierra Forest, Clearwater
+// Forest / Darkmont) which have AVX-VNNI but no AVX-512, where the existing
+// avx512vnni kernel would fault.
+
+    ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+#}
+
+{% if msvc %}
+
+_text segment
+avxvnni_mmm_i32_8x8_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avxvnni_mmm_i32_8x8_{{suffix}}
+{{G}}avxvnni_mmm_i32_8x8_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+{% include "dispatcher.j2" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     r12,    [rdi + 32]   // packing
+    mov     rbx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rcx,    [rdi + 8]    // k
+    test    rcx,    rcx
+    jz      {{L}}non_linear_loop
+
+    cmp     r12, 1
+    je      {{L}}main_loop_packed_packed_i8i8
+
+{{L}}main_loop_packed_packed:
+    vmovaps         ymm12,  [rax]
+
+    {% for i in range(0, 8) %}
+        vbroadcastss    ymm14, dword ptr [rbx + {{i}} * 4]
+        vpmulld         ymm13, ymm12, ymm14
+        vpaddd          ymm{{i}}, ymm{{i}}, ymm13
+    {% endfor %}
+
+    add             rax,    32
+    add             rbx,    32
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed_i8i8:
+    // PackedI8K4 layout: per K=4 block, the A panel is 8 rows x 4 K-bytes (32
+    // bytes, lane m = A[m, 4kb..4kb+3]) and the B panel is 8 cols x 4 K-bytes
+    // (lane n = B[n, 4kb..4kb+3]). VPDPBUSD is u8 x s8, so A is offset by +128
+    // (-> u8) and the resulting 128*sum_k(B[n]) bias is removed per column after
+    // the loop, leaving the i32 accumulators identical to the AVX2 path.
+
+    add             rcx,    3
+    shr             rcx,    2                      // rcx <- ceil(k/4) K=4 blocks
+
+    mov             r8d,    0x01010101
+    movd            xmm11,  r8d
+    vpbroadcastd    ymm11,  xmm11                  // ymm11 <- u8 ones (sum of B)
+
+    mov             r8d,    0x80808080
+    movd            xmm12,  r8d
+    vpbroadcastd    ymm12,  xmm12                  // ymm12 <- byte 0x80 (A + 128)
+
+    vpxor           ymm10,  ymm10,  ymm10          // ymm10 <- per-col sum_k B[n]
+
+{{L}}loop_4k_i8i8:
+    vmovdqu         ymm8,   [rax]                  // A block: lane m = A[m,4kb..]
+    vpaddb          ymm8,   ymm8,   ymm12          // s8 -> u8 (+128, modular)
+
+    vmovdqu         ymm9,   [rbx]                  // B block: lane n = B[n,4kb..]
+    {vex} vpdpbusd  ymm10, ymm11,  ymm9           // sum_k B[n] += sum_t B[n,4kb+t]
+
+    {% for n in range(0, 8) %}
+    vpbroadcastd    ymm13,  dword ptr [rbx + {{n}} * 4]
+    {vex} vpdpbusd  ymm{{n}}, ymm8, ymm13          // acc[n][m] += sum_t (A[m]+128)*B[n]
+    {% endfor %}
+
+    add             rax,    32
+    add             rbx,    32
+    dec             rcx
+    jnz             {{L}}loop_4k_i8i8
+
+    // remove the +128 bias added on A: acc[n] -= 128 * sum_k B[n]
+    vpslld          ymm10,  ymm10,  7              // lane n <- 128 * sum_k B[n]
+    {% for n in range(0, 8) %}
+    mov             r8d,    {{n}}
+    movd            xmm14,  r8d
+    vpbroadcastd    ymm14,  xmm14                  // index = n in every lane
+    vpermd          ymm15,  ymm14,  ymm10          // splat 128*sum_k B[n]
+    vpsubd          ymm{{n}}, ymm{{n}}, ymm15
+    {% endfor %}
+
+    jmp             {{L}}non_linear_loop
+
+{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_scalars.j2" %}
+{% set mr = 8 %}{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_per_rows.j2" %}
+{% set mr = 8 %}{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_i32_per_cols.j2" %}
+{% set from = 0 %}{% set to = 7 %}{% include "fma_mmm_load_tile.j2" %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+    mov     r8,     [rdi + 32]          // item size
+
+    cmp     r8,    4
+    je      {{L}}non_linear_addc_i32
+
+{#
+// This is not great as vgatherdps reads 32-bits values and goes beyond our buffer. Probably harmless though.
+// Commented and replaced with the "mov al" loop beyond to pacify valgrind.
+// ymm14 and ymm15 are the same as in the non_linear_addc_i32 case (compute them before the test right above here.
+// {% for i in range(0, 8) %}
+//     vpcmpeqd        ymm15, ymm15, ymm15
+//     vgatherdps      ymm12, [ r10 + ymm14 ], ymm15   // 0xxx 1xxx 2xxx 3xxx 4xxx 5xxx 6xxx 7xxx
+//
+//     // we need to go through vpmovsxbd, shuffling naively erases signs
+//     vpshufb         ymm12, ymm12, ymm10             // 0123 0123 0123 0123 4567 4567 4567 4567
+//
+//     vpermd          ymm12, ymm11, ymm12             // 0123 4567
+//     vpmovsxbd       ymm12, xmm12                    // sign extend
+//
+//     vpaddd          ymm{{i}},   ymm{{i}},   ymm12
+//     add             r10, rbx
+// {% endfor %}
+#}
+
+    {% for col in range(0, 8) %}
+        mov r8, r10
+        {% for half in range(0, 2) %}
+            {% for lane in range(0, 4) %}
+                mov al, [ r8 ]
+                add r8, rsi
+                movsx eax, al
+                pinsrd xmm10, eax, {{lane}}
+            {% endfor %}
+            vperm2f128  ymm10,   ymm10,   ymm10,  1
+        {% endfor %}
+        vpaddd ymm{{col}}, ymm{{col}}, ymm10
+        add r10, rbx
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}non_linear_addc_i32:
+
+    mov     eax,    0
+{% for i in range(0, 4) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+    vpermq          ymm14, ymm14, 78 // 0b01001110
+{% for i in range(0, 4) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+    vpermq          ymm14, ymm14, 78 // 0b01001110
+
+
+{% if msvc %}
+    vpbroadcastd    ymm10, dword ptr [ offset byte_shuffle ]
+    vmovups         ymm11, dword ptr [ offset i128_shuffle ]
+{% else %}
+    vpbroadcastd    ymm10, [ rip + {{L}}byte_shuffle ]
+    vmovups         ymm11, [ rip + {{L}}i128_shuffle ]
+{% endif %}
+
+{% for i in range(0, 8) %}
+    vpcmpeqd        ymm15, ymm15, ymm15
+    vgatherdps      ymm12, [ r10 + ymm14 ], ymm15
+    vpaddd          ymm{{i}},   ymm{{i}},   ymm12
+    add             r10, rbx
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{% if msvc %}
+.data
+byte_shuffle dd              201851904 // 0x0c080400
+i128_shuffle dd              0, 4
+.code
+{% else %}
+{{L}}byte_shuffle: .int            201851904 // 0x0c080400
+{{L}}i128_shuffle: .int            0, 4
+{% endif %}
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         ymm12,  [rax]
+
+{% for i in range(0, 8) %}
+    vbroadcastss    ymm14, dword ptr [rbx + {{ i * 4 }} ]
+    vpmulld         ymm15, ymm12, ymm14
+    vpaddd          ymm{{i}}, ymm{{i}}, ymm15
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale:
+    mov             r8, [ rdi + 16 ]        // policy
+    vbroadcastss    ymm8, dword ptr [rdi + 24] // multi
+
+    mov             rax, 1
+    movq            xmm9, rax
+    vpbroadcastq    ymm9, xmm9              // ymm9 <- 1
+
+    mov             rax, [ rdi + 8 ]        // xmm10 <- shift + 31
+    add             rax, 31
+    movq            xmm10, rax
+    vpbroadcastq    ymm10, xmm10
+
+    mov             rax, 1
+    movq            xmm11, rax
+    vpsubq          ymm12, ymm10, ymm9      // shift+31 - 1
+    vpsllq          ymm11, ymm9, xmm12      // ymm11 <- 1 << (shift + 31 - 1)
+
+    cmp     r8, 1
+    je      {{L}}q_scale_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_scale_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_scale_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_scale_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_scale_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_scale_rounding_odd
+
+    jmp    {{L}}unsupported
+
+{{L}}q_scale_rounding_zero:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsubq      ymm14, ymm14, ymm9
+    vpsubq      ymm15, ymm15, ymm9
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_away:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_minus_inf:           // signum * ( (abs << 32 + 1<<30+shift) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    // sign extract for nudging in the right direction
+    vpxor       ymm13, ymm13, ymm13
+    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros)
+    vpsrld      ymm13, ymm13, 31            // then just 0 or 1
+
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_plus_inf:           // signum * ( (abs << 32 + 1<<30+shift) >> shift )
+
+    vpbroadcastd ymm9, xmm9
+
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpxor       ymm13, ymm13, ymm13
+
+    // sign extract for nudging in the right direction
+    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros)
+    vpaddd      ymm13, ymm13, ymm9          // if val >= 0 { 0i32 } else { 1i32 }
+
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_even:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpsrlq      ymm12, ymm14, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpaddq      ymm14, ymm14, ymm12
+    vpsubq      ymm14, ymm14, ymm9
+
+    vpsrlq      ymm12, ymm15, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpaddq      ymm15, ymm15, ymm12
+    vpsubq      ymm15, ymm15, ymm9
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_odd:           // signum * ( (abs + nudge) >> shift )
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpsrlq      ymm12, ymm14, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrlq      ymm12, ymm15, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_shl:
+    mov             eax, [ rdi + 8 ]        // xmm10 <- -shift (8 times)
+    movd            xmm10, eax
+    vpbroadcastd    ymm10, xmm10
+
+{% for i in range(0, 8) %}
+    vpsllvd     ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr:
+    mov             r8, [ rdi + 16 ]        // policy
+
+    mov             eax, 1
+    movd            xmm9, eax
+    vpbroadcastd    ymm9, xmm9              // ymm9 <- 1u32 (8 times)
+
+    mov             eax, [ rdi + 8 ]        // xmm10 <- shift (8 times)
+    movd            xmm10, eax
+    vpbroadcastd    ymm10, xmm10
+
+    mov             ebx, 1
+    mov             cl, al
+    sub             cl, 1                  // rcx <- shift -1
+    sal             ebx, cl                // rbx <- (1 << (shift - 1))
+    movd            xmm11, ebx
+    vpbroadcastd    ymm11, xmm11            // ymm11 <- "half"
+
+    vpxor           ymm12, ymm12, ymm12     // ymm12 <- zeroes
+
+    cmp     r8, 1
+    je      {{L}}q_shr_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_shr_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_shr_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_shr_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_shr_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_shr_rounding_odd
+
+    jmp    {{L}}unsupported
+
+{{L}}q_shr_rounding_zero:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsubd      ymm14, ymm14, ymm9
+    vpaddd      ymm14, ymm14, ymm11
+    vpsravd     ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_away:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpaddd      ymm14, ymm14, ymm11
+    vpsravd     ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_minus_inf:
+{% for i in range(0, 8) %}
+    vpsubd  ymm{{i}}, ymm{{i}}, ymm9
+    vpaddd  ymm{{i}}, ymm{{i}}, ymm11
+    vpsravd ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_plus_inf:
+{% for i in range(0, 8) %}
+    vpaddd  ymm{{i}}, ymm{{i}}, ymm11
+    vpsravd ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_even:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsravd ymm13, ymm14, ymm10
+    vpand   ymm13, ymm13, ymm9
+    vpsubd  ymm13, ymm13, ymm9          // nudge = ((abs >>l shift) & 0x01) - 1
+    vpaddd  ymm14, ymm14, ymm13         // add nudge
+    vpaddd  ymm14, ymm14, ymm11         // add half
+    vpsravd ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_odd:
+{% for i in range(0, 8) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsravd ymm13, ymm14, ymm10
+    vpand   ymm13, ymm13, ymm9
+    vpsubd  ymm13, ymm12, ymm13          // nudge = - ((abs >>l shift) & 0x01)
+    vpaddd  ymm14, ymm14, ymm13         // add nudge
+    vpaddd  ymm14, ymm14, ymm11         // add half
+    vpsravd ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rdx,    [rdi + 24]          // col stride
+    mov     rcx,    [rdi + 32]          // item size
+
+    cmp     rcx,    4
+    je      {{L}}store_strides_i32
+
+    {% for col in range(0, 8) %}
+        mov r10, r8
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         byte ptr [r10], bl
+            add         r10, rsi
+        {% endfor %}
+        vperm2f128  ymm{{col}},   ymm{{col}},   ymm{{col}},  1
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         byte ptr [r10], bl
+            add         r10, rsi
+        {% endfor %}
+        add r8, rdx
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i32:
+    {% for col in range(0, 8) %}
+        mov r10,    r8
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         dword ptr [r10], ebx
+            add         r10, rsi
+        {% endfor %}
+        vperm2f128  ymm{{col}},   ymm{{col}},   ymm{{col}},  1
+        {% for row in range(0, 4) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         dword ptr [r10], ebx
+            add         r10, rsi
+        {% endfor %}
+        add r8, rdx
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}return:
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+
+{{L}}one_32bit:
+{% if msvc %}
+    dd      1
+{% else %}
+    .int    1
+{% endif %}
+
+{% if msvc %}
+avxvnni_mmm_i32_8x8_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/metal/src/kernels/array/array_ops.metal b/metal/src/kernels/array/array_ops.metal
index 151822ea04..2a5353c3d0 100644
--- a/metal/src/kernels/array/array_ops.metal
+++ b/metal/src/kernels/array/array_ops.metal
@@ -241,6 +241,101 @@ typedef decltype(rotate_half_nd2<float>) rotate_half_nd2_t;
         "array_ops::rotate_half_nd2_" #tname)]] [[kernel]] rotate_half_nd2_t   \
         rotate_half_nd2<type>;
 
+// Diagonal gather (Transformer-XL rel-pos skew, folded):
+//   out[..., i, k] = in[..., i, offset + k - i], 0 on out-of-bounds.
+// Leading axes are flattened by the host into one batch axis.  Each thread
+// owns one (b, i, k) output element.
+//
+// params layout: [offset, t_q, r_in, out_len,
+//                 in_stride_b, in_stride_i, in_stride_r,
+//                 out_stride_b, out_stride_i, out_stride_k]
+template <typename T>
+[[kernel]] void diag_gather(device const void *input_b [[buffer(0)]],
+                            device void *output_b [[buffer(1)]],
+                            constant const int32_t *params [[buffer(2)]],
+                            uint3 tpig [[thread_position_in_grid]]) {
+    const int32_t k = (int32_t)tpig.x;
+    const int32_t i = (int32_t)tpig.y;
+    const int32_t b = (int32_t)tpig.z;
+
+    const int32_t offset = params[0];
+    const int32_t t_q = params[1];
+    const int32_t r_in = params[2];
+    const int32_t out_len = params[3];
+    const int32_t in_stride_b = params[4];
+    const int32_t in_stride_i = params[5];
+    const int32_t in_stride_r = params[6];
+    const int32_t out_stride_b = params[7];
+    const int32_t out_stride_i = params[8];
+    const int32_t out_stride_k = params[9];
+
+    if (k >= out_len || i >= t_q)
+        return;
+
+    device const T *input = (device const T *)input_b;
+    device T *output = (device T *)output_b;
+
+    const int32_t out_idx = b * out_stride_b + i * out_stride_i + k * out_stride_k;
+    const int32_t r = offset + k - i;
+    if (r >= 0 && r < r_in) {
+        const int32_t in_idx = b * in_stride_b + i * in_stride_i + r * in_stride_r;
+        output[out_idx] = input[in_idx];
+    } else {
+        output[out_idx] = (T)0;
+    }
+}
+
+typedef decltype(diag_gather<float>) diag_gather_t;
+
+#define INSTANTIATE_DIAG_GATHER(tname, type)                                   \
+    template [[host_name(                                                      \
+        "array_ops::diag_gather_" #tname)]] [[kernel]] diag_gather_t           \
+        diag_gather<type>;
+
+// Gather along one axis:
+//   out[i_pre, i_n, i_post] = data[i_pre, indices[i_n], i_post]
+// where the host flattens to (pre × a_size × post) for data and
+// (pre × n_indices × post) for output.  Negative indices wrap with `a_size`,
+// matching the CPU contract.
+//
+// params layout: [pre, a_size, post, n_indices]
+template <typename T>
+[[kernel]] void gather(device const void *data_b [[buffer(0)]],
+                       device const void *indices_b [[buffer(1)]],
+                       device void *output_b [[buffer(2)]],
+                       constant const int32_t *params [[buffer(3)]],
+                       uint3 tpig [[thread_position_in_grid]]) {
+    const int32_t i_post = (int32_t)tpig.x;
+    const int32_t i_n = (int32_t)tpig.y;
+    const int32_t i_pre = (int32_t)tpig.z;
+
+    const int32_t pre = params[0];
+    const int32_t a_size = params[1];
+    const int32_t post = params[2];
+    const int32_t n_indices = params[3];
+
+    if (i_post >= post || i_n >= n_indices || i_pre >= pre)
+        return;
+
+    device const T *data = (device const T *)data_b;
+    device const long *indices = (device const long *)indices_b;
+    device T *output = (device T *)output_b;
+
+    long k = indices[i_n];
+    if (k < 0)
+        k += a_size;
+
+    const long in_off = ((long)i_pre * a_size + k) * post + i_post;
+    const long out_off = ((long)i_pre * n_indices + i_n) * post + i_post;
+    output[out_off] = data[in_off];
+}
+
+typedef decltype(gather<float>) gather_t;
+
+#define INSTANTIATE_GATHER(tname, type)                                        \
+    template [[host_name(                                                      \
+        "array_ops::gather_" #tname)]] [[kernel]] gather_t gather<type>;
+
 // Copy kernels: only u8/u16/u32/u64 (copy is type-size based)
 INSTANTIATE_COPY(u8, uint8_t)
 INSTANTIATE_COPY(u16, uint16_t)
@@ -276,3 +371,11 @@ INSTANTIATE_CAST_FROM(u64, uint64_t)
 // Rotate half: only float types
 INSTANTIATE_ROTATE_HALF_OP(f32, float)
 INSTANTIATE_ROTATE_HALF_OP(f16, half)
+
+// Diagonal gather: f32 and f16 only.
+INSTANTIATE_DIAG_GATHER(f32, float)
+INSTANTIATE_DIAG_GATHER(f16, half)
+
+// Axis Gather: f32 and f16 only (indices are int64).
+INSTANTIATE_GATHER(f32, float)
+INSTANTIATE_GATHER(f16, half)
diff --git a/metal/src/kernels/array/diag_gather.rs b/metal/src/kernels/array/diag_gather.rs
new file mode 100644
index 0000000000..04a5b588b9
--- /dev/null
+++ b/metal/src/kernels/array/diag_gather.rs
@@ -0,0 +1,194 @@
+use crate::encoder::EncoderExt;
+use crate::{LibraryName, MetalStream};
+use anyhow::{Context, ensure};
+use metal::MTLSize;
+use std::fmt;
+use tract_core::internal::*;
+use tract_gpu::tensor::DeviceTensor;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct DiagGather;
+
+impl fmt::Display for DiagGather {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl DiagGather {
+    pub fn is_supported_dt(dt: DatumType) -> bool {
+        matches!(dt, DatumType::F32 | DatumType::F16)
+    }
+
+    pub fn kernel_name(&self, dt: DatumType) -> TractResult<String> {
+        ensure!(Self::is_supported_dt(dt), "Unsupported dt {:?} for metal diag_gather op", dt);
+        let tname = DeviceTensor::tname(dt)?;
+        Ok(format!("array_ops::diag_gather_{tname}"))
+    }
+
+    pub fn eval(
+        &self,
+        stream: &MetalStream,
+        input: &DeviceTensor,
+        offset: i64,
+        out_len: usize,
+    ) -> TractResult<DeviceTensor> {
+        let rank = input.rank();
+        ensure!(rank >= 2);
+        let mut out_shape: TVec<usize> = input.shape().into();
+        out_shape[rank - 1] = out_len;
+        let output = unsafe { DeviceTensor::uninitialized_dt(input.datum_type(), &out_shape)? };
+        self.dispatch_eval(stream, input, offset, out_len, &output)?;
+        stream.wait_until_completed()?;
+        Ok(output)
+    }
+
+    pub fn dispatch_eval(
+        &self,
+        stream: &MetalStream,
+        input: &DeviceTensor,
+        offset: i64,
+        out_len: usize,
+        output: &DeviceTensor,
+    ) -> TractResult<()> {
+        stream.retain_tensor(input);
+        stream.retain_tensor(output);
+
+        let rank = input.rank();
+        ensure!(rank >= 2);
+        ensure!(output.rank() == rank);
+        ensure!(output.datum_type() == input.datum_type());
+        let in_shape = input.shape();
+        let out_shape = output.shape();
+        ensure!(in_shape[..rank - 2] == out_shape[..rank - 2]);
+        ensure!(in_shape[rank - 2] == out_shape[rank - 2]);
+        ensure!(out_shape[rank - 1] == out_len);
+
+        let offset_i32: i32 = offset.try_into().context("DiagGather offset overflows i32")?;
+        let out_len_i32: i32 = out_len.try_into().context("DiagGather out_len overflows i32")?;
+
+        // Flatten the (rank-2) leading axes into one batch axis.  Assumes the
+        // leading block is plain row-major (encoder use: rank-4 BxHxTxR with
+        // natural strides), so the batch stride is `t_q * (R or out_len)`.
+        let in_strides = input.strides();
+        let out_strides = output.strides();
+        let batch: usize = in_shape[..rank - 2].iter().product();
+        let t_q = in_shape[rank - 2];
+        let r_in = in_shape[rank - 1];
+        let in_stride_b: i32 = if rank >= 3 { (t_q * r_in) as i32 } else { 0 };
+        let in_stride_i = in_strides[rank - 2] as i32;
+        let in_stride_r = in_strides[rank - 1] as i32;
+        let out_stride_b: i32 = if rank >= 3 { (t_q * out_len) as i32 } else { 0 };
+        let out_stride_i = out_strides[rank - 2] as i32;
+        let out_stride_k = out_strides[rank - 1] as i32;
+
+        let params: [i32; 10] = [
+            offset_i32,
+            t_q as i32,
+            r_in as i32,
+            out_len_i32,
+            in_stride_b,
+            in_stride_i,
+            in_stride_r,
+            out_stride_b,
+            out_stride_i,
+            out_stride_k,
+        ];
+
+        let pipeline =
+            stream.load_pipeline(LibraryName::ArrayOps, &self.kernel_name(input.datum_type())?)?;
+        let command_buffer = stream.command_buffer();
+        command_buffer.encode(|encoder| {
+            encoder.set_compute_pipeline_state(&pipeline);
+            encoder.set_metal_tensor(0, input, metal::MTLResourceUsage::Read);
+            encoder.set_metal_tensor(1, output, metal::MTLResourceUsage::Write);
+            encoder.set_slice(2, &params);
+            let grid_size = MTLSize { width: out_len as _, height: t_q as _, depth: batch as _ };
+            let group_size = MTLSize { width: 1, height: 1, depth: 1 };
+            encoder.dispatch_thread_groups(grid_size, group_size);
+        });
+        Ok(())
+    }
+}
+
+pub fn metal_diag_gather_dispatch(
+    input: &DeviceTensor,
+    offset: i64,
+    out_len: usize,
+    output: &DeviceTensor,
+) -> TractResult<()> {
+    crate::with_metal_stream(|stream| {
+        DiagGather.dispatch_eval(stream, input, offset, out_len, output)
+    })
+}
+
+crate::register_metal_op!(tract_transformers::ops::diag_gather::DiagGather, |source, node, op| {
+    rule_if!(DiagGather::is_supported_dt(source.node_input_facts(node.id)?[0].datum_type));
+    Ok(Some(Box::new(tract_gpu::ops::diag_gather::GpuDiagGather::new(
+        op.offset.clone(),
+        op.out_len.clone(),
+        "Metal",
+        metal_diag_gather_dispatch,
+    ))))
+});
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::utils::with_borrowed_metal_stream;
+    use tract_core::internal::Tensor;
+    use tract_core::plan::TurnState;
+    use tract_gpu::tensor::IntoDevice;
+    use tract_transformers::ops::diag_gather as cpu_dg;
+
+    fn run_against_cpu(shape: &[usize], offset: i64, out_len: usize) -> TractResult<()> {
+        with_borrowed_metal_stream(|stream| {
+            let len: usize = shape.iter().product();
+            let data: Vec<f32> = (0..len).map(|i| i as f32).collect();
+            let cpu_in = Tensor::from_shape(shape, &data)?;
+            let metal_in = cpu_in.clone().into_device()?;
+
+            let cpu_op =
+                cpu_dg::DiagGather { offset: (offset as i64).to_dim(), out_len: out_len.to_dim() };
+            let session = TurnState::default();
+            let cpu_out = cpu_op.eval_with_session(0, &session, tvec![cpu_in.into_tvalue()])?[0]
+                .clone()
+                .into_tensor();
+            let metal_out = DiagGather.eval(stream, &metal_in, offset, out_len)?;
+            cpu_out
+                .close_enough(&metal_out.to_host()?.into_tensor(), Approximation::Exact)
+                .with_context(|| format!("shape={shape:?} offset={offset} out_len={out_len}"))
+        })
+    }
+
+    #[test]
+    fn test_diag_gather_skew_basic() -> TractResult<()> {
+        let t = 4;
+        run_against_cpu(&[2, t, 2 * t - 1], (t - 1) as i64, t)
+    }
+
+    #[test]
+    fn test_diag_gather_rank4_encoder_like() -> TractResult<()> {
+        let t = 14;
+        run_against_cpu(&[1, 8, t, 2 * t - 1], (t - 1) as i64, t)
+    }
+
+    #[test]
+    fn test_diag_gather_out_of_bounds_zero_fill() -> TractResult<()> {
+        let r = 5;
+        let t = 4;
+        run_against_cpu(&[1, t, r], 1, 8)
+    }
+
+    #[test]
+    fn test_diag_gather_partial_overlap() -> TractResult<()> {
+        let t = 4;
+        let r = 6;
+        run_against_cpu(&[1, t, r], 0, t)
+    }
+
+    #[test]
+    fn test_diag_gather_rank2() -> TractResult<()> {
+        run_against_cpu(&[5, 9], 4, 5)
+    }
+}
diff --git a/metal/src/kernels/array/gather.rs b/metal/src/kernels/array/gather.rs
new file mode 100644
index 0000000000..ccb224d1cf
--- /dev/null
+++ b/metal/src/kernels/array/gather.rs
@@ -0,0 +1,183 @@
+use crate::encoder::EncoderExt;
+use crate::{LibraryName, MetalStream};
+use anyhow::ensure;
+use metal::MTLSize;
+use std::fmt;
+use tract_core::internal::*;
+use tract_gpu::tensor::DeviceTensor;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct Gather;
+
+impl fmt::Display for Gather {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl Gather {
+    pub fn is_supported_dt(dt: DatumType) -> bool {
+        matches!(dt, DatumType::F32 | DatumType::F16)
+    }
+
+    pub fn kernel_name(&self, dt: DatumType) -> TractResult<String> {
+        ensure!(Self::is_supported_dt(dt), "Unsupported dt {:?} for metal gather op", dt);
+        let tname = DeviceTensor::tname(dt)?;
+        Ok(format!("array_ops::gather_{tname}"))
+    }
+
+    pub fn eval(
+        &self,
+        stream: &MetalStream,
+        data: &DeviceTensor,
+        indices: &DeviceTensor,
+        axis: usize,
+    ) -> TractResult<DeviceTensor> {
+        ensure!(data.rank() > axis);
+        let mut out_shape: TVec<usize> = data.shape()[..axis].into();
+        out_shape.extend(indices.shape().iter().copied());
+        out_shape.extend(data.shape()[axis + 1..].iter().copied());
+        let output = unsafe { DeviceTensor::uninitialized_dt(data.datum_type(), &out_shape)? };
+        self.dispatch_eval(stream, data, indices, axis, &output)?;
+        stream.wait_until_completed()?;
+        Ok(output)
+    }
+
+    pub fn dispatch_eval(
+        &self,
+        stream: &MetalStream,
+        data: &DeviceTensor,
+        indices: &DeviceTensor,
+        axis: usize,
+        output: &DeviceTensor,
+    ) -> TractResult<()> {
+        stream.retain_tensor(data);
+        stream.retain_tensor(indices);
+        stream.retain_tensor(output);
+
+        ensure!(data.rank() > axis);
+        ensure!(indices.datum_type() == i64::datum_type());
+        ensure!(output.datum_type() == data.datum_type());
+
+        let data_shape = data.shape();
+        let pre: usize = data_shape[..axis].iter().product();
+        let a_size: usize = data_shape[axis];
+        let post: usize = data_shape[axis + 1..].iter().product();
+        let n_indices: usize = indices.shape().iter().product();
+
+        let expected: usize = pre * n_indices * post;
+        ensure!(
+            output.shape().iter().product::<usize>() == expected,
+            "Gather output shape mismatch: data={:?} axis={} indices={:?} output={:?}",
+            data_shape,
+            axis,
+            indices.shape(),
+            output.shape()
+        );
+
+        let params: [i32; 4] = [pre as i32, a_size as i32, post as i32, n_indices as i32];
+
+        let pipeline =
+            stream.load_pipeline(LibraryName::ArrayOps, &self.kernel_name(data.datum_type())?)?;
+        let command_buffer = stream.command_buffer();
+        command_buffer.encode(|encoder| {
+            encoder.set_compute_pipeline_state(&pipeline);
+            encoder.set_metal_tensor(0, data, metal::MTLResourceUsage::Read);
+            encoder.set_metal_tensor(1, indices, metal::MTLResourceUsage::Read);
+            encoder.set_metal_tensor(2, output, metal::MTLResourceUsage::Write);
+            encoder.set_slice(3, &params);
+            let grid_size = MTLSize { width: post as _, height: n_indices as _, depth: pre as _ };
+            let group_size = MTLSize { width: 1, height: 1, depth: 1 };
+            encoder.dispatch_thread_groups(grid_size, group_size);
+        });
+        Ok(())
+    }
+}
+
+pub fn metal_gather_dispatch(
+    data: &DeviceTensor,
+    indices: &DeviceTensor,
+    axis: usize,
+    output: &DeviceTensor,
+) -> TractResult<()> {
+    crate::with_metal_stream(|stream| Gather.dispatch_eval(stream, data, indices, axis, output))
+}
+
+crate::register_metal_op!(tract_core::ops::array::Gather, |source, node, op| {
+    let facts = source.node_input_facts(node.id)?;
+    rule_if!(facts[0].is_plain());
+    rule_if!(Gather::is_supported_dt(facts[0].datum_type));
+    rule_if!(facts[1].datum_type == i64::datum_type());
+    rule_if!(op.output_type.is_none() || op.output_type == Some(facts[0].datum_type));
+    Ok(Some(Box::new(tract_gpu::ops::gather::GpuGather::new(
+        op.axis,
+        "Metal",
+        metal_gather_dispatch,
+    ))))
+});
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::utils::with_borrowed_metal_stream;
+    use tract_core::internal::Tensor;
+    use tract_core::ops::array::Gather as CpuGather;
+    use tract_gpu::tensor::IntoDevice;
+
+    fn run_against_cpu(
+        data_shape: &[usize],
+        indices_shape: &[usize],
+        indices_data: &[i64],
+        axis: usize,
+    ) -> TractResult<()> {
+        with_borrowed_metal_stream(|stream| {
+            let n: usize = data_shape.iter().product();
+            let data = Tensor::from_shape(
+                data_shape,
+                &(0..n).map(|i| i as f32 / 10.0).collect::<Vec<_>>(),
+            )?;
+            let indices = Tensor::from_shape(indices_shape, indices_data)?;
+            let metal_data = data.clone().into_device()?;
+            let metal_indices = indices.clone().into_device()?;
+
+            let cpu_op = CpuGather::new(axis);
+            let cpu_out = cpu_op.eval(tvec![data.into_tvalue(), indices.into_tvalue()])?[0]
+                .clone()
+                .into_tensor();
+            let metal_out = Gather.eval(stream, &metal_data, &metal_indices, axis)?;
+            cpu_out
+                .close_enough(&metal_out.to_host()?.into_tensor(), Approximation::Exact)
+                .with_context(|| {
+                    format!(
+                        "data={data_shape:?} indices={indices_shape:?} axis={axis} \
+                         indices_data={indices_data:?}"
+                    )
+                })
+        })
+    }
+
+    #[test]
+    fn test_gather_embedding() -> TractResult<()> {
+        run_against_cpu(&[1025, 640], &[1, 1], &[42], 0)
+    }
+
+    #[test]
+    fn test_gather_embedding_multi() -> TractResult<()> {
+        run_against_cpu(&[100, 16], &[2, 3], &[0, 1, 99, 50, 25, 7], 0)
+    }
+
+    #[test]
+    fn test_gather_axis_1() -> TractResult<()> {
+        run_against_cpu(&[3, 10, 4], &[2], &[0, 9], 1)
+    }
+
+    #[test]
+    fn test_gather_negative_indices() -> TractResult<()> {
+        run_against_cpu(&[100, 4], &[3], &[-1, -100, -50], 0)
+    }
+
+    #[test]
+    fn test_gather_scalar_index() -> TractResult<()> {
+        run_against_cpu(&[5, 8], &[], &[3], 0)
+    }
+}
diff --git a/metal/src/kernels/array/mod.rs b/metal/src/kernels/array/mod.rs
index 3d2690fc5d..d9a51b1dc2 100644
--- a/metal/src/kernels/array/mod.rs
+++ b/metal/src/kernels/array/mod.rs
@@ -1,12 +1,18 @@
 mod cast;
 mod copy;
+mod diag_gather;
 mod dispatch;
+mod gather;
 mod rotate_half;
 
 pub use cast::Cast;
 pub use cast::metal_cast_dispatch;
 pub use copy::Memcpy;
+pub use diag_gather::DiagGather;
+pub use diag_gather::metal_diag_gather_dispatch;
 pub use dispatch::metal_copy_nd_dispatch;
+pub use gather::Gather;
+pub use gather::metal_gather_dispatch;
 pub use rotate_half::RotateHalf;
 pub use rotate_half::metal_rotate_half_dispatch;
 
@@ -32,5 +38,17 @@ pub fn all_functions() -> Vec<String> {
             .flat_map(|(dt1, dt2)| Cast.kernel_name(dt1, dt2).into_iter()),
     );
 
+    functions.extend(
+        tract_gpu::tensor::DeviceTensor::SUPPORTED_DT
+            .into_iter()
+            .flat_map(|dt| DiagGather.kernel_name(dt).into_iter()),
+    );
+
+    functions.extend(
+        tract_gpu::tensor::DeviceTensor::SUPPORTED_DT
+            .into_iter()
+            .flat_map(|dt| Gather.kernel_name(dt).into_iter()),
+    );
+
     functions.into_iter().collect()
 }
diff --git a/metal/src/kernels/nn/mod.rs b/metal/src/kernels/nn/mod.rs
index 270dd7a674..9ff4b13ea8 100644
--- a/metal/src/kernels/nn/mod.rs
+++ b/metal/src/kernels/nn/mod.rs
@@ -40,11 +40,11 @@ pub fn all_functions() -> Vec<String> {
             .flat_map(|dt| Softmax.kernel_name(dt).into_iter()),
     );
 
-    functions.extend(
-        tract_gpu::tensor::DeviceTensor::SUPPORTED_DT
+    functions.extend(tract_gpu::tensor::DeviceTensor::SUPPORTED_DT.into_iter().flat_map(|dt| {
+        [false, true]
             .into_iter()
-            .flat_map(|dt| ScaledMaskedSoftmax.kernel_name(dt).into_iter()),
-    );
+            .flat_map(move |mb| ScaledMaskedSoftmax.kernel_name(dt, mb).into_iter())
+    }));
 
     functions.extend(
         tract_gpu::tensor::DeviceTensor::SUPPORTED_DT
diff --git a/metal/src/kernels/nn/nn_ops.metal b/metal/src/kernels/nn/nn_ops.metal
index f67d08f201..5d27078345 100644
--- a/metal/src/kernels/nn/nn_ops.metal
+++ b/metal/src/kernels/nn/nn_ops.metal
@@ -495,6 +495,126 @@ template [[host_name("nn_ops::scaled_masked_softmax_nd5_"
                      "f16")]] [[kernel]] scaled_masked_softmax_nd5_t
     scaled_masked_softmax_nd5<half>;
 
+// Bool-mask variant: mask is uchar (0/1).  Masked positions are substituted
+// with -inf before softmax (so exp(-inf)=0 naturally zeroes them in the
+// output).  When post_mask is non-zero, fully-masked rows — whose softmax
+// would otherwise be NaN — are written as 0 instead.
+template <typename F>
+[[kernel]] void scaled_bool_masked_softmax_nd5(
+    device const void *input_b, device const void *mask_b,
+    constant float *scale_b, device void *output_b,
+    constant uint *post_mask_b, constant const size_t shape[5],
+    constant const size_t strides[5],
+    constant const size_t mask_strides[5],
+    constant const size_t out_strides[5],
+
+    uint3 tgpig [[threadgroup_position_in_grid]],
+    uint tiisg [[thread_index_in_simdgroup]],
+    uint tpsg [[threads_per_simdgroup]],
+    uint3 tptg [[thread_position_in_threadgroup]],
+    uint3 tptgN [[threads_per_threadgroup]],
+
+    threadgroup float *tgmem [[threadgroup(0)]]) {
+
+    const uint tid = tptg.x;
+    const uint tg_sz = tptgN.x;
+    const uint sg_id = tid / tpsg;
+    const uint lane = tiisg;
+
+    const size_t row = (size_t)tgpig.x;
+    const size_t h = (size_t)tgpig.y;
+    const size_t z = (size_t)tgpig.z;
+    const size_t z0 = z / shape[1];
+    const size_t z1 = z % shape[1];
+
+    device const F *x = (device const F *)input_b;
+    device const uchar *mask = (device const uchar *)mask_b;
+    device F *out = (device F *)output_b;
+
+    const float scale = *scale_b;
+    const bool post_mask = *post_mask_b != 0;
+
+    x += row * strides[3] + h * strides[2] + z1 * strides[1] + z0 * strides[0];
+    mask += row * mask_strides[3] + h * mask_strides[2] +
+            z1 * mask_strides[1] + z0 * mask_strides[0];
+    out += row * out_strides[3] + h * out_strides[2] + z1 * out_strides[1] +
+           z0 * out_strides[0];
+
+    threadgroup float *buf_iw = tgmem;
+    threadgroup float *vals = tgmem + 32;
+
+    const uint simd_size = tpsg;
+    const uint num_sg = (tg_sz + simd_size - 1u) / simd_size;
+    const size_t cols = shape[4];
+
+    // 1) Substitute -inf at masked positions, then take row max
+    float max_val = -INFINITY;
+    for (size_t col = (size_t)tid; col < cols; col += (size_t)tg_sz) {
+        const bool m = mask[col * mask_strides[4]] != 0;
+        const float xv = (float)x[col * strides[4]] * scale;
+        const float v = m ? xv : -INFINITY;
+        vals[col] = v;
+        max_val = metal::max(max_val, v);
+    }
+
+    float sg_max = simd_max(max_val);
+    if (lane == 0) {
+        buf_iw[sg_id] = sg_max;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (sg_id == 0) {
+        float x0 = (lane < num_sg) ? buf_iw[lane] : -INFINITY;
+        float block_max = simd_max(x0);
+        if (lane == 0)
+            buf_iw[0] = block_max;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    max_val = buf_iw[0];
+
+    // 2) exp(vals - max) and row sum
+    float sum = 0.0f;
+    for (size_t col = (size_t)tid; col < cols; col += (size_t)tg_sz) {
+        float e = exp(vals[col] - max_val);
+        vals[col] = e;
+        sum += e;
+    }
+
+    float sg_sum = simd_sum(sum);
+    if (lane == 0) {
+        buf_iw[sg_id] = sg_sum;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (sg_id == 0) {
+        float x0 = (lane < num_sg) ? buf_iw[lane] : 0.0f;
+        float block_sum = simd_sum(x0);
+        if (lane == 0)
+            buf_iw[0] = block_sum;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    sum = buf_iw[0];
+
+    // Row-uniform: sum <= 0 (or NaN) iff every position was masked.  With
+    // post_mask we write 0 in that case to scrub the NaN; otherwise we let
+    // 1/sum propagate.
+    const bool zero_row = post_mask && !(sum > 0.0f);
+    const float inv_sum = 1.0f / sum;
+
+    for (size_t col = (size_t)tid; col < cols; col += (size_t)tg_sz) {
+        float y = zero_row ? 0.0f : vals[col] * inv_sum;
+        out[col * out_strides[4]] = (F)y;
+    }
+}
+
+typedef decltype(scaled_bool_masked_softmax_nd5<float>)
+    scaled_bool_masked_softmax_nd5_t;
+
+template [[host_name("nn_ops::scaled_bool_masked_softmax_nd5_"
+                     "f32")]] [[kernel]] scaled_bool_masked_softmax_nd5_t
+    scaled_bool_masked_softmax_nd5<float>;
+template [[host_name("nn_ops::scaled_bool_masked_softmax_nd5_"
+                     "f16")]] [[kernel]] scaled_bool_masked_softmax_nd5_t
+    scaled_bool_masked_softmax_nd5<half>;
+
 constant float GELU_COEF_A = 0.044715f;
 constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 
diff --git a/metal/src/kernels/nn/scaled_masked_softmax.rs b/metal/src/kernels/nn/scaled_masked_softmax.rs
index 5ce0323d6d..89b7c9f263 100644
--- a/metal/src/kernels/nn/scaled_masked_softmax.rs
+++ b/metal/src/kernels/nn/scaled_masked_softmax.rs
@@ -14,14 +14,23 @@ impl ScaledMaskedSoftmax {
         matches!(dt, DatumType::F32 | DatumType::F16)
     }
 
-    pub fn kernel_name(&self, dt: DatumType) -> TractResult<String> {
+    pub fn is_supported_mask_dt(input_dt: DatumType, mask_dt: DatumType) -> bool {
+        mask_dt == input_dt || mask_dt == bool::datum_type()
+    }
+
+    pub fn kernel_name(&self, dt: DatumType, mask_is_bool: bool) -> TractResult<String> {
         ensure!(
             Self::is_supported_dt(dt),
-            "Unsupported dt {:?} for metal scaled masked softmaxop",
+            "Unsupported dt {:?} for metal scaled masked softmax op",
             dt
         );
         let tname = DeviceTensor::tname(dt)?;
-        Ok(format!("nn_ops::scaled_masked_softmax_nd5_{tname}"))
+        let stem = if mask_is_bool {
+            "scaled_bool_masked_softmax_nd5"
+        } else {
+            "scaled_masked_softmax_nd5"
+        };
+        Ok(format!("nn_ops::{stem}_{tname}"))
     }
 
     pub fn eval(
@@ -30,9 +39,10 @@ impl ScaledMaskedSoftmax {
         input: &DeviceTensor,
         scale: &Tensor,
         mask: &DeviceTensor,
+        post_softmax_mask: bool,
     ) -> TractResult<DeviceTensor> {
         let output = unsafe { DeviceTensor::uninitialized_dt(input.datum_type(), input.shape())? };
-        self.dispatch_eval(stream, input, scale, mask, &output)?;
+        self.dispatch_eval(stream, input, scale, mask, post_softmax_mask, &output)?;
         stream.wait_until_completed()?;
         Ok(output)
     }
@@ -43,6 +53,7 @@ impl ScaledMaskedSoftmax {
         input: &DeviceTensor,
         scale: &Tensor,
         mask: &DeviceTensor,
+        post_softmax_mask: bool,
         output: &DeviceTensor,
     ) -> TractResult<()> {
         stream.retain_tensor(input);
@@ -54,7 +65,10 @@ impl ScaledMaskedSoftmax {
         ensure!(input.rank() <= 5);
         ensure!(mask.rank() == input.rank());
         ensure!(output.datum_type() == input.datum_type());
-        ensure!(mask.datum_type() == input.datum_type());
+        let mask_is_bool = mask.datum_type() == bool::datum_type();
+        ensure!(Self::is_supported_mask_dt(input.datum_type(), mask.datum_type()));
+        // post_softmax_mask is meaningful only with a bool mask (CPU contract).
+        ensure!(!post_softmax_mask || mask_is_bool);
         let scale = scale.cast_to::<f32>()?;
 
         let shape = pad(input.shape(), 1);
@@ -72,8 +86,10 @@ impl ScaledMaskedSoftmax {
         let tg_floats = 32 + inner_len;
         let tg_bytes = tg_floats * f32::datum_type().size_of();
 
-        let pipeline =
-            stream.load_pipeline(LibraryName::NNOps, &self.kernel_name(input.datum_type())?)?;
+        let pipeline = stream.load_pipeline(
+            LibraryName::NNOps,
+            &self.kernel_name(input.datum_type(), mask_is_bool)?,
+        )?;
 
         let command_buffer = stream.command_buffer();
         command_buffer.encode(|encoder| {
@@ -82,10 +98,18 @@ impl ScaledMaskedSoftmax {
             encoder.set_metal_tensor(1, mask, metal::MTLResourceUsage::Read);
             encoder.set_tensor(2, &scale);
             encoder.set_metal_tensor(3, output, metal::MTLResourceUsage::Write);
-            encoder.set_slice(4, &shape);
-            encoder.set_slice(5, &strides);
-            encoder.set_slice(6, &mask_strides);
-            encoder.set_slice(7, &out_strides);
+            // Bool-mask kernel takes a `post_mask` flag at slot 4; the
+            // float-mask kernel doesn't, so slots shift down by one.
+            let next_slot = if mask_is_bool {
+                encoder.set_slice(4, &[post_softmax_mask as u32]);
+                5
+            } else {
+                4
+            };
+            encoder.set_slice(next_slot, &shape);
+            encoder.set_slice(next_slot + 1, &strides);
+            encoder.set_slice(next_slot + 2, &mask_strides);
+            encoder.set_slice(next_slot + 3, &out_strides);
             encoder.set_threadgroup_memory_length(0, tg_bytes as _);
             let grid_size = MTLSize {
                 width: shape[3] as _,
@@ -111,22 +135,27 @@ pub fn metal_scaled_masked_softmax_dispatch(
     input: &DeviceTensor,
     scale: &Tensor,
     mask: &DeviceTensor,
+    post_softmax_mask: bool,
     output: &DeviceTensor,
 ) -> TractResult<()> {
     crate::with_metal_stream(|stream| {
-        ScaledMaskedSoftmax.dispatch_eval(stream, input, scale, mask, output)
+        ScaledMaskedSoftmax.dispatch_eval(stream, input, scale, mask, post_softmax_mask, output)
     })
 }
 
 crate::register_metal_op!(
     tract_transformers::ops::scaled_masked_softmax::ScaledMaskedSoftmax,
     |source, node, op| {
-        rule_if!(!op.post_softmax_mask);
-        rule_if!(ScaledMaskedSoftmax::is_supported_dt(
-            source.node_input_facts(node.id)?[0].datum_type
+        let facts = source.node_input_facts(node.id)?;
+        rule_if!(ScaledMaskedSoftmax::is_supported_dt(facts[0].datum_type));
+        rule_if!(ScaledMaskedSoftmax::is_supported_mask_dt(
+            facts[0].datum_type,
+            facts[1].datum_type,
         ));
+        rule_if!(!op.post_softmax_mask || facts[1].datum_type == bool::datum_type());
         Ok(Some(Box::new(tract_gpu::ops::scaled_masked_softmax::GpuScaledMaskedSoftmax::new(
             op.scale.clone(),
+            op.post_softmax_mask,
             "Metal",
             metal_scaled_masked_softmax_dispatch,
         ))))
@@ -171,7 +200,7 @@ mod tests {
                 .eval(tvec![a.to_host()?.into_tvalue(), mask.to_host()?.into_tvalue()])?[0]
                 .clone()
                 .into_tensor();
-            let metal_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask)?;
+            let metal_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask, false)?;
             cpu_output
                 .close_enough(&metal_output.to_host()?.into_tensor(), Approximation::Approximate)?;
             Ok(())
@@ -201,13 +230,63 @@ mod tests {
                 .eval(tvec![a.to_host()?.into_tvalue(), mask.to_host()?.into_tvalue()])?[0]
                 .clone()
                 .into_tensor();
-            let metal_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask)?;
+            let metal_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask, false)?;
             cpu_output
                 .close_enough(&metal_output.to_host()?.into_tensor(), Approximation::Approximate)?;
             Ok(())
         })
     }
 
+    /// Bool-mask path with a fully-masked row.  Without post_softmax_mask
+    /// the output is NaN (matches CPU); with it on, the NaN is scrubbed to 0.
+    #[test]
+    fn test_scaled_bool_masked_softmax_post_mask_scrubs_nan() -> TractResult<()> {
+        with_borrowed_metal_stream(|stream| {
+            let m = 3;
+            let n = 5;
+            let scale: Arc<_> = tensor0(0.125f32).into();
+            // Row 0: fully masked.  Row 1: partially masked.  Row 2: fully unmasked.
+            let mask_data: Vec<bool> = (0..m)
+                .flat_map(|r| {
+                    (0..n).map(move |c| match r {
+                        0 => false,
+                        1 => c >= 2,
+                        _ => true,
+                    })
+                })
+                .collect();
+            let mask = Tensor::from_shape(&[1, 1, m, n], &mask_data)?.into_device()?;
+            let a = Tensor::from_shape(
+                &[1, 1, m, n],
+                &(0..m * n).map(|f| f as f32).collect::<Vec<_>>(),
+            )?
+            .into_device()?;
+
+            for post in [false, true] {
+                let cpu = scaled_masked_softmax::ScaledMaskedSoftmax {
+                    scale: scale.clone(),
+                    post_softmax_mask: post,
+                };
+                let cpu_out = cpu
+                    .eval(tvec![a.to_host()?.into_tvalue(), mask.to_host()?.into_tvalue()])?[0]
+                    .clone()
+                    .into_tensor();
+                let metal_out = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask, post)?;
+                let metal_host = metal_out.to_host()?.into_tensor();
+                let cpu_slice = cpu_out.view().as_slice::<f32>().unwrap();
+                let metal_slice = metal_host.view().as_slice::<f32>().unwrap();
+                for (i, (c, g)) in cpu_slice.iter().zip(metal_slice.iter()).enumerate() {
+                    if c.is_nan() {
+                        assert!(g.is_nan(), "post={post} idx={i}: cpu NaN, metal {g}");
+                    } else {
+                        assert!((c - g).abs() < 1e-5, "post={post} idx={i}: cpu {c} metal {g}");
+                    }
+                }
+            }
+            Ok(())
+        })
+    }
+
     proptest::proptest! {
         #[test]
         fn scaled_masked_softmax_prop_f32(pb in any::<ScaledMaskedSoftmaxProblem<f32>>()) {
@@ -300,7 +379,7 @@ mod tests {
                 let mask =
                     Tensor::from_shape(self.mask_shape.as_slice(), &self.mask)?.into_device()?;
                 let scale: Arc<_> = tensor0::<F>(0.125f32.as_()).into();
-                let metal_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask)?;
+                let metal_output = ScaledMaskedSoftmax.eval(stream, &a, &scale, &mask, false)?;
                 Ok(metal_output.to_host()?.into_tensor())
             })
         }
diff --git a/metal/src/rewrite_rules/fuse_axis_op.rs b/metal/src/rewrite_rules/fuse_axis_op.rs
index 465b69928d..c5b4b0c9b5 100644
--- a/metal/src/rewrite_rules/fuse_axis_op.rs
+++ b/metal/src/rewrite_rules/fuse_axis_op.rs
@@ -107,9 +107,7 @@ pub fn fuse_axis_op(
 
     let node_name = &node.name;
 
-    let Some(in_nodes) = model.all_prec(node.id)? else {
-        return Ok(None);
-    };
+    rule_if_some!(in_nodes = model.all_prec(node.id)?);
 
     let mut grouped_axis_ops: TVec<TVec<GpuAxisOp>> = tvec![];
     let mut tap_inputs = tvec![];
@@ -200,7 +198,7 @@ pub fn fuse_move_axis(
     }
 
     // Fuse consecutive MoveAxis if possible
-    let Some(cursor) = model.single_succ(axis_node.id)? else { return Ok(None) };
+    rule_if_some!(cursor = model.single_succ(axis_node.id)?);
     if let (AxisOp::Move(from_1, to_1), AxisOp::Move(from_2, to_2)) = (
         axis_op.inner.clone(),
         cursor.op_as::<GpuAxisOp>().map(|ax_op| ax_op.inner.clone()).unwrap_or(AxisOp::Add(0)),
@@ -225,7 +223,7 @@ pub fn fuse_move_axis(
     }
 
     // Add(x) -> Move(x, y)
-    let Some(cursor) = model.single_prec(axis_node.id)? else { return Ok(None) };
+    rule_if_some!(cursor = model.single_prec(axis_node.id)?);
     if let (AxisOp::Move(from_1, to_1), AxisOp::Add(ax)) = (
         axis_op.inner.clone(),
         cursor.op_as::<GpuAxisOp>().map(|ax_op| ax_op.inner.clone()).unwrap_or(AxisOp::Rm(0)),
diff --git a/metal/src/transform.rs b/metal/src/transform.rs
index 36cf53e4cc..468ffb4208 100644
--- a/metal/src/transform.rs
+++ b/metal/src/transform.rs
@@ -167,9 +167,7 @@ fn try_make_metal_op(
     });
 
     let input_facts = source.node_input_facts(node.id)?;
-    if !input_facts.iter().all(|f| DeviceTensor::is_supported_dt(f.datum_type)) {
-        return Ok(None);
-    }
+    rule_if!(input_facts.iter().all(|f| DeviceTensor::is_supported_dt(f.datum_type)));
 
     // Copy-based ops are fully generic (no backend-specific dispatch needed).
     if let Some(op) = tract_gpu::ops::copy_based::try_make_copy_based_op(source, node)? {
@@ -234,8 +232,37 @@ impl Translate<TypedFact, Box<dyn TypedOp>, TypedFact, Box<dyn TypedOp>> for Met
             }
         }
 
-        // Single-op translation
-        if let Some(gpu_op) = try_make_metal_op(source, node)? {
+        // Single-op translation.  See the matching CUDA path for rationale:
+        // pre-check the gpu_op's output_facts against the already-translated
+        // target-side input shapes before wiring, so a stale Reshape (e.g.
+        // after pulsification has changed an upstream axis size) falls back
+        // to CPU rather than aborting the whole Metal transform.
+        let target_inputs: TVec<TypedFact> = node
+            .inputs
+            .iter()
+            .map(|i| target.outlet_fact(mapping[i]).map(|f| f.clone()))
+            .collect::<TractResult<_>>()?;
+        // Mirror sync_inputs_if_required(ToDevice): wrap non-device facts as
+        // device facts so the GPU op's `output_facts` sees uniform device
+        // inputs, matching what it'll receive after sync nodes are wired.
+        // Mixed inputs (e.g. host kv-cache + device current activation) make
+        // `output_facts` bail with "Inconsistent facts", wrongly tripping CPU
+        // fallback.
+        let target_inputs_post_sync: TVec<TypedFact> = target_inputs
+            .iter()
+            .map(|f| -> TractResult<TypedFact> {
+                if f.as_device_fact().is_some() {
+                    Ok(f.clone())
+                } else {
+                    Ok(tract_gpu::fact::DeviceFact::from_host(f.clone())?.into_exotic_fact())
+                }
+            })
+            .collect::<TractResult<_>>()?;
+        let target_input_post_sync_refs: TVec<&TypedFact> =
+            target_inputs_post_sync.iter().collect();
+        if let Some(gpu_op) = try_make_metal_op(source, node)?
+            && gpu_op.output_facts(&target_input_post_sync_refs).is_ok()
+        {
             let device_inputs =
                 sync_inputs_if_required(target, node, mapping, DeviceSyncKind::ToDevice)?;
             let outlet_ids = target.wire_node(node.name.clone(), gpu_op, &device_inputs)?;
diff --git a/nnef/nnef-resources/tests/test_json_resource.rs b/nnef/nnef-resources/tests/test_json_resource.rs
index f1ee16da68..c8dbdbacbe 100644
--- a/nnef/nnef-resources/tests/test_json_resource.rs
+++ b/nnef/nnef-resources/tests/test_json_resource.rs
@@ -4,7 +4,6 @@ use tract_nnef_resources::internal::JsonLoader;
 #[test]
 fn load_model_with_json_resource() -> TractResult<()> {
     let model = tract_nnef::nnef()
-        .with_tract_core()
         .with_tract_resource()
         .with_resource_loader(JsonLoader)
         .model_for_path("tests/nnef_with_json")?;
diff --git a/nnef/src/ast/dump_doc.rs b/nnef/src/ast/dump_doc.rs
index 5e4cd0de13..522df6c4a6 100644
--- a/nnef/src/ast/dump_doc.rs
+++ b/nnef/src/ast/dump_doc.rs
@@ -82,7 +82,7 @@ mod test {
     #[test]
     fn doc_example() -> TractResult<()> {
         let d = TempDir::new()?;
-        let nnef = crate::nnef().with_tract_core().with_tract_resource();
+        let nnef = crate::nnef().with_tract_resource();
         DocDumper::to_directory(d.path(), &nnef)?;
         Ok(())
     }
diff --git a/nnef/src/ast/parse.rs b/nnef/src/ast/parse.rs
index 4309af3086..68bd486fa5 100644
--- a/nnef/src/ast/parse.rs
+++ b/nnef/src/ast/parse.rs
@@ -348,7 +348,7 @@ fn rvalue(i: &str) -> R<'_, RValue> {
     bin!(exp, sub, tag("^"));
     bin!(mul, exp, one_of("*/"));
     bin!(add, mul, one_of("+-"));
-    bin!(comp, add, alt((tag("=="), tag("!="), tag("<"), tag(">"), tag("<="), tag(">="))));
+    bin!(comp, add, alt((tag("=="), tag("!="), tag("<="), tag(">="), tag("<"), tag(">"))));
     bin!(boolean, comp, alt((tag("||"), tag("&&"))));
     bin!(in_for, boolean, tag("in"));
 
@@ -794,6 +794,20 @@ mod test {
         p(rvalue, "scalar(2 ^ (bits - 1) - integer(symmetric) if signed else 0)");
     }
 
+    #[test]
+    fn test_rvalue_comparison_operators() {
+        p(rvalue, "a == b");
+        p(rvalue, "a != b");
+        p(rvalue, "a < b");
+        p(rvalue, "a > b");
+        p(rvalue, "a <= b");
+        p(rvalue, "a >= b");
+        p(rvalue, "a<=b");
+        p(rvalue, "a>=b");
+        p(rvalue, "(a + b) <= (c + d)");
+        p(rvalue, "a >= b && c <= d");
+    }
+
     #[test]
     fn test_comprehenion() {
         p(comprehension_expr, "[for i in range_of(output_size) yield output_size * sampling_rate]");
diff --git a/nnef/src/deser.rs b/nnef/src/deser.rs
index c7786899e7..a060f4ec8f 100644
--- a/nnef/src/deser.rs
+++ b/nnef/src/deser.rs
@@ -358,7 +358,7 @@ impl ResolvedInvocation<'_> {
     where
         T: CoerceFrom<Value>,
     {
-        let Some(rv) = self.get_named_arg(name) else { return Ok(None) };
+        rule_if_some!(rv = self.get_named_arg(name));
         let v = rv
             .resolve(builder, &[])
             .with_context(|| format!("Resolving argument `{name}' ({rv:?})"))?;
@@ -419,7 +419,7 @@ impl ResolvedInvocation<'_> {
     where
         T: CoerceFrom<Value>,
     {
-        let Some(rv) = self.get_named_arg(name) else { return Ok(None) };
+        rule_if_some!(rv = self.get_named_arg(name));
         let v = rv
             .resolve(builder, &[])
             .with_context(|| format!("Resolving argument `{name}' ({rv:?})"))?;
diff --git a/nnef/src/framework.rs b/nnef/src/framework.rs
index dad241a859..edf819073a 100644
--- a/nnef/src/framework.rs
+++ b/nnef/src/framework.rs
@@ -27,7 +27,7 @@ impl Default for Nnef {
     fn default() -> Nnef {
         Nnef {
             stdlib: stdlib(),
-            registries: vec![crate::ops::tract_nnef()],
+            registries: vec![crate::ops::tract_nnef(), crate::ops::tract_core()],
             resource_loaders: vec![
                 GraphNnefLoader.into_boxed(),
                 DatLoader.into_boxed(),
@@ -52,12 +52,12 @@ impl Nnef {
         self
     }
 
-    pub fn enable_tract_core(&mut self) {
-        self.registries.push(crate::ops::tract_core());
+    pub fn disable_tract_core(&mut self) {
+        self.registries.retain(|r| r.id.0 != "tract_core");
     }
 
-    pub fn with_tract_core(mut self) -> Self {
-        self.registries.push(crate::ops::tract_core());
+    pub fn without_tract_core(mut self) -> Self {
+        self.disable_tract_core();
         self
     }
 
diff --git a/nnef/src/ops/core/einsum.rs b/nnef/src/ops/core/einsum.rs
index 5b40e732b9..43fc6831a5 100644
--- a/nnef/src/ops/core/einsum.rs
+++ b/nnef/src/ops/core/einsum.rs
@@ -1,10 +1,12 @@
 use crate::internal::*;
 use crate::ser::*;
 use tract_core::ops::einsum::EinSum;
+use tract_core::ops::einsum::einsum_matmul::EinSumMatMul;
 use tract_core::tract_data::itertools::Itertools;
 
 pub fn register(registry: &mut Registry) {
     registry.register_dumper(ser);
+    registry.register_dumper(ser_matmul);
     registry.register_primitive(
         "tract_core_einsum",
         &parameters(),
@@ -45,11 +47,26 @@ pub fn parameters_q() -> Vec<Parameter> {
 }
 
 pub fn ser(ast: &mut IntoAst, node: &TypedNode, op: &EinSum) -> TractResult<Option<Arc<RValue>>> {
-    if op.q_params.is_some() { ser_einsum_q(ast, node) } else { ser_einsum(ast, node) }
+    if op.q_params.is_some() { ser_einsum_q(ast, node, op) } else { ser_einsum(ast, node, op) }
 }
 
-pub fn ser_einsum(ast: &mut IntoAst, node: &TypedNode) -> TractResult<Option<Arc<RValue>>> {
-    let einsum = node.op_as::<EinSum>().unwrap();
+pub fn ser_matmul(
+    ast: &mut IntoAst,
+    node: &TypedNode,
+    op: &EinSumMatMul,
+) -> TractResult<Option<Arc<RValue>>> {
+    if op.op.q_params.is_some() {
+        ser_einsum_q(ast, node, &op.op)
+    } else {
+        ser_einsum(ast, node, &op.op)
+    }
+}
+
+pub fn ser_einsum(
+    ast: &mut IntoAst,
+    node: &TypedNode,
+    einsum: &EinSum,
+) -> TractResult<Option<Arc<RValue>>> {
     let inputs: Vec<_> = node.inputs.iter().map(|i| (*ast.mapping[i]).clone()).collect();
     Ok(Some(invocation(
         "tract_core_einsum",
@@ -62,8 +79,11 @@ pub fn ser_einsum(ast: &mut IntoAst, node: &TypedNode) -> TractResult<Option<Arc
     )))
 }
 
-pub fn ser_einsum_q(ast: &mut IntoAst, node: &TypedNode) -> TractResult<Option<Arc<RValue>>> {
-    let einsum = node.op_as::<EinSum>().unwrap();
+pub fn ser_einsum_q(
+    ast: &mut IntoAst,
+    node: &TypedNode,
+    einsum: &EinSum,
+) -> TractResult<Option<Arc<RValue>>> {
     let inputs = node.inputs.iter().map(|i| (*ast.mapping[i]).clone()).collect_vec();
     Ok(Some(invocation(
         "tract_core_einsum_q",
diff --git a/nnef/src/ops/core/qconv.rs b/nnef/src/ops/core/qconv.rs
index 215d14af0e..46be937980 100644
--- a/nnef/src/ops/core/qconv.rs
+++ b/nnef/src/ops/core/qconv.rs
@@ -34,6 +34,7 @@ fn qconv_parameters() -> Vec<Parameter> {
         TypeName::Scalar.spec().named("b_scale"),
         TypeName::Integer.spec().named("c0"),
         TypeName::Scalar.spec().named("c_scale"),
+        TypeName::String.spec().named("output_dt").default(""),
     ]
 }
 
@@ -50,6 +51,7 @@ fn qconv_unary_dump(
     for (ix, name) in ["b0", "b_scale", "a0", "a_scale", "c0", "c_scale"].iter().enumerate() {
         named_args.push((name, (*ast.mapping[&node.inputs[3 + ix]]).clone()));
     }
+    named_args.push(("output_dt", crate::ser::datum_type(node.outputs[0].fact.datum_type)));
 
     let wire = ast.mapping[&node.inputs[0]].clone();
     ensure!(op.kernel_fmt == KernelFormat::OIHW);
@@ -88,16 +90,22 @@ fn qconv_load(builder: &mut ModelBuilder, invocation: &ResolvedInvocation) -> Tr
     qparams.swap(1, 3);
     inputs.extend(qparams.iter().cloned());
 
-    let Some(c0) = &builder.model.outlet_fact(qparams[4])?.konst else {
-        bail!("For quantized convolution, output quantization must be static");
+    let output_dt = if let Ok(s) = invocation.named_arg_as::<String>(builder, "output_dt")
+        && !s.is_empty()
+    {
+        s.parse::<DatumType>()?
+    } else {
+        let Some(c0) = &builder.model.outlet_fact(qparams[4])?.konst else {
+            bail!("For quantized convolution, output quantization must be static");
+        };
+        let Some(c_scale) = &builder.model.outlet_fact(qparams[5])?.konst else {
+            bail!("For quantized convolution, output quantization must be static");
+        };
+        input_fact.datum_type.with_qparams(QParams::ZpScale {
+            zero_point: c0.cast_to_scalar()?,
+            scale: c_scale.cast_to_scalar()?,
+        })
     };
-    let Some(c_scale) = &builder.model.outlet_fact(qparams[5])?.konst else {
-        bail!("For quantized convolution, output quantization must be static");
-    };
-    let output_dt = input_fact.datum_type.with_qparams(QParams::ZpScale {
-        zero_point: c0.cast_to_scalar()?,
-        scale: c_scale.cast_to_scalar()?,
-    });
 
     let op: Box<dyn TypedOp> =
         Box::new(Conv::new(pool_spec, KernelFormat::OIHW, group, Some(output_dt)));
diff --git a/nnef/src/ops/core/scan.rs b/nnef/src/ops/core/scan.rs
index 5d8ef1ba5b..a95b413470 100644
--- a/nnef/src/ops/core/scan.rs
+++ b/nnef/src/ops/core/scan.rs
@@ -43,6 +43,7 @@ pub fn register(registry: &mut Registry) {
             .named("output"),
             TypeName::Integer.spec().named("skip").default(0), // needed for pulse
             TypeName::Integer.spec().named("reset_every_turn").default(0), // needed for pulse
+            TypeName::Integer.spec().named("external_state").default(0),
         ],
         &[("outputs", TypeName::Scalar.tensor().array())],
         de_scan,
@@ -129,6 +130,7 @@ fn ser_scan(ast: &mut IntoAst, node: &TypedNode, op: &Scan) -> TractResult<Optio
             ("output", array(outputs)),
             ("skip", numeric(op.skip)),
             ("reset_every_turn", numeric(op.reset_every_turn)),
+            ("external_state", numeric(op.external_state)),
         ],
     );
     ast.fragments.insert(body.decl.id.clone(), body);
@@ -248,6 +250,7 @@ fn de_scan(builder: &mut ModelBuilder, invocation: &ResolvedInvocation) -> Tract
     let skip: usize = invocation.named_arg_as(builder, "skip")?;
     let mut op = Scan::new(body.model, input_mapping, output_mapping, skip)?;
     op.reset_every_turn = invocation.named_arg_as(builder, "reset_every_turn")?;
+    op.external_state = invocation.named_arg_as(builder, "external_state")?;
     builder.wire(op, &outer_inputs)
 }
 
diff --git a/nnef/src/ops/nnef/deser.rs b/nnef/src/ops/nnef/deser.rs
index 0ff9cb0663..2d6c6e3014 100644
--- a/nnef/src/ops/nnef/deser.rs
+++ b/nnef/src/ops/nnef/deser.rs
@@ -892,6 +892,26 @@ pub fn unstack(builder: &mut ModelBuilder, invocation: &ResolvedInvocation) -> T
         .map(Value::from)
 }
 
+// fragment copy<?>( x: tensor<?> ) -> ( y: tensor<?> );
+//
+// Identity on the value, but graph.quant may declare a different
+// quantization for the output named tensor, in which case the operator is
+// the spec-compliant way to express a requantization. If declared output
+// type matches the input, return the input outlet unchanged (no node
+// added); otherwise wire a `Cast`, which performs real byte-level
+// requantization for any (zp, scale) change between two quantized
+// variants of the same physical type.
+pub fn copy(builder: &mut ModelBuilder, invocation: &ResolvedInvocation) -> TractResult<Value> {
+    let input: OutletId = invocation.named_arg_as(builder, "x")?;
+    if let Some(Some(to)) = invocation.dt_from_quant_file.first().copied() {
+        let in_dt = builder.model.outlet_fact(input)?.datum_type;
+        if to != in_dt {
+            return builder.wire(cast(to), &[input]);
+        }
+    }
+    Ok(Value::Wire(input))
+}
+
 /*
  * fragment softmax( x: tensor<scalar>, axes: integer[] = [1] ) -> ( y: tensor<scalar> )
  * {
diff --git a/nnef/src/ops/nnef/mod.rs b/nnef/src/ops/nnef/mod.rs
index f640ea3a52..d3984c081e 100644
--- a/nnef/src/ops/nnef/mod.rs
+++ b/nnef/src/ops/nnef/mod.rs
@@ -45,6 +45,8 @@ pub fn tract_nnef() -> Registry {
     primitive(&mut registry, "stack", deser::stack);
     primitive(&mut registry, "unstack", deser::unstack);
 
+    primitive(&mut registry, "copy", deser::copy);
+
     registry.register_binary("add", &ops::math::Add {});
     registry.register_binary("sub", &ops::math::Sub {});
     registry.register_binary("mul", &ops::math::Mul {});
diff --git a/nnef/src/ops/nnef/ser.rs b/nnef/src/ops/nnef/ser.rs
index 85102dd349..a085a479c0 100644
--- a/nnef/src/ops/nnef/ser.rs
+++ b/nnef/src/ops/nnef/ser.rs
@@ -266,6 +266,9 @@ pub fn conv(
     node: &TypedNode,
     op: &ops::cnn::conv::Conv,
 ) -> TractResult<Option<Arc<RValue>>> {
+    if op.q_params.is_some() && !node.outputs[0].fact.datum_type.is_quantized() {
+        return Ok(None);
+    }
     conv_like(ast, node, &op.pool_spec, op.group, false, None)
 }
 
@@ -470,9 +473,7 @@ pub fn softmax(
     node: &TypedNode,
     op: &ops::nn::Softmax,
 ) -> TractResult<Option<Arc<RValue>>> {
-    if op.kind != SoftmaxKind::default() {
-        return Ok(None);
-    }
+    rule_if!(op.kind == SoftmaxKind::default());
     let litteral_axes: Vec<_> = op.axes.iter().map(|&it| (it as i64).into()).collect();
     Ok(Some(invocation(
         "softmax",
@@ -502,9 +503,7 @@ pub fn rewrite_matmul_to_same_rank(
 ) -> TractResult<Option<TypedModelPatch>> {
     let a_rank = block_quant_aware_input_shape(model.outlet_fact(node.inputs[0])?)?.len();
     let b_rank = block_quant_aware_input_shape(model.outlet_fact(node.inputs[1])?)?.len();
-    if a_rank == b_rank {
-        return Ok(None);
-    }
+    rule_if!(a_rank != b_rank);
     let mut patch = TypedModelPatch::default();
     let mut inputs = patch.taps(model, &node.inputs)?;
     for i in a_rank..a_rank.max(b_rank) {
@@ -529,7 +528,7 @@ pub fn rewrite_consistent_quantized_conv(
 ) -> TractResult<Option<TypedModelPatch>> {
     let facts = model.node_input_facts(node.id)?;
     if facts.len() > 3 {
-        ensure!(facts[3..9].iter().all(|fact| fact.konst.is_some()));
+        rule_if!(facts[3..9].iter().all(|fact| fact.konst.is_some()));
         for ix in [0, 1] {
             let fact = model.outlet_fact(node.inputs[ix])?;
             if !fact.datum_type.is_quantized() {
@@ -553,3 +552,37 @@ pub fn rewrite_consistent_quantized_conv(
     }
     Ok(None)
 }
+
+pub fn rewrite_same_lower_conv_to_explicit(
+    _ctx: &(),
+    model: &TypedModel,
+    node: &TypedNode,
+    _name: &str,
+    op: &Conv,
+) -> TractResult<Option<TypedModelPatch>> {
+    use tract_core::ops::cnn::PaddingSpec;
+    rule_if!(op.pool_spec.padding == PaddingSpec::SameLower);
+    let input_fact = model.outlet_fact(node.inputs[0])?;
+    let input_shape = op.pool_spec.data_format.shape(input_fact.shape.to_tvec())?;
+    let spatial_dims = input_shape.hw_dims();
+    let rank = op.pool_spec.rank();
+    let mut before = TVec::new();
+    let mut after = TVec::new();
+    for i in 0..rank {
+        let input_dim = spatial_dims[i]
+            .to_usize()
+            .with_context(|| format!("SameLower conv input dim {i} is not concrete"))?;
+        let computed = PaddingSpec::SameLower.compute_one(
+            i,
+            &input_dim,
+            op.pool_spec.kernel_shape[i],
+            op.pool_spec.dilation(i),
+            op.pool_spec.stride(i),
+        );
+        before.push(computed.pad_before);
+        after.push(computed.pad_after);
+    }
+    let mut new_op = op.clone();
+    new_op.pool_spec.padding = PaddingSpec::Explicit(before, after);
+    TypedModelPatch::replace_single_op(model, node, &node.inputs, new_op).map(Some)
+}
diff --git a/nnef/src/ser.rs b/nnef/src/ser.rs
index 97d3bea4c5..c78a652195 100644
--- a/nnef/src/ser.rs
+++ b/nnef/src/ser.rs
@@ -24,6 +24,10 @@ pub fn rewrite_model(model: &mut TypedModel) -> TractResult<()> {
         )
         .with_rule_for("rewrite_kernel_conv_in_oihw", rewrite_kernel_conv_in_oihw)
         .with_rule_for("rewrite_kernel_deconv_in_oihw", rewrite_kernel_deconv_in_oihw)
+        .with_rule_for(
+            "rewrite_same_lower_conv_to_explicit",
+            crate::ops::nnef::ser::rewrite_same_lower_conv_to_explicit,
+        )
         .with_rule_for(
             "rewrite_consistent_quantized_conv",
             crate::ops::nnef::ser::rewrite_consistent_quantized_conv,
diff --git a/nnef/src/transform.rs b/nnef/src/transform.rs
index e43d27fffe..414f0a2a5a 100644
--- a/nnef/src/transform.rs
+++ b/nnef/src/transform.rs
@@ -34,7 +34,7 @@ impl ModelTransform for PatchTransform {
 
         // Run the builder in a block so it (and its borrow of model) is dropped before we mutate model
         let (patch_model, taps, scope) = {
-            let framework = crate::nnef().with_tract_core();
+            let framework = crate::nnef();
 
             let doc = Document {
                 version: "1.0".into(),
@@ -65,11 +65,9 @@ impl ModelTransform for PatchTransform {
             let taps_ref = &mut taps;
             builder.wire_resolver =
                 Some(Box::new(move |name: &str, patch_model: &mut TypedModel| {
-                    let Some(node_id) =
-                        model_ref.nodes.iter().find(|n| n.name == name).map(|n| n.id)
-                    else {
-                        return Ok(None);
-                    };
+                    rule_if_some!(
+                        node_id = model_ref.nodes.iter().find(|n| n.name == name).map(|n| n.id)
+                    );
                     let original_outlet = OutletId::new(node_id, 0);
                     let fact = model_ref.outlet_fact(original_outlet)?.clone();
                     let patch_outlet = patch_model.add_source(name, fact)?;
diff --git a/nnef/tests/scatter_round_trip.rs b/nnef/tests/scatter_round_trip.rs
index 55f8855fb1..c1a2a5a70a 100644
--- a/nnef/tests/scatter_round_trip.rs
+++ b/nnef/tests/scatter_round_trip.rs
@@ -4,7 +4,7 @@ use tract_core::ops::cast::Cast;
 use tract_nnef::internal::*;
 
 fn round_trip(model: &TypedModel) -> TractResult<TypedModel> {
-    let nnef = tract_nnef::nnef().with_tract_core();
+    let nnef = tract_nnef::nnef();
     let mut buffer = vec![];
     nnef.write_to_tar(model, &mut buffer)?;
     nnef.model_for_read(&mut &*buffer)
diff --git a/onnx-opl/src/lib.rs b/onnx-opl/src/lib.rs
index d0fba6c13a..5efa07630b 100644
--- a/onnx-opl/src/lib.rs
+++ b/onnx-opl/src/lib.rs
@@ -17,7 +17,6 @@ pub trait WithOnnx {
 
 impl WithOnnx for tract_nnef::framework::Nnef {
     fn enable_onnx(&mut self) {
-        self.enable_tract_core();
         self.registries.push(onnx_opl_registry());
     }
     fn with_onnx(mut self) -> Self {
diff --git a/onnx-opl/src/resize.rs b/onnx-opl/src/resize.rs
index 5798a4fbcc..a9c7fbd198 100644
--- a/onnx-opl/src/resize.rs
+++ b/onnx-opl/src/resize.rs
@@ -329,25 +329,21 @@ impl TypedOp for Resize {
         node: &TypedNode,
     ) -> TractResult<Option<TypedModelPatch>> {
         // Lower nearest-neighbor integer-scale upsamples to Reshape → Tile → Reshape
-        if !matches!(self.interpolator, Interpolator::Nearest) {
-            return Ok(None);
-        }
-        let Some(scales_input) = self.optional_scales_input else { return Ok(None) };
+        rule_if!(matches!(self.interpolator, Interpolator::Nearest));
+        rule_if_some!(scales_input = self.optional_scales_input);
         let input_fact = model.outlet_fact(node.inputs[0])?;
         let scales_fact = model.outlet_fact(node.inputs[scales_input])?;
-        let Some(scales_tensor) = &scales_fact.konst else { return Ok(None) };
+        rule_if_some!(scales_tensor = &scales_fact.konst);
         let scales: Vec<f32> =
             scales_tensor.cast_to::<f32>()?.try_as_plain()?.as_slice::<f32>()?.to_vec();
 
         // Check all scales are positive integers
         let int_scales: Vec<usize> = scales.iter().map(|&s| s.round() as usize).collect();
-        if scales.iter().zip(&int_scales).any(|(&s, &i)| (s - i as f32).abs() > 1e-5 || i == 0) {
-            return Ok(None);
-        }
+        rule_if!(
+            scales.iter().zip(&int_scales).all(|(&s, &i)| (s - i as f32).abs() <= 1e-5 && i != 0)
+        );
         // Only if at least one axis actually upsamples
-        if int_scales.iter().all(|&s| s == 1) {
-            return Ok(None);
-        }
+        rule_if!(int_scales.iter().any(|&s| s != 1));
 
         let input_shape = &input_fact.shape;
 
diff --git a/onnx/Cargo.toml b/onnx/Cargo.toml
index 82899a2bc1..8623cf64bd 100644
--- a/onnx/Cargo.toml
+++ b/onnx/Cargo.toml
@@ -30,6 +30,7 @@ tract-nnef.workspace = true
 tract-hir.workspace = true
 tract-onnx-opl.workspace = true
 tract-extra.workspace = true
+tract-transformers.workspace = true
 
 [dev-dependencies]
 criterion.workspace = true
diff --git a/onnx/protos/onnx/onnx.proto b/onnx/protos/onnx/onnx.proto
index c6fd4b22c2..541d36fed2 100644
--- a/onnx/protos/onnx/onnx.proto
+++ b/onnx/protos/onnx/onnx.proto
@@ -297,7 +297,21 @@ message TensorProto {
     UINT64 = 13;
     COMPLEX64 = 14;     // complex with float32 real and imaginary components
     COMPLEX128 = 15;    // complex with float64 real and imaginary components
-    // Future extensions go here.
+
+    // Non-IEEE floating-point format based on IEEE754 single-precision
+    // floating-point number truncated to 16 bits. 1 sign, 8 exp, 7 mantissa.
+    BFLOAT16 = 16;
+
+    // 8-bit floating-point variants (ONNX opset 19+).
+    FLOAT8E4M3FN = 17;
+    FLOAT8E4M3FNUZ = 18;
+    FLOAT8E5M2 = 19;
+    FLOAT8E5M2FNUZ = 20;
+
+    // 4-bit integer and float variants (ONNX opset 21+).
+    UINT4 = 21;
+    INT4 = 22;
+    FLOAT4E2M1 = 23;
   }
 
   // The shape of the tensor.
diff --git a/onnx/protos/onnx/onnx.proto3 b/onnx/protos/onnx/onnx.proto3
index f96dcee58b..453155aa8d 100644
--- a/onnx/protos/onnx/onnx.proto3
+++ b/onnx/protos/onnx/onnx.proto3
@@ -507,6 +507,17 @@ message TensorProto {
     // floating-point number truncated to 16 bits.
     // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits.
     BFLOAT16 = 16;
+
+    // 8-bit floating-point variants (introduced in ONNX opset 19).
+    FLOAT8E4M3FN = 17;
+    FLOAT8E4M3FNUZ = 18;
+    FLOAT8E5M2 = 19;
+    FLOAT8E5M2FNUZ = 20;
+
+    // 4-bit integer and float variants (introduced in ONNX opset 21+).
+    UINT4 = 21;
+    INT4 = 22;
+    FLOAT4E2M1 = 23;
   }
 
   // The shape of the tensor.
diff --git a/onnx/src/model.rs b/onnx/src/model.rs
index 340e18cb69..606b12b325 100644
--- a/onnx/src/model.rs
+++ b/onnx/src/model.rs
@@ -84,7 +84,13 @@ impl ParsingContext<'_> {
                 let id = model.add_const(input.name.to_owned(), init)?;
                 outlets_by_name.insert(input.name.to_owned(), id);
             } else {
-                let fact = input.r#type.as_ref().unwrap().value.as_ref().unwrap();
+                let fact = input
+                    .r#type
+                    .as_ref()
+                    .with_context(|| format!("graph input `{}` has no type", input.name))?
+                    .value
+                    .as_ref()
+                    .with_context(|| format!("graph input `{}` has no type value", input.name))?;
                 #[allow(irrefutable_let_patterns)]
                 let fact: InferenceFact = if let pb::type_proto::Value::TensorType(fact) = fact {
                     translate_inference_fact(&ctx, fact, true)
diff --git a/onnx/src/ops/cast.rs b/onnx/src/ops/cast.rs
index 7ad2471f73..acccb9e64f 100644
--- a/onnx/src/ops/cast.rs
+++ b/onnx/src/ops/cast.rs
@@ -68,6 +68,8 @@ impl ElementWiseMiniOp for Cast {
         let from = model.outlet_fact(node.inputs[0])?.datum_type;
         if from == self.to {
             Ok(Some(TypedModelPatch::replace_single_op(model, node, &node.inputs, Identity)?))
+        } else if from == TDim::datum_type() && self.to == i32::datum_type() {
+            Ok(Some(TypedModelPatch::replace_single_op(model, node, &node.inputs, Identity)?))
         } else if from == String::datum_type() && self.to == f32::datum_type() {
             Ok(None)
         } else {
diff --git a/onnx/src/ops/nn/attention.rs b/onnx/src/ops/nn/attention.rs
new file mode 100644
index 0000000000..d9ddd3371d
--- /dev/null
+++ b/onnx/src/ops/nn/attention.rs
@@ -0,0 +1,300 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_core::ops::array::TypedConcat;
+use tract_core::ops::change_axes::AxisOp;
+use tract_core::ops::math::add;
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+use tract_transformers::ops::sdpa::Sdpa;
+
+pub fn attention(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let softcap = node.get_attr_opt::<f32>("softcap")?.unwrap_or(0.0);
+    if softcap != 0.0 {
+        bail!("Attention: softcap is not supported");
+    }
+    let qk_matmul_output_mode = node.get_attr_opt::<i64>("qk_matmul_output_mode")?.unwrap_or(0);
+    if qk_matmul_output_mode != 0 {
+        bail!("Attention: qk_matmul_output_mode is not supported");
+    }
+
+    let q_num_heads = node.get_attr_opt::<i64>("q_num_heads")?.map(|v| v as usize);
+    let kv_num_heads = node.get_attr_opt::<i64>("kv_num_heads")?.map(|v| v as usize);
+    let is_causal = node.get_attr_opt::<i64>("is_causal")?.unwrap_or(0) != 0;
+    let scale = node.get_attr_opt::<f32>("scale")?;
+
+    let have_nonpad_kv_seqlen = node.input.len() > 6 && !node.input[6].is_empty();
+    if have_nonpad_kv_seqlen {
+        bail!("Attention: nonpad_kv_seqlen input is not supported");
+    }
+    let have_mask = node.input.len() > 3 && !node.input[3].is_empty();
+    let have_past_key = node.input.len() > 4 && !node.input[4].is_empty();
+    let have_past_value = node.input.len() > 5 && !node.input[5].is_empty();
+
+    let have_present_key = node.output.len() > 1 && !node.output[1].is_empty();
+    let have_present_value = node.output.len() > 2 && !node.output[2].is_empty();
+
+    Ok((
+        expand(AttentionOp {
+            q_num_heads,
+            kv_num_heads,
+            is_causal,
+            scale,
+            have_mask,
+            have_past_key,
+            have_past_value,
+            have_present_key,
+            have_present_value,
+        }),
+        vec![],
+    ))
+}
+
+#[derive(Debug, Clone)]
+struct AttentionOp {
+    q_num_heads: Option<usize>,
+    kv_num_heads: Option<usize>,
+    is_causal: bool,
+    scale: Option<f32>,
+    have_mask: bool,
+    have_past_key: bool,
+    have_past_value: bool,
+    have_present_key: bool,
+    have_present_value: bool,
+}
+
+impl AttentionOp {
+    fn mask_input_idx(&self) -> Option<usize> {
+        self.have_mask.then_some(3)
+    }
+
+    fn past_key_input_idx(&self) -> Option<usize> {
+        self.have_past_key.then_some(3 + self.have_mask as usize)
+    }
+
+    fn past_value_input_idx(&self) -> Option<usize> {
+        self.have_past_value.then_some(3 + self.have_mask as usize + self.have_past_key as usize)
+    }
+}
+
+fn wire_3d_to_4d(
+    prefix: &str,
+    model: &mut TypedModel,
+    x: OutletId,
+    total_dim: TDim,
+    num_heads: usize,
+) -> TractResult<OutletId> {
+    let head_dim = total_dim.clone() / num_heads;
+    let after_reshape = model.wire_node(
+        format!("{prefix}.reshape"),
+        AxisOp::Reshape(2, tvec![total_dim], tvec![num_heads.to_dim(), head_dim]),
+        &[x],
+    )?[0];
+    // (B, S, H, D) → Move(2, 1) → (B, H, S, D)
+    model
+        .wire_node(format!("{prefix}.transpose"), AxisOp::Move(2, 1), &[after_reshape])
+        .map(|v| v[0])
+}
+
+impl Expansion for AttentionOp {
+    fn name(&self) -> StaticName {
+        "OnnxAttention".into()
+    }
+
+    fn nboutputs(&self) -> TractResult<usize> {
+        Ok(1 + self.have_present_key as usize + self.have_present_value as usize)
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        let n_in = 3
+            + self.have_mask as usize
+            + self.have_past_key as usize
+            + self.have_past_value as usize;
+        let n_out = 1 + self.have_present_key as usize + self.have_present_value as usize;
+        check_input_arity(inputs, n_in)?;
+        check_output_arity(outputs, n_out)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        // Output Y has same rank and same batch/seq dims as Q; head dim may differ in
+        // diff-head-sizes attention (V head dim != Q head dim), so we only propagate rank.
+        s.equals(&inputs[0].rank, &outputs[0].rank)?;
+        if self.have_present_key {
+            s.equals(&inputs[0].datum_type, &outputs[1].datum_type)?;
+        }
+        if self.have_present_value {
+            let pv_idx = 1 + self.have_present_key as usize;
+            s.equals(&inputs[0].datum_type, &outputs[pv_idx].datum_type)?;
+        }
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let q_fact = model.outlet_fact(inputs[0])?.clone();
+        let is_4d = q_fact.rank() == 4;
+        let dt = q_fact.datum_type;
+        let acc_dt = DatumType::F32;
+
+        // Build 4D Q, K, V for Sdpa: (B, heads, S, head_dim)
+        let (q4, k4, v4, q_hdim_3d) = if is_4d {
+            (inputs[0], inputs[1], inputs[2], None)
+        } else {
+            let q_hdim = q_fact.shape[2].clone();
+            let k_hdim = model.outlet_fact(inputs[1])?.shape[2].clone();
+            let v_hdim = model.outlet_fact(inputs[2])?.shape[2].clone();
+            let q_num_heads = self.q_num_heads.context("q_num_heads required for 3D Attention")?;
+            let kv_num_heads =
+                self.kv_num_heads.context("kv_num_heads required for 3D Attention")?;
+            let q4 = wire_3d_to_4d(
+                &format!("{prefix}.q"),
+                model,
+                inputs[0],
+                q_hdim.clone(),
+                q_num_heads,
+            )?;
+            let k4 = wire_3d_to_4d(&format!("{prefix}.k"), model, inputs[1], k_hdim, kv_num_heads)?;
+            let v4 = wire_3d_to_4d(&format!("{prefix}.v"), model, inputs[2], v_hdim, kv_num_heads)?;
+            (q4, k4, v4, Some(q_hdim))
+        };
+
+        // Handle KV cache: concat [past, current] along the sequence axis (2)
+        let (k_for_attn, v_for_attn, present_k, present_v) =
+            if self.have_past_key || self.have_present_key {
+                let k_full = if self.have_past_key {
+                    let past_k = inputs[self.past_key_input_idx().unwrap()];
+                    model.wire_node(
+                        format!("{prefix}.concat_k"),
+                        TypedConcat { axis: 2 },
+                        &[past_k, k4],
+                    )?[0]
+                } else {
+                    k4
+                };
+                let v_full = if self.have_past_value {
+                    let past_v = inputs[self.past_value_input_idx().unwrap()];
+                    model.wire_node(
+                        format!("{prefix}.concat_v"),
+                        TypedConcat { axis: 2 },
+                        &[past_v, v4],
+                    )?[0]
+                } else {
+                    v4
+                };
+                let pk = self.have_present_key.then_some(k_full);
+                let pv = self.have_present_value.then_some(v_full);
+                (k_full, v_full, pk, pv)
+            } else {
+                (k4, v4, None, None)
+            };
+
+        // Build explicit ONNX mask from input (if provided); pad rank to 4
+        let explicit_mask = if self.have_mask {
+            let m = inputs[self.mask_input_idx().unwrap()];
+            let m_rank = model.outlet_fact(m)?.rank();
+            let mut m = m;
+            for i in m_rank..4 {
+                m = model.wire_node(format!("{prefix}.mask_add_axis_{i}"), AxisOp::Add(0), &[m])?
+                    [0];
+            }
+            Some(m)
+        } else {
+            None
+        };
+
+        // Build causal mask: ONNX semantics are Q[i] sees K[j] iff j <= i (simple lower-tri,
+        // no offset). When shapes are concrete, materialise an explicit (1,1,q_seq,kv_seq)
+        // additive mask and let Sdpa run without is_causal so both branches agree.
+        // When shapes are symbolic (e.g. dynamic seq len at runtime), fall back to Sdpa's
+        // own is_causal flag, which is exact when q_seq == kv_seq (the normal training case).
+        let (causal_mask, sdpa_is_causal) = if self.is_causal {
+            let q_seq = model.outlet_fact(q4)?.shape[2].to_usize().ok();
+            let kv_seq = model.outlet_fact(k_for_attn)?.shape[2].to_usize().ok();
+            if let (Some(qs), Some(ks)) = (q_seq, kv_seq) {
+                let arr = tract_ndarray::Array2::<f32>::from_shape_fn((qs, ks), |(i, j)| {
+                    if j <= i { 0.0f32 } else { f32::NEG_INFINITY }
+                });
+                let mask_tensor: Tensor = arr.into();
+                let c = model.add_const(format!("{prefix}.causal_mask"), mask_tensor)?;
+                let mut m = c;
+                for i in 0..2 {
+                    m = model.wire_node(
+                        format!("{prefix}.causal_mask_unsqueeze_{i}"),
+                        AxisOp::Add(0),
+                        &[m],
+                    )?[0];
+                }
+                (Some(m), false)
+            } else {
+                (None, true)
+            }
+        } else {
+            (None, false)
+        };
+
+        // Combine explicit mask + causal mask (both are additive bias terms)
+        let mask = match (explicit_mask, causal_mask) {
+            (Some(em), Some(cm)) => Some(
+                wire_with_rank_broadcast(
+                    format!("{prefix}.mask_combined"),
+                    model,
+                    add(),
+                    &[em, cm],
+                )?[0],
+            ),
+            (m, None) | (None, m) => m,
+        };
+
+        // Wire Sdpa
+        let mut sdpa_inputs = tvec![q4, k_for_attn, v_for_attn];
+        if let Some(m) = mask {
+            sdpa_inputs.push(m);
+        }
+        let sdpa = Sdpa {
+            scale: self.scale.map(tensor0),
+            datum_type: dt,
+            acc_datum_type: acc_dt,
+            is_causal: sdpa_is_causal,
+        };
+        let y4 = model.wire_node(format!("{prefix}.sdpa"), sdpa, &sdpa_inputs)?[0];
+
+        // For 3D output: Move(1,2) then merge head dims back.
+        // Output shape is (B, S, q_heads * v_head_dim) — note: v_head_dim may differ from
+        // q_head_dim in diff-head-sizes (MLA-style) attention.
+        let y = if q_hdim_3d.is_some() {
+            // (B, q_heads, S, v_head_dim) → Move(1,2) → (B, S, q_heads, v_head_dim)
+            let y_transposed =
+                model.wire_node(format!("{prefix}.y_transpose"), AxisOp::Move(1, 2), &[y4])?[0];
+            let y4_fact = model.outlet_fact(y4)?.clone();
+            let q_heads_dim = y4_fact.shape[1].clone();
+            let v_head_dim = y4_fact.shape[3].clone();
+            let y_hdim = q_heads_dim.clone() * v_head_dim.clone();
+            // (B, S, q_heads, v_head_dim) → (B, S, q_heads*v_head_dim)
+            model.wire_node(
+                format!("{prefix}.y_reshape"),
+                AxisOp::Reshape(2, tvec![q_heads_dim, v_head_dim], tvec![y_hdim]),
+                &[y_transposed],
+            )?[0]
+        } else {
+            y4
+        };
+
+        let mut result = tvec![y];
+        if let Some(pk) = present_k {
+            result.push(pk);
+        }
+        if let Some(pv) = present_v {
+            result.push(pv);
+        }
+        Ok(result)
+    }
+}
diff --git a/onnx/src/ops/nn/gelu.rs b/onnx/src/ops/nn/gelu.rs
new file mode 100644
index 0000000000..65bace64bd
--- /dev/null
+++ b/onnx/src/ops/nn/gelu.rs
@@ -0,0 +1,72 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_core::ops::math::{add, erf, mul};
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+pub fn gelu(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let approximate = node.get_attr_opt::<String>("approximate")?.unwrap_or_default();
+    if approximate == "tanh" {
+        Ok((tract_core::ops::nn::gelu_approximate::gelu_approximate(false).into_hir(), vec![]))
+    } else {
+        Ok((expand(GeluExact), vec![]))
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+struct GeluExact;
+
+impl Expansion for GeluExact {
+    fn name(&self) -> StaticName {
+        "GeluExact".into()
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_input_arity(inputs, 1)?;
+        check_output_arity(outputs, 1)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let dt = model.outlet_fact(inputs[0])?.datum_type;
+        // gelu(x) = x * 0.5 * (1 + erf(x / sqrt(2)))
+        let inv_sqrt2 = tensor0((2.0f32).sqrt().recip()).cast_to_dt(dt)?.into_owned();
+        let c_inv_sqrt2 = model.add_const(format!("{prefix}.inv_sqrt2"), inv_sqrt2)?;
+        let x_scaled = wire_with_rank_broadcast(
+            format!("{prefix}.scale"),
+            model,
+            mul(),
+            &[inputs[0], c_inv_sqrt2],
+        )?[0];
+        let erf_x = model.wire_node(format!("{prefix}.erf"), erf(), &[x_scaled])?[0];
+        let c_one =
+            model.add_const(format!("{prefix}.one"), tensor0(1f32).cast_to_dt(dt)?.into_owned())?;
+        let one_plus_erf =
+            wire_with_rank_broadcast(format!("{prefix}.add_one"), model, add(), &[erf_x, c_one])?
+                [0];
+        let c_half = model
+            .add_const(format!("{prefix}.half"), tensor0(0.5f32).cast_to_dt(dt)?.into_owned())?;
+        let half_x = wire_with_rank_broadcast(
+            format!("{prefix}.half_x"),
+            model,
+            mul(),
+            &[inputs[0], c_half],
+        )?[0];
+        wire_with_rank_broadcast(format!("{prefix}.out"), model, mul(), &[half_x, one_plus_erf])
+    }
+}
diff --git a/onnx/src/ops/nn/gelu_contrib.rs b/onnx/src/ops/nn/gelu_contrib.rs
new file mode 100644
index 0000000000..f5f299d0d3
--- /dev/null
+++ b/onnx/src/ops/nn/gelu_contrib.rs
@@ -0,0 +1,172 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_core::ops::math::{add, erf, mul, tanh};
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+// com.microsoft fused Gelu activations. All lower onto existing element-wise primitives.
+//   BiasGelu(x, bias)  = erf_gelu(x + bias)            (exact, erf-based)
+//   FastGelu(x, bias?) = tanh_gelu(x + bias)           (tanh approximation)
+//   QuickGelu(x)       = x * sigmoid(alpha * x)         (alpha attr, default 1.702)
+
+fn scalar(model: &mut TypedModel, name: String, v: f32, dt: DatumType) -> TractResult<OutletId> {
+    model.add_const(name, tensor0(v).cast_to_dt(dt)?.into_owned())
+}
+
+// 0.5 * x * (1 + erf(x / sqrt(2)))
+fn erf_gelu(
+    model: &mut TypedModel,
+    prefix: &str,
+    x: OutletId,
+    dt: DatumType,
+) -> TractResult<OutletId> {
+    let inv_sqrt2 = scalar(model, format!("{prefix}.inv_sqrt2"), (2.0f32).sqrt().recip(), dt)?;
+    let scaled =
+        wire_with_rank_broadcast(format!("{prefix}.scale"), model, mul(), &[x, inv_sqrt2])?[0];
+    let e = model.wire_node(format!("{prefix}.erf"), erf(), &[scaled])?[0];
+    let one = scalar(model, format!("{prefix}.one"), 1.0, dt)?;
+    let one_plus =
+        wire_with_rank_broadcast(format!("{prefix}.one_plus"), model, add(), &[e, one])?[0];
+    let half = scalar(model, format!("{prefix}.half"), 0.5, dt)?;
+    let half_x = wire_with_rank_broadcast(format!("{prefix}.half_x"), model, mul(), &[x, half])?[0];
+    Ok(wire_with_rank_broadcast(format!("{prefix}.out"), model, mul(), &[half_x, one_plus])?[0])
+}
+
+// 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+fn tanh_gelu(
+    model: &mut TypedModel,
+    prefix: &str,
+    x: OutletId,
+    dt: DatumType,
+) -> TractResult<OutletId> {
+    let x2 = model.wire_node(format!("{prefix}.x2"), mul(), &[x, x])?[0];
+    let x3 = model.wire_node(format!("{prefix}.x3"), mul(), &[x2, x])?[0];
+    let c1 = scalar(model, format!("{prefix}.c1"), 0.044715, dt)?;
+    let c1x3 = wire_with_rank_broadcast(format!("{prefix}.c1x3"), model, mul(), &[x3, c1])?[0];
+    let inner = wire_with_rank_broadcast(format!("{prefix}.inner"), model, add(), &[x, c1x3])?[0];
+    let c0 = scalar(model, format!("{prefix}.c0"), (2.0f32 / std::f32::consts::PI).sqrt(), dt)?;
+    let scaled =
+        wire_with_rank_broadcast(format!("{prefix}.c0inner"), model, mul(), &[inner, c0])?[0];
+    let th = model.wire_node(format!("{prefix}.tanh"), tanh(), &[scaled])?[0];
+    let one = scalar(model, format!("{prefix}.one"), 1.0, dt)?;
+    let one_plus =
+        wire_with_rank_broadcast(format!("{prefix}.one_plus"), model, add(), &[th, one])?[0];
+    let half = scalar(model, format!("{prefix}.half"), 0.5, dt)?;
+    let half_x = wire_with_rank_broadcast(format!("{prefix}.half_x"), model, mul(), &[x, half])?[0];
+    Ok(wire_with_rank_broadcast(format!("{prefix}.out"), model, mul(), &[half_x, one_plus])?[0])
+}
+
+macro_rules! simple_rules {
+    () => {
+        fn rules<'r, 'p: 'r, 's: 'r>(
+            &'s self,
+            s: &mut Solver<'r>,
+            inputs: &'p [TensorProxy],
+            outputs: &'p [TensorProxy],
+        ) -> InferenceResult {
+            check_output_arity(outputs, 1)?;
+            s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+            s.equals(&inputs[0].shape, &outputs[0].shape)?;
+            Ok(())
+        }
+    };
+}
+
+// ---- BiasGelu ----
+pub fn bias_gelu(
+    _ctx: &ParsingContext,
+    _node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    Ok((expand(BiasGelu), vec![]))
+}
+#[derive(Debug, Clone)]
+struct BiasGelu;
+impl Expansion for BiasGelu {
+    fn name(&self) -> StaticName {
+        "BiasGelu".into()
+    }
+    simple_rules!();
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let dt = model.outlet_fact(inputs[0])?.datum_type;
+        let biased = wire_with_rank_broadcast(
+            format!("{prefix}.bias"),
+            model,
+            add(),
+            &[inputs[0], inputs[1]],
+        )?[0];
+        Ok(tvec!(erf_gelu(model, prefix, biased, dt)?))
+    }
+}
+
+// ---- FastGelu ----
+pub fn fast_gelu(
+    _ctx: &ParsingContext,
+    _node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    Ok((expand(FastGelu), vec![]))
+}
+#[derive(Debug, Clone)]
+struct FastGelu;
+impl Expansion for FastGelu {
+    fn name(&self) -> StaticName {
+        "FastGelu".into()
+    }
+    simple_rules!();
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let dt = model.outlet_fact(inputs[0])?.datum_type;
+        let x = if inputs.len() > 1 {
+            wire_with_rank_broadcast(
+                format!("{prefix}.bias"),
+                model,
+                add(),
+                &[inputs[0], inputs[1]],
+            )?[0]
+        } else {
+            inputs[0]
+        };
+        Ok(tvec!(tanh_gelu(model, prefix, x, dt)?))
+    }
+}
+
+// ---- QuickGelu ----
+pub fn quick_gelu(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let alpha = node.get_attr_opt::<f32>("alpha")?.unwrap_or(1.702);
+    Ok((expand(QuickGelu { alpha }), vec![]))
+}
+#[derive(Debug, Clone, new)]
+struct QuickGelu {
+    alpha: f32,
+}
+impl Expansion for QuickGelu {
+    fn name(&self) -> StaticName {
+        "QuickGelu".into()
+    }
+    simple_rules!();
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let dt = model.outlet_fact(inputs[0])?.datum_type;
+        let alpha = scalar(model, format!("{prefix}.alpha"), self.alpha, dt)?;
+        let ax =
+            wire_with_rank_broadcast(format!("{prefix}.ax"), model, mul(), &[inputs[0], alpha])?[0];
+        let s =
+            model.wire_node(format!("{prefix}.sigmoid"), tract_core::ops::nn::sigmoid(), &[ax])?[0];
+        Ok(tvec!(wire_with_rank_broadcast(prefix, model, mul(), &[inputs[0], s])?[0]))
+    }
+}
diff --git a/onnx/src/ops/nn/group_norm.rs b/onnx/src/ops/nn/group_norm.rs
new file mode 100644
index 0000000000..07bb7ac834
--- /dev/null
+++ b/onnx/src/ops/nn/group_norm.rs
@@ -0,0 +1,167 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_core::ops::cast::cast;
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+use tract_hir::ops::math::{add, mul, rsqrt, square, sub};
+use tract_hir::ops::nn::{Reduce, Reducer};
+
+pub fn group_normalization(
+    ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let epsilon = node.get_attr_opt("epsilon")?.unwrap_or(1e-5);
+    let num_groups: usize = node.get_attr("num_groups")?;
+    // Before opset 21, `scale`/`bias` are per-group (shape [num_groups]); from opset 21 on they
+    // are per-channel (shape [C]), matching the other normalization operators.
+    let per_channel_affine = ctx.onnx_operator_set_version >= 21;
+    Ok((expand(GroupNorm { epsilon, num_groups, per_channel_affine }), vec![]))
+}
+
+#[derive(Debug, Clone, new)]
+struct GroupNorm {
+    epsilon: f32,
+    num_groups: usize,
+    per_channel_affine: bool,
+}
+
+// Broadcast a 1-D parameter [K] to rank `target_rank` with K on axis 1: [1, K, 1, .., 1].
+fn broadcast_to_channel_axis(
+    model: &mut TypedModel,
+    base: &str,
+    outlet: OutletId,
+    target_rank: usize,
+) -> TractResult<OutletId> {
+    let mut wire = model.wire_node(format!("{base}.ax0"), AxisOp::Add(0), &[outlet])?;
+    for ax in 2..target_rank {
+        wire = model.wire_node(format!("{base}.ax{ax}"), AxisOp::Add(ax), &wire)?;
+    }
+    Ok(wire[0])
+}
+
+impl Expansion for GroupNorm {
+    fn name(&self) -> StaticName {
+        "GroupNorm".into()
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_input_arity(inputs, 3)?;
+        check_output_arity(outputs, 1)?;
+        s.equals(&inputs[0].datum_type, &inputs[1].datum_type)?;
+        s.equals(&inputs[0].datum_type, &inputs[2].datum_type)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let fact = model.outlet_fact(inputs[0])?.clone();
+        let rank = fact.rank();
+        let dt = fact.datum_type;
+        ensure!(rank >= 2, "GroupNormalization expects rank >= 2, got {rank}");
+        let c = fact.shape[1].clone();
+        let groups = self.num_groups.to_dim();
+        let channels_per_group = c.clone().div_ceil(self.num_groups as u64);
+
+        // Per the ONNX spec (`stash_type`, default 1=FLOAT), mean/variance are computed in f32;
+        // this matters for f16/bf16 inputs. Cast in, normalize in f32, cast back before the affine.
+        let stash = DatumType::F32;
+        let x = model.wire_node(format!("{prefix}.cast_in"), cast(stash), &inputs[0..1])?;
+
+        // Split the channel axis: (N, C, *spatial) -> (N, G, C/G, *spatial), rank + 1.
+        let grouped = model.wire_node(
+            format!("{prefix}.split"),
+            AxisOp::Reshape(1, tvec![c.clone()], tvec![groups.clone(), channels_per_group.clone()]),
+            &x,
+        )?;
+
+        // Normalize each group over C/G and all spatial dims (axes 2..=rank in the split tensor).
+        let red_axes: Vec<i64> = (2..=rank as i64).collect();
+        let mean = Reduce::new(Some(red_axes.clone()), true, Reducer::Mean).wire(
+            &format!("{prefix}.mean"),
+            model,
+            &grouped,
+        )?;
+        let diff = wire_with_rank_broadcast(
+            format!("{prefix}.diff"),
+            model,
+            sub(),
+            &[grouped[0], mean[0]],
+        )?;
+        let sq = model.wire_node(format!("{prefix}.sq"), square(), &diff)?;
+        let var = Reduce::new(Some(red_axes), true, Reducer::Mean).wire(
+            &format!("{prefix}.var"),
+            model,
+            &sq,
+        )?;
+        let eps = model.add_const(
+            format!("{prefix}.eps"),
+            tensor0(self.epsilon).cast_to_dt(stash)?.into_owned(),
+        )?;
+        let var_eps =
+            wire_with_rank_broadcast(format!("{prefix}.var_eps"), model, add(), &[var[0], eps])?;
+        let inv = model.wire_node(format!("{prefix}.rsqrt"), rsqrt(), &var_eps)?;
+        let normed_f32 =
+            wire_with_rank_broadcast(format!("{prefix}.normed"), model, mul(), &[diff[0], inv[0]])?;
+        // Back to the input dtype before the (dtype-native) scale/bias affine.
+        let normed = model.wire_node(format!("{prefix}.cast_out"), cast(dt), &normed_f32)?;
+
+        let merge = |model: &mut TypedModel, name: String, wire: &[OutletId]| {
+            model.wire_node(
+                name,
+                AxisOp::Reshape(
+                    1,
+                    tvec![groups.clone(), channels_per_group.clone()],
+                    tvec![c.clone()],
+                ),
+                wire,
+            )
+        };
+
+        if self.per_channel_affine {
+            // scale/bias are [C]: merge back to (N, C, *spatial), then apply per-channel affine.
+            let merged = merge(model, format!("{prefix}.merge"), &normed)?;
+            let scale =
+                broadcast_to_channel_axis(model, &format!("{prefix}.scale"), inputs[1], rank)?;
+            let scaled = wire_with_rank_broadcast(
+                format!("{prefix}.scaled"),
+                model,
+                mul(),
+                &[merged[0], scale],
+            )?;
+            let bias =
+                broadcast_to_channel_axis(model, &format!("{prefix}.bias"), inputs[2], rank)?;
+            wire_with_rank_broadcast(prefix, model, add(), &[scaled[0], bias])
+        } else {
+            // scale/bias are [num_groups]: apply on the grouped tensor, then merge back.
+            let gr_rank = rank + 1;
+            let scale =
+                broadcast_to_channel_axis(model, &format!("{prefix}.scale"), inputs[1], gr_rank)?;
+            let scaled = wire_with_rank_broadcast(
+                format!("{prefix}.scaled"),
+                model,
+                mul(),
+                &[normed[0], scale],
+            )?;
+            let bias =
+                broadcast_to_channel_axis(model, &format!("{prefix}.bias"), inputs[2], gr_rank)?;
+            let biased = wire_with_rank_broadcast(
+                format!("{prefix}.biased"),
+                model,
+                add(),
+                &[scaled[0], bias],
+            )?;
+            merge(model, prefix.to_string(), &biased)
+        }
+    }
+}
diff --git a/onnx/src/ops/nn/lp_norm.rs b/onnx/src/ops/nn/lp_norm.rs
new file mode 100644
index 0000000000..370ff3efa6
--- /dev/null
+++ b/onnx/src/ops/nn/lp_norm.rs
@@ -0,0 +1,61 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+pub fn lp_normalization(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let axis = node.get_attr_opt("axis")?.unwrap_or(-1);
+    let p: i64 = node.get_attr_opt("p")?.unwrap_or(2);
+    ensure!(p == 1 || p == 2, "LpNormalization only supports p=1 or p=2, got p={p}");
+    Ok((expand(LpNorm { axis, p }), vec![]))
+}
+
+#[derive(Debug, Clone, new)]
+struct LpNorm {
+    axis: i64,
+    p: i64,
+}
+
+impl Expansion for LpNorm {
+    fn name(&self) -> StaticName {
+        "LpNorm".into()
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_input_arity(inputs, 1)?;
+        check_output_arity(outputs, 1)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let rank = model.outlet_fact(inputs[0])?.rank() as i64;
+        let axis = if self.axis < 0 { self.axis + rank } else { self.axis };
+        // Lp norm along `axis`, keeping the reduced dim so it broadcasts against the input.
+        let reducer = if self.p == 1 {
+            tract_hir::ops::nn::Reducer::L1
+        } else {
+            tract_hir::ops::nn::Reducer::L2
+        };
+        let norm = tract_hir::ops::nn::Reduce::new(Some(vec![axis]), true, reducer).wire(
+            &format!("{prefix}.norm"),
+            model,
+            &inputs[0..1],
+        )?;
+        wire_with_rank_broadcast(prefix, model, tract_hir::ops::math::div(), &[inputs[0], norm[0]])
+    }
+}
diff --git a/onnx/src/ops/nn/mat_mul_nbits.rs b/onnx/src/ops/nn/mat_mul_nbits.rs
new file mode 100644
index 0000000000..2d267613a0
--- /dev/null
+++ b/onnx/src/ops/nn/mat_mul_nbits.rs
@@ -0,0 +1,162 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_core::ops::cast::cast;
+use tract_core::ops::einsum::EinSum;
+use tract_core::ops::math::add;
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+// com.microsoft MatMulNBits: Y = A @ dequant(B)^T (+ bias)
+//   A:           float [.., K]
+//   B (Q4):      uint8 [N, n_blocks, blob]   (blob = block_size/2, two 4-bit weights per byte)
+//   scales:      float [N * n_blocks]
+//   zero_points: uint8 [N * ceil(n_blocks/2)] packed (optional; default 8)
+//   bias:        float [N] (optional)
+// The quantized weight is constant, so we dequantize it to a float [N, K] weight in Rust and
+// emit a plain matmul (EinSum). A fused int4 kernel would be a follow-up perf optimization.
+pub fn mat_mul_nbits(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let k: usize = node.get_attr("K")?;
+    let n: usize = node.get_attr("N")?;
+    let bits: usize = node.get_attr_opt("bits")?.unwrap_or(4);
+    let block_size: usize = node.get_attr("block_size")?;
+    ensure!(bits == 4, "MatMulNBits: only bits=4 is supported (got {bits})");
+    let mut opt = crate::model::optional_inputs(node).skip(3);
+    let zp_input = opt.next().unwrap();
+    let gidx_input = opt.next().unwrap();
+    let bias_input = opt.next().unwrap();
+    ensure!(gidx_input.is_none(), "MatMulNBits: g_idx (act-order) is unsupported");
+    Ok((expand(MatMulNBits { k, n, block_size, zp_input, bias_input }), vec![]))
+}
+
+#[derive(Debug, Clone)]
+struct MatMulNBits {
+    k: usize,
+    n: usize,
+    block_size: usize,
+    zp_input: Option<usize>,
+    bias_input: Option<usize>,
+}
+
+impl Expansion for MatMulNBits {
+    fn name(&self) -> StaticName {
+        "MatMulNBits".into()
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_output_arity(outputs, 1)?;
+        s.equals(&outputs[0].datum_type, &inputs[0].datum_type)?;
+        let n = self.n.to_dim();
+        s.given(&inputs[0].rank, move |s, rank| {
+            let rank = rank as usize;
+            s.equals(&outputs[0].rank, rank as i64)?;
+            for ax in 0..rank - 1 {
+                s.equals(&outputs[0].shape[ax], &inputs[0].shape[ax])?;
+            }
+            s.equals(&outputs[0].shape[rank - 1], n.clone())
+        })?;
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let (k, n, block_size) = (self.k, self.n, self.block_size);
+        let n_blocks = k.div_ceil(block_size);
+        let blob = block_size.div_ceil(2);
+        let zp_blob = n_blocks.div_ceil(2);
+
+        // Read the constant quantized weight, scales and (optional) zero points.
+        let b_k = model
+            .outlet_fact(inputs[1])?
+            .konst
+            .clone()
+            .context("MatMulNBits: quantized weight B must be a constant")?;
+        let b_plain = b_k.try_as_plain()?;
+        let b: &[u8] = b_plain.as_slice()?;
+        let scales_k = model
+            .outlet_fact(inputs[2])?
+            .konst
+            .clone()
+            .context("MatMulNBits: scales must be a constant")?;
+        let scales_f = scales_k.cast_to::<f32>()?;
+        let scales_plain = scales_f.try_as_plain()?;
+        let scales: &[f32] = scales_plain.as_slice()?;
+        let zp_k = if let Some(i) = self.zp_input {
+            Some(
+                model
+                    .outlet_fact(inputs[i])?
+                    .konst
+                    .clone()
+                    .context("MatMulNBits: zero_points must be a constant")?,
+            )
+        } else {
+            None
+        };
+        let zp_plain = match &zp_k {
+            Some(t) => Some(t.try_as_plain()?),
+            None => None,
+        };
+        let zp: Option<&[u8]> = match &zp_plain {
+            Some(p) => Some(p.as_slice::<u8>()?),
+            None => None,
+        };
+
+        // Dequantize to a [N, K] float weight.
+        let mut w = vec![0f32; n * k];
+        for col in 0..n {
+            for blk in 0..n_blocks {
+                let scale = scales[col * n_blocks + blk];
+                let zero = match zp {
+                    Some(zp) => {
+                        let byte = zp[col * zp_blob + blk / 2];
+                        if blk % 2 == 0 { byte & 0x0F } else { byte >> 4 }
+                    }
+                    None => 8,
+                } as f32;
+                let base = col * n_blocks * blob + blk * blob;
+                for i in 0..block_size {
+                    let kk = blk * block_size + i;
+                    if kk >= k {
+                        break;
+                    }
+                    let byte = b[base + i / 2];
+                    let q = if i % 2 == 0 { byte & 0x0F } else { byte >> 4 } as f32;
+                    w[col * k + kk] = (q - zero) * scale;
+                }
+            }
+        }
+        let w = model.add_const(format!("{prefix}.weight"), Tensor::from_shape(&[n, k], &w)?)?;
+
+        // Y = A @ W^T, contracting K. Computed in f32, then cast to the input dtype.
+        let dt = model.outlet_fact(inputs[0])?.datum_type;
+        let rank = model.outlet_fact(inputs[0])?.rank();
+        let a =
+            model.wire_node(format!("{prefix}.cast_a"), cast(f32::datum_type()), &[inputs[0]])?[0];
+        let lead: String = "abcdefgh".chars().take(rank - 1).collect();
+        let axes =
+            AxesMapping::from_strs(&[format!("{lead}k"), "nk".to_string()], &[format!("{lead}n")])?;
+        let y = model.wire_node(
+            format!("{prefix}.matmul"),
+            EinSum::new(axes, f32::datum_type()),
+            &[a, w],
+        )?[0];
+        let mut y = model.wire_node(format!("{prefix}.cast_y"), cast(dt), &[y])?[0];
+
+        if let Some(i) = self.bias_input {
+            y = wire_with_rank_broadcast(format!("{prefix}.bias"), model, add(), &[y, inputs[i]])?
+                [0];
+        }
+        Ok(tvec!(y))
+    }
+}
diff --git a/onnx/src/ops/nn/mish.rs b/onnx/src/ops/nn/mish.rs
new file mode 100644
index 0000000000..e80b9d69b5
--- /dev/null
+++ b/onnx/src/ops/nn/mish.rs
@@ -0,0 +1,44 @@
+use tract_core::ops::math::{add, exp, ln, mul, tanh};
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+#[derive(Debug, Clone, Default)]
+pub struct Mish;
+
+impl Expansion for Mish {
+    fn name(&self) -> StaticName {
+        "Mish".into()
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_input_arity(inputs, 1)?;
+        check_output_arity(outputs, 1)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let dt = model.outlet_fact(inputs[0])?.datum_type;
+        // mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+        let exp_x = model.wire_node(format!("{prefix}.exp"), exp(), &[inputs[0]])?[0];
+        let c_one =
+            model.add_const(format!("{prefix}.one"), tensor0(1f32).cast_to_dt(dt)?.into_owned())?;
+        let one_plus_exp =
+            wire_with_rank_broadcast(format!("{prefix}.add_one"), model, add(), &[exp_x, c_one])?
+                [0];
+        let softplus = model.wire_node(format!("{prefix}.ln"), ln(), &[one_plus_exp])?[0];
+        let tanh_sp = model.wire_node(format!("{prefix}.tanh"), tanh(), &[softplus])?[0];
+        model.wire_node(prefix, mul(), &[inputs[0], tanh_sp])
+    }
+}
diff --git a/onnx/src/ops/nn/mod.rs b/onnx/src/ops/nn/mod.rs
index c98a626184..ad6056574b 100644
--- a/onnx/src/ops/nn/mod.rs
+++ b/onnx/src/ops/nn/mod.rs
@@ -7,13 +7,26 @@ use crate::model::{OnnxOpRegister, ParsingContext};
 use crate::pb::NodeProto;
 use crate::pb_helpers::OptionExt;
 
+mod attention;
 mod batch_norm;
 mod conv_transpose;
 mod dropout;
+mod gelu;
+mod gelu_contrib;
+mod group_norm;
 mod instance_norm;
 mod layer_norm;
+mod lp_norm;
 mod lrn;
+mod mat_mul_nbits;
+mod mish;
+mod multi_head_attention;
+mod mvn;
 mod reduce;
+mod rms_norm;
+mod rms_norm_contrib;
+mod rotary_embedding;
+mod skip_layer_norm;
 
 pub fn arg_max_min(
     _ctx: &ParsingContext,
@@ -44,14 +57,18 @@ pub fn register_all_ops(reg: &mut OnnxOpRegister) {
     reg.insert("GlobalAveragePool", |_, _| Ok((expand(ops::nn::GlobalAvgPool), vec![])));
     reg.insert("GlobalLpPool", global_lp_pool);
     reg.insert("GlobalMaxPool", |_, _| Ok((expand(ops::nn::GlobalMaxPool), vec![])));
+    reg.insert("GroupNormalization", group_norm::group_normalization);
     reg.insert("Hardmax", layer_hard_max);
     reg.insert("HardSigmoid", hard_sigmoid);
     reg.insert("InstanceNormalization", instance_norm::instance_normalization);
     reg.insert("LayerNormalization", layer_norm::layer_norm);
     reg.insert("LeakyRelu", leaky_relu);
+    reg.insert("LpNormalization", lp_norm::lp_normalization);
     reg.insert("LogSoftmax", layer_log_soft_max);
     reg.insert("LRN", lrn::lrn);
+    reg.insert("MatMulNBits", mat_mul_nbits::mat_mul_nbits);
     reg.insert("MaxPool", max_pool);
+    reg.insert("MeanVarianceNormalization", mvn::mean_variance_normalization);
     reg.insert("ParametricSoftplus", parametric_softplus);
     reg.insert("QLinearConv", conv_qlinear);
     reg.insert("PRelu", |_, _| Ok((expand(Prelu), vec![])));
@@ -71,8 +88,24 @@ pub fn register_all_ops(reg: &mut OnnxOpRegister) {
     reg.insert("ThresholdedRelu", thresholded_relu);
     reg.insert("Selu", selu);
     reg.insert("Sigmoid", |_, _| Ok((ops::nn::sigmoid().into_hir(), vec![])));
+    reg.insert("Attention", attention::attention);
+    reg.insert("Gelu", gelu::gelu);
+    reg.insert("BiasGelu", gelu_contrib::bias_gelu);
+    reg.insert("FastGelu", gelu_contrib::fast_gelu);
+    reg.insert("QuickGelu", gelu_contrib::quick_gelu);
     reg.insert("HardSwish", |_, _| Ok((ops::nn::hard_swish().into_hir(), vec![])));
+    reg.insert("Mish", |_, _| Ok((expand(mish::Mish), vec![])));
+    reg.insert("MultiHeadAttention", multi_head_attention::multi_head_attention);
+    reg.insert("RMSNormalization", rms_norm::rms_normalization);
+    reg.insert("RotaryEmbedding", rotary_embedding::rotary_embedding);
+    reg.insert("SimplifiedLayerNormalization", rms_norm::rms_normalization);
+    reg.insert(
+        "SkipSimplifiedLayerNormalization",
+        rms_norm_contrib::skip_simplified_layer_normalization,
+    );
+    reg.insert("SkipLayerNormalization", skip_layer_norm::skip_layer_normalization);
     reg.insert("Softmax", layer_soft_max);
+    reg.insert("Swish", |_, _| Ok((tract_core::ops::nn::silu::silu().into_hir(), vec![])));
     reg.insert("Softplus", |_, _| Ok((expand(ops::activations::Softplus), vec![])));
     reg.insert("Softsign", |_, _| Ok((expand(ops::activations::Softsign), vec![])));
 }
diff --git a/onnx/src/ops/nn/multi_head_attention.rs b/onnx/src/ops/nn/multi_head_attention.rs
new file mode 100644
index 0000000000..7de8dc5050
--- /dev/null
+++ b/onnx/src/ops/nn/multi_head_attention.rs
@@ -0,0 +1,150 @@
+use crate::model::{ParsingContext, optional_outputs};
+use crate::pb::NodeProto;
+use tract_core::ops::change_axes::AxisOp;
+use tract_hir::internal::*;
+use tract_transformers::ops::sdpa::Sdpa;
+
+// com.microsoft MultiHeadAttention (scoped: unpacked Q/K/V, bidirectional).
+//   inputs:  query(0), key(1), value(2), bias(3?), key_padding_mask(4?), attention_bias(5?),
+//            past_key(6?), past_value(7?)
+//   outputs: output(0), present_key(1?), present_value(2?)
+// Standard (non-causal) multi-head attention lowered onto Sdpa. Bias, masks, packed QKV and
+// past KV cache are rejected with clear errors.
+pub fn multi_head_attention(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let num_heads: usize = node.get_attr("num_heads")?;
+    let scale = node.get_attr_opt::<f32>("scale")?;
+    ensure!(
+        node.input.len() >= 3 && !node.input[1].is_empty() && !node.input[2].is_empty(),
+        "MultiHeadAttention: requires unpacked query, key and value inputs"
+    );
+    for i in 3..node.input.len() {
+        ensure!(
+            node.input[i].is_empty(),
+            "MultiHeadAttention: optional input #{i} (bias / mask / past KV) is unsupported"
+        );
+    }
+    let mut oo = optional_outputs(node).skip(1);
+    let present_k = oo.next().unwrap().is_some();
+    let present_v = oo.next().unwrap().is_some();
+    Ok((expand(MultiHeadAttention { num_heads, scale, present_k, present_v }), vec![]))
+}
+
+#[derive(Debug, Clone)]
+struct MultiHeadAttention {
+    num_heads: usize,
+    scale: Option<f32>,
+    present_k: bool,
+    present_v: bool,
+}
+
+// [B, S, heads*head_size] -> [B, heads, S, head_size]
+fn to_4d(
+    model: &mut TypedModel,
+    prefix: &str,
+    x: OutletId,
+    total: TDim,
+    heads: usize,
+) -> TractResult<OutletId> {
+    let head_dim = total.clone() / heads;
+    let reshaped = model.wire_node(
+        format!("{prefix}.reshape"),
+        AxisOp::Reshape(2, tvec![total], tvec![heads.to_dim(), head_dim]),
+        &[x],
+    )?[0];
+    Ok(model.wire_node(format!("{prefix}.transpose"), AxisOp::Move(2, 1), &[reshaped])?[0])
+}
+
+impl Expansion for MultiHeadAttention {
+    fn name(&self) -> StaticName {
+        "MultiHeadAttention".into()
+    }
+
+    fn nboutputs(&self) -> TractResult<usize> {
+        Ok(1 + self.present_k as usize + self.present_v as usize)
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_input_arity(inputs, 3)?;
+        check_output_arity(outputs, self.nboutputs()?)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        let nh = self.num_heads;
+        if self.present_k {
+            s.equals(&inputs[0].datum_type, &outputs[1].datum_type)?;
+            s.given(&inputs[1].shape, move |s, ks| {
+                s.equals(
+                    &outputs[1].shape,
+                    tvec![ks[0].clone(), nh.to_dim(), ks[1].clone(), ks[2].clone() / nh],
+                )
+            })?;
+        }
+        if self.present_v {
+            let vi = 1 + self.present_k as usize;
+            s.equals(&inputs[0].datum_type, &outputs[vi].datum_type)?;
+            s.given(&inputs[2].shape, move |s, vs| {
+                s.equals(
+                    &outputs[vi].shape,
+                    tvec![vs[0].clone(), nh.to_dim(), vs[1].clone(), vs[2].clone() / nh],
+                )
+            })?;
+        }
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let q_fact = model.outlet_fact(inputs[0])?.clone();
+        let dt = q_fact.datum_type;
+        ensure!(q_fact.rank() == 3, "MultiHeadAttention: expected 3D query [B, S, hidden]");
+        let q_hidden = q_fact.shape[2].clone();
+        let k_hidden = model.outlet_fact(inputs[1])?.shape[2].clone();
+        let v_hidden = model.outlet_fact(inputs[2])?.shape[2].clone();
+
+        let q4 = to_4d(model, &format!("{prefix}.q"), inputs[0], q_hidden, self.num_heads)?;
+        let k4 = to_4d(model, &format!("{prefix}.k"), inputs[1], k_hidden, self.num_heads)?;
+        let v4 = to_4d(model, &format!("{prefix}.v"), inputs[2], v_hidden, self.num_heads)?;
+
+        // Bidirectional attention (no causal mask). Sdpa default scale is 1/sqrt(head_dim).
+        let sdpa = Sdpa {
+            scale: self.scale.map(tensor0),
+            datum_type: dt,
+            acc_datum_type: DatumType::F32,
+            is_causal: false,
+        };
+        let y4 = model.wire_node(format!("{prefix}.sdpa"), sdpa, &[q4, k4, v4])?[0];
+
+        let y_t = model.wire_node(format!("{prefix}.y_transpose"), AxisOp::Move(1, 2), &[y4])?[0];
+        let yf = model.outlet_fact(y4)?.clone();
+        let (heads_dim, head_dim) = (yf.shape[1].clone(), yf.shape[3].clone());
+        let y = model.wire_node(
+            format!("{prefix}.y_reshape"),
+            AxisOp::Reshape(
+                2,
+                tvec![heads_dim.clone(), head_dim.clone()],
+                tvec![heads_dim * head_dim],
+            ),
+            &[y_t],
+        )?[0];
+
+        let mut out = tvec!(y);
+        if self.present_k {
+            out.push(k4);
+        }
+        if self.present_v {
+            out.push(v4);
+        }
+        Ok(out)
+    }
+}
diff --git a/onnx/src/ops/nn/mvn.rs b/onnx/src/ops/nn/mvn.rs
new file mode 100644
index 0000000000..fefa6af58d
--- /dev/null
+++ b/onnx/src/ops/nn/mvn.rs
@@ -0,0 +1,83 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+use tract_hir::ops::math::{add, div, sqrt, square, sub};
+use tract_hir::ops::nn::{Reduce, Reducer};
+
+// ONNX MeanVarianceNormalization is defined as a function:
+//   mean   = ReduceMean(X, axes)
+//   var    = ReduceMean(X^2, axes) - mean^2
+//   Y      = (X - mean) / (Sqrt(var) + 1e-9)
+// Note epsilon (1e-9) is added *outside* the square root, and is fixed (not an attribute).
+const EPSILON: f32 = 1e-9;
+
+pub fn mean_variance_normalization(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let axes = node.get_attr_opt_vec("axes")?.unwrap_or_else(|| vec![0, 2, 3]);
+    Ok((expand(MeanVarianceNorm { axes }), vec![]))
+}
+
+#[derive(Debug, Clone, new)]
+struct MeanVarianceNorm {
+    axes: Vec<i64>,
+}
+
+impl Expansion for MeanVarianceNorm {
+    fn name(&self) -> StaticName {
+        "MeanVarianceNorm".into()
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_input_arity(inputs, 1)?;
+        check_output_arity(outputs, 1)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let input_fact = model.outlet_fact(inputs[0])?.clone();
+        let rank = input_fact.rank() as i64;
+        let dt = input_fact.datum_type;
+        let axes: Vec<i64> = self.axes.iter().map(|&a| if a < 0 { a + rank } else { a }).collect();
+
+        let mean = Reduce::new(Some(axes.clone()), true, Reducer::Mean).wire(
+            &format!("{prefix}.mean"),
+            model,
+            &inputs[0..1],
+        )?;
+        let x_sq = model.wire_node(format!("{prefix}.sq"), square(), &inputs[0..1])?;
+        let ex_sq = Reduce::new(Some(axes), true, Reducer::Mean).wire(
+            &format!("{prefix}.ex_sq"),
+            model,
+            &x_sq,
+        )?;
+        let mean_sq = model.wire_node(format!("{prefix}.mean_sq"), square(), &mean)?;
+        let var = model.wire_node(format!("{prefix}.var"), sub(), &[ex_sq[0], mean_sq[0]])?;
+        let std = model.wire_node(format!("{prefix}.std"), sqrt(), &var)?;
+        let eps = model
+            .add_const(format!("{prefix}.eps"), tensor0(EPSILON).cast_to_dt(dt)?.into_owned())?;
+        let std_eps =
+            wire_with_rank_broadcast(format!("{prefix}.std_eps"), model, add(), &[std[0], eps])?;
+        let centered = wire_with_rank_broadcast(
+            format!("{prefix}.centered"),
+            model,
+            sub(),
+            &[inputs[0], mean[0]],
+        )?;
+        wire_with_rank_broadcast(prefix, model, div(), &[centered[0], std_eps[0]])
+    }
+}
diff --git a/onnx/src/ops/nn/rms_norm.rs b/onnx/src/ops/nn/rms_norm.rs
new file mode 100644
index 0000000000..1b9d72c2a5
--- /dev/null
+++ b/onnx/src/ops/nn/rms_norm.rs
@@ -0,0 +1,90 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_core::ops::cast::cast;
+use tract_core::ops::math::{add, mul, rsqrt};
+use tract_core::ops::nn::{Reduce, Reducer};
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+pub fn rms_normalization(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let axis = node.get_attr_opt::<isize>("axis")?.unwrap_or(-1);
+    let epsilon = node.get_attr_opt("epsilon")?.unwrap_or(1e-5f32);
+    let have_bias = node.input.len() >= 3 && !node.input[2].is_empty();
+    Ok((expand(RmsNormalization { axis, epsilon, have_bias }), vec![]))
+}
+
+#[derive(Debug, Clone)]
+struct RmsNormalization {
+    axis: isize,
+    epsilon: f32,
+    have_bias: bool,
+}
+
+impl Expansion for RmsNormalization {
+    fn name(&self) -> StaticName {
+        "RmsNormalization".into()
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_input_arity(inputs, 2 + self.have_bias as usize)?;
+        check_output_arity(outputs, 1)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let x_fact = model.outlet_fact(inputs[0])?.clone();
+        let rank = x_fact.rank();
+        let axis =
+            if self.axis < 0 { (self.axis + rank as isize) as usize } else { self.axis as usize };
+        let dt = x_fact.datum_type;
+        let stash_dt = DatumType::F32;
+
+        let axes: TVec<usize> = (axis..rank).collect();
+
+        let x_cast = model.wire_node(format!("{prefix}.cast_x"), cast(stash_dt), &[inputs[0]])?[0];
+        let mean_sq = model.wire_node(
+            format!("{prefix}.mean_sq"),
+            Reduce { axes, reducer: Reducer::MeanOfSquares },
+            &[x_cast],
+        )?[0];
+        let eps = model.add_const(
+            format!("{prefix}.eps"),
+            tensor0(self.epsilon).cast_to_dt(stash_dt)?.into_owned(),
+        )?;
+        let mean_sq_eps =
+            wire_with_rank_broadcast(format!("{prefix}.add_eps"), model, add(), &[mean_sq, eps])?
+                [0];
+        let inv_rms = model.wire_node(format!("{prefix}.rsqrt"), rsqrt(), &[mean_sq_eps])?[0];
+        let normalized =
+            wire_with_rank_broadcast(format!("{prefix}.norm"), model, mul(), &[x_cast, inv_rms])?
+                [0];
+        let normalized_cast =
+            model.wire_node(format!("{prefix}.cast_out"), cast(dt), &[normalized])?[0];
+        let scaled = wire_with_rank_broadcast(
+            format!("{prefix}.scaled"),
+            model,
+            mul(),
+            &[normalized_cast, inputs[1]],
+        )?[0];
+        if self.have_bias {
+            wire_with_rank_broadcast(prefix, model, add(), &[scaled, inputs[2]])
+        } else {
+            Ok(tvec![scaled])
+        }
+    }
+}
diff --git a/onnx/src/ops/nn/rms_norm_contrib.rs b/onnx/src/ops/nn/rms_norm_contrib.rs
new file mode 100644
index 0000000000..de9e6ce3d1
--- /dev/null
+++ b/onnx/src/ops/nn/rms_norm_contrib.rs
@@ -0,0 +1,131 @@
+use crate::model::{ParsingContext, optional_outputs};
+use crate::pb::NodeProto;
+use tract_core::ops::cast::cast;
+use tract_core::ops::math::{add, mul, rsqrt};
+use tract_core::ops::nn::{Reduce, Reducer};
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+// com.microsoft SkipSimplifiedLayerNormalization:
+//   input_skip_bias_sum = input + skip   (ORT >= 1.19 does NOT apply the optional bias here)
+//   output              = RMSNorm(input_skip_bias_sum, last axis) * gamma
+// Outputs: output(0), mean(1, unsupported), inv_std_var(2, opt), input_skip_bias_sum(3, opt).
+pub fn skip_simplified_layer_normalization(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let epsilon = node.get_attr_opt("epsilon")?.unwrap_or(1e-5f32);
+    let mut oo = optional_outputs(node).skip(1);
+    let mean_out = oo.next().unwrap();
+    let invstd_out = oo.next().unwrap();
+    let sum_out = oo.next().unwrap();
+    ensure!(mean_out.is_none(), "SkipSimplifiedLayerNormalization: mean output is unsupported");
+    Ok((
+        expand(SkipSimplifiedLayerNorm {
+            epsilon,
+            invstd: invstd_out.is_some(),
+            sum: sum_out.is_some(),
+        }),
+        vec![],
+    ))
+}
+
+#[derive(Debug, Clone)]
+struct SkipSimplifiedLayerNorm {
+    epsilon: f32,
+    invstd: bool,
+    sum: bool,
+}
+
+impl Expansion for SkipSimplifiedLayerNorm {
+    fn name(&self) -> StaticName {
+        "SkipSimplifiedLayerNorm".into()
+    }
+
+    fn nboutputs(&self) -> TractResult<usize> {
+        Ok(1 + self.invstd as usize + self.sum as usize)
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        // (input, skip, gamma[, bias]); ORT does not apply the optional bias for the simplified
+        // variant, so a 4th input is accepted but ignored.
+        ensure!(
+            inputs.len() == 3 || inputs.len() == 4,
+            "SkipSimplifiedLayerNormalization expects 3 or 4 inputs, got {}",
+            inputs.len()
+        );
+        check_output_arity(outputs, self.nboutputs()?)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        if self.sum {
+            // input_skip_bias_sum has the same type/shape as the input; it is the last output.
+            let si = 1 + self.invstd as usize;
+            s.equals(&inputs[0].datum_type, &outputs[si].datum_type)?;
+            s.equals(&inputs[0].shape, &outputs[si].shape)?;
+        }
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let fact = model.outlet_fact(inputs[0])?.clone();
+        let rank = fact.rank();
+        let dt = fact.datum_type;
+        let stash = DatumType::F32;
+
+        // input + skip. NB: ORT (>= 1.19) does not apply the optional bias input for the simplified
+        // (RMSNorm) variant -- verified empirically -- so we match that and ignore it. RMSNorm is
+        // biasless in practice (Llama/Phi).
+        let sum = wire_with_rank_broadcast(
+            format!("{prefix}.skip"),
+            model,
+            add(),
+            &[inputs[0], inputs[1]],
+        )?[0];
+
+        // RMSNorm over the last axis (in f32), then scale by gamma.
+        let x_cast = model.wire_node(format!("{prefix}.cast_x"), cast(stash), &[sum])?[0];
+        let mean_sq = model.wire_node(
+            format!("{prefix}.mean_sq"),
+            Reduce { axes: tvec![rank - 1], reducer: Reducer::MeanOfSquares },
+            &[x_cast],
+        )?[0];
+        let eps = model.add_const(
+            format!("{prefix}.eps"),
+            tensor0(self.epsilon).cast_to_dt(stash)?.into_owned(),
+        )?;
+        let mean_sq_eps =
+            wire_with_rank_broadcast(format!("{prefix}.add_eps"), model, add(), &[mean_sq, eps])?
+                [0];
+        let inv_rms = model.wire_node(format!("{prefix}.rsqrt"), rsqrt(), &[mean_sq_eps])?[0];
+        let normalized =
+            wire_with_rank_broadcast(format!("{prefix}.norm"), model, mul(), &[x_cast, inv_rms])?
+                [0];
+        let normalized_cast =
+            model.wire_node(format!("{prefix}.cast_out"), cast(dt), &[normalized])?[0];
+        let output = wire_with_rank_broadcast(
+            format!("{prefix}.scaled"),
+            model,
+            mul(),
+            &[normalized_cast, inputs[2]],
+        )?[0];
+
+        let mut outputs = tvec!(output);
+        if self.invstd {
+            outputs.push(inv_rms);
+        }
+        if self.sum {
+            outputs.push(sum);
+        }
+        Ok(outputs)
+    }
+}
diff --git a/onnx/src/ops/nn/rotary_embedding.rs b/onnx/src/ops/nn/rotary_embedding.rs
new file mode 100644
index 0000000000..e01a13a661
--- /dev/null
+++ b/onnx/src/ops/nn/rotary_embedding.rs
@@ -0,0 +1,211 @@
+use crate::model::ParsingContext;
+use crate::pb::NodeProto;
+use tract_core::ops::array::{Gather, Slice, TypedConcat};
+use tract_core::ops::math::{add, mul, sub};
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+// ONNX RotaryEmbedding (opset 23). Mirrors onnx/reference/ops/op_rotary_embedding.py:
+//   * normalize input to [batch, seq, heads, head_size]
+//   * gather cos/sin caches by position_ids (when provided)
+//   * rotate the first `rotary_embedding_dim` channels (NeoX halves, or GPT-J interleaved pairs)
+//   * concatenate the untouched tail back and restore the original layout
+pub fn rotary_embedding(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let interleaved = node.get_attr_opt::<i64>("interleaved")?.unwrap_or(0) != 0;
+    let num_heads = node.get_attr_opt::<i64>("num_heads")?.unwrap_or(0) as usize;
+    let rotary_embedding_dim =
+        node.get_attr_opt::<i64>("rotary_embedding_dim")?.unwrap_or(0) as usize;
+    Ok((expand(RotaryEmbedding { interleaved, num_heads, rotary_embedding_dim }), vec![]))
+}
+
+#[derive(Debug, Clone, new)]
+struct RotaryEmbedding {
+    interleaved: bool,
+    num_heads: usize,
+    rotary_embedding_dim: usize,
+}
+
+impl Expansion for RotaryEmbedding {
+    fn name(&self) -> StaticName {
+        "RotaryEmbedding".into()
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        ensure!(
+            inputs.len() == 3 || inputs.len() == 4,
+            "RotaryEmbedding expects 3 or 4 inputs, got {}",
+            inputs.len()
+        );
+        check_output_arity(outputs, 1)?;
+        // Output keeps the input tensor's type and shape.
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let in_fact = model.outlet_fact(inputs[0])?.clone();
+        let in_rank = in_fact.rank();
+        ensure!(in_rank == 3 || in_rank == 4, "RotaryEmbedding expects rank 3 or 4, got {in_rank}");
+        let has_position_ids = inputs.len() == 4;
+        let two = 2usize.to_dim();
+
+        // 1. Normalize input to [batch, seq, heads, head_size].
+        let x = if in_rank == 4 {
+            // [B, N, S, H] -> [B, S, N, H]
+            model.wire_node(format!("{prefix}.to_bsnh"), AxisOp::Move(1, 2), &[inputs[0]])?[0]
+        } else {
+            // [B, S, hidden] -> [B, S, N, H]
+            ensure!(self.num_heads > 0, "RotaryEmbedding with a 3D input requires num_heads");
+            let hidden = in_fact.shape[2].clone();
+            let head_size = hidden.clone().div_ceil(self.num_heads as u64);
+            model.wire_node(
+                format!("{prefix}.split_heads"),
+                AxisOp::Reshape(2, tvec![hidden], tvec![self.num_heads.to_dim(), head_size]),
+                &[inputs[0]],
+            )?[0]
+        };
+
+        let head_size = model.outlet_fact(x)?.shape[3].clone();
+        let rotary_dim = if self.rotary_embedding_dim == 0 {
+            head_size.clone()
+        } else {
+            self.rotary_embedding_dim.to_dim()
+        };
+        let half = rotary_dim.clone().div_ceil(2);
+
+        // 2. Split off the rotary part (and the pass-through tail when partial).
+        let x_rotate = model.wire_node(
+            format!("{prefix}.rotary"),
+            Slice::new(3, 0, rotary_dim.clone()),
+            &[x],
+        )?[0];
+        let passthrough = if rotary_dim != head_size {
+            Some(
+                model.wire_node(
+                    format!("{prefix}.passthrough"),
+                    Slice::new(3, rotary_dim.clone(), head_size.clone()),
+                    &[x],
+                )?[0],
+            )
+        } else {
+            None
+        };
+
+        // 3. Prepare cos/sin as [B, S, 1, half] (gathering by position_ids if present).
+        let mut prep = |tag: &str, cache: usize| -> TractResult<OutletId> {
+            let gathered = if has_position_ids {
+                model.wire_node(
+                    format!("{prefix}.{tag}_gather"),
+                    Gather::new(0),
+                    &[inputs[cache], inputs[3]],
+                )?[0]
+            } else {
+                inputs[cache]
+            };
+            Ok(model.wire_node(format!("{prefix}.{tag}_unsqueeze"), AxisOp::Add(2), &[gathered])?
+                [0])
+        };
+        let cos = prep("cos", 1)?;
+        let sin = prep("sin", 2)?;
+
+        // 4. Extract the two rotated components.
+        let (x1, x2) = if self.interleaved {
+            // [.., rotary_dim] -> [.., half, 2], then take the even/odd lanes.
+            let pairs = model.wire_node(
+                format!("{prefix}.pairs"),
+                AxisOp::Reshape(3, tvec![rotary_dim.clone()], tvec![half.clone(), two.clone()]),
+                &[x_rotate],
+            )?[0];
+            let even = model.wire_node(format!("{prefix}.even"), Slice::new(4, 0, 1), &[pairs])?[0];
+            let x1 = model.wire_node(format!("{prefix}.even_sq"), AxisOp::Rm(4), &[even])?[0];
+            let odd = model.wire_node(format!("{prefix}.odd"), Slice::new(4, 1, 2), &[pairs])?[0];
+            let x2 = model.wire_node(format!("{prefix}.odd_sq"), AxisOp::Rm(4), &[odd])?[0];
+            (x1, x2)
+        } else {
+            let x1 = model.wire_node(
+                format!("{prefix}.x1"),
+                Slice::new(3, 0, half.clone()),
+                &[x_rotate],
+            )?[0];
+            let x2 = model.wire_node(
+                format!("{prefix}.x2"),
+                Slice::new(3, half.clone(), rotary_dim.clone()),
+                &[x_rotate],
+            )?[0];
+            (x1, x2)
+        };
+
+        // 5. real = cos*x1 - sin*x2 ; imag = sin*x1 + cos*x2
+        let cos_x1 =
+            wire_with_rank_broadcast(format!("{prefix}.cos_x1"), model, mul(), &[cos, x1])?[0];
+        let sin_x2 =
+            wire_with_rank_broadcast(format!("{prefix}.sin_x2"), model, mul(), &[sin, x2])?[0];
+        let real =
+            wire_with_rank_broadcast(format!("{prefix}.real"), model, sub(), &[cos_x1, sin_x2])?[0];
+        let sin_x1 =
+            wire_with_rank_broadcast(format!("{prefix}.sin_x1"), model, mul(), &[sin, x1])?[0];
+        let cos_x2 =
+            wire_with_rank_broadcast(format!("{prefix}.cos_x2"), model, mul(), &[cos, x2])?[0];
+        let imag =
+            wire_with_rank_broadcast(format!("{prefix}.imag"), model, add(), &[sin_x1, cos_x2])?[0];
+
+        // 6. Reassemble the rotated channels.
+        let rotated = if self.interleaved {
+            let real5 = model.wire_node(format!("{prefix}.real_unsq"), AxisOp::Add(4), &[real])?[0];
+            let imag5 = model.wire_node(format!("{prefix}.imag_unsq"), AxisOp::Add(4), &[imag])?[0];
+            let interleaved = model.wire_node(
+                format!("{prefix}.interleave"),
+                TypedConcat::new(4),
+                &[real5, imag5],
+            )?[0];
+            model.wire_node(
+                format!("{prefix}.merge_pairs"),
+                AxisOp::Reshape(3, tvec![half.clone(), two.clone()], tvec![rotary_dim.clone()]),
+                &[interleaved],
+            )?[0]
+        } else {
+            model.wire_node(
+                format!("{prefix}.concat_halves"),
+                TypedConcat::new(3),
+                &[real, imag],
+            )?[0]
+        };
+
+        // 7. Re-attach the pass-through tail.
+        let out_bsnh = if let Some(pt) = passthrough {
+            model.wire_node(format!("{prefix}.concat_tail"), TypedConcat::new(3), &[rotated, pt])?
+                [0]
+        } else {
+            rotated
+        };
+
+        // 8. Restore the original layout.
+        let out = if in_rank == 4 {
+            // [B, S, N, H] -> [B, N, S, H]
+            model.wire_node(prefix.to_string(), AxisOp::Move(2, 1), &[out_bsnh])?
+        } else {
+            let hidden = in_fact.shape[2].clone();
+            let head_size = hidden.clone().div_ceil(self.num_heads as u64);
+            model.wire_node(
+                prefix.to_string(),
+                AxisOp::Reshape(2, tvec![self.num_heads.to_dim(), head_size], tvec![hidden]),
+                &[out_bsnh],
+            )?
+        };
+        Ok(out)
+    }
+}
diff --git a/onnx/src/ops/nn/skip_layer_norm.rs b/onnx/src/ops/nn/skip_layer_norm.rs
new file mode 100644
index 0000000000..c0a61f288e
--- /dev/null
+++ b/onnx/src/ops/nn/skip_layer_norm.rs
@@ -0,0 +1,154 @@
+use crate::model::{ParsingContext, optional_outputs};
+use crate::pb::NodeProto;
+use tract_core::ops::cast::cast;
+use tract_core::ops::math::{add, div, mul, rsqrt, square, sub};
+use tract_core::ops::nn::{Reduce, Reducer};
+use tract_hir::internal::*;
+use tract_hir::ops::logic::wire_with_rank_broadcast;
+
+// com.microsoft SkipLayerNormalization:
+//   input_skip_bias_sum = input + skip (+ bias)
+//   output = LayerNorm(input_skip_bias_sum, last axis) * gamma (+ beta)
+// Inputs:  input(0), skip(1), gamma(2), beta(3, opt), bias(4, opt)
+// Outputs: output(0), mean(1, opt), inv_std_var(2, opt), input_skip_bias_sum(3, opt)
+pub fn skip_layer_normalization(
+    _ctx: &ParsingContext,
+    node: &NodeProto,
+) -> TractResult<(Box<dyn InferenceOp>, Vec<String>)> {
+    let epsilon = node.get_attr_opt("epsilon")?.unwrap_or(1e-5f32);
+    let have_beta = node.input.len() >= 4 && !node.input[3].is_empty();
+    let have_bias = node.input.len() >= 5 && !node.input[4].is_empty();
+    let mut oo = optional_outputs(node).skip(1);
+    let mean = oo.next().unwrap().is_some();
+    let invstd = oo.next().unwrap().is_some();
+    let sum = oo.next().unwrap().is_some();
+    Ok((expand(SkipLayerNorm { epsilon, have_beta, have_bias, mean, invstd, sum }), vec![]))
+}
+
+#[derive(Debug, Clone)]
+struct SkipLayerNorm {
+    epsilon: f32,
+    have_beta: bool,
+    have_bias: bool,
+    mean: bool,
+    invstd: bool,
+    sum: bool,
+}
+
+impl Expansion for SkipLayerNorm {
+    fn name(&self) -> StaticName {
+        "SkipLayerNorm".into()
+    }
+
+    fn nboutputs(&self) -> TractResult<usize> {
+        Ok(1 + self.mean as usize + self.invstd as usize + self.sum as usize)
+    }
+
+    fn rules<'r, 'p: 'r, 's: 'r>(
+        &'s self,
+        s: &mut Solver<'r>,
+        inputs: &'p [TensorProxy],
+        outputs: &'p [TensorProxy],
+    ) -> InferenceResult {
+        check_input_arity(inputs, 3 + self.have_beta as usize + self.have_bias as usize)?;
+        check_output_arity(outputs, self.nboutputs()?)?;
+        s.equals(&inputs[0].datum_type, &outputs[0].datum_type)?;
+        s.equals(&inputs[0].shape, &outputs[0].shape)?;
+        if self.sum {
+            let si = 1 + self.mean as usize + self.invstd as usize;
+            s.equals(&inputs[0].datum_type, &outputs[si].datum_type)?;
+            s.equals(&inputs[0].shape, &outputs[si].shape)?;
+        }
+        Ok(())
+    }
+
+    fn wire(
+        &self,
+        prefix: &str,
+        model: &mut TypedModel,
+        inputs: &[OutletId],
+    ) -> TractResult<TVec<OutletId>> {
+        let fact = model.outlet_fact(inputs[0])?.clone();
+        let rank = fact.rank();
+        let dt = fact.datum_type;
+        let stash = DatumType::F32;
+        let axes: TVec<usize> = tvec![rank - 1];
+
+        // input_skip_bias_sum = input + skip (+ bias)
+        let mut sum = wire_with_rank_broadcast(
+            format!("{prefix}.skip"),
+            model,
+            add(),
+            &[inputs[0], inputs[1]],
+        )?[0];
+        if self.have_bias {
+            sum = wire_with_rank_broadcast(
+                format!("{prefix}.bias"),
+                model,
+                add(),
+                &[sum, inputs[4]],
+            )?[0];
+        }
+
+        // LayerNorm over the last axis, computed in f32.
+        let x = model.wire_node(format!("{prefix}.cast_x"), cast(stash), &[sum])?[0];
+        // mean / var via Sum / count (the core Reducer has no Mean variant).
+        let count: TDim = fact.shape[rank - 1].clone();
+        let count = model.add_const(format!("{prefix}.count"), tensor0(count))?;
+        let count = model.wire_node(format!("{prefix}.count_f32"), cast(stash), &[count])?[0];
+        let sum_x = model.wire_node(
+            format!("{prefix}.sum_x"),
+            Reduce { axes: axes.clone(), reducer: Reducer::Sum },
+            &[x],
+        )?[0];
+        let mean =
+            wire_with_rank_broadcast(format!("{prefix}.mean"), model, div(), &[sum_x, count])?[0];
+        let d = wire_with_rank_broadcast(format!("{prefix}.d"), model, sub(), &[x, mean])?[0];
+        let dd = model.wire_node(format!("{prefix}.dd"), square(), &[d])?[0];
+        let sum_dd = model.wire_node(
+            format!("{prefix}.sum_dd"),
+            Reduce { axes, reducer: Reducer::Sum },
+            &[dd],
+        )?[0];
+        let var =
+            wire_with_rank_broadcast(format!("{prefix}.var"), model, div(), &[sum_dd, count])?[0];
+        let eps = model.add_const(
+            format!("{prefix}.eps"),
+            tensor0(self.epsilon).cast_to_dt(stash)?.into_owned(),
+        )?;
+        let var_eps =
+            wire_with_rank_broadcast(format!("{prefix}.var_eps"), model, add(), &[var, eps])?[0];
+        let inv_std = model.wire_node(format!("{prefix}.rsqrt"), rsqrt(), &[var_eps])?[0];
+        let normalized =
+            wire_with_rank_broadcast(format!("{prefix}.norm"), model, mul(), &[d, inv_std])?[0];
+        let normalized = model.wire_node(format!("{prefix}.cast_out"), cast(dt), &[normalized])?[0];
+
+        // scale by gamma (+ beta)
+        let mut output = wire_with_rank_broadcast(
+            format!("{prefix}.scaled"),
+            model,
+            mul(),
+            &[normalized, inputs[2]],
+        )?[0];
+        if self.have_beta {
+            output = wire_with_rank_broadcast(
+                format!("{prefix}.beta"),
+                model,
+                add(),
+                &[output, inputs[3]],
+            )?[0];
+        }
+
+        let mut outputs = tvec!(output);
+        if self.mean {
+            outputs.push(mean);
+        }
+        if self.invstd {
+            outputs.push(inv_std);
+        }
+        if self.sum {
+            outputs.push(sum);
+        }
+        Ok(outputs)
+    }
+}
diff --git a/onnx/src/ops/rec/common.rs b/onnx/src/ops/rec/common.rs
index 13671a7086..af542e3382 100644
--- a/onnx/src/ops/rec/common.rs
+++ b/onnx/src/ops/rec/common.rs
@@ -246,11 +246,8 @@ impl CommonRec {
             });
         }
 
-        let scan_outputs = target.wire_node(
-            prefix,
-            tract_core::ops::scan::Scan::new(body, input_mapping, output_mapping, 0)?,
-            &outer_inputs,
-        )?;
+        let scan = tract_core::ops::scan::Scan::new(body, input_mapping, output_mapping, 0)?;
+        let scan_outputs = target.wire_node(prefix, scan, &outer_inputs)?;
 
         let mut result = tvec!();
         if let Some(slot) = self.optional_y_output {
diff --git a/onnx/src/pb_helpers.rs b/onnx/src/pb_helpers.rs
index ceec7543fc..426e50c363 100644
--- a/onnx/src/pb_helpers.rs
+++ b/onnx/src/pb_helpers.rs
@@ -98,7 +98,11 @@ pub trait AttrScalarType<'a>: 'a + Sized {
 impl<'a> AttrScalarType<'a> for DatumType {
     fn get_attr_opt_scalar(node: &'a NodeProto, name: &str) -> TractResult<Option<Self>> {
         i32::get_attr_opt_scalar(node, name)?
-            .map(|d| tensor_proto::DataType::try_from(d).unwrap().try_into())
+            .map(|d| {
+                tensor_proto::DataType::try_from(d)
+                    .map_err(|e| format_err!("unknown ONNX TensorProto.DataType ({d}): {e}"))?
+                    .try_into()
+            })
             .transpose()
     }
 }
@@ -318,9 +322,13 @@ impl NodeProto {
             Some(attr) => attr,
             _ => return Ok(None),
         };
-        self.expect_attr(name, AttributeType::try_from(attr.r#type).unwrap() == ty, || {
-            format!("{}, got {}", ty, attr.r#type)
+        let attr_ty = AttributeType::try_from(attr.r#type).map_err(|e| {
+            format_err!(
+                "attribute {name} declares unknown ONNX AttributeType ({}): {e}",
+                attr.r#type
+            )
         })?;
+        self.expect_attr(name, attr_ty == ty, || format!("{}, got {}", ty, attr.r#type))?;
         Ok(Some(attr))
     }
 
diff --git a/onnx/src/prost/onnx.rs b/onnx/src/prost/onnx.rs
index 6df67174c1..2a0b419ce9 100644
--- a/onnx/src/prost/onnx.rs
+++ b/onnx/src/prost/onnx.rs
@@ -592,6 +592,20 @@ pub mod tensor_proto {
         /// floating-point number truncated to 16 bits.
         /// This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits.
         Bfloat16 = 16,
+        /// 8-bit floating point, with 4 exponent bits and 3 mantissa bits, NaN only on `S.1111.111`.
+        Float8e4m3fn = 17,
+        /// 8-bit floating point, with 4 exponent bits and 3 mantissa bits, no NaN, no infinities.
+        Float8e4m3fnuz = 18,
+        /// 8-bit floating point, with 5 exponent bits and 2 mantissa bits.
+        Float8e5m2 = 19,
+        /// 8-bit floating point, with 5 exponent bits and 2 mantissa bits, no NaN, no infinities.
+        Float8e5m2fnuz = 20,
+        /// 4-bit unsigned integer.
+        Uint4 = 21,
+        /// 4-bit signed integer (two's complement).
+        Int4 = 22,
+        /// 4-bit floating point, 2 exponent bits and 1 mantissa bit.
+        Float4e2m1 = 23,
     }
     impl DataType {
         /// String value of the enum field names used in the ProtoBuf definition.
@@ -617,6 +631,13 @@ pub mod tensor_proto {
                 DataType::Complex64 => "COMPLEX64",
                 DataType::Complex128 => "COMPLEX128",
                 DataType::Bfloat16 => "BFLOAT16",
+                DataType::Float8e4m3fn => "FLOAT8E4M3FN",
+                DataType::Float8e4m3fnuz => "FLOAT8E4M3FNUZ",
+                DataType::Float8e5m2 => "FLOAT8E5M2",
+                DataType::Float8e5m2fnuz => "FLOAT8E5M2FNUZ",
+                DataType::Uint4 => "UINT4",
+                DataType::Int4 => "INT4",
+                DataType::Float4e2m1 => "FLOAT4E2M1",
             }
         }
     }
diff --git a/onnx/src/tensor.rs b/onnx/src/tensor.rs
index 954c719536..1937f6e434 100644
--- a/onnx/src/tensor.rs
+++ b/onnx/src/tensor.rs
@@ -35,7 +35,10 @@ pub fn translate_inference_fact(
     include_unknown_symbols: bool,
 ) -> TractResult<InferenceFact> {
     let mut fact = InferenceFact::default();
-    fact = fact.with_datum_type(DataType::try_from(t.elem_type).unwrap().try_into()?);
+    let dt: DatumType = DataType::try_from(t.elem_type)
+        .map_err(|e| format_err!("unknown ONNX TensorProto.DataType ({}): {e}", t.elem_type))?
+        .try_into()?;
+    fact = fact.with_datum_type(dt);
     if let Some(shape) = &t.shape {
         let shape: TVec<DimFact> = shape
             .dim
@@ -135,7 +138,9 @@ pub fn load_tensor(
     t: &TensorProto,
     path: Option<&str>,
 ) -> TractResult<Tensor> {
-    let dt = DataType::try_from(t.data_type).unwrap().try_into()?;
+    let dt = DataType::try_from(t.data_type)
+        .map_err(|e| format_err!("unknown ONNX TensorProto.DataType ({}): {e}", t.data_type))?
+        .try_into()?;
     let shape: Vec<usize> = t.dims.iter().map(|&i| i as usize).collect();
     // detect if the tensor is rather in an external file than inside the onnx file directly
     let is_external = t.data_location.is_some()
@@ -208,3 +213,37 @@ pub fn proto_from_reader<R: ::std::io::Read>(mut r: R) -> TractResult<TensorProt
     let b = bytes::Bytes::from(v);
     TensorProto::decode(b).context("Can not parse protobuf input")
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Regression: an ONNX TensorProto with a DataType value that prost
+    // doesn't recognise used to abort the process via `unwrap()` on the
+    // `try_from` error. Now it must produce a normal TractError.
+    #[test]
+    fn unknown_elem_type_does_not_panic() {
+        assert!(DataType::try_from(9999_i32).is_err());
+    }
+
+    // Post-BF16 dtypes (FLOAT8E4M3FN .. FLOAT4E2M1) are now decoded by prost.
+    // tract still doesn't support them in DatumType, but the conversion error
+    // should name the dtype rather than fall through a `_` arm with `Unknown`.
+    #[test]
+    fn newer_dtypes_decode_then_error_cleanly() {
+        for elem_type in 17..=23 {
+            let dt = DataType::try_from(elem_type)
+                .unwrap_or_else(|e| panic!("dtype {elem_type} not decoded by prost: {e}"));
+            let err = DatumType::try_from(dt)
+                .expect_err("tract should not claim support for FLOAT8/INT4/FLOAT4 dtypes yet");
+            let msg = format!("{err:?}");
+            assert!(
+                msg.contains("Float8")
+                    || msg.contains("Uint4")
+                    || msg.contains("Int4")
+                    || msg.contains("Float4"),
+                "DatumType conversion error for {dt:?} did not name the dtype: {msg}",
+            );
+        }
+    }
+}
diff --git a/pulse-opl/src/concat.rs b/pulse-opl/src/concat.rs
index 566bb647c2..6461312de0 100644
--- a/pulse-opl/src/concat.rs
+++ b/pulse-opl/src/concat.rs
@@ -38,7 +38,16 @@ impl TypedOp for PulsedSameAxisConcat {
     as_op!();
 
     fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
-        Ok(tvec!(inputs[0].clone()))
+        // Inputs are (pre_prefix, stream, post_suffix); the streaming output
+        // has the same shape as `stream` (input 1).  The pulsed-op version
+        // builds its PulsedFact from the same input.  Previously this
+        // returned `inputs[0]` — the small constant pre-buffer — which broke
+        // any pass that re-derived the typed shape post-pulsification (e.g.
+        // CUDA/Metal translation walking the pulsified preprocessor: every
+        // downstream op saw the pre-buffer shape instead of the pulse-axis
+        // size and produced collapsed outputs).
+        ensure!(inputs.len() == 3, "Expect 3 inputs");
+        Ok(tvec!(inputs[1].clone()))
     }
 }
 
diff --git a/pulse-opl/src/delay.rs b/pulse-opl/src/delay.rs
index 7b4b874418..579355fb5f 100644
--- a/pulse-opl/src/delay.rs
+++ b/pulse-opl/src/delay.rs
@@ -91,7 +91,12 @@ impl OpState for DelayState {
             if self.buffer.is_none() {
                 let mut shape = input.shape().to_owned();
                 shape[op.axis] = buffered;
-                self.buffer = Some(Tensor::uninitialized_dt(input.datum_type(), &shape)?);
+                // Zero-init: the buffer holds the streaming context preceding the
+                // first pulse, and silence (zero) is the only sensible default.
+                // Uninitialized memory leaks into the first `delay` output frames
+                // and diverges from the GPU op (which zero-inits), making any
+                // per-node comparison meaningless on the warmup region.
+                self.buffer = Some(Tensor::zero_dt(input.datum_type(), &shape)?);
             };
             let mut output = Tensor::uninitialized_dt(input.datum_type(), &output_shape)?;
             self.apply_delay_unchecked(op, &input, &mut output);
diff --git a/pulse-opl/src/lib.rs b/pulse-opl/src/lib.rs
index e620d18610..0deb5c0c44 100644
--- a/pulse-opl/src/lib.rs
+++ b/pulse-opl/src/lib.rs
@@ -6,7 +6,9 @@ mod deconv_delay;
 mod delay;
 mod mask;
 mod pad;
+mod range;
 mod slice;
+mod window;
 
 pub use tract_nnef;
 pub use tract_nnef::tract_core;
@@ -21,7 +23,9 @@ pub mod ops {
     pub use super::delay::{Delay, DelayState};
     pub use super::mask::PulseMask;
     pub use super::pad::PulsePad;
+    pub use super::range::PulsedRange;
     pub use super::slice::PulsedAxisSlice;
+    pub use super::window::WindowOnAxis;
 }
 
 pub trait WithPulse {
@@ -31,7 +35,6 @@ pub trait WithPulse {
 
 impl WithPulse for tract_nnef::framework::Nnef {
     fn enable_pulse(&mut self) {
-        self.enable_tract_core();
         self.registries.push(tract_nnef_registry());
     }
     fn with_pulse(mut self) -> Self {
diff --git a/pulse-opl/src/range.rs b/pulse-opl/src/range.rs
new file mode 100644
index 0000000000..99a906fa48
--- /dev/null
+++ b/pulse-opl/src/range.rs
@@ -0,0 +1,130 @@
+//! `PulsedRange` — pulsified `tract_core::ops::array::Range`.
+//!
+//! A 0-input streaming generator: at pulse `n` it emits
+//! `[start + step·(n·P), …, start + step·((n+1)·P − 1)]` of length `P`
+//! (the pulse size on the streaming axis).  State is just `current_pos`,
+//! mirroring `PulsePadOpState`.
+//!
+//! The op is registered as a `PulsedOp` only — there's no NNEF/typed-side
+//! representation (it lives strictly between the pulsifier and runtime).
+
+use tract_nnef::internal::*;
+use tract_nnef::tract_core::trivial_op_state_freeze;
+
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub struct PulsedRange {
+    /// Output dtype (matches `start`'s dtype on the typed side).
+    pub datum_type: DatumType,
+    /// 0-d const `start` value, of dtype `datum_type` (TDim or i64/…).
+    pub start: Tensor,
+    /// 0-d const `step` value, of dtype `datum_type`.
+    pub step: Tensor,
+    /// Symbolic total length of the range (= `(end - start)/step`).
+    pub stream_dim: TDim,
+    /// Pulse size on the (single) streaming axis.
+    pub pulse: usize,
+}
+
+impl Op for PulsedRange {
+    fn name(&self) -> StaticName {
+        "PulsedRange".into()
+    }
+
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!(
+            "dtype: {:?} start: {:?} step: {:?} dim: {} pulse: {}",
+            self.datum_type, self.start, self.step, self.stream_dim, self.pulse,
+        )])
+    }
+
+    op_as_typed_op!();
+}
+
+impl EvalOp for PulsedRange {
+    fn is_stateless(&self) -> bool {
+        false
+    }
+
+    fn state(
+        &self,
+        _session: &TurnState,
+        _node_id: usize,
+    ) -> TractResult<Option<Box<dyn OpState>>> {
+        Ok(Some(Box::<PulsedRangeState>::default()))
+    }
+}
+
+impl TypedOp for PulsedRange {
+    fn output_facts(&self, _inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
+        Ok(tvec!(self.datum_type.fact([self.pulse.to_dim()])))
+    }
+
+    as_op!();
+}
+
+#[derive(Debug, Clone, Default, Hash, PartialEq, Eq)]
+struct PulsedRangeState {
+    current_pos: usize,
+}
+
+impl OpState for PulsedRangeState {
+    fn eval(
+        &mut self,
+        session: &mut TurnState,
+        op: &dyn Op,
+        _inputs: TVec<TValue>,
+    ) -> TractResult<TVec<TValue>> {
+        let op = op.downcast_ref::<PulsedRange>().ok_or_else(|| format_err!("Wrong Op type"))?;
+        let pulse = op.pulse;
+        let base = self.current_pos;
+        self.current_pos += pulse;
+
+        let tensor = if op.start.datum_type() == TDim::datum_type() {
+            // TDim-input form: `Range::make` materialises as i64 regardless
+            // of `op.datum_type` (which is what `Range::output_facts` writes
+            // for the TDim-input branch — see `core/src/ops/array/range.rs`).
+            let start = op
+                .start
+                .try_as_plain()?
+                .to_scalar::<TDim>()?
+                .eval(&session.resolved_symbols)
+                .to_i64()?;
+            let step = op
+                .step
+                .try_as_plain()?
+                .to_scalar::<TDim>()?
+                .eval(&session.resolved_symbols)
+                .to_i64()?;
+            let data: Vec<i64> =
+                (0..pulse).map(|i| start + step * (base as i64 + i as i64)).collect();
+            tract_nnef::tract_core::ndarray::Array1::from_vec(data).into_dyn().into_tensor()
+        } else {
+            dispatch_numbers!(make_pulse(op.datum_type)(&op.start, &op.step, base, pulse))?
+        };
+
+        Ok(tvec!(tensor.into_tvalue()))
+    }
+}
+
+fn make_pulse<T>(start: &Tensor, step: &Tensor, base: usize, pulse: usize) -> TractResult<Tensor>
+where
+    T: Datum
+        + Copy
+        + tract_num_traits::NumCast
+        + std::ops::Add<Output = T>
+        + std::ops::Mul<Output = T>,
+{
+    let start = *start.try_as_plain()?.to_scalar::<T>()?;
+    let step = *step.try_as_plain()?.to_scalar::<T>()?;
+    let base_t: T = tract_num_traits::cast(base as i64)
+        .ok_or_else(|| format_err!("PulsedRange: base {base} doesn't fit in target dtype"))?;
+    let mut data = Vec::with_capacity(pulse);
+    let mut v = start + step * base_t;
+    for _ in 0..pulse {
+        data.push(v);
+        v = v + step;
+    }
+    Ok(tract_nnef::tract_core::ndarray::Array1::from_vec(data).into_dyn().into_tensor())
+}
+
+trivial_op_state_freeze!(PulsedRangeState);
diff --git a/pulse-opl/src/window.rs b/pulse-opl/src/window.rs
new file mode 100644
index 0000000000..0351596aef
--- /dev/null
+++ b/pulse-opl/src/window.rs
@@ -0,0 +1,122 @@
+//! `WindowOnAxis` — typed op produced by Blockify for banded-mask sections.
+//!
+//! Inserts a static `window` axis after the streaming `axis`, where each
+//! window slot `w ∈ [0, W)` carries a `start`-shifted view of the input:
+//!
+//! ```text
+//! input  shape: [..., S, ...]                       (S at `axis`)
+//! output shape: [..., S, W, ...]                    (W inserted after `axis`)
+//! output[..., s, w, ...] = input[..., s + start + w, ...]
+//!                          if 0 ≤ s + start + w < S
+//!                          0 otherwise
+//! ```
+//!
+//! `start = 0` is the future window (slot 0 = current, ex03).  `start = -(W-1)`
+//! is the past window (slot W-1 = current, ex04).  Mixed values are allowed
+//! provided `start ≤ 0 ≤ start + W - 1` (the window straddles current).
+//!
+//! Pulsification (in `tract_pulse::ops::window`) emits `Delay(0, W-1)` plus a
+//! per-pulse reshape that splits the streaming-axis pulse into `[1, W]` and
+//! retimes `stream.delay` by `-start` so the consumer's logical time anchors
+//! to the *latest* chunk in the buffer when `start < 0` (vs the oldest when
+//! `start = 0`).  The op only ever lives between the Blockify rewrite pass
+//! and pulsification.
+
+use tract_nnef::internal::*;
+use tract_nnef::tract_core::ndarray::{ArrayD, Axis, Slice};
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct WindowOnAxis {
+    pub axis: usize,
+    pub window: usize,
+    /// Offset of slot 0 relative to the streaming index `s`.  Default 0.
+    /// Negative values shift the window into the past (slot 0 looks
+    /// backward); positive values shift into the future (skipping current).
+    pub start: i64,
+    /// Value used to fill window slots that read past the input bounds —
+    /// the leading `-start` slots of pulse 0 (past window), the trailing
+    /// slots after the stream ends (future window).  Defaults to a
+    /// scalar zero of the input dtype.  For chunk-index wires the
+    /// caller passes a sentinel value that makes downstream band-mask
+    /// predicates evaluate to "out of band" at the boundary.
+    pub pad_value: Arc<Tensor>,
+}
+
+impl WindowOnAxis {
+    /// Convenience constructor for the future-window form (slot 0 =
+    /// current) with default zero pad of the given dtype.
+    pub fn future(axis: usize, window: usize, dt: DatumType) -> TractResult<Self> {
+        let pad_value = Tensor::zero_scalar_dt(dt)?.into_arc_tensor();
+        Ok(Self { axis, window, start: 0, pad_value })
+    }
+
+    fn eval_t<T>(&self, input: TValue) -> TractResult<TValue>
+    where
+        T: Datum + Copy,
+    {
+        let input = input.to_plain_array_view::<T>()?;
+        let s = input.shape()[self.axis] as i64;
+        let mut out_shape: Vec<usize> = input.shape().to_vec();
+        out_shape.insert(self.axis + 1, self.window);
+        // Fill out-of-bounds slots with pad_value (scalar) so the caller
+        // can pick boundary semantics — zero for data wires, sentinel for
+        // chunk-index wires that drive a downstream band predicate.
+        let pad: T = self.pad_value.cast_to_scalar::<T>()?;
+        let mut output: ArrayD<T> = ArrayD::from_elem(out_shape, pad);
+        for w in 0..self.window {
+            let shift = self.start + w as i64;
+            // For each w: output[s, w, …] = input[s + shift, …] when in
+            // bounds.  Valid `s` range and corresponding source indices:
+            //   shift ≥ 0: src [shift, S),    dst [0, S-shift)
+            //   shift < 0: src [0, S+shift),  dst [-shift, S)
+            let src_start = shift.max(0).min(s) as usize;
+            let src_end = (s + shift.min(0)).max(0).min(s) as usize;
+            if src_start >= src_end {
+                continue;
+            }
+            let dst_start = (-shift).max(0).min(s) as usize;
+            let dst_end = dst_start + (src_end - src_start);
+            let src = input.slice_axis(Axis(self.axis), Slice::from(src_start..src_end));
+            let mut dst_w = output.index_axis_mut(Axis(self.axis + 1), w);
+            dst_w.slice_axis_mut(Axis(self.axis), Slice::from(dst_start..dst_end)).assign(&src);
+        }
+        Ok(output.into_tensor().into_tvalue())
+    }
+}
+
+impl Op for WindowOnAxis {
+    fn name(&self) -> StaticName {
+        "WindowOnAxis".into()
+    }
+
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!("axis: {} window: {} start: {}", self.axis, self.window, self.start)])
+    }
+
+    op_as_typed_op!();
+}
+
+impl EvalOp for WindowOnAxis {
+    fn is_stateless(&self) -> bool {
+        true
+    }
+
+    fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
+        let input = args_1!(inputs);
+        Ok(tvec!(dispatch_numbers!(Self::eval_t(input.datum_type())(self, input))?))
+    }
+}
+
+impl TypedOp for WindowOnAxis {
+    as_op!();
+
+    fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
+        ensure!(self.axis < inputs[0].rank(), "WindowOnAxis: axis {} out of range", self.axis);
+        ensure!(self.window > 0, "WindowOnAxis: window must be > 0");
+        let mut shape: TVec<TDim> = inputs[0].shape.iter().cloned().collect();
+        shape.insert(self.axis + 1, self.window.to_dim());
+        let mut fact = inputs[0].without_value();
+        fact.shape = shape.into();
+        Ok(tvec!(fact))
+    }
+}
diff --git a/pulse/Cargo.toml b/pulse/Cargo.toml
index ece76961be..698f4061f6 100644
--- a/pulse/Cargo.toml
+++ b/pulse/Cargo.toml
@@ -21,4 +21,5 @@ lazy_static.workspace = true
 log.workspace = true
 serde.workspace = true
 tract-pulse-opl.workspace = true
+tract-transformers.workspace = true
 
diff --git a/pulse/src/blockify.rs b/pulse/src/blockify.rs
new file mode 100644
index 0000000000..10c4cf1f17
--- /dev/null
+++ b/pulse/src/blockify.rs
@@ -0,0 +1,2313 @@
+//! Blockify — typed-model rewrite that factors block-diagonal / banded
+//! attention structure into the graph topology, so the result has a
+//! single streaming axis everywhere and pulsifies under v1's existing
+//! machinery.
+//!
+//! # Recogniser scope
+//!
+//! Banded masks `chunk(axis_a) − chunk(axis_b) ∈ [lower, upper]`
+//! (block-diagonal is the special case `lower == upper == 0`):
+//!
+//!   EinSum([a, b]) producing a multi-T-axis score
+//!   → body op(s) consuming scores and a mask wire whose
+//!     `uniform_tdim` carries one of the recognised AST shapes
+//!     (`Eq(coord_a/k, coord_b/k)` for block-diagonal, or
+//!     `Mul([Ge(upper, D), Ge(D, lower)])` with `D = coord_a/k −
+//!     coord_b/k` for banded — both forms are produced by core
+//!     `reduce()` after Eq/And/Ge propagation)
+//!   → Reduce<Sum> on a streaming axis, contracting EinSum, or
+//!     ScaledMaskedSoftmax + downstream contracting EinSum
+//!
+//! `mask.lower > 0` (purely-future, skipping current) and `mask.upper < 0`
+//! (purely-past) are rejected — they don't appear in attention masks and
+//! would need different pulsifier wiring.
+//!
+//! # Pipeline
+//!
+//! 1. **Detect** quadratic sections globally (`find_quadratic_sections`):
+//!    connected components of multi-T-axis nodes, with at least one
+//!    `uniform_tdim`-annotated wire whose AST decodes to a `MaskForm`.
+//!    Determine the score axis the terminator contracts and translate
+//!    it to mask frame.
+//! 2. **Substitute** the streaming symbol globally `T → k·S` via core's
+//!    `substitute_symbols`.
+//! 3. **Rewrite** one `TypedModelPatch` per section
+//!    (`build_section_patch`).  Sections are independent so patches
+//!    apply in sequence.  A recognised section gets fully rewritten or
+//!    Blockify bails — no partial rewrites silently left for downstream
+//!    pulsification to choke on.
+//!
+//! # Section rewrite
+//!
+//! Three initiator flavours (`wire_initiator` dispatches by op type):
+//!
+//! * **Data EinSum** (`wire_initiator_einsum`): the score-matrix
+//!   producer.  Tap each input, split its T-axis at `k`, and on the
+//!   contracted side wrap with `WindowOnAxis(W) + flatten(W, k) → W·k`
+//!   so the chunked einsum's contracted within-chunk axis carries `W·k`
+//!   rather than `k` elements.
+//! * **uniform_tdim mask head** (`wire_uniform_tdim_initiator`): the
+//!   multi-T-axis Sub/Eq at the top of the mask-construction chain.
+//!   Each input is a single-T-axis chunk-id wire (`chunk_row`,
+//!   `chunk_col`); `chunkify_uniform_tdim_input` taps it, casts TDim →
+//!   I64 (PulsePad's `dispatch_copy_by_size!` fill needs `Copy`), splits
+//!   the T-axis, moves the chunk axis to position 0, and on the
+//!   contracted side wraps with `WindowOnAxis` using a **sentinel pad
+//!   value** so out-of-stream boundary slots produce values way outside
+//!   the band; downstream Ge/Le evaluate to `false` there.  After Sub,
+//!   the result is cast back to the source dtype so downstream body ops
+//!   that tap external constants (e.g. the `0` in `ge(diff, 0)`) match.
+//! * **MultiBroadcastTo** (`wire_initiator_multibroadcastto`): the
+//!   `select(mask, scores, scores * 0.0 + -inf)` false-branch pattern,
+//!   where declutter folds the chain to a `MultiBroadcastTo` of a small
+//!   const up to score's `[T, T]` shape.  The op's input is non-
+//!   streaming, so we tap and rank-bump it to the chunked-frame rank;
+//!   subsequent body-op broadcasting fills in the chunked dims with the
+//!   constant value.
+//!
+//! Body ops (`wire_body`) are replayed op-cloned in the chunked frame.
+//! Each input is one of:
+//!
+//! * **chunked** (in the `chunked` map): pass through.  May or may not
+//!   carry a streaming axis — broadcast constants from the
+//!   MultiBroadcastTo initiator have rank-bumped shape with no streaming
+//!   axis, and that's fine.
+//! * **other external** (taps): rank-bumped with `AddAxis(0)` to match
+//!   the chunked frame so rank-strict consumers (TypedBinOp, …) accept
+//!   them.
+//!
+//! `axes_mapping::track_axis` asserts each chunked input's chunk axis
+//! reaches a unique output axis position — bails if the op would
+//! disconnect it.  Body ops with explicit axis params (currently
+//! `Softmax`) get axis indices shifted by `+1` via
+//! `translate_body_op_axes` to account for the chunk axis inserted at
+//! position 0.
+//!
+//! # Window-slot offsets
+//!
+//! * `contracted_axis == mask.axis_a`: `start = mask.lower` — consumer
+//!   logical chunk `c` on the kept axis (= axis_b), window covers
+//!   `chunk(axis_a) ∈ [c + lower, c + upper]`.
+//! * `contracted_axis == mask.axis_b`: `start = -mask.upper` — kept
+//!   axis = axis_a, window covers `chunk(axis_b) ∈ [c - upper, c -
+//!   lower]`.
+//!
+//! `contracted_axis` lives in mask frame (0 or 1).  Score and mask align
+//! via right-aligned broadcasting; the recogniser translates score-
+//! frame axes from `axes_mapping::track_axis` to mask frame via
+//! `axis - (score_rank - 2)`.
+//!
+//! Output `stream.delay` = `max(0, end_of_window)` chunks (positive when
+//! the window extends past `c`, zero when fully causal).
+//!
+//! For EinSum terminators (e.g. attention's `attn @ V`), auxiliary
+//! inputs whose stream axis tracks through the terminator einsum to the
+//! contracted score axis are also windowed, so all inputs to the
+//! terminator share the same W·k contracted-axis size.
+//!
+//! # Runtime dependencies
+//!
+//! * `tract_pulse_opl::ops::PulsedRange` — pulsifies the source's
+//!   `Range(0, T)` chunk-id chain that
+//!   `chunkify_uniform_tdim_input` taps.  Without it, `Range` falls
+//!   through `NonPulsingWrappingOp` and produces a fresh symbolic shape
+//!   the rest of pulsification can't match.
+//! * `WindowOnAxis::pad_value` — set per-input to either `zero` (data
+//!   wires) or a sentinel (chunk-id wires), depending on the initiator.
+//!
+//! # Known workarounds
+//!
+//! * Sentinel pad value bounded by `i32::MAX/4`: tract's `i64 → TDim`
+//!   tensor cast routes through `i32` (`data/src/tensor.rs:1250`), so
+//!   larger sentinels truncate to small junk and the band predicate
+//!   evaluates true on boundary slots.  REVISIT: fix the cast upstream
+//!   and lift the cap.
+//! * TDim → I64 cast on chunk-id wires before windowing, then back to
+//!   TDim after the chunked Sub: `PulsePad`'s fill uses
+//!   `dispatch_copy_by_size!` which doesn't include TDim (not `Copy`).
+//!   REVISIT: add a clone-fill arm to `PulsePad` for TDim and drop the
+//!   round-trip.
+
+use crate::internal::*;
+use std::collections::{BTreeSet, HashMap};
+use tract_core::axes::AxesMapping;
+use tract_core::model::TypedModelPatch;
+use tract_core::ops::binary::TypedBinOp;
+use tract_core::ops::change_axes::AxisOp;
+use tract_core::ops::einsum::EinSum;
+use tract_core::ops::nn::{Reduce, Reducer};
+use tract_core::transform::ModelTransform;
+use tract_transformers::ops::DiagGather;
+
+/// Configuration for the Blockify ModelTransform.
+#[derive(Debug, Default, Clone, serde::Deserialize)]
+pub struct BlockifyConfig {
+    /// Streaming symbol the model's quadratic sections are quadratic in.
+    /// Defaults to "S" (matches the convention used by the pulse transform).
+    pub symbol: Option<String>,
+}
+
+/// Property key holding the symbol introduced by a Blockify rewrite — the
+/// new (chunk-counting) streaming symbol that downstream consumers (e.g.
+/// the pulse transform) should use.  Stored as a 1-element string tensor.
+pub const BLOCKIFY_CHUNK_SYMBOL: &str = "blockify.chunk_symbol";
+
+/// Property key holding the chunk size `k`.  Pulse values originally
+/// expressed in token-units must be divided by this to convert to
+/// chunk-units after Blockify runs.  Stored as a scalar i64 tensor.
+pub const BLOCKIFY_CHUNK_SIZE: &str = "blockify.chunk_size";
+
+/// Property key holding the original (pre-substitution) streaming symbol
+/// name.  Mostly informational.  Stored as a 1-element string tensor.
+pub const BLOCKIFY_ORIGINAL_SYMBOL: &str = "blockify.original_symbol";
+
+#[derive(Debug)]
+pub struct BlockifyTransform(pub BlockifyConfig);
+
+impl ModelTransform for BlockifyTransform {
+    fn name(&self) -> std::borrow::Cow<'static, str> {
+        "blockify".into()
+    }
+
+    fn transform(&self, model: &mut TypedModel) -> TractResult<()> {
+        let symbol_name = self.0.symbol.as_deref().unwrap_or("S");
+        let stream_sym = model.symbols.sym(symbol_name);
+        let sections = find_quadratic_sections(model, &stream_sym)?;
+        if sections.is_empty() {
+            return Ok(());
+        }
+        let k = sections[0].mask.chunk_size;
+        if !sections.iter().all(|s| s.mask.chunk_size == k) {
+            bail!(
+                "Blockify found multiple quadratic sections with mismatched chunk \
+                 sizes; a single global substitution cannot cover them.  \
+                 Refusing to blockify rather than produce a partial rewrite."
+            );
+        }
+
+        let chunk_sym = model.symbols.new_with_prefix("S");
+        let subs: HashMap<Symbol, TDim> =
+            HashMap::from([(stream_sym.clone(), chunk_sym.to_dim() * k)]);
+        let new_model = model.substitute_symbols(&subs)?;
+        *model = new_model;
+        rewrite_sections(model, &chunk_sym, k)?;
+        model.properties.insert(
+            BLOCKIFY_ORIGINAL_SYMBOL.to_string(),
+            tensor1(&[symbol_name.to_string()]).into_arc_tensor(),
+        );
+        Ok(())
+    }
+}
+
+pub fn has_quadratic_sections(model: &TypedModel, stream_sym: &Symbol) -> TractResult<bool> {
+    Ok(!find_quadratic_sections(model, stream_sym)?.is_empty())
+}
+
+/// Rewrite every quadratic section in `model`.  The model is expected to
+/// already be in post-substitute form (streaming dim = `multiplier · chunk_sym`).
+/// `substitute_multiplier` is recorded as `BLOCKIFY_CHUNK_SIZE` for downstream
+/// pulse-value translation; the section rewrite itself uses each section's
+/// own `mask.chunk_size` for its chunked Reshape.
+pub fn rewrite_sections(
+    model: &mut TypedModel,
+    chunk_sym: &Symbol,
+    substitute_multiplier: i64,
+) -> TractResult<bool> {
+    let sections = find_quadratic_sections(model, chunk_sym)?;
+    if sections.is_empty() {
+        return Ok(false);
+    }
+    let k = sections[0].mask.chunk_size;
+    if !sections.iter().all(|s| s.mask.chunk_size == k) {
+        bail!(
+            "Blockify found multiple quadratic sections with mismatched chunk \
+             sizes; a single global substitution cannot cover them.  \
+             Refusing to blockify rather than produce a partial rewrite."
+        );
+    }
+
+    for sec in &sections {
+        let patch = build_section_patch(model, sec, chunk_sym, sec.mask.chunk_size)?;
+        patch.apply(model)?;
+    }
+
+    model.properties.insert(
+        BLOCKIFY_CHUNK_SYMBOL.to_string(),
+        tensor1(&[format!("{chunk_sym}")]).into_arc_tensor(),
+    );
+    model
+        .properties
+        .insert(BLOCKIFY_CHUNK_SIZE.to_string(), tensor0(substitute_multiplier).into_arc_tensor());
+    Ok(true)
+}
+
+/// Read back the `(chunk_symbol, chunk_size)` ancillary outputs that
+/// `BlockifyTransform` writes to model properties.  Returns `None` if the
+/// model wasn't blockified (or those properties aren't present).
+pub fn blockify_output(model: &TypedModel) -> Option<(Symbol, i64)> {
+    let k = model.properties.get(BLOCKIFY_CHUNK_SIZE)?.cast_to_scalar::<i64>().ok()?;
+    let name_tensor = model.properties.get(BLOCKIFY_CHUNK_SYMBOL)?;
+    let view = name_tensor.to_plain_array_view::<String>().ok()?;
+    let name = view.iter().next()?;
+    Some((model.symbols.sym(name), k))
+}
+
+/// If `einsum_node`'s only multi-T-axis successor in `sec` is a single
+/// DiagGather, return its node id — the pair forms a fused initiator
+/// (DiagGather drives the chunked rewrite; the einsum is tapped through).
+fn section_only_diag_gather_consumer(
+    model: &TypedModel,
+    einsum_node: &TypedNode,
+    sec: &QuadraticSection,
+) -> Option<usize> {
+    let consumers: Vec<_> = model
+        .outlet_successors(OutletId::new(einsum_node.id, 0))
+        .iter()
+        .filter(|s| sec.section.contains(&s.node))
+        .collect();
+    if consumers.len() != 1 {
+        return None;
+    }
+    let dg_id = consumers[0].node;
+    if !model.nodes[dg_id].op_is::<DiagGather>() {
+        return None;
+    }
+    Some(dg_id)
+}
+
+fn streaming_positions(fact: &TypedFact, stream_sym: &Symbol) -> TVec<usize> {
+    fact.shape
+        .iter()
+        .enumerate()
+        .filter(|(_, d)| d.symbols().contains(stream_sym))
+        .map(|(i, _)| i)
+        .collect()
+}
+
+/// A connected subgraph of the typed model where every wire has multi-T-axis
+/// shape (≥2 streaming-symbol axes), bracketed by single-T-axis wires.
+///
+/// Phase 1+2+3 of Blockify recognition produces this structure op-agnostically.
+/// Phase 4 (the rewrite) consumes it and dispatches per op-type.
+#[derive(Debug)]
+struct QuadraticSection {
+    /// All nodes whose output wire has multi-T-axis shape.  The rewriter
+    /// reads `initiators`/`terminators` directly today; the full set is
+    /// kept here because phase 4 body-chain handling (Softmax, Add, etc.)
+    /// will need to walk it.
+    #[allow(dead_code)]
+    section: BTreeSet<usize>,
+    /// Subset of `section` whose inputs are all outside it (= "rise to quadratic").
+    initiators: Vec<usize>,
+    /// Nodes outside `section` consuming an in-section wire (= "drop back to linear").
+    terminators: Vec<usize>,
+    /// Mask form extracted from the section.  Determines the rewrite shape
+    /// (block-diagonal vs banded) and carries the chunk size.
+    mask: MaskForm,
+    /// Score-matrix axis (in the mask's frame, 0 or 1) that is contracted
+    /// by the terminator op.  For Reduce<Sum>, that's the reduced axis.
+    /// For an EinSum terminator, it's the streaming axis of input 0 that
+    /// doesn't appear in the output.  The windowed input(s) are those
+    /// whose stream axis maps to this score axis.
+    contracted_axis: usize,
+}
+
+/// Closed enum of mask forms the recogniser handles today.  All forms are a
+/// banded predicate on `chunk(axis_a) - chunk(axis_b) ∈ [lower, upper]`;
+/// the canonical block-diagonal mask is the special case `lower == upper == 0`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+struct MaskForm {
+    chunk_size: i64,
+    lower: i64,
+    upper: i64,
+    /// Axis whose chunk index appears with positive sign in the diff.
+    axis_a: usize,
+    /// Axis whose chunk index appears negated in the diff.
+    axis_b: usize,
+}
+
+impl MaskForm {
+    fn is_block_diag(&self) -> bool {
+        self.lower == 0 && self.upper == 0
+    }
+}
+
+/// Connected components over the subgraph induced by `nodes` on the model's
+/// dataflow.  Two nodes in `nodes` are in the same component iff one's
+/// output is consumed by the other.  Returns one `BTreeSet` per component,
+/// ordered by smallest node id.
+fn connected_components(model: &TypedModel, nodes: &BTreeSet<usize>) -> Vec<BTreeSet<usize>> {
+    let mut parent: HashMap<usize, usize> = nodes.iter().map(|&n| (n, n)).collect();
+    fn uf_find(p: &mut HashMap<usize, usize>, x: usize) -> usize {
+        let px = p[&x];
+        if px == x {
+            return x;
+        }
+        let r = uf_find(p, px);
+        p.insert(x, r);
+        r
+    }
+    fn uf_union(p: &mut HashMap<usize, usize>, x: usize, y: usize) {
+        let rx = uf_find(p, x);
+        let ry = uf_find(p, y);
+        if rx != ry {
+            p.insert(rx, ry);
+        }
+    }
+    for &nid in nodes {
+        for cons in model.outlet_successors(OutletId::new(nid, 0)) {
+            if nodes.contains(&cons.node) {
+                uf_union(&mut parent, nid, cons.node);
+            }
+        }
+    }
+    let mut groups: HashMap<usize, BTreeSet<usize>> = HashMap::default();
+    for &nid in nodes {
+        let root = uf_find(&mut parent, nid);
+        groups.entry(root).or_default().insert(nid);
+    }
+    let mut out: Vec<BTreeSet<usize>> = groups.into_values().collect();
+    out.sort_by_key(|g| *g.iter().next().unwrap_or(&usize::MAX));
+    out
+}
+
+/// Phase 1+2+3: detect every section of the graph where wires go multi-T-axis.
+///
+/// The graph may contain several independent quadratic subgraphs (e.g. two
+/// attention layers in parallel); each comes back as its own section.  For
+/// each candidate section we verify that at least one wire carries
+/// `uniform_tdim` or `region_of_interest` (phase 2) and that some wire has a
+/// recognisable mask form (phase 3); sections that fail either check are
+/// dropped from the result.
+fn find_quadratic_sections(
+    model: &TypedModel,
+    stream_sym: &Symbol,
+) -> TractResult<Vec<QuadraticSection>> {
+    let is_multi_t_axis = |fact: &TypedFact| {
+        fact.shape.iter().filter(|d| d.symbols().contains(stream_sym)).count() >= 2
+    };
+
+    // Phase 1a — collect all multi-T-axis nodes.
+    let multi_nodes: BTreeSet<usize> = model
+        .nodes
+        .iter()
+        .filter(|n| n.outputs.len() == 1 && is_multi_t_axis(&n.outputs[0].fact))
+        .map(|n| n.id)
+        .collect();
+    if multi_nodes.is_empty() {
+        return Ok(vec![]);
+    }
+
+    // Phase 1b — connected components over the multi-T-axis subgraph.
+    let groups = connected_components(model, &multi_nodes);
+
+    // For each component, run phase 2 + 3.  Drop components that don't have
+    // a recognisable mask anchoring them.
+    let mut sections: Vec<QuadraticSection> = vec![];
+    for section in groups {
+        let initiators: Vec<usize> = section
+            .iter()
+            .copied()
+            .filter(|&nid| !model.nodes[nid].inputs.iter().any(|i| section.contains(&i.node)))
+            .collect();
+
+        let mut terminators_set: BTreeSet<usize> = BTreeSet::new();
+        for &nid in &section {
+            for cons in model.outlet_successors(OutletId::new(nid, 0)) {
+                if !section.contains(&cons.node) {
+                    terminators_set.insert(cons.node);
+                }
+            }
+        }
+        let terminators: Vec<usize> = terminators_set.into_iter().collect();
+
+        // Phase 2 — at least one annotated wire.
+        let any_annotated = section.iter().any(|&nid| {
+            let fact = &model.nodes[nid].outputs[0].fact;
+            fact.uniform_tdim.is_some() || fact.region_of_interest.is_some()
+        });
+        if !any_annotated {
+            continue;
+        }
+
+        // Phase 3 — recognise a mask form.
+        let mut mask: Option<MaskForm> = None;
+        for &nid in &section {
+            let fact = &model.nodes[nid].outputs[0].fact;
+            let Some(uniform) = &fact.uniform_tdim else {
+                continue;
+            };
+            let streaming_axes: TVec<usize> = fact
+                .shape
+                .iter()
+                .enumerate()
+                .filter(|(_, d)| d.symbols().contains(stream_sym))
+                .map(|(i, _)| i)
+                .collect();
+            if let Some(form) = decode_mask(uniform, &streaming_axes) {
+                mask = Some(form);
+                break;
+            }
+        }
+        let Some(mask) = mask else {
+            continue;
+        };
+
+        // Phase 3b — find the score-matrix axis the terminator contracts.
+        // All terminators of one section must agree (otherwise the section
+        // would have inconsistent structure).
+        let mut contracted_axis: Option<usize> = None;
+        let mut contracted_ok = true;
+        for &t_id in &terminators {
+            let t_node = &model.nodes[t_id];
+            let Ok(ax) = detect_contracted_score_axis(model, t_node, stream_sym) else {
+                contracted_ok = false;
+                break;
+            };
+            if let Some(prev) = contracted_axis
+                && prev != ax
+            {
+                contracted_ok = false;
+                break;
+            }
+            contracted_axis = Some(ax);
+        }
+        let Some(contracted_axis) = (if contracted_ok { contracted_axis } else { None }) else {
+            continue;
+        };
+
+        sections.push(QuadraticSection { section, initiators, terminators, mask, contracted_axis });
+    }
+
+    Ok(sections)
+}
+
+/// Find the score-matrix axis (one of the two streaming axes of the
+/// terminator's input 0) that is contracted away by the terminator op,
+/// **translated into mask frame** (so it's directly comparable to
+/// `mask.axis_a` / `mask.axis_b`).  Score and mask align via right-
+/// aligned broadcasting; mask is always rank-2 in the recogniser scope,
+/// so the translation is `axis - (score_rank - 2)`.
+///
+/// * Reduce<Sum>: it's the reduced axis (must be one of the streaming axes).
+/// * EinSum: it's the streaming axis of input 0 that doesn't track to a
+///   unique output axis (i.e. is summed over).
+fn detect_contracted_score_axis(
+    model: &TypedModel,
+    terminator: &TypedNode,
+    stream_sym: &Symbol,
+) -> TractResult<usize> {
+    let input_fact = model.outlet_fact(terminator.inputs[0])?;
+    let streaming_axes = streaming_positions(input_fact, stream_sym);
+    ensure!(
+        streaming_axes.len() == 2,
+        "Terminator score input has {} streaming axes, expected 2",
+        streaming_axes.len()
+    );
+    let score_rank = input_fact.rank();
+    let rank_diff = score_rank
+        .checked_sub(2)
+        .ok_or_else(|| format_err!("Terminator score input rank {score_rank} < 2; expected ≥ 2"))?;
+    let to_mask_frame = |score_axis: usize| -> TractResult<usize> {
+        score_axis.checked_sub(rank_diff).ok_or_else(|| {
+            format_err!(
+                "Terminator score axis {score_axis} doesn't map to mask frame \
+                 (rank_diff={rank_diff})"
+            )
+        })
+    };
+    if let Some(reduce) = terminator.op_as::<Reduce>() {
+        for &ax in &streaming_axes {
+            if reduce.axes.contains(&ax) {
+                return to_mask_frame(ax);
+            }
+        }
+        bail!("Reduce terminator doesn't reduce a streaming axis of the score input");
+    }
+    if let Some(einsum) = terminator.op_as::<EinSum>() {
+        for &ax in &streaming_axes {
+            let mapped = einsum.axes.track_axis((InOut::In(0), ax), InOut::Out(0))?;
+            if mapped.is_none() {
+                return to_mask_frame(ax);
+            }
+        }
+        bail!("EinSum terminator doesn't contract any streaming axis of input 0");
+    }
+    bail!("Unsupported terminator op for contracted-axis detection: {}", terminator.op.name())
+}
+
+// Pattern is gone — see `rewrite` below, which derives the same per-op
+// data from the QuadraticSection on the fly.
+
+/// Recognise a mask `uniform_tdim` expression.  Returns the closed-enum
+/// `MaskForm` description on success, `None` otherwise.
+///
+/// Today's recogniser handles two AST shapes, both reducing to the same
+/// banded structure `chunk(axis_a) - chunk(axis_b) ∈ [lower, upper]`:
+///
+/// 1. `Eq(coord_a/k, coord_b/k)`              — block-diagonal (lower=upper=0)
+/// 2. `Mul([Ge(upper, D), Ge(D, lower)])`     — banded, `D = coord_a/k - coord_b/k`
+///
+/// Both forms are produced by `core` after `reduce()` (see comparison.rs and
+/// the And-of-Ge propagation in binary.rs).  Other AST shapes are rejected.
+fn decode_mask(expr: &TDim, streaming_axes: &[usize]) -> Option<MaskForm> {
+    if streaming_axes.len() != 2 {
+        return None;
+    }
+    let want: BTreeSet<usize> = streaming_axes.iter().copied().collect();
+
+    // Form 1 — block-diagonal Eq.
+    if let TDim::Eq(lhs, rhs) = expr {
+        let (axis_a, k_a) = decode_coord_div(lhs)?;
+        let (axis_b, k_b) = decode_coord_div(rhs)?;
+        if k_a != k_b {
+            return None;
+        }
+        let got: BTreeSet<usize> = [axis_a, axis_b].into_iter().collect();
+        if want != got {
+            return None;
+        }
+        return Some(MaskForm { chunk_size: k_a as i64, lower: 0, upper: 0, axis_a, axis_b });
+    }
+
+    // Form 2 — banded Mul of two Ge's.
+    if let TDim::Mul(terms) = expr
+        && terms.len() == 2
+    {
+        for (a, b) in [(&terms[0], &terms[1]), (&terms[1], &terms[0])] {
+            if let Some(form) = decode_banded_terms(a, b)
+                && want == [form.axis_a, form.axis_b].into_iter().collect()
+            {
+                return Some(form);
+            }
+        }
+    }
+    None
+}
+
+/// `upper_term = Ge(Val(upper), D)` and `lower_term = Ge(D, Val(lower))`.
+/// `D = coord_a/k - coord_b/k`.
+fn decode_banded_terms(upper_term: &TDim, lower_term: &TDim) -> Option<MaskForm> {
+    let TDim::Ge(u_val, d_upper) = upper_term else {
+        return None;
+    };
+    let TDim::Val(upper) = **u_val else {
+        return None;
+    };
+    let TDim::Ge(d_lower, l_val) = lower_term else {
+        return None;
+    };
+    let TDim::Val(lower) = **l_val else {
+        return None;
+    };
+    if d_lower != d_upper {
+        return None;
+    }
+    let (axis_a, axis_b, k) = decode_diff(d_lower)?;
+    Some(MaskForm { chunk_size: k as i64, lower, upper, axis_a, axis_b })
+}
+
+/// Match `Add([MulInt(-1, Div(Sym(🎯b), k)), Div(Sym(🎯a), k)])` (the canonical
+/// `coord_a/k - coord_b/k` after `reduce()`) and return `(axis_a, axis_b, k)`.
+fn decode_diff(expr: &TDim) -> Option<(usize, usize, u64)> {
+    let TDim::Add(terms) = expr else {
+        return None;
+    };
+    if terms.len() != 2 {
+        return None;
+    }
+    for (pos, neg) in [(&terms[0], &terms[1]), (&terms[1], &terms[0])] {
+        let Some((axis_a, k_a)) = decode_coord_div(pos) else {
+            continue;
+        };
+        let TDim::MulInt(-1, neg_inner) = neg else {
+            continue;
+        };
+        let Some((axis_b, k_b)) = decode_coord_div(neg_inner) else {
+            continue;
+        };
+        if k_a == k_b {
+            return Some((axis_a, axis_b, k_a));
+        }
+    }
+    None
+}
+
+/// Match `Div(Sym(🎯<axis>), k)` and return `(axis, k)`.
+fn decode_coord_div(expr: &TDim) -> Option<(usize, u64)> {
+    let TDim::Div(num, k) = expr else {
+        return None;
+    };
+    let TDim::Sym(sym) = num.as_ref() else {
+        return None;
+    };
+    let axis = tract_core::ops::logic::sym_to_coord_axis(sym)?;
+    Some((axis, *k))
+}
+
+/// Build a TypedModelPatch that chunkifies one quadratic section.
+///
+/// Reads as "iterate initiators, walk the body, iterate terminators" — each
+/// role iterates op-agnostically over `sec.initiators` / section nodes /
+/// `sec.terminators` and dispatches to per-op-type sub-functions that wire
+/// the chunked equivalent into the patch.  Unhandled op-types bubble up as
+/// `Err` from the per-role dispatcher: a recognised section either gets
+/// fully rewritten or fails loudly (no partial rewrites silently left for
+/// downstream pulsification to trip over).
+fn build_section_patch(
+    model: &TypedModel,
+    sec: &QuadraticSection,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<TypedModelPatch> {
+    ensure!(sec.mask.lower <= 0);
+    ensure!(sec.mask.upper >= 0);
+    let mut patch = TypedModelPatch::default();
+    // Map from original outlet to its chunked equivalent inside the patch.
+    let mut chunked: HashMap<OutletId, OutletId> = HashMap::default();
+    // Nodes wired as part of a fused initiator (e.g. einsum → DiagGather
+    // for the Transformer-XL relative-position pattern).  These should be
+    // skipped by the regular initiator and body loops since their
+    // chunked equivalent is already in `chunked`.
+    let mut already_wired: BTreeSet<usize> = BTreeSet::new();
+    // Boundary outlets to redirect via `shunt_outside` after wiring the
+    // merge reshape: (original outlet, chunked-form outlet inside patch).
+    let mut shunts: Vec<(OutletId, OutletId)> = vec![];
+
+    // Fused EinSum + DiagGather initiator: when an EinSum's only
+    // multi-T-axis section consumer is a DiagGather, route the pair
+    // through DiagGather's chunker and mark both as already-wired.
+    for &nid in &sec.initiators {
+        let einsum_node = &model.nodes[nid];
+        if !einsum_node.op_is::<EinSum>() {
+            continue;
+        }
+        let Some(dg_id) = section_only_diag_gather_consumer(model, einsum_node, sec) else {
+            continue;
+        };
+        let dg_node = &model.nodes[dg_id];
+        let dg_in_fact = model.outlet_fact(dg_node.inputs[0])?;
+        let dg_in_streaming = streaming_positions(dg_in_fact, chunk_sym);
+        if dg_in_streaming.len() != 1 {
+            bail!(
+                "EinSum+DiagGather initiator: DG input must have a single streaming axis, got {dg_in_streaming:?}"
+            );
+        }
+        let dg_op = dg_node.op_as::<DiagGather>().unwrap();
+        let dg_chunked = wire_initiator_diag_gather(
+            &mut patch,
+            model,
+            dg_node,
+            dg_op,
+            &sec.mask,
+            sec.contracted_axis,
+            chunk_sym,
+            k,
+        )?;
+        chunked.insert(OutletId::new(nid, 0), dg_chunked);
+        chunked.insert(OutletId::new(dg_id, 0), dg_chunked);
+        already_wired.insert(nid);
+        already_wired.insert(dg_id);
+    }
+
+    // ── 1. Initiators ────────────────────────────────────────────────────
+    // Two flavours:
+    //   - Data initiators (e.g. score-matrix EinSum): tap, split, optional
+    //     WindowOnAxis on the contracted side, wire chunked op.
+    //   - Mask-construction initiators (multi-T-axis with uniform_tdim,
+    //     typically Eq/Sub of two single-T-axis chunk-index wires): tap
+    //     each input, split its T-axis, move the chunk axis to position
+    //     0, optionally WindowOnAxis on the contracted side with a
+    //     **sentinel pad value** so the band predicate on out-of-stream
+    //     boundary slots evaluates to false.  Wire the binop with the
+    //     chunked inputs.  The result lives in `chunked` like any other
+    //     section wire.
+    for &nid in &sec.initiators {
+        if already_wired.contains(&nid) {
+            continue;
+        }
+        let node = &model.nodes[nid];
+        let out = if node.outputs[0].fact.uniform_tdim.is_some() {
+            wire_uniform_tdim_initiator(
+                &mut patch,
+                model,
+                node,
+                &sec.mask,
+                sec.contracted_axis,
+                chunk_sym,
+                k,
+            )?
+        } else {
+            wire_initiator(&mut patch, model, node, &sec.mask, sec.contracted_axis, chunk_sym, k)?
+        };
+        chunked.insert(OutletId::new(nid, 0), out);
+    }
+    ensure!(!chunked.is_empty());
+
+    // ── 2. Body ──────────────────────────────────────────────────────────
+    // Walk the section in topological order, skipping initiators (already
+    // wired) and terminators (out-of-section by definition).  Multi-T-axis
+    // uniform_tdim body ops (e.g. Ge/Le/And/Cast on the chunked mask
+    // chain) are processed like any other body op now: their inputs come
+    // from `chunked` (the upstream chunked mask outlet), their outputs
+    // feed back into `chunked` for downstream consumers.
+    for &nid in &model.eval_order()? {
+        if !sec.section.contains(&nid) {
+            continue;
+        }
+        if sec.initiators.contains(&nid) {
+            continue;
+        }
+        if already_wired.contains(&nid) {
+            continue;
+        }
+        let node = &model.nodes[nid];
+        let out = wire_body(
+            &mut patch,
+            model,
+            node,
+            &sec.mask,
+            sec.contracted_axis,
+            &chunked,
+            chunk_sym,
+            k,
+        )?;
+        chunked.insert(OutletId::new(nid, 0), out);
+    }
+
+    // ── 3. Terminators ───────────────────────────────────────────────────
+    for &nid in &sec.terminators {
+        let node = &model.nodes[nid];
+        let (boundary, chunked_form) = wire_terminator(
+            &mut patch,
+            model,
+            node,
+            &chunked,
+            &sec.mask,
+            sec.contracted_axis,
+            chunk_sym,
+            k,
+        )?;
+        shunts.push((boundary, chunked_form));
+    }
+
+    // ── 4. Boundary merges + shunts ──────────────────────────────────────
+    for (boundary, chunked_form) in shunts {
+        let merged = wire_merge_reshape(
+            &mut patch,
+            &model.nodes[boundary.node].name,
+            chunked_form,
+            chunk_sym,
+            k,
+        )?;
+        let merged = wire_affine_tail_pad(&mut patch, model, boundary, merged, chunk_sym, k)?;
+        patch.shunt_outside(model, boundary, merged)?;
+    }
+
+    Ok(patch)
+}
+
+/// Pad the chunked outlet with `c` constant-zero frames to match a
+/// boundary outlet with streaming dim `c + k·S` (vs the merged `k·S`).
+/// Restores the tail `wire_chunk_split` trimmed pre-section.
+fn wire_affine_tail_pad(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    boundary: OutletId,
+    merged: OutletId,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    let boundary_fact = model.outlet_fact(boundary)?;
+    let merged_fact = patch.outlet_fact(merged)?.clone();
+    if boundary_fact.shape.len() != merged_fact.shape.len() {
+        return Ok(merged);
+    }
+    let mut pad_axis: Option<(usize, i64)> = None;
+    for (axis, (b, m)) in boundary_fact.shape.iter().zip(merged_fact.shape.iter()).enumerate() {
+        if b == m {
+            continue;
+        }
+        let b_off = affine_chunk_offset(b, chunk_sym, k);
+        let m_off = affine_chunk_offset(m, chunk_sym, k);
+        match (b_off, m_off) {
+            (Some(bc), Some(0)) if bc > 0 => {
+                if pad_axis.is_some() {
+                    return Ok(merged);
+                }
+                pad_axis = Some((axis, bc));
+            }
+            _ => return Ok(merged),
+        }
+    }
+    let Some((axis, c)) = pad_axis else {
+        return Ok(merged);
+    };
+    let mut pads = vec![(0usize, 0usize); merged_fact.shape.len()];
+    pads[axis] = (0, c as usize);
+    let pad_value = Tensor::zero_scalar_dt(merged_fact.datum_type)?.into_arc_tensor();
+    let pad_op = tract_core::ops::array::Pad {
+        pads,
+        mode: tract_core::ops::array::PadMode::Constant(pad_value),
+    };
+    let name = format!("{}.affine_tail_pad", &model.nodes[boundary.node].name);
+    Ok(patch.wire_node(name, pad_op, &[merged])?[0])
+}
+
+// ── Per-role dispatchers ────────────────────────────────────────────────
+//
+// Each `wire_*` helper takes a section node + the patch-in-progress and
+// dispatches to a per-op-type implementation.  Unhandled op-types `bail!`
+// with a clear "Unsupported …" message — Blockify either fully rewrites
+// a detected section or errors loudly, never silently leaves a half-
+// rewritten graph for downstream pulsification to choke on.
+
+fn wire_initiator(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    if let Some(op) = node.op_as::<EinSum>() {
+        return wire_initiator_einsum(patch, model, node, op, mask, contracted_axis, chunk_sym, k);
+    }
+    if node.op_as::<tract_core::ops::array::MultiBroadcastTo>().is_some() {
+        let in_fact = model.outlet_fact(node.inputs[0])?;
+        if streaming_positions(in_fact, chunk_sym).is_empty() {
+            return wire_initiator_multibroadcastto(patch, model, node, chunk_sym);
+        } else {
+            return wire_initiator_multibroadcastto_streaming(
+                patch,
+                model,
+                node,
+                mask,
+                contracted_axis,
+                chunk_sym,
+                k,
+            );
+        }
+    }
+    if let Some(op) = node.op_as::<DiagGather>() {
+        return wire_initiator_diag_gather(
+            patch,
+            model,
+            node,
+            op,
+            mask,
+            contracted_axis,
+            chunk_sym,
+            k,
+        );
+    }
+    if let Some(op) = node.op_as::<TypedBinOp>() {
+        return wire_initiator_typed_binop(
+            patch,
+            model,
+            node,
+            op,
+            mask,
+            contracted_axis,
+            chunk_sym,
+            k,
+        );
+    }
+    bail!("Unsupported initiator {node}")
+}
+
+/// Initiator for `DiagGather` — the folded skew trick at the section
+/// boundary.  Input is single-T-axis (the relative-position pre-skew
+/// scores `[..., S, 2T_max-1]`), output is multi-T-axis (`[..., S, S]`,
+/// the absolute-position scores).  In the chunked frame both wires
+/// become single-T-axis with constant inner shape:
+///
+///   input  [..., chunks, k, 2T_max-1]
+///   output [..., chunks, k, W]    where W = (mask.upper-mask.lower+1)·k
+///
+/// The chunked DiagGather has fixed `offset = k-1` (= P-1, the relative-
+/// position-zero entry within the per-pulse window) and `out_len = W`.
+fn wire_initiator_diag_gather(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    op: &DiagGather,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    let out_streaming = streaming_positions(&node.outputs[0].fact, chunk_sym);
+    ensure!(
+        out_streaming.len() == 2 && out_streaming[1] == out_streaming[0] + 1,
+        "Initiator DiagGather output must have two contiguous streaming axes, \
+         got {out_streaming:?}"
+    );
+    ensure!(node.inputs.len() == 1, "DiagGather has 1 input, got {}", node.inputs.len());
+
+    let in_fact = model.outlet_fact(node.inputs[0])?;
+    let in_streaming = streaming_positions(in_fact, chunk_sym);
+    ensure!(
+        in_streaming.len() == 1,
+        "Initiator DiagGather input must have exactly one streaming axis, got {in_streaming:?}"
+    );
+    let stream_axis = in_streaming[0];
+
+    let tapped = patch.tap_model(model, node.inputs[0])?;
+    let in_fact_patch = patch.outlet_fact(tapped)?.clone();
+    let chunked = wire_chunk_split(patch, &node.name, tapped, stream_axis, chunk_sym, k)?;
+
+    // The R (relative-position) axis of pos_raw is the last axis: a constant
+    // width carrying the rel-pos table.  The DiagGather op's `offset` field
+    // points to the column where rel-pos = 0 lives in the R-axis numbering.
+    // Per chunk c, the W = (L+1)·k key-window starts at chunk
+    // `c + window_start` (= `c − L` for lookback, `c` for lookahead), so the
+    // (δi, δj)-th in-window key has rel-pos `δj − δi + window_start·k`.
+    // Solving `chunked_offset + δj − δi = (op.offset) + (δj − δi + window_start·k)`
+    // gives `chunked_offset = op.offset + window_start·k`.
+    let r_axis = in_fact_patch.shape.last().context("DiagGather input has no last axis")?;
+    let r = r_axis.to_i64().context("DiagGather R axis must be a constant integer")?;
+    // Prefer `op.offset` if it simplifies to a concrete column index — this is
+    // the path the streaming-rel-pos rewrite (subsequent commit) uses to plant
+    // a centre that doesn't match the canonical `(R-1)/2`.  Fall back to the
+    // T-XL convention `centre = (R-1)/2` for models where the op was built
+    // with a row-count-based symbolic offset (e.g. `T - 1`) that hasn't
+    // simplified post-substitution.
+    let centre = op.offset.to_i64().ok().unwrap_or((r - 1) / 2);
+    let l = mask.upper - mask.lower;
+    let w = (l + 1) * k;
+    let window_start = window_start_for(mask, contracted_axis);
+    let chunked_offset = centre + window_start * k;
+    let chunked_op = DiagGather { offset: chunked_offset.to_dim(), out_len: w.to_dim() };
+    Ok(patch.wire_node(format!("{}.blockified", node.name), chunked_op, &[chunked])?[0])
+}
+
+/// Generic initiator for a `TypedBinOp` lifting two single-T-axis inputs
+/// into a multi-T-axis score-shape output via implicit broadcasting (post-
+/// declutter spelling of the pad-mask outer-AND pattern: `Add(at=0)(pad)` AND
+/// `Add(at=1)(pad)` → `[T, T]`).
+///
+/// Per input, the streaming axis tracks via the op's axes_mapping to one of
+/// the section's score axes.  If that score axis lands on the contracted (K)
+/// side of the mask, the input is windowed by `WindowOnAxis(W) + flatten`;
+/// otherwise it's just chunk-split.  Each input's chunks axis is then moved
+/// to the section's `chunks_target_axis` so the chunked op aligns them.
+///
+/// `WindowOnAxis` pads boundary slots with the op's `absorbing_element` (0
+/// for And/Mul/BitAnd, 1 for Or), so the chunked op produces "definitely
+/// excluded" at out-of-stream positions.  Bails if the op has no absorbing
+/// element (e.g. Add, Xor) — we can't safely window-pad those.
+///
+/// Non-streaming inputs are tapped and rank-bumped to the chunked-frame rank
+/// (= score_rank + 1).  The chunked op's own broadcasting fills in the
+/// streaming dims.
+fn wire_initiator_typed_binop(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    op: &TypedBinOp,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    let out_streaming_axes = streaming_positions(&node.outputs[0].fact, chunk_sym);
+    ensure!(
+        out_streaming_axes.len() == 2 && out_streaming_axes[1] == out_streaming_axes[0] + 1,
+        "Initiator TypedBinOp output must have two contiguous streaming axes"
+    );
+    let chunks_target_axis = out_streaming_axes[0];
+    let score_rank = node.outputs[0].fact.rank();
+    let rank_diff = score_rank.checked_sub(2).ok_or_else(|| {
+        format_err!("Score rank {score_rank} < 2; cannot translate to mask frame")
+    })?;
+
+    let input_facts: TVec<&TypedFact> =
+        node.inputs.iter().map(|inp| model.outlet_fact(*inp)).collect::<TractResult<_>>()?;
+    let output_facts: TVec<&TypedFact> = node.outputs.iter().map(|o| &o.fact).collect();
+    let mapping = op.axes_mapping(&input_facts, &output_facts)?;
+
+    let mut chunked_inputs: TVec<OutletId> = tvec!();
+    for (ix, &input) in node.inputs.iter().enumerate() {
+        let in_fact = model.outlet_fact(input)?;
+        let streaming = streaming_positions(in_fact, chunk_sym);
+        ensure!(
+            streaming.len() <= 1,
+            "Initiator TypedBinOp input {ix} has {} streaming axes, expected 0 or 1",
+            streaming.len()
+        );
+
+        let tapped = patch.tap_model(model, input)?;
+        let wire = if streaming.is_empty() {
+            let target_rank = score_rank + 1;
+            bump_rank_to(patch, &node.name, ix, tapped, target_rank)?
+        } else {
+            let stream_axis = streaming[0];
+            let split = wire_chunk_split(
+                patch,
+                &format!("{}.{ix}", node.name),
+                tapped,
+                stream_axis,
+                chunk_sym,
+                k,
+            )?;
+
+            let tracked_in_score = mapping
+                .track_axis((InOut::In(ix), stream_axis), InOut::Out(0))?
+                .ok_or_else(|| {
+                    format_err!(
+                        "TypedBinOp stream axis on input {ix} doesn't track to a unique output axis"
+                    )
+                })?;
+            let tracked_in_mask = tracked_in_score.checked_sub(rank_diff).ok_or_else(|| {
+                format_err!(
+                    "Tracked score axis {tracked_in_score} doesn't map to mask frame \
+                     (rank_diff={rank_diff})"
+                )
+            })?;
+
+            let needs_window = !mask.is_block_diag() && tracked_in_mask == contracted_axis;
+            let after_window = if needs_window {
+                let window: usize = (mask.upper - mask.lower + 1) as usize;
+                let start = window_start_for(mask, contracted_axis);
+                let dt = patch.outlet_fact(split)?.datum_type;
+                let absorbing = op.0.absorbing_element().ok_or_else(|| {
+                    format_err!(
+                        "TypedBinOp '{}' has no absorbing_element; cannot safely window-pad \
+                         a section-initiator input",
+                        op.0.name()
+                    )
+                })?;
+                let pad_value = tensor0(absorbing).cast_to_dt(dt)?.into_owned().into_arc_tensor();
+                let windowed = patch.wire_node(
+                    format!("{}.{ix}.window", node.name),
+                    tract_pulse_opl::ops::WindowOnAxis {
+                        axis: stream_axis,
+                        window,
+                        start,
+                        pad_value,
+                    },
+                    &[split],
+                )?[0];
+                let from = tvec!(window.to_dim(), k.to_dim());
+                let to = tvec!(((window as i64) * k).to_dim());
+                patch.wire_node(
+                    format!("{}.{ix}.window_flat", node.name),
+                    AxisOp::Reshape(stream_axis + 1, from, to),
+                    &[windowed],
+                )?[0]
+            } else {
+                split
+            };
+
+            if stream_axis != chunks_target_axis {
+                patch.wire_node(
+                    format!("{}.{ix}.move_chunks", node.name),
+                    AxisOp::Move(stream_axis, chunks_target_axis),
+                    &[after_window],
+                )?[0]
+            } else {
+                after_window
+            }
+        };
+        chunked_inputs.push(wire);
+    }
+
+    Ok(patch.wire_node(format!("{}.blockified", node.name), op.clone(), &chunked_inputs)?[0])
+}
+
+/// Initiator for `MultiBroadcastTo` — the `select(mask, scores, -inf)`
+/// false-branch pattern, where declutter folds `scores * 0.0 + -inf`
+/// down to a `MultiBroadcastTo` of a small const (typically scalar) up
+/// to the score's `[T, T]` shape.  The op's input is non-streaming
+/// (otherwise `MultiBroadcastTo` would be in the middle of the section,
+/// not its boundary), so we just tap and rank-bump it to the chunked-
+/// frame rank `score_rank + 1`.  Subsequent body-op broadcasting fills
+/// in the chunked dimensions with the (constant) input value.
+fn wire_initiator_multibroadcastto(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    chunk_sym: &Symbol,
+) -> TractResult<OutletId> {
+    ensure!(node.inputs.len() == 1, "MultiBroadcastTo expects 1 input, got {}", node.inputs.len());
+    let input = node.inputs[0];
+    let in_fact = model.outlet_fact(input)?;
+    ensure!(
+        streaming_positions(in_fact, chunk_sym).is_empty(),
+        "MultiBroadcastTo initiator with streaming input not supported (input has \
+         {} streaming axes)",
+        streaming_positions(in_fact, chunk_sym).len(),
+    );
+    let target_rank = node.outputs[0].fact.rank() + 1;
+    let mut wire = patch.tap_model(model, input)?;
+    let mut step = 0;
+    while patch.outlet_fact(wire)?.rank() < target_rank {
+        wire =
+            patch.wire_node(format!("{}.bump_rank.{step}", node.name), AxisOp::Add(0), &[wire])?[0];
+        step += 1;
+    }
+    Ok(wire)
+}
+
+/// Initiator for `MultiBroadcastTo` whose input is streaming:
+/// a `[..., 1, T]` per-key-position mask broadcast to `[..., T, T]`.
+/// The input's streaming axis must track (after broadcast) to the
+/// section's `contracted_axis`.  Chunked form per chunk: split T into
+/// `[S, k]`, window L+1 chunks, flatten `[L+1, k] → W`, move the chunk
+/// axis to first-streaming position, broadcast the size-1 axis up to k.
+fn wire_initiator_multibroadcastto_streaming(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    ensure!(node.inputs.len() == 1, "MultiBroadcastTo expects 1 input, got {}", node.inputs.len());
+    let input = node.inputs[0];
+    let in_fact = model.outlet_fact(input)?;
+    let in_streaming = streaming_positions(in_fact, chunk_sym);
+    ensure!(
+        in_streaming.len() == 1,
+        "MultiBroadcastTo streaming initiator: input must have exactly one streaming axis, \
+         got {in_streaming:?}"
+    );
+    let in_stream_axis = in_streaming[0];
+
+    let out_streaming = streaming_positions(&node.outputs[0].fact, chunk_sym);
+    ensure!(
+        out_streaming.len() == 2 && out_streaming[1] == out_streaming[0] + 1,
+        "Initiator MultiBroadcastTo output must have two contiguous streaming axes, \
+         got {out_streaming:?}"
+    );
+
+    // The input axis that gets broadcast from 1 to a streaming dim.
+    let bcast_axis = if out_streaming[0] == in_stream_axis {
+        out_streaming[1]
+    } else if out_streaming[1] == in_stream_axis {
+        out_streaming[0]
+    } else {
+        bail!(
+            "MultiBroadcastTo streaming initiator: input stream axis {in_stream_axis} not in \
+             output streaming axes {out_streaming:?}"
+        );
+    };
+    ensure!(
+        in_fact.shape[bcast_axis].is_one(),
+        "MultiBroadcastTo streaming initiator: broadcast-from axis {bcast_axis} must be 1, \
+         got {}",
+        in_fact.shape[bcast_axis]
+    );
+
+    // Translate the score-frame stream axis to mask frame and check it's the
+    // contracted side.  `score_rank - 2` is the leading-batch-dims offset.
+    let score_rank = node.outputs[0].fact.rank();
+    let rank_diff = score_rank.checked_sub(2).ok_or_else(|| {
+        format_err!("Score rank {score_rank} < 2; cannot translate to mask frame")
+    })?;
+    let tracked_in_mask = in_stream_axis.checked_sub(rank_diff).ok_or_else(|| {
+        format_err!(
+            "Tracked score axis {in_stream_axis} doesn't map to mask frame (rank_diff={rank_diff})"
+        )
+    })?;
+    ensure!(
+        tracked_in_mask == contracted_axis,
+        "MultiBroadcastTo streaming initiator: input stream axis must track to the \
+         contracted axis ({contracted_axis}), got {tracked_in_mask}"
+    );
+
+    let tapped = patch.tap_model(model, input)?;
+    let split = wire_chunk_split(patch, &node.name, tapped, in_stream_axis, chunk_sym, k)?;
+    let bcast_axis_post_split =
+        if bcast_axis > in_stream_axis { bcast_axis + 1 } else { bcast_axis };
+
+    let window: usize = (mask.upper - mask.lower + 1) as usize;
+    let start = window_start_for(mask, contracted_axis);
+    let dt = patch.outlet_fact(split)?.datum_type;
+    let pad_value = Tensor::zero_scalar_dt(dt)?.into_arc_tensor();
+    let windowed = patch.wire_node(
+        format!("{}.window", node.name),
+        tract_pulse_opl::ops::WindowOnAxis { axis: in_stream_axis, window, start, pad_value },
+        &[split],
+    )?[0];
+    let bcast_axis_post_window = if bcast_axis_post_split > in_stream_axis {
+        bcast_axis_post_split + 1
+    } else {
+        bcast_axis_post_split
+    };
+
+    // Flatten [L+1, k] back to a single W = (L+1)·k axis.
+    let from = tvec!(window.to_dim(), k.to_dim());
+    let to = tvec!(((window as i64) * k).to_dim());
+    let flat = patch.wire_node(
+        format!("{}.window_flat", node.name),
+        AxisOp::Reshape(in_stream_axis + 1, from, to),
+        &[windowed],
+    )?[0];
+    let bcast_axis_post_flat = if bcast_axis_post_window > in_stream_axis + 1 {
+        bcast_axis_post_window - 1
+    } else {
+        bcast_axis_post_window
+    };
+
+    // Move chunk axis to the original first-streaming output position
+    // (convention shared with `chunkify_einsum`).
+    let chunks_target_axis = out_streaming[0];
+    let mut chunks_axis = in_stream_axis;
+    let mut bcast_axis_now = bcast_axis_post_flat;
+    let mut wire = flat;
+    if chunks_axis != chunks_target_axis {
+        wire = patch.wire_node(
+            format!("{}.move_chunks", node.name),
+            AxisOp::Move(chunks_axis, chunks_target_axis),
+            &[wire],
+        )?[0];
+        // Track the broadcast-from-1 axis through the Move: dims STRICTLY
+        // between source and target shift by one slot — [target, source)
+        // leftward, (source, target] rightward.
+        if chunks_target_axis < chunks_axis {
+            if bcast_axis_now >= chunks_target_axis && bcast_axis_now < chunks_axis {
+                bcast_axis_now += 1;
+            }
+        } else if bcast_axis_now > chunks_axis && bcast_axis_now <= chunks_target_axis {
+            bcast_axis_now = bcast_axis_now.saturating_sub(1);
+        }
+        chunks_axis = chunks_target_axis;
+        let _ = chunks_axis;
+    }
+
+    let mut target_shape: TVec<TDim> = patch.outlet_fact(wire)?.shape.to_tvec();
+    target_shape[bcast_axis_now] = k.to_dim();
+    let bcast = tract_core::ops::array::MultiBroadcastTo { shape: target_shape.into() };
+    Ok(patch.wire_node(format!("{}.blockified", node.name), bcast, &[wire])?[0])
+}
+
+/// Initiator for a multi-T-axis `uniform_tdim` node — typically the
+/// `Eq`/`Sub` head of the mask-construction chain whose two inputs are
+/// single-T-axis chunk-index wires (`chunk_row` at axis 0, `chunk_col`
+/// at axis 1).  Tap each input, split its T-axis into `[..., S, k, ...]`,
+/// move the chunk axis to position 0 to align with the rest of the
+/// section, and (if its source T-axis equals the section's contracted
+/// axis) wrap with `WindowOnAxis` using a **sentinel pad value** so the
+/// downstream band predicate evaluates to false on out-of-stream
+/// boundary slots.  Then wire the same op (Eq/Sub/…) with the chunked
+/// inputs.
+fn wire_uniform_tdim_initiator(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    let mut chunked_inputs: TVec<OutletId> = tvec!();
+    for (ix, &input) in node.inputs.iter().enumerate() {
+        let chunked = chunkify_uniform_tdim_input(
+            patch,
+            model,
+            input,
+            &format!("{}.in{ix}", node.name),
+            mask,
+            contracted_axis,
+            chunk_sym,
+            k,
+        )?;
+        chunked_inputs.push(chunked);
+    }
+    let mut out =
+        patch.wire_node(format!("{}.blockified", node.name), node.op.clone(), &chunked_inputs)?[0];
+    // Match the source's output dtype: `chunkify_uniform_tdim_input` may
+    // have cast TDim → I64 to satisfy `PulsePad`'s Copy-based fill, so
+    // body ops downstream that tap external constants (e.g. the `0` in
+    // `ge(diff, 0)`) need the chunked outlet to carry the original dtype.
+    let source_dt = node.outputs[0].fact.datum_type;
+    let cur_dt = patch.outlet_fact(out)?.datum_type;
+    if cur_dt != source_dt {
+        out = patch.wire_node(
+            format!("{}.blockified.cast_back", node.name),
+            tract_core::ops::cast::cast(source_dt),
+            &[out],
+        )?[0];
+    }
+    Ok(out)
+}
+
+/// Tap a single-T-axis `uniform_tdim` wire (e.g. `chunk_row [T, 1]` or
+/// `chunk_col [1, T]`), split its T-axis at `k`, move the chunk axis to
+/// position 0, and — for the contracted side — `WindowOnAxis` with a
+/// sentinel pad so out-of-stream boundary slots produce out-of-band
+/// values for the downstream predicate.  Returns the chunked outlet.
+fn chunkify_uniform_tdim_input(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    input: OutletId,
+    name_prefix: &str,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    let in_fact = model.outlet_fact(input)?;
+    let positions = streaming_positions(in_fact, chunk_sym);
+    ensure!(
+        positions.len() == 1,
+        "uniform_tdim initiator input must have exactly one streaming axis (got {})",
+        positions.len(),
+    );
+    let stream_axis = positions[0];
+
+    let tapped = patch.tap_model(model, input)?;
+
+    // Cast TDim → I64 up-front: PulsePad (used by WindowOnAxis pulsifier
+    // for the contracted side) fills with `dispatch_copy_by_size!`, which
+    // panics on non-Copy datum types like TDim.  Body ops downstream are
+    // Sub/Ge/Le/And — they don't care whether their numeric inputs are
+    // TDim or I64 (the final mask comes out as Bool either way).
+    //
+    // REVISIT: add a TDim arm to `pulse-opl/src/pad.rs` (clone-fill instead
+    // of `dispatch_copy_by_size`) and drop this round-trip.
+    let mut wire = tapped;
+    if patch.outlet_fact(wire)?.datum_type == TDim::datum_type() {
+        wire = patch.wire_node(
+            format!("{name_prefix}.cast_i64"),
+            tract_core::ops::cast::cast(i64::datum_type()),
+            &[wire],
+        )?[0];
+    }
+    let dt = patch.outlet_fact(wire)?.datum_type;
+
+    // Split the T-axis at `k`.  Output rank = input rank + 1, with the
+    // chunk axis at `stream_axis` and the within-block axis at
+    // `stream_axis + 1`.
+    wire = wire_chunk_split(patch, name_prefix, wire, stream_axis, chunk_sym, k)?;
+
+    // Move chunk axis to position 0 if it isn't already, so the section
+    // frame uniformly carries the chunk axis at 0.
+    if stream_axis != 0 {
+        wire = patch.wire_node(
+            format!("{name_prefix}.move_chunk"),
+            AxisOp::Move(stream_axis, 0),
+            &[wire],
+        )?[0];
+    }
+
+    // If this input's source T-axis is the contracted side, window the
+    // chunk axis (now at position 0) and flatten the W slot back into the
+    // within-block axis so downstream consumers see W·k along that axis.
+    let needs_window = !mask.is_block_diag() && stream_axis == contracted_axis;
+    if needs_window {
+        let window_size: usize = (mask.upper - mask.lower + 1) as usize;
+        let start = window_start_for(mask, contracted_axis);
+        // Sentinel pad: a value far outside any sane chunk-index range,
+        // so the downstream band predicate `chunk_a − chunk_b ∈ [lower,
+        // upper]` is false on out-of-stream boundary slots.
+        let sentinel = sentinel_pad_value(dt)?.into_arc_tensor();
+        wire = patch.wire_node(
+            format!("{name_prefix}.window"),
+            tract_pulse_opl::ops::WindowOnAxis {
+                axis: 0,
+                window: window_size,
+                start,
+                pad_value: sentinel,
+            },
+            &[wire],
+        )?[0];
+
+        // Post-window shape: chunk at 0, W at 1, then the original axes
+        // (the within-block axis is at `stream_axis + 1` post-window:
+        // chunk was at 0 pre-window, W gets inserted at 1, so axes shift
+        // right by 1).  Flatten W (slice index 0) and within-block (slice
+        // index `stream_axis`) into a single (W·k) axis.
+        let post_window = patch.outlet_fact(wire)?.clone();
+        let rank_after = post_window.rank();
+        let from: TVec<TDim> = (1..rank_after).map(|i| post_window.shape[i].clone()).collect();
+        let within_slice_idx = stream_axis;
+        let mut to: TVec<TDim> = tvec!();
+        for (i, dim) in from.iter().enumerate() {
+            if i == 0 {
+                continue;
+            }
+            if i == within_slice_idx + 1 {
+                let merged = from[0].clone() * dim.clone();
+                to.push(merged);
+            } else {
+                to.push(dim.clone());
+            }
+        }
+        wire = patch.wire_node(
+            format!("{name_prefix}.flatten_window"),
+            AxisOp::Reshape(1, from, to),
+            &[wire],
+        )?[0];
+    }
+
+    Ok(wire)
+}
+
+/// Pad value for windowing a chunk-index wire: any value outside any
+/// reasonable `[lower, upper]` band so the downstream `Ge`/`Le`
+/// comparisons on `chunk_a − chunk_b` evaluate to false at boundary
+/// slots.  Bounded by `i32::MAX / 4` because tract's tensor cast routes
+/// `i64 → TDim` through `i32` (see `data/src/tensor.rs:1250`), which
+/// would truncate a larger sentinel.  Half a billion is comfortably
+/// above any plausible chunk count yet safe under that cast.
+///
+/// REVISIT: route `i64 → TDim` directly in `data/src/tensor.rs:1250`
+/// (no `i32` middle step) and lift the cap to `i64::MAX / 4`.
+fn sentinel_pad_value(dt: DatumType) -> TractResult<Tensor> {
+    if dt == bool::datum_type() {
+        bail!("uniform_tdim wire of bool dtype not expected as initiator-side input");
+    }
+    Ok(tensor0((i32::MAX / 4) as i64).cast_to_dt(dt)?.into_owned())
+}
+
+/// Replay a body op in the chunked frame.
+///
+/// * Inputs from `chunked` (= chunked wires produced by the initiator path
+///   for both the data side and the mask-construction chain): pass through.
+/// * Other external inputs: tapped, with `AddAxis(0)` bumping any rank
+///   deficit so rank-strict consumers (TypedBinOp, …) accept them.
+///
+/// `axes_mapping::track_axis` asserts each chunked input's chunk axis
+/// reaches a unique output axis — bails with a precise error if the op
+/// would disconnect the chunk axis (e.g. softmax over it).
+fn wire_body(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    _mask: &MaskForm,
+    _contracted_axis: usize,
+    chunked: &HashMap<OutletId, OutletId>,
+    chunk_sym: &Symbol,
+    _k: i64,
+) -> TractResult<OutletId> {
+    // Pass 1: collect chunked inputs and discover the rank we'll work in.
+    // A chunked input may have 1 streaming axis (the usual case — this
+    // input contributes a chunk axis we need to track through `op`'s
+    // axes_mapping), or 0 streaming axes (rank-bumped broadcast constant
+    // produced by `wire_initiator_multibroadcastto` — its value is
+    // independent of the chunk index, so no axis-mapping check needed).
+    let n = node.inputs.len();
+    let mut new_inputs: TVec<Option<OutletId>> = tvec![None; n];
+    let mut chunk_input_axes: Vec<(usize, usize)> = vec![];
+    let mut chunked_rank: Option<usize> = None;
+    for (slot, &input) in node.inputs.iter().enumerate() {
+        if let Some(&c) = chunked.get(&input) {
+            let cf = patch.outlet_fact(c)?;
+            let positions = streaming_positions(cf, chunk_sym);
+            ensure!(
+                positions.len() <= 1,
+                "Body op {node}: chunked input slot {slot} has {} streaming axes, expected ≤ 1",
+                positions.len()
+            );
+            if let Some(&ax) = positions.first() {
+                chunk_input_axes.push((slot, ax));
+            }
+            chunked_rank = Some(cf.rank().max(chunked_rank.unwrap_or(0)));
+            new_inputs[slot] = Some(c);
+        }
+    }
+    let chunked_rank = chunked_rank.ok_or_else(|| {
+        format_err!("Body op {node} has no chunked input — at least one is required")
+    })?;
+
+    // Pass 2: external taps, rank-bumped to match the chunked frame.
+    // (uniform_tdim mask wires are now in `chunked` already, courtesy of
+    // the uniform_tdim initiator + faithful body chunking — no per-op
+    // mask-substitute logic.)
+    for (slot, &input) in node.inputs.iter().enumerate() {
+        if new_inputs[slot].is_some() {
+            continue;
+        }
+        let tapped = patch.tap_model(model, input)?;
+        let bumped = bump_rank_to(patch, &node.name, slot, tapped, chunked_rank)?;
+        new_inputs[slot] = Some(bumped);
+    }
+    let new_inputs: TVec<OutletId> = new_inputs.into_iter().map(|o| o.unwrap()).collect();
+
+    // Chunkability: for every chunked input, its chunk axis must track
+    // through to a unique output axis position.
+    let input_facts: TVec<TypedFact> =
+        new_inputs.iter().map(|o| patch.outlet_fact(*o).cloned()).collect::<TractResult<_>>()?;
+    let in_refs: TVec<&TypedFact> = input_facts.iter().collect();
+    let output_facts = node.op.output_facts(&in_refs)?;
+    let out_refs: TVec<&TypedFact> = output_facts.iter().collect();
+    let am = node.op.axes_mapping(&in_refs, &out_refs)?;
+    for &(slot, axis) in &chunk_input_axes {
+        let tracked = am.track_axis((InOut::In(slot), axis), InOut::Out(0))?;
+        ensure!(
+            tracked.is_some(),
+            "Body op {node} doesn't preserve the chunk axis (input slot {slot}, axis {axis}) \
+             through to the output — its axes_mapping disconnects it"
+        );
+    }
+
+    // Some body ops carry an explicit axis or axes parameter (Softmax,
+    // AxisOp::Move/Add/Rm, …) whose values are positions in the *original*
+    // rank.  The chunk axis is inserted at the chunked input's chunk
+    // position; every original axis at or beyond that position shifts
+    // right by one.  Translate accordingly.  When inputs disagree on the
+    // chunk position we punt (no consistent chunk_pos to translate
+    // against); that case shouldn't arise in a valid section.
+    let chunk_pos = chunk_input_axes.iter().map(|&(_, ax)| ax).next();
+    if let Some(cp) = chunk_pos {
+        ensure!(
+            chunk_input_axes.iter().all(|&(_, ax)| ax == cp),
+            "Body op {node}: chunked inputs disagree on chunk axis position {chunk_input_axes:?}"
+        );
+    }
+    let chunked_op = translate_body_op_axes(node.op.as_ref(), chunk_pos);
+    Ok(patch.wire_node(&*node.name, chunked_op, &new_inputs)?[0])
+}
+
+/// Rewrite an op's axis/axes parameters for the chunked frame, where
+/// the chunk axis was inserted at `chunk_pos` (taken from the chunked
+/// input's streaming axis position).  Original axes at or beyond
+/// `chunk_pos` shift right by one; axes strictly before it stay put.
+/// Handles `Softmax`, `AxisOp::Move/Add/Rm`; other axis-bearing ops
+/// fall through unchanged.
+fn translate_body_op_axes(op: &dyn TypedOp, chunk_pos: Option<usize>) -> Box<dyn TypedOp> {
+    use tract_core::ops::nn::{Softmax, SoftmaxKind};
+    let shift = |a: usize| match chunk_pos {
+        Some(cp) => chunked_axis_index(a, cp),
+        None => a,
+    };
+    if let Some(softmax) = op.downcast_ref::<Softmax>() {
+        let new_axes: TVec<usize> = softmax.axes.iter().map(|&a| shift(a)).collect();
+        let new_softmax = match &softmax.kind {
+            SoftmaxKind::Softmax(exp) => {
+                Softmax::new(new_axes, softmax.quant_output_dt, SoftmaxKind::Softmax(*exp))
+            }
+            SoftmaxKind::LogSoftmax => {
+                Softmax::new(new_axes, softmax.quant_output_dt, SoftmaxKind::LogSoftmax)
+            }
+        };
+        return Box::new(new_softmax);
+    }
+    if let Some(ax_op) = op.downcast_ref::<AxisOp>() {
+        // `Add(at)` inserts a new axis *before* the original position `at`.
+        // We want the new axis to land in the same broadcast slot relative
+        // to the original tensor, which means it stays at `at` when
+        // `at <= chunk_pos` (placed before the chunk axis) and shifts +1
+        // otherwise.  Move/Rm name existing axes, so their parameters
+        // translate via `chunked_axis_index` like any other label.
+        let add_shift = |a: usize| match chunk_pos {
+            Some(cp) if a > cp => a + 1,
+            _ => a,
+        };
+        let translated = match ax_op {
+            AxisOp::Move(from, to) => AxisOp::Move(shift(*from), shift(*to)),
+            AxisOp::Add(at) => AxisOp::Add(add_shift(*at)),
+            AxisOp::Rm(at) => AxisOp::Rm(shift(*at)),
+            other => other.clone(),
+        };
+        return Box::new(translated);
+    }
+    tract_core::dyn_clone::clone_box(op)
+}
+
+/// Insert `AddAxis(0)`s until the wire's rank reaches `target`.  Used to
+/// rank-bump tapped external constants (e.g. a rank-2 broadcast literal
+/// like the `0` in `ge(diff, 0)`) so they match the chunked-frame rank
+/// for rank-strict consumer ops.
+fn bump_rank_to(
+    patch: &mut TypedModelPatch,
+    node_name: &str,
+    slot: usize,
+    mut outlet: OutletId,
+    target: usize,
+) -> TractResult<OutletId> {
+    let mut rank = patch.outlet_fact(outlet)?.rank();
+    let mut step = 0;
+    while rank < target {
+        outlet = patch.wire_node(
+            format!("{node_name}.bump_rank.{slot}.{step}"),
+            AxisOp::Add(0),
+            &[outlet],
+        )?[0];
+        rank += 1;
+        step += 1;
+    }
+    Ok(outlet)
+}
+
+fn wire_terminator(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    chunked: &HashMap<OutletId, OutletId>,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<(OutletId, OutletId)> {
+    if let Some(op) = node.op_as::<Reduce>() {
+        return wire_terminator_reduce(patch, model, node, op, chunked);
+    }
+    if let Some(op) = node.op_as::<EinSum>() {
+        return wire_terminator_einsum(
+            patch,
+            model,
+            node,
+            op,
+            chunked,
+            mask,
+            contracted_axis,
+            chunk_sym,
+            k,
+        );
+    }
+    bail!("Unsupported operator {node}")
+}
+
+// ── Per-op-type implementations ─────────────────────────────────────────
+
+/// Initiator EinSum: tap each input from the model, wire a split reshape
+/// for it, then wire the chunked EinSum.  For banded masks, additionally
+/// wrap the input whose streaming axis tracks to `contracted_axis` (the
+/// score-matrix axis the section's terminator contracts) with a
+/// `WindowOnAxis(W)` + flatten reshape, so the within-chunk contracted
+/// axis on that input has size `W·k` instead of `k`.  Returns the chunked
+/// output.
+fn wire_initiator_einsum(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    op: &EinSum,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    let out_streaming_axes = streaming_positions(&node.outputs[0].fact, chunk_sym);
+    ensure!(
+        out_streaming_axes.len() == 2 && out_streaming_axes[1] == out_streaming_axes[0] + 1,
+        "Initiator EinSum output must have two contiguous streaming axes"
+    );
+    let score_rank = node.outputs[0].fact.rank();
+    let rank_diff = score_rank.checked_sub(2).ok_or_else(|| {
+        format_err!("Score rank {score_rank} < 2; cannot translate to mask frame")
+    })?;
+    let mut in_streaming_axes: TVec<usize> = tvec!();
+    for &input in &node.inputs {
+        let positions = streaming_positions(model.outlet_fact(input)?, chunk_sym);
+        ensure!(
+            positions.len() == 1,
+            "Initiator EinSum input must have exactly one streaming axis"
+        );
+        in_streaming_axes.push(positions[0]);
+    }
+
+    let mut chunked_inputs: TVec<OutletId> = tvec!();
+    for (ix, (&input, &stream_axis)) in node.inputs.iter().zip(in_streaming_axes.iter()).enumerate()
+    {
+        let tapped = patch.tap_model(model, input)?;
+        let chunked = wire_chunk_split(
+            patch,
+            &format!("{}.{ix}", node.name),
+            tapped,
+            stream_axis,
+            chunk_sym,
+            k,
+        )?;
+
+        // Banded path: if this input's stream axis is on the contracted
+        // side of the section, expose `W` chunks per pulse on it.
+        // Translate the tracked score axis to mask frame for the
+        // comparison with `contracted_axis` (also mask frame).
+        let tracked_in_score =
+            op.axes.track_axis((InOut::In(ix), stream_axis), InOut::Out(0))?.ok_or_else(|| {
+                format_err!(
+                    "EinSum stream axis on input {ix} doesn't track to a unique output axis"
+                )
+            })?;
+        let tracked_in_mask = tracked_in_score.checked_sub(rank_diff).ok_or_else(|| {
+            format_err!(
+                "Tracked score axis {tracked_in_score} doesn't map to mask frame \
+                 (rank_diff={rank_diff})"
+            )
+        })?;
+        let chunked = wrap_with_window_if_needed(
+            patch,
+            chunked,
+            stream_axis,
+            tracked_in_mask,
+            &format!("{}.{ix}", node.name),
+            mask,
+            contracted_axis,
+            k,
+        )?;
+        chunked_inputs.push(chunked);
+    }
+
+    let in_starts: Vec<Option<usize>> = in_streaming_axes.iter().map(|&p| Some(p)).collect();
+    let chunked_op = chunkify_einsum(op, &in_starts, Some(out_streaming_axes[0]))?;
+    Ok(patch.wire_node(format!("{}.blockified", node.name), chunked_op, &chunked_inputs)?[0])
+}
+
+/// Wrap `chunked` (shape `[..., S, k, ...]` with the streaming dim at
+/// `stream_axis`) with `WindowOnAxis(W) + flatten(W, k) → W·k` if the
+/// section requires it: the mask is banded AND the input's stream axis
+/// maps to the contracted score axis.  Otherwise pass through unchanged.
+///
+/// `score_axis` is where this input's stream axis lands on the score
+/// matrix (= input 0 of the terminator).  `window_start_for(mask,
+/// contracted_axis)` picks the slot offset so the W chunks cover the
+/// in-band range relative to the consumer's logical chunk index.
+fn wrap_with_window_if_needed(
+    patch: &mut TypedModelPatch,
+    chunked: OutletId,
+    stream_axis: usize,
+    score_axis: usize,
+    name_prefix: &str,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    k: i64,
+) -> TractResult<OutletId> {
+    if mask.is_block_diag() || score_axis != contracted_axis {
+        return Ok(chunked);
+    }
+    let window: usize = (mask.upper - mask.lower + 1) as usize;
+    let start = window_start_for(mask, contracted_axis);
+    let dt = patch.outlet_fact(chunked)?.datum_type;
+    let pad_value = Tensor::zero_scalar_dt(dt)?.into_arc_tensor();
+    let windowed = patch.wire_node(
+        format!("{name_prefix}.window"),
+        tract_pulse_opl::ops::WindowOnAxis { axis: stream_axis, window, start, pad_value },
+        &[chunked],
+    )?[0];
+    let from = tvec!(window.to_dim(), k.to_dim());
+    let to = tvec!(((window as i64) * k).to_dim());
+    let flatten = AxisOp::Reshape(stream_axis + 1, from, to);
+    Ok(patch.wire_node(format!("{name_prefix}.window_flat"), flatten, &[windowed])?[0])
+}
+
+/// Slot-0 offset for a window that covers the in-band range:
+///
+/// * `contracted_axis == mask.axis_a`: at consumer logical chunk c on
+///   the kept axis (= axis_b), we want `chunk(axis_a) ∈ [c + lower,
+///   c + upper]` → slot 0 is at `c + lower`, so `start = lower`.
+/// * `contracted_axis == mask.axis_b`: at consumer logical chunk c on
+///   the kept axis (= axis_a), we want `chunk(axis_b) ∈ [c - upper,
+///   c - lower]` → slot 0 is at `c - upper`, so `start = -upper`.
+fn window_start_for(mask: &MaskForm, contracted_axis: usize) -> i64 {
+    if contracted_axis == mask.axis_a { mask.lower } else { -mask.upper }
+}
+
+/// Reduce<Sum> terminator: wires a chunked Reduce on the within-chunk
+/// version of the original reduce axis.  If a downstream `RmAxis` removes
+/// the now-size-1 reduced slot, wire its chunked counterpart inside the
+/// patch and use its output as the boundary.
+fn wire_terminator_reduce(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    op: &Reduce,
+    chunked: &HashMap<OutletId, OutletId>,
+) -> TractResult<(OutletId, OutletId)> {
+    ensure!(op.reducer == Reducer::Sum && op.axes.len() == 1);
+    let chunked_input = chunked[&node.inputs[0]];
+    // Chunk insertion position: the first streaming axis of the input fact.
+    let in_fact = model.outlet_fact(node.inputs[0])?;
+    let stream_sym = first_streaming_symbol(in_fact)?;
+    let in_streaming = streaming_positions(in_fact, &stream_sym);
+    ensure!(!in_streaming.is_empty());
+    let chunk_pos = in_streaming[0];
+    let new_axis = chunked_axis_index(op.axes[0], chunk_pos);
+    let new_reduce = Reduce { axes: tvec!(new_axis), reducer: op.reducer };
+    let chunked_term =
+        patch.wire_node(format!("{}.blockified", node.name), new_reduce, &[chunked_input])?[0];
+
+    // If the immediate consumer is `AxisOp::Rm` on the (former) reduce
+    // axis, wire its chunked counterpart and use its output as the
+    // boundary.  Otherwise the Reduce's own output is the boundary.
+    let term_consumers = model.outlet_successors(OutletId::new(node.id, 0));
+    if term_consumers.len() == 1 {
+        let consumer = &model.nodes[term_consumers[0].node];
+        if let Some(AxisOp::Rm(axis)) = consumer.op_as::<AxisOp>()
+            && *axis == op.axes[0]
+        {
+            let new_axis = chunked_axis_index(op.axes[0], chunk_pos);
+            let chunked_rm = patch.wire_node(
+                format!("{}.blockified", consumer.name),
+                AxisOp::Rm(new_axis),
+                &[chunked_term],
+            )?[0];
+            return Ok((OutletId::new(consumer.id, 0), chunked_rm));
+        }
+    }
+    Ok((OutletId::new(node.id, 0), chunked_term))
+}
+
+/// EinSum terminator (e.g. ex02's `attn @ V`): chunkifies the second
+/// EinSum the same way as the initiator.  Inputs already in `chunked`
+/// (the multi-T-axis input from the body) are reused as-is; auxiliary
+/// inputs (single-T-axis) get a tap + split reshape inserted.  For
+/// banded masks, an auxiliary input whose stream axis maps (through
+/// this einsum) to the section's `contracted_axis` of the score matrix
+/// (= input 0 here) also gets `WindowOnAxis + flatten` so its
+/// within-chunk axis matches the W·k size of the windowed score.
+fn wire_terminator_einsum(
+    patch: &mut TypedModelPatch,
+    model: &TypedModel,
+    node: &TypedNode,
+    op: &EinSum,
+    chunked: &HashMap<OutletId, OutletId>,
+    mask: &MaskForm,
+    contracted_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<(OutletId, OutletId)> {
+    let score_rank = model.outlet_fact(node.inputs[0])?.rank();
+    let rank_diff = score_rank.checked_sub(2).ok_or_else(|| {
+        format_err!("Terminator score rank {score_rank} < 2; cannot translate to mask frame")
+    })?;
+    let mut chunked_inputs: TVec<OutletId> = tvec!();
+    let mut input_starts: Vec<Option<usize>> = vec![];
+    for (slot, &input) in node.inputs.iter().enumerate() {
+        let positions = streaming_positions(model.outlet_fact(input)?, chunk_sym);
+        if let Some(&already_chunked) = chunked.get(&input) {
+            // Multi-T-axis input from the body — already in chunked form
+            // (windowed if needed by the initiator).
+            chunked_inputs.push(already_chunked);
+            input_starts.push(positions.first().copied());
+        } else if positions.len() == 1 {
+            let tapped = patch.tap_model(model, input)?;
+            let in_fact = patch.outlet_fact(tapped)?.clone();
+            let stream_axis = in_fact
+                .shape
+                .iter()
+                .position(|d| d.symbols().contains(chunk_sym))
+                .ok_or_else(|| format_err!("auxiliary input lost streaming axis"))?;
+            let new_chunked = wire_chunk_split(
+                patch,
+                &format!("{}.in{slot}", node.name),
+                tapped,
+                stream_axis,
+                chunk_sym,
+                k,
+            )?;
+
+            // Where does this auxiliary's stream axis sit on the score
+            // matrix (= input 0 of this einsum)?  If it's the contracted
+            // side, window it so its within-chunk axis matches the
+            // already-windowed score input.  Translate the score axis
+            // to mask frame for the comparison with `contracted_axis`.
+            let aux_in_score = op.axes.track_axis((InOut::In(slot), stream_axis), InOut::In(0))?;
+            let new_chunked = if let Some(score_axis) = aux_in_score
+                && let Some(mask_axis) = score_axis.checked_sub(rank_diff)
+            {
+                wrap_with_window_if_needed(
+                    patch,
+                    new_chunked,
+                    stream_axis,
+                    mask_axis,
+                    &format!("{}.in{slot}", node.name),
+                    mask,
+                    contracted_axis,
+                    k,
+                )?
+            } else {
+                new_chunked
+            };
+            chunked_inputs.push(new_chunked);
+            input_starts.push(Some(positions[0]));
+        } else if positions.is_empty() {
+            chunked_inputs.push(patch.tap_model(model, input)?);
+            input_starts.push(None);
+        } else {
+            bail!(
+                "Blockify: EinSum terminator input {slot} has {} streaming axes (max 2)",
+                positions.len()
+            );
+        }
+    }
+
+    let out_streaming = streaming_positions(&node.outputs[0].fact, chunk_sym);
+    let chunked_op = chunkify_einsum(op, &input_starts, out_streaming.first().copied())?;
+    let chunked_term =
+        patch.wire_node(format!("{}.blockified", node.name), chunked_op, &chunked_inputs)?[0];
+    Ok((OutletId::new(node.id, 0), chunked_term))
+}
+
+/// Wire the boundary merge reshape: collapses [..., S, k, ...] back to
+/// [..., k·S, ...] so the patch's output matches the original outlet's
+/// shape (which is [..., k·S, ...] post-substitution).
+fn wire_merge_reshape(
+    patch: &mut TypedModelPatch,
+    boundary_name: &str,
+    chunked_form: OutletId,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    let chunked_fact = patch.outlet_fact(chunked_form)?.clone();
+    let chunk_pos = chunked_fact.shape.iter().position(|d| d == &chunk_sym.to_dim());
+    if let Some(pos) = chunk_pos
+        && pos + 1 < chunked_fact.shape.len()
+        && chunked_fact.shape[pos + 1] == k.to_dim()
+    {
+        let from = tvec!(chunk_sym.to_dim(), k.to_dim());
+        let to = tvec!(chunk_sym.to_dim() * k);
+        let reshape = AxisOp::Reshape(pos, from, to);
+        Ok(patch.wire_node(
+            format!("{}.blockify_merge", boundary_name),
+            reshape,
+            &[chunked_form],
+        )?[0])
+    } else {
+        Ok(chunked_form)
+    }
+}
+
+/// Compute the constant `c` such that `dim == c + k · chunk_sym`, when one
+/// exists.  Encoder-style conv stacks emit dims like `1 + (T+6)/8` which,
+/// after the `T → P · S` substitute, become `1 + 14·S` — affine in `S`
+/// with constant `c = 1`.  Blockify's chunked Reshape can't directly
+/// reshape `c + k·S → [S, k]`; we slice off the trailing `c` tokens
+/// first so the chunkable region is exactly `k·S`.
+///
+/// Returns `Some(c)` only when `c` is a non-negative integer constant.
+/// `c = 0` is the clean case (no slice needed).
+fn affine_chunk_offset(dim: &TDim, chunk_sym: &Symbol, k: i64) -> Option<i64> {
+    let target = chunk_sym.to_dim() * k;
+    let diff = dim.clone() - target;
+    let c = diff.to_i64().ok()?;
+    (c >= 0).then_some(c)
+}
+
+/// Wrap a chunked `Reshape(stream_axis, [dim], [chunk_sym, k])` with an
+/// `AffineChunkTrim` when the input dim is `c + k · chunk_sym` for
+/// `c > 0`, dropping the trailing `c` tokens so the Reshape sees `k·S`.
+fn wire_chunk_split(
+    patch: &mut TypedModelPatch,
+    name: &str,
+    input: OutletId,
+    stream_axis: usize,
+    chunk_sym: &Symbol,
+    k: i64,
+) -> TractResult<OutletId> {
+    let in_fact = patch.outlet_fact(input)?.clone();
+    let dim = in_fact.shape[stream_axis].clone();
+    let target = chunk_sym.to_dim() * k;
+    let mut wire = input;
+    if dim != target
+        && let Some(c) = affine_chunk_offset(&dim, chunk_sym, k)
+        && c > 0
+    {
+        wire = patch.wire_node(
+            format!("{name}.affine_trim"),
+            crate::ops::array::AffineChunkTrim {
+                axis: stream_axis,
+                typed_trim: c as usize,
+                target_per_pulse: k as usize,
+            },
+            &[wire],
+        )?[0];
+    }
+    let from = tvec!(patch.outlet_fact(wire)?.shape[stream_axis].clone());
+    let to = tvec!(chunk_sym.to_dim(), k.to_dim());
+    Ok(patch.wire_node(
+        format!("{name}.blockify_split"),
+        AxisOp::Reshape(stream_axis, from, to),
+        &[wire],
+    )?[0])
+}
+
+/// First streaming-symbol-bearing symbol on a fact's shape.  Used by
+/// terminator wiring to derive the chunk insertion position from the
+/// input fact, op-agnostically.
+fn first_streaming_symbol(fact: &TypedFact) -> TractResult<Symbol> {
+    fact.shape
+        .iter()
+        .find_map(|d| d.symbols().into_iter().next())
+        .context("No streaming axis found")
+}
+
+fn chunked_axis_index(orig_axis: usize, chunk_pos: usize) -> usize {
+    // The chunk batch axis is inserted at `chunk_pos` (the position of the
+    // first streaming output axis).  Every original axis at or after that
+    // position shifts right by one.  Streaming axes themselves are no
+    // exception — they shift by one too, because the chunk axis lives where
+    // they used to start.
+    if orig_axis < chunk_pos { orig_axis } else { orig_axis + 1 }
+}
+
+/// Insert the chunk-axis char at the streaming-axis position on each input
+/// and output.  `None` for an input/output skips the insertion (no streaming
+/// axis there, so no chunk axis on that side).  Within-chunk versions of
+/// formerly-streaming axes keep their original chars and shift right by 1.
+fn chunkify_einsum(
+    op: &EinSum,
+    input_streaming_starts: &[Option<usize>],
+    output_streaming_start: Option<usize>,
+) -> TractResult<EinSum> {
+    let (inputs, outputs) = op.axes.to_strs();
+    let new_repr = op.axes.available_label();
+    let insert_at = |s: &String, pos: Option<usize>| -> String {
+        let Some(p) = pos else {
+            return s.clone();
+        };
+        let mut chars: Vec<char> = s.chars().collect();
+        chars.insert(p, new_repr);
+        chars.into_iter().collect()
+    };
+    let new_inputs: Vec<String> = inputs
+        .iter()
+        .zip(input_streaming_starts.iter())
+        .map(|(s, &pos)| insert_at(s, pos))
+        .collect();
+    let new_outputs: Vec<String> = outputs
+        .iter()
+        .enumerate()
+        .map(|(i, s)| if i == 0 { insert_at(s, output_streaming_start) } else { s.clone() })
+        .collect();
+    let new_mapping = AxesMapping::from_strs(&new_inputs, &new_outputs)?;
+    Ok(EinSum { axes: new_mapping, operating_dt: op.operating_dt, q_params: op.q_params.clone() })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn coord(scope: &SymbolScope, axis: usize) -> TDim {
+        TDim::Sym(scope.sym(&format!("🎯{axis}")))
+    }
+
+    fn make_block_diag(scope: &SymbolScope, i: usize, j: usize, k: u64) -> TDim {
+        TDim::Eq(
+            Box::new(TDim::Div(Box::new(coord(scope, i)), k)),
+            Box::new(TDim::Div(Box::new(coord(scope, j)), k)),
+        )
+    }
+
+    fn make_banded(scope: &SymbolScope, a: usize, b: usize, k: u64, lo: i64, up: i64) -> TDim {
+        // Build the canonical form: Mul([Ge(Val(up), D), Ge(D, Val(lo))]).
+        let div_a = TDim::Div(Box::new(coord(scope, a)), k);
+        let div_b = TDim::Div(Box::new(coord(scope, b)), k);
+        let diff = (div_a - div_b).reduce();
+        let ge_upper = TDim::Ge(Box::new(TDim::Val(up)), Box::new(diff.clone())).reduce();
+        let ge_lower = TDim::Ge(Box::new(diff), Box::new(TDim::Val(lo))).reduce();
+        TDim::Mul(vec![ge_upper, ge_lower]).reduce()
+    }
+
+    #[test]
+    fn decode_mask_recognises_block_diag_canonical_form() {
+        let scope = SymbolScope::default();
+        let expr = make_block_diag(&scope, 0, 1, 2);
+        let m = decode_mask(&expr, &[0, 1]).unwrap();
+        assert_eq!((m.chunk_size, m.lower, m.upper), (2, 0, 0));
+    }
+
+    #[test]
+    fn decode_mask_recognises_block_diag_arbitrary_chunk_size() {
+        let scope = SymbolScope::default();
+        let expr = make_block_diag(&scope, 0, 1, 137);
+        let m = decode_mask(&expr, &[0, 1]).unwrap();
+        assert_eq!(m.chunk_size, 137);
+    }
+
+    #[test]
+    fn decode_mask_recognises_block_diag_swapped_axes() {
+        let scope = SymbolScope::default();
+        let expr = make_block_diag(&scope, 1, 0, 2);
+        let m = decode_mask(&expr, &[0, 1]).unwrap();
+        assert_eq!(m.chunk_size, 2);
+    }
+
+    #[test]
+    fn decode_mask_recognises_banded_form() {
+        // Mimics ex03: `0 ≤ chunk(0) - chunk(1) ≤ 1` with k=2.
+        let scope = SymbolScope::default();
+        let expr = make_banded(&scope, 0, 1, 2, 0, 1);
+        let m = decode_mask(&expr, &[0, 1]).unwrap();
+        assert_eq!((m.chunk_size, m.lower, m.upper, m.axis_a, m.axis_b), (2, 0, 1, 0, 1));
+    }
+
+    #[test]
+    fn decode_mask_recognises_banded_form_negative_lower() {
+        let scope = SymbolScope::default();
+        let expr = make_banded(&scope, 0, 1, 2, -1, 1);
+        let m = decode_mask(&expr, &[0, 1]).unwrap();
+        assert_eq!((m.chunk_size, m.lower, m.upper), (2, -1, 1));
+    }
+
+    #[test]
+    fn decode_mask_rejects_mismatched_chunk_sizes() {
+        let scope = SymbolScope::default();
+        let expr = TDim::Eq(
+            Box::new(TDim::Div(Box::new(coord(&scope, 0)), 2)),
+            Box::new(TDim::Div(Box::new(coord(&scope, 1)), 3)),
+        );
+        assert_eq!(decode_mask(&expr, &[0, 1]), None);
+    }
+
+    #[test]
+    fn decode_mask_rejects_non_streaming_axis() {
+        let scope = SymbolScope::default();
+        let expr = make_block_diag(&scope, 0, 2, 2);
+        assert_eq!(decode_mask(&expr, &[0, 1]), None);
+    }
+
+    #[test]
+    fn decode_mask_rejects_bare_ge() {
+        let scope = SymbolScope::default();
+        // A single Ge isn't a complete band — both bounds must be present.
+        let expr = TDim::Ge(
+            Box::new(TDim::Div(Box::new(coord(&scope, 0)), 2)),
+            Box::new(TDim::Div(Box::new(coord(&scope, 1)), 2)),
+        );
+        assert_eq!(decode_mask(&expr, &[0, 1]), None);
+    }
+
+    /// Exploratory probe: confirm what the `(0 <= diff <= L) ∧ (mask)` form looks
+    /// like at the TDim level after `reduce()`.  Kept as a regression on the
+    /// canonical form the recogniser expects.
+    #[test]
+    fn decode_banded_probe_canonical_form() {
+        let scope = SymbolScope::default();
+        let coord_a = coord(&scope, 0);
+        let coord_b = coord(&scope, 1);
+        let div_a = TDim::Div(Box::new(coord_a), 2);
+        let div_b = TDim::Div(Box::new(coord_b), 2);
+        let diff = (div_a.clone() - div_b.clone()).reduce();
+        let ge_lower = TDim::Ge(Box::new(diff.clone()), Box::new(TDim::Val(0))).reduce();
+        let ge_upper = TDim::Ge(Box::new(TDim::Val(1)), Box::new(diff.clone())).reduce();
+        let mask = TDim::Mul(vec![ge_upper, ge_lower]).reduce();
+        println!("PROBE diff = {diff:?}");
+        println!("PROBE mask = {mask:?}");
+        println!("PROBE mask display = {mask}");
+    }
+
+    #[test]
+    fn decode_mask_rejects_offset_in_numerator() {
+        let scope = SymbolScope::default();
+        let expr = TDim::Eq(
+            Box::new(TDim::Div(Box::new(TDim::Add(vec![coord(&scope, 0), TDim::Val(1)])), 2)),
+            Box::new(TDim::Div(Box::new(TDim::Add(vec![coord(&scope, 1), TDim::Val(1)])), 2)),
+        );
+        assert_eq!(decode_mask(&expr, &[0, 1]), None);
+    }
+
+    fn einsum_for(inputs: &[&str], output: &str) -> EinSum {
+        EinSum {
+            axes: AxesMapping::from_strs(inputs, &[output]).unwrap(),
+            operating_dt: f32::datum_type(),
+            q_params: None,
+        }
+    }
+
+    fn axes_to_strings(op: &EinSum) -> (Vec<String>, Vec<String>) {
+        let (ins, outs) = op.axes.to_strs();
+        (ins.into_iter().collect(), outs.into_iter().collect())
+    }
+
+    fn ck(op: &EinSum, ins: &[usize], out: usize) -> EinSum {
+        let in_starts: Vec<Option<usize>> = ins.iter().map(|&p| Some(p)).collect();
+        chunkify_einsum(op, &in_starts, Some(out)).unwrap()
+    }
+
+    #[test]
+    fn chunkify_einsum_handles_streaming_at_position_zero() {
+        let op = einsum_for(&["id", "jd"], "ij");
+        let chunked = ck(&op, &[0, 0], 0);
+        let (ins, outs) = axes_to_strings(&chunked);
+        let chunk_char = op.axes.available_label();
+        assert_eq!(ins[0], format!("{chunk_char}id"));
+        assert_eq!(ins[1], format!("{chunk_char}jd"));
+        assert_eq!(outs[0], format!("{chunk_char}ij"));
+    }
+
+    #[test]
+    fn chunkify_einsum_handles_streaming_at_inner_position() {
+        let op = einsum_for(&["bid", "bjd"], "bij");
+        let chunked = ck(&op, &[1, 1], 1);
+        let (ins, outs) = axes_to_strings(&chunked);
+        let chunk_char = op.axes.available_label();
+        assert_eq!(ins[0], format!("b{chunk_char}id"));
+        assert_eq!(ins[1], format!("b{chunk_char}jd"));
+        assert_eq!(outs[0], format!("b{chunk_char}ij"));
+    }
+
+    #[test]
+    fn chunkify_einsum_handles_mixed_input_positions() {
+        let op = einsum_for(&["id", "bjd"], "bij");
+        let chunked = ck(&op, &[0, 1], 1);
+        let (ins, outs) = axes_to_strings(&chunked);
+        let chunk_char = op.axes.available_label();
+        assert_eq!(ins[0], format!("{chunk_char}id"));
+        assert_eq!(ins[1], format!("b{chunk_char}jd"));
+        assert_eq!(outs[0], format!("b{chunk_char}ij"));
+    }
+
+    #[test]
+    fn chunkify_einsum_for_terminator_with_two_streaming_input() {
+        // ex02 terminator: "ij,jd->id".  Input 0 (masked) has streaming at
+        // positions 0 and 1 — chunk char goes before position 0.  Input 1
+        // (c) has streaming at position 0.  Output has streaming at 0.
+        let op = einsum_for(&["ij", "jd"], "id");
+        let chunked = ck(&op, &[0, 0], 0);
+        let (ins, outs) = axes_to_strings(&chunked);
+        let chunk_char = op.axes.available_label();
+        assert_eq!(ins[0], format!("{chunk_char}ij"));
+        assert_eq!(ins[1], format!("{chunk_char}jd"));
+        assert_eq!(outs[0], format!("{chunk_char}id"));
+    }
+
+    #[test]
+    fn chunked_axis_index_zero_chunk_position() {
+        // Chunk at output pos 0; everything shifts right by 1.
+        assert_eq!(chunked_axis_index(0, 0), 1);
+        assert_eq!(chunked_axis_index(1, 0), 2);
+    }
+
+    #[test]
+    fn chunked_axis_index_inner_chunk_position() {
+        // Chunk at output pos 1; axis 0 stays, axes 1+ shift right.
+        assert_eq!(chunked_axis_index(0, 1), 0);
+        assert_eq!(chunked_axis_index(1, 1), 2);
+        assert_eq!(chunked_axis_index(2, 1), 3);
+    }
+
+    /// Build a tiny model with two parallel chains of identity ops, all
+    /// claiming multi-T-axis shape via hand-crafted facts, and check that
+    /// the connected-components walker splits them.
+    #[test]
+    fn connected_components_splits_independent_subgraphs() {
+        // Build the model topologically by hand: two parallel pairs of
+        // sources, each consumed by an identity-ish op (we use AxisOp::Add
+        // to add a unit axis — its output shape doesn't actually matter for
+        // this test; we only edit the fact afterwards to make the walker
+        // see multi-T-axis on selected nodes).
+        let mut model = TypedModel::default();
+        let t = model.symbols.sym("T");
+
+        let a1 = model.add_source("a1", f32::fact(dims![t.clone(), 4_usize].as_ref())).unwrap();
+        let b1 =
+            model.wire_node("b1", tract_core::ops::change_axes::AxisOp::Add(0), &[a1]).unwrap()[0];
+        let c1 =
+            model.wire_node("c1", tract_core::ops::change_axes::AxisOp::Add(0), &[b1]).unwrap()[0];
+
+        let a2 = model.add_source("a2", f32::fact(dims![t.clone(), 4_usize].as_ref())).unwrap();
+        let b2 =
+            model.wire_node("b2", tract_core::ops::change_axes::AxisOp::Add(0), &[a2]).unwrap()[0];
+        let c2 =
+            model.wire_node("c2", tract_core::ops::change_axes::AxisOp::Add(0), &[b2]).unwrap()[0];
+
+        model.select_output_outlets(&[c1, c2]).unwrap();
+
+        // Pretend nodes b1, c1, b2, c2 are multi-T-axis (the function we're
+        // testing only inspects connectivity; it doesn't look at facts).
+        let multi: BTreeSet<usize> = [b1.node, c1.node, b2.node, c2.node].into_iter().collect();
+        let groups = connected_components(&model, &multi);
+        assert_eq!(groups.len(), 2, "expected two independent components: {groups:?}");
+        // Each component must contain exactly its two nodes.
+        let g0: BTreeSet<usize> = [b1.node, c1.node].into_iter().collect();
+        let g1: BTreeSet<usize> = [b2.node, c2.node].into_iter().collect();
+        assert!(groups.iter().any(|g| *g == g0), "expected component {g0:?} in {groups:?}");
+        assert!(groups.iter().any(|g| *g == g1), "expected component {g1:?} in {groups:?}");
+    }
+}
diff --git a/pulse/src/fact.rs b/pulse/src/fact.rs
index ef9d027d73..f432d79f6e 100644
--- a/pulse/src/fact.rs
+++ b/pulse/src/fact.rs
@@ -42,7 +42,7 @@ impl PulsedFact {
             .stream_info(symbol)
             .ok_or_else(|| format_err!("Can not pulse a tensor with no streaming dim"))?;
         let mut shape: TVec<TDim> = tf.shape.to_tvec();
-        shape[axis] = pulse.clone();
+        shape[axis] = shape[axis].substitute(symbol, pulse)?;
         Ok(PulsedFact {
             datum_type,
             shape: shape.into(),
@@ -121,3 +121,32 @@ impl<'a> From<&'a PulsedFact> for TypedFact {
         fact.datum_type.fact(fact.shape.clone())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn pulsed_fact_from_pure_symbol() {
+        let symbols = SymbolScope::default();
+        let s = symbols.sym("S");
+        let tf = f32::fact(tvec!(s.to_dim(), 4.to_dim()));
+        let pulse = PulsedFact::from_tensor_fact_pulse(&tf, &s, &2.to_dim()).unwrap();
+        assert_eq!(&*pulse.shape, &[2.to_dim(), 4.to_dim()]);
+        let stream = pulse.stream.unwrap();
+        assert_eq!(stream.axis, 0);
+        assert_eq!(stream.dim, s.to_dim());
+    }
+
+    #[test]
+    fn pulsed_fact_from_symbol_multiple() {
+        let symbols = SymbolScope::default();
+        let s = symbols.sym("S");
+        let tf = f32::fact(tvec!(s.to_dim() * 2, 4.to_dim()));
+        let pulse = PulsedFact::from_tensor_fact_pulse(&tf, &s, &1.to_dim()).unwrap();
+        assert_eq!(&*pulse.shape, &[2.to_dim(), 4.to_dim()]);
+        let stream = pulse.stream.unwrap();
+        assert_eq!(stream.axis, 0);
+        assert_eq!(stream.dim, s.to_dim() * 2);
+    }
+}
diff --git a/pulse/src/lib.rs b/pulse/src/lib.rs
index 24ea6f3c5c..73ef44d750 100644
--- a/pulse/src/lib.rs
+++ b/pulse/src/lib.rs
@@ -2,6 +2,7 @@
 #[macro_use]
 pub mod macros;
 
+pub mod blockify;
 pub mod fact;
 pub mod model;
 pub mod ops;
@@ -21,6 +22,7 @@ pub mod internal {
 use std::ops::ControlFlow;
 
 use internal::*;
+use tract_core::optim::TypedPass;
 use tract_core::transform::ModelTransform;
 use tract_pulse_opl::tract_nnef::tract_core;
 
@@ -43,6 +45,9 @@ impl ModelTransform for PulseTransform {
         let symbol = self.0.symbol.as_deref().unwrap_or("S");
         let sym = model.symbols.sym(symbol);
         let pulse_dim = parse_tdim(&model.symbols, &self.0.pulse)?;
+        ops::diag_gather::detect_diag_gather(model)?;
+        tract_core::optim::propagate_roi::PropagateRoi.run_direct(model)?;
+        model.declutter()?;
         let pulsed = model::PulsedModel::new(model, sym, &pulse_dim)?;
         *model = pulsed.into_typed()?;
         Ok(())
@@ -51,6 +56,10 @@ impl ModelTransform for PulseTransform {
 
 register_model_transform!("pulse", PulseConfig, |config| Ok(Box::new(PulseTransform(config))));
 
+register_model_transform!("blockify", blockify::BlockifyConfig, |config| Ok(Box::new(
+    blockify::BlockifyTransform(config)
+)));
+
 pub trait WithPulse {
     fn enable_pulse(&mut self);
     fn with_pulse(self) -> Self;
@@ -58,7 +67,6 @@ pub trait WithPulse {
 
 impl WithPulse for tract_nnef::framework::Nnef {
     fn enable_pulse(&mut self) {
-        self.enable_tract_core();
         self.registries.push(tract_nnef_registry());
     }
     fn with_pulse(mut self) -> Self {
@@ -120,4 +128,169 @@ mod tests {
         assert_eq!(*pulse.input_fact(0).unwrap().to_typed_fact().unwrap(), f32::fact([4, 2, 3]));
         assert_eq!(*pulse.output_fact(0).unwrap().to_typed_fact().unwrap(), f32::fact([4, 2, 3]));
     }
+
+    #[test]
+    fn test_reshape_split_streaming_axis() {
+        use tract_core::ops::change_axes::AxisOp;
+        let mut model = TypedModel::default();
+        let s = model.symbols.sym("S");
+        let a = model.add_source("a", f32::fact(dims![s.to_dim() * 2, 4].as_ref())).unwrap();
+        let split = model
+            .wire_node(
+                "split",
+                AxisOp::Reshape(0, tvec!(s.to_dim() * 2), tvec!(s.to_dim(), 2.to_dim())),
+                &[a],
+            )
+            .unwrap();
+        model.select_output_outlets(&split).unwrap();
+        let pulse = PulsedModel::new(&model, s.clone(), &1.to_dim()).unwrap();
+        assert_eq!(*pulse.input_fact(0).unwrap().to_typed_fact().unwrap(), f32::fact([2, 4]));
+        assert_eq!(*pulse.output_fact(0).unwrap().to_typed_fact().unwrap(), f32::fact([1, 2, 4]));
+        let out_stream = pulse.output_fact(0).unwrap().stream.as_ref().unwrap();
+        assert_eq!(out_stream.axis, 0);
+        assert_eq!(out_stream.dim, s.to_dim());
+    }
+
+    #[test]
+    fn test_reshape_merge_streaming_axis() {
+        use tract_core::ops::change_axes::AxisOp;
+        let mut model = TypedModel::default();
+        let s = model.symbols.sym("S");
+        let a = model.add_source("a", f32::fact(dims![s, 2, 4].as_ref())).unwrap();
+        let merged = model
+            .wire_node(
+                "merge",
+                AxisOp::Reshape(0, tvec!(s.to_dim(), 2.to_dim()), tvec!(s.to_dim() * 2)),
+                &[a],
+            )
+            .unwrap();
+        model.select_output_outlets(&merged).unwrap();
+        let pulse = PulsedModel::new(&model, s.clone(), &1.to_dim()).unwrap();
+        assert_eq!(*pulse.input_fact(0).unwrap().to_typed_fact().unwrap(), f32::fact([1, 2, 4]));
+        assert_eq!(*pulse.output_fact(0).unwrap().to_typed_fact().unwrap(), f32::fact([2, 4]));
+        let out_stream = pulse.output_fact(0).unwrap().stream.as_ref().unwrap();
+        assert_eq!(out_stream.axis, 0);
+        assert_eq!(out_stream.dim, s.to_dim() * 2);
+    }
+
+    #[test]
+    fn test_reshape_split_then_run() {
+        use tract_core::ops::change_axes::AxisOp;
+        let mut model = TypedModel::default();
+        let s = model.symbols.sym("S");
+        let a = model.add_source("a", f32::fact(dims![s.to_dim() * 2].as_ref())).unwrap();
+        let split = model
+            .wire_node(
+                "split",
+                AxisOp::Reshape(0, tvec!(s.to_dim() * 2), tvec!(s.to_dim(), 2.to_dim())),
+                &[a],
+            )
+            .unwrap();
+        model.select_output_outlets(&split).unwrap();
+
+        let pulse = PulsedModel::new(&model, s, &1.to_dim()).unwrap();
+        let plan = SimplePlan::new(pulse.into_typed().unwrap()).unwrap();
+        let mut state = SimpleState::new(&plan).unwrap();
+        let chunk1 = tensor1(&[1f32, 2.0]);
+        let out1 = state.run(tvec!(chunk1.into_tvalue())).unwrap();
+        assert_eq!(*out1[0], tensor2(&[[1f32, 2.0]]).into());
+        let chunk2 = tensor1(&[3f32, 4.0]);
+        let out2 = state.run(tvec!(chunk2.into_tvalue())).unwrap();
+        assert_eq!(*out2[0], tensor2(&[[3f32, 4.0]]).into());
+    }
+
+    /// Two parallel pulse paths meeting at an elementwise op produce
+    /// different per-pulse stream-axis sizes when one path goes through a
+    /// ConvTranspose (kernel > stride) and the other doesn't.  Pre-fix
+    /// pulsification bailed at the meet point because the typed
+    /// `output_facts`' `multi_broadcast` returned `Broadcast(K_a, K_b)`
+    /// on the stream axis -- not equal, not 1, doesn't simplify.  After fix
+    /// the merge uses LCM for the stream axis specifically.
+    ///
+    /// Minimal repro of the Pocket-TTS upsample-then-attention pattern:
+    /// a ConvTranspose1d(stride=4, kernel=8) emits steady-state stride=4
+    /// frames per pulse with 4-frame overlap-add; an arange of the same
+    /// post-convtr length produces (after our Range slope-based fix) also
+    /// 4 frames per pulse; an elementwise Add of the two requires the
+    /// meet-point merge to be LCM(4, 4) = 4 (trivial here, but the path
+    /// went through Broadcast(4, 8) before slope+LCM fixes were in place).
+    #[test]
+    fn test_pulse_meet_with_arange_branch_types_through() {
+        use tract_core::ops::array::Range;
+        use tract_core::ops::cnn::{Deconv, KernelFormat, PaddingSpec, PoolSpec};
+        use tract_core::ops::nn::DataFormat;
+
+        let mut model = TypedModel::default();
+        let t = model.symbols.sym("T");
+        let src = model.add_source("x", f32::fact(dims![1, 2, t.to_dim()].as_ref())).unwrap();
+
+        // ConvTranspose1d(C=2, kernel=8, stride=4) → stream-axis dim
+        // becomes 4*T + 4 (post overlap-add tail).
+        let kernel = model
+            .add_const("kernel", tract_core::ndarray::Array3::<f32>::zeros((2, 2, 8)))
+            .unwrap();
+        let bias = model.add_const("bias", tract_core::ndarray::arr1(&[0.0f32, 0.0])).unwrap();
+        let conv_out = model
+            .wire_node(
+                "convtr",
+                Deconv {
+                    pool_spec: PoolSpec {
+                        data_format: DataFormat::NCHW,
+                        kernel_shape: tvec!(8),
+                        padding: PaddingSpec::Valid,
+                        dilations: Some(tvec!(1)),
+                        strides: Some(tvec!(4)),
+                        input_channels: 2,
+                        output_channels: 2,
+                    },
+                    kernel_format: KernelFormat::OIHW,
+                    adjustments: tvec!(0),
+                    group: 1,
+                },
+                &[src, kernel, bias],
+            )
+            .unwrap()[0];
+
+        // arange(0, 4*T + 4) of the same stream-axis length — this is the
+        // branch that surfaced the Broadcast bug pre-fix.
+        let start = model.add_const("range_start", tensor0(TDim::Val(0))).unwrap();
+        let end = model
+            .add_const(
+                "range_end",
+                tract_core::ndarray::arr0(t.to_dim() * 4 + 4).into_dyn().into_tensor(),
+            )
+            .unwrap();
+        let step = model.add_const("range_step", tensor0(TDim::Val(1))).unwrap();
+        let range_out = model
+            .wire_node("range", Range::new(t.to_dim() * 4 + 4), &[start, end, step])
+            .unwrap()[0];
+
+        // Cast range to f32 and broadcast-shape with conv_out so they Add.
+        let range_f32 = model
+            .wire_node("range_cast", tract_core::ops::cast::cast(f32::datum_type()), &[range_out])
+            .unwrap()[0];
+        let range_bc = model
+            .wire_node(
+                "range_unsqueeze",
+                tract_core::ops::change_axes::AxisOp::Add(0),
+                &[range_f32],
+            )
+            .unwrap()[0];
+        let range_bc = model
+            .wire_node(
+                "range_unsqueeze2",
+                tract_core::ops::change_axes::AxisOp::Add(0),
+                &[range_bc],
+            )
+            .unwrap()[0];
+
+        let added =
+            model.wire_node("add", tract_core::ops::math::add(), &[conv_out, range_bc]).unwrap();
+        model.select_output_outlets(&added).unwrap();
+
+        // The point of the test: this used to panic with
+        // `Pulsification requires pulse Broadcast(4, 8) ...` at the
+        // downstream meet point.  Now it should pulsify without error.
+        let _pulse = PulsedModel::new(&model, t, &2.to_dim()).expect("pulsification");
+    }
 }
diff --git a/pulse/src/model.rs b/pulse/src/model.rs
index 7a991b06f1..d84ea20dd0 100644
--- a/pulse/src/model.rs
+++ b/pulse/src/model.rs
@@ -10,6 +10,108 @@ use tract_pulse_opl::tract_core::ops::source::TypedSource;
 pub type PulsedModel = Graph<PulsedFact, Box<dyn PulsedOp>>;
 pub type PulsedNode = Node<PulsedFact, Box<dyn PulsedOp>>;
 
+/// Pre-flight check: reject models with wires whose size is superlinear in the
+/// streaming symbol but have no `region_of_interest` annotation.
+///
+/// A wire is superlinear when the streaming symbol appears in more than one
+/// shape dimension (e.g. `[T, T]` or `[T, 2T-1]`).  Such wires cannot be
+/// pulsified unless ROI narrows the live region to linear size.
+fn check_no_unannotated_superlinear_wires(model: &TypedModel, symbol: &Symbol) -> TractResult<()> {
+    for node in &model.nodes {
+        for (slot, output) in node.outputs.iter().enumerate() {
+            let streaming_dims: usize =
+                output.fact.shape.iter().filter(|d| d.symbols().contains(symbol)).count();
+            if streaming_dims <= 1
+                || output.fact.region_of_interest.is_some()
+                || output.fact.uniform_tdim.is_some()
+                || output.fact.konst.is_some()
+            {
+                continue;
+            }
+            // Avoid false positives: if any input to this node already carries
+            // an ROI or uniform_tdim that mentions the streaming symbol, the
+            // consumer pulsifier can typically derive what it needs from that
+            // annotation without one on this wire (e.g. Iff outputs inherit
+            // the cond/scores ROI structurally; Softmax output inherits its
+            // input's ROI; MultiBroadcastTo fills inherit the broadcast target
+            // ROI).  Only ops whose *inputs* are all unannotated are genuine
+            // ROI-propagation gaps.
+            let any_input_annotated = node.inputs.iter().any(|inp| {
+                model
+                    .outlet_fact(*inp)
+                    .map(|f| f.region_of_interest.is_some() || f.uniform_tdim.is_some())
+                    .unwrap_or(false)
+            });
+            if any_input_annotated {
+                continue;
+            }
+            log::warn!(
+                "Wire {}/{} ({:?}) has shape {:?} which is superlinear in streaming \
+                 symbol {} ({} dimensions depend on it) but carries no region_of_interest \
+                 annotation, and none of its inputs do either. Pulsification may fail.",
+                node.name,
+                slot,
+                OutletId::new(node.id, slot),
+                output.fact.shape,
+                symbol,
+                streaming_dims,
+            );
+        }
+    }
+    Ok(())
+}
+
+/// LCM of the stream-axis dims across all stream-bearing inputs.
+///
+/// Used at elementwise pulse meet-points where parallel paths emit
+/// different per-pulse sizes (e.g. ConvTranspose with kernel > stride
+/// surfacing as `(K_steady, K_initial)` on the two phases of the pulse
+/// cycle). Two semantics get conflated otherwise:
+///
+/// * Tensor shape compatibility (must match at runtime): non-stream
+///   axes use NumPy `Broadcast` -- equal or one is 1, anything else
+///   fails.
+/// * Pulse-divisibility (a scalar constraint on per-pulse cycle): on
+///   the stream axis, two paths with steady-state size `K_a` and `K_b`
+///   are compatible at any pulse that is a multiple of
+///   `LCM(K_a, K_b)`.
+///
+/// Returns `None` if any stream-axis dim is symbolic; the caller falls
+/// back to the unmodified shape `multi_broadcast` produced.
+pub fn stream_axis_lcm(inputs: &[&PulsedFact]) -> Option<TDim> {
+    let mut dims = inputs.iter().filter_map(|f| f.stream.as_ref().map(|s| &f.shape[s.axis]));
+    let first = dims.next()?.clone();
+    dims.try_fold(first, |acc, d| acc.lcm(d))
+}
+
+/// Pulse-driven path: the pulse value is concrete, so we mint S, substitute
+/// `T → pulse_value · S` ourselves, and call blockify just for the section
+/// rewrites.  The audio-side multiplier is user-driven — required when
+/// there's subsampling between the streaming source and the section's mask
+/// wire (e.g. a stride-2 pool: audio chunk = 2 × post-pool chunk).
+fn pulse_driven_blockify(
+    model: &mut TypedModel,
+    symbol: &Symbol,
+    pulse_value: i64,
+) -> TractResult<(Symbol, TDim)> {
+    let chunk_sym = model.symbols.new_with_prefix("S");
+    // `S >= 0` is the precondition for the `Div(Add([k·X, …, c]), k) → X`
+    // simplification (commit 11b310622).  Without it, post-substitute shapes
+    // like `1 + (3 + 56·S)/4` stay unreduced and blockify's chunked Reshape
+    // volume check fails comparing them to `14·S`.
+    model.symbols.add_assertion(format!("{chunk_sym} >= 0"))?;
+    let subs: HashMap<Symbol, TDim> =
+        HashMap::from([(symbol.clone(), chunk_sym.to_dim() * pulse_value)]);
+    *model = model.substitute_symbols(&subs)?;
+    crate::blockify::rewrite_sections(model, &chunk_sym, pulse_value)?;
+    model.properties.insert(
+        crate::blockify::BLOCKIFY_ORIGINAL_SYMBOL.to_string(),
+        tensor1(&[format!("{symbol}")]).into_arc_tensor(),
+    );
+    // Streaming dim is now `pulse_value · S`, so one pulse covers exactly one S.
+    Ok((chunk_sym, 1.to_dim()))
+}
+
 #[allow(clippy::new_ret_no_self)]
 pub trait PulsedModelExt {
     fn new(source: &TypedModel, symbol: Symbol, pulse: &TDim) -> TractResult<PulsedModel>;
@@ -33,23 +135,50 @@ impl PulsedModelExt for PulsedModel {
         symbol: Symbol,
         pulse: &TDim,
     ) -> TractResult<(PulsedModel, HashMap<OutletId, OutletId>)> {
+        check_no_unannotated_superlinear_wires(source, &symbol)?;
+        use tract_core::optim::TypedPass;
+        let mut blockified = source.clone();
+        // Mirror PulseTransform's pre-fold so callers entering through
+        // PulsedModel::new (or `--pulse`) get the same treatment.
+        crate::ops::diag_gather::detect_diag_gather(&mut blockified)?;
+        tract_core::optim::propagate_roi::PropagateRoi.run_direct(&mut blockified)?;
+        blockified.declutter()?;
+        let (stream_sym, pulse_dim) = match pulse.as_i64() {
+            Some(pv) if crate::blockify::has_quadratic_sections(&blockified, &symbol)? => {
+                pulse_driven_blockify(&mut blockified, &symbol, pv)?
+            }
+            _ => (symbol, pulse.clone()),
+        };
         let pulsifiers = crate::ops::OpPulsifier::inventory();
-        Pulsifier(symbol, pulse.to_owned(), pulsifiers).translate_model_with_mappings(source)
+        let (mut pulsed, mapping) = Pulsifier(stream_sym, pulse_dim, pulsifiers)
+            .translate_model_with_mappings(&blockified)?;
+        for key in [
+            crate::blockify::BLOCKIFY_CHUNK_SYMBOL,
+            crate::blockify::BLOCKIFY_CHUNK_SIZE,
+            crate::blockify::BLOCKIFY_ORIGINAL_SYMBOL,
+        ] {
+            if let Some(v) = blockified.properties.get(key) {
+                pulsed.properties.insert(key.to_string(), v.clone());
+            }
+        }
+        Ok((pulsed, mapping))
     }
 
     fn into_typed(self) -> TractResult<TypedModel> {
         let mut typed = tract_core::model::translator::IntoTranslator.translate_model(&self)?;
         ensure!(
-            self.input_outlets()?.iter().all(|o| self.outlet_fact(*o).unwrap().stream.is_some())
+            self.input_outlets()?.iter().any(|o| self.outlet_fact(*o).unwrap().stream.is_some())
         );
         ensure!(
-            self.output_outlets()?.iter().all(|o| self.outlet_fact(*o).unwrap().stream.is_some())
+            self.output_outlets()?.iter().any(|o| self.outlet_fact(*o).unwrap().stream.is_some())
         );
         let delays = tensor1(
             &self
                 .output_outlets()?
                 .iter()
-                .map(|oo| Ok(self.outlet_fact(*oo)?.stream.as_ref().unwrap().delay as _))
+                .map(|oo| {
+                    Ok(self.outlet_fact(*oo)?.stream.as_ref().map(|s| s.delay as i64).unwrap_or(0))
+                })
                 .collect::<TractResult<TVec<i64>>>()?,
         );
         typed.properties.insert("pulse.delay".to_string(), delays.into_arc_tensor());
@@ -57,7 +186,9 @@ impl PulsedModelExt for PulsedModel {
             &self
                 .input_outlets()?
                 .iter()
-                .map(|oo| Ok(self.outlet_fact(*oo)?.stream.as_ref().unwrap().axis as _))
+                .map(|oo| {
+                    Ok(self.outlet_fact(*oo)?.stream.as_ref().map(|s| s.axis as i64).unwrap_or(-1))
+                })
                 .collect::<TractResult<TVec<i64>>>()?,
         );
         typed.properties.insert("pulse.input_axes".to_string(), input_axes.into_arc_tensor());
@@ -65,10 +196,29 @@ impl PulsedModelExt for PulsedModel {
             &self
                 .output_outlets()?
                 .iter()
-                .map(|oo| Ok(self.outlet_fact(*oo)?.stream.as_ref().unwrap().axis as _))
+                .map(|oo| {
+                    Ok(self.outlet_fact(*oo)?.stream.as_ref().map(|s| s.axis as i64).unwrap_or(-1))
+                })
                 .collect::<TractResult<TVec<i64>>>()?,
         );
         typed.properties.insert("pulse.output_axes".to_string(), output_axes.into_arc_tensor());
+        // Stash the streaming symbol's name so downstream consumers (CLI run
+        // path, etc.) can resolve `op.end_input.eval(...)` and other symbolic
+        // dims at runtime.  The symbol lives in TDims like a PulsePad's
+        // `end_input = stream.dim + …`; without binding it, those evals hit
+        // `usize::MAX` fallbacks and end-of-stream padding silently misfires.
+        let stream_syms: Vec<String> = self
+            .input_outlets()?
+            .iter()
+            .filter_map(|oo| self.outlet_fact(*oo).unwrap().stream.as_ref())
+            .flat_map(|s| s.dim.symbols())
+            .map(|s| s.to_string())
+            .collect();
+        if let Some(name) = stream_syms.into_iter().next() {
+            typed
+                .properties
+                .insert("pulse.streaming_symbol".to_string(), tensor1(&[name]).into_arc_tensor());
+        }
         Ok(typed)
     }
 }
@@ -251,13 +401,31 @@ impl PulsedOp for PulseWrappingOp {
         let axes_mapping = self.0.axes_mapping(&input_facts_ref, &output_facts_ref)?;
         let axis_info = axes_mapping.axis((InOut::In(pulsing_input), stream.axis))?;
         std::mem::drop(output_facts_ref);
+        // When parallel pulse paths converge at an elementwise op, the
+        // typed `output_facts` falls through to `multi_broadcast` for shape
+        // merging, which produces `Broadcast([K_a, K_b])` on the stream
+        // axis when the inputs have different per-pulse sizes (e.g.
+        // steady-state `stride` vs initial-state `kernel` of a
+        // streaming convtr surfacing on two phases of the cycle).
+        // `Broadcast` is the right semantic for non-stream axes (shape
+        // compatibility at runtime) but a category error for the stream
+        // axis: there the merged constraint is *scalar pulse-divisibility*
+        // and the right answer is LCM. Compute it post-hoc and override
+        // the offending dim before it propagates downstream.
+        let merged_stream_dim = stream_axis_lcm(inputs);
         output_facts
             .into_iter()
             .enumerate()
             .map(|(ix, tf)| {
                 if let &[axis] = &*axis_info.outputs[ix] {
+                    let mut shape = tf.shape;
+                    if let Some(merged) = merged_stream_dim.as_ref() {
+                        if matches!(shape[axis], TDim::Broadcast(_)) {
+                            shape.set(axis, merged.clone());
+                        }
+                    }
                     Ok(PulsedFact {
-                        shape: tf.shape,
+                        shape,
                         datum_type: tf.datum_type,
                         stream: Some(StreamInfo {
                             delay: stream.delay,
diff --git a/pulse/src/ops/array/affine_trim.rs b/pulse/src/ops/array/affine_trim.rs
new file mode 100644
index 0000000000..12697f2e78
--- /dev/null
+++ b/pulse/src/ops/array/affine_trim.rs
@@ -0,0 +1,121 @@
+//! `AffineChunkTrim` — pulse-aware "drop trailing `c` on the streaming
+//! axis" op for affine-tail dims (`c + k·S`).  Replaces a typed
+//! `Slice(0, k·S)` whose pulsifier is identity and would leave the
+//! per-pulse buffer at `c + k`, breaking the chunked Reshape downstream
+//! that expects `k`.  Some upstream pulsifiers absorb `c` into Delay
+//! state and emit `k` per pulse; others emit `c + k`.  Handles both by
+//! trimming only when the input exceeds `target_per_pulse`.
+
+use crate::internal::*;
+
+register_all!(AffineChunkTrim: pulsify);
+
+fn pulsify(
+    op: &AffineChunkTrim,
+    _source: &TypedModel,
+    node: &TypedNode,
+    target: &mut PulsedModel,
+    mapping: &HashMap<OutletId, OutletId>,
+    _symbol: &Symbol,
+    _pulse: &TDim,
+) -> TractResult<Option<TVec<OutletId>>> {
+    let input = mapping[&node.inputs[0]];
+    target.wire_node(&*node.name, op.clone(), &[input]).map(Some)
+}
+
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub struct AffineChunkTrim {
+    pub axis: usize,
+    pub typed_trim: usize,
+    pub target_per_pulse: usize,
+}
+
+impl Op for AffineChunkTrim {
+    fn name(&self) -> StaticName {
+        "AffineChunkTrim".into()
+    }
+
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!(
+            "axis: {} typed_trim: {} target_per_pulse: {}",
+            self.axis, self.typed_trim, self.target_per_pulse
+        )])
+    }
+
+    op_as_typed_op!();
+}
+
+impl EvalOp for AffineChunkTrim {
+    fn is_stateless(&self) -> bool {
+        true
+    }
+
+    fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
+        let input = args_1!(inputs);
+        let n = input.shape()[self.axis];
+        let take = if n.saturating_sub(self.typed_trim) >= self.target_per_pulse {
+            n - self.typed_trim
+        } else {
+            n
+        };
+        if take == n {
+            return Ok(tvec!(input));
+        }
+        unsafe {
+            let mut shape: TVec<usize> = input.shape().into();
+            shape[self.axis] = take;
+            let mut tensor = Tensor::uninitialized_dt(input.datum_type(), &shape)?;
+            tensor.assign_slice_unchecked(.., &input, 0..take, self.axis);
+            Ok(tvec!(tensor.into_tvalue()))
+        }
+    }
+}
+
+impl TypedOp for AffineChunkTrim {
+    as_op!();
+
+    fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
+        let mut fact = inputs[0].without_value();
+        let dim = fact.shape[self.axis].clone();
+        // Mirror `eval` / `pulsed_output_facts`: trim only when the input
+        // axis is concrete *and* strictly exceeds `target_per_pulse`.  Two
+        // cases this handles:
+        //   - pre-pulse typed model with symbolic input `c + k·S`: dim is
+        //     not concretely sized → subtract `typed_trim` (`c`) as before.
+        //   - pulsified model where the upstream emits `target_per_pulse`
+        //     directly (e.g. Delay absorbed `c`): dim is concrete and equal
+        //     to `target_per_pulse` → no trim, matching eval.  Previously
+        //     this branch returned `dim - typed_trim` which lied about the
+        //     output shape and broke downstream consumers (e.g. a Reshape
+        //     keyed off `target_per_pulse`) — visible when --cuda translates
+        //     the pulsified encoder.
+        let new_dim = if let Ok(cur) = dim.to_usize() {
+            if cur > self.target_per_pulse { self.target_per_pulse.to_dim() } else { cur.to_dim() }
+        } else {
+            dim - self.typed_trim.to_dim()
+        };
+        fact.shape.set(self.axis, new_dim);
+        Ok(tvec!(fact))
+    }
+}
+
+impl PulsedOp for AffineChunkTrim {
+    fn pulsed_output_facts(&self, inputs: &[&PulsedFact]) -> TractResult<TVec<PulsedFact>> {
+        let mut fact = inputs[0].clone();
+        let cur = fact.shape[self.axis].to_usize()?;
+        let trim_amount = cur.saturating_sub(self.target_per_pulse);
+        if trim_amount > 0 {
+            let new_per_pulse = cur - trim_amount;
+            let mut shape: TVec<TDim> = fact.shape.iter().cloned().collect();
+            shape[self.axis] = new_per_pulse.to_dim();
+            fact.shape = shape.into();
+        }
+        if let Some(stream) = fact.stream.as_mut() {
+            stream.dim = stream.dim.clone() - self.typed_trim.to_dim();
+        }
+        Ok(tvec!(fact))
+    }
+
+    as_op!();
+    pulsed_op_to_typed_op!();
+}
diff --git a/pulse/src/ops/array/mod.rs b/pulse/src/ops/array/mod.rs
index e76aee6a1b..4ef732e3ea 100644
--- a/pulse/src/ops/array/mod.rs
+++ b/pulse/src/ops/array/mod.rs
@@ -1,9 +1,14 @@
 use crate::internal::*;
 
+mod affine_trim;
 mod broadcast;
 mod concat;
 mod mask;
 mod pad;
+mod range;
+mod reshape;
 mod slice;
 
-register_all_mod!(broadcast, concat, pad, slice);
+pub use affine_trim::AffineChunkTrim;
+
+register_all_mod!(affine_trim, broadcast, concat, pad, range, reshape, slice);
diff --git a/pulse/src/ops/array/pad.rs b/pulse/src/ops/array/pad.rs
index c7d0001ecf..05d1f0a156 100644
--- a/pulse/src/ops/array/pad.rs
+++ b/pulse/src/ops/array/pad.rs
@@ -20,9 +20,7 @@ fn pulsify(
     // Non-constant mode can't handle non-stream-axis padding
     let has_non_stream_axis_padding =
         op.pads.iter().enumerate().any(|(ax, &(a, b))| ax != stream.axis && (a != 0 || b != 0));
-    if has_non_stream_axis_padding && !matches!(op.mode, PadMode::Constant(_)) {
-        return Ok(None);
-    }
+    rule_if!(!has_non_stream_axis_padding || matches!(op.mode, PadMode::Constant(_)));
     let (before, after) = op.pads[stream.axis];
     let pulse = fact.pulse().unwrap();
     let mut extra_delay = before.saturating_sub(stream.delay);
diff --git a/pulse/src/ops/array/range.rs b/pulse/src/ops/array/range.rs
new file mode 100644
index 0000000000..f0baccd11f
--- /dev/null
+++ b/pulse/src/ops/array/range.rs
@@ -0,0 +1,80 @@
+//! Pulsifier for `tract_core::ops::array::Range`.
+//!
+//! Range is a 0-streaming-input generator: its (start, end, step) inputs
+//! are scalar constants; its single output is a 1-D wire whose length is
+//! the symbolic `(end - start)/step`.  When `end` contains the streaming
+//! symbol, `NonPulsingWrappingOp`'s konst-stripping fallback in
+//! `Range::output_facts` re-evaluates the shape via `self.len`, producing
+//! a fresh `Sym(range_NN)` symbol unrelated to the stream — a pulse-time
+//! mismatch we sidestep by emitting `PulsedRange` instead.
+
+use crate::fact::StreamInfo;
+use crate::internal::*;
+use tract_core::ops::array::Range;
+use tract_pulse_opl::ops::PulsedRange;
+
+register_all!(Range: pulsify);
+
+fn pulsify(
+    _op: &Range,
+    source: &TypedModel,
+    node: &TypedNode,
+    target: &mut PulsedModel,
+    _mapping: &HashMap<OutletId, OutletId>,
+    symbol: &Symbol,
+    pulse: &TDim,
+) -> TractResult<Option<TVec<OutletId>>> {
+    // Output shape must be a 1-D wire whose dim contains the stream symbol.
+    let out_fact = &node.outputs[0].fact;
+    rule_if!(out_fact.rank() == 1);
+    rule_if!(out_fact.shape[0].symbols().contains(symbol));
+    let stream_dim = out_fact.shape[0].clone();
+    let datum_type = out_fact.datum_type;
+
+    // Pull start/step from the source model's input facts as scalar consts.
+    // (`end` only matters for the symbolic length, which we already have.)
+    let input_facts = source.node_input_facts(node.id)?;
+    rule_if!(input_facts.len() == 3);
+    rule_if_some!(start = input_facts[0].konst.as_ref());
+    rule_if_some!(step = input_facts[2].konst.as_ref());
+    let start = start.clone().into_tensor();
+    let step = step.clone().into_tensor();
+
+    // Per-pulse element count on the stream axis = slope (coefficient of
+    // the streaming symbol) × pulse, NOT the full evaluated length at the
+    // first pulse. For `stream_dim = c·S + k` (e.g. an arange over the
+    // post-upsample length where the convtr's kernel-stride window
+    // adds a constant tail `k`), the data path downstream emits `c·pulse`
+    // new frames per pulse with the constant `k` absorbed into the
+    // streaming state. Range must match that to keep the stream-axis dim
+    // consistent at elementwise meet points (otherwise the constant `k`
+    // surfaces as `Broadcast(c·pulse, c·pulse + k)` and downstream
+    // pulse-divisibility checks bail).
+    //
+    // `guess_slope` returns `(num, den)` for the rational slope; we
+    // require integer slopes (den == 1) — Range over a fractional-slope
+    // stream dim doesn't have a single per-pulse count.
+    let (slope_num, slope_den) = stream_dim.guess_slope(symbol);
+    rule_if!(slope_num > 0 && slope_den == 1);
+    let pulse_int = pulse.to_usize()?;
+    let per_pulse: usize = (slope_num as usize).checked_mul(pulse_int).ok_or_else(|| {
+        format_err!("Range pulsification: per-pulse overflow ({}*{})", slope_num, pulse_int)
+    })?;
+    let pulsed =
+        PulsedRange { datum_type, start, step, stream_dim: stream_dim.clone(), pulse: per_pulse };
+    target.wire_node(&*node.name, pulsed, &[]).map(Some)
+}
+
+impl PulsedOp for PulsedRange {
+    fn pulsed_output_facts(&self, _inputs: &[&PulsedFact]) -> TractResult<TVec<PulsedFact>> {
+        let shape: TVec<TDim> = tvec!(self.pulse.to_dim());
+        Ok(tvec!(PulsedFact {
+            datum_type: self.datum_type,
+            shape: shape.into(),
+            stream: Some(StreamInfo { axis: 0, dim: self.stream_dim.clone(), delay: 0 }),
+        }))
+    }
+
+    as_op!();
+    pulsed_op_to_typed_op!();
+}
diff --git a/pulse/src/ops/array/reshape.rs b/pulse/src/ops/array/reshape.rs
new file mode 100644
index 0000000000..63a5084ff1
--- /dev/null
+++ b/pulse/src/ops/array/reshape.rs
@@ -0,0 +1,152 @@
+use crate::fact::StreamInfo;
+use crate::internal::*;
+use tract_core::ops::change_axes::AxisOp;
+use tract_pulse_opl::ops::Delay;
+
+register_all!(AxisOp: pulsify);
+
+fn pulsify(
+    op: &AxisOp,
+    _source: &TypedModel,
+    node: &TypedNode,
+    target: &mut PulsedModel,
+    mapping: &HashMap<OutletId, OutletId>,
+    symbol: &Symbol,
+    pulse: &TDim,
+) -> TractResult<Option<TVec<OutletId>>> {
+    rule_if_let!(AxisOp::Reshape(at, from, to) = op);
+    let mut input = mapping[&node.inputs[0]];
+    let fact = target.outlet_fact(input)?.clone();
+    rule_if_some!(stream = &fact.stream);
+    rule_if!(stream.axis >= *at && stream.axis < *at + from.len());
+    let from_pos = stream.axis - *at;
+    rule_if!(from[from_pos].symbols().contains(symbol));
+    rule_if!(from.iter().enumerate().all(|(i, d)| i == from_pos || !d.symbols().contains(symbol)));
+    let to_streaming: TVec<usize> = to
+        .iter()
+        .enumerate()
+        .filter(|(_, d)| d.symbols().contains(symbol))
+        .map(|(i, _)| i)
+        .collect();
+    rule_if!(to_streaming.len() == 1);
+    let to_pos = to_streaming[0];
+
+    let from_pulsed: TVec<TDim> =
+        from.iter().map(|d| d.substitute(symbol, pulse)).collect::<TractResult<_>>()?;
+    let to_pulsed: TVec<TDim> =
+        to.iter().map(|d| d.substitute(symbol, pulse)).collect::<TractResult<_>>()?;
+
+    // PulsedReshape requires `stream.delay * new_per_pulse % old_per_pulse == 0`
+    // so the lag rescales cleanly into the new units.  When the upstream
+    // ops left a non-aligned delay (e.g. a kernel-stride pool feeding a
+    // chunk Reshape: 1-token carryover into a P-token chunk), insert an
+    // alignment Delay that bumps `stream.delay` up to the next multiple
+    // of `old_per_pulse / gcd(old_per_pulse, new_per_pulse)`.  Costs
+    // exactly the slack needed to land on a chunk boundary.
+    let old_per_pulse = from_pulsed[from_pos].to_usize()?;
+    let new_per_pulse = to_pulsed[to_pos].to_usize()?;
+    if (stream.delay * new_per_pulse) % old_per_pulse != 0 {
+        let g = gcd(old_per_pulse, new_per_pulse);
+        let align = old_per_pulse / g;
+        let extra = align - (stream.delay % align);
+        input = target.wire_node(
+            format!("{}.delay-align", node.name),
+            Delay::new_typed(&(&fact).into(), stream.axis, extra, 0),
+            &[input],
+        )?[0];
+    }
+
+    let pulsed = PulsedReshape {
+        op: AxisOp::Reshape(*at, from_pulsed, to_pulsed),
+        new_stream_axis: *at + to_pos,
+        new_stream_dim: to[to_pos].clone(),
+    };
+    target.wire_node(&*node.name, pulsed, &[input]).map(Some)
+}
+
+fn gcd(mut a: usize, mut b: usize) -> usize {
+    while b != 0 {
+        let t = b;
+        b = a % b;
+        a = t;
+    }
+    a
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct PulsedReshape {
+    pub op: AxisOp,
+    pub new_stream_axis: usize,
+    pub new_stream_dim: TDim,
+}
+
+impl Op for PulsedReshape {
+    fn name(&self) -> StaticName {
+        "PulsedReshape".into()
+    }
+
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!(
+            "op:{:?} stream_axis:{} stream_dim:{}",
+            self.op, self.new_stream_axis, self.new_stream_dim
+        )])
+    }
+
+    not_a_typed_op!();
+}
+
+impl EvalOp for PulsedReshape {
+    fn is_stateless(&self) -> bool {
+        true
+    }
+
+    fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
+        self.op.eval(inputs)
+    }
+}
+
+impl PulsedOp for PulsedReshape {
+    fn pulsed_output_facts(&self, inputs: &[&PulsedFact]) -> TractResult<TVec<PulsedFact>> {
+        let input_typed: TypedFact = inputs[0].into();
+        let outs = self.op.output_facts(&[&input_typed])?;
+        let stream = inputs[0].stream.as_ref().unwrap();
+        let out_fact = outs.into_iter().next().context("Reshape produced no output fact")?;
+        // `stream.delay` counts elements on the streaming axis.  When the
+        // reshape changes the per-pulse size on that axis (e.g. merging
+        // `(S, k) → S·k` at a Blockify section boundary), the delay must
+        // rescale by `new_per_pulse / old_per_pulse` so the same physical
+        // lag is preserved in the new units.
+        let AxisOp::Reshape(at, from, to) = &self.op else {
+            unreachable!("PulsedReshape only built from AxisOp::Reshape (see pulsify above)");
+        };
+        let from_pos = stream.axis - at;
+        let to_pos = self.new_stream_axis - at;
+        let old_per_pulse = from[from_pos].to_usize()?;
+        let new_per_pulse = to[to_pos].to_usize()?;
+        let scaled = stream.delay * new_per_pulse;
+        ensure!(
+            scaled % old_per_pulse == 0,
+            "PulsedReshape: stream.delay {} can't be rescaled from per-pulse {} \
+             to per-pulse {} (would lose precision)",
+            stream.delay,
+            old_per_pulse,
+            new_per_pulse,
+        );
+        let new_delay = scaled / old_per_pulse;
+        Ok(tvec!(PulsedFact {
+            datum_type: out_fact.datum_type,
+            shape: out_fact.shape,
+            stream: Some(StreamInfo {
+                axis: self.new_stream_axis,
+                dim: self.new_stream_dim.clone(),
+                delay: new_delay,
+            }),
+        }))
+    }
+
+    fn to_typed(&self) -> Box<dyn TypedOp> {
+        Box::new(self.op.clone())
+    }
+
+    as_op!();
+}
diff --git a/pulse/src/ops/cnn/deconv.rs b/pulse/src/ops/cnn/deconv.rs
index c51cfb1df7..0ac00ba27b 100644
--- a/pulse/src/ops/cnn/deconv.rs
+++ b/pulse/src/ops/cnn/deconv.rs
@@ -23,16 +23,14 @@ fn pulsify(
     if c_axis == stream.axis {
         bail!("Pulsification on C axis is not supported");
     }
-    if op
-        .axes_mapping(&source.node_input_facts(node.id)?, &source.node_output_facts(node.id)?)?
-        .axis((InOut::In(0), stream.axis))?
-        .outputs[0]
-        .len()
-        == 1
-    {
-        // general case for invariants will manage
-        return Ok(None);
-    }
+    // general case for invariants will manage
+    rule_if!(
+        op.axes_mapping(&source.node_input_facts(node.id)?, &source.node_output_facts(node.id)?)?
+            .axis((InOut::In(0), stream.axis))?
+            .outputs[0]
+            .len()
+            != 1
+    );
     let geo_axis = stream.axis - op.pool_spec.data_format.h_axis();
     let stride = op.pool_spec.stride(geo_axis);
     let mut pulse_op = op.clone();
@@ -47,7 +45,19 @@ fn pulsify(
     };
     wire = target.wire_node(format!("{}.mask", node.name), mask, &wire)?;
     wire.push(mapping[&node.inputs[1]]);
-    wire.push(mapping[&node.inputs[2]]);
+    // Feed a zero bias to the per-pulse Deconv. With kernel > stride the
+    // per-pulse output has overlap slots that the bulk Deconv would never
+    // emit; DeconvDelay's overlap-add then double-counts the bias on
+    // those slots. Adding the original bias back after DeconvDelay
+    // guarantees it's added exactly once per output position.
+    let source_bias_fact = source.outlet_fact(node.inputs[2])?.clone();
+    let bias_tensor = source_bias_fact
+        .konst
+        .clone()
+        .context("Deconv bias must be a constant for pulsification")?;
+    let zero_bias = Tensor::zero_dt(bias_tensor.datum_type(), bias_tensor.shape())?;
+    let zero_bias = target.add_const(format!("{}.zero_bias", node.name), zero_bias)?;
+    wire.push(zero_bias);
     wire = target.wire_node(format!("{}.deconv", node.name), pulse_op, &wire)?;
     let overlap = overlap(stream.axis, op);
     let deconv_input_dim = (stream.dim.clone() - 1) * stride + 1;
@@ -94,6 +104,38 @@ fn pulsify(
         }
     }
 
+    // Add the original bias to the now-merged output. See the zero-bias
+    // comment above pulse_op wiring for why this can't be done inside
+    // per-pulse Deconv.
+    let out_shape = target.outlet_fact(wire[0])?.shape.clone();
+    let out_rank = out_shape.rank();
+    let c_axis = op.pool_spec.data_format.shape(out_shape.to_tvec())?.c_axis();
+    let mut reshaped_bias_shape: TVec<usize> = tvec![1; out_rank];
+    reshaped_bias_shape[c_axis] = op.pool_spec.output_channels;
+    // Broadcast the bias to ``(1, ..., C, ..., 1)`` so the post-Deconv
+    // Add is a clean elementwise op. Bulk Deconv accepts both scalar and
+    // rank-1 ``(C,)`` biases (handled by ``wire_reshape_bias_for_bin``);
+    // we mirror that here at build time.
+    let bias_const = if bias_tensor.rank() == 0 {
+        bias_tensor.broadcast_scalar_to_shape(&reshaped_bias_shape)?.into_arc_tensor()
+    } else if bias_tensor.shape() == [op.pool_spec.output_channels] {
+        bias_tensor.clone().into_tensor().into_shape(&reshaped_bias_shape)?.into_arc_tensor()
+    } else if bias_tensor.shape() == &*reshaped_bias_shape {
+        bias_tensor.clone()
+    } else {
+        bail!(
+            "Unexpected Deconv bias shape {:?} for {} output channels",
+            bias_tensor.shape(),
+            op.pool_spec.output_channels
+        );
+    };
+    let bias = target.add_const(format!("{}.bias", node.name), bias_const)?;
+    wire = target.wire_node(
+        format!("{}.add_bias", node.name),
+        crate::model::PulseWrappingOp(Box::new(tract_core::ops::math::add())),
+        &[wire[0], bias],
+    )?;
+
     Ok(Some(wire))
 }
 
diff --git a/pulse/src/ops/cnn/pools.rs b/pulse/src/ops/cnn/pools.rs
index c760f005ce..f0edf79994 100644
--- a/pulse/src/ops/cnn/pools.rs
+++ b/pulse/src/ops/cnn/pools.rs
@@ -112,9 +112,7 @@ pub fn pulsify_pooled_input(
     let input_fact: PulsedFact = target.outlet_fact(wire)?.clone();
     let input_stream = input_fact.stream.as_ref().unwrap();
     let input_shape = spec.data_format.shape(input_fact.shape.clone())?;
-    if Some(input_stream.axis) == input_shape.n_axis() {
-        return Ok(None);
-    }
+    rule_if!(Some(input_stream.axis) != input_shape.n_axis());
     if input_stream.axis == input_shape.c_axis() {
         bail!("Can not pulsify cnn pooling ops along the input channel axis");
     }
diff --git a/pulse/src/ops/diag_gather.rs b/pulse/src/ops/diag_gather.rs
new file mode 100644
index 0000000000..2757a5fe10
--- /dev/null
+++ b/pulse/src/ops/diag_gather.rs
@@ -0,0 +1,143 @@
+//! Pulsifier for [`tract_transformers::ops::DiagGather`].
+//!
+//! The op itself and the `Pad → Reshape → Slice → Reshape → Slice` skew-trick
+//! detect pass live in the `tract-transformers` crate (alongside ApplyRope,
+//! ScaledMaskedSoftmax, etc.).  Only the pulse-specific pulsifier — plus its
+//! local chunk-window mask classifier — lives here.
+
+use crate::internal::*;
+use crate::model::PulseWrappingOp;
+use tract_core::ops::logic::sym_to_coord_axis;
+use tract_transformers::ops::DiagGather;
+
+// Re-export the detect pass at this path for in-pulse callers.  New code
+// should import directly from `tract_transformers::ops`.
+pub use tract_transformers::ops::detect_diag_gather;
+
+register_all!(DiagGather: pulsify_diag_gather);
+
+/// If `expr` is a 2-D chunk-window `uniform_tdim` mask, return the live
+/// window width `W = (L+1)·P` — the per-row output length DiagGather should
+/// produce in pulsed form.
+///
+/// The mask shape is
+/// `Ge(Val(L), diff) * Ge(diff, Val(0))` with
+/// `diff = floor((🎯row + r_off) / P) - floor((🎯col + c_off) / P) + constant`.
+///
+/// Semantically: a query at chunk-index `c = floor(i/P)` attends to keys in
+/// chunks `c-L..=c`, i.e. `(L+1)·P` consecutive columns.  That width is the
+/// only signal the DiagGather pulsifier needs from the ROI — it does not
+/// care how the window decomposes into `L` and `P` individually.
+fn chunk_window_width(expr: &TDim) -> Option<u64> {
+    let TDim::Mul(factors) = expr else { return None };
+    let n = factors.len();
+    if n < 2 {
+        return None;
+    }
+    for f0 in 0..n {
+        for f1 in 0..n {
+            if f0 == f1 {
+                continue;
+            }
+            let TDim::Ge(lhs0, rhs0) = &factors[f0] else { continue };
+            let TDim::Ge(lhs1, rhs1) = &factors[f1] else { continue };
+            let TDim::Val(l) = lhs0.as_ref() else { continue };
+            let TDim::Val(0) = rhs1.as_ref() else { continue };
+            let Some((row, col, p)) = extract_div_diff_axes(rhs0) else { continue };
+            let Some((row2, col2, p2)) = extract_div_diff_axes(lhs1) else { continue };
+            if row != row2 || col != col2 || p != p2 {
+                continue;
+            }
+            if *l < 0 {
+                continue;
+            }
+            return Some((*l as u64 + 1) * p);
+        }
+    }
+    None
+}
+
+/// Try to decompose `expr` as a chunk-index difference:
+///   `floor((🎯row + r_off) / P) - floor((🎯col + c_off) / P) + constant`
+fn extract_div_diff_axes(expr: &TDim) -> Option<(usize, usize, u64)> {
+    let TDim::Add(terms) = expr else { return None };
+    let mut pos: Option<(usize, u64)> = None;
+    let mut neg: Option<(usize, u64)> = None;
+    for term in terms {
+        match term {
+            TDim::Div(inner, p) => {
+                if let Some(axis) = extract_coord_sym_from_div_arg(inner) {
+                    pos = Some((axis, *p));
+                }
+            }
+            TDim::MulInt(-1, inner) => {
+                if let TDim::Div(inner2, p) = inner.as_ref() {
+                    if let Some(axis) = extract_coord_sym_from_div_arg(inner2) {
+                        neg = Some((axis, *p));
+                    }
+                }
+            }
+            TDim::Val(_) => {}
+            _ => return None,
+        }
+    }
+    let (row_axis, p_row) = pos?;
+    let (col_axis, p_col) = neg?;
+    if p_row != p_col {
+        return None;
+    }
+    Some((row_axis, col_axis, p_row))
+}
+
+fn extract_coord_sym_from_div_arg(inner: &TDim) -> Option<usize> {
+    match inner {
+        TDim::Sym(sym) => sym_to_coord_axis(sym),
+        TDim::Add(terms) => {
+            let mut axis = None;
+            for t in terms {
+                match t {
+                    TDim::Sym(sym) => {
+                        if axis.is_some() {
+                            return None;
+                        }
+                        axis = sym_to_coord_axis(sym);
+                    }
+                    TDim::Val(_) => {}
+                    _ => return None,
+                }
+            }
+            axis
+        }
+        _ => None,
+    }
+}
+
+fn pulsify_diag_gather(
+    _op: &DiagGather,
+    source: &TypedModel,
+    node: &TypedNode,
+    target: &mut PulsedModel,
+    mapping: &HashMap<OutletId, OutletId>,
+    _symbol: &Symbol,
+    _pulse: &TDim,
+) -> TractResult<Option<TVec<OutletId>>> {
+    // Pulse-time `out_len` is the live-window width W from the output's
+    // chunk-window ROI.  If the ROI is missing or doesn't match the
+    // chunk-window pattern, defer to the regular fallback.
+    let roi_raw = source.outlet_fact(OutletId::new(node.id, 0))?.region_of_interest.clone();
+    rule_if_some!(w = roi_raw.as_ref().and_then(|r| chunk_window_width(&r.clone().simplify())));
+
+    let input_wire = mapping[&node.inputs[0]];
+    let input_fact = target.outlet_fact(input_wire)?.clone();
+    let stream = input_fact.stream.as_ref().context("DiagGather input must be streaming")?;
+
+    // P_local: the pulse size at this level (after any subsampling).  In the
+    // windowed input the relative-position axis has `W + P_local − 1`
+    // entries; distance 0 sits at position `P_local − 1`.
+    let p_local = input_fact.shape[stream.axis].to_i64()?;
+
+    let pulsed_op = DiagGather { offset: (p_local - 1).to_dim(), out_len: (w as i64).to_dim() };
+
+    let out = target.wire_node(&node.name, PulseWrappingOp(Box::new(pulsed_op)), &[input_wire])?;
+    Ok(Some(out))
+}
diff --git a/pulse/src/ops/downsample.rs b/pulse/src/ops/downsample.rs
index b25eaaf424..447bb23316 100644
--- a/pulse/src/ops/downsample.rs
+++ b/pulse/src/ops/downsample.rs
@@ -16,37 +16,32 @@ fn pulsify(
 ) -> TractResult<Option<TVec<OutletId>>> {
     let input = mapping[&node.inputs[0]];
     let fact = target.outlet_fact(input)?.clone();
-    if let Some(stream) = fact.stream.as_ref() {
-        if stream.axis != op.axis {
-            return Ok(None);
-        }
-        let stride = if op.stride > 0 {
-            op.stride as usize
-        } else {
-            bail!("Negative strides are not causal, can not pulsify.")
-        };
-        let pulse = fact.pulse().unwrap();
-        if !(pulse.clone() % stride).is_zero() {
-            bail!("Pulsification requires pulse ({}) to be a stride ({}) multiple", pulse, stride)
-        }
-        let mut wire = tvec!(input);
-        let first_offset = stream.delay + op.modulo;
-        let new_op = Downsample { modulo: first_offset % stride, axis: op.axis, stride: op.stride };
-        wire = target.wire_node(format!("{}.downsample", node.name), new_op, &wire)?;
-        wire = target.wire_node(
-            &node.name,
-            PulsedAxisSlice {
-                axis: stream.axis,
-                skip: first_offset / stride,
-                take: (stream.dim.to_owned() - op.modulo).divceil(stride),
-            },
-            &wire,
-        )?;
-        target.rename_node(wire[0].node, &node.name)?;
-        Ok(Some(wire))
+    rule_if_some!(stream = fact.stream.as_ref());
+    rule_if!(stream.axis == op.axis);
+    let stride = if op.stride > 0 {
+        op.stride as usize
     } else {
-        Ok(None)
+        bail!("Negative strides are not causal, can not pulsify.")
+    };
+    let pulse = fact.pulse().unwrap();
+    if !(pulse.clone() % stride).is_zero() {
+        bail!("Pulsification requires pulse ({}) to be a stride ({}) multiple", pulse, stride)
     }
+    let mut wire = tvec!(input);
+    let first_offset = stream.delay + op.modulo;
+    let new_op = Downsample { modulo: first_offset % stride, axis: op.axis, stride: op.stride };
+    wire = target.wire_node(format!("{}.downsample", node.name), new_op, &wire)?;
+    wire = target.wire_node(
+        &node.name,
+        PulsedAxisSlice {
+            axis: stream.axis,
+            skip: first_offset / stride,
+            take: (stream.dim.to_owned() - op.modulo).divceil(stride),
+        },
+        &wire,
+    )?;
+    target.rename_node(wire[0].node, &node.name)?;
+    Ok(Some(wire))
 }
 
 impl PulsedOp for Downsample {
diff --git a/pulse/src/ops/fft.rs b/pulse/src/ops/fft.rs
index 4075779949..c7d809f5d7 100644
--- a/pulse/src/ops/fft.rs
+++ b/pulse/src/ops/fft.rs
@@ -22,9 +22,7 @@ fn pulsify(
         None => return Ok(None),
     };
 
-    if stream.axis != op.axis {
-        return Ok(None);
-    }
+    rule_if!(stream.axis == op.axis);
 
     let overlap = op.frame - op.stride;
 
diff --git a/pulse/src/ops/mod.rs b/pulse/src/ops/mod.rs
index daf2d29144..87ab5e3495 100644
--- a/pulse/src/ops/mod.rs
+++ b/pulse/src/ops/mod.rs
@@ -9,6 +9,7 @@ use tract_pulse_opl::ops::Delay;
 pub mod array;
 pub mod cnn;
 pub mod delay;
+pub mod diag_gather;
 pub mod downsample;
 pub mod dummy;
 pub mod fft;
@@ -16,6 +17,7 @@ pub mod mask;
 pub mod scan;
 pub mod slice;
 pub mod source;
+pub mod window;
 
 pub(crate) fn sync_inputs(
     node: &TypedNode,
@@ -49,7 +51,7 @@ pub(crate) fn sync_inputs(
     Ok(inputs)
 }
 
-register_all_mod!(array, cnn, downsample, fft, scan, source);
+register_all_mod!(array, cnn, diag_gather, downsample, fft, scan, source, window);
 
 type PulsifierFn = fn(
     &TypedModel,
diff --git a/pulse/src/ops/window.rs b/pulse/src/ops/window.rs
new file mode 100644
index 0000000000..7788f01aa6
--- /dev/null
+++ b/pulse/src/ops/window.rs
@@ -0,0 +1,333 @@
+//! Pulsifier for `WindowOnAxis`.
+//!
+//! `WindowOnAxis(axis, W, start)` lowers to a three-op chain:
+//!
+//! ```text
+//! Delay(0, W-1) → PulsePad(before = -start, after = 0, value = 0) → PulsedExposeWindow
+//! ```
+//!
+//! * `Delay(0, W-1)` accumulates the latest W chunks per pulse (current
+//!   plus `W-1` past), bumping `stream.delay` by `W-1`.
+//! * `PulsePad(before = -start)` zero-fills the leading `-start` chunks of
+//!   the post-delay buffer (so the very first pulses see zeros for the
+//!   "out-of-stream" past, matching the batch-eval boundary semantics) and
+//!   subtracts `-start` from `stream.delay`.  For `start = 0` (future
+//!   window, ex03) this is a no-op (`before = 0`).  For `start = -(W-1)`
+//!   (past window, ex04) it cancels the Delay's increment, leaving
+//!   `stream.delay = 0` (causal).  For mixed bands the residue is
+//!   `stream.delay = end = start + W - 1`.
+//! * `PulsedExposeWindow` reshapes the per-pulse `[W, ...]` view into
+//!   `[1, W, ...]`, exposing W as a static window axis.
+//!
+//! Constraint: `pulse == 1` on the windowed axis (the case Blockify
+//! produces today).  Constraints: `start ≤ 0` and `start + W - 1 ≥ 0`
+//! (the window must straddle the current chunk).
+
+use crate::internal::*;
+use tract_core::ops::array::PadMode;
+use tract_pulse_opl::ops::{Delay, PulsePad, WindowOnAxis};
+
+register_all!(WindowOnAxis: pulsify);
+
+fn pulsify(
+    op: &WindowOnAxis,
+    _source: &TypedModel,
+    node: &TypedNode,
+    target: &mut PulsedModel,
+    mapping: &HashMap<OutletId, OutletId>,
+    _symbol: &Symbol,
+    _pulse: &TDim,
+) -> TractResult<Option<TVec<OutletId>>> {
+    let input = mapping[&node.inputs[0]];
+    let fact = target.outlet_fact(input)?.clone();
+    rule_if_some!(stream = fact.stream.as_ref());
+    rule_if!(stream.axis == op.axis);
+    let pulse = fact.pulse().unwrap();
+    let pulse_size = pulse.to_usize()?;
+    ensure!(
+        pulse_size == 1,
+        "WindowOnAxis pulsifier currently requires pulse=1 on the windowed axis (got {pulse_size})"
+    );
+
+    // Bail on `start > 0` (purely-future skip-current) and on
+    // `start + window - 1 < 0` (purely-past) — neither is needed yet.
+    ensure!(op.start <= 0, "WindowOnAxis pulsifier: start > 0 not supported (got {})", op.start);
+    ensure!(
+        op.start + op.window as i64 - 1 >= 0,
+        "WindowOnAxis pulsifier: window must straddle current (start={}, window={})",
+        op.start,
+        op.window
+    );
+
+    let overlap = op.window - 1;
+    let before: usize = (-op.start) as usize;
+
+    let delayed = target.wire_node(
+        format!("{}.delay", node.name),
+        Delay::new_typed(&(&fact).into(), op.axis, 0, overlap),
+        &[input],
+    )?[0];
+
+    // For `start < 0` (past-window): pad-fill the leading `before` chunks
+    // of the post-delay buffer and shift `stream.delay` back by `before`,
+    // so the consumer's logical time anchors to the *latest* chunk in the
+    // window rather than the oldest.  For `start = 0` we still wire the
+    // pad with `before = 0`: it's a runtime no-op (no fill, no delay
+    // change) but keeps the pulsifier's structure uniform.  The fill
+    // value comes from `op.pad_value` — zero for data wires, sentinel
+    // for chunk-index wires whose downstream band predicate keys off it.
+    let post_delay_fact = target.outlet_fact(delayed)?.clone();
+    let post_delay_stream = post_delay_fact.stream.as_ref().unwrap();
+    let begin_input = post_delay_stream.delay;
+    let end_input = post_delay_stream.delay.to_dim() + &post_delay_stream.dim;
+    let padded = target.wire_node(
+        format!("{}.pulse_pad", node.name),
+        PulsePad {
+            axis: op.axis,
+            before,
+            after: 0.to_dim(),
+            begin_input,
+            end_input,
+            mode: PadMode::Constant(op.pad_value.clone()),
+            overlap,
+        },
+        &[delayed],
+    )?[0];
+
+    let exposed = target.wire_node(
+        &*node.name,
+        PulsedExposeWindow { axis: op.axis, window: op.window },
+        &[padded],
+    )?;
+    Ok(Some(exposed))
+}
+
+/// Pulse-only op: splits the per-pulse streaming-axis buffer of size
+/// `1 + W - 1 = W` into `[1, W]`, exposing `W` as a static window axis.
+/// Logical streaming dim is preserved (1 chunk per pulse on the new
+/// streaming axis at the same position as before).  Stream-delay
+/// adjustment is handled upstream by the `PulsePad` the WindowOnAxis
+/// pulsifier wires before this op.
+#[derive(Debug, Clone, Default, Hash, PartialEq, Eq)]
+pub struct PulsedExposeWindow {
+    pub axis: usize,
+    pub window: usize,
+}
+
+impl Op for PulsedExposeWindow {
+    fn name(&self) -> StaticName {
+        "PulsedExposeWindow".into()
+    }
+
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!("axis: {} window: {}", self.axis, self.window)])
+    }
+
+    not_a_typed_op!();
+}
+
+impl EvalOp for PulsedExposeWindow {
+    fn is_stateless(&self) -> bool {
+        true
+    }
+
+    fn eval(&self, inputs: TVec<TValue>) -> TractResult<TVec<TValue>> {
+        let input = args_1!(inputs).into_tensor();
+        let mut new_shape: TVec<usize> = input.shape().into();
+        // input shape on `axis` is W (= pulse + overlap, with pulse=1).
+        // Output shape on `axis` is 1, with a new W axis at `axis + 1`.
+        ensure!(
+            new_shape[self.axis] == self.window,
+            "PulsedExposeWindow: per-pulse axis {} has size {} but expected window {}",
+            self.axis,
+            new_shape[self.axis],
+            self.window
+        );
+        new_shape[self.axis] = 1;
+        new_shape.insert(self.axis + 1, self.window);
+        let mut t = input;
+        unsafe { t.set_shape_unchecked(&new_shape) };
+        Ok(tvec!(t.into_tvalue()))
+    }
+}
+
+impl PulsedOp for PulsedExposeWindow {
+    fn pulsed_output_facts(&self, inputs: &[&PulsedFact]) -> TractResult<TVec<PulsedFact>> {
+        let mut fact = inputs[0].clone();
+        let stream = fact.stream.as_mut().context("PulsedExposeWindow needs a streaming input")?;
+        ensure!(
+            stream.axis == self.axis,
+            "PulsedExposeWindow: input stream axis {} doesn't match op axis {}",
+            stream.axis,
+            self.axis
+        );
+        let mut shape: TVec<TDim> = fact.shape.iter().cloned().collect();
+        shape[self.axis] = 1.to_dim();
+        shape.insert(self.axis + 1, self.window.to_dim());
+        fact.shape = shape.into();
+        Ok(tvec!(fact))
+    }
+
+    fn to_typed(&self) -> Box<dyn TypedOp> {
+        // Typed equivalent: factor the pulse-axis (size W) into [1, W].
+        // After this reshape the typed output matches eval's output shape:
+        // [..., 1, W, ...] with the streaming axis at `self.axis` and the
+        // static window at `self.axis + 1`.
+        use tract_pulse_opl::tract_core::ops::change_axes::AxisOp;
+        Box::new(AxisOp::Reshape(
+            self.axis,
+            tvec!(self.window.to_dim()),
+            tvec!(1.to_dim(), self.window.to_dim()),
+        ))
+    }
+
+    as_op!();
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Typed batch eval of `WindowOnAxis(axis=0, window=2)` on `[S=4, D=2]`
+    /// should produce `[S=4, W=2, D=2]` with future-shifted slots and zero
+    /// fill at the trailing edge.
+    #[test]
+    fn window_on_axis_typed_eval() {
+        let op = WindowOnAxis::future(0, 2, f32::datum_type()).unwrap();
+        let input =
+            tract_core::ndarray::Array2::<f32>::from_shape_fn((4, 2), |(i, j)| (i * 10 + j) as f32);
+        let result = op.eval(tvec!(input.clone().into_dyn().into_tvalue())).unwrap();
+        let out = result[0].to_plain_array_view::<f32>().unwrap().to_owned();
+        assert_eq!(out.shape(), &[4, 2, 2]);
+        // Slot w=0: copy of input.
+        for s in 0..4 {
+            for d in 0..2 {
+                assert_eq!(out[[s, 0, d]], input[[s, d]], "w=0, s={s}, d={d}");
+            }
+        }
+        // Slot w=1: shifted by 1, last row zero.
+        for s in 0..3 {
+            for d in 0..2 {
+                assert_eq!(out[[s, 1, d]], input[[s + 1, d]], "w=1, s={s}, d={d}");
+            }
+        }
+        for d in 0..2 {
+            assert_eq!(out[[3, 1, d]], 0.0, "w=1 trailing pad must be zero");
+        }
+    }
+
+    /// Pulsified WindowOnAxis on a streaming `[S, D]` input with pulse=1
+    /// should produce `[1, W, D]` per pulse, lagging by `W-1` (the future-
+    /// lookahead delay).  We feed S=4 chunks of size 1, plus W-1 trailing
+    /// flush pulses, and verify the windowed output matches the expected
+    /// future-shift values.
+    #[test]
+    fn window_on_axis_pulsified() {
+        use tract_core::ndarray::Array2;
+        let mut model = TypedModel::default();
+        let s = model.symbols.sym("S");
+        let a = model.add_source("a", f32::fact(dims![s.clone(), 2_usize].as_ref())).unwrap();
+        let win = model
+            .wire_node("win", WindowOnAxis::future(0, 2, f32::datum_type()).unwrap(), &[a])
+            .unwrap();
+        model.select_output_outlets(&win).unwrap();
+
+        let pulsed = PulsedModel::new(&model, s.clone(), &1.to_dim()).unwrap();
+        let plan = SimplePlan::new(pulsed.into_typed().unwrap()).unwrap();
+        let mut state = SimpleState::new(&plan).unwrap();
+
+        let input = Array2::<f32>::from_shape_fn((4, 2), |(i, j)| (i * 10 + j) as f32);
+
+        // Window=2 → output lags by 1.  Feed 4 + 1 trailing pulses.
+        let mut got: Vec<Vec<f32>> = vec![];
+        for s in 0..5 {
+            let chunk: Array2<f32> = if s < 4 {
+                input.slice_axis(tract_core::ndarray::Axis(0), (s..s + 1).into()).to_owned()
+            } else {
+                Array2::<f32>::zeros((1, 2)) // flush pulse
+            };
+            let out = state.run(tvec!(chunk.into_dyn().into_tvalue())).unwrap();
+            let arr = out[0].to_plain_array_view::<f32>().unwrap().to_owned();
+            assert_eq!(arr.shape(), &[1, 2, 2], "step {s}");
+            got.push(arr.iter().copied().collect());
+        }
+
+        // After delay=W-1=1, pulses 0..S produce logical-time-(s-1) outputs.
+        // Pulse 0: garbage (logical -1, before stream start).  Skip.
+        // Pulse 1: logical 0 → window {input[0], input[1]}.
+        let p1 = &got[1];
+        assert_eq!(p1[..2], [input[[0, 0]], input[[0, 1]]]);
+        assert_eq!(p1[2..], [input[[1, 0]], input[[1, 1]]]);
+        // Pulse 2: logical 1 → window {input[1], input[2]}.
+        let p2 = &got[2];
+        assert_eq!(p2[..2], [input[[1, 0]], input[[1, 1]]]);
+        assert_eq!(p2[2..], [input[[2, 0]], input[[2, 1]]]);
+        // Pulse 3: logical 2 → window {input[2], input[3]}.
+        let p3 = &got[3];
+        assert_eq!(p3[..2], [input[[2, 0]], input[[2, 1]]]);
+        assert_eq!(p3[2..], [input[[3, 0]], input[[3, 1]]]);
+        // Pulse 4: logical 3 → window {input[3], 0}.
+        let p4 = &got[4];
+        assert_eq!(p4[..2], [input[[3, 0]], input[[3, 1]]]);
+        assert_eq!(p4[2..], [0.0, 0.0]);
+    }
+
+    /// Past-window (`start = -1`, `W = 2`): output stream.delay must be 0
+    /// (causal), and per-pulse slot 0 must be the *previous* chunk while
+    /// slot 1 is current.  No trailing flush needed.
+    #[test]
+    fn window_on_axis_past_window_pulsified() {
+        use tract_core::ndarray::Array2;
+        let mut model = TypedModel::default();
+        let s = model.symbols.sym("S");
+        let a = model.add_source("a", f32::fact(dims![s.clone(), 2_usize].as_ref())).unwrap();
+        let win = model
+            .wire_node(
+                "win",
+                WindowOnAxis {
+                    axis: 0,
+                    window: 2,
+                    start: -1,
+                    pad_value: tensor0(0f32).into_arc_tensor(),
+                },
+                &[a],
+            )
+            .unwrap();
+        model.select_output_outlets(&win).unwrap();
+
+        let pulsed = PulsedModel::new(&model, s.clone(), &1.to_dim()).unwrap();
+        // Output's stream.delay must be 0 (causal past window).
+        assert_eq!(pulsed.outputs[0], pulsed.outputs[0]); // (placeholder for compile)
+        let plan = SimplePlan::new(pulsed.into_typed().unwrap()).unwrap();
+        let mut state = SimpleState::new(&plan).unwrap();
+
+        let input = Array2::<f32>::from_shape_fn((4, 2), |(i, j)| (i * 10 + j) as f32);
+        let mut got: Vec<Vec<f32>> = vec![];
+        for s in 0..4 {
+            let chunk =
+                input.slice_axis(tract_core::ndarray::Axis(0), (s..s + 1).into()).to_owned();
+            let out = state.run(tvec!(chunk.into_dyn().into_tvalue())).unwrap();
+            let arr = out[0].to_plain_array_view::<f32>().unwrap().to_owned();
+            assert_eq!(arr.shape(), &[1, 2, 2], "step {s}");
+            got.push(arr.iter().copied().collect());
+        }
+
+        // Pulse 0: current chunk = input[0], past chunk = zero (no prior).
+        // slot 0 = past = 0, slot 1 = current = input[0].
+        let p0 = &got[0];
+        assert_eq!(p0[..2], [0.0, 0.0]);
+        assert_eq!(p0[2..], [input[[0, 0]], input[[0, 1]]]);
+        // Pulse 1: slot 0 = input[0], slot 1 = input[1].
+        let p1 = &got[1];
+        assert_eq!(p1[..2], [input[[0, 0]], input[[0, 1]]]);
+        assert_eq!(p1[2..], [input[[1, 0]], input[[1, 1]]]);
+        // Pulse 2: slot 0 = input[1], slot 1 = input[2].
+        let p2 = &got[2];
+        assert_eq!(p2[..2], [input[[1, 0]], input[[1, 1]]]);
+        assert_eq!(p2[2..], [input[[2, 0]], input[[2, 1]]]);
+        // Pulse 3: slot 0 = input[2], slot 1 = input[3].
+        let p3 = &got[3];
+        assert_eq!(p3[..2], [input[[2, 0]], input[[2, 1]]]);
+        assert_eq!(p3[2..], [input[[3, 0]], input[[3, 1]]]);
+    }
+}
diff --git a/test-rt/suite-onnx/node.txt b/test-rt/suite-onnx/node.txt
index 0c64d0957f..0f76bf449e 100644
--- a/test-rt/suite-onnx/node.txt
+++ b/test-rt/suite-onnx/node.txt
@@ -231,6 +231,9 @@ test_less.*
 test_log
 test_log_example
 test_logsoftmax.*
+test_l1normalization.*
+test_l2normalization.*
+test_lpnormalization.*
 test_lrn
 test_lrn_default
 test_lstm_batchwise
@@ -260,6 +263,7 @@ test_mean_one_input
 test_mean_two_inputs
 test_melweightmatrix input:
 test_min.*
+test_mish
 test_mish_expanded
 test_mod_broadcast not-nnef
 test_mod_int64_fmod not-nnef
@@ -275,6 +279,7 @@ test_mod_uint32 not-nnef
 test_mod_uint64 not-nnef
 test_mod_uint8 not-nnef
 test_mul.*
+test_mvn
 test_mvn_expanded
 test_neg
 test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded
@@ -664,3 +669,173 @@ test_xor_bcast3v2d
 test_xor_bcast4v2d
 test_xor_bcast4v3d
 test_xor_bcast4v4d
+test_ai_onnx_ml_array_feature_extractor since:24
+test_attention_3d since:24
+test_attention_3d_attn_mask since:24
+test_attention_3d_attn_mask_expanded since:24
+test_attention_3d_causal since:24
+test_attention_3d_causal_expanded since:24
+test_attention_3d_diff_heads_sizes since:24
+test_attention_3d_diff_heads_sizes_attn_mask since:24
+test_attention_3d_diff_heads_sizes_attn_mask_expanded since:24
+test_attention_3d_diff_heads_sizes_causal since:24
+test_attention_3d_diff_heads_sizes_causal_expanded since:24
+test_attention_3d_diff_heads_sizes_expanded since:24
+test_attention_3d_diff_heads_sizes_scaled since:24
+test_attention_3d_diff_heads_sizes_scaled_expanded since:24
+test_attention_3d_diff_heads_sizes_softcap_expanded since:24
+test_attention_3d_diff_heads_with_past_and_present since:24
+test_attention_3d_diff_heads_with_past_and_present_expanded since:24
+test_attention_3d_expanded since:24
+test_attention_3d_gqa since:24
+test_attention_3d_gqa_attn_mask since:24
+test_attention_3d_gqa_attn_mask_expanded since:24
+test_attention_3d_gqa_causal since:24
+test_attention_3d_gqa_causal_expanded since:24
+test_attention_3d_gqa_expanded since:24
+test_attention_3d_gqa_scaled since:24
+test_attention_3d_gqa_scaled_expanded since:24
+test_attention_3d_gqa_softcap_expanded since:24
+test_attention_3d_gqa_with_past_and_present since:24
+test_attention_3d_gqa_with_past_and_present_expanded since:24
+test_attention_3d_scaled since:24
+test_attention_3d_scaled_expanded since:24
+test_attention_3d_softcap_expanded since:24
+test_attention_3d_with_past_and_present since:24
+test_attention_3d_with_past_and_present_expanded since:24
+test_attention_3d_with_past_and_present_qk_matmul_bias_expanded since:24
+test_attention_3d_with_past_and_present_qk_matmul_expanded since:24
+test_attention_3d_with_past_and_present_qk_matmul_softcap_expanded since:24
+test_attention_3d_with_past_and_present_qk_matmul_softmax_expanded since:24
+test_attention_4d since:24
+test_attention_4d_attn_mask since:24
+test_attention_4d_attn_mask_3d since:24
+test_attention_4d_attn_mask_3d_causal since:24
+test_attention_4d_attn_mask_3d_causal_expanded since:24
+test_attention_4d_attn_mask_3d_expanded since:24
+test_attention_4d_attn_mask_4d since:24
+test_attention_4d_attn_mask_4d_causal since:24
+test_attention_4d_attn_mask_4d_causal_expanded since:24
+test_attention_4d_attn_mask_4d_expanded since:24
+test_attention_4d_attn_mask_bool_4d_expanded since:24
+test_attention_4d_attn_mask_bool_expanded since:24
+test_attention_4d_attn_mask_expanded since:24
+test_attention_4d_causal since:24
+test_attention_4d_causal_expanded since:24
+test_attention_4d_diff_heads_sizes since:24
+test_attention_4d_diff_heads_sizes_attn_mask since:24
+test_attention_4d_diff_heads_sizes_attn_mask_expanded since:24
+test_attention_4d_diff_heads_sizes_causal since:24
+test_attention_4d_diff_heads_sizes_causal_expanded since:24
+test_attention_4d_diff_heads_sizes_expanded since:24
+test_attention_4d_diff_heads_sizes_scaled since:24
+test_attention_4d_diff_heads_sizes_scaled_expanded since:24
+test_attention_4d_diff_heads_sizes_softcap_expanded since:24
+test_attention_4d_diff_heads_with_past_and_present since:24
+test_attention_4d_diff_heads_with_past_and_present_expanded since:24
+test_attention_4d_diff_heads_with_past_and_present_mask3d since:24
+test_attention_4d_diff_heads_with_past_and_present_mask3d_expanded since:24
+test_attention_4d_diff_heads_with_past_and_present_mask4d since:24
+test_attention_4d_diff_heads_with_past_and_present_mask4d_expanded since:24
+test_attention_4d_expanded since:24
+test_attention_4d_fp16 since:24
+test_attention_4d_fp16_expanded since:24
+test_attention_4d_gqa since:24
+test_attention_4d_gqa_attn_mask since:24
+test_attention_4d_gqa_attn_mask_expanded since:24
+test_attention_4d_gqa_causal since:24
+test_attention_4d_gqa_causal_expanded since:24
+test_attention_4d_gqa_expanded since:24
+test_attention_4d_gqa_scaled since:24
+test_attention_4d_gqa_scaled_expanded since:24
+test_attention_4d_gqa_softcap_expanded since:24
+test_attention_4d_gqa_with_past_and_present since:24
+test_attention_4d_gqa_with_past_and_present_expanded since:24
+test_attention_4d_gqa_with_past_and_present_fp16 since:24
+test_attention_4d_gqa_with_past_and_present_fp16_expanded since:24
+test_attention_4d_scaled since:24
+test_attention_4d_scaled_expanded since:24
+test_attention_4d_softcap_expanded since:24
+test_attention_4d_with_past_and_present since:24
+test_attention_4d_with_past_and_present_expanded since:24
+test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_causal_expanded since:24
+test_attention_4d_with_past_and_present_qk_matmul_bias_3d_mask_expanded since:24
+test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_causal_expanded since:24
+test_attention_4d_with_past_and_present_qk_matmul_bias_4d_mask_expanded since:24
+test_attention_4d_with_past_and_present_qk_matmul_bias_expanded since:24
+test_attention_4d_with_past_and_present_qk_matmul_expanded since:24
+test_attention_4d_with_qk_matmul_bias_expanded since:24
+test_attention_4d_with_qk_matmul_expanded since:24
+test_attention_4d_with_qk_matmul_softcap_expanded since:24
+test_attention_4d_with_qk_matmul_softmax_expanded since:24
+test_averagepool_2d_ceil_last_window_starts_on_pad since:24
+test_conv_with_autopad_same
+test_convinteger_without_padding
+test_convtranspose_autopad_same since:18
+test_convtranspose_group_2 since:24
+test_convtranspose_group_2_image_3 since:24
+test_gelu_default_1 since:24
+test_gelu_default_1_expanded since:24
+test_gelu_default_2 since:24
+test_gelu_default_2_expanded since:24
+test_gelu_tanh_1 since:24
+test_gelu_tanh_1_expanded since:24
+test_gelu_tanh_2 since:24
+test_gelu_tanh_2_expanded since:24
+test_group_normalization_epsilon
+test_group_normalization_epsilon_expanded
+test_group_normalization_example
+test_group_normalization_example_expanded
+test_isnan_float16 since:24
+test_maxpool_2d_ceil_output_size_reduce_by_one since:24
+test_mvn_expanded_ver18
+test_qlinearconv
+test_qlinearmatmul_2D_uint8_float32 since:24
+test_qlinearmatmul_3D_uint8_float32 since:24
+test_reduce_sum_empty_axes_input_noop since:24
+test_rms_normalization_2d_axis0 since:24
+test_rms_normalization_2d_axis0_expanded since:24
+test_rms_normalization_2d_axis1 since:24
+test_rms_normalization_2d_axis1_expanded since:24
+test_rms_normalization_2d_axis_negative_1 since:24
+test_rms_normalization_2d_axis_negative_1_expanded since:24
+test_rms_normalization_2d_axis_negative_2 since:24
+test_rms_normalization_2d_axis_negative_2_expanded since:24
+test_rms_normalization_3d_axis0_epsilon since:24
+test_rms_normalization_3d_axis0_epsilon_expanded since:24
+test_rms_normalization_3d_axis1_epsilon since:24
+test_rms_normalization_3d_axis1_epsilon_expanded since:24
+test_rms_normalization_3d_axis2_epsilon since:24
+test_rms_normalization_3d_axis2_epsilon_expanded since:24
+test_rms_normalization_3d_axis_negative_1_epsilon since:24
+test_rms_normalization_3d_axis_negative_1_epsilon_expanded since:24
+test_rms_normalization_3d_axis_negative_2_epsilon since:24
+test_rms_normalization_3d_axis_negative_2_epsilon_expanded since:24
+test_rms_normalization_3d_axis_negative_3_epsilon since:24
+test_rms_normalization_3d_axis_negative_3_epsilon_expanded since:24
+test_rms_normalization_4d_axis0 since:24
+test_rms_normalization_4d_axis0_expanded since:24
+test_rms_normalization_4d_axis1 since:24
+test_rms_normalization_4d_axis1_expanded since:24
+test_rms_normalization_4d_axis2 since:24
+test_rms_normalization_4d_axis2_expanded since:24
+test_rms_normalization_4d_axis3 since:24
+test_rms_normalization_4d_axis3_expanded since:24
+test_rms_normalization_4d_axis_negative_1 since:24
+test_rms_normalization_4d_axis_negative_1_expanded since:24
+test_rms_normalization_4d_axis_negative_2 since:24
+test_rms_normalization_4d_axis_negative_2_expanded since:24
+test_rms_normalization_4d_axis_negative_3 since:24
+test_rms_normalization_4d_axis_negative_3_expanded since:24
+test_rms_normalization_4d_axis_negative_4 since:24
+test_rms_normalization_4d_axis_negative_4_expanded since:24
+test_rms_normalization_default_axis since:24
+test_rms_normalization_default_axis_expanded since:24
+test_rotary_embedding.* since:24
+test_scatter_elements_with_duplicate_indices
+test_swish since:24
+test_swish_expanded since:24
+test_top_k_same_values since:24
+test_top_k_same_values_2d since:24
+test_top_k_same_values_largest since:24
+test_top_k_uint64 since:24
diff --git a/test-rt/suite-unit/src/q_binary.rs b/test-rt/suite-unit/src/q_binary.rs
index 83ce98f5cd..84727b5e39 100644
--- a/test-rt/suite-unit/src/q_binary.rs
+++ b/test-rt/suite-unit/src/q_binary.rs
@@ -169,6 +169,17 @@ pub fn suite() -> TractResult<TestSuite> {
             c_dt: qu8_dt(0, 1.),
         },
     );
+    // declutter_absorbing was shunting the absorbing input (QU8(Z:61 S:1)) directly
+    // to the output (QU8(Z:0 S:0.5)), causing a type mismatch.
+    suite.add(
+        "bug_absorbing_quant_type_mismatch",
+        QBinaryOpProblem {
+            operator: tract_core::ops::math::mul(),
+            tensor_a: qu8_tensor0(0u8, 0, 1.5)?,
+            tensor_b: qu8_tensor0(61u8, 61, 1.)?,
+            c_dt: qu8_dt(0, 0.5),
+        },
+    );
 
     suite.add(
         "trivial_mul_as_qu8_overflow_clamp",
diff --git a/test-rt/test-blas/Cargo.toml b/test-rt/test-blas/Cargo.toml
deleted file mode 100644
index 478331f704..0000000000
--- a/test-rt/test-blas/Cargo.toml
+++ /dev/null
@@ -1,24 +0,0 @@
-[package]
-name = "test-blas"
-version = "0.1.0"
-edition = "2024"
-
-[dependencies]
-
-[build-dependencies]
-infra = { path = "../infra" }
-itertools.workspace = true
-lazy_static.workspace = true
-suite-onnx = { path = "../suite-onnx" }
-suite-unit = { path = "../suite-unit" }
-tract-core = { workspace = true, features = [ "blis" ] }
-
-[dev-dependencies]
-infra = { path = "../infra" }
-itertools.workspace = true
-lazy_static.workspace = true
-log.workspace = true
-suite-onnx = { path = "../suite-onnx" }
-suite-unit = { path = "../suite-unit" }
-tract-core = { workspace = true, features = [ "blis" ] }
-tract-onnx-opl.workspace = true
diff --git a/test-rt/test-blas/build.rs b/test-rt/test-blas/build.rs
deleted file mode 100644
index 2fdb675d5a..0000000000
--- a/test-rt/test-blas/build.rs
+++ /dev/null
@@ -1,11 +0,0 @@
-#[path = "suite.rs"]
-mod suite;
-
-fn main() {
-    suite::suite().test_runtime(
-        "as_blas",
-        "suite::suite()",
-        "as_blas()",
-        "Approximation::Approximate",
-    );
-}
diff --git a/test-rt/test-blas/src/lib.rs b/test-rt/test-blas/src/lib.rs
deleted file mode 100644
index 55ef62358a..0000000000
--- a/test-rt/test-blas/src/lib.rs
+++ /dev/null
@@ -1,37 +0,0 @@
-#![cfg(test)]
-use std::fmt::Debug;
-
-use tract_core::internal::*;
-
-#[path = "../suite.rs"]
-mod suite;
-
-mod as_blas {
-    use super::*;
-
-    pub fn as_blas() -> &'static AsBlasRuntime {
-        &AsBlasRuntime
-    }
-
-    #[derive(Debug)]
-    pub struct AsBlasRuntime;
-
-    impl Runtime for AsBlasRuntime {
-        fn name(&self) -> StaticName {
-            Cow::Borrowed("as_blas")
-        }
-        fn prepare_with_options(
-            &self,
-            mut model: TypedModel,
-            options: &RunOptions,
-        ) -> TractResult<Box<dyn Runnable>> {
-            tract_core::transform::get_transform("as_blas")?.unwrap().transform(&mut model)?;
-            Ok(Box::new(model.into_runnable_with_options(options)?))
-        }
-        fn check(&self) -> TractResult<()> {
-            Ok(())
-        }
-    }
-
-    include!(concat!(env!("OUT_DIR"), "/tests/as_blas.rs"));
-}
diff --git a/test-rt/test-blas/suite.rs b/test-rt/test-blas/suite.rs
deleted file mode 100644
index 41d755bdfe..0000000000
--- a/test-rt/test-blas/suite.rs
+++ /dev/null
@@ -1,27 +0,0 @@
-use infra::Test;
-
-pub fn suite() -> &'static infra::TestSuite {
-    lazy_static::lazy_static! {
-        static ref SUITE: infra::TestSuite  = mk_suite();
-    };
-    &SUITE
-}
-
-#[allow(clippy::needless_update)]
-fn mk_suite() -> infra::TestSuite {
-    let mut onnx = suite_onnx::suite().clone();
-    onnx.ignore(&ignore_onnx);
-
-    let mut unit = suite_unit::suite().unwrap().clone();
-    unit.ignore_case(&ignore_unit);
-
-    infra::TestSuite::default().with("onnx", onnx).with("unit", unit)
-}
-
-fn ignore_onnx(_t: &[String]) -> bool {
-    false
-}
-
-fn ignore_unit(_t: &[String], _tc: &dyn Test) -> bool {
-    false
-}
diff --git a/test-rt/test-onnx-core/build.rs b/test-rt/test-onnx-core/build.rs
index f605c659b9..ad47c9b621 100644
--- a/test-rt/test-onnx-core/build.rs
+++ b/test-rt/test-onnx-core/build.rs
@@ -7,5 +7,4 @@ fn main() {
         "unoptimized()",
         "Approximation::Approximate",
     );
-    suite.test_runtime("as_blas", "suite_onnx::suite()", "as_blas()", "Approximation::Approximate");
 }
diff --git a/tflite/src/rewriter.rs b/tflite/src/rewriter.rs
index a1a020196c..45f4c8f0fa 100644
--- a/tflite/src/rewriter.rs
+++ b/tflite/src/rewriter.rs
@@ -37,9 +37,7 @@ fn trivial_axes_around_matmul(
 ) -> TractResult<Option<TypedModelPatch>> {
     let facts = model.node_input_facts(node.id)?;
     let rank = facts[0].rank();
-    if rank <= 4 {
-        return Ok(None);
-    }
+    rule_if!(rank > 4);
     let trivial_axes = (0..rank - 2)
         .filter(|axis| facts[0].shape[*axis].is_one() && facts[1].shape[*axis].is_one())
         .collect_vec();
@@ -68,9 +66,7 @@ fn kernel_in_ohwi(
     name: &str,
     conv: &Conv,
 ) -> TractResult<Option<TypedModelPatch>> {
-    if conv.kernel_fmt == KernelFormat::OHWI {
-        return Ok(None);
-    }
+    rule_if!(conv.kernel_fmt != KernelFormat::OHWI);
     if conv.group != 1 && conv.group != conv.output_channels() {
         bail!("Arbitrary grouping is not supported in tflite")
     }
@@ -116,9 +112,7 @@ fn bias_as_vector(
 ) -> TractResult<Option<TypedModelPatch>> {
     let bias_fact = model.outlet_fact(node.inputs[2])?;
     let co = conv.output_channels();
-    if *bias_fact.shape == [co.to_dim()] {
-        return Ok(None);
-    }
+    rule_if!(*bias_fact.shape != [co.to_dim()]);
     let mut patch = TypedModelPatch::default();
     let mut wire = patch.taps(model, &node.inputs)?;
     wire[2] = tract_core::ops::cnn::wire_reshape_bias_as_vector(
@@ -144,12 +138,8 @@ let input_fact = model.outlet_fact(node.inputs[0])?;
 let idt = input_fact.datum_type;
 let kernel_fact = model.outlet_fact(node.inputs[1])?;
 let kdt = kernel_fact.datum_type;
-if idt.is_float() || model.outlet_fact(node.inputs[6])?.shape.len() > 1 {
-return Ok(None);
-}
-if idt.unquantized() == u8::datum_type() && kdt.unquantized() == u8::datum_type() {
-return Ok(None);
-}
+rule_if!(!idt.is_float() && model.outlet_fact(node.inputs[6])?.shape.len() <= 1);
+rule_if!(idt.unquantized() != u8::datum_type() || kdt.unquantized() != u8::datum_type());
 let mut patch = TypedModelPatch::default();
 let wire = patch.taps(model, &node.inputs)?;
 let [mut i, mut k, b, mut i0, is, mut k0, ks, o0, os] = &*wire else {
diff --git a/transformers/src/lib.rs b/transformers/src/lib.rs
index 36432eb401..fc83923999 100644
--- a/transformers/src/lib.rs
+++ b/transformers/src/lib.rs
@@ -6,6 +6,7 @@ use rewriter::*;
 use tract_nnef::internal::*;
 
 register_simple_model_transform!("detect_apply_rope", ApplyRopeTransform);
+register_simple_model_transform!("detect_diag_gather", DetectDiagGatherTransform);
 register_simple_model_transform!("detect_scaled_masked_softmax", ScaledMaskedSoftmaxTransform);
 register_simple_model_transform!("detect_kv_cache", KeyValueCacheTransform);
 register_simple_model_transform!(
@@ -29,7 +30,6 @@ pub trait WithTractTransformers {
 
 impl WithTractTransformers for tract_nnef::framework::Nnef {
     fn enable_tract_transformers(&mut self) {
-        self.enable_tract_core();
         self.registries.push(tract_transformers_registry());
     }
 
diff --git a/transformers/src/ops/apply_rope.rs b/transformers/src/ops/apply_rope.rs
index 9876bfabc0..0616ab2e7f 100644
--- a/transformers/src/ops/apply_rope.rs
+++ b/transformers/src/ops/apply_rope.rs
@@ -215,15 +215,14 @@ pub fn apply_rope_rule(
     // If cos and rotate half don't share the same input, we check if they don't
     // input node that are the same.
     let (apply_rope_in, cos) = if !cos_mul.inputs.contains(&rotate_half.inputs[0]) {
-        let Some(rotate_half_prev) = model.previous_node(rotate_half) else { return Ok(None) };
-        let Some((cos_common_input_idx, _)) = model
-            .previous_nodes(cos_mul)
-            .iter()
-            .enumerate()
-            .find(|(_, n)| n.same_as(rotate_half_prev))
-        else {
-            return Ok(None);
-        };
+        rule_if_some!(rotate_half_prev = model.previous_node(rotate_half));
+        rule_if_some!(
+            (cos_common_input_idx, _) = model
+                .previous_nodes(cos_mul)
+                .iter()
+                .enumerate()
+                .find(|(_, n)| n.same_as(rotate_half_prev))
+        );
         (rotate_half.inputs[0], cos_mul.inputs[1 - cos_common_input_idx])
     } else {
         let apply_rope_in = rotate_half.inputs[0];
diff --git a/transformers/src/ops/diag_gather.rs b/transformers/src/ops/diag_gather.rs
new file mode 100644
index 0000000000..ad8f61b16b
--- /dev/null
+++ b/transformers/src/ops/diag_gather.rs
@@ -0,0 +1,556 @@
+//! Transformer-XL relative-position "skew trick" folded into a single op.
+//!
+//! The skew chain `Pad(axis, pre=1) → Reshape([T,2T]→[2T,T]) → Slice(start=1)
+//! → Reshape([2T-1,T]→[T,2T-1]) → Slice(end=T)` converts relative-position
+//! scores `[…, T, 2T-1]` into absolute-position scores `[…, T, T]`.  This
+//! module replaces that 5-op chain with a single [`DiagGather`] whose
+//! per-element semantics are trivial: `output[…, i, k] = input[…, i, offset + k − i]`.
+//!
+//! Folding is a pure typed-model rewrite (strength reduction): the op is
+//! cheap to evaluate, pulsifier-friendly, and easier for downstream passes to
+//! reason about than the chain it replaces.
+
+use tract_nnef::internal::*;
+use tract_nnef::tract_core::ops::array::{Pad, PadMode, Slice};
+use tract_nnef::tract_core::ops::change_axes::{AxisOp, InOut};
+
+/// Diagonal gather: `output[…, i, k] = input[…, i, offset + k − i]`
+///
+/// `offset` is the centre of the relative-position table (typically `T - 1`)
+/// and `out_len` is the number of output columns per query row (typically `T`).
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub struct DiagGather {
+    /// Centre of the relative position table: `T − 1`.
+    pub offset: TDim,
+    /// Number of output columns per query row.
+    pub out_len: TDim,
+}
+
+impl Op for DiagGather {
+    fn name(&self) -> StaticName {
+        "DiagGather".into()
+    }
+
+    fn info(&self) -> TractResult<Vec<String>> {
+        Ok(vec![format!("offset={}, out_len={}", self.offset, self.out_len)])
+    }
+
+    op_as_typed_op!();
+}
+
+impl EvalOp for DiagGather {
+    fn is_stateless(&self) -> bool {
+        true
+    }
+
+    fn eval_with_session(
+        &self,
+        _node_id: usize,
+        session: &TurnState,
+        inputs: TVec<TValue>,
+    ) -> TractResult<TVec<TValue>> {
+        let input = args_1!(inputs);
+        let rank = input.rank();
+        let t = input.shape()[rank - 2];
+        let r = input.shape()[rank - 1];
+        let offset = self.offset.eval(&session.resolved_symbols).to_i64()? as isize;
+        let out_len = self.out_len.eval(&session.resolved_symbols).to_usize()?;
+
+        let mut out_shape: TVec<usize> = input.shape().into();
+        out_shape[rank - 1] = out_len;
+
+        unsafe {
+            let mut output = Tensor::uninitialized_dt(input.datum_type(), &out_shape)?;
+            let elem_size = input.datum_type().size_of();
+            let in_ptr = input.as_ptr_unchecked::<u8>();
+            let out_ptr = output.as_ptr_mut_unchecked::<u8>();
+
+            let batch_size: usize = out_shape[..rank - 2].iter().product();
+            let in_row_stride = r * elem_size;
+            let out_row_stride = out_len * elem_size;
+
+            for b in 0..batch_size {
+                for i in 0..t {
+                    let in_row = in_ptr.add((b * t + i) * in_row_stride);
+                    let out_row = out_ptr.add((b * t + i) * out_row_stride);
+                    for k in 0..out_len {
+                        let idx = offset + k as isize - i as isize;
+                        if idx >= 0 && (idx as usize) < r {
+                            std::ptr::copy_nonoverlapping(
+                                in_row.add(idx as usize * elem_size),
+                                out_row.add(k * elem_size),
+                                elem_size,
+                            );
+                        } else {
+                            std::ptr::write_bytes(out_row.add(k * elem_size), 0, elem_size);
+                        }
+                    }
+                }
+            }
+            Ok(tvec!(output.into_tvalue()))
+        }
+    }
+}
+
+impl TypedOp for DiagGather {
+    fn output_facts(&self, inputs: &[&TypedFact]) -> TractResult<TVec<TypedFact>> {
+        let mut shape: TVec<TDim> = inputs[0].shape.to_tvec();
+        let rank = shape.len();
+        shape[rank - 1] = self.out_len.clone();
+        Ok(tvec!(inputs[0].datum_type.fact(&shape)))
+    }
+
+    fn axes_mapping(
+        &self,
+        inputs: &[&TypedFact],
+        _outputs: &[&TypedFact],
+    ) -> TractResult<AxesMapping> {
+        // All axes map 1:1 between input and output.
+        // The last axis is semantically a gather (not element-wise), but
+        // for axis tracking purposes it maps input-last to output-last.
+        AxesMapping::natural_for_rank(1, 1, inputs[0].rank())
+    }
+
+    fn input_roi(
+        &self,
+        model: &TypedModel,
+        node: &TypedNode,
+    ) -> TractResult<Option<TVec<Option<TDim>>>> {
+        // Output indexing: out[..., q, c] = in[..., q, offset + c - q]
+        // So input position (q, r) is read for output position (q, r + q - offset).
+        // To translate output ROI to input ROI, substitute the c symbol with
+        // (r + q - offset) where r is the input's last axis and q is the
+        // shared axis at rank-2.
+        let output_fact = model.outlet_fact(OutletId::new(node.id, 0))?;
+        let Some(roi) = &output_fact.region_of_interest else { return Ok(None) };
+        let rank = output_fact.shape.rank();
+        if rank < 2 {
+            return Ok(Some(tvec![Some(roi.clone())]));
+        }
+        let c_sym = roi
+            .symbols()
+            .into_iter()
+            .find(|s| tract_nnef::tract_core::ops::logic::sym_to_coord_axis(s) == Some(rank - 1));
+        let Some(c_sym) = c_sym else {
+            // No mention of the column axis — pass through unchanged.
+            return Ok(Some(tvec![Some(roi.clone())]));
+        };
+        let Some(scope) = c_sym.scope() else { return Ok(Some(tvec![Some(roi.clone())])) };
+        let q_sym = TDim::Sym(scope.coord_sym(rank - 2));
+        let r_expr = TDim::Sym(c_sym.clone()) + q_sym - self.offset.clone();
+        let input_roi = roi.substitute(&c_sym, &r_expr).map(|d| d.reduce()).unwrap_or(roi.clone());
+        Ok(Some(tvec![Some(input_roi)]))
+    }
+
+    fn substitute_symbols(
+        &self,
+        _source: &TypedModel,
+        node: &TypedNode,
+        target: &mut TypedModel,
+        mapping: &HashMap<OutletId, OutletId>,
+        subs: &HashMap<Symbol, TDim>,
+    ) -> TractResult<TVec<OutletId>> {
+        let inputs = node.inputs.iter().map(|i| mapping[i]).collect::<TVec<_>>();
+        let op = DiagGather {
+            offset: self.offset.substitute_all(subs)?,
+            out_len: self.out_len.substitute_all(subs)?,
+        };
+        target.wire_node(&node.name, op, &inputs)
+    }
+
+    fn declutter(
+        &self,
+        model: &TypedModel,
+        node: &TypedNode,
+    ) -> TractResult<Option<TypedModelPatch>> {
+        declutter_narrow_via_band_roi(self, model, node)
+    }
+
+    as_op!();
+}
+
+/// Coordinated narrow: when an axes-preserving chain upstream of this
+/// DiagGather terminates at a `Slice` whose output has a constant-width
+/// band ROI on the slice axis, narrow the Slice to that band AND re-anchor
+/// `DG.offset` so the rel-pos-zero column stays at the right place.
+///
+/// Math: with `lo, hi_excl` the band bounds on the slice's output axis,
+/// `new_slice.start = old_slice.start + lo`, `new_slice.end =
+/// old_slice.start + hi_excl`, `new_offset = old_offset − lo`.  All three
+/// must simplify to concrete integers before applying.
+fn declutter_narrow_via_band_roi(
+    op: &DiagGather,
+    model: &TypedModel,
+    node: &TypedNode,
+) -> TractResult<Option<TypedModelPatch>> {
+    // The rel-pos axis on DG's input is the last axis.
+    let dg_input_rank = model.outlet_fact(node.inputs[0])?.shape.rank();
+    if dg_input_rank < 1 {
+        return Ok(None);
+    }
+    let rel_pos_axis = dg_input_rank - 1;
+
+    let Some(trace) = trace_back_to_slice(model, node.inputs[0], rel_pos_axis)? else {
+        return Ok(None);
+    };
+    let slice_node = &model.nodes()[trace.slice_id];
+    let Some(slice_op) = slice_node.op_as::<Slice>() else { return Ok(None) };
+    let slice_fact = model.outlet_fact(OutletId::new(trace.slice_id, 0))?;
+    let Some(roi) = &slice_fact.region_of_interest else { return Ok(None) };
+    let scope = model.symbols.clone();
+    let axis_sym = scope.coord_sym(slice_op.axis);
+    let Some((lo, hi_excl)) = bounds_on_axis_tdim(roi, &axis_sym) else {
+        return Ok(None);
+    };
+
+    // Compute new slice bounds.  Both must reduce to concrete integers
+    // (the downstream chain's `output_facts` re-derives the chain shape
+    // from the narrowed slice, so symbolic bounds would leave it stuck).
+    let new_start = (slice_op.start.clone() + lo.clone()).reduce();
+    let new_end = (slice_op.start.clone() + hi_excl.clone()).reduce();
+    if new_start.as_i64().is_none() || new_end.as_i64().is_none() {
+        return Ok(None);
+    }
+    if new_start == slice_op.start && new_end == slice_op.end {
+        return Ok(None); // No narrowing.
+    }
+
+    // Re-anchor DG.offset.  `op.offset` corresponds to "centre − slice.start"
+    // in absolute pos_enc rows; narrowing shifts slice.start by lo, so
+    // new_offset = old_offset − lo.
+    let new_offset_tdim = (op.offset.clone() - lo).reduce();
+    let Ok(new_offset) = new_offset_tdim.to_i64() else { return Ok(None) };
+
+    // Build coordinated patch: re-wire slice with narrow bounds, replay
+    // intermediate chain nodes, then wire new DG with adjusted offset.
+    let mut patch = TypedModelPatch::new(format!("narrow_via_band_roi@{}", node.name));
+    let src = patch.tap_model(model, slice_node.inputs[0])?;
+    let new_slice = Slice { axis: slice_op.axis, start: new_start, end: new_end };
+    let mut current =
+        patch.wire_node(format!("{}.narrowed", slice_node.name), new_slice, &[src])?[0];
+
+    // `trace.intermediate` is the chain from slice's successor up to (but
+    // not including) DG itself, ordered from upstream to downstream.
+    for (chain_nid, path_in_idx) in &trace.intermediate {
+        let chain_node = &model.nodes()[*chain_nid];
+        let mut new_inputs: TVec<OutletId> = tvec!();
+        for (i, inp) in chain_node.inputs.iter().enumerate() {
+            if i == *path_in_idx {
+                new_inputs.push(current);
+            } else {
+                new_inputs.push(patch.tap_model(model, *inp)?);
+            }
+        }
+        current = patch.wire_node(
+            format!("{}.narrow_replay", chain_node.name),
+            chain_node.op.clone(),
+            &new_inputs,
+        )?[0];
+    }
+
+    let new_dg = DiagGather { offset: TDim::Val(new_offset), out_len: op.out_len.clone() };
+    let new_dg_out =
+        patch.wire_node(format!("{}.narrowed_offset", node.name), new_dg, &[current])?[0];
+    patch.shunt_outside(model, OutletId::new(node.id, 0), new_dg_out)?;
+    Ok(Some(patch))
+}
+
+/// Result of walking backward from a DiagGather along the rel-pos axis
+/// through axes-mapping-preserving ops until a `Slice` is reached.
+struct ReverseTrace {
+    slice_id: usize,
+    /// Chain nodes between (exclusive) Slice and (exclusive) DG, ordered
+    /// upstream→downstream, with the input index that carries the rel-pos
+    /// axis at each step.
+    intermediate: Vec<(usize, usize)>,
+}
+
+fn trace_back_to_slice(
+    model: &TypedModel,
+    start_outlet: OutletId,
+    start_axis: usize,
+) -> TractResult<Option<ReverseTrace>> {
+    let mut current_outlet = start_outlet;
+    let mut current_axis = start_axis;
+    let mut intermediate: Vec<(usize, usize)> = vec![];
+    for _ in 0..32 {
+        let node = &model.nodes()[current_outlet.node];
+        if let Some(slice_op) = node.op_as::<Slice>()
+            && slice_op.axis == current_axis
+        {
+            intermediate.reverse();
+            return Ok(Some(ReverseTrace { slice_id: node.id, intermediate }));
+        }
+        let input_facts: TVec<&TypedFact> =
+            node.inputs.iter().map(|inp| model.outlet_fact(*inp)).collect::<TractResult<_>>()?;
+        let output_facts: TVec<&TypedFact> = node.outputs.iter().map(|o| &o.fact).collect();
+        let Ok(mapping) = node.op.axes_mapping(&input_facts, &output_facts) else {
+            return Ok(None);
+        };
+        let mut advanced: Option<(usize, usize)> = None;
+        for (i, _inp) in node.inputs.iter().enumerate() {
+            let Some(in_axis) = mapping
+                .track_axis((InOut::Out(current_outlet.slot), current_axis), InOut::In(i))?
+            else {
+                continue;
+            };
+            // Only follow inputs whose axis size matches output axis size on this axis.
+            let in_fact = &input_facts[i];
+            if in_fact.shape[in_axis] == node.outputs[current_outlet.slot].fact.shape[current_axis]
+            {
+                advanced = Some((i, in_axis));
+                break;
+            }
+        }
+        let Some((idx, ax)) = advanced else { return Ok(None) };
+        intermediate.push((current_outlet.node, idx));
+        current_outlet = node.inputs[idx];
+        current_axis = ax;
+    }
+    Ok(None)
+}
+
+/// Extract `(lo, hi_excl)` TDim bounds from a band ROI predicate of shape
+/// `Mul(Ge(hi, 🎯_axis), Ge(🎯_axis, lo))` (either order).  The returned
+/// `hi_excl = hi + 1` matches the half-open `[lo, hi_excl)` convention
+/// used by `Slice::{start, end}`.
+fn bounds_on_axis_tdim(roi: &TDim, axis_sym: &Symbol) -> Option<(TDim, TDim)> {
+    let TDim::Mul(terms) = roi else { return None };
+    if terms.len() != 2 {
+        return None;
+    }
+    let mut lo: Option<TDim> = None;
+    let mut hi: Option<TDim> = None;
+    for term in terms {
+        let TDim::Ge(left, right) = term else { return None };
+        if let TDim::Sym(s) = left.as_ref()
+            && s == axis_sym
+        {
+            lo = Some((**right).clone());
+            continue;
+        }
+        if let TDim::Sym(s) = right.as_ref()
+            && s == axis_sym
+        {
+            hi = Some((**left).clone());
+            continue;
+        }
+        return None;
+    }
+    Some((lo?, hi? + TDim::Val(1)))
+}
+
+// ─── Detect pass: Pad → Reshape → Slice → Reshape → Slice  →  DiagGather ──
+
+/// Scan the model for skew-trick chains and replace each with a single
+/// [`DiagGather`].
+pub fn detect_diag_gather(model: &mut TypedModel) -> TractResult<()> {
+    Rewriter::default()
+        .with_rule_for::<Pad>("detect-diag-gather", diag_gather_rule)
+        .rewrite(&(), model)
+}
+
+/// Rewrite rule fired by `Rewriter` on each `Pad` node — matches the
+/// `Pad → Reshape → Slice → Reshape → Slice` skew chain anchored at this
+/// `Pad` and replaces it with a single `DiagGather`.
+pub fn diag_gather_rule(
+    _ctx: &(),
+    model: &TypedModel,
+    pad_node: &TypedNode,
+    _node_name: &str,
+    pad_op: &Pad,
+) -> TractResult<Option<TypedModelPatch>> {
+    // ── Step 1: Pad must be Constant(0), one axis (pre=1, post=0), last axis ─
+    rule_if_let!(PadMode::Constant(c) = &pad_op.mode);
+    rule_if!(c.cast_to_scalar::<f64>().ok() == Some(0.0));
+    rule_if_some!(pad_axis = pad_op.pads.iter().position(|&(a, b)| a != 0 || b != 0));
+    rule_if!(pad_op.pads[pad_axis] == (1, 0));
+    rule_if!(
+        !pad_op.pads.iter().enumerate().any(|(i, &(a, b))| i != pad_axis && (a != 0 || b != 0))
+    );
+    let rank = model.outlet_fact(pad_node.inputs[0])?.rank();
+    rule_if!(pad_axis == rank - 1);
+
+    // ── Step 2: Pad → Reshape (transpose last two axes) ────────────────────
+    rule_if_some!(reshape1_node = model.single_succ(pad_node.id)?);
+    rule_if_let!(Some(AxisOp::Reshape(at1, from1, to1)) = reshape1_node.op_as::<AxisOp>());
+    rule_if!(from1.len() == 2 && to1.len() == 2);
+    // Block must cover (query axis, padded axis); at1 = rank-2, pad_axis = rank-1.
+    rule_if!(*at1 + 1 == pad_axis);
+    // Verify transpose: from=[D1, D2] to=[D2, D1].
+    rule_if!(from1[0] == to1[1] && from1[1] == to1[0]);
+    let d1 = &from1[0]; // query dim (T)
+
+    // ── Step 3: Reshape → Slice (drop the leading padded row) ──────────────
+    rule_if_some!(slice1_node = model.single_succ(reshape1_node.id)?);
+    rule_if_some!(slice1_op = slice1_node.op_as::<Slice>());
+    rule_if!(slice1_op.axis == *at1 && slice1_op.start == 1.to_dim());
+
+    // ── Step 4: Slice → Reshape (transpose back) ───────────────────────────
+    rule_if_some!(reshape2_node = model.single_succ(slice1_node.id)?);
+    rule_if_let!(Some(AxisOp::Reshape(at2, from2, to2)) = reshape2_node.op_as::<AxisOp>());
+    rule_if!(from2.len() == 2 && to2.len() == 2);
+    rule_if!(*at2 == *at1);
+    // Inverse transpose: from=[D2-1, D1] to=[D1, D2-1].
+    rule_if!(from2[0] == to2[1] && from2[1] == to2[0]);
+    rule_if!(from2[1] == *d1);
+
+    // ── Step 5: Reshape → Slice (take first D1 columns) ────────────────────
+    rule_if_some!(slice2_node = model.single_succ(reshape2_node.id)?);
+    rule_if_some!(slice2_op = slice2_node.op_as::<Slice>());
+    rule_if!(slice2_op.axis == at2 + 1 && slice2_op.start == 0.to_dim());
+
+    // ── Build the replacement DiagGather ────────────────────────────────────
+    let diag_gather = DiagGather {
+        offset: d1.clone() - 1,                            // T - 1
+        out_len: slice2_op.end.clone() - &slice2_op.start, // = D1
+    };
+
+    let mut patch = TypedModelPatch::new("detect-diag-gather");
+    let pos_raw = patch.tap_model(model, pad_node.inputs[0])?;
+    let out = patch.wire_node(&slice2_node.name, diag_gather, &[pos_raw])?[0];
+    patch.shunt_outside(model, slice2_node.id.into(), out)?;
+
+    Ok(Some(patch))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build the skew trick chain and verify DiagGather fold produces correct output.
+    #[test]
+    fn test_detect_diag_gather_concrete() -> TractResult<()> {
+        let t: usize = 4;
+        let r = 2 * t - 1; // 7
+
+        // Build a model with the skew trick chain.
+        let mut model = TypedModel::default();
+        let input = model.add_source("pos_raw", f32::fact(&[1, t, r]))?;
+
+        // Pad axis 2, pre=1
+        let mut pads = vec![(0, 0); 3];
+        pads[2] = (1, 0);
+        let padded = model.wire_node(
+            "pad",
+            Pad::new(pads, PadMode::Constant(rctensor0(0.0f32))),
+            &[input],
+        )?[0];
+
+        // Reshape [T, 2T] → [2T, T]
+        let reshaped1 = model.wire_node(
+            "reshape1",
+            AxisOp::Reshape(
+                1,
+                tvec![t.to_dim(), (2 * t).to_dim()],
+                tvec![(2 * t).to_dim(), t.to_dim()],
+            ),
+            &[padded],
+        )?[0];
+
+        // Slice axis=1, start=1, end=2T
+        let sliced1 = model.wire_node("slice1", Slice::new(1, 1, 2 * t), &[reshaped1])?[0];
+
+        // Reshape [2T-1, T] → [T, 2T-1]
+        let reshaped2 = model.wire_node(
+            "reshape2",
+            AxisOp::Reshape(
+                1,
+                tvec![(2 * t - 1).to_dim(), t.to_dim()],
+                tvec![t.to_dim(), (2 * t - 1).to_dim()],
+            ),
+            &[sliced1],
+        )?[0];
+
+        // Slice axis=2, start=0, end=T
+        let sliced2 = model.wire_node("slice2", Slice::new(2, 0, t), &[reshaped2])?[0];
+
+        model.select_output_outlets(&[sliced2])?;
+
+        // Run the original model.
+        let mut rng = 42u64;
+        let input_data: Vec<f32> = (0..(t * r))
+            .map(|_| {
+                rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
+                (rng >> 33) as f32 / 1000.0
+            })
+            .collect();
+        let input_tensor = tensor1(&input_data).into_shape(&[1, t, r])?;
+        let original_output =
+            model.clone().into_runnable()?.run(tvec![input_tensor.clone().into()])?;
+
+        // Fold.
+        let mut folded = model.clone();
+        detect_diag_gather(&mut folded)?;
+
+        // Verify the folded model has a DiagGather node.
+        assert!(
+            folded.nodes().iter().any(|n| n.op_as::<DiagGather>().is_some()),
+            "folded model should contain DiagGather"
+        );
+
+        // Run the folded model.
+        let folded_output = folded.into_runnable()?.run(tvec![input_tensor.into()])?;
+
+        // Compare outputs.
+        let orig = original_output[0].to_plain_array_view::<f32>()?;
+        let fold = folded_output[0].to_plain_array_view::<f32>()?;
+        assert_eq!(orig.shape(), fold.shape());
+        for (a, b) in orig.iter().zip(fold.iter()) {
+            assert!((*a - *b).abs() < 1e-6, "Mismatch: original={a}, folded={b}");
+        }
+        Ok(())
+    }
+
+    /// DiagGather's `input_roi` should substitute the column axis `c` with
+    /// `r + q - offset`: the output index `(q, c)` reads input index
+    /// `(q, offset + c - q)`, so input position `(q, r)` matters iff there's
+    /// some output `(q, c)` with `r = offset + c - q`, i.e. `c = r + q - offset`.
+    ///
+    /// Test case: a diagonal-of-width-3 band on the output `(q, c)` —
+    /// `Mul(Ge(c, q-1), Ge(q+1, c))` — should translate to a CONSTANT band
+    /// `2 ≤ r ≤ 4` on the input (q drops out), because the bandwidth is the
+    /// same offset around the diagonal.
+    #[test]
+    fn diag_gather_input_roi_substitutes_diagonal_coord() -> TractResult<()> {
+        let t: usize = 4;
+        let r = 2 * t - 1; // 7
+
+        let mut model = TypedModel::default();
+        let src = model.add_source("src", f32::fact(&[1, t, r]))?;
+        let dg = model.wire_node(
+            "dg",
+            DiagGather { offset: (t as i64 - 1).to_dim(), out_len: t.to_dim() },
+            &[src],
+        )?[0];
+        model.select_output_outlets(&[dg])?;
+
+        // Plant a diagonal band ROI on dg's output: |q - c| <= 1.
+        // That is: Ge(c, q - 1) AND Ge(q + 1, c).
+        let q_sym = model.symbols.coord_sym(1);
+        let c_sym = model.symbols.coord_sym(2);
+        let q = TDim::Sym(q_sym);
+        let c = TDim::Sym(c_sym);
+        let band = TDim::Mul(vec![
+            TDim::Ge(Box::new(c.clone()), Box::new(q.clone() - TDim::Val(1))),
+            TDim::Ge(Box::new(q + TDim::Val(1)), Box::new(c)),
+        ]);
+        model.nodes_mut()[dg.node].outputs[0].fact.region_of_interest = Some(band);
+
+        // Call input_roi on the DG node and inspect what gets planted on input 0.
+        let dg_node = &model.nodes()[dg.node];
+        let input_rois = dg_node.op.as_typed().unwrap().input_roi(&model, dg_node)?;
+        let input_rois = input_rois.expect("DG should return Some");
+        let input_roi = input_rois[0].as_ref().expect("DG should plant on input 0");
+
+        // Verify the substitution actually happened: `c` (🎯2) should now
+        // appear as the sum `🎯1 + 🎯2 - 3` in both Ge terms.
+        let printed = format!("{input_roi}");
+        eprintln!("DG input ROI: {printed}");
+        assert!(
+            printed.contains("🎯1+🎯2+-3") || printed.contains("🎯1+🎯2-3"),
+            "expected `c → r + q - offset` substitution, got {printed}"
+        );
+        Ok(())
+    }
+}
diff --git a/transformers/src/ops/mod.rs b/transformers/src/ops/mod.rs
index d3b3619f72..2648a01238 100644
--- a/transformers/src/ops/mod.rs
+++ b/transformers/src/ops/mod.rs
@@ -1,4 +1,5 @@
 pub mod apply_rope;
+pub mod diag_gather;
 pub mod dyn_kv_cache;
 pub mod flash_sdpa;
 pub mod scaled_masked_softmax;
@@ -20,6 +21,7 @@ pub mod gelu_approximate {
 }
 
 pub use apply_rope::{apply_rope_rule, rotate_half_rule};
+pub use diag_gather::{DiagGather, detect_diag_gather, diag_gather_rule};
 pub use dyn_kv_cache::{DynKeyValueCache, replace_kv_cache, unfold_kv_cache};
 pub use scaled_masked_softmax::scaled_masked_softmax_rule;
 pub use sdpa::fuse_kv_cache_broadcast_rule;
diff --git a/transformers/src/ops/scaled_masked_softmax.rs b/transformers/src/ops/scaled_masked_softmax.rs
index 0b87116391..104755665c 100644
--- a/transformers/src/ops/scaled_masked_softmax.rs
+++ b/transformers/src/ops/scaled_masked_softmax.rs
@@ -1,4 +1,5 @@
 use tract_nnef::internal::*;
+use tract_nnef::tract_core::axes::{AxesMapping, Axis};
 use tract_nnef::tract_core::ops::binary::{BinMiniOp, TypedBinOp};
 use tract_nnef::tract_core::ops::logic::Iff;
 use tract_nnef::tract_core::ops::math::{Add, Mul};
@@ -129,20 +130,134 @@ impl TypedOp for ScaledMaskedSoftmax {
         Ok(tvec!(fact))
     }
 
+    fn declutter(
+        &self,
+        model: &TypedModel,
+        node: &TypedNode,
+    ) -> TractResult<Option<TypedModelPatch>> {
+        // Insert leading `AddAxis` ops on the mask to bring it to the same
+        // rank as the scores.  SMS already broadcast-aligns right when the
+        // mask has lower rank (see `output_facts`), but downstream consumers
+        // like blockify require the chunked input axes to be at matched
+        // positions across all of SMS's inputs.  Pre-aligning ranks at
+        // declutter time satisfies both.
+        let scores_fact = model.outlet_fact(node.inputs[0])?;
+        let mask_fact = model.outlet_fact(node.inputs[1])?;
+        let rank_diff = scores_fact.shape.rank().saturating_sub(mask_fact.shape.rank());
+        if rank_diff > 0 && mask_fact.datum_type == bool::datum_type() {
+            let mut patch = TypedModelPatch::new(format!("sms_align_mask_rank@{}", node.name));
+            let scores = patch.tap_model(model, node.inputs[0])?;
+            let mut mask = patch.tap_model(model, node.inputs[1])?;
+            for i in 0..rank_diff {
+                mask = patch.wire_node(
+                    format!("{}.mask_unsqueeze_{i}", node.name),
+                    tract_nnef::tract_core::ops::change_axes::AxisOp::Add(0),
+                    &[mask],
+                )?[0];
+            }
+            let out = patch.wire_node(&node.name, self.clone(), &[scores, mask])?[0];
+            patch.shunt_outside(model, node.id.into(), out)?;
+            return Ok(Some(patch));
+        }
+        Ok(None)
+    }
+
     fn input_roi(
         &self,
         model: &TypedModel,
         node: &TypedNode,
     ) -> TractResult<Option<TVec<Option<TDim>>>> {
         // Introduction: mask's uniform_tdim defines which positions matter for scores.
+        // When the mask has lower rank than the scores, its coord symbols are
+        // expressed in mask-frame (🎯0 = mask axis 0).  Scores broadcasting
+        // right-aligns the mask, so mask axis K corresponds to scores axis
+        // K + rank_diff — remap each coord symbol accordingly before planting.
         let mask_fact = model.outlet_fact(node.inputs[1])?;
         if let Some(mask_expr) = &mask_fact.uniform_tdim {
-            return Ok(Some(tvec![Some(mask_expr.clone()), None]));
+            let scores_fact = model.outlet_fact(node.inputs[0])?;
+            let rank_diff = scores_fact.shape.rank().saturating_sub(mask_fact.shape.rank());
+            let remapped = if rank_diff == 0 {
+                mask_expr.clone()
+            } else {
+                let mut sub_map: HashMap<Symbol, TDim> = Default::default();
+                for sym in mask_expr.symbols() {
+                    let Some(k) = tract_nnef::tract_core::ops::logic::sym_to_coord_axis(&sym)
+                    else {
+                        continue;
+                    };
+                    let Some(scope) = sym.scope() else { continue };
+                    sub_map.insert(sym, TDim::Sym(scope.coord_sym(k + rank_diff)));
+                }
+                if sub_map.is_empty() {
+                    mask_expr.clone()
+                } else {
+                    mask_expr
+                        .substitute_all(&sub_map)
+                        .map(|d| d.reduce())
+                        .unwrap_or_else(|_| mask_expr.clone())
+                }
+            };
+            return Ok(Some(tvec![Some(remapped), None]));
         }
         // Bubbling: delegate to the natural blanket implementation.
         tract_nnef::tract_core::optim::propagate_roi::bubble_roi(model, node)
     }
 
+    /// Axes layout: every non-reducing axis is identity-mapped between
+    /// inputs and the output (with the bool mask right-aligned and
+    /// possibly missing leading axes).  The softmax-reducing axis (last)
+    /// is *deliberately disconnected* between input side and output side:
+    /// its size is preserved but it is not "the same axis" — splitting it
+    /// through a reshape would break softmax's normalisation semantics.
+    fn axes_mapping(
+        &self,
+        inputs: &[&TypedFact],
+        _outputs: &[&TypedFact],
+    ) -> TractResult<AxesMapping> {
+        let input = inputs[0];
+        let mask = inputs[1];
+        let rank = input.rank();
+        let mask_rank = mask.rank();
+        if rank == 0 {
+            return AxesMapping::disconnected_for_ranks(&[rank, mask_rank], &[rank]);
+        }
+
+        let mut axes: TVec<Axis> = tvec!();
+        let mut labels = 'a'..;
+        let rank_diff = rank.saturating_sub(mask_rank);
+        let mask_non_reducing = mask_rank.saturating_sub(1);
+
+        // Non-reducing axes (all but the last): identity-mapped between
+        // input 0 and the output.  Mask broadcasts right-aligned; align
+        // its non-reducing axes with the trailing input 0 axes.
+        for i in 0..rank.saturating_sub(1) {
+            let mut ax = Axis::new(labels.next().unwrap(), 2, 1).input(0, i).output(0, i);
+            if i >= rank_diff {
+                let mask_axis = i - rank_diff;
+                if mask_axis < mask_non_reducing {
+                    ax = ax.input(1, mask_axis);
+                }
+            }
+            axes.push(ax);
+        }
+
+        // Softmax-reducing axis on the *input* side: last axis of input 0
+        // (and last axis of the mask, when present).  Not connected to
+        // any output axis.
+        let mut in_red = Axis::new(labels.next().unwrap(), 2, 1).input(0, rank - 1);
+        if 0 < mask_rank && mask_rank <= rank {
+            in_red = in_red.input(1, mask_rank - 1);
+        }
+        axes.push(in_red);
+
+        // Reducing axis on the *output* side: a fresh label, only on the
+        // output.  Same size as the input-side reducing axis but
+        // semantically distinct (post-normalisation positions).
+        axes.push(Axis::new(labels.next().unwrap(), 2, 1).output(0, rank - 1));
+
+        AxesMapping::new(2, 1, axes)
+    }
+
     as_op!();
 }
 
@@ -368,3 +483,134 @@ fn try_extract_scale(
         .find(|o| model.outlet_fact(*o).map(|f| f.konst.is_none()).unwrap_or(false))?;
     Some((scores_outlet, scale))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tract_nnef::tract_core::ops::change_axes::InOut;
+
+    fn smsoftmax() -> ScaledMaskedSoftmax {
+        ScaledMaskedSoftmax { scale: tensor0(1f32).into_arc_tensor(), post_softmax_mask: false }
+    }
+
+    /// Same-rank f32 mask: non-reducing axes identity-mapped on both
+    /// inputs and output; reducing axis disconnected (input vs output).
+    #[test]
+    fn axes_mapping_same_rank_float_mask() {
+        let op = smsoftmax();
+        let input = f32::fact([2usize, 3, 4]);
+        let mask = f32::fact([2usize, 3, 4]);
+        let output = f32::fact([2usize, 3, 4]);
+        let am = op.axes_mapping(&[&input, &mask], &[&output]).unwrap();
+
+        // Non-reducing input axes track to the corresponding output axes.
+        assert_eq!(am.track_axis((InOut::In(0), 0), InOut::Out(0)).unwrap(), Some(0));
+        assert_eq!(am.track_axis((InOut::In(0), 1), InOut::Out(0)).unwrap(), Some(1));
+        // Mask's non-reducing axes also track to the same outputs.
+        assert_eq!(am.track_axis((InOut::In(1), 0), InOut::Out(0)).unwrap(), Some(0));
+        assert_eq!(am.track_axis((InOut::In(1), 1), InOut::Out(0)).unwrap(), Some(1));
+        // Reducing axis on either input does NOT track to the output.
+        assert_eq!(am.track_axis((InOut::In(0), 2), InOut::Out(0)).unwrap(), None);
+        assert_eq!(am.track_axis((InOut::In(1), 2), InOut::Out(0)).unwrap(), None);
+    }
+
+    /// Same-rank bool mask: same shape as f32 mask above.
+    #[test]
+    fn axes_mapping_same_rank_bool_mask() {
+        let op = smsoftmax();
+        let input = f32::fact([2usize, 3, 4]);
+        let mask = bool::fact([2usize, 3, 4]);
+        let output = f32::fact([2usize, 3, 4]);
+        let am = op.axes_mapping(&[&input, &mask], &[&output]).unwrap();
+        assert_eq!(am.track_axis((InOut::In(0), 0), InOut::Out(0)).unwrap(), Some(0));
+        assert_eq!(am.track_axis((InOut::In(0), 1), InOut::Out(0)).unwrap(), Some(1));
+        assert_eq!(am.track_axis((InOut::In(0), 2), InOut::Out(0)).unwrap(), None);
+    }
+
+    /// Lower-rank bool mask: broadcasts right-aligned.  Mask axis 0
+    /// aligns with input axis 1 (= rank − mask_rank + 0 = 3 − 2 + 0).
+    #[test]
+    fn axes_mapping_broadcast_bool_mask() {
+        let op = smsoftmax();
+        let input = f32::fact([2usize, 3, 4]);
+        let mask = bool::fact([3usize, 4]);
+        let output = f32::fact([2usize, 3, 4]);
+        let am = op.axes_mapping(&[&input, &mask], &[&output]).unwrap();
+        // Input non-reducing axes still track to output.
+        assert_eq!(am.track_axis((InOut::In(0), 0), InOut::Out(0)).unwrap(), Some(0));
+        assert_eq!(am.track_axis((InOut::In(0), 1), InOut::Out(0)).unwrap(), Some(1));
+        // Mask's non-reducing axis (0) aligns with input axis 1.
+        assert_eq!(am.track_axis((InOut::In(1), 0), InOut::In(0)).unwrap(), Some(1));
+        // Mask's reducing axis (1) aligns with input axis 2.
+        assert_eq!(am.track_axis((InOut::In(1), 1), InOut::In(0)).unwrap(), Some(2));
+        // Reducing axes don't reach output.
+        assert_eq!(am.track_axis((InOut::In(0), 2), InOut::Out(0)).unwrap(), None);
+        assert_eq!(am.track_axis((InOut::In(1), 1), InOut::Out(0)).unwrap(), None);
+    }
+
+    /// Scalar bool mask: no axes shared between mask and input/output.
+    /// Input non-reducing axes still track to output; reducing axis still
+    /// disconnected from output.
+    #[test]
+    fn axes_mapping_scalar_bool_mask() {
+        let op = smsoftmax();
+        let input = f32::fact([2usize, 3, 4]);
+        let mask = bool::fact([] as [usize; 0]);
+        let output = f32::fact([2usize, 3, 4]);
+        let am = op.axes_mapping(&[&input, &mask], &[&output]).unwrap();
+        assert_eq!(am.track_axis((InOut::In(0), 0), InOut::Out(0)).unwrap(), Some(0));
+        assert_eq!(am.track_axis((InOut::In(0), 1), InOut::Out(0)).unwrap(), Some(1));
+        assert_eq!(am.track_axis((InOut::In(0), 2), InOut::Out(0)).unwrap(), None);
+    }
+
+    /// SMS::input_roi planted from mask uniform_tdim must remap coord
+    /// symbols across the rank gap.  Mask axis K corresponds to scores
+    /// axis K + rank_diff (right-align broadcast).  Without the remap,
+    /// the planted ROI uses mask-frame indices on a scores-frame fact
+    /// (a silent semantic bug that PropagateRoi then bubbles forward).
+    #[test]
+    fn input_roi_remaps_coord_syms_across_rank_diff() -> TractResult<()> {
+        let mut model = TypedModel::default();
+        // Scores: rank 4 — [B=2, H=8, T_q=4, T_k=4]
+        let scores = model.add_source("scores", f32::fact(&[2usize, 8, 4, 4]))?;
+        // Mask: rank 3 — [B=2, T_q=4, T_k=4], with band uniform_tdim on its
+        // own axis 2 (T_k axis): `Mul(Ge(🎯2, 1), Ge(2, 🎯2))` = `1 ≤ k ≤ 2`.
+        let mask_axis_k = model.symbols.coord_sym(2);
+        let band = TDim::Mul(vec![
+            TDim::Ge(Box::new(TDim::Sym(mask_axis_k.clone())), Box::new(TDim::Val(1))),
+            TDim::Ge(Box::new(TDim::Val(2)), Box::new(TDim::Sym(mask_axis_k))),
+        ]);
+        let mut mask_fact = bool::fact(&[2usize, 4, 4]);
+        mask_fact.uniform_tdim = Some(band);
+        let mask = model.add_source("mask", mask_fact)?;
+        let sms = model.wire_node("sms", smsoftmax(), &[scores, mask])?[0];
+        model.select_output_outlets(&[sms])?;
+
+        let sms_node = &model.nodes()[sms.node];
+        let input_rois = sms_node.op.as_typed().unwrap().input_roi(&model, sms_node)?;
+        let input_rois = input_rois.expect("SMS should return Some");
+        let scores_roi = input_rois[0].as_ref().expect("SMS should plant ROI on scores");
+
+        // After remap, mask axis 2 → scores axis 3 (rank_diff = 1).  The
+        // planted ROI on scores must use 🎯3, not 🎯2.
+        let printed = format!("{scores_roi}");
+        eprintln!("SMS scores ROI: {printed}");
+        let uses_axis_3 = scores_roi
+            .symbols()
+            .iter()
+            .any(|s| tract_nnef::tract_core::ops::logic::sym_to_coord_axis(s) == Some(3));
+        let uses_axis_2 = scores_roi
+            .symbols()
+            .iter()
+            .any(|s| tract_nnef::tract_core::ops::logic::sym_to_coord_axis(s) == Some(2));
+        assert!(
+            uses_axis_3,
+            "expected planted ROI to mention scores axis 3 (= mask axis 2 + rank_diff 1), got {printed}"
+        );
+        assert!(
+            !uses_axis_2,
+            "expected planted ROI to NOT mention scores axis 2 (the un-shifted mask axis 2), got {printed}"
+        );
+        Ok(())
+    }
+}
diff --git a/transformers/src/ops/sdpa.rs b/transformers/src/ops/sdpa.rs
index 69c9732e10..9eee53d686 100644
--- a/transformers/src/ops/sdpa.rs
+++ b/transformers/src/ops/sdpa.rs
@@ -365,9 +365,18 @@ impl TypedOp for Sdpa {
         node: &TypedNode,
     ) -> TractResult<Option<TypedModelPatch>> {
         if self.acc_datum_type.is::<f32>() {
-            let scale = self.scale.as_ref().map(|t| t.cast_to_scalar()).transpose()?;
-            let op = FlashSdpaOp { causal: self.is_causal, scale };
-            TypedModelPatch::replace_single_op(model, node, &node.inputs, op).map(Some)
+            // FlashSdpaOp requires Q and V to share the same head dim (last axis).
+            // When they differ (MLA / diff-head-sizes attention), fall back to the
+            // generic SDPA expansion instead.
+            let q_head_dim = model.outlet_fact(node.inputs[0])?.shape.last().cloned();
+            let v_head_dim = model.outlet_fact(node.inputs[2])?.shape.last().cloned();
+            if q_head_dim == v_head_dim {
+                let scale = self.scale.as_ref().map(|t| t.cast_to_scalar()).transpose()?;
+                let op = FlashSdpaOp { causal: self.is_causal, scale };
+                TypedModelPatch::replace_single_op(model, node, &node.inputs, op).map(Some)
+            } else {
+                self.patch_sdpa(model, node).context("Wiring fallback SDPA (diff head dims)")
+            }
         } else {
             self.patch_sdpa(model, node).context("Wiring fallback SDPA")
         }
diff --git a/transformers/src/rewriter.rs b/transformers/src/rewriter.rs
index 6a95a98b22..8dd8328703 100644
--- a/transformers/src/rewriter.rs
+++ b/transformers/src/rewriter.rs
@@ -89,6 +89,21 @@ impl ModelTransform for UnfoldKeyValueCacheTransform {
     }
 }
 
+#[derive(Debug, Default)]
+pub struct DetectDiagGatherTransform;
+
+impl ModelTransform for DetectDiagGatherTransform {
+    fn name(&self) -> StaticName {
+        "detect_diag_gather".into()
+    }
+
+    fn transform(&self, model: &mut TypedModel) -> TractResult<()> {
+        Rewriter::default()
+            .with_rule_for("detect-diag-gather", ops::diag_gather_rule)
+            .rewrite(&(), model)
+    }
+}
+
 // TODO: This is why Transform should be renamed to Remodel
 #[derive(Debug, Default)]
 pub struct TransformersTransform;
@@ -106,6 +121,7 @@ impl ModelTransform for TransformersTransform {
             .with_rule_for("detect-apply-rope", ops::apply_rope_rule)
             .with_rule_for("detect-scaled-masked-softmax", ops::scaled_masked_softmax_rule)
             .with_rule_for("detect-sdpa-kv-cache-broadcast", ops::fuse_kv_cache_broadcast_rule)
+            .with_rule_for("detect-diag-gather", ops::diag_gather_rule)
             .rewrite(&(), model)
     }
 }