From 8ddf19e141a8cf41a7e16de60c3849145b6960f4 Mon Sep 17 00:00:00 2001
From: czoli1976 <64466170+czoli1976@users.noreply.github.com>
Date: Tue, 28 Apr 2026 07:20:42 +0100
Subject: [PATCH 01/10] DFN3 WASM optimization: bump tract 0.21 -> 0.22.1,
 route tract-linalg via patched fork

Captures the libDF-side changes that land Vonage's DFN3 WASM kernel
investigation (RTF 0.1290 -> 0.0516, -60% / 2.5x faster, audio
bit-identical).

Changes:

1. tract bump 0.21.4 -> 0.22.1 in libDF/Cargo.toml. Required for the
   newer tract-linalg architecture that the kernel kit targets.

2. ndarray re-import via tract_core. tract 0.22.1 vendors ndarray
   under tract_core::ndarray; the older bare `use ndarray::prelude::*;`
   no longer resolves cleanly under the bumped dep. Updated:
     - libDF/src/bin/enhance_wav.rs
     - libDF/src/tract.rs
     - libDF/src/transforms.rs
     - libDF/src/wasm.rs
     - libDF/src/wav_utils.rs

3. m.symbol_table.sym('S') -> m.symbols.sym('S') in libDF/src/tract.rs
   (3 sites: encoder + erb_decoder + df_decoder). API rename in
   tract 0.22.1.

4. Workspace [patch.crates-io] override pointing tract-linalg at a
   local fork that adds six WASM SIMD kernels (4x4 existing + 4x1,
   8x1, 16x1, 8x4, 8x8 new) plus a per-M dispatcher in Ops::mmv_f32.
   Source for the kernels: czoli1976/tract@add-wasm-f32-full-kernel-kit.

5. Cargo.lock updated to reflect deps changes.

Production builds must set RUSTFLAGS=\"-C target-feature=+simd128\"
(Discovery #1 from the investigation: simd128 was never on in
production builds, which kept tract-linalg/src/wasm.rs cfg-gated
out and forced the scalar generic_f32_4x4 path, costing 16% RTF).

See WASM_SIMD_KERNEL_INVESTIGATION.md in the dfn3-wasm-opt-v2.1
worktree for the full investigation log + measurements.
---
 Cargo.lock                   | 1720 ++++++++++++++++++----------------
 Cargo.toml                   |    9 +
 libDF/Cargo.toml             |    8 +-
 libDF/src/bin/enhance_wav.rs |    2 +-
 libDF/src/tract.rs           |    8 +-
 libDF/src/transforms.rs      |    2 +-
 libDF/src/wasm.rs            |    2 +-
 libDF/src/wav_utils.rs       |    2 +-
 8 files changed, 938 insertions(+), 815 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 886c82549..05c665bb2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9,7 +9,7 @@ dependencies = [
  "crossbeam-channel",
  "deep_filter",
  "log",
- "ndarray",
+ "ndarray 0.15.6",
  "numpy",
  "pyo3",
 ]
@@ -19,7 +19,7 @@ name = "DeepFilterLib"
 version = "0.5.7-pre"
 dependencies = [
  "deep_filter",
- "ndarray",
+ "ndarray 0.15.6",
  "numpy",
  "pyo3",
 ]
@@ -62,6 +62,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
 dependencies = [
  "cfg-if",
+ "getrandom",
  "once_cell",
  "version_check",
  "zerocopy",
@@ -111,20 +112,23 @@ dependencies = [
 
 [[package]]
 name = "android-activity"
-version = "0.4.3"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64529721f27c2314ced0890ce45e469574a73e5e6fdd6e9da1860eb29285f5e0"
+checksum = "ee91c0c2905bae44f84bfa4e044536541df26b7703fd0888deeb9060fcc44289"
 dependencies = [
  "android-properties",
- "bitflags 1.3.2",
+ "bitflags 2.5.0",
  "cc",
+ "cesu8",
+ "jni",
  "jni-sys",
  "libc",
  "log",
- "ndk 0.7.0",
+ "ndk 0.8.0",
  "ndk-context",
- "ndk-sys 0.4.1+23.1.7779620",
- "num_enum 0.6.1",
+ "ndk-sys 0.5.0+25.2.9519653",
+ "num_enum",
+ "thiserror",
 ]
 
 [[package]]
@@ -206,6 +210,12 @@ version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"
 
+[[package]]
+name = "anymap3"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "170433209e817da6aae2c51aa0dd443009a613425dd041ebfb2492d1c4c11a25"
+
 [[package]]
 name = "approx"
 version = "0.5.1"
@@ -227,6 +237,12 @@ version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
 
+[[package]]
+name = "as-raw-xcb-connection"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "175571dd1d178ced59193a6fc02dde1b972eb0bc56c892cde9beeceac5bf0f6b"
+
 [[package]]
 name = "ascii"
 version = "1.1.0"
@@ -374,7 +390,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -409,7 +425,7 @@ checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -448,7 +464,7 @@ dependencies = [
  "bitflags 2.5.0",
  "cexpr",
  "clang-sys",
- "itertools 0.12.1",
+ "itertools 0.10.5",
  "lazy_static",
  "lazycell",
  "proc-macro2",
@@ -456,7 +472,7 @@ dependencies = [
  "regex",
  "rustc-hash",
  "shlex",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -515,21 +531,21 @@ dependencies = [
 
 [[package]]
 name = "block-sys"
-version = "0.1.0-beta.1"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fa55741ee90902547802152aaf3f8e5248aab7e21468089560d4c8840561146"
+checksum = "ae85a0696e7ea3b835a453750bf002770776609115e6d25c6d2ff28a8200f7e7"
 dependencies = [
  "objc-sys",
 ]
 
 [[package]]
 name = "block2"
-version = "0.2.0-alpha.6"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8dd9e63c1744f755c2f60332b88de39d341e5e86239014ad839bd71c106dec42"
+checksum = "15b55663a85f33501257357e6421bb33e769d5c9ffb5ba0921c975a123e35e68"
 dependencies = [
  "block-sys",
- "objc2-encode",
+ "objc2 0.4.1",
 ]
 
 [[package]]
@@ -575,7 +591,7 @@ checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -590,20 +606,6 @@ version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9"
 
-[[package]]
-name = "calloop"
-version = "0.10.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52e0d00eb1ea24371a97d2da6201c6747a633dc6dc1988ef503403b4c59504a8"
-dependencies = [
- "bitflags 1.3.2",
- "log",
- "nix 0.25.1",
- "slotmap",
- "thiserror",
- "vec_map 0.8.2",
-]
-
 [[package]]
 name = "calloop"
 version = "0.12.4"
@@ -624,10 +626,10 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0f0ea9b9476c7fad82841a8dbb380e2eae480c21910feba80725b46931ed8f02"
 dependencies = [
- "calloop 0.12.4",
+ "calloop",
  "rustix 0.38.34",
- "wayland-backend 0.3.3",
- "wayland-client 0.31.2",
+ "wayland-backend",
+ "wayland-client",
 ]
 
 [[package]]
@@ -653,7 +655,7 @@ version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
 dependencies = [
- "nom",
+ "nom 7.1.3",
 ]
 
 [[package]]
@@ -710,7 +712,7 @@ dependencies = [
  "heck 0.5.0",
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -727,13 +729,11 @@ checksum = "4bfbf56724aa9eca8afa4fcfadeb479e722935bb2a0900c2d37e0cc477af0688"
 
 [[package]]
 name = "clipboard-win"
-version = "4.5.0"
+version = "5.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7191c27c2357d9b7ef96baac1773290d4ca63b24205b82a3fd8a0637afcf0362"
+checksum = "bde03770d3df201d4fb868f2c9c59e66a3e4e2bd06692a0fe701e7103c7e84d4"
 dependencies = [
  "error-code",
- "str-buf",
- "winapi",
 ]
 
 [[package]]
@@ -763,7 +763,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4274ea815e013e0f9f04a2633423e14194e408a0576c943ce3d14ca56c50031c"
 dependencies = [
  "thiserror",
- "x11rb 0.13.1",
+ "x11rb",
 ]
 
 [[package]]
@@ -775,36 +775,6 @@ dependencies = [
  "cc",
 ]
 
-[[package]]
-name = "cocoa"
-version = "0.24.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f425db7937052c684daec3bd6375c8abe2d146dca4b8b143d6db777c39138f3a"
-dependencies = [
- "bitflags 1.3.2",
- "block",
- "cocoa-foundation",
- "core-foundation",
- "core-graphics",
- "foreign-types",
- "libc",
- "objc",
-]
-
-[[package]]
-name = "cocoa-foundation"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c6234cbb2e4c785b456c0644748b1ac416dd045799740356f8363dfe00c93f7"
-dependencies = [
- "bitflags 1.3.2",
- "block",
- "core-foundation",
- "core-graphics-types",
- "libc",
- "objc",
-]
-
 [[package]]
 name = "codespan-reporting"
 version = "0.11.1"
@@ -828,10 +798,35 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
 
 [[package]]
-name = "com-rs"
-version = "0.2.1"
+name = "com"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e17887fd17353b65b1b2ef1c526c83e26cd72e74f598a8dc1bee13a48f3d9f6"
+dependencies = [
+ "com_macros",
+]
+
+[[package]]
+name = "com_macros"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d375883580a668c7481ea6631fc1a8863e33cc335bf56bfad8d7e6d4b04b13a5"
+dependencies = [
+ "com_macros_support",
+ "proc-macro2",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "com_macros_support"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf43edc576402991846b093a7ca18a3477e0ef9c588cde84964b5d3e43016642"
+checksum = "ad899a1087a9296d5644792d7cb72b8e34c1bec8e7d4fbc002230169a6e8710c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
 
 [[package]]
 name = "combine"
@@ -900,9 +895,9 @@ checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
 
 [[package]]
 name = "core-graphics"
-version = "0.22.3"
+version = "0.23.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2581bbab3b8ffc6fcbd550bf46c355135d16e9ff2a6ea032ad6b9bf1d7efe4fb"
+checksum = "c07782be35f9e1140080c6b96f0d44b739e2278479f64e02fdab4e32dfd8b081"
 dependencies = [
  "bitflags 1.3.2",
  "core-foundation",
@@ -944,16 +939,17 @@ dependencies = [
 
 [[package]]
 name = "cosmic-text"
-version = "0.9.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0b68966c2543609f8d92f9d33ac3b719b2a67529b0c6c0b3e025637b477eef9"
+checksum = "75acbfb314aeb4f5210d379af45ed1ec2c98c7f1790bf57b8a4c562ac0c51b71"
 dependencies = [
- "aliasable",
  "fontdb",
  "libm",
  "log",
  "rangemap",
+ "rustc-hash",
  "rustybuzz",
+ "self_cell",
  "swash",
  "sys-locale",
  "unicode-bidi",
@@ -1053,6 +1049,15 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "ctor"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83cf0d42651b16c6dfe68685716d18480d18a9c39c62d76e8cf3eb6ed5d8bcbf"
+dependencies = [
+ "dtor",
+]
+
 [[package]]
 name = "ctrlc"
 version = "3.4.4"
@@ -1071,12 +1076,12 @@ checksum = "96a6ac251f4a2aca6b3f91340350eab87ae57c3f127ffeb585e92bd336717991"
 
 [[package]]
 name = "d3d12"
-version = "0.6.0"
+version = "0.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d8f0de2f5a8e7bd4a9eec0e3c781992a4ce1724f68aec7d7a3715344de8b39da"
+checksum = "3e3d747f100290a1ca24b752186f61f6637e1deffe3bf6320de6fcb29510a307"
 dependencies = [
- "bitflags 1.3.2",
- "libloading 0.7.4",
+ "bitflags 2.5.0",
+ "libloading 0.8.3",
  "winapi",
 ]
 
@@ -1095,7 +1100,7 @@ dependencies = [
  "event-listener 2.5.3",
  "ladspa",
  "log",
- "ndarray",
+ "ndarray 0.15.6",
  "uuid",
  "zbus",
 ]
@@ -1120,7 +1125,7 @@ dependencies = [
  "js-sys",
  "lewton",
  "log",
- "ndarray",
+ "ndarray 0.15.6",
  "ndarray-rand",
  "num-complex",
  "ogg",
@@ -1130,7 +1135,7 @@ dependencies = [
  "realfft",
  "roots",
  "rstest",
- "rubato",
+ "rubato 0.14.1",
  "rust-ini",
  "rustfft",
  "serde",
@@ -1146,9 +1151,9 @@ dependencies = [
 
 [[package]]
 name = "deranged"
-version = "0.3.11"
+version = "0.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
 dependencies = [
  "powerfmt",
 ]
@@ -1187,11 +1192,11 @@ dependencies = [
  "env_logger 0.10.2",
  "iced",
  "image",
- "itertools 0.11.0",
+ "itertools 0.12.1",
  "log",
- "ndarray",
+ "ndarray 0.15.6",
  "ringbuf",
- "rubato",
+ "rubato 0.15.0",
 ]
 
 [[package]]
@@ -1210,6 +1215,16 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bd0c93bb4b0c6d9b77f4435b0ae98c24d17f1c45b2ff844c6151a07256ca923b"
 
+[[package]]
+name = "dispatch2"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38"
+dependencies = [
+ "bitflags 2.5.0",
+ "objc2 0.6.4",
+]
+
 [[package]]
 name = "dlib"
 version = "0.5.2"
@@ -1228,24 +1243,70 @@ dependencies = [
  "const-random",
 ]
 
-[[package]]
-name = "doc-comment"
-version = "0.3.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
-
 [[package]]
 name = "downcast-rs"
 version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
 
+[[package]]
+name = "drm"
+version = "0.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80bc8c5c6c2941f70a55c15f8d9f00f9710ebda3ffda98075f996a0e6c92756f"
+dependencies = [
+ "bitflags 2.5.0",
+ "bytemuck",
+ "drm-ffi",
+ "drm-fourcc",
+ "libc",
+ "rustix 0.38.34",
+]
+
+[[package]]
+name = "drm-ffi"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51a91c9b32ac4e8105dec255e849e0d66e27d7c34d184364fb93e469db08f690"
+dependencies = [
+ "drm-sys",
+ "rustix 1.1.4",
+]
+
+[[package]]
+name = "drm-fourcc"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0aafbcdb8afc29c1a7ee5fbe53b5d62f4565b35a042a662ca9fecd0b54dae6f4"
+
+[[package]]
+name = "drm-sys"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8e1361066d91f5ffccff060a3c3be9c3ecde15be2959c1937595f7a82a9f8"
+dependencies = [
+ "libc",
+ "linux-raw-sys 0.9.4",
+]
+
+[[package]]
+name = "dtor"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edf234dd1594d6dd434a8fb8cada51ddbbc593e40e4a01556a0b31c62da2775b"
+
 [[package]]
 name = "dyn-clone"
 version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125"
 
+[[package]]
+name = "dyn-hash"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15401da73a9ed8c80e3b2d4dc05fe10e7b72d7243b9f614e516a44fa99986e88"
+
 [[package]]
 name = "either"
 version = "1.11.0"
@@ -1270,7 +1331,7 @@ checksum = "5c785274071b1b420972453b306eeca06acf4633829db4223b58a2a8c5953bc4"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1317,23 +1378,19 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 
 [[package]]
 name = "errno"
-version = "0.3.8"
+version = "0.3.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.52.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "error-code"
-version = "2.3.1"
+version = "3.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64f18991e7bf11e7ffee451b5318b5c1a73c52d0d0ada6e5a3017c8c1ced6a21"
-dependencies = [
- "libc",
- "str-buf",
-]
+checksum = "dea2df4cf52843e0452895c455a1a2cfbb842a1e7329671acf418fdc53ed4c59"
 
 [[package]]
 name = "etagere"
@@ -1490,6 +1547,12 @@ dependencies = [
  "spin",
 ]
 
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
 [[package]]
 name = "font-types"
 version = "0.5.3"
@@ -1499,14 +1562,24 @@ dependencies = [
  "bytemuck",
 ]
 
+[[package]]
+name = "fontconfig-parser"
+version = "0.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbc773e24e02d4ddd8395fd30dc147524273a83e54e0f312d986ea30de5f5646"
+dependencies = [
+ "roxmltree",
+]
+
 [[package]]
 name = "fontdb"
-version = "0.14.1"
+version = "0.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af8d8cbea8f21307d7e84bca254772981296f058a1d36b461bf4d83a7499fc9e"
+checksum = "020e203f177c0fb250fb19455a252e838d2bbbce1f80f25ecc42402aafa8cd38"
 dependencies = [
+ "fontconfig-parser",
  "log",
- "memmap2 0.6.2",
+ "memmap2 0.8.0",
  "slotmap",
  "tinyvec",
  "ttf-parser 0.19.2",
@@ -1514,18 +1587,30 @@ dependencies = [
 
 [[package]]
 name = "foreign-types"
-version = "0.3.2"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965"
 dependencies = [
+ "foreign-types-macros",
  "foreign-types-shared",
 ]
 
+[[package]]
+name = "foreign-types-macros"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "foreign-types-shared"
-version = "0.1.1"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b"
 
 [[package]]
 name = "futures"
@@ -1612,7 +1697,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1661,16 +1746,6 @@ dependencies = [
  "version_check",
 ]
 
-[[package]]
-name = "gethostname"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1ebd34e35c46e00bb73e81363248d627782724609fe1b6396f553f68fe3862e"
-dependencies = [
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "gethostname"
 version = "0.4.3"
@@ -1710,11 +1785,22 @@ version = "0.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
+[[package]]
+name = "gl_generator"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a95dfc23a2b4a9a2f5ab41d194f8bfda3cabec42af4e39f08c339eb2a0c124d"
+dependencies = [
+ "khronos_api",
+ "log",
+ "xml-rs",
+]
+
 [[package]]
 name = "glam"
-version = "0.24.2"
+version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5418c17512bdf42730f9032c74e1ae39afc408745ebb2acf72fbc4691c17945"
+checksum = "151665d9be52f9bb40fc7966565d39666f2d1e69233571b71b87791c7e0528b3"
 
 [[package]]
 name = "glob"
@@ -1724,9 +1810,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
 [[package]]
 name = "glow"
-version = "0.12.3"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca0fe580e4b60a8ab24a868bc08e2f03cbcb20d3d676601fa909386713333728"
+checksum = "bd348e04c43b32574f2de31c8bb397d96c9fcfa1371bd4ca6d8bdc464ab121b1"
 dependencies = [
  "js-sys",
  "slotmap",
@@ -1734,11 +1820,20 @@ dependencies = [
  "web-sys",
 ]
 
+[[package]]
+name = "glutin_wgl_sys"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c8098adac955faa2d31079b65dc48841251f69efd3ac25477903fc424362ead"
+dependencies = [
+ "gl_generator",
+]
+
 [[package]]
 name = "glyphon"
-version = "0.3.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e87caa7459145f5e5f167bf34db4532901404c679e62339fb712a0e3ccf722a"
+checksum = "6a62d0338e4056db6a73221c2fb2e30619452f6ea9651bac4110f51b0f7a7581"
 dependencies = [
  "cosmic-text",
  "etagere",
@@ -1748,34 +1843,34 @@ dependencies = [
 
 [[package]]
 name = "gpu-alloc"
-version = "0.5.4"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22beaafc29b38204457ea030f6fb7a84c9e4dd1b86e311ba0542533453d87f62"
+checksum = "fbcd2dba93594b227a1f57ee09b8b9da8892c34d55aa332e034a228d0fe6a171"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.5.0",
  "gpu-alloc-types",
 ]
 
 [[package]]
 name = "gpu-alloc-types"
-version = "0.2.0"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54804d0d6bc9d7f26db4eaec1ad10def69b599315f487d32c334a80d1efe67a5"
+checksum = "98ff03b468aa837d70984d55f5d3f846f6ec31fe34bbb97c4f85219caeee1ca4"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.5.0",
 ]
 
 [[package]]
 name = "gpu-allocator"
-version = "0.22.0"
+version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce95f9e2e11c2c6fadfce42b5af60005db06576f231f5c92550fdded43c423e8"
+checksum = "6f56f6318968d03c18e1bcf4857ff88c61157e9da8e47c5f29055d60e1228884"
 dependencies = [
- "backtrace",
  "log",
+ "presser",
  "thiserror",
  "winapi",
- "windows 0.44.0",
+ "windows 0.52.0",
 ]
 
 [[package]]
@@ -1821,30 +1916,35 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.12.3"
+version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+dependencies = [
+ "ahash",
+ "allocator-api2",
+]
 
 [[package]]
 name = "hashbrown"
-version = "0.14.5"
+version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
- "ahash",
  "allocator-api2",
+ "equivalent",
+ "foldhash",
 ]
 
 [[package]]
 name = "hassle-rs"
-version = "0.10.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1397650ee315e8891a0df210707f0fc61771b0cc518c3023896064c5407cb3b0"
+checksum = "af2a7e73e1f34c48da31fb668a907f250794837e08faa144fd24f0b8b741e890"
 dependencies = [
- "bitflags 1.3.2",
- "com-rs",
+ "bitflags 2.5.0",
+ "com",
  "libc",
- "libloading 0.7.4",
+ "libloading 0.8.3",
  "thiserror",
  "widestring",
  "winapi",
@@ -1862,8 +1962,8 @@ dependencies = [
  "hdf5-types",
  "lazy_static",
  "libc",
- "ndarray",
- "parking_lot 0.12.2",
+ "ndarray 0.15.6",
+ "parking_lot 0.12.5",
  "paste",
 ]
 
@@ -1875,7 +1975,7 @@ dependencies = [
  "proc-macro-error",
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -1956,9 +2056,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
 
 [[package]]
 name = "iced"
-version = "0.10.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c708807ec86f99dd729dc4d42db5239acf118cec14d3c5f57679dcfdbbc472b1"
+checksum = "7d4eb0fbbefb8c428b70680e77ed9013887b17c1d6be366b40f264f956d1a096"
 dependencies = [
  "iced_core",
  "iced_futures",
@@ -1971,23 +2071,27 @@ dependencies = [
 
 [[package]]
 name = "iced_core"
-version = "0.10.0"
+version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64d0bc4fbf018576d08d93f838e6058cc6f10bbc05e04ae249a2a44dffb4ebc8"
+checksum = "7d7e6bbd197f311ed3d8b71651876b0ce01318fde52cda862a9a7a4373c9b930"
 dependencies = [
- "bitflags 1.3.2",
- "instant",
+ "bitflags 2.5.0",
+ "glam",
  "log",
+ "num-traits",
  "palette",
+ "raw-window-handle",
+ "smol_str",
  "thiserror",
- "twox-hash",
+ "web-time",
+ "xxhash-rust",
 ]
 
 [[package]]
 name = "iced_futures"
-version = "0.7.0"
+version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14dab0054a9c7a1cbce227a8cd9ee4a094497b3d06094551ac6c1488d563802e"
+checksum = "370bad88fb3832cbeeb3fa6c486b4701fb7e8da32a753b3101d4ce81fc1d9497"
 dependencies = [
  "futures",
  "iced_core",
@@ -1999,52 +2103,57 @@ dependencies = [
 
 [[package]]
 name = "iced_graphics"
-version = "0.9.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67ff14447a221e9e9205a13d84d7bbdf0636a3b1daa02cfca690ed09689c4d2b"
+checksum = "6a044c193ef0840eacabfa05424717331d1fc5b3ecb9a89316200c75da2ba9a4"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.5.0",
  "bytemuck",
- "glam",
+ "cosmic-text",
  "half",
  "iced_core",
+ "iced_futures",
  "image",
  "kamadak-exif",
  "log",
+ "once_cell",
  "raw-window-handle",
+ "rustc-hash",
  "thiserror",
+ "unicode-segmentation",
+ "xxhash-rust",
 ]
 
 [[package]]
 name = "iced_renderer"
-version = "0.1.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1033385b0db0099a0d13178c9ff93c1ce11e7d0177522acf578bf79febdb2af8"
+checksum = "5c281e03001d566058f53dec9325bbe61c62da715341206d2627f57a3ecc7f69"
 dependencies = [
  "iced_graphics",
  "iced_tiny_skia",
  "iced_wgpu",
  "log",
- "raw-window-handle",
  "thiserror",
 ]
 
 [[package]]
 name = "iced_runtime"
-version = "0.1.1"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c6c89853e1250c6fac82c5015fa2144517be9b33d4b8e456f10e198b23e28bd"
+checksum = "a79f852c01cc6d61663c94379cb3974ac3ad315a28c504e847d573e094f46822"
 dependencies = [
  "iced_core",
  "iced_futures",
+ "raw-window-handle",
  "thiserror",
 ]
 
 [[package]]
 name = "iced_style"
-version = "0.9.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d85c47d9d13e2281f75ddf98c865daf2101632bd2b855c401dd0b1c8b81a31a0"
+checksum = "2ea42a740915d2a5a9ff9c3aa0bca28b16e9fb660bc8f675eed71d186cadb579"
 dependencies = [
  "iced_core",
  "once_cell",
@@ -2053,29 +2162,28 @@ dependencies = [
 
 [[package]]
 name = "iced_tiny_skia"
-version = "0.1.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c7715f6222c9470bbbd75a39f70478fa0d1bdfb81a377a34fd1b090ffccc480b"
+checksum = "8c2228781f4d381a1cbbd7905a9f077351aa8d37269094021d5d9e779f130aff"
 dependencies = [
  "bytemuck",
  "cosmic-text",
  "iced_graphics",
  "kurbo",
  "log",
- "raw-window-handle",
  "rustc-hash",
  "softbuffer",
- "tiny-skia 0.10.0",
- "twox-hash",
+ "tiny-skia",
+ "xxhash-rust",
 ]
 
 [[package]]
 name = "iced_wgpu"
-version = "0.11.1"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "703f7c5de46b997ed7b18e05ec67059dcdf3beeac51e917c21071b021bb848b9"
+checksum = "e3c243b6700452886aac1ee1987e84d9fb43b56b53fea9a1eb67713fd0fde244"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.5.0",
  "bytemuck",
  "futures",
  "glam",
@@ -2084,17 +2192,14 @@ dependencies = [
  "iced_graphics",
  "log",
  "once_cell",
- "raw-window-handle",
- "rustc-hash",
- "twox-hash",
  "wgpu",
 ]
 
 [[package]]
 name = "iced_widget"
-version = "0.1.3"
+version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a177219ae51c3ba08f228ab932354b360cc669e94aec50c01e7c9b675f074c7c"
+checksum = "7e01b2212adecf1cb80e2267f302c0e0c263e55f97812056949199ccf9f0b908"
 dependencies = [
  "iced_renderer",
  "iced_runtime",
@@ -2107,22 +2212,33 @@ dependencies = [
 
 [[package]]
 name = "iced_winit"
-version = "0.10.1"
+version = "0.12.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad0c884bcb14722a57192b40a5ef6b5e170fa2f01fe2ff28d6cdd9efe37acf70"
+checksum = "63f66831d0e399b93f631739121a6171780d344b275d56808b9504d8ca75c7d2"
 dependencies = [
  "iced_graphics",
  "iced_runtime",
  "iced_style",
  "log",
- "raw-window-handle",
  "thiserror",
+ "tracing",
  "web-sys",
  "winapi",
  "window_clipboard",
  "winit",
 ]
 
+[[package]]
+name = "icrate"
+version = "0.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d3aaff8a54577104bafdf686ff18565c3b6903ca5782a2026ef06e2c7aa319"
+dependencies = [
+ "block2",
+ "dispatch",
+ "objc2 0.4.1",
+]
+
 [[package]]
 name = "image"
 version = "0.24.9"
@@ -2141,16 +2257,6 @@ dependencies = [
  "tiff",
 ]
 
-[[package]]
-name = "indexmap"
-version = "1.9.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
-dependencies = [
- "autocfg",
- "hashbrown 0.12.3",
-]
-
 [[package]]
 name = "indexmap"
 version = "2.2.6"
@@ -2174,9 +2280,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
 dependencies = [
  "cfg-if",
- "js-sys",
- "wasm-bindgen",
- "web-sys",
 ]
 
 [[package]]
@@ -2218,18 +2321,18 @@ dependencies = [
 
 [[package]]
 name = "itertools"
-version = "0.11.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
 dependencies = [
  "either",
 ]
 
 [[package]]
 name = "itertools"
-version = "0.12.1"
+version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
 dependencies = [
  "either",
 ]
@@ -2320,15 +2423,21 @@ dependencies = [
 
 [[package]]
 name = "khronos-egl"
-version = "4.1.0"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c2352bd1d0bceb871cb9d40f24360c8133c11d7486b68b5381c1dd1a32015e3"
+checksum = "6aae1df220ece3c0ada96b8153459b67eebe9ae9212258bb0134ae60416fdf76"
 dependencies = [
  "libc",
- "libloading 0.7.4",
+ "libloading 0.8.3",
  "pkg-config",
 ]
 
+[[package]]
+name = "khronos_api"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2db585e1d738fc771bf08a151420d3ed193d9d895a36df7f6f8a9456b911ddc"
+
 [[package]]
 name = "kstring"
 version = "2.0.0"
@@ -2341,11 +2450,12 @@ dependencies = [
 
 [[package]]
 name = "kurbo"
-version = "0.9.5"
+version = "0.10.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd85a5776cd9500c2e2059c8c76c3b01528566b7fcbaf8098b55a33fc298849b"
+checksum = "1618d4ebd923e97d67e7cd363d80aef35fe961005cbbbb3d2dad8bdd1bc63440"
 dependencies = [
  "arrayvec",
+ "smallvec",
 ]
 
 [[package]]
@@ -2356,14 +2466,14 @@ checksum = "6197e2fb8a3da99eca216e9689b47465b23cfe09e1a1ddc720fa1acdd54aa267"
 dependencies = [
  "bitflags 0.8.2",
  "libc",
- "vec_map 0.7.0",
+ "vec_map",
 ]
 
 [[package]]
 name = "lazy_static"
-version = "1.4.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 
 [[package]]
 name = "lazycell"
@@ -2390,9 +2500,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.154"
+version = "0.2.186"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
 
 [[package]]
 name = "libloading"
@@ -2411,14 +2521,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.5",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
 name = "libm"
-version = "0.2.8"
+version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
 [[package]]
 name = "libredox"
@@ -2443,13 +2553,24 @@ version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
 
+[[package]]
+name = "linux-raw-sys"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
+
 [[package]]
 name = "liquid"
-version = "0.26.4"
+version = "0.26.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69f68ae1011499ae2ef879f631891f21c78e309755f4a5e483c4a8f12e10b609"
+checksum = "2a494c3f9dad3cb7ed16f1c51812cbe4b29493d6c2e5cd1e2b87477263d9534d"
 dependencies = [
- "doc-comment",
  "liquid-core",
  "liquid-derive",
  "liquid-lib",
@@ -2458,15 +2579,14 @@ dependencies = [
 
 [[package]]
 name = "liquid-core"
-version = "0.26.4"
+version = "0.26.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79e0724dfcaad5cfb7965ea0f178ca0870b8d7315178f4a7179f5696f7f04d5f"
+checksum = "fc623edee8a618b4543e8e8505584f4847a4e51b805db1af6d9af0a3395d0d57"
 dependencies = [
  "anymap2",
- "itertools 0.10.5",
+ "itertools 0.14.0",
  "kstring",
  "liquid-derive",
- "num-traits",
  "pest",
  "pest_derive",
  "regex",
@@ -2476,24 +2596,23 @@ dependencies = [
 
 [[package]]
 name = "liquid-derive"
-version = "0.26.4"
+version = "0.26.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc2fb41a9bb4257a3803154bdf7e2df7d45197d1941c9b1a90ad815231630721"
+checksum = "de66c928222984aea59fcaed8ba627f388aaac3c1f57dcb05cc25495ef8faefe"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "liquid-lib"
-version = "0.26.4"
+version = "0.26.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2a17e273a6fb1fb6268f7a5867ddfd0bd4683c7e19b51084f3d567fad4348c0"
+checksum = "9befeedd61f5995bc128c571db65300aeb50d62e4f0542c88282dbcb5f72372a"
 dependencies = [
- "itertools 0.10.5",
+ "itertools 0.14.0",
  "liquid-core",
- "once_cell",
  "percent-encoding",
  "regex",
  "time",
@@ -2502,11 +2621,10 @@ dependencies = [
 
 [[package]]
 name = "lock_api"
-version = "0.4.12"
+version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
 dependencies = [
- "autocfg",
  "scopeguard",
 ]
 
@@ -2518,11 +2636,11 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
 
 [[package]]
 name = "lru"
-version = "0.11.1"
+version = "0.12.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21"
+checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
 dependencies = [
- "hashbrown 0.14.5",
+ "hashbrown 0.15.5",
 ]
 
 [[package]]
@@ -2567,18 +2685,9 @@ checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
 
 [[package]]
 name = "memmap2"
-version = "0.5.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "memmap2"
-version = "0.6.2"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d28bba84adfe6646737845bc5ebbfa2c08424eb1c37e94a1fd2a82adb56a872"
+checksum = "43a5a03cefb0d953ec0be133036f14e109412fa594edc2f77227249db66cc3ed"
 dependencies = [
  "libc",
 ]
@@ -2592,15 +2701,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "memoffset"
-version = "0.6.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
-dependencies = [
- "autocfg",
-]
-
 [[package]]
 name = "memoffset"
 version = "0.7.1"
@@ -2621,16 +2721,17 @@ dependencies = [
 
 [[package]]
 name = "metal"
-version = "0.24.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de11355d1f6781482d027a3b4d4de7825dcedb197bf573e0596d00008402d060"
+checksum = "c43f73953f8cbe511f021b58f18c3ce1c3d1ae13fe953293e13345bf83217f25"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.5.0",
  "block",
  "core-graphics-types",
  "foreign-types",
  "log",
  "objc",
+ "paste",
 ]
 
 [[package]]
@@ -2649,18 +2750,6 @@ dependencies = [
  "simd-adler32",
 ]
 
-[[package]]
-name = "mio"
-version = "0.8.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
-dependencies = [
- "libc",
- "log",
- "wasi",
- "windows-sys 0.48.0",
-]
-
 [[package]]
 name = "mutate_once"
 version = "0.1.1"
@@ -2669,15 +2758,15 @@ checksum = "16cf681a23b4d0a43fc35024c176437f9dcd818db34e0f42ab456a0ee5ad497b"
 
 [[package]]
 name = "naga"
-version = "0.12.3"
+version = "0.19.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbcc2e0513220fd2b598e6068608d4462db20322c0e77e47f6f488dfcfc279cb"
+checksum = "50e3524642f53d9af419ab5e8dd29d3ba155708267667c2f3f06c88c9e130843"
 dependencies = [
  "bit-set",
- "bitflags 1.3.2",
+ "bitflags 2.5.0",
  "codespan-reporting",
  "hexf-parse",
- "indexmap 1.9.3",
+ "indexmap",
  "log",
  "num-traits",
  "rustc-hash",
@@ -2701,42 +2790,59 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "ndarray"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
+dependencies = [
+ "matrixmultiply",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "portable-atomic",
+ "portable-atomic-util",
+ "rawpointer",
+]
+
 [[package]]
 name = "ndarray-rand"
 version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "65608f937acc725f5b164dcf40f4f0bc5d67dc268ab8a649d3002606718c4588"
 dependencies = [
- "ndarray",
+ "ndarray 0.15.6",
  "rand",
  "rand_distr",
 ]
 
 [[package]]
 name = "ndk"
-version = "0.7.0"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "451422b7e4718271c8b5b3aadf5adedba43dc76312454b387e98fae0fc951aa0"
+checksum = "2076a31b7010b17a38c01907c45b945e8f11495ee4dd588309718901b1f7a5b7"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.5.0",
  "jni-sys",
- "ndk-sys 0.4.1+23.1.7779620",
- "num_enum 0.5.11",
+ "log",
+ "ndk-sys 0.5.0+25.2.9519653",
+ "num_enum",
  "raw-window-handle",
  "thiserror",
 ]
 
 [[package]]
 name = "ndk"
-version = "0.8.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2076a31b7010b17a38c01907c45b945e8f11495ee4dd588309718901b1f7a5b7"
+checksum = "c3f42e7bbe13d351b6bead8286a43aac9534b82bd3cc43e47037f012ebfd62d4"
 dependencies = [
  "bitflags 2.5.0",
  "jni-sys",
  "log",
- "ndk-sys 0.5.0+25.2.9519653",
- "num_enum 0.7.2",
+ "ndk-sys 0.6.0+11769913",
+ "num_enum",
+ "raw-window-handle",
  "thiserror",
 ]
 
@@ -2746,15 +2852,6 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "27b02d87554356db9e9a873add8782d4ea6e3e58ea071a9adb9a2e8ddb884a8b"
 
-[[package]]
-name = "ndk-sys"
-version = "0.4.1+23.1.7779620"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cf2aae958bd232cac5069850591667ad422d263686d75b52a065f9badeee5a3"
-dependencies = [
- "jni-sys",
-]
-
 [[package]]
 name = "ndk-sys"
 version = "0.5.0+25.2.9519653"
@@ -2765,28 +2862,12 @@ dependencies = [
 ]
 
 [[package]]
-name = "nix"
-version = "0.24.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa52e972a9a719cecb6864fb88568781eb706bac2cd1d4f04a648542dbf78069"
-dependencies = [
- "bitflags 1.3.2",
- "cfg-if",
- "libc",
- "memoffset 0.6.5",
-]
-
-[[package]]
-name = "nix"
-version = "0.25.1"
+name = "ndk-sys"
+version = "0.6.0+11769913"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4"
+checksum = "ee6cda3051665f1fb8d9e08fc35c96d5a244fb1be711a03b71118828afc9a873"
 dependencies = [
- "autocfg",
- "bitflags 1.3.2",
- "cfg-if",
- "libc",
- "memoffset 0.6.5",
+ "jni-sys",
 ]
 
 [[package]]
@@ -2799,7 +2880,6 @@ dependencies = [
  "cfg-if",
  "libc",
  "memoffset 0.7.1",
- "pin-utils",
 ]
 
 [[package]]
@@ -2824,6 +2904,24 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "nom"
+version = "8.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "nom-language"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2de2bc5b451bfedaef92c90b8939a8fff5770bdcc1fafd6239d086aab8fa6b29"
+dependencies = [
+ "nom 8.0.0",
+]
+
 [[package]]
 name = "num-complex"
 version = "0.4.5"
@@ -2836,9 +2934,9 @@ dependencies = [
 
 [[package]]
 name = "num-conv"
-version = "0.1.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
 
 [[package]]
 name = "num-derive"
@@ -2848,7 +2946,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -2880,55 +2978,13 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "num_enum"
-version = "0.5.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f646caf906c20226733ed5b1374287eb97e3c2a5c227ce668c1f2ce20ae57c9"
-dependencies = [
- "num_enum_derive 0.5.11",
-]
-
-[[package]]
-name = "num_enum"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a015b430d3c108a207fd776d2e2196aaf8b1cf8cf93253e3a097ff3085076a1"
-dependencies = [
- "num_enum_derive 0.6.1",
-]
-
 [[package]]
 name = "num_enum"
 version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845"
 dependencies = [
- "num_enum_derive 0.7.2",
-]
-
-[[package]]
-name = "num_enum_derive"
-version = "0.5.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799"
-dependencies = [
- "proc-macro-crate 1.3.1",
- "proc-macro2",
- "quote",
- "syn 1.0.109",
-]
-
-[[package]]
-name = "num_enum_derive"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96667db765a921f7b295ffee8b60472b686a51d4f21c2ee4ffdb94c7013b65a6"
-dependencies = [
- "proc-macro-crate 1.3.1",
- "proc-macro2",
- "quote",
- "syn 2.0.60",
+ "num_enum_derive",
 ]
 
 [[package]]
@@ -2940,7 +2996,7 @@ dependencies = [
  "proc-macro-crate 3.1.0",
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -2950,7 +3006,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef41cbb417ea83b30525259e30ccef6af39b31c240bda578889494c5392d331"
 dependencies = [
  "libc",
- "ndarray",
+ "ndarray 0.15.6",
  "num-complex",
  "num-integer",
  "num-traits",
@@ -2981,28 +3037,97 @@ dependencies = [
 
 [[package]]
 name = "objc-sys"
-version = "0.2.0-beta.2"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df3b9834c1e95694a05a828b59f55fa2afec6288359cda67146126b3f90a55d7"
+checksum = "cdb91bdd390c7ce1a8607f35f3ca7151b65afc0ff5ff3b34fa350f7d7c7e4310"
 
 [[package]]
 name = "objc2"
-version = "0.3.0-beta.3.patch-leaks.3"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e01640f9f2cb1220bbe80325e179e532cb3379ebcd1bf2279d703c19fe3a468"
+checksum = "559c5a40fdd30eb5e344fbceacf7595a81e242529fb4e21cf5f43fb4f11ff98d"
 dependencies = [
- "block2",
  "objc-sys",
- "objc2-encode",
+ "objc2-encode 3.0.0",
+]
+
+[[package]]
+name = "objc2"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f"
+dependencies = [
+ "objc2-encode 4.1.0",
+]
+
+[[package]]
+name = "objc2-core-foundation"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536"
+dependencies = [
+ "bitflags 2.5.0",
+ "dispatch2",
+ "objc2 0.6.4",
+]
+
+[[package]]
+name = "objc2-core-graphics"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e022c9d066895efa1345f8e33e584b9f958da2fd4cd116792e15e07e4720a807"
+dependencies = [
+ "bitflags 2.5.0",
+ "dispatch2",
+ "objc2 0.6.4",
+ "objc2-core-foundation",
+ "objc2-io-surface",
 ]
 
 [[package]]
 name = "objc2-encode"
-version = "2.0.0-pre.2"
+version = "3.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abfcac41015b00a120608fdaa6938c44cb983fee294351cc4bac7638b4e50512"
+checksum = "d079845b37af429bfe5dfa76e6d087d788031045b25cfc6fd898486fd9847666"
+
+[[package]]
+name = "objc2-encode"
+version = "4.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33"
+
+[[package]]
+name = "objc2-foundation"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272"
 dependencies = [
- "objc-sys",
+ "bitflags 2.5.0",
+ "objc2 0.6.4",
+ "objc2-core-foundation",
+]
+
+[[package]]
+name = "objc2-io-surface"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "180788110936d59bab6bd83b6060ffdfffb3b922ba1396b312ae795e1de9d81d"
+dependencies = [
+ "bitflags 2.5.0",
+ "objc2 0.6.4",
+ "objc2-core-foundation",
+]
+
+[[package]]
+name = "objc2-quartz-core"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96c1358452b371bf9f104e21ec536d37a650eb10f7ee379fff67d2e08d537f1f"
+dependencies = [
+ "bitflags 2.5.0",
+ "objc2 0.6.4",
+ "objc2-core-foundation",
+ "objc2-foundation",
 ]
 
 [[package]]
@@ -3101,9 +3226,9 @@ dependencies = [
 
 [[package]]
 name = "ouroboros"
-version = "0.17.2"
+version = "0.18.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2ba07320d39dfea882faa70554b4bd342a5f273ed59ba7c1c6b4c840492c954"
+checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59"
 dependencies = [
  "aliasable",
  "ouroboros_macro",
@@ -3112,15 +3237,15 @@ dependencies = [
 
 [[package]]
 name = "ouroboros_macro"
-version = "0.17.2"
+version = "0.18.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec4c6225c69b4ca778c0aea097321a64c421cf4577b331c61b229267edabb6f8"
+checksum = "3c7028bdd3d43083f6d8d4d5187680d0d3560d54df4cc9d752005268b41e64d0"
 dependencies = [
  "heck 0.4.1",
- "proc-macro-error",
  "proc-macro2",
+ "proc-macro2-diagnostics",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3153,7 +3278,7 @@ dependencies = [
  "by_address",
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3175,12 +3300,12 @@ dependencies = [
 
 [[package]]
 name = "parking_lot"
-version = "0.12.2"
+version = "0.12.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e4af0ca4f6caed20e900d564c242b8e5d4903fdacf31d3daf527b66fe6f42fb"
+checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
 dependencies = [
  "lock_api",
- "parking_lot_core 0.9.10",
+ "parking_lot_core 0.9.12",
 ]
 
 [[package]]
@@ -3199,15 +3324,15 @@ dependencies = [
 
 [[package]]
 name = "parking_lot_core"
-version = "0.9.10"
+version = "0.9.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
 dependencies = [
  "cfg-if",
  "libc",
  "redox_syscall 0.5.1",
  "smallvec",
- "windows-targets 0.52.5",
+ "windows-link",
 ]
 
 [[package]]
@@ -3216,6 +3341,12 @@ version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
 
+[[package]]
+name = "pastey"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec"
+
 [[package]]
 name = "percent-encoding"
 version = "2.3.1"
@@ -3253,7 +3384,7 @@ dependencies = [
  "pest_meta",
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3297,7 +3428,7 @@ dependencies = [
  "phf_shared",
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3388,6 +3519,15 @@ version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0"
 
+[[package]]
+name = "portable-atomic-util"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618"
+dependencies = [
+ "portable-atomic",
+]
+
 [[package]]
 name = "powerfmt"
 version = "0.2.0"
@@ -3400,6 +3540,12 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
+[[package]]
+name = "presser"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8cf8e6a8aa66ce33f63993ffc4ea4271eb5b0530a9002db8455ea6050c77bfa"
+
 [[package]]
 name = "primal-check"
 version = "0.3.3"
@@ -3437,7 +3583,6 @@ dependencies = [
  "proc-macro-error-attr",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
  "version_check",
 ]
 
@@ -3454,13 +3599,26 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.81"
+version = "1.0.106"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
 dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "proc-macro2-diagnostics"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "version_check",
+ "yansi",
+]
+
 [[package]]
 name = "profiling"
 version = "1.0.15"
@@ -3500,7 +3658,7 @@ dependencies = [
  "indoc",
  "libc",
  "memoffset 0.9.1",
- "parking_lot 0.12.2",
+ "parking_lot 0.11.2",
  "portable-atomic",
  "pyo3-build-config",
  "pyo3-ffi",
@@ -3537,7 +3695,7 @@ dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3550,7 +3708,7 @@ dependencies = [
  "proc-macro2",
  "pyo3-build-config",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -3562,15 +3720,6 @@ dependencies = [
  "bytemuck",
 ]
 
-[[package]]
-name = "quick-xml"
-version = "0.28.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ce5e73202a820a31f8a0ee32ada5e21029c81fd9e3ebf668a40832e4219d9d1"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "quick-xml"
 version = "0.31.0"
@@ -3652,9 +3801,9 @@ checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
 
 [[package]]
 name = "raw-window-handle"
-version = "0.5.2"
+version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2ff9a1f06a88b01621b7ae906ef0211290d1c8a168a15542486a8f61c0833b9"
+checksum = "20675572f6f24e9e76ef639bc5552774ed45f1c30e2951e1e99c59888861c539"
 
 [[package]]
 name = "rawpointer"
@@ -3793,6 +3942,12 @@ version = "0.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1c36d2bbc763f480668d6d6790ae2fdd2e52ac0c21a3a26d156f3534a3d9eea9"
 
+[[package]]
+name = "roxmltree"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c20b6793b5c2fa6553b250154b78d6d0db37e72700ae35fad9387a46f487c97"
+
 [[package]]
 name = "rstest"
 version = "0.19.0"
@@ -3818,7 +3973,7 @@ dependencies = [
  "regex",
  "relative-path",
  "rustc_version",
- "syn 2.0.60",
+ "syn 2.0.117",
  "unicode-ident",
 ]
 
@@ -3834,6 +3989,18 @@ dependencies = [
  "realfft",
 ]
 
+[[package]]
+name = "rubato"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5d18b486e7d29a408ef3f825bc1327d8f87af091c987ca2f5b734625940e234"
+dependencies = [
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "realfft",
+]
+
 [[package]]
 name = "rust-ini"
 version = "0.21.0"
@@ -3908,20 +4075,39 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "rustix"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
+dependencies = [
+ "bitflags 2.5.0",
+ "errno",
+ "libc",
+ "linux-raw-sys 0.12.1",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
 [[package]]
 name = "rustybuzz"
-version = "0.8.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82eea22c8f56965eeaf3a209b3d24508256c7b920fb3b6211b8ba0f7c0583250"
+checksum = "2ee8fe2a8461a0854a37101fe7a1b13998d0cfa987e43248e81d2a5f4570f6fa"
 dependencies = [
  "bitflags 1.3.2",
  "bytemuck",
  "libm",
  "smallvec",
- "ttf-parser 0.19.2",
+ "ttf-parser 0.20.0",
  "unicode-bidi-mirroring",
  "unicode-ccc",
- "unicode-general-category",
+ "unicode-properties",
  "unicode-script",
 ]
 
@@ -3931,6 +4117,16 @@ version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1"
 
+[[package]]
+name = "safetensors"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "172dd94c5a87b5c79f945c863da53b2ebc7ccef4eca24ac63cca66a41aab2178"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "same-file"
 version = "1.0.6"
@@ -3963,17 +4159,23 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
 [[package]]
 name = "sctk-adwaita"
-version = "0.5.4"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cda4e97be1fd174ccc2aae81c8b694e803fa99b34e8fd0f057a9d70698e3ed09"
+checksum = "70b31447ca297092c5a9916fc3b955203157b37c19ca8edde4f52e9843e602c7"
 dependencies = [
  "ab_glyph",
  "log",
- "memmap2 0.5.10",
- "smithay-client-toolkit 0.16.1",
- "tiny-skia 0.8.4",
+ "memmap2 0.9.4",
+ "smithay-client-toolkit",
+ "tiny-skia",
 ]
 
+[[package]]
+name = "self_cell"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b12e76d157a900eb52e81bc6e9f3069344290341720e9178cde2407113ac8d89"
+
 [[package]]
 name = "semver"
 version = "1.0.22"
@@ -3982,22 +4184,32 @@ checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca"
 
 [[package]]
 name = "serde"
-version = "1.0.200"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.200"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "856f046b9400cee3c8c94ed572ecdb752444c24528c035cd35882aad6f492bcb"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4019,7 +4231,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4095,25 +4307,6 @@ version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
 
-[[package]]
-name = "smithay-client-toolkit"
-version = "0.16.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "870427e30b8f2cbe64bf43ec4b86e88fe39b0a84b3f15efd9c9c2d020bc86eb9"
-dependencies = [
- "bitflags 1.3.2",
- "calloop 0.10.6",
- "dlib",
- "lazy_static",
- "log",
- "memmap2 0.5.10",
- "nix 0.24.3",
- "pkg-config",
- "wayland-client 0.29.5",
- "wayland-cursor 0.29.5",
- "wayland-protocols 0.29.5",
-]
-
 [[package]]
 name = "smithay-client-toolkit"
 version = "0.18.1"
@@ -4121,7 +4314,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "922fd3eeab3bd820d76537ce8f582b1cf951eceb5475c28500c7457d9d17f53a"
 dependencies = [
  "bitflags 2.5.0",
- "calloop 0.12.4",
+ "calloop",
  "calloop-wayland-source",
  "cursor-icon",
  "libc",
@@ -4129,13 +4322,13 @@ dependencies = [
  "memmap2 0.9.4",
  "rustix 0.38.34",
  "thiserror",
- "wayland-backend 0.3.3",
- "wayland-client 0.31.2",
+ "wayland-backend",
+ "wayland-client",
  "wayland-csd-frame",
- "wayland-cursor 0.31.1",
- "wayland-protocols 0.31.2",
+ "wayland-cursor",
+ "wayland-protocols",
  "wayland-protocols-wlr",
- "wayland-scanner 0.31.1",
+ "wayland-scanner",
  "xkeysym",
 ]
 
@@ -4146,8 +4339,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c091e7354ea8059d6ad99eace06dd13ddeedbb0ac72d40a9a6e7ff790525882d"
 dependencies = [
  "libc",
- "smithay-client-toolkit 0.18.1",
- "wayland-backend 0.3.3",
+ "smithay-client-toolkit",
+ "wayland-backend",
+]
+
+[[package]]
+name = "smol_str"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd538fb6910ac1099850255cf94a94df6551fbdd602454387d0adb2d1ca6dead"
+dependencies = [
+ "serde",
 ]
 
 [[package]]
@@ -4162,30 +4364,34 @@ dependencies = [
 
 [[package]]
 name = "softbuffer"
-version = "0.2.1"
+version = "0.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c2b953f6ba7285f0af131eb748aabd8ddaf53e0b81dda3ba5d803b0847d6559f"
+checksum = "aac18da81ebbf05109ab275b157c22a653bb3c12cf884450179942f81bcbf6c3"
 dependencies = [
+ "as-raw-xcb-connection",
  "bytemuck",
- "cfg_aliases",
- "cocoa",
- "core-graphics",
- "fastrand 1.9.0",
- "foreign-types",
- "log",
- "nix 0.26.4",
- "objc",
+ "drm",
+ "fastrand 2.1.0",
+ "js-sys",
+ "memmap2 0.9.4",
+ "ndk 0.9.0",
+ "objc2 0.6.4",
+ "objc2-core-foundation",
+ "objc2-core-graphics",
+ "objc2-foundation",
+ "objc2-quartz-core",
  "raw-window-handle",
- "redox_syscall 0.3.5",
- "thiserror",
+ "redox_syscall 0.5.1",
+ "rustix 1.1.4",
+ "tiny-xlib",
+ "tracing",
  "wasm-bindgen",
- "wayland-backend 0.1.2",
- "wayland-client 0.30.2",
- "wayland-sys 0.30.1",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-sys",
  "web-sys",
- "windows-sys 0.48.0",
- "x11-dl",
- "x11rb 0.11.1",
+ "windows-sys 0.61.2",
+ "x11rb",
 ]
 
 [[package]]
@@ -4199,12 +4405,11 @@ dependencies = [
 
 [[package]]
 name = "spirv"
-version = "0.2.0+1.5.4"
+version = "0.3.0+sdk-1.3.268.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "246bfa38fe3db3f1dfc8ca5a2cdeb7348c78be2112740cc0ec8ef18b6d94f830"
+checksum = "eda41003dc44290527a59b13432d4a0379379fa074b70174882adfbdfd917844"
 dependencies = [
- "bitflags 1.3.2",
- "num-traits",
+ "bitflags 2.5.0",
 ]
 
 [[package]]
@@ -4213,12 +4418,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
-[[package]]
-name = "str-buf"
-version = "1.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0"
-
 [[package]]
 name = "strength_reduce"
 version = "0.2.4"
@@ -4278,9 +4477,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.60"
+version = "2.0.117"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -4351,7 +4550,7 @@ checksum = "d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
@@ -4367,30 +4566,30 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.36"
+version = "0.3.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
 dependencies = [
  "deranged",
  "itoa",
  "num-conv",
  "powerfmt",
- "serde",
+ "serde_core",
  "time-core",
  "time-macros",
 ]
 
 [[package]]
 name = "time-core"
-version = "0.1.2"
+version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
 
 [[package]]
 name = "time-macros"
-version = "0.2.18"
+version = "0.2.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
 dependencies = [
  "num-conv",
  "time-core",
@@ -4399,31 +4598,17 @@ dependencies = [
 [[package]]
 name = "tiny-keccak"
 version = "2.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
-dependencies = [
- "crunchy",
-]
-
-[[package]]
-name = "tiny-skia"
-version = "0.8.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df8493a203431061e901613751931f047d1971337153f96d0e5e363d6dbf6a67"
-dependencies = [
- "arrayref",
- "arrayvec",
- "bytemuck",
- "cfg-if",
- "png",
- "tiny-skia-path 0.8.4",
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
+dependencies = [
+ "crunchy",
 ]
 
 [[package]]
 name = "tiny-skia"
-version = "0.10.0"
+version = "0.11.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7db11798945fa5c3e5490c794ccca7c6de86d3afdd54b4eb324109939c6f37bc"
+checksum = "83d13394d44dae3207b52a326c0c85a8bf87f1541f23b0d143811088497b09ab"
 dependencies = [
  "arrayref",
  "arrayvec",
@@ -4431,14 +4616,14 @@ dependencies = [
  "cfg-if",
  "log",
  "png",
- "tiny-skia-path 0.10.0",
+ "tiny-skia-path",
 ]
 
 [[package]]
 name = "tiny-skia-path"
-version = "0.8.4"
+version = "0.11.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adbfb5d3f3dd57a0e11d12f4f13d4ebbbc1b5c15b7ab0a156d030b21da5f677c"
+checksum = "9c9e7fc0c2e86a30b117d0462aa261b72b7a99b7ebd7deb3a14ceda95c5bdc93"
 dependencies = [
  "arrayref",
  "bytemuck",
@@ -4446,14 +4631,16 @@ dependencies = [
 ]
 
 [[package]]
-name = "tiny-skia-path"
-version = "0.10.0"
+name = "tiny-xlib"
+version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f60aa35c89ac2687ace1a2556eaaea68e8c0d47408a2e3e7f5c98a489e7281c"
+checksum = "a90a0ca3ee6a69f2ad28fd11621a4c3f03b371f366be500b64df260c4ffbafb4"
 dependencies = [
- "arrayref",
- "bytemuck",
- "strict-num",
+ "as-raw-xcb-connection",
+ "ctor",
+ "libloading 0.8.3",
+ "pkg-config",
+ "tracing",
 ]
 
 [[package]]
@@ -4494,7 +4681,7 @@ version = "0.19.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
 dependencies = [
- "indexmap 2.2.6",
+ "indexmap",
  "toml_datetime",
  "winnow",
 ]
@@ -4505,16 +4692,16 @@ version = "0.21.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1"
 dependencies = [
- "indexmap 2.2.6",
+ "indexmap",
  "toml_datetime",
  "winnow",
 ]
 
 [[package]]
 name = "tracing"
-version = "0.1.40"
+version = "0.1.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
 dependencies = [
  "pin-project-lite",
  "tracing-attributes",
@@ -4523,31 +4710,32 @@ dependencies = [
 
 [[package]]
 name = "tracing-attributes"
-version = "0.1.27"
+version = "0.1.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
 name = "tracing-core"
-version = "0.1.32"
+version = "0.1.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
 dependencies = [
  "once_cell",
 ]
 
 [[package]]
 name = "tract-core"
-version = "0.21.4"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98ef3e1f2d94e88d007811e78a3dcc9e5734bfd841849416fab09a407488cb7d"
+checksum = "b65d67f5190132365dda73fe215bfc5e01b031e8cbfbea9d486bb5b0dbba3545"
 dependencies = [
  "anyhow",
+ "anymap3",
  "bit-set",
  "derive-new",
  "downcast-rs",
@@ -4555,11 +4743,11 @@ dependencies = [
  "lazy_static",
  "log",
  "maplit",
- "ndarray",
+ "ndarray 0.16.1",
  "num-complex",
  "num-integer",
  "num-traits",
- "paste",
+ "pastey",
  "rustfft",
  "smallvec",
  "tract-data",
@@ -4568,19 +4756,25 @@ dependencies = [
 
 [[package]]
 name = "tract-data"
-version = "0.21.4"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "580103fb6de703d9bff3d64cfe76c26d454cbcb4b84716d3f2936e36c8a88d67"
+checksum = "73cd7fda1e5e8b854ea3abdd09126a87fc4af81e6d1e29ec1710a8a4abf4f13a"
 dependencies = [
  "anyhow",
+ "downcast-rs",
+ "dyn-clone",
+ "dyn-hash",
  "half",
  "itertools 0.12.1",
  "lazy_static",
+ "libm",
  "maplit",
- "ndarray",
- "nom",
+ "ndarray 0.16.1",
+ "nom 8.0.0",
+ "nom-language",
  "num-integer",
  "num-traits",
+ "parking_lot 0.12.5",
  "scan_fmt",
  "smallvec",
  "string-interner",
@@ -4588,9 +4782,9 @@ dependencies = [
 
 [[package]]
 name = "tract-hir"
-version = "0.21.4"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14a1deb233efc188da3617e66160202ef08247f91f6e12a39940dcb74b5a6af3"
+checksum = "554df991b647dba8af0547ee5838b6912ed20b424f2adda0ea0b7faf8db1b151"
 dependencies = [
  "derive-new",
  "log",
@@ -4599,21 +4793,22 @@ dependencies = [
 
 [[package]]
 name = "tract-linalg"
-version = "0.21.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b58f074c94c74ea736a75b7ac6f696add05c62fc4745d1c420cf7d4d42eb7b2b"
+version = "0.22.1"
 dependencies = [
+ "byteorder",
  "cc",
  "derive-new",
  "downcast-rs",
  "dyn-clone",
+ "dyn-hash",
  "half",
  "lazy_static",
  "liquid",
  "liquid-core",
+ "liquid-derive",
  "log",
  "num-traits",
- "paste",
+ "pastey",
  "scan_fmt",
  "smallvec",
  "time",
@@ -4624,14 +4819,19 @@ dependencies = [
 
 [[package]]
 name = "tract-nnef"
-version = "0.21.4"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "630653ba2da4e55bddf1e7fad9d4e128dcb87200de621d12dc9a48a81bdaedd8"
+checksum = "45b3755dd0948111b407085d11033ba218cb85b85ce8d795cec2b8353db552ea"
 dependencies = [
  "byteorder",
  "flate2",
+ "liquid",
+ "liquid-core",
  "log",
- "nom",
+ "nom 8.0.0",
+ "nom-language",
+ "safetensors",
+ "serde_json",
  "tar",
  "tract-core",
  "walkdir",
@@ -4639,9 +4839,9 @@ dependencies = [
 
 [[package]]
 name = "tract-onnx"
-version = "0.21.4"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cb5d5898db7dd7d7051d50365aebcb48b075d6f8bb17ce1f9e75e6e452aed2a"
+checksum = "ac23ad1d2d5da3256ae1a78757b1072a8a3fac2a4b28d27cfb561c5942ec2701"
 dependencies = [
  "bytes",
  "derive-new",
@@ -4657,9 +4857,9 @@ dependencies = [
 
 [[package]]
 name = "tract-onnx-opl"
-version = "0.21.4"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "620e3c0036ad846d62cc44c3b430ba0e1a22afe9d1abf0cc7f0e4caa3e232186"
+checksum = "87561bf0b84f74a124afc0f1997682728da6cd821083511e0357432954fd24f6"
 dependencies = [
  "getrandom",
  "log",
@@ -4671,9 +4871,9 @@ dependencies = [
 
 [[package]]
 name = "tract-pulse"
-version = "0.21.4"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dcd04336e207e760ce1a47f152664f34c14f3a9ead3af7c16ea2d9d520cf8ac"
+checksum = "ff926428bf533d0d8ee70e2626fb9f8197d33d3cc9e0cafc3f9acf8e11b4dd93"
 dependencies = [
  "downcast-rs",
  "lazy_static",
@@ -4683,9 +4883,9 @@ dependencies = [
 
 [[package]]
 name = "tract-pulse-opl"
-version = "0.21.4"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acc42aebbb6ae3300e2435c3ff67f1520636f1b9196644d0f92edcd8b94c88bc"
+checksum = "5621466758a263fb3baf6494a9aca555dd90c1c0c5216186987a1857bca21f87"
 dependencies = [
  "downcast-rs",
  "lazy_static",
@@ -4720,17 +4920,6 @@ version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "17f77d76d837a7830fe1d4f12b7b4ba4192c1888001c7164257e4bc6d21d96b4"
 
-[[package]]
-name = "twox-hash"
-version = "1.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
-dependencies = [
- "cfg-if",
- "rand",
- "static_assertions",
-]
-
 [[package]]
 name = "typenum"
 version = "1.17.0"
@@ -4772,12 +4961,6 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cc2520efa644f8268dce4dcd3050eaa7fc044fca03961e9998ac7e2e92b77cf1"
 
-[[package]]
-name = "unicode-general-category"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2281c8c1d221438e373249e065ca4989c4c36952c211ff21a0ee91c44a3869e7"
-
 [[package]]
 name = "unicode-ident"
 version = "1.0.12"
@@ -4799,6 +4982,12 @@ dependencies = [
  "tinyvec",
 ]
 
+[[package]]
+name = "unicode-properties"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
+
 [[package]]
 name = "unicode-script"
 version = "0.5.6"
@@ -4851,12 +5040,6 @@ version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8cdc8b93bd0198ed872357fb2e667f7125646b1762f16d60b2c96350d361897"
 
-[[package]]
-name = "vec_map"
-version = "0.8.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
-
 [[package]]
 name = "version_check"
 version = "0.9.4"
@@ -4887,26 +5070,14 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.92"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
+checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89"
 dependencies = [
  "cfg-if",
- "wasm-bindgen-macro",
-]
-
-[[package]]
-name = "wasm-bindgen-backend"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
-dependencies = [
- "bumpalo",
- "log",
  "once_cell",
- "proc-macro2",
- "quote",
- "syn 2.0.60",
+ "rustversion",
+ "wasm-bindgen-macro",
  "wasm-bindgen-shared",
 ]
 
@@ -4924,9 +5095,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.92"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
+checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -4934,22 +5105,25 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.92"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
+checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904"
 dependencies = [
+ "bumpalo",
  "proc-macro2",
  "quote",
- "syn 2.0.60",
- "wasm-bindgen-backend",
+ "syn 2.0.117",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.92"
+version = "0.2.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
+checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129"
+dependencies = [
+ "unicode-ident",
+]
 
 [[package]]
 name = "wasm-timer"
@@ -4966,21 +5140,6 @@ dependencies = [
  "web-sys",
 ]
 
-[[package]]
-name = "wayland-backend"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41b48e27457e8da3b2260ac60d0a94512f5cba36448679f3747c0865b7893ed8"
-dependencies = [
- "cc",
- "downcast-rs",
- "io-lifetimes",
- "nix 0.26.4",
- "scoped-tls",
- "smallvec",
- "wayland-sys 0.30.1",
-]
-
 [[package]]
 name = "wayland-backend"
 version = "0.3.3"
@@ -4992,35 +5151,7 @@ dependencies = [
  "rustix 0.38.34",
  "scoped-tls",
  "smallvec",
- "wayland-sys 0.31.1",
-]
-
-[[package]]
-name = "wayland-client"
-version = "0.29.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f3b068c05a039c9f755f881dc50f01732214f5685e379829759088967c46715"
-dependencies = [
- "bitflags 1.3.2",
- "downcast-rs",
- "libc",
- "nix 0.24.3",
- "scoped-tls",
- "wayland-commons",
- "wayland-scanner 0.29.5",
- "wayland-sys 0.29.5",
-]
-
-[[package]]
-name = "wayland-client"
-version = "0.30.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "489c9654770f674fc7e266b3c579f4053d7551df0ceb392f153adb1f9ed06ac8"
-dependencies = [
- "bitflags 1.3.2",
- "nix 0.26.4",
- "wayland-backend 0.1.2",
- "wayland-scanner 0.30.1",
+ "wayland-sys",
 ]
 
 [[package]]
@@ -5031,20 +5162,8 @@ checksum = "82fb96ee935c2cea6668ccb470fb7771f6215d1691746c2d896b447a00ad3f1f"
 dependencies = [
  "bitflags 2.5.0",
  "rustix 0.38.34",
- "wayland-backend 0.3.3",
- "wayland-scanner 0.31.1",
-]
-
-[[package]]
-name = "wayland-commons"
-version = "0.29.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8691f134d584a33a6606d9d717b95c4fa20065605f798a3f350d78dced02a902"
-dependencies = [
- "nix 0.24.3",
- "once_cell",
- "smallvec",
- "wayland-sys 0.29.5",
+ "wayland-backend",
+ "wayland-scanner",
 ]
 
 [[package]]
@@ -5055,18 +5174,7 @@ checksum = "625c5029dbd43d25e6aa9615e88b829a5cad13b2819c4ae129fdbb7c31ab4c7e"
 dependencies = [
  "bitflags 2.5.0",
  "cursor-icon",
- "wayland-backend 0.3.3",
-]
-
-[[package]]
-name = "wayland-cursor"
-version = "0.29.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6865c6b66f13d6257bef1cd40cbfe8ef2f150fb8ebbdb1e8e873455931377661"
-dependencies = [
- "nix 0.24.3",
- "wayland-client 0.29.5",
- "xcursor",
+ "wayland-backend",
 ]
 
 [[package]]
@@ -5076,22 +5184,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "71ce5fa868dd13d11a0d04c5e2e65726d0897be8de247c0c5a65886e283231ba"
 dependencies = [
  "rustix 0.38.34",
- "wayland-client 0.31.2",
+ "wayland-client",
  "xcursor",
 ]
 
-[[package]]
-name = "wayland-protocols"
-version = "0.29.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b950621f9354b322ee817a23474e479b34be96c2e909c14f7bc0100e9a970bc6"
-dependencies = [
- "bitflags 1.3.2",
- "wayland-client 0.29.5",
- "wayland-commons",
- "wayland-scanner 0.29.5",
-]
-
 [[package]]
 name = "wayland-protocols"
 version = "0.31.2"
@@ -5099,44 +5195,35 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f81f365b8b4a97f422ac0e8737c438024b5951734506b0e1d775c73030561f4"
 dependencies = [
  "bitflags 2.5.0",
- "wayland-backend 0.3.3",
- "wayland-client 0.31.2",
- "wayland-scanner 0.31.1",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-scanner",
 ]
 
 [[package]]
-name = "wayland-protocols-wlr"
+name = "wayland-protocols-plasma"
 version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad1f61b76b6c2d8742e10f9ba5c3737f6530b4c243132c2a2ccc8aa96fe25cd6"
+checksum = "23803551115ff9ea9bce586860c5c5a971e360825a0309264102a9495a5ff479"
 dependencies = [
  "bitflags 2.5.0",
- "wayland-backend 0.3.3",
- "wayland-client 0.31.2",
- "wayland-protocols 0.31.2",
- "wayland-scanner 0.31.1",
-]
-
-[[package]]
-name = "wayland-scanner"
-version = "0.29.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f4303d8fa22ab852f789e75a967f0a2cdc430a607751c0499bada3e451cbd53"
-dependencies = [
- "proc-macro2",
- "quote",
- "xml-rs",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-protocols",
+ "wayland-scanner",
 ]
 
 [[package]]
-name = "wayland-scanner"
-version = "0.30.1"
+name = "wayland-protocols-wlr"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9b873b257fbc32ec909c0eb80dea312076a67014e65e245f5eb69a6b8ab330e"
+checksum = "ad1f61b76b6c2d8742e10f9ba5c3737f6530b4c243132c2a2ccc8aa96fe25cd6"
 dependencies = [
- "proc-macro2",
- "quick-xml 0.28.2",
- "quote",
+ "bitflags 2.5.0",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-protocols",
+ "wayland-scanner",
 ]
 
 [[package]]
@@ -5146,50 +5233,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "63b3a62929287001986fb58c789dce9b67604a397c15c611ad9f747300b6c283"
 dependencies = [
  "proc-macro2",
- "quick-xml 0.31.0",
+ "quick-xml",
  "quote",
 ]
 
 [[package]]
 name = "wayland-sys"
-version = "0.29.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be12ce1a3c39ec7dba25594b97b42cb3195d54953ddb9d3d95a7c3902bc6e9d4"
-dependencies = [
- "dlib",
- "lazy_static",
- "pkg-config",
-]
-
-[[package]]
-name = "wayland-sys"
-version = "0.30.1"
+version = "0.31.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96b2a02ac608e07132978689a6f9bf4214949c85998c247abadd4f4129b1aa06"
+checksum = "15a0c8eaff5216d07f226cb7a549159267f3467b289d9a2e52fd3ef5aae2b7af"
 dependencies = [
  "dlib",
- "lazy_static",
  "log",
+ "once_cell",
  "pkg-config",
 ]
 
 [[package]]
-name = "wayland-sys"
-version = "0.31.1"
+name = "web-sys"
+version = "0.3.67"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15a0c8eaff5216d07f226cb7a549159267f3467b289d9a2e52fd3ef5aae2b7af"
+checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed"
 dependencies = [
- "dlib",
- "log",
- "once_cell",
- "pkg-config",
+ "js-sys",
+ "wasm-bindgen",
 ]
 
 [[package]]
-name = "web-sys"
-version = "0.3.69"
+name = "web-time"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
+checksum = "aa30049b1c872b72c89866d458eae9f20380ab280ffd1b1e18df2d3e2d98cfe0"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -5203,16 +5277,17 @@ checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082"
 
 [[package]]
 name = "wgpu"
-version = "0.16.3"
+version = "0.19.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "480c965c9306872eb6255fa55e4b4953be55a8b64d57e61d7ff840d3dcc051cd"
+checksum = "cbd7311dbd2abcfebaabf1841a2824ed7c8be443a0f29166e5d3c6a53a762c01"
 dependencies = [
  "arrayvec",
  "cfg-if",
+ "cfg_aliases",
  "js-sys",
  "log",
  "naga",
- "parking_lot 0.12.2",
+ "parking_lot 0.11.2",
  "profiling",
  "raw-window-handle",
  "smallvec",
@@ -5227,17 +5302,20 @@ dependencies = [
 
 [[package]]
 name = "wgpu-core"
-version = "0.16.1"
+version = "0.19.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f478237b4bf0d5b70a39898a66fa67ca3a007d79f2520485b8b0c3dfc46f8c2"
+checksum = "28b94525fc99ba9e5c9a9e24764f2bc29bad0911a7446c12f446a8277369bf3a"
 dependencies = [
  "arrayvec",
  "bit-vec",
  "bitflags 2.5.0",
+ "cfg_aliases",
  "codespan-reporting",
+ "indexmap",
  "log",
  "naga",
- "parking_lot 0.12.2",
+ "once_cell",
+ "parking_lot 0.11.2",
  "profiling",
  "raw-window-handle",
  "rustc-hash",
@@ -5250,9 +5328,9 @@ dependencies = [
 
 [[package]]
 name = "wgpu-hal"
-version = "0.16.2"
+version = "0.19.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ecb3258078e936deee14fd4e0febe1cfe9bbb5ffef165cb60218d2ee5eb4448"
+checksum = "bfabcfc55fd86611a855816326b2d54c3b2fd7972c27ce414291562650552703"
 dependencies = [
  "android_system_properties",
  "arrayvec",
@@ -5260,10 +5338,11 @@ dependencies = [
  "bit-set",
  "bitflags 2.5.0",
  "block",
+ "cfg_aliases",
  "core-graphics-types",
  "d3d12",
- "foreign-types",
  "glow",
+ "glutin_wgl_sys",
  "gpu-alloc",
  "gpu-allocator",
  "gpu-descriptor",
@@ -5275,8 +5354,10 @@ dependencies = [
  "log",
  "metal",
  "naga",
+ "ndk-sys 0.5.0+25.2.9519653",
  "objc",
- "parking_lot 0.12.2",
+ "once_cell",
+ "parking_lot 0.11.2",
  "profiling",
  "range-alloc",
  "raw-window-handle",
@@ -5292,9 +5373,9 @@ dependencies = [
 
 [[package]]
 name = "wgpu-types"
-version = "0.16.1"
+version = "0.19.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0c153280bb108c2979eb5c7391cb18c56642dd3c072e55f52065e13e2a1252a"
+checksum = "b671ff9fb03f78b46ff176494ee1ebe7d603393f42664be55b64dc8d53969805"
 dependencies = [
  "bitflags 2.5.0",
  "js-sys",
@@ -5332,15 +5413,6 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
-[[package]]
-name = "winapi-wsapoll"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1eafc5f679c576995526e81635d0cf9695841736712b4e892f87abbe6fed3f28"
-dependencies = [
- "winapi",
-]
-
 [[package]]
 name = "winapi-x86_64-pc-windows-gnu"
 version = "0.4.0"
@@ -5349,9 +5421,9 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
 [[package]]
 name = "window_clipboard"
-version = "0.3.0"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63287c9c4396ccf5346d035a9b0fcaead9e18377637f5eaa78b7ac65c873ff7d"
+checksum = "f6d692d46038c433f9daee7ad8757e002a4248c20b0a3fbc991d99521d3bcb6d"
 dependencies = [
  "clipboard-win",
  "clipboard_macos",
@@ -5363,11 +5435,12 @@ dependencies = [
 
 [[package]]
 name = "windows"
-version = "0.44.0"
+version = "0.52.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e745dab35a0c4c77aa3ce42d595e13d2003d6902d6b08c9ef5fc326d08da12b"
+checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
 dependencies = [
- "windows-targets 0.42.2",
+ "windows-core 0.52.0",
+ "windows-targets 0.52.5",
 ]
 
 [[package]]
@@ -5376,7 +5449,16 @@ version = "0.54.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9252e5725dbed82865af151df558e754e4a3c2c30818359eb17465f1346a1b49"
 dependencies = [
- "windows-core",
+ "windows-core 0.54.0",
+ "windows-targets 0.52.5",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
+dependencies = [
  "windows-targets 0.52.5",
 ]
 
@@ -5390,6 +5472,12 @@ dependencies = [
  "windows-targets 0.52.5",
 ]
 
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
 [[package]]
 name = "windows-result"
 version = "0.1.1"
@@ -5426,6 +5514,15 @@ dependencies = [
  "windows-targets 0.52.5",
 ]
 
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.42.2"
@@ -5606,37 +5703,50 @@ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
 
 [[package]]
 name = "winit"
-version = "0.28.7"
+version = "0.29.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9596d90b45384f5281384ab204224876e8e8bf7d58366d9b795ad99aa9894b94"
+checksum = "0d59ad965a635657faf09c8f062badd885748428933dad8e8bdd64064d92e5ca"
 dependencies = [
+ "ahash",
  "android-activity",
- "bitflags 1.3.2",
+ "atomic-waker",
+ "bitflags 2.5.0",
+ "bytemuck",
+ "calloop",
  "cfg_aliases",
  "core-foundation",
  "core-graphics",
- "dispatch",
- "instant",
+ "cursor-icon",
+ "icrate",
+ "js-sys",
  "libc",
  "log",
- "mio",
- "ndk 0.7.0",
- "objc2",
+ "memmap2 0.9.4",
+ "ndk 0.8.0",
+ "ndk-sys 0.5.0+25.2.9519653",
+ "objc2 0.4.1",
  "once_cell",
  "orbclient",
  "percent-encoding",
  "raw-window-handle",
  "redox_syscall 0.3.5",
+ "rustix 0.38.34",
  "sctk-adwaita",
- "smithay-client-toolkit 0.16.1",
+ "smithay-client-toolkit",
+ "smol_str",
+ "unicode-segmentation",
  "wasm-bindgen",
- "wayland-client 0.29.5",
- "wayland-commons",
- "wayland-protocols 0.29.5",
- "wayland-scanner 0.29.5",
+ "wasm-bindgen-futures",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-protocols",
+ "wayland-protocols-plasma",
  "web-sys",
- "windows-sys 0.45.0",
+ "web-time",
+ "windows-sys 0.48.0",
  "x11-dl",
+ "x11rb",
+ "xkbcommon-dl",
 ]
 
 [[package]]
@@ -5670,40 +5780,19 @@ dependencies = [
  "pkg-config",
 ]
 
-[[package]]
-name = "x11rb"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdf3c79412dd91bae7a7366b8ad1565a85e35dd049affc3a6a2c549e97419617"
-dependencies = [
- "gethostname 0.2.3",
- "libc",
- "libloading 0.7.4",
- "nix 0.25.1",
- "once_cell",
- "winapi",
- "winapi-wsapoll",
- "x11rb-protocol 0.11.1",
-]
-
 [[package]]
 name = "x11rb"
 version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5d91ffca73ee7f68ce055750bf9f6eca0780b8c85eff9bc046a3b0da41755e12"
 dependencies = [
- "gethostname 0.4.3",
+ "as-raw-xcb-connection",
+ "gethostname",
+ "libc",
+ "libloading 0.8.3",
+ "once_cell",
  "rustix 0.38.34",
- "x11rb-protocol 0.13.1",
-]
-
-[[package]]
-name = "x11rb-protocol"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0b1513b141123073ce54d5bb1d33f801f17508fbd61e02060b1214e96d39c56"
-dependencies = [
- "nix 0.25.1",
+ "x11rb-protocol",
 ]
 
 [[package]]
@@ -5739,6 +5828,19 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "xkbcommon-dl"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d039de8032a9a8856a6be89cea3e5d12fdd82306ab7c94d74e6deab2460651c5"
+dependencies = [
+ "bitflags 2.5.0",
+ "dlib",
+ "log",
+ "once_cell",
+ "xkeysym",
+]
+
 [[package]]
 name = "xkeysym"
 version = "0.2.0"
@@ -5751,6 +5853,18 @@ version = "0.8.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "791978798f0597cfc70478424c2b4fdc2b7a8024aaff78497ef00f24ef674193"
 
+[[package]]
+name = "xxhash-rust"
+version = "0.8.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
+
+[[package]]
+name = "yansi"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
+
 [[package]]
 name = "yazi"
 version = "0.1.6"
@@ -5846,7 +5960,7 @@ checksum = "6f4b6c273f496d8fd4eaf18853e6b448760225dc030ff2c485a786859aea6393"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.60",
+ "syn 2.0.117",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 1019dbedd..8a5306657 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,3 +25,12 @@ debug = false
 lto = "fat"
 strip = true
 panic = "abort"
+
+# Local patched tract-linalg with wasm_f32_4x1 GEMV kernel.
+# Adds a 4-row × 1-col SIMD kernel handling N=1 matrix-vector ops without the
+# 75% column-tile waste of the existing wasm_f32_4x4. Source at
+# /Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1/src/wasm.rs.
+[patch.crates-io]
+tract-linalg = { path = "/Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1" }
+
+
diff --git a/libDF/Cargo.toml b/libDF/Cargo.toml
index c0f3271ef..879c98ee0 100644
--- a/libDF/Cargo.toml
+++ b/libDF/Cargo.toml
@@ -120,10 +120,10 @@ claxon = { version = "^0.4", optional = true }
 env_logger = { version = "0.11", optional = true }
 clap = { version = "4.0", optional = true, features = ["derive"] }
 rust-ini = { version = "^0.21", optional = true }
-tract-core = { version = "^0.21.4", optional = true }
-tract-onnx = { version = "^0.21.4", optional = true }
-tract-pulse = { version = "^0.21.4", optional = true }
-tract-hir = { version = "^0.21.4", optional = true }
+tract-core = { version = "=0.22.1", optional = true }
+tract-onnx = { version = "=0.22.1", optional = true }
+tract-pulse = { version = "=0.22.1", optional = true }
+tract-hir = { version = "=0.22.1", optional = true }
 flate2 = { version = "1.0.24", optional = true }
 tar = { version = "0.4.38", optional = true }
 wasm-bindgen = { version = "0.2.87", optional = true }
diff --git a/libDF/src/bin/enhance_wav.rs b/libDF/src/bin/enhance_wav.rs
index b37592660..24e06a8e4 100644
--- a/libDF/src/bin/enhance_wav.rs
+++ b/libDF/src/bin/enhance_wav.rs
@@ -3,7 +3,7 @@ use std::{path::PathBuf, process::exit, time::Instant};
 use anyhow::Result;
 use clap::{Parser, ValueHint};
 use df::{tract::*, transforms::resample, wav_utils::*};
-use ndarray::{prelude::*, Axis};
+use tract_core::ndarray::{self, prelude::*, Axis};
 
 #[cfg(all(
     not(windows),
diff --git a/libDF/src/tract.rs b/libDF/src/tract.rs
index b39a98726..2415538f1 100644
--- a/libDF/src/tract.rs
+++ b/libDF/src/tract.rs
@@ -7,7 +7,7 @@ use std::time::Instant;
 use anyhow::{bail, Context, Result};
 use flate2::read::GzDecoder;
 use ini::Ini;
-use ndarray::{prelude::*, Axis};
+use tract_core::ndarray::{self, prelude::*, Axis};
 use tar::Archive;
 use tract_core::internal::tract_itertools::izip;
 use tract_core::internal::tract_smallvec::alloc::collections::VecDeque;
@@ -772,7 +772,7 @@ fn init_encoder_impl(
     n_ch: usize,
 ) -> Result<TypedModel> {
     log::debug!("Start init encoder.");
-    let s = m.symbol_table.sym("S");
+    let s = m.symbols.sym("S");
 
     let nb_erb = df_cfg.get("nb_erb").unwrap().parse::<usize>()?;
     let nb_df = df_cfg.get("nb_df").unwrap().parse::<usize>()?;
@@ -821,7 +821,7 @@ fn init_erb_decoder_impl(
     mask_reduction: Option<ReduceMask>,
 ) -> Result<TypedModel> {
     log::debug!("Start init ERB decoder.");
-    let s = m.symbol_table.sym("S");
+    let s = m.symbols.sym("S");
 
     let nb_erb = df_cfg.get("nb_erb").unwrap().parse::<usize>()?;
     let layer_width = net_cfg.get("conv_ch").unwrap().parse::<usize>()?;
@@ -934,7 +934,7 @@ fn init_df_decoder_impl(
     n_ch: usize,
 ) -> Result<TypedModel> {
     log::debug!("Start init DF decoder.");
-    let s = m.symbol_table.sym("S");
+    let s = m.symbols.sym("S");
 
     let nb_erb = df_cfg.get("nb_erb").unwrap().parse::<usize>()?;
     let nb_df = df_cfg.get("nb_df").unwrap().parse::<usize>()?;
diff --git a/libDF/src/transforms.rs b/libDF/src/transforms.rs
index 7d6b1175e..9840b2ecd 100644
--- a/libDF/src/transforms.rs
+++ b/libDF/src/transforms.rs
@@ -1,6 +1,6 @@
 use std::mem::MaybeUninit;
 
-use ndarray::{prelude::*, Slice};
+use tract_core::ndarray::{self, prelude::*, Slice};
 use rubato::{FftFixedInOut, Resampler};
 use thiserror::Error;
 
diff --git a/libDF/src/wasm.rs b/libDF/src/wasm.rs
index 2e02095b7..0cfbe64b7 100644
--- a/libDF/src/wasm.rs
+++ b/libDF/src/wasm.rs
@@ -1,6 +1,6 @@
 use std::boxed::Box;
 
-use ndarray::prelude::*;
+use tract_core::ndarray::{self, prelude::*};
 use wasm_bindgen::prelude::*;
 
 use crate::tract::*;
diff --git a/libDF/src/wav_utils.rs b/libDF/src/wav_utils.rs
index 99e2cbc56..904f52150 100644
--- a/libDF/src/wav_utils.rs
+++ b/libDF/src/wav_utils.rs
@@ -6,7 +6,7 @@ use std::{
 
 use hound::{WavReader, WavWriter};
 #[cfg(any(feature = "dataset", feature = "wav-utils"))]
-use ndarray::prelude::*;
+use tract_core::ndarray::{self, prelude::*};
 use thiserror::Error;
 
 #[derive(Error, Debug)]

From 5308f75ea9298a5bc07fab77d158e2180b9831f5 Mon Sep 17 00:00:00 2001
From: czoli1976 <64466170+czoli1976@users.noreply.github.com>
Date: Tue, 28 Apr 2026 07:24:38 +0100
Subject: [PATCH 02/10] workspace Cargo.toml: repoint tract-linalg patch at GH
 fork (git source)                        + describe full kernel kit

Two changes in the [patch.crates-io] override:

1. Source swap: absolute local path -> git source on czoli1976/tract.
   Makes the build reproducible for anyone with access to the fork;
   was previously pinned to /Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1
   which doesn't exist outside the original investigation environment.

2. Comment updated to describe the full WASM SIMD kernel kit (4x4
   existing + 4x1, 8x1, 16x1, 8x4, 8x8 new + per-M dispatcher). The
   previous comment only mentioned the initial 4x1 kernel and was
   stale relative to what's actually being patched in.

Resulting patch:
  tract-linalg = { git = "https://github.com/czoli1976/tract",
                   branch = "add-wasm-f32-full-kernel-kit" }

Cargo.lock will regenerate on next build (the registry source
descriptor for tract-linalg changes from path-based to git-based).
---
 Cargo.toml | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 8a5306657..acdcfd4a8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,11 +26,14 @@ lto = "fat"
 strip = true
 panic = "abort"
 
-# Local patched tract-linalg with wasm_f32_4x1 GEMV kernel.
-# Adds a 4-row × 1-col SIMD kernel handling N=1 matrix-vector ops without the
-# 75% column-tile waste of the existing wasm_f32_4x4. Source at
-# /Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1/src/wasm.rs.
+# Patched tract-linalg with the WASM SIMD kernel kit:
+#   wasm_f32_4x4 (existing) + wasm_f32_4x1 / 8x1 / 16x1 (new GEMV variants)
+#   + wasm_f32_8x4 / 8x8 (new MM variants) + per-M dispatcher in Ops::mmv_f32.
+# Cumulative impact on DFN3: RTF 0.1290 -> 0.0516 (-60%, 2.5x faster), bit-identical audio.
+# Source:   https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit
+# Tracking: czoli1976/tract#2 (full kit), czoli1976/tract#1 (upstream-PR-ready 4x1 only),
+#           sonos/tract#2161 (upstream issue).
 [patch.crates-io]
-tract-linalg = { path = "/Users/CZoli/Desktop/tract-fork/tract-linalg-0.22.1" }
+tract-linalg = { git = "https://github.com/czoli1976/tract", branch = "add-wasm-f32-full-kernel-kit" }
 
 

From 8153d23fc1f295e9327f9bcf865c6b7b5eee9387 Mon Sep 17 00:00:00 2001
From: czoli1976 <64466170+czoli1976@users.noreply.github.com>
Date: Tue, 28 Apr 2026 09:15:51 +0100
Subject: [PATCH 03/10] workspace Cargo.toml: drop 8x4 from kit comment (kernel
 was removed upstream-side after A/B)

Mirrors czoli1976/tract@9c45f8d which dropped wasm_f32_8x4 after a
controlled A/B confirmed it's structurally dead code for DFN3 (every
MM op has N >= 8, strategizer always picks 8x8 over 8x4; mean A/B
delta was -1.22% within thermal noise).

Final kit shipped via [patch.crates-io]:
  wasm_f32_4x4 (existing) + wasm_f32_4x1 / 8x1 / 16x1 (new GEMV)
  + wasm_f32_8x8 (new MM) + per-M dispatcher in Ops::mmv_f32.
---
 Cargo.toml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index acdcfd4a8..00270c1c8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,11 +28,10 @@ panic = "abort"
 
 # Patched tract-linalg with the WASM SIMD kernel kit:
 #   wasm_f32_4x4 (existing) + wasm_f32_4x1 / 8x1 / 16x1 (new GEMV variants)
-#   + wasm_f32_8x4 / 8x8 (new MM variants) + per-M dispatcher in Ops::mmv_f32.
+#   + wasm_f32_8x8 (new MM variant) + per-M dispatcher in Ops::mmv_f32.
 # Cumulative impact on DFN3: RTF 0.1290 -> 0.0516 (-60%, 2.5x faster), bit-identical audio.
 # Source:   https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit
-# Tracking: czoli1976/tract#2 (full kit), czoli1976/tract#1 (upstream-PR-ready 4x1 only),
-#           sonos/tract#2161 (upstream issue).
+# Tracking: czoli1976/tract#2 (kernel kit), sonos/tract#2161 (upstream issue).
 [patch.crates-io]
 tract-linalg = { git = "https://github.com/czoli1976/tract", branch = "add-wasm-f32-full-kernel-kit" }
 

From c90060c43069aeaf8fcef12ee79c19f010ec138c Mon Sep 17 00:00:00 2001
From: czoli1976 <64466170+czoli1976@users.noreply.github.com>
Date: Tue, 28 Apr 2026 09:28:30 +0100
Subject: [PATCH 04/10] workspace Cargo.toml: pin tract-linalg patch to rev
 b82d1f0 (instead of branch)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pins the [patch.crates-io] override to a specific commit SHA rather than
a branch, so libDF builds are reproducible while the upstream PR is
under review. Any further commits we push to the branch (in response to
review feedback) will not auto-propagate to libDF builds — we'll bump
the rev deliberately when we want to absorb them.

Pinned commit (b82d1f0): full kernel kit (4x1, 8x1, 16x1, 8x8 + per-M
dispatcher), no 8x4, with module-level #![allow(unsafe_op_in_unsafe_fn)].
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 00270c1c8..0df6e4d78 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,6 +33,6 @@ panic = "abort"
 # Source:   https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit
 # Tracking: czoli1976/tract#2 (kernel kit), sonos/tract#2161 (upstream issue).
 [patch.crates-io]
-tract-linalg = { git = "https://github.com/czoli1976/tract", branch = "add-wasm-f32-full-kernel-kit" }
+tract-linalg = { git = "https://github.com/czoli1976/tract", rev = "b82d1f0" }
 
 

From d089c96b7d7a6dd834a5060d394f7465827cad25 Mon Sep 17 00:00:00 2001
From: czoli1976 <64466170+czoli1976@users.noreply.github.com>
Date: Tue, 28 Apr 2026 11:32:05 +0100
Subject: [PATCH 05/10] workspace Cargo.toml: bump tract-linalg patch rev to
 d925624

Picks up the upstream review fix (inner unsafe { } blocks per kali's
request, plus cargo fmt). Rev d925624 is the new HEAD of
czoli1976:add-wasm-f32-full-kernel-kit; the previous b82d1f0 was
force-pushed away (rejected lint-allow approach).
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 0df6e4d78..1640e31c6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,6 +33,6 @@ panic = "abort"
 # Source:   https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit
 # Tracking: czoli1976/tract#2 (kernel kit), sonos/tract#2161 (upstream issue).
 [patch.crates-io]
-tract-linalg = { git = "https://github.com/czoli1976/tract", rev = "b82d1f0" }
+tract-linalg = { git = "https://github.com/czoli1976/tract", rev = "d925624" }
 
 

From d222c7b62e5095e1fd4cbc4b252924a9d9ecbbb0 Mon Sep 17 00:00:00 2001
From: czoli1976 <64466170+czoli1976@users.noreply.github.com>
Date: Tue, 28 Apr 2026 14:43:38 +0100
Subject: [PATCH 06/10] fix(deps): vendor tract-linalg-0.22.1 with kernel kit
 (path-based [patch.crates-io])
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the broken `[patch.crates-io] tract-linalg = { git = ..., rev = "d925624" }`
directive (which silently fell back to stock crates.io tract-linalg-0.22.1
because the git ref was at version 0.23.0-pre — version mismatch with
libDF's =0.22.1 constraint, so cargo ignored the patch).

Replacement: vendor a self-contained single-crate copy of
sonos/tract@v0.22.1's `linalg/` subcrate at `vendor/tract-linalg-0.22.1/`,
with the kernel kit cherry-picks applied (sonos/tract#2164's 4 commits:
4x1 GEMV, kit extension, 8x4 drop, review-feedback inner-unsafe-blocks).

The vendor crate is at version 0.22.1 (matches libDF's =0.22.1 requirement)
and is single-crate (its tract-data dep comes from crates.io, avoiding
the multi-workspace tract-data version conflict that caused the previous
git+branch attempt to fail with 103 type-mismatch errors).

Validated:
- Cargo.lock now has ONE tract-linalg entry (path-resolved, 0.22.1)
- WASM size: 10,007,237 bytes raw (was 9,971,007 with stock kernel-kit-free
  fallback — +36 KB confirms kernel kit symbols are linked)
- Bench: RTF 0.0516, per-frame mean 0.5157 ms (matches prior kernel-kit
  best; was 0.108 with broken patch falling back to stock 4x4-only)

Drop this vendor dir entirely once tract publishes a release including
the merged kernel kit (sonos/tract#2164 was merged 2026-04-28; awaiting
0.22.x cherry-pick or 0.23.x cut).
---
 Cargo.toml                                    |    7 +-
 vendor/tract-linalg-0.22.1/.cargo-ok          |    1 +
 .../tract-linalg-0.22.1/.cargo_vcs_info.json  |    7 +
 vendor/tract-linalg-0.22.1/Cargo.toml         |  224 +++
 vendor/tract-linalg-0.22.1/Cargo.toml.orig    |  125 ++
 vendor/tract-linalg-0.22.1/LICENSE            |   12 +
 vendor/tract-linalg-0.22.1/LICENSE-APACHE     |  201 ++
 vendor/tract-linalg-0.22.1/LICENSE-MIT        |   23 +
 vendor/tract-linalg-0.22.1/README.md          |   27 +
 .../armv7neon_mmm_f32_32x1_core.tmpl          |  204 ++
 .../armv7neon/armv7neon_mmm_f32_8x1_core.tmpl |   98 +
 .../armv7neon/armv7neon_mmm_f32_8x4_core.tmpl |  143 ++
 .../armv7neon/armv7neon_mmm_f32_8x6_core.tmpl |  158 ++
 .../armv7neon_mmm_f32_per_cols.tmpliq         |    9 +
 .../armv7neon_mmm_f32_per_rows.tmpliq         |    9 +
 .../armv7neon_mmm_f32_scalars.tmpliq          |   24 +
 .../armv7neon/armv7neon_mmm_i32_32x1.tmpl     |  174 ++
 .../armv7neon/armv7neon_mmm_i32_8x4.tmpl      |  294 +++
 .../armv7neon_mmm_i32_per_cols.tmpliq         |    8 +
 .../armv7neon_mmm_i32_per_rows.tmpliq         |    8 +
 .../armv7neon_mmm_i32_scalars.tmpliq          |   20 +
 .../armv7neon_mmm_i32_scale_q8_q15.tmpliq     |  232 +++
 .../armv7neon/armv7neon_mmm_q_per_col.tmpliq  |   33 +
 .../armv7neon/armv7neon_mmm_q_per_row.tmpliq  |   24 +
 .../armv7neon/armv7neon_mmm_q_scalar.tmpliq   |   15 +
 .../arm32/armv7neon/armv7neon_prefetch.tmpl   |   22 +
 .../armv7neon/armv7neon_sigmoid_f32_4n.tmpl   |  215 +++
 .../armv7neon/armv7neon_tanh_f32_4n.tmpl      |  209 ++
 .../arm32/armv7neon/dispatcher.tmpliq         |   38 +
 .../arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl  |  491 +++++
 .../arm32/armvfpv2/dispatcher.tmpliq          |   32 +
 .../apple_amx/apple_amx_mmm_f16_64x1.tmpl     |  533 ++++++
 .../apple_amx/apple_amx_mmm_f16_64x32.tmpl    |  658 +++++++
 .../apple_amx/apple_amx_mmm_f32_32x1.tmpl     |  533 ++++++
 .../apple_amx/apple_amx_mmm_f32_32x32.tmpl    |  764 ++++++++
 .../arm64/apple_amx/dispatcher.tmpliq         |   37 +
 .../arm64/apple_amx/instructions.rs           |  191 ++
 .../arm64fp16_leaky_relu_f16_8n.tmpl          |   71 +
 .../arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq |   41 +
 .../arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq |   25 +
 .../arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq  |   18 +
 .../loop1/cortex_a53.tmpli                    |   65 +
 .../arm64fp16_mmm_f16_128x1/loop1/naive.tmpli |   32 +
 .../loop2/cortex_a55.tmpli                    |   85 +
 .../arm64fp16_mmm_f16_128x1_core.tmpl         |  203 ++
 .../arm64fp16_mmm_f16_16x8/loop1/naive.tmpli  |   21 +
 .../loop2/cortex_a55.tmpli                    |   54 +
 .../arm64fp16_mmm_f16_16x8_core.tmpl          |  174 ++
 .../arm64fp16_mmm_f16_32x4/loop1/naive.tmpli  |   21 +
 .../loop2/cortex_a55.tmpli                    |   71 +
 .../arm64fp16_mmm_f16_32x4_core.tmpl          |  165 ++
 .../arm64fp16_mmm_f16_32x6.core.tmpl          |  148 ++
 .../arm64fp16_mmm_f16_64x1.core.tmpl          |  264 +++
 .../arm64fp16_mmm_f16_64x3.core.tmpl          |  165 ++
 .../arm64fp16_mmm_f16_per_cols.tmpliq         |    9 +
 .../arm64fp16_mmm_f16_per_rows.tmpliq         |    9 +
 .../arm64fp16_mmm_f16_scalars.tmpliq          |   36 +
 .../arm64fp16/arm64fp16_mmm_load_tile.tmpliq  |   10 +
 .../arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl   |  131 ++
 .../arm64fp16/arm64fp16_tanh_f16_8n.tmpl      |  124 ++
 .../arm64/arm64fp16/dispatcher.tmpliq         |   37 +
 .../arm64/arm64fp16/dummy_fmla_no_pragma.S    |   13 +
 .../arm64/arm64fp16/dummy_fmla_pragma.S       |   13 +
 .../arm64simd/arm64simd_mmm_4s_per_col.tmpliq |   36 +
 .../arm64simd/arm64simd_mmm_4s_per_row.tmpliq |   25 +
 .../arm64simd/arm64simd_mmm_4s_scalar.tmpliq  |   18 +
 .../ldr_w_no_preload.tmpli                    |   69 +
 .../packed_packed_loop1/ldr_w_preload.tmpli   |   82 +
 .../packed_packed_loop1/ldr_x_preload.tmpli   |   60 +
 .../packed_packed_loop1/naive.tmpli           |   34 +
 .../packed_packed_loop2/cortex_a55.tmpli      |  107 ++
 .../arm64simd_mmm_f32_12x8_core.tmpl          |  163 ++
 .../packed_packed_loop1/cortex_a53.tmpli      |   45 +
 .../packed_packed_loop1/naive.tmpli           |   21 +
 .../packed_packed_loop2/cortex_a55.tmpli      |   73 +
 .../arm64simd_mmm_f32_16x4_core.tmpl          |  174 ++
 .../loop2/cortex_a55.tmpli                    |   73 +
 .../packed_packed_loop1/cortex_a53.tmpli      |   63 +
 .../packed_packed_loop1/cortex_a55.tmpli      |   53 +
 .../packed_packed_loop1/naive.tmpli           |   31 +
 .../arm64simd_mmm_f32_24x4_core.tmpl          |  185 ++
 .../arm64simd_mmm_f32_32x1_core.tmpl          |  403 ++++
 .../arm64simd_mmm_f32_32x3_core.tmpl          |  307 +++
 .../loop1/cortex_a53.tmpli                    |   65 +
 .../arm64simd_mmm_f32_64x1/loop1/naive.tmpli  |   32 +
 .../loop2/cortex_a55.tmpli                    |   85 +
 .../arm64simd_mmm_f32_64x1/loop2/naive.tmpli  |   66 +
 .../arm64simd_mmm_f32_64x1_core.tmpl          |  225 +++
 .../packed_packed_loop1/broken_chains.tmpli   |   25 +
 .../ldr_w_no_preload.tmpli                    |   51 +
 .../packed_packed_loop1/ldr_w_preload.tmpli   |   54 +
 .../ldr_x_no_preload.tmpli                    |   35 +
 .../packed_packed_loop1/ldr_x_preload.tmpli   |   43 +
 .../packed_packed_loop1/naive.tmpli           |   21 +
 .../packed_packed_loop2/broken_chains.tmpli   |   41 +
 .../packed_packed_loop2/cortex_a55.tmpli      |   60 +
 .../arm64simd/arm64simd_mmm_f32_8x8_core.tmpl |  182 ++
 .../arm64simd_mmm_f32_per_cols.tmpliq         |    9 +
 .../arm64simd_mmm_f32_per_rows.tmpliq         |    9 +
 .../arm64simd_mmm_f32_scalars.tmpliq          |   36 +
 .../arm64simd/arm64simd_mmm_i32_64x1.tmpl     |  180 ++
 .../arm64simd/arm64simd_mmm_i32_8x8.tmpl      |  234 +++
 .../arm64simd_mmm_i32_per_cols.tmpliq         |    8 +
 .../arm64simd_mmm_i32_per_rows.tmpliq         |    8 +
 .../arm64simd_mmm_i32_scalars.tmpliq          |   31 +
 .../arm64simd_mmm_i32_scale_q16_q31.tmpliq    |  267 +++
 .../arm64simd/arm64simd_mmm_load_tile.tmpliq  |   10 +
 .../arm64simd/arm64simd_sigmoid_f32_4n.tmpl   |  206 ++
 .../arm64simd/arm64simd_tanh_f32_4n.tmpl      |  198 ++
 .../arm64/arm64simd/dispatcher.tmpliq         |   37 +
 .../tract-linalg-0.22.1/benches/arm32neon.rs  |  179 ++
 vendor/tract-linalg-0.22.1/benches/arm64.rs   |   77 +
 .../tract-linalg-0.22.1/benches/arm64simd.rs  |  926 +++++++++
 vendor/tract-linalg-0.22.1/benches/intel.rs   |  200 ++
 .../tract-linalg-0.22.1/benches/leaky_relu.rs |   63 +
 vendor/tract-linalg-0.22.1/benches/mat_vec.rs |   46 +
 .../benches/mm_for_asr_am.rs                  |   37 +
 .../benches/mm_for_inception.rs               |   45 +
 .../benches/mm_for_wavenet_hw.rs              |   12 +
 vendor/tract-linalg-0.22.1/benches/sigmoid.rs |   22 +
 vendor/tract-linalg-0.22.1/benches/softmax.rs |  110 ++
 vendor/tract-linalg-0.22.1/benches/utils.rs   |   92 +
 .../benches/virtual_im2col.rs                 |   47 +
 vendor/tract-linalg-0.22.1/benches/x86_64.rs  |  242 +++
 vendor/tract-linalg-0.22.1/build.rs           |  374 ++++
 vendor/tract-linalg-0.22.1/src/arm32.rs       |  101 +
 .../src/arm32/armv7neon.rs                    |   46 +
 .../tract-linalg-0.22.1/src/arm32/armvfpv2.rs |   11 +
 .../src/arm32/cortex_a7.rs                    |   16 +
 .../src/arm32/cortex_a7.txt                   | 1701 +++++++++++++++++
 .../src/arm32/cortex_a9.rs                    |   16 +
 .../src/arm32/cortex_a9.txt                   | 1701 +++++++++++++++++
 vendor/tract-linalg-0.22.1/src/arm64.rs       |  383 ++++
 .../src/arm64/apple_amx.rs                    |   32 +
 .../src/arm64/arm64fp16.rs                    |   64 +
 .../src/arm64/arm64fp16/by_scalar.rs          |  258 +++
 .../src/arm64/arm64fp16/leaky_relu.rs         |   56 +
 .../src/arm64/arm64fp16/max.rs                |   63 +
 .../src/arm64/arm64fp16/panel_extract.rs      |   94 +
 .../src/arm64/arm64fp16/sum.rs                |   62 +
 .../src/arm64/arm64fp16/unicast.rs            |  271 +++
 .../src/arm64/arm64simd.rs                    |  117 ++
 .../src/arm64/arm64simd/by_scalar.rs          |  202 ++
 .../src/arm64/arm64simd/leaky_relu.rs         |   50 +
 .../src/arm64/arm64simd/max.rs                |   52 +
 .../src/arm64/arm64simd/panel_extract.rs      |   98 +
 .../src/arm64/arm64simd/softmax.rs            |  110 ++
 .../src/arm64/arm64simd/sum.rs                |   59 +
 .../src/arm64/arm64simd/unicast.rs            |  233 +++
 .../src/arm64/cortex_a53.rs                   |   16 +
 .../src/arm64/cortex_a55.rs                   |   16 +
 .../src/arm64/cortex_a72.rs                   |    4 +
 .../src/arm64/cortex_a73.rs                   |    4 +
 .../src/frame/block_quant/helpers.rs          |   65 +
 .../src/frame/block_quant/mod.rs              |  327 ++++
 .../src/frame/block_quant/q4_0.rs             |  509 +++++
 .../src/frame/block_quant/value.rs            |  116 ++
 .../src/frame/by_scalar.rs                    |   96 +
 .../src/frame/element_wise.rs                 |  165 ++
 .../src/frame/element_wise_helper.rs          |  169 ++
 .../src/frame/leaky_relu.rs                   |   65 +
 vendor/tract-linalg-0.22.1/src/frame/lut.rs   |  141 ++
 .../src/frame/mmm/cost_model.rs               |   86 +
 .../tract-linalg-0.22.1/src/frame/mmm/fuse.rs |  125 ++
 .../src/frame/mmm/input_store.rs              |  179 ++
 .../src/frame/mmm/kernel.rs                   |  159 ++
 .../src/frame/mmm/macros.rs                   |  124 ++
 .../tract-linalg-0.22.1/src/frame/mmm/mod.rs  |  307 +++
 .../src/frame/mmm/panel_extract.rs            |  300 +++
 .../src/frame/mmm/scratch.rs                  |  529 +++++
 .../src/frame/mmm/storage.rs                  |  139 ++
 .../src/frame/mmm/tests/frame.rs              |  295 +++
 .../src/frame/mmm/tests/fuse.rs               |  287 +++
 .../src/frame/mmm/tests/mod.rs                |   89 +
 .../src/frame/mmm/tests/packed_packed.rs      |  382 ++++
 .../src/frame/mmm/tests/q_scale.rs            |  176 ++
 .../src/frame/mmm/tests/store.rs              |  131 ++
 vendor/tract-linalg-0.22.1/src/frame/mod.rs   |   25 +
 vendor/tract-linalg-0.22.1/src/frame/pack.rs  | 1015 ++++++++++
 .../src/frame/reduce/max.rs                   |   42 +
 .../src/frame/reduce/mod.rs                   |  300 +++
 .../src/frame/reduce/softmax.rs               |   86 +
 .../src/frame/reduce/sum.rs                   |   54 +
 .../tract-linalg-0.22.1/src/frame/sigmoid.rs  |   96 +
 vendor/tract-linalg-0.22.1/src/frame/tanh.rs  |  101 +
 .../tract-linalg-0.22.1/src/frame/unicast.rs  |  233 +++
 .../tract-linalg-0.22.1/src/frame/weights.rs  |   80 +
 vendor/tract-linalg-0.22.1/src/generic.rs     |   55 +
 .../src/generic/by_scalar.rs                  |  181 ++
 vendor/tract-linalg-0.22.1/src/generic/erf.rs |   51 +
 .../src/generic/leaky_relu.rs                 |   74 +
 vendor/tract-linalg-0.22.1/src/generic/lut.rs |   47 +
 vendor/tract-linalg-0.22.1/src/generic/mmm.rs |  453 +++++
 .../tract-linalg-0.22.1/src/generic/reduce.rs |  187 ++
 .../src/generic/rounding.rs                   |  524 +++++
 .../src/generic/sigmoid.rs                    |  138 ++
 .../tract-linalg-0.22.1/src/generic/tanh.rs   |  133 ++
 .../src/generic/unicast.rs                    |  194 ++
 .../src/hwbench/bandwidth.rs                  |  159 ++
 vendor/tract-linalg-0.22.1/src/hwbench/mod.rs |    4 +
 .../tract-linalg-0.22.1/src/hwbench/runner.rs |  122 ++
 vendor/tract-linalg-0.22.1/src/lib.rs         |  404 ++++
 vendor/tract-linalg-0.22.1/src/multithread.rs |   57 +
 vendor/tract-linalg-0.22.1/src/wasm.rs        | 1664 ++++++++++++++++
 .../src/wasm.rs.before-fma                    | 1664 ++++++++++++++++
 .../tract-linalg-0.22.1/src/wasm.rs.with-8x4  | 1555 +++++++++++++++
 vendor/tract-linalg-0.22.1/src/x86_64_fma.rs  |   49 +
 .../src/x86_64_fma/by_scalar.rs               |   56 +
 .../src/x86_64_fma/intel.rs                   |    5 +
 .../tract-linalg-0.22.1/src/x86_64_fma/max.rs |   67 +
 .../tract-linalg-0.22.1/src/x86_64_fma/mmm.rs |  172 ++
 .../src/x86_64_fma/panel_extract.rs           |  136 ++
 .../src/x86_64_fma/softmax.rs                 |  121 ++
 .../tests/virtual_im2col.rs                   |  545 ++++++
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   59 +
 .../10x1/packed_packed_loop1/avx-512.tmpli    |   33 +
 .../1x1/packed_packed_loop1/avx-512.tmpli     |    7 +
 .../1x1/packed_packed_loop1/unroll-16.tmpli   |   68 +
 .../1x1/packed_packed_loop1/unroll-4.tmpli    |   24 +
 .../1x1/packed_packed_loop1/unroll-8.tmpli    |   29 +
 .../1x1/packed_packed_loop1/unroll.tmpli      |   11 +
 .../1x12/packed_packed_loop1/avx-512.tmpli    |   45 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   53 +
 .../2x5/packed_packed_loop1/avx-512.tmpli     |   30 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   71 +
 .../2x6/packed_packed_loop1/avx-512.tmpli     |   39 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   63 +
 .../3x4/packed_packed_loop1/avx-512.tmpli     |   35 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   69 +
 .../4x3/packed_packed_loop1/avx-512.tmpli     |   38 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   63 +
 .../5x2/packed_packed_loop1/avx-512.tmpli     |   34 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   25 +
 .../6x1/packed_packed_loop1/avx-512.tmpli     |   29 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   70 +
 .../6x2/packed_packed_loop1/avx-512.tmpli     |   38 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   40 +
 .../7x1/packed_packed_loop1/avx-512.tmpli     |   21 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   30 +
 .../8x1/packed_packed_loop1/avx-512.tmpli     |   25 +
 .../8x2/packed_packed_loop1/avx-512.tmpli     |   42 +
 .../packed_packed_loop1/avx-512-unroll.tmpli  |   61 +
 .../8x8/packed_packed_loop1/avx-512.tmpli     |   33 +
 .../x86_64/avx512/avx512_mmm_f32_128x1.tmpl   |  110 ++
 .../x86_64/avx512/avx512_mmm_f32_16x1.tmpl    |  143 ++
 .../x86_64/avx512/avx512_mmm_f32_16x12.tmpl   |  165 ++
 .../x86_64/avx512/avx512_mmm_f32_16x8.tmpl    |  143 ++
 .../x86_64/avx512/avx512_mmm_f32_32x5.tmpl    |  144 ++
 .../x86_64/avx512/avx512_mmm_f32_32x6.tmpl    |  161 ++
 .../x86_64/avx512/avx512_mmm_f32_48x4.tmpl    |  148 ++
 .../x86_64/avx512/avx512_mmm_f32_64x3.tmpl    |  149 ++
 .../x86_64/avx512/avx512_mmm_f32_80x2.tmpl    |  148 ++
 .../x86_64/avx512/avx512_mmm_load_tile.tmpliq |    9 +
 .../x86_64/avx512/dispatcher.tmpliq           |   40 +
 .../x86_64/avx512/f32_per_cols.tmpliq         |    8 +
 .../x86_64/avx512/f32_per_rows.tmpliq         |    8 +
 .../x86_64/avx512/f32_scalars.tmpliq          |   29 +
 .../x86_64/avx512/i32_per_cols.tmpliq         |    8 +
 .../x86_64/avx512/i32_per_rows.tmpliq         |    8 +
 .../x86_64/avx512/i32_scalars.tmpliq          |   10 +
 .../x86_64/avx512/postamble.tmpliq            |   38 +
 .../x86_64/avx512/preamble.tmpliq             |   63 +
 .../x86_64/avx512/sigmoid_f32.tmpl            |  324 ++++
 .../x86_64/avx512/tanh_f32.tmpl               |  313 +++
 .../x86_64/avx512/zmm_per_col.tmpliq          |   29 +
 .../x86_64/avx512/zmm_per_row.tmpliq          |   23 +
 .../x86_64/avx512/zmm_scalar.tmpliq           |   15 +
 .../10x1/packed_packed_loop1/avx-unroll.tmpli |   58 +
 .../fma/10x1/packed_packed_loop1/avx.tmpli    |   33 +
 .../2x5/packed_packed_loop1/avx-unroll.tmpli  |   52 +
 .../fma/2x5/packed_packed_loop1/avx.tmpli     |   30 +
 .../packed_packed_loop1/original-unroll.tmpli |   71 +
 .../2x6/packed_packed_loop1/original.tmpli    |   39 +
 .../3x4/packed_packed_loop1/avx-unroll.tmpli  |   60 +
 .../fma/3x4/packed_packed_loop1/avx.tmpli     |   32 +
 .../4x3/packed_packed_loop1/avx-unroll.tmpli  |   69 +
 .../fma/4x3/packed_packed_loop1/avx.tmpli     |   38 +
 .../5x2/packed_packed_loop1/avx-unroll.tmpli  |   63 +
 .../fma/5x2/packed_packed_loop1/avx.tmpli     |   34 +
 .../6x1/packed_packed_loop1/avx-unroll.tmpli  |   25 +
 .../fma/6x1/packed_packed_loop1/avx.tmpli     |   29 +
 .../6x2/packed_packed_loop1/avx-unroll.tmpli  |   70 +
 .../fma/6x2/packed_packed_loop1/avx.tmpli     |   38 +
 .../7x1/packed_packed_loop1/avx-unroll.tmpli  |   37 +
 .../fma/7x1/packed_packed_loop1/avx.tmpli     |   22 +
 .../8x1/packed_packed_loop1/avx-unroll.tmpli  |   48 +
 .../fma/8x1/packed_packed_loop1/avx.tmpli     |   33 +
 .../8x8/packed_packed_loop1/avx-unroll.tmpli  |   58 +
 .../fma/8x8/packed_packed_loop1/avx.tmpli     |   30 +
 .../x86_64/fma/avx2_mmm_i32_8x8.tmpl          |  682 +++++++
 .../x86_64/fma/dispatcher.tmpliq              |   40 +
 .../x86_64/fma/fma_mmm_f32_16x5.tmpl          |  143 ++
 .../x86_64/fma/fma_mmm_f32_16x6.tmpl          |  131 ++
 .../x86_64/fma/fma_mmm_f32_24x4.tmpl          |  158 ++
 .../x86_64/fma/fma_mmm_f32_32x1.tmpl          |  368 ++++
 .../x86_64/fma/fma_mmm_f32_32x3.tmpl          |  239 +++
 .../x86_64/fma/fma_mmm_f32_40x2.tmpl          |  158 ++
 .../x86_64/fma/fma_mmm_f32_64x1.tmpl          |  142 ++
 .../x86_64/fma/fma_mmm_f32_8x8.tmpl           |  129 ++
 .../x86_64/fma/fma_mmm_f32_per_cols.tmpliq    |    9 +
 .../x86_64/fma/fma_mmm_f32_per_rows.tmpliq    |    9 +
 .../x86_64/fma/fma_mmm_f32_scalars.tmpliq     |   38 +
 .../x86_64/fma/fma_mmm_i32_per_cols.tmpliq    |    9 +
 .../x86_64/fma/fma_mmm_i32_per_rows.tmpliq    |    9 +
 .../x86_64/fma/fma_mmm_i32_scalars.tmpliq     |   23 +
 .../x86_64/fma/fma_mmm_load_tile.tmpliq       |    9 +
 .../x86_64/fma/fma_mmm_ymm_per_col.tmpliq     |   35 +
 .../x86_64/fma/fma_mmm_ymm_per_row.tmpliq     |   32 +
 .../x86_64/fma/fma_mmm_ymm_scalar.tmpliq      |   22 +
 .../x86_64/fma/fma_sigmoid_f32.tmpl           |  319 ++++
 .../x86_64/fma/fma_tanh_f32.tmpl              |  313 +++
 .../x86_64/fma/postamble.tmpliq               |   38 +
 .../x86_64/fma/preamble.tmpliq                |   64 +
 313 files changed, 43672 insertions(+), 3 deletions(-)
 create mode 100644 vendor/tract-linalg-0.22.1/.cargo-ok
 create mode 100644 vendor/tract-linalg-0.22.1/.cargo_vcs_info.json
 create mode 100644 vendor/tract-linalg-0.22.1/Cargo.toml
 create mode 100644 vendor/tract-linalg-0.22.1/Cargo.toml.orig
 create mode 100644 vendor/tract-linalg-0.22.1/LICENSE
 create mode 100644 vendor/tract-linalg-0.22.1/LICENSE-APACHE
 create mode 100644 vendor/tract-linalg-0.22.1/LICENSE-MIT
 create mode 100644 vendor/tract-linalg-0.22.1/README.md
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_32x1_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x4_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x6_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_32x1.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_8x4.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scale_q8_q15.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_col.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_row.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_scalar.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_prefetch.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_sigmoid_f32_4n.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_tanh_f32_4n.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armv7neon/dispatcher.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm32/armvfpv2/dispatcher.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x1.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x32.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x1.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x32.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/dispatcher.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/apple_amx/instructions.rs
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_leaky_relu_f16_8n.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/cortex_a53.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop1/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop1/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x6.core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x1.core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x3.core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_load_tile.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_tanh_f16_8n.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/dispatcher.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_no_pragma.S
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_pragma.S
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_col.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_row.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_scalar.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/loop2/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x1_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x3_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8_core.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_64x1.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_8x8.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scale_q16_q31.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_load_tile.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_sigmoid_f32_4n.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_tanh_f32_4n.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/arm64/arm64simd/dispatcher.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/benches/arm32neon.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/arm64.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/arm64simd.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/intel.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/leaky_relu.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/mat_vec.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/mm_for_asr_am.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/mm_for_inception.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/mm_for_wavenet_hw.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/sigmoid.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/softmax.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/utils.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/virtual_im2col.rs
 create mode 100644 vendor/tract-linalg-0.22.1/benches/x86_64.rs
 create mode 100644 vendor/tract-linalg-0.22.1/build.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm32.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/armv7neon.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/armvfpv2.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.txt
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.txt
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/apple_amx.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/by_scalar.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/leaky_relu.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/max.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/panel_extract.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/sum.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/unicast.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/by_scalar.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/leaky_relu.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/max.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/panel_extract.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/softmax.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/sum.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/arm64simd/unicast.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/cortex_a53.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/cortex_a55.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/cortex_a72.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/arm64/cortex_a73.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/block_quant/helpers.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/block_quant/mod.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/block_quant/q4_0.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/block_quant/value.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/by_scalar.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/element_wise.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/element_wise_helper.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/leaky_relu.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/lut.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/cost_model.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/fuse.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/input_store.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/kernel.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/macros.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/mod.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/panel_extract.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/scratch.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/storage.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/frame.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/fuse.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/mod.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/packed_packed.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/q_scale.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mmm/tests/store.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/mod.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/pack.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/reduce/max.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/reduce/mod.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/reduce/softmax.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/reduce/sum.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/sigmoid.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/tanh.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/unicast.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/frame/weights.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/by_scalar.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/erf.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/leaky_relu.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/lut.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/mmm.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/reduce.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/rounding.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/sigmoid.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/tanh.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/generic/unicast.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/hwbench/bandwidth.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/hwbench/mod.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/hwbench/runner.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/lib.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/multithread.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/wasm.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/wasm.rs.before-fma
 create mode 100644 vendor/tract-linalg-0.22.1/src/wasm.rs.with-8x4
 create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/by_scalar.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/intel.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/max.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/mmm.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/panel_extract.rs
 create mode 100644 vendor/tract-linalg-0.22.1/src/x86_64_fma/softmax.rs
 create mode 100644 vendor/tract-linalg-0.22.1/tests/virtual_im2col.rs
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_128x1.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x1.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x12.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x8.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x5.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x6.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_48x4.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_64x3.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_80x2.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_load_tile.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/dispatcher.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/f32_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/i32_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/postamble.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/preamble.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/sigmoid_f32.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/tanh_f32.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_col.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_row.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_scalar.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx-unroll.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx.tmpli
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/avx2_mmm_i32_8x8.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/dispatcher.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x5.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x6.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_24x4.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x1.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x3.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_40x2.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_64x1.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_8x8.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_cols.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_rows.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_scalars.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_load_tile.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_col.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_row.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_scalar.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_sigmoid_f32.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/fma_tanh_f32.tmpl
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/postamble.tmpliq
 create mode 100644 vendor/tract-linalg-0.22.1/x86_64/fma/preamble.tmpliq

diff --git a/Cargo.toml b/Cargo.toml
index 1640e31c6..3702d226b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,9 +30,10 @@ panic = "abort"
 #   wasm_f32_4x4 (existing) + wasm_f32_4x1 / 8x1 / 16x1 (new GEMV variants)
 #   + wasm_f32_8x8 (new MM variant) + per-M dispatcher in Ops::mmv_f32.
 # Cumulative impact on DFN3: RTF 0.1290 -> 0.0516 (-60%, 2.5x faster), bit-identical audio.
-# Source:   https://github.com/czoli1976/tract/tree/add-wasm-f32-full-kernel-kit
-# Tracking: czoli1976/tract#2 (kernel kit), sonos/tract#2161 (upstream issue).
+# Source:   vendor/tract-linalg-0.22.1 (self-contained crate, version-matched 0.22.1).
+# Lineage:  cherry-picked from czoli1976/tract@kernel-kit-on-v0.22.1 — kernel kit
+#           from sonos/tract#2164 (merged) applied onto sonos/tract v0.22.1 tag.
 [patch.crates-io]
-tract-linalg = { git = "https://github.com/czoli1976/tract", rev = "d925624" }
+tract-linalg = { path = "vendor/tract-linalg-0.22.1" }
 
 
diff --git a/vendor/tract-linalg-0.22.1/.cargo-ok b/vendor/tract-linalg-0.22.1/.cargo-ok
new file mode 100644
index 000000000..5f8b79583
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/.cargo-ok
@@ -0,0 +1 @@
+{"v":1}
\ No newline at end of file
diff --git a/vendor/tract-linalg-0.22.1/.cargo_vcs_info.json b/vendor/tract-linalg-0.22.1/.cargo_vcs_info.json
new file mode 100644
index 000000000..9f37214c9
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/.cargo_vcs_info.json
@@ -0,0 +1,7 @@
+{
+  "git": {
+    "sha1": "88246b2a7e5b55df558a828b9c0c4815590620ce",
+    "dirty": true
+  },
+  "path_in_vcs": "linalg"
+}
\ No newline at end of file
diff --git a/vendor/tract-linalg-0.22.1/Cargo.toml b/vendor/tract-linalg-0.22.1/Cargo.toml
new file mode 100644
index 000000000..d678cb319
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/Cargo.toml
@@ -0,0 +1,224 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2024"
+name = "tract-linalg"
+version = "0.22.1"
+authors = ["Mathieu Poumeyrol <kali@zoy.org>"]
+build = "build.rs"
+autolib = false
+autobins = false
+autoexamples = false
+autotests = false
+autobenches = false
+description = "Tiny, no-nonsense, self contained, TensorFlow and ONNX inference"
+readme = "README.md"
+keywords = [
+    "TensorFlow",
+    "NeuralNetworks",
+]
+categories = ["science"]
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/snipsco/tract"
+resolver = "2"
+
+[badges.maintenance]
+status = "actively-developed"
+
+[features]
+apple-amx-ios = []
+complex = ["tract-data/complex"]
+default = []
+hwbench = ["rayon"]
+multithread-mm = ["rayon"]
+no_fp16 = []
+
+[lib]
+name = "tract_linalg"
+path = "src/lib.rs"
+
+[[test]]
+name = "virtual_im2col"
+path = "tests/virtual_im2col.rs"
+
+[[bench]]
+name = "arm32neon"
+path = "benches/arm32neon.rs"
+bench = false
+harness = false
+
+[[bench]]
+name = "arm64"
+path = "benches/arm64.rs"
+bench = false
+harness = false
+
+[[bench]]
+name = "arm64simd"
+path = "benches/arm64simd.rs"
+bench = false
+harness = false
+
+[[bench]]
+name = "intel"
+path = "benches/intel.rs"
+bench = false
+harness = false
+
+[[bench]]
+name = "leaky_relu"
+path = "benches/leaky_relu.rs"
+bench = false
+harness = false
+
+[[bench]]
+name = "mat_vec"
+path = "benches/mat_vec.rs"
+harness = false
+
+[[bench]]
+name = "mm_for_asr_am"
+path = "benches/mm_for_asr_am.rs"
+harness = false
+
+[[bench]]
+name = "mm_for_inception"
+path = "benches/mm_for_inception.rs"
+harness = false
+
+[[bench]]
+name = "mm_for_wavenet_hw"
+path = "benches/mm_for_wavenet_hw.rs"
+harness = false
+
+[[bench]]
+name = "sigmoid"
+path = "benches/sigmoid.rs"
+harness = false
+
+[[bench]]
+name = "softmax"
+path = "benches/softmax.rs"
+harness = false
+
+[[bench]]
+name = "virtual_im2col"
+path = "benches/virtual_im2col.rs"
+harness = false
+
+[[bench]]
+name = "x86_64"
+path = "benches/x86_64.rs"
+bench = false
+harness = false
+
+[dependencies.byteorder]
+version = "1.4.3"
+
+[dependencies.derive-new]
+version = "0.5.9"
+
+[dependencies.downcast-rs]
+version = "1.2.0"
+
+[dependencies.dyn-clone]
+version = "1.0.4"
+
+[dependencies.dyn-hash]
+version = "0.2"
+
+[dependencies.lazy_static]
+version = "1.5.0"
+
+[dependencies.log]
+version = "0.4.14"
+
+[dependencies.num-traits]
+version = "0.2.14"
+
+[dependencies.pastey]
+version = "0.1"
+
+[dependencies.rayon]
+version = "1.10"
+optional = true
+
+[dependencies.scan_fmt]
+version = "0.2.6"
+
+[dependencies.tract-data]
+version = "=0.22.1"
+
+[dev-dependencies.core_affinity]
+version = "0.8.0"
+
+[dev-dependencies.env_logger]
+version = "0.10"
+
+[dev-dependencies.libc]
+version = "0.2.164"
+
+[dev-dependencies.nu-ansi-term]
+version = "0.46"
+
+[build-dependencies.cc]
+version = "1.0.69"
+
+[build-dependencies.half]
+version = ">=2.4,<3.0"
+features = [
+    "std",
+    "num-traits",
+]
+
+[build-dependencies.liquid]
+version = "0.26.8"
+
+[build-dependencies.liquid-core]
+version = "0.26.8"
+
+[build-dependencies.liquid-derive]
+version = "0.26.8"
+
+[build-dependencies.smallvec]
+version = "1.6.1"
+
+[build-dependencies.time]
+version = "0.3.23"
+
+[build-dependencies.unicode-normalization]
+version = "0.1.19"
+
+[build-dependencies.walkdir]
+version = "2.3.2"
+
+[target.'cfg(not(target_family = "wasm"))'.dev-dependencies.criterion]
+version = "0.6"
+
+[target.'cfg(not(target_family = "wasm"))'.dev-dependencies.proptest]
+version = "1.0.0"
+
+[target.'cfg(target_family = "wasm")'.dev-dependencies.criterion]
+version = "0.6"
+features = [
+    "plotters",
+    "cargo_bench_support",
+]
+default-features = false
+
+[target.'cfg(target_family = "wasm")'.dev-dependencies.proptest]
+version = "1.0.0"
+features = [
+    "std",
+    "bit-set",
+]
+default-features = false
diff --git a/vendor/tract-linalg-0.22.1/Cargo.toml.orig b/vendor/tract-linalg-0.22.1/Cargo.toml.orig
new file mode 100644
index 000000000..5e7551008
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/Cargo.toml.orig
@@ -0,0 +1,125 @@
+[package]
+name = "tract-linalg"
+version = "0.22.1"
+license = "MIT OR Apache-2.0"
+authors = ["Mathieu Poumeyrol <kali@zoy.org>"]
+description = "Tiny, no-nonsense, self contained, TensorFlow and ONNX inference"
+repository = "https://github.com/snipsco/tract"
+keywords = ["TensorFlow", "NeuralNetworks"]
+categories = ["science"]
+autobenches = false
+edition = "2024"
+
+[badges]
+maintenance = { status = "actively-developed" }
+
+[dependencies]
+byteorder.workspace = true
+derive-new.workspace = true
+downcast-rs.workspace = true
+dyn-clone.workspace = true
+dyn-hash.workspace = true
+lazy_static.workspace = true
+log.workspace = true
+num-traits.workspace = true
+pastey.workspace = true
+rayon = { workspace = true, optional = true }
+scan_fmt.workspace = true
+tract-data.workspace = true
+
+[build-dependencies]
+cc.workspace = true
+half.workspace = true
+liquid.workspace = true
+liquid-core.workspace = true
+liquid-derive.workspace = true
+smallvec.workspace = true
+unicode-normalization.workspace = true
+time.workspace = true
+walkdir.workspace = true
+
+[dev-dependencies]
+env_logger.workspace = true
+libc.workspace = true
+nu-ansi-term.workspace = true
+core_affinity.workspace = true
+
+[target.'cfg(not(target_family = "wasm"))'.dev-dependencies]
+criterion.workspace = true
+proptest.workspace = true
+
+[target.'cfg(target_family = "wasm")'.dev-dependencies]
+# Wasm doesn't support the `rayon` feature of criterion
+criterion = { version = "0.6", default-features = false, features = ["plotters", "cargo_bench_support"] }
+# Wasm doesn't support the `fork` feature of proptest.
+proptest = { version = "1.0.0", default-features = false, features = ["std", "bit-set"] }
+
+[features]
+# This feature is meant to accomodate very restrictive / legacy toolchains that do
+# have support for fp16 instructions, breaking tract compilation.
+# It is not meant to be used in other situations, where run-time detection is
+# preferred.
+no_fp16 = []
+apple-amx-ios = []
+default = [ ]
+multithread-mm = [ "rayon" ]
+complex = [ "tract-data/complex" ]
+hwbench = [ "rayon" ]
+
+[[bench]]
+bench = false
+name = "arm64"
+harness = false
+
+[[bench]]
+name = "mat_vec"
+harness = false
+
+[[bench]]
+name = "mm_for_wavenet_hw"
+harness = false
+
+[[bench]]
+name = "mm_for_inception"
+harness = false
+
+[[bench]]
+name = "mm_for_asr_am"
+harness = false
+
+[[bench]]
+name = "sigmoid"
+harness = false
+
+[[bench]]
+name = "softmax"
+harness = false
+
+[[bench]]
+bench = false
+name = "arm64simd"
+harness = false
+
+[[bench]]
+bench = false
+name = "arm32neon"
+harness = false
+
+[[bench]]
+name = "virtual_im2col"
+harness = false
+
+[[bench]]
+bench = false
+name = "x86_64"
+harness = false
+
+[[bench]]
+bench = false
+name = "intel"
+harness = false
+
+[[bench]]
+bench = false
+name = "leaky_relu"
+harness = false
diff --git a/vendor/tract-linalg-0.22.1/LICENSE b/vendor/tract-linalg-0.22.1/LICENSE
new file mode 100644
index 000000000..09250ca89
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/LICENSE
@@ -0,0 +1,12 @@
+## License
+
+Licensed under either of
+ * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+ * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+at your option.
+
+### Contribution
+
+Unless you explicitly state otherwise, any contribution intentionally submitted
+for inclusion in the work by you, as defined in the Apache-2.0 license, shall
+be dual licensed as above, without any additional terms or conditions.
diff --git a/vendor/tract-linalg-0.22.1/LICENSE-APACHE b/vendor/tract-linalg-0.22.1/LICENSE-APACHE
new file mode 100644
index 000000000..16fe87b06
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/LICENSE-APACHE
@@ -0,0 +1,201 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+   To apply the Apache License to your work, attach the following
+   boilerplate notice, with the fields enclosed by brackets "[]"
+   replaced with your own identifying information. (Don't include
+   the brackets!)  The text should be enclosed in the appropriate
+   comment syntax for the file format. We also recommend that a
+   file or class name and description of purpose be included on the
+   same "printed page" as the copyright notice for easier
+   identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/vendor/tract-linalg-0.22.1/LICENSE-MIT b/vendor/tract-linalg-0.22.1/LICENSE-MIT
new file mode 100644
index 000000000..31aa79387
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/LICENSE-MIT
@@ -0,0 +1,23 @@
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/vendor/tract-linalg-0.22.1/README.md b/vendor/tract-linalg-0.22.1/README.md
new file mode 100644
index 000000000..ba7b722b0
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/README.md
@@ -0,0 +1,27 @@
+# tract-linalg
+
+linalg stands for "linear algebra". This is a misnamer. This crates contains
+low-level, architecture dependant optimisations used by tract-core.
+
+# Functions
+
+* MatMatMul: Extended matrix*matrix product:
+    * inspired by Gotoblass and BLIS micro kernel approach
+    * extended for convolution friendly addressing (fused img2col)
+    * fused output pipeline (min, max, and a few more simple, fast ops)
+    * f32*f32 -> f32 (à la sgemm)
+    * i8*i8 -> i32 accumulator -> i32 storage
+    * i8*i8 -> i32 accumulator -> i8 (with channel zeropoint and scale, and re-quantization pipeline)
+* f32 sigmoid and f32 tanh: at f32 precision, by a rationale function (no exponentiation)
+* byte-to-byte lookup table
+
+# Implementations
+
+|                   |  generic fallback  |   armv6, vfp  |     armv7 neon    |    armv8 simd     |     x64 FMA
+|-------------------|--------------------|---------------|-------------------|-------------------|-----------------
+| MatMatMul f32     |                    |      4x4      |         8x4       |       8x8         |       16x6
+| MatMatMul i8->i8  |                    |               |         8x4       |                   |        8x8
+| MatMatMul i8->i32 |                    |               |                   |                   |        8x8
+| sigmoid f32       |                    |               |         4n        |        4n         |
+| tanh f32          |                    |               |         4n        |        4n         |
+| byte lookup       |                    |               |                   |                   |
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_32x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_32x1_core.tmpl
new file mode 100644
index 000000000..d7b572e7b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_32x1_core.tmpl
@@ -0,0 +1,204 @@
+// vim: ft=arm
+
+// C tile regs
+//
+//      q8[0]
+//      q8[1]
+//      q8[2]
+//      q8[3]
+//
+//      ....
+//
+//      q15[0]
+//      q15[1]
+//      q15[2]
+//      q15[3]
+
+    .arm
+    .text
+    .global armv7neon_mmm_f32_32x1_{{core}}_{{suffix}}
+    .type armv7neon_mmm_f32_32x1_{{core}}_{{suffix}}, %function
+
+armv7neon_mmm_f32_32x1_{{core}}_{{suffix}}:
+
+    pld     [r0]
+    push    { r4-r12 }
+    vpush   { q4-q7 }
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+
+    cmp     r3, #0
+    beq     .non_linear_loop
+
+    mov     r1, r4 // packed A ptr
+    pld     [r3]
+    pld     [r5]
+
+    pld     [r1, #128]
+    pld     [r1, #192]
+    pld     [r1, #256]
+    pld     [r1, #320]
+    pld     [r1, #384]
+    pld     [r1, #448]
+    pld     [r1, #512]
+
+.packed_packed_loop_1:
+    pld     [r5]                           // packed B ptr
+
+{% if core == "cortexa7" %}
+
+    vldr            d0, [r1]
+    vldr            d1, [r1, #8]
+    vldr            d2, [r1, #16]
+    vldr            d3, [r1, #24]
+    vldr            d4, [r1, #32]
+    vldr            d5, [r1, #40]
+    vldr            d6, [r1, #48]
+    vldr            d7, [r1, #56]
+    vldr            d8, [r1, #64]
+    vldr            d9, [r1, #72]
+    vldr            d10, [r1, #80]
+    vldr            d11, [r1, #88]
+    vldr            s30, [r5]
+
+    pld             [r1, #512]
+    pld             [r1, #576]
+    pld             [r5, #64]
+
+    vmla.f32        q8, q0, d15[0]
+    vmla.f32        q9, q1, d15[0]
+
+    vldr            d0, [r1, #96]
+    vldr            d1, [r1, #104]
+    vldr            d2, [r1, #112]
+    vldr            d3, [r1, #120]
+
+    vmla.f32        q10, q2, d15[0]
+    vmla.f32        q11, q3, d15[0]
+
+    vmla.f32        q12, q4, d15[0]
+    vmla.f32        q13, q5, d15[0]
+
+    vmla.f32        q14, q0, d15[0]
+    vmla.f32        q15, q1, d15[0]
+
+    add             r1, #128
+    add             r5, #4
+
+{% elsif core == "cortexa9" %}
+
+    vld1.64         {d0-d3}, [r1]!
+    vld1.64         {d4-d7}, [r1]!
+    pld             [r1, #512]
+    pld             [r1, #576]
+    vld1.64         {d8-d11}, [r1]!
+    vld1.f32        d15[0], [r5]!
+    pld             [r5, #64]
+
+    vmla.f32        q8, q0, d15[0]
+    vmla.f32        q9, q1, d15[0]
+    vld1.64         {d0-d3}, [r1]!
+
+    vmla.f32        q10, q2, d15[0]
+    vmla.f32        q11, q3, d15[0]
+
+    vmla.f32        q12, q4, d15[0]
+    vmla.f32        q13, q5, d15[0]
+
+    vmla.f32        q14, q0, d15[0]
+    vmla.f32        q15, q1, d15[0]
+
+{% else %}
+
+    vldmia          r1!, { q0-q3 }
+    vldmia          r5!, { s30 }
+
+    vmla.f32        q8, q0, d15[0]
+    vmla.f32        q9, q1, d15[0]
+    vldmia          r1!, { q0-q1 }
+
+    vmla.f32        q10, q2, d15[0]
+    vmla.f32        q11, q3, d15[0]
+    vldmia          r1!, { q2-q3 }
+
+    vmla.f32        q12, q0, d15[0]
+    vmla.f32        q13, q1, d15[0]
+
+    vmla.f32        q14, q2, d15[0]
+    vmla.f32        q15, q3, d15[0]
+
+{% endif %}
+
+    subs            r3, r3, #1
+    bne .packed_packed_loop_1
+
+    b   .non_linear_loop
+
+{% include "armv7neon_mmm_f32_scalars.tmpliq" from:8, to:15 %}
+{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:32, from:8, to:15 %}
+{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:32, from:8, to:15 %}
+
+.add_unicast:
+    {% for reg in (0..15) %}
+        vld1.f32    d{{reg}}[0], [ r3 ], r4
+        vld1.f32    d{{reg}}[1], [ r3 ], r4
+    {% endfor %}
+    {% for reg in (0..7) %}
+        vadd.f32 q{{reg|plus:8}}, q{{reg|plus:8}}, q{{reg}}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.add_row_col_products:
+    vld1.f32        d0[0], [ r4 ]
+    vldmia          r3!, { q4-q7 }
+
+    vmla.f32        q8, q4, d0[0]
+    vmla.f32        q9, q5, d0[0]
+
+    vmla.f32        q10, q6, d0[0]
+    vmla.f32        q11, q7, d0[0]
+
+    vldmia          r3!, { q4-q7 }
+
+    vmla.f32        q12, q4, d0[0]
+    vmla.f32        q13, q5, d0[0]
+
+    vmla.f32        q14, q6, d0[0]
+    vmla.f32        q15, q7, d0[0]
+
+    b .non_linear_loop
+
+.store:
+    // r3, r4 <- ptr, rsc
+    cmp     r4, #4
+    bne     .store_generic
+
+    vst1.f64    {d16-d19}, [r3]!
+    vst1.f64    {d20-d23}, [r3]!
+    vst1.f64    {d24-d27}, [r3]!
+    vst1.f64    {d28-d31}, [r3]!
+
+    b .non_linear_loop
+
+.store_generic:
+
+    {% for reg in (16..31) %}
+        vst1.f32    d{{reg}}[0], [r3], r4
+        vst1.f32    d{{reg}}[1], [r3], r4
+    {% endfor %}
+
+    b .non_linear_loop
+
+.load_tile:
+    vldmia          r3!, { q8-q15 }
+    b .non_linear_loop
+
+.return:
+    vpop        { q4-q7 }
+    pop         { r4-r12 }
+
+    bx          lr
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.tmpl
new file mode 100644
index 000000000..93aa6f295
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x1_core.tmpl
@@ -0,0 +1,98 @@
+// vim: ft=arm
+
+    .arm
+    .text
+    .global armv7neon_mmm_f32_8x1_{{core}}_{{suffix}}
+    .type armv7neon_mmm_f32_8x1_{{core}}_{{suffix}}, %function
+
+armv7neon_mmm_f32_8x1_{{core}}_{{suffix}}:
+
+    pld     [r0]
+    push    { r4-r12 }
+    vpush   { q4-q7 }
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+
+    cmp     r3, #0
+    beq     .non_linear_loop
+
+    mov     r1, r4 // packed A ptr
+    pld     [r3]
+    pld     [r5]
+
+    pld     [r1, #128]
+    pld     [r1, #192]
+    pld     [r1, #256]
+    pld     [r1, #320]
+    pld     [r1, #384]
+    pld     [r1, #448]
+    pld     [r1, #512]
+
+.packed_packed_loop_1:
+    pld     [r5]                           // packed B ptr
+
+    vldmia          r1!, { q0-q1 }
+    vldmia          r5!, { s30 }
+
+    vmla.f32        q8, q0, d15[0]
+    vmla.f32        q9, q1, d15[0]
+
+    subs            r3, r3, #1
+    bne .packed_packed_loop_1
+
+    b   .non_linear_loop
+
+{% include "armv7neon_mmm_f32_scalars.tmpliq" from:8, to:9 %}
+{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:8, from:8, to:9 %}
+{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:8, from:8, to:9 %}
+
+.add_unicast:
+    {% for reg in (0..15) %}
+        vld1.f32    d{{reg}}[0], [ r3 ], r4
+        vld1.f32    d{{reg}}[1], [ r3 ], r4
+    {% endfor %}
+    {% for reg in (0..7) %}
+        vadd.f32 q{{reg|plus:8}}, q{{reg|plus:8}}, q{{reg}}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.add_row_col_products:
+    vld1.f32        d0[0], [ r4 ]
+    vldmia          r3!, { q4-q5 }
+
+    vmla.f32        q8, q4, d0[0]
+    vmla.f32        q9, q5, d0[0]
+
+    b .non_linear_loop
+
+.store:
+    // r3, r4 <- ptr, rsc
+    cmp     r4, #4
+    bne     .store_generic
+
+    vst1.f64    {d16-d19}, [r3]!
+
+    b .non_linear_loop
+
+.store_generic:
+
+    {% for reg in (16..19) %}
+        vst1.f32    d{{reg}}[0], [r3], r4
+        vst1.f32    d{{reg}}[1], [r3], r4
+    {% endfor %}
+
+    b .non_linear_loop
+
+.load_tile:
+    vldmia          r3!, { q8-q15 }
+    b .non_linear_loop
+
+.return:
+    vpop        { q4-q7 }
+    pop         { r4-r12 }
+
+    bx          lr
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x4_core.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x4_core.tmpl
new file mode 100644
index 000000000..9117e6aa6
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x4_core.tmpl
@@ -0,0 +1,143 @@
+// vim: ft=arm
+
+// C tile regs
+//
+//      q8[0]    q10[0]   q12[0]    q14[0]
+//      q8[1]    q10[1]   q12[1]    q14[1]
+//      q8[2]    q10[2]   q12[2]    q14[2]
+//      q8[3]    q10[3]   q12[3]    q14[3]
+//
+//      q9[0]    q11[0]   q13[0]    q15[0]
+//      q9[1]    q11[1]   q13[1]    q15[1]
+//      q9[2]    q11[2]   q13[2]    q15[2]
+//      q9[3]    q11[3]   q13[3]    q15[3]
+
+// packed A buffering (2x8 values): alternating q0, q1 with q2, q3
+// packed B buffering (2x4 values): alternating q4 with q5
+
+    .arm
+    .text
+    .global armv7neon_mmm_f32_8x4_{{core}}_{{suffix}}
+    .type armv7neon_mmm_f32_8x4_{{core}}_{{suffix}}, %function
+
+armv7neon_mmm_f32_8x4_{{core}}_{{suffix}}:
+    pld     [r0]
+    push    { r4-r12 }
+    vpush   { q4-q7 }
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+
+    cmp     r3, #0
+    beq     .non_linear_loop
+
+    mov     r1, r4 // packed A ptr
+    pld     [r3]
+    pld     [r5]
+
+    .packed_packed:
+    pld     [r5]                           // packed B ptr
+    .packed_packed_loop_1:
+
+{% if core == "cortexa7" %}
+    vldr            d0, [r1]
+    vldr            d1, [r1, #8]
+    vldr            d2, [r1, #16]
+    vldr            d3, [r1, #24]
+    vldr            d4, [r5]
+    vldr            d5, [r5, #8]
+{% elsif core == "cortexa9" %}
+    vld1.64         {d0-d3}, [r1]!
+    vld1.64         {d4, d5}, [r5]!
+{% else %}
+    vldmia          r1!, { q0, q1}
+    vldmia          r5!, { q2 }
+{% endif %}
+
+{% if core != "generic" %}
+    pld             [r1, #512]
+    pld             [r5, #512]
+{% endif %}
+
+    vmla.f32        q8, q0, d4[0]
+    vmla.f32        q9, q1, d4[0]
+
+    vmla.f32        q10, q0, d4[1]
+    vmla.f32        q11, q1, d4[1]
+
+    vmla.f32        q12, q0, d5[0]
+    vmla.f32        q13, q1, d5[0]
+
+    vmla.f32        q14, q0, d5[1]
+    vmla.f32        q15, q1, d5[1]
+
+{% if core == "cortexa7" %}
+    add             r1, #32
+    add             r5, #16
+{% endif %}
+
+    subs r3, r3, #1
+    bne .packed_packed_loop_1
+    b   .non_linear_loop
+
+{% include "armv7neon_mmm_f32_scalars.tmpliq" from:8, to:15 %}
+{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:8, from:8, to:15 %}
+{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:8, from:8, to:15 %}
+
+.add_unicast:
+    // r3, r4, r5 <- ptr, rsc, csc
+    {% for col in (0..3) %}
+        mov         r2, r3
+        {% for reg in (0..3) %}
+            vld1.f32    d0[0], [ r2 ], r4
+            vld1.f32    d0[1], [ r2 ], r4
+            vadd.f32    d{{col | times: 4 | plus: reg | plus : 16}}, d0
+        {% endfor %}
+        add r3, r3, r5
+    {% endfor %}
+
+    b .non_linear_loop
+
+.add_row_col_products:
+    vldmia          r3!, { q0, q1 }
+    vldmia          r4!, { q4 }
+
+    vmla.f32        q8, q0, d8[0]
+    vmla.f32        q9, q1, d8[0]
+
+    vmla.f32        q10, q0, d8[1]
+    vmla.f32        q11, q1, d8[1]
+
+    vmla.f32        q12, q0, d9[0]
+    vmla.f32        q13, q1, d9[0]
+
+    vmla.f32        q14, q0, d9[1]
+    vmla.f32        q15, q1, d9[1]
+
+    b .non_linear_loop
+
+.store:
+    // r3,r4,r5 are c,rsc,csc
+    {% for col in (0..3) %}
+        mov         r8, r3
+        {% for reg in (0..3) %}
+            vst1.f32    d{{col | times: 4 | plus: reg | plus : 16}}[0], [ r8 ], r4
+            vst1.f32    d{{col | times: 4 | plus: reg | plus : 16}}[1], [ r8 ], r4
+        {% endfor %}
+        {% if col < 3 %}
+            add r3, r3, r5
+        {% endif %}
+    {% endfor %}
+    b .non_linear_loop
+
+.load_tile:
+    vldmia          r3!, { q8-q15 }
+    b .non_linear_loop
+
+.return:
+    vpop        { q4-q7 }
+    pop         { r4-r12 }
+
+    bx          lr
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x6_core.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x6_core.tmpl
new file mode 100644
index 000000000..7baefd69d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_8x6_core.tmpl
@@ -0,0 +1,158 @@
+// vim: ft=arm
+
+    .arm
+    .text
+    .global armv7neon_mmm_f32_8x6_{{core}}_{{suffix}}
+    .type armv7neon_mmm_f32_8x6_{{core}}_{{suffix}}, %function
+
+armv7neon_mmm_f32_8x6_{{core}}_{{suffix}}:
+
+    pld     [r0]
+    push    { r4-r12 }
+    vpush   { q4-q7 }
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    cmp     r3, #0
+    beq     .non_linear_loop
+
+    mov     r1, r4 // packed A ptr
+    pld     [r3]
+    pld     [r5]
+
+    .packed_packed_loop_1:
+
+{% if core == "cortexa7" %}
+    vldr            d0, [r1]
+    vldr            d1, [r1, #8]
+    vldr            d2, [r1, #16]
+    vldr            d3, [r1, #24]
+    vldr            d4, [r5]
+    vldr            d5, [r5, #8]
+    vldr            d6, [r5, #16]
+{% elsif core == "cortexa9" %}
+    vld1.64         {d0-d3}, [r1]!
+    vld1.64         {d4, d5, d6}, [r5]!
+{% else %}
+    vldmia          r1!, {q0-q1}
+    vldmia          r5!, {d4-d6}
+{% endif %}
+
+{% if core != "generic" %}
+    pld             [r1, #512]
+    pld             [r5, #512]
+{% endif %}
+
+    vmla.f32        q4, q0, d4[0]
+    vmla.f32        q5, q1, d4[0]
+
+    vmla.f32        q6, q0, d4[1]
+    vmla.f32        q7, q1, d4[1]
+
+    vmla.f32        q8, q0, d5[0]
+    vmla.f32        q9, q1, d5[0]
+
+    vmla.f32        q10, q0, d5[1]
+    vmla.f32        q11, q1, d5[1]
+
+    vmla.f32        q12, q0, d6[0]
+    vmla.f32        q13, q1, d6[0]
+
+    vmla.f32        q14, q0, d6[1]
+    vmla.f32        q15, q1, d6[1]
+
+{% if core == "cortexa7" %}
+    add             r1, #32
+    add             r5, #24
+{% endif %}
+
+    subs r3, r3, #1
+    bne .packed_packed_loop_1
+    b   .non_linear_loop
+
+{% include "armv7neon_mmm_f32_scalars.tmpliq" from:4, to:15 %}
+{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:8, from:4, to:15 %}
+{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:8, from:4, to:15 %}
+
+.add_unicast:
+    //  r3, r4, r5, r6 <- ptr, rsc, csc, size
+    {% for col in (0..5) %}
+        mov         r2, r3
+        {% for reg in (0..3) %}
+            vld1.f32    d0[0], [ r2 ], r4
+            vld1.f32    d0[1], [ r2 ], r4
+            vadd.f32    d{{col | times: 4 | plus: reg | plus : 8}}, d0
+        {% endfor %}
+        add r3, r3, r5
+    {% endfor %}
+
+    b .non_linear_loop
+
+.add_row_col_products:
+    vldmia          r3!, { q0, q1 }
+    vldmia          r4!, { d4, d5, d6 }
+
+    vmla.f32        q4, q0, d4[0]
+    vmla.f32        q5, q1, d4[0]
+
+    vmla.f32        q6, q0, d4[1]
+    vmla.f32        q7, q1, d4[1]
+
+    vmla.f32        q8, q0, d5[0]
+    vmla.f32        q9, q1, d5[0]
+
+    vmla.f32        q10, q0, d5[1]
+    vmla.f32        q11, q1, d5[1]
+
+    vmla.f32        q12, q0, d6[0]
+    vmla.f32        q13, q1, d6[0]
+
+    vmla.f32        q14, q0, d6[1]
+    vmla.f32        q15, q1, d6[1]
+
+    b .non_linear_loop
+
+.store:
+    // r3, r4, r5 <- ptr, rsc, csc
+
+    cmp     r4, #4
+    bne     .store_generic
+
+    {% for col in (0..5) %}
+        mov         r8, r3
+        {% for reg in (0..3) %}
+            vst1.64     d{{col| times: 4 | plus: 8 | plus: reg}}, [ r8 ]!
+        {% endfor %}
+        {% if col < 5 %}
+            add r3, r3, r5
+        {% endif %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.store_generic:
+    {% for col in (0..5) %}
+        mov         r8, r3
+        {% for reg in (0..3) %}
+            vst1.f32    d{{col | times: 4 | plus: reg | plus : 8}}[0], [ r8 ], r4
+            vst1.f32    d{{col | times: 4 | plus: reg | plus : 8}}[1], [ r8 ], r4
+        {% endfor %}
+        {% if col < 5 %}
+            add r3, r3, r5
+        {% endif %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.load_tile:
+    vldmia          r3!, { q4-q7 }
+    vldmia          r3!, { q8-q15 }
+    b .non_linear_loop
+
+.return:
+    vpop        { q4-q7 }
+    pop         { r4-r12 }
+
+    bx          lr
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_cols.tmpliq
new file mode 100644
index 000000000..adc9b14ed
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_cols.tmpliq
@@ -0,0 +1,9 @@
+// vim: ft=arm
+
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_min", op:"vmin.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_max", op:"vmax.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_mul", op:"vmul.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_add", op:"vadd.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_sub", op:"vsub.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsub.f32", mr:mr, from:from, to:to, flipped: true%}
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_rows.tmpliq
new file mode 100644
index 000000000..64dd5ca8a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_per_rows.tmpliq
@@ -0,0 +1,9 @@
+// vim: ft=arm
+
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_min", op:"vmin.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_max", op:"vmax.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_mul", op:"vmul.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_add", op:"vadd.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_sub", op:"vsub.f32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsub.f32", mr:mr, from:from, to:to, flipped: true%}
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_scalars.tmpliq
new file mode 100644
index 000000000..352606371
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_f32_scalars.tmpliq
@@ -0,0 +1,24 @@
+// vim: ft=arm
+
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_min", op:"vmin.f32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_max", op:"vmax.f32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_mul", op:"vmul.f32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_add", op:"vadd.f32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_sub", op:"vsub.f32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsub.f32", from:from, to:to, flipped:true%}
+
+.leaky_relu:
+    vmov            s0, r3
+    vdup.32         q0, d0[0]
+    {% for reg in (from..to) %}
+        vmul.f32    q2, q{{reg}}, q0
+        vcgt.f32    q1, q{{reg}}, 0
+        vbsl        q1, q{{reg}}, q2
+        vmov        q{{reg}}, q1
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shl:
+.q_shr:
+.q_scale:
+    b .unsupported
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_32x1.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_32x1.tmpl
new file mode 100644
index 000000000..e176c48b3
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_32x1.tmpl
@@ -0,0 +1,174 @@
+// vim: ft=arm
+
+// C tile regs: q8..q16
+
+    .arm
+    .text
+    .global armv7neon_mmm_i32_32x1_{{suffix}}
+    .type armv7neon_mmm_i32_32x1_{{suffix}}, %function
+
+armv7neon_mmm_i32_32x1_{{suffix}}:
+
+    pld     [r0]
+    push    { r4-r12 }
+    vpush   { q4-q7 }
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    // r3 r4 r5 r6
+    // k  a  b  packing
+    cmp     r3, #0
+    beq     .non_linear_loop
+
+    mov     r1, r4 // packed A ptr
+    pld     [r3]
+    pld     [r7]
+
+    cmp     r6, #1
+    beq     .packed_packed_i8i8
+
+    .packed_packed:
+
+    .packed_packed_loop_1:
+    vldmia         r1!, { q4-q7 }
+
+    vld1.32        { d0[0] }, [ r5 ]!
+
+    vmla.s32       q8, q4, d0[0]
+
+    vldmia         r1!, { q1-q4 }
+
+    vmla.s32       q9, q5, d0[0]
+    vmla.s32       q10, q6, d0[0]
+    vmla.s32       q11, q7, d0[0]
+
+    vmla.s32       q12, q1, d0[0]
+    vmla.s32       q13, q2, d0[0]
+
+    vmla.s32       q14, q3, d0[0]
+    vmla.s32       q15, q4, d0[0]
+
+    subs r3, r3, #1
+    bne .packed_packed_loop_1
+    b   .non_linear_loop
+
+    .packed_packed_i8i8:
+
+    .packed_packed_loop_i8i8_1:
+    vldmia          r1!, { q4-q5 }
+
+    vld1.8          { d0[0] }, [ r5 ]!
+    vmovl.s8        q0, d0
+
+    vmovl.s8        q1, d8
+    vmlal.s16       q8, d2, d0[0]
+    vmlal.s16       q9, d3, d0[0]
+
+    vmovl.s8        q1, d9
+    vmlal.s16       q10, d2, d0[0]
+    vmlal.s16       q11, d3, d0[0]
+
+    vmovl.s8        q1, d10
+    vmlal.s16       q12, d2, d0[0]
+    vmlal.s16       q13, d3, d0[0]
+
+    vmovl.s8        q1, d11
+    vmlal.s16       q14, d2, d0[0]
+    vmlal.s16       q15, d3, d0[0]
+
+    subs r3, r3, #1
+    bne .packed_packed_loop_i8i8_1
+    b   .non_linear_loop
+
+{% include "armv7neon_mmm_i32_scalars.tmpliq" from:8, to:15 %}
+{% include "armv7neon_mmm_i32_per_rows.tmpliq" mr:32, from:8, to:15 %}
+{% include "armv7neon_mmm_i32_per_cols.tmpliq" mr:32, from:8, to:15 %}
+
+.add_unicast:
+    // r3, r4, r5, r6 <- ptr, rsc, csc, size
+
+    cmp     r6, #4
+    beq     .non_linear_addc_i32
+
+    {% for reg in (16..31) %}
+        vld1.s8     d0[0], [ r3 ], r4
+        vld1.s8     d0[1], [ r3 ], r4
+        vmovl.s8    q0, d0
+        vmovl.s16   q0, d0
+        vadd.i32    d{{reg}}, d0
+    {% endfor %}
+
+    b .non_linear_loop
+
+.non_linear_addc_i32:
+    {% for reg in (16..31) %}
+        vld1.s32    d0[0], [ r3 ], r4
+        vld1.s32    d0[1], [ r3 ], r4
+        vadd.i32    d{{reg}}, d0
+    {% endfor %}
+    b .non_linear_loop
+
+.add_row_col_products:
+    vldm    	r4, { s0 }
+
+    vldmia          r3!, { q4-q7 }
+
+    vmla.s32        q8, q4, d0[0]
+    vmla.s32        q9, q5, d0[0]
+
+    vmla.s32        q10, q6, d0[0]
+    vmla.s32        q11, q7, d0[0]
+
+    vldmia          r3!, { q4-q7 }
+
+    vmla.s32        q12, q4, d0[0]
+    vmla.s32        q13, q5, d0[0]
+
+    vmla.s32        q14, q6, d0[0]
+    vmla.s32        q15, q7, d0[0]
+
+    b .non_linear_loop
+
+    {% include "armv7neon_mmm_i32_scale_q8_q15.tmpliq" %}
+
+.store:
+    // r3, r4, r5, r6 <- ptr, rsc, csc, size
+    cmp     r6, #4
+    beq     .store_strides_i32
+
+    {% for reg in (8..15) %}
+        vmovn.s32 d{{reg | times: 2}}, q{{reg}}
+        vmovn.s16 d{{reg | times: 2}}, q{{reg}}
+    {% endfor %}
+    {% for reg in (8..15) %}
+        {%capture d%}{{reg | times: 2 }}{%endcapture%}
+        vst1.s8     d{{d}}[0], [ r3 ], r4
+        vst1.s8     d{{d}}[1], [ r3 ], r4
+        vst1.s8     d{{d}}[2], [ r3 ], r4
+        vst1.s8     d{{d}}[3], [ r3 ], r4
+    {% endfor %}
+
+    b .non_linear_loop
+
+.store_strides_i32:
+    {% for reg in (8..15) %}
+        {%capture d%}{{reg | times: 2}}{%endcapture%}
+        vst1.s32    d{{d}}[0], [ r3 ], r4
+        vst1.s32    d{{d}}[1], [ r3 ], r4
+        vst1.s32    d{{d|plus:1}}[0], [ r3 ], r4
+        vst1.s32    d{{d|plus:1}}[1], [ r3 ], r4
+    {% endfor %}
+
+    b .non_linear_loop
+
+.load_tile:
+    vldmia          r3!, { q8-q15 }
+    b .non_linear_loop
+
+.return:
+    vpop        { q4-q7 }
+    pop         { r4-r12 }
+
+    bx          lr
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_8x4.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_8x4.tmpl
new file mode 100644
index 000000000..a50f5a10a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_8x4.tmpl
@@ -0,0 +1,294 @@
+// vim: ft=arm
+
+// C tile regs
+// 
+//      q8[0]    q10[0]   q12[0]    q14[0]
+//      q8[1]    q10[1]   q12[1]    q14[1]
+//      q8[2]    q10[2]   q12[2]    q14[2]
+//      q8[3]    q10[3]   q12[3]    q14[3]
+//
+//      q9[0]    q11[0]   q13[0]    q15[0]
+//      q9[1]    q11[1]   q13[1]    q15[1]
+//      q9[2]    q11[2]   q13[2]    q15[2]
+//      q9[3]    q11[3]   q13[3]    q15[3]
+
+    .arm
+    .text
+    .global armv7neon_mmm_i32_8x4_{{suffix}}
+    .type armv7neon_mmm_i32_8x4_{{suffix}}, %function
+
+armv7neon_mmm_i32_8x4_{{suffix}}:
+
+    pld     [r0]
+    push    { r4-r12 }
+    vpush   { q4-q7 }
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    // r3 r4 r5 r6
+    // k  a  b  packing
+    cmp     r3, #0
+    beq     .non_linear_loop
+
+    mov     r1, r4 // packed A ptr
+    pld     [r3]
+    pld     [r5]
+
+    cmp     r6, #1
+    beq     .packed_packed_i8i8
+
+    .packed_packed_loop_1:
+
+    vldmia          r1!, { q0, q1 }
+    vldmia          r5!, { q2 }
+
+    vmla.s32       q8, q0, d4[0]
+    vmla.s32       q9, q1, d4[0]
+
+    vmla.s32       q10, q0, d4[1]
+    vmla.s32       q11, q1, d4[1]
+
+    vmla.s32       q12, q0, d5[0]
+    vmla.s32       q13, q1, d5[0]
+
+    vmla.s32       q14, q0, d5[1]
+    vmla.s32       q15, q1, d5[1]
+
+    subs r3, r3, #1
+    bne .packed_packed_loop_1
+
+    b   .non_linear_loop
+
+    .packed_packed_i8i8:
+    pld     [r5]                           // packed B ptr       
+
+    cmp r3, #4
+    blt .packed_packed_loop_i8i8_1
+
+    .packed_packed_loop_i8i8_4:
+    pld             [r1, #64]
+    pld             [r5, #64]
+
+    // q2: d4 -> d4,d5 A even cols (from r1)
+    // q3: d6 -> d6,d7 A odd cols (from r1)
+    // q0: s0 -> d0 : B even lines (from r5)
+    // q1: s4 -> d2 : B odd lines (from r5)
+
+    // 0
+    vldmia          r1!, { d4 }
+    vldmia          r5!, { s0 }
+
+    vmovl.s8        q2, d4
+    vmovl.s8        q0, d0
+
+    vmlal.s16       q8, d4, d0[0]
+    vmlal.s16       q9, d5, d0[0]
+
+    vldmia          r1!, { d6 }
+
+    vmlal.s16       q10, d4, d0[1]
+    vmlal.s16       q11, d5, d0[1]
+
+    vldmia          r5!, { s4 }
+
+    vmlal.s16       q12, d4, d0[2]
+    vmlal.s16       q13, d5, d0[2]
+
+    vmlal.s16       q14, d4, d0[3]
+    vmlal.s16       q15, d5, d0[3]
+
+    // 1
+    vmovl.s8        q3, d6
+    vmovl.s8        q1, d2
+
+    vmlal.s16       q8, d6, d2[0]
+    vldmia          r1!, { d4 }
+    vmlal.s16       q9, d7, d2[0]
+    vldmia          r5!, { s0 }
+
+    vmlal.s16       q10, d6, d2[1]
+    vmlal.s16       q11, d7, d2[1]
+
+    vmlal.s16       q12, d6, d2[2]
+    vmlal.s16       q13, d7, d2[2]
+
+    vmlal.s16       q14, d6, d2[3]
+    vmlal.s16       q15, d7, d2[3]
+
+    // 2
+    vmovl.s8        q2, d4
+    vmovl.s8        q0, d0
+
+    vmlal.s16       q8, d4, d0[0]
+    vmlal.s16       q9, d5, d0[0]
+
+    vldmia          r1!, { d6 }
+
+    vmlal.s16       q10, d4, d0[1]
+    vmlal.s16       q11, d5, d0[1]
+
+    vldmia          r5!, { s4 }
+
+    vmlal.s16       q12, d4, d0[2]
+    vmlal.s16       q13, d5, d0[2]
+
+    vmlal.s16       q14, d4, d0[3]
+    vmlal.s16       q15, d5, d0[3]
+
+    // 3
+    vmovl.s8        q3, d6
+    vmovl.s8        q1, d2
+
+    vmlal.s16       q8, d6, d2[0]
+    vmlal.s16       q9, d7, d2[0]
+
+    vmlal.s16       q10, d6, d2[1]
+    vmlal.s16       q11, d7, d2[1]
+
+    vmlal.s16       q12, d6, d2[2]
+    vmlal.s16       q13, d7, d2[2]
+
+    vmlal.s16       q14, d6, d2[3]
+    vmlal.s16       q15, d7, d2[3]
+
+    sub r3, r3, #4
+    cmp r3, #4
+    bge .packed_packed_loop_i8i8_4
+
+    cmp r3, #0
+    beq .non_linear_loop
+
+    .packed_packed_loop_i8i8_1:
+
+    vldmia          r1!, { s0, s1 }
+    vmovl.s8        q0, d0
+    vldmia          r5!, { s4 }
+    vmovl.s8        q1, d2
+
+    vmlal.s16       q8, d0, d2[0]
+    vmlal.s16       q9, d1, d2[0]
+
+    vmlal.s16       q10, d0, d2[1]
+    vmlal.s16       q11, d1, d2[1]
+
+    vmlal.s16       q12, d0, d2[2]
+    vmlal.s16       q13, d1, d2[2]
+
+    vmlal.s16       q14, d0, d2[3]
+    vmlal.s16       q15, d1, d2[3]
+
+    subs r3, r3, #1
+    bne .packed_packed_loop_i8i8_1
+    b   .non_linear_loop
+
+{% include "armv7neon_mmm_i32_scalars.tmpliq" from:8, to:15 %}
+{% include "armv7neon_mmm_i32_per_rows.tmpliq" mr:8, from:8, to:15 %}
+{% include "armv7neon_mmm_i32_per_cols.tmpliq" mr:8, from:8, to:15 %}
+
+.add_unicast:
+    // r3, r4, r5, r6 <- ptr, rsc, csc, size
+    cmp     r6, #4
+    beq     .non_linear_addc_i32
+
+    {% for col in (0..3) %}
+        mov         r8, r3
+        {% for reg in (0..3) %}
+            vld1.s8     d0[0], [ r8 ], r4
+            vld1.s8     d0[1], [ r8 ], r4
+            vmovl.s8    q0, d0
+            vmovl.s16   q0, d0
+            vadd.i32    d{{col | times: 4 | plus: reg | plus : 16}}, d0
+        {% endfor %}
+        add r3, r3, r5
+    {% endfor %}
+
+    b .non_linear_loop
+
+.non_linear_addc_i32:
+
+    {% for col in (0..3) %}
+        mov         r8, r3
+        {% for reg in (0..3) %}
+            vld1.s32    d0[0], [ r8 ], r4
+            vld1.s32    d0[1], [ r8 ], r4
+            vadd.i32    d{{col | times: 4 | plus: reg | plus : 16}}, d0
+        {% endfor %}
+        {% if col < 3 %}
+            add r3, r3, r5
+        {% endif %}
+    {% endfor %}
+
+b .non_linear_loop
+
+.add_row_col_products:
+    vldmia          r3!, { q0, q1 }
+    vldmia          r4!, { q4 }
+
+    vmla.s32        q8, q0, d8[0]
+    vmla.s32        q9, q1, d8[0]
+
+    vmla.s32        q10, q0, d8[1]
+    vmla.s32        q11, q1, d8[1]
+
+    vmla.s32        q12, q0, d9[0]
+    vmla.s32        q13, q1, d9[0]
+
+    vmla.s32        q14, q0, d9[1]
+    vmla.s32        q15, q1, d9[1]
+
+    b .non_linear_loop
+
+    {% include "armv7neon_mmm_i32_scale_q8_q15.tmpliq" %}
+
+.store:
+    // r3, r4, r5, r6 <- ptr, rsc, csc, size
+    cmp     r6, #4
+    beq     .store_strides_i32
+
+    {% for reg in (8..15) %}
+        vmovn.s32 d{{reg | times: 2}}, q{{reg}}
+        vmovn.s16 d{{reg | times: 2}}, q{{reg}}
+    {% endfor %}
+    {% for col in (0..3) %}
+        mov         r8, r3
+        {% for reg in (0..1) %}
+            {%capture d%}{{col | times: 2 | plus: reg | times: 2 | plus: 16}}{%endcapture%}
+            vst1.s8     d{{d}}[0], [ r8 ], r4
+            vst1.s8     d{{d}}[1], [ r8 ], r4
+            vst1.s8     d{{d}}[2], [ r8 ], r4
+            vst1.s8     d{{d}}[3], [ r8 ], r4
+        {% endfor %}
+        {% if col < 3 %}
+            add r3, r3, r5
+        {% endif %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.store_strides_i32:
+
+    {% for col in (0..3) %}
+        mov         r8, r3
+        {% for reg in (0..3) %}
+            {% for lane in (0..1) %}
+                vst1.s32     d{{col | times: 4 | plus: reg | plus: 16}}[{{lane}}], [ r8 ], r4
+            {% endfor %}
+        {% endfor %}
+        {% if col < 3 %}
+            add r3, r3, r5
+        {% endif %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.load_tile:
+    vldmia          r3!, { q8-q15 }
+    b .non_linear_loop
+
+.return:
+    vpop        { q4-q7 }
+    pop         { r4-r12 }
+
+    bx          lr
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_cols.tmpliq
new file mode 100644
index 000000000..3beef2095
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_cols.tmpliq
@@ -0,0 +1,8 @@
+// vim: ft=arm
+
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_min", op:"vmin.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_max", op:"vmax.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_mul", op:"vmul.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_add", op:"vadd.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_sub", op:"vsub.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsub.s32", mr:mr, from:from, to:to, flipped:true%}
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_rows.tmpliq
new file mode 100644
index 000000000..f0739b31c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_per_rows.tmpliq
@@ -0,0 +1,8 @@
+// vim: ft=arm
+
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_min", op:"vmin.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_max", op:"vmax.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_mul", op:"vmul.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_add", op:"vadd.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_sub", op:"vsub.s32", mr:mr, from:from, to:to %}
+{% include "armv7neon_mmm_q_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsub.s32", mr:mr, from:from, to:to, flipped:true%}
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scalars.tmpliq
new file mode 100644
index 000000000..7c3053d5a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scalars.tmpliq
@@ -0,0 +1,20 @@
+// vim: ft=arm
+
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_min", op:"vmin.s32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_max", op:"vmax.s32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_mul", op:"vmul.s32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_add", op:"vadd.s32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_sub", op:"vsub.s32", from:from, to:to%}
+{% include "armv7neon_mmm_q_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsub.s32", from:from, to:to, flipped:true%}
+
+.leaky_relu:
+    vmov            s0, r3
+    vdup.32         q0, d0[0]
+    {% for reg in (from..to) %}
+        vmul.s32    q2, q{{reg}}, q0
+        vcgt.s32    q1, q{{reg}}, 0
+        vbsl        q1, q{{reg}}, q2
+        vmov        q{{reg}}, q1
+    {% endfor %}
+    b .non_linear_loop
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scale_q8_q15.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scale_q8_q15.tmpliq
new file mode 100644
index 000000000..f594928ae
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_i32_scale_q8_q15.tmpliq
@@ -0,0 +1,232 @@
+// vim: ft=arm
+
+.q_scale:
+    ldm         r0, { r4, r5, r6, r7 }      // fixme params are already loaded by disp.
+    vdup.s32    q0, r7                      // q0 <- multiplier
+
+    mov         r3, #1
+    vdup.s32    q1, r3                      // q1 <- ones
+    vmovl.s32   q1, d2
+
+    add         r5, #32
+    neg         r5, r5
+    vdup.s32    q2, r5                      // q2 <- -(shift + 32)
+    vmovl.s32   q2, d4
+
+    cmp     r6, #1
+    beq     .q_scale_rounding_zero
+    cmp     r6, #2
+    beq     .q_scale_rounding_away
+    cmp     r6, #3
+    beq     .q_scale_rounding_minus_inf
+    cmp     r6, #4
+    beq     .q_scale_rounding_plus_inf
+    cmp     r6, #5
+    beq     .q_scale_rounding_even
+    cmp     r6, #6
+    beq     .q_scale_rounding_odd
+
+    b .unsupported
+
+.q_scale_rounding_zero:
+    {% for q in (8..15) %}
+        vclt.s32    q7, q{{q}}, #0
+        vabs.s32    q{{q}}, q{{q}}
+        vqdmull.s32 q5, d{{q | times:2}}, d0[0]
+        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]
+        vsub.s64    q5, q1
+        vsub.s64    q6, q1
+        vqrshl.s64  q5, q2
+        vqrshl.s64  q6, q2
+        vmovn.s64   d{{q | times:2}}, q5
+        vmovn.s64   d{{q | times:2 | plus: 1}}, q6
+        vneg.s32    q5, q{{q}}
+        vbit.s32    q{{q}}, q5, q7
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_away:
+    {% for q in (8..15) %}
+        vclt.s32    q7, q{{q}}, #0
+        vabs.s32    q{{q}}, q{{q}}
+        vqdmull.s32 q5, d{{q | times:2}}, d0[0]
+        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]
+        vqrshl.s64  q5, q2
+        vqrshl.s64  q6, q2
+        vmovn.s64   d{{q | times:2}}, q5
+        vmovn.s64   d{{q | times:2 | plus: 1}}, q6
+        vneg.s32    q5, q{{q}}
+        vbit.s32    q{{q}}, q5, q7
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_minus_inf:
+    {% for q in (8..15) %}
+        vqdmull.s32 q5, d{{q | times:2}}, d0[0]
+        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]
+        vsub.s64    q5, q1
+        vsub.s64    q6, q1
+        vqrshl.s64  q5, q2
+        vqrshl.s64  q6, q2
+        vmovn.s64   d{{q | times:2}}, q5
+        vmovn.s64   d{{q | times:2 | plus: 1}}, q6
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_plus_inf:
+    {% for q in (8..15) %}
+        vqdmull.s32 q5, d{{q | times:2}}, d0[0]
+        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]
+        vqrshl.s64  q5, q2
+        vqrshl.s64  q6, q2
+        vmovn.s64   d{{q | times:2}}, q5
+        vmovn.s64   d{{q | times:2 | plus: 1}}, q6
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_even:
+    {% for q in (8..15) %}
+        vclt.s32    q7, q{{q}}, #0
+        vabs.s32    q{{q}}, q{{q}}
+        vqdmull.s32 q5, d{{q | times:2}}, d0[0]
+        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]
+        vqshl.s64   q3, q5, q2
+        vqshl.s64   q4, q6, q2
+        vand        q3, q3, q1
+        vand        q4, q4, q1
+        vsub.s64    q3, q3, q1
+        vsub.s64    q4, q4, q1
+        vadd.s64    q5, q3
+        vadd.s64    q6, q4
+        vqrshl.s64  q5, q2
+        vqrshl.s64  q6, q2
+        vmovn.s64   d{{q | times:2}}, q5
+        vmovn.s64   d{{q | times:2 | plus: 1}}, q6
+        vneg.s32    q5, q{{q}}
+        vbit.s32    q{{q}}, q5, q7
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_odd:
+    {% for q in (8..15) %}
+        vclt.s32    q7, q{{q}}, #0
+        vabs.s32    q{{q}}, q{{q}}
+        vqdmull.s32 q5, d{{q | times:2}}, d0[0]
+        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]
+        vqshl.s64   q3, q5, q2
+        vqshl.s64   q4, q6, q2
+        vand        q3, q3, q1
+        vand        q4, q4, q1
+        vsub.s64    q5, q3
+        vsub.s64    q6, q4
+        vqrshl.s64  q5, q2
+        vqrshl.s64  q6, q2
+        vmovn.s64   d{{q | times:2}}, q5
+        vmovn.s64   d{{q | times:2 | plus: 1}}, q6
+        vneg.s32    q5, q{{q}}
+        vbit.s32    q{{q}}, q5, q7
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_shl:
+    ldm         r0, { r4, r5 }      // fixme params are already loaded by disp.
+    vdup.s32    q2, r5              // q2 <- shift
+
+    {% for q in (8..15) %}
+        vqrshl.s32  q{{q}}, q2      // Shift
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_shr:
+    ldm         r0, { r4, r5, r6 }      // fixme params are already loaded by disp.
+
+    mov         r3, #1
+    vdup.s32    q1, r3                      // q1 <- ones
+
+    neg         r5, r5
+    vdup.s32    q2, r5                      // q2 <- shift
+
+    cmp     r6, #1
+    beq     .q_shr_rounding_zero
+    cmp     r6, #2
+    beq     .q_shr_rounding_away
+    cmp     r6, #3
+    beq     .q_shr_rounding_minus_inf
+    cmp     r6, #4
+    beq     .q_shr_rounding_plus_inf
+    cmp     r6, #5
+    beq     .q_shr_rounding_even
+    cmp     r6, #6
+    beq     .q_shr_rounding_odd
+
+    b .unsupported
+
+.q_shr_rounding_zero:
+    // return signum(x) * ((abs(x) - 1) >>r shift )
+    {% for q in (8..15) %}
+        vclt.s32    q3, q{{q}}, #0  // Store the sign of the value
+        vabs.s32    q{{q}}, q{{q}}  // Compute their abs
+        vsub.s32    q{{q}}, q1      // Substract 1 to abs(x)
+        vqrshl.s32  q{{q}}, q2      // Rounding shift (0.5 -> 1)
+        vneg.s32    q4, q{{q}}      // Compute -((abs(x) - 1) >>r shift )
+        vbit.s32    q{{q}}, q4, q3  // Restore sign of x with bit mask
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_away:
+    // return signum(x) * (abs(x) >>r shift )
+    {% for q in (8..15) %}
+        vclt.s32    q3, q{{q}}, #0  // Store the sign of the value
+        vabs.s32    q{{q}}, q{{q}}  // Compute their abs
+        vqrshl.s32  q{{q}}, q2      // Rounding shift (0.5 -> 1)
+        vneg.s32    q4, q{{q}}      // Compute -(abs(x) >>r shift )
+        vbit.s32    q{{q}}, q4, q3  // Restore sign of x with bit mask
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_minus_inf:
+    // return -(-x >>r shift)
+    {% for q in (8..15) %}
+        vneg.s32    q3, q{{q}}      // Compute -x
+        vqrshl.s32  q3, q2          // Rounding shift (0.5 -> 1)
+        vneg.s32    q{{q}}, q3      // Compute -(-x >>r shift)
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_plus_inf:
+    // return x >>r shift
+    {% for q in (8..15) %}
+        vqrshl.s32  q{{q}}, q2      // Rounding shift (0.5 -> 1)
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_even:
+    // If (x >> shift) is odd -> (x - 0) >>r shift
+    // If (x >> shift) is even -> (x - 1) >>r shift
+    {% for q in (8..15) %}
+        vqshl.s32   q3, q{{q}}, q2      // Truncate shift (0.5 -> 0)
+        vand.s32    q4, q3, q1          // Store if x is odd
+        vsub.s32    q5, q4, q1          // If (x >> shift) is odd 0 else -1
+        vadd.s32   q{{q}}, q{{q}}, q5   // If (x >> shift) is odd (x - 0) else (x - 1)
+        vqrshl.s32 q{{q}}, q2           // Rounding shift (0.5 -> 1)
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_odd:
+    // If (x >> shift) is even -> (x - 0) >>r shift
+    // If (x >> shift) is odd -> (x - 1) >>r shift
+    {% for q in (8..15) %}
+        vqshl.s32   q3, q{{q}}, q2      // Truncate shift (0.5 -> 0)
+        vand.s32    q4, q3, q1          // Store if x >> shift is odd
+        vneg.s32    q5, q4              // If x is odd -1 else 0
+        vadd.s32   q{{q}}, q{{q}}, q5   // If x is odd (x - 1) else (x - 0)
+        vqrshl.s32 q{{q}}, q2           // Rounding shift (0.5 -> 1)
+    {% endfor %}
+    b .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_col.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_col.tmpliq
new file mode 100644
index 000000000..769d290cf
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_col.tmpliq
@@ -0,0 +1,33 @@
+// vim: ft=arm
+
+.{{label}}:
+
+{% capture mr_over_4 %}{{ mr | divided_by: 4}}{%endcapture%}
+{% capture mr_over_4_min_1 %}{{ mr | divided_by: 4 | minus: 1}}{%endcapture%}
+
+{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_4}}{%endcapture%}
+{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_4|minus:1}}{%endcapture%}
+
+{% if cols == "1" %}
+    vld1.f32        d0[0], [ r3 ]
+{% else %}
+    {%capture cols_over_2_minus_1%}{{cols | divided_by:2 | minus:1}}{%endcapture%}
+    {% for c in (0..cols_over_2_minus_1) %}
+        vldmia      r3!, { d{{c}} }
+    {% endfor %}
+{% endif %}
+
+
+{% for right in (0..cols_min_1) %}
+    vdup.f32 q3, d{{right|divided_by:2}}[{{right| modulo:2}}]
+    {% for down in (0..mr_over_4_min_1) %}
+        {%capture acc%}{{mr_over_4|times:right|plus:from|plus:down}}{%endcapture%}
+        {% if flipped %}
+            {{op}} q{{acc}}, q{{acc}}, q3
+        {% else %}
+            {{op}} q{{acc}}, q3, q{{acc}}
+        {% endif %}
+    {% endfor %}
+{% endfor %}
+
+    b .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_row.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_row.tmpliq
new file mode 100644
index 000000000..a0f2d40c6
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_per_row.tmpliq
@@ -0,0 +1,24 @@
+// vim: ft=arm
+
+.{{label}}:
+
+{% capture mr_over_4 %}{{ mr | divided_by: 4}}{%endcapture%}
+{% capture mr_over_4_min_1 %}{{ mr | divided_by: 4 | minus: 1}}{%endcapture%}
+
+{% for reg in (0..mr_over_4_min_1) %}
+    vldmia         r3!, { q{{reg}} }
+{% endfor %}
+
+{% if flipped %}
+    {% for acc in (from..to) %}
+        {% capture other%}{{acc | minus: from | modulo: mr_over_4}}{%endcapture%}
+        {{op}} q{{acc}}, q{{acc}}, q{{other}}
+    {% endfor %}
+{% else %}
+    {% for acc in (from..to) %}
+        {% capture other%}{{acc | minus: from | modulo: mr_over_4}}{%endcapture%}
+        {{op}} q{{acc}}, q{{other}}, q{{acc}}
+    {% endfor %}
+{% endif %}
+
+b           .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_scalar.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_scalar.tmpliq
new file mode 100644
index 000000000..4f135b415
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_mmm_q_scalar.tmpliq
@@ -0,0 +1,15 @@
+// vim: ft=arm
+
+.{{label}}:
+    vmov            s0, r3
+    vdup.32         q0, d0[0]
+    {% if flipped %}
+        {% for reg in (from..to) %}
+            {{op}}    q{{reg}}, q{{reg}}, q0
+        {% endfor %}
+    {% else %}
+        {% for reg in (from..to) %}
+            {{op}}    q{{reg}}, q0, q{{reg}}
+        {% endfor %}
+    {% endif %}
+    b .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_prefetch.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_prefetch.tmpl
new file mode 100644
index 000000000..d153e66b8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_prefetch.tmpl
@@ -0,0 +1,22 @@
+// vim: ft=arm
+
+.arm
+.text
+.global armv7neon_prefetch_{{suffix}}
+.type armv7neon_prefetch_{{suffix}}, %function
+
+armv7neon_prefetch_{{suffix}}:
+loop:
+    pld     [r0]
+    pld     [r0, #32]
+    pld     [r0, #64]
+    pld     [r0, #96]
+    pld     [r0, #128]
+    pld     [r0, #160]
+    pld     [r0, #192]
+    pld     [r0, #224]
+    add     r0, r0, #256
+    cmp     r0, r1
+    blt     loop
+
+    bx      lr
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_sigmoid_f32_4n.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_sigmoid_f32_4n.tmpl
new file mode 100644
index 000000000..baa5072ae
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_sigmoid_f32_4n.tmpl
@@ -0,0 +1,215 @@
+// vim: ft=arm
+
+    .arm
+    .text
+    .global armv7neon_sigmoid_f32_4n_{{suffix}}
+    .type armv7neon_sigmoid_f32_4n_{{suffix}}, %function
+
+/*
+    s16–s31 (d8–d15, q4–q7) must be preserved
+    s0–s15 (d0–d7, q0–q3) and d16–d31 (q8–q15) do not need to be preserved
+*/
+
+armv7neon_sigmoid_f32_4n_{{suffix}}:
+    cmp         r1, #0
+    blxeq       lr
+
+    vpush       { q4-q7 }
+
+    adr         r2, .coeffs_num
+    vldmia      r2!, { s0-s13 }
+
+// q4 -> q4,5,6
+// q5 -> q7,8,9
+// q6 -> q10,11,12
+// q7 -> q13,14,15
+
+
+    cmp         r1, #12
+    blt         .loop
+
+.loop_3:
+    vldmia      r0, { q4, q5, q6 }         // q4 <- x
+
+    vdup.32     q15, d0[0]
+    vmax.f32    q4, q15
+    vmax.f32    q5, q15
+    vmax.f32    q6, q15
+    vdup.32     q15, d0[1]
+    vmin.f32    q4, q15
+    vmin.f32    q5, q15
+    vmin.f32    q6, q15
+
+    vmul.f32    q7, q4, q4          // q7 <- x2
+    vmul.f32    q8, q5, q5
+    vmul.f32    q9, q6, q6
+
+    vdup.32     q10, d1[0]
+    vdup.32     q11, d1[0]
+    vdup.32     q12, d1[0]
+    vdup.32     q13, d1[1]
+    vdup.32     q14, d1[1]
+    vdup.32     q15, d1[1]
+    vmla.f32    q13, q7, q10
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+    vdup.32     q10, d2[0]
+    vdup.32     q11, d2[0]
+    vdup.32     q12, d2[0]
+    vmla.f32    q10, q13, q7
+    vmla.f32    q11, q14, q8
+    vmla.f32    q12, q15, q9
+    vdup.32     q13, d2[1]
+    vdup.32     q14, d2[1]
+    vdup.32     q15, d2[1]
+    vmla.f32    q13, q7, q10
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+    vdup.32     q10, d3[0]
+    vdup.32     q11, d3[0]
+    vdup.32     q12, d3[0]
+    vmla.f32    q10, q13, q7
+    vmla.f32    q11, q14, q8
+    vmla.f32    q12, q15, q9
+    vdup.32     q13, d3[1]
+    vdup.32     q14, d3[1]
+    vdup.32     q15, d3[1]
+    vmla.f32    q13, q7, q10
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+    vdup.32     q10, d4[0]
+    vdup.32     q11, d4[0]
+    vdup.32     q12, d4[0]
+    vmla.f32    q10, q13, q7
+    vmla.f32    q11, q14, q8
+    vmla.f32    q12, q15, q9
+    vmul.f32    q4, q4, q10          // q4 <- numerator
+    vmul.f32    q5, q5, q11
+    vmul.f32    q6, q6, q12
+
+    vdup.32     q10, d4[1]
+    vdup.32     q11, d4[1]
+    vdup.32     q12, d4[1]
+    vdup.32     q13, d5[0]
+    vdup.32     q14, d5[0]
+    vdup.32     q15, d5[0]
+    vmla.f32    q13, q7, q10
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+    vdup.32     q10, d5[1]
+    vdup.32     q11, d5[1]
+    vdup.32     q12, d5[1]
+    vmla.f32    q10, q13, q7
+    vmla.f32    q11, q14, q8
+    vmla.f32    q12, q15, q9
+    vdup.32     q13, d6[0]
+    vdup.32     q14, d6[0]
+    vdup.32     q15, d6[0]
+    vmla.f32    q13, q7, q10          // q13 <- denum
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+
+    vrecpe.f32  q7, q13
+    vrecpe.f32  q8, q14
+    vrecpe.f32  q9, q15
+    vrecps.f32  q10, q7, q13
+    vrecps.f32  q11, q8, q14
+    vrecps.f32  q12, q9, q15
+    vmul.f32    q7, q7, q10
+    vmul.f32    q8, q8, q11
+    vmul.f32    q9, q9, q12
+    vrecps.f32  q10, q7, q13
+    vrecps.f32  q11, q8, q14
+    vrecps.f32  q12, q9, q15
+    vmul.f32    q7, q7, q10          // q7 <- 1/q13
+    vmul.f32    q8, q8, q11
+    vmul.f32    q9, q9, q12
+
+    vdup.32     q10, d6[1]
+    vdup.32     q11, d6[1]
+    vdup.32     q12, d6[1]
+    vmla.f32    q10, q4, q7
+    vmla.f32    q11, q5, q8
+    vmla.f32    q12, q6, q9
+
+    vstmia      r0!, { q10, q11, q12 }
+
+    subs        r1, #12
+    cmp         r1, #12
+    bge         .loop_3
+
+    cmp         r1, #0;
+    beq         .return
+
+.loop:
+    vldmia      r0, { q4 }         // q4 <- x
+
+    vdup.32     q15, d0[0]
+    vmax.f32    q4, q15
+    vdup.32     q15, d0[1]
+    vmin.f32    q4, q15
+
+    vmul.f32    q7, q4, q4          // q7 <- x2
+
+    vdup.32     q10, d1[0]
+    vdup.32     q13, d1[1]
+    vmla.f32    q13, q7, q10
+    vdup.32     q10, d2[0]
+    vmla.f32    q10, q13, q7
+    vdup.32     q13, d2[1]
+    vmla.f32    q13, q7, q10
+    vdup.32     q10, d3[0]
+    vmla.f32    q10, q13, q7
+    vdup.32     q13, d3[1]
+    vmla.f32    q13, q7, q10
+    vdup.32     q10, d4[0]
+    vmla.f32    q10, q13, q7
+    vmul.f32    q4, q4, q10          // q4 <- numerator
+
+    vdup.32     q10, d4[1]
+    vdup.32     q13, d5[0]
+    vmla.f32    q13, q7, q10
+    vdup.32     q10, d5[1]
+    vmla.f32    q10, q13, q7
+    vdup.32     q13, d6[0]
+    vmla.f32    q13, q7, q10          // q13 <- denum
+
+    vrecpe.f32  q7, q13
+    vrecps.f32  q10, q7, q13
+    vmul.f32    q7, q7, q10
+    vrecps.f32  q10, q7, q13
+    vmul.f32    q7, q7, q10          // q7 <- 1/q13
+
+    vdup.32     q10, d6[1]
+    vmla.f32    q10, q4, q7
+
+    vstmia      r0!, { q10 }
+
+    subs        r1, #4;
+    bne         .loop
+
+.return:
+    vpop        { q4-q7 }
+    bx          lr
+
+.coeffs_num:
+    .float -18.6                    // low
+    .float 18.6                     // high
+    .float -4.433153405e-18         // alpha_13
+    .float 1.169974371e-14
+
+    .float -1.875289645e-11
+    .float 4.257889523e-8
+    .float 0.00004811817576
+    .float 0.008163842030
+
+    .float 0.2499999971
+    .float 3.922935744e-6           // beta_6
+    .float 0.001524872358
+    .float 0.1159886749
+
+    .float 1.0
+    .float 0.5                      //              
+    .float 0.0                      // padding
+    .float 0.0
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_tanh_f32_4n.tmpl b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_tanh_f32_4n.tmpl
new file mode 100644
index 000000000..5165f6fb9
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/armv7neon_tanh_f32_4n.tmpl
@@ -0,0 +1,209 @@
+// vim: ft=arm
+
+    .arm
+    .text
+    .global armv7neon_tanh_f32_4n_{{suffix}}
+    .type armv7neon_tanh_f32_4n_{{suffix}}, %function
+
+/*
+    s16–s31 (d8–d15, q4–q7) must be preserved
+    s0–s15 (d0–d7, q0–q3) and d16–d31 (q8–q15) do not need to be preserved
+*/
+
+armv7neon_tanh_f32_4n_{{suffix}}:
+    cmp         r1, #0
+    blxeq       lr
+
+    vpush       { q4-q7 }
+
+    adr         r2, .coeffs_num
+    vldmia      r2!, { s0-s13 }
+
+// q4 -> q4,5,6
+// q5 -> q7,8,9
+// q6 -> q10,11,12
+// q7 -> q13,14,15
+
+    cmp         r1, #12
+    blt         .loop
+
+.loop_3:
+    vldmia      r0, { q4, q5, q6 }         // q4 <- x
+
+    vdup.32     q15, d0[0]
+    vmax.f32    q4, q15
+    vmax.f32    q5, q15
+    vmax.f32    q6, q15
+    vdup.32     q15, d0[1]
+    vmin.f32    q4, q15
+    vmin.f32    q5, q15
+    vmin.f32    q6, q15
+
+    vmul.f32    q7, q4, q4          // q7 <- x2
+    vmul.f32    q8, q5, q5
+    vmul.f32    q9, q6, q6
+
+    vdup.32     q10, d1[0]
+    vdup.32     q11, d1[0]
+    vdup.32     q12, d1[0]
+    vdup.32     q13, d1[1]
+    vdup.32     q14, d1[1]
+    vdup.32     q15, d1[1]
+    vmla.f32    q13, q7, q10
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+    vdup.32     q10, d2[0]
+    vdup.32     q11, d2[0]
+    vdup.32     q12, d2[0]
+    vmla.f32    q10, q13, q7
+    vmla.f32    q11, q14, q8
+    vmla.f32    q12, q15, q9
+    vdup.32     q13, d2[1]
+    vdup.32     q14, d2[1]
+    vdup.32     q15, d2[1]
+    vmla.f32    q13, q7, q10
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+    vdup.32     q10, d3[0]
+    vdup.32     q11, d3[0]
+    vdup.32     q12, d3[0]
+    vmla.f32    q10, q13, q7
+    vmla.f32    q11, q14, q8
+    vmla.f32    q12, q15, q9
+    vdup.32     q13, d3[1]
+    vdup.32     q14, d3[1]
+    vdup.32     q15, d3[1]
+    vmla.f32    q13, q7, q10
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+    vdup.32     q10, d4[0]
+    vdup.32     q11, d4[0]
+    vdup.32     q12, d4[0]
+    vmla.f32    q10, q13, q7
+    vmla.f32    q11, q14, q8
+    vmla.f32    q12, q15, q9
+    vmul.f32    q4, q4, q10          // q4 <- numerator
+    vmul.f32    q5, q5, q11
+    vmul.f32    q6, q6, q12
+
+    vdup.32     q10, d4[1]
+    vdup.32     q11, d4[1]
+    vdup.32     q12, d4[1]
+    vdup.32     q13, d5[0]
+    vdup.32     q14, d5[0]
+    vdup.32     q15, d5[0]
+    vmla.f32    q13, q7, q10
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+    vdup.32     q10, d5[1]
+    vdup.32     q11, d5[1]
+    vdup.32     q12, d5[1]
+    vmla.f32    q10, q13, q7
+    vmla.f32    q11, q14, q8
+    vmla.f32    q12, q15, q9
+    vdup.32     q13, d6[0]
+    vdup.32     q14, d6[0]
+    vdup.32     q15, d6[0]
+    vmla.f32    q13, q7, q10          // q13 <- denum
+    vmla.f32    q14, q8, q11
+    vmla.f32    q15, q9, q12
+
+    vrecpe.f32  q7, q13
+    vrecpe.f32  q8, q14
+    vrecpe.f32  q9, q15
+    vrecps.f32  q10, q7, q13
+    vrecps.f32  q11, q8, q14
+    vrecps.f32  q12, q9, q15
+    vmul.f32    q7, q7, q10
+    vmul.f32    q8, q8, q11
+    vmul.f32    q9, q9, q12
+    vrecps.f32  q10, q7, q13
+    vrecps.f32  q11, q8, q14
+    vrecps.f32  q12, q9, q15
+    vmul.f32    q7, q7, q10          // q7 <- 1/q13
+    vmul.f32    q8, q8, q11
+    vmul.f32    q9, q9, q12
+
+    vmul.f32    q10, q4, q7
+    vmul.f32    q11, q5, q8
+    vmul.f32    q12, q6, q9
+
+    vstmia      r0!, { q10, q11, q12 }
+
+    subs        r1, #12
+    cmp         r1, #12
+    bge         .loop_3
+
+    cmp         r1, #0;
+    beq         .return
+
+.loop:
+    vldmia      r0, { q4 }         // q4 <- x
+
+    vdup.32     q15, d0[0]
+    vmax.f32    q4, q15
+    vdup.32     q15, d0[1]
+    vmin.f32    q4, q15
+
+    vmul.f32    q7, q4, q4          // q7 <- x2
+
+    vdup.32     q10, d1[0]
+    vdup.32     q13, d1[1]
+    vmla.f32    q13, q7, q10
+    vdup.32     q10, d2[0]
+    vmla.f32    q10, q13, q7
+    vdup.32     q13, d2[1]
+    vmla.f32    q13, q7, q10
+    vdup.32     q10, d3[0]
+    vmla.f32    q10, q13, q7
+    vdup.32     q13, d3[1]
+    vmla.f32    q13, q7, q10
+    vdup.32     q10, d4[0]
+    vmla.f32    q10, q13, q7
+    vmul.f32    q4, q4, q10          // q4 <- numerator
+
+    vdup.32     q10, d4[1]
+    vdup.32     q13, d5[0]
+    vmla.f32    q13, q7, q10
+    vdup.32     q10, d5[1]
+    vmla.f32    q10, q13, q7
+    vdup.32     q13, d6[0]
+    vmla.f32    q13, q7, q10          // q13 <- denum
+
+    vrecpe.f32  q7, q13
+    vrecps.f32  q10, q7, q13
+    vmul.f32    q7, q7, q10
+    vrecps.f32  q10, q7, q13
+    vmul.f32    q7, q7, q10          // q7 <- 1/q13
+
+    vmul.f32    q10, q4, q7
+
+    vstmia      r0!, { q10 }
+
+    subs        r1, #4;
+    bne         .loop
+
+.return:
+    vpop        { q4-q7 }
+    bx          lr
+
+.coeffs_num:
+    .float -8.9                     // low
+    .float 8.9                      // high
+    .float -8.488492677e-14         // alpha_13
+    .float 5.277853000e-11
+
+    .float -2.022500419e-8
+    .float 0.00001115424833
+    .float 0.003103950131
+    .float 0.1308400453
+
+    .float 0.9999999934
+    .float 0.0002546136580          // beta_6
+    .float 0.02449515379
+    .float 0.4641733162
+
+    .float 1.0
+    .float 0                        // padding
+    .float 0                        // padding
+    .float 0                        // padding
diff --git a/vendor/tract-linalg-0.22.1/arm32/armv7neon/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armv7neon/dispatcher.tmpliq
new file mode 100644
index 000000000..2c5e910f5
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armv7neon/dispatcher.tmpliq
@@ -0,0 +1,38 @@
+// vim: ft=arm
+
+.non_linear:
+
+.non_linear_loop_entry:
+    sub     r0, #20
+
+.non_linear_loop:
+    add     r0, #20
+    ldm     r0, { r2, r3, r4, r5, r6 }
+
+    cmp     r2, #{{ jump_table | size }}
+    movgt   r2, #{{ jump_table | size }}
+    cmp     r2, #0
+    movlt   r2, #{{ jump_table | size }}
+
+    add     pc, pc, r2, LSL#2
+    nop     // pc in Rn above is start of the add instruction + 8, hence a nop is needed
+            // This is A32 asm, for T32/Thump2 use nop.w and b.w to avoid problems.
+{% for j in jump_table %}
+    b .{{j}}
+{% endfor %}
+    b .unsupported
+
+
+.unsupported:
+    mov         r0,     #1
+    b           .return
+
+.done:
+    mov         r0,     #0
+    b           .return
+
+.clear:
+{% for r in (4..15) %}
+    veor    q{{r}}, q{{r}}, q{{r}}
+{% endfor %}
+    b           .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl b/vendor/tract-linalg-0.22.1/arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl
new file mode 100644
index 000000000..23fbcc2d1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armvfpv2/armvfpv2_mmm_f32_4x4.tmpl
@@ -0,0 +1,491 @@
+// vim: ft=arm
+
+    .arm
+    .text
+    .global armvfpv2_mmm_f32_4x4_{{suffix}}
+    .type armvfpv2_mmm_f32_4x4_{{suffix}}, %function
+
+// C tile:
+
+//  s16 s20 s24 s28
+//  s17 s21 s25 s29
+//  s18 s22 s26 s30
+//  s19 s23 s27 s31
+
+// packed A: (2x4) alternating between (s0-s3) and (s4-s7)
+// packed B: (2x4) alternating between (s8-s11) and (s12-15)
+
+// all vfp registers in use.
+
+armvfpv2_mmm_f32_4x4_{{suffix}}:
+
+/*
+    pld [r1]
+    pld [r1, #8]
+    pld [r2]
+    pld [r2, #8]
+*/
+
+    push        { r4-r12 }               // no lr (we're a leaf), no fp. #24 bytes
+
+    ldr         r8, [sp, #28]
+    ldr         r9, [sp, #24]
+
+//  r8=rsc, r9=csc
+
+    vmrs        r6, FPSCR
+    bic         r6, r6, #0x00370000
+    vmsr        FPSCR, r6
+
+    vpush       { s16-s31 }
+
+{% include "dispatcher.tmpliq" %}
+
+.clear:
+    eor         r6, r6
+    vmov        s16, r6
+    vmov.f32    s17, s16
+    vmov.f32    s18, s16
+    vmov.f32    s19, s16
+    vmov.f32    s20, s16
+    vmov.f32    s21, s16
+    vmov.f32    s22, s16
+    vmov.f32    s23, s16
+    vmov.f32    s24, s16
+    vmov.f32    s25, s16
+    vmov.f32    s26, s16
+    vmov.f32    s27, s16
+    vmov.f32    s28, s16
+    vmov.f32    s29, s16
+    vmov.f32    s30, s16
+    vmov.f32    s31, s16
+    b     .non_linear_loop
+
+.add_mat_mul:
+    // r3 <- k, r4 <- a, r5 <- b
+    cmp     r3, #0
+    beq     .non_linear_loop
+
+    mov     r1, r4 // packed A ptr
+    pld     [r3]
+    pld     [r5]
+
+    .packed_packed:
+    cmp r3, #4
+    blt .packed_packed_loop_1
+
+    .packed_packed_loop_4:
+
+    // 1
+    vldmia          r1!, { s0, s1 }
+    vldmia          r5!, { s8, s9 }
+
+    vmla.f32        s16, s0, s8
+    vldmia          r1!, { s2, s3 }
+    vmla.f32        s17, s1, s8
+    vldmia          r5!, { s10, s11 }
+    vmla.f32        s18, s2, s8
+    vmla.f32        s19, s3, s8
+
+    vmla.f32        s20, s0, s9
+    vmla.f32        s21, s1, s9
+    vmla.f32        s22, s2, s9
+    vmla.f32        s23, s3, s9
+
+    vldmia          r1!, { s4-s7 }
+    vmla.f32        s24, s0, s10
+    vmla.f32        s25, s1, s10
+    vmla.f32        s26, s2, s10
+    vmla.f32        s27, s3, s10
+
+    vldmia          r5!, { s12-s15 }
+    vmla.f32        s28, s0, s11
+    vmla.f32        s29, s1, s11
+    vmla.f32        s30, s2, s11
+    vmla.f32        s31, s3, s11
+
+    // 2
+    vmla.f32        s16, s4, s12
+    vmla.f32        s17, s5, s12
+    vmla.f32        s18, s6, s12
+    vmla.f32        s19, s7, s12
+
+    vldmia          r1!, { s0-s3 }
+
+    vmla.f32        s20, s4, s13
+    vmla.f32        s21, s5, s13
+    vmla.f32        s22, s6, s13
+    vmla.f32        s23, s7, s13
+
+    vldmia          r5!, { s8-s11 }
+
+    vmla.f32        s24, s4, s14
+    vmla.f32        s25, s5, s14
+    vmla.f32        s26, s6, s14
+    vmla.f32        s27, s7, s14
+
+    vmla.f32        s28, s4, s15
+    vmla.f32        s29, s5, s15
+    vmla.f32        s30, s6, s15
+    vmla.f32        s31, s7, s15
+
+    // 3
+    vmla.f32        s16, s0, s8
+    vmla.f32        s17, s1, s8
+    vmla.f32        s18, s2, s8
+    vmla.f32        s19, s3, s8
+
+    vldmia          r1!, { s4-s7 }
+
+    vmla.f32        s20, s0, s9
+    vmla.f32        s21, s1, s9
+    vmla.f32        s22, s2, s9
+    vmla.f32        s23, s3, s9
+
+    vldmia          r5!, { s12-s15 }
+
+    vmla.f32        s24, s0, s10
+    vmla.f32        s25, s1, s10
+    vmla.f32        s26, s2, s10
+    vmla.f32        s27, s3, s10
+
+    pld [r1]
+
+    vmla.f32        s28, s0, s11
+    vmla.f32        s29, s1, s11
+    vmla.f32        s30, s2, s11
+    vmla.f32        s31, s3, s11
+
+    pld [r6]
+
+    // 4
+    vmla.f32        s16, s4, s12
+    vmla.f32        s17, s5, s12
+    vmla.f32        s18, s6, s12
+    vmla.f32        s19, s7, s12
+
+    vmla.f32        s20, s4, s13
+    vmla.f32        s21, s5, s13
+    vmla.f32        s22, s6, s13
+    vmla.f32        s23, s7, s13
+
+    vmla.f32        s24, s4, s14
+    vmla.f32        s25, s5, s14
+    vmla.f32        s26, s6, s14
+    vmla.f32        s27, s7, s14
+
+    vmla.f32        s28, s4, s15
+    vmla.f32        s29, s5, s15
+    vmla.f32        s30, s6, s15
+    vmla.f32        s31, s7, s15
+
+    sub r3, r3, #4
+    cmp r3, #4
+    bge .packed_packed_loop_4
+
+    cmp r3, #0
+    beq .non_linear_loop
+
+    .packed_packed_loop_1:
+
+    vldmia          r1!, { s0, s1 }
+    vldmia          r5!, { s8, s9 }
+
+    vmla.f32        s16, s0, s8
+    vldmia          r1!, { s2, s3 }
+    vmla.f32        s17, s1, s8
+    vldmia          r5!, { s10, s11 }
+    vmla.f32        s18, s2, s8
+    vmla.f32        s19, s3, s8
+
+    vmla.f32        s20, s0, s9
+    vmla.f32        s21, s1, s9
+    vmla.f32        s22, s2, s9
+    vmla.f32        s23, s3, s9
+
+    vmla.f32        s24, s0, s10
+    vmla.f32        s25, s1, s10
+    vmla.f32        s26, s2, s10
+    vmla.f32        s27, s3, s10
+
+    vmla.f32        s28, s0, s11
+    vmla.f32        s29, s1, s11
+    vmla.f32        s30, s2, s11
+    vmla.f32        s31, s3, s11
+
+    subs r3, r3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+.add_unicast:
+    {% for col in (0..3) %}
+        mov         r8, r3
+        {% for reg in (0..3) %}
+            vldr            s0, [ r8 ]
+            vadd.f32        s{{col|times:4|plus:reg|plus:16}}, s{{col|times:4|plus:reg|plus:16}}, s0
+            {% if reg < 3 %}
+                add         r8, r8, r4
+            {% endif %}
+        {% endfor %}
+        {% if col < 3 %}
+            add r3, r3, r5
+        {% endif %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.scalar_min:
+    vmov            s0, r3
+    {% for reg in (16..31) %}
+        vcmp.f32        s{{reg}}, s0
+        vmrs            apsr_nzcv, fpscr
+        vmovge          s{{reg}}, s0
+    {% endfor %}
+
+    b .non_linear_loop
+
+.scalar_max:
+    vmov            s0, r3
+    {% for reg in (16..31) %}
+        vcmp.f32        s{{reg}}, s0
+        vmrs            apsr_nzcv, fpscr
+        vmovle          s{{reg}}, s0
+    {% endfor %}
+
+    b .non_linear_loop
+
+.scalar_add:
+    vmov            s0, r3
+    {% for s in (16..31) %}
+        vadd.f32    s{{s}}, s{{s}}, s0
+    {% endfor %}
+
+    b .non_linear_loop
+
+.scalar_mul:
+    vmov            s0, r3
+    {% for s in (16..31) %}
+        vmul.f32    s{{s}}, s{{s}}, s0
+    {% endfor %}
+
+    b .non_linear_loop
+
+.scalar_sub:
+    vmov            s0, r3
+    {% for s in (16..31) %}
+        vsub.f32    s{{s}}, s0, s{{s}}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.scalar_sub_flipped:
+    vmov            s0, r3
+    {% for s in (16..31) %}
+        vsub.f32    s{{s}}, s{{s}}, s0
+    {% endfor %}
+
+    b .non_linear_loop
+
+.leaky_relu:
+    vmov            s0, r3
+    {% for reg in (16..31) %}
+        vmul.f32        s1, s0, s{{reg}}
+        vcmp.f32        s{{reg}}, #0
+        vmrs            apsr_nzcv, fpscr
+        vmovlt          s{{reg}}, s1
+    {% endfor %}
+    b .non_linear_loop
+
+.per_row_min:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%}
+            vcmp.f32        {{s}}, s{{row}}
+            vmrs            apsr_nzcv, fpscr
+            vmovge          {{s}}, s{{row}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_row_max:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%}
+            vcmp.f32        {{s}}, s{{row}}
+            vmrs            apsr_nzcv, fpscr
+            vmovlt          {{s}}, s{{row}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_row_add:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            vadd.f32    s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_row_mul:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            vmul.f32    s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_row_sub:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            vsub.f32    s{{col|times:4|plus:row|plus:16}}, s{{row}}, s{{col|times:4|plus:row|plus:16}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_row_sub_flipped:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            vsub.f32    s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_col_min:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%}
+            vcmp.f32        {{s}}, s{{col}}
+            vmrs            apsr_nzcv, fpscr
+            vmovge          {{s}}, s{{col}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_col_max:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%}
+            vcmp.f32        {{s}}, s{{col}}
+            vmrs            apsr_nzcv, fpscr
+            vmovlt          {{s}}, s{{col}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_col_add:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            vadd.f32    s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_col_mul:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            vmul.f32    s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_col_sub:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            vsub.f32    s{{col|times:4|plus:row|plus:16}}, s{{col}}, s{{col|times:4|plus:row|plus:16}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.per_col_sub_flipped:
+    vldm    r3, {s0, s1, s2, s3}
+    {% for row in (0..3) %}
+        {% for col in (0..3) %}
+            vsub.f32    s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}}
+        {% endfor %}
+    {% endfor %}
+
+    b .non_linear_loop
+
+.add_row_col_products:
+    vldmia          r3!, { s0, s1 }
+    vldmia          r4!, { s8, s9 }
+
+    vmla.f32        s16, s0, s8
+    vldmia          r3!, { s2, s3 }
+    vmla.f32        s17, s1, s8
+    vldmia          r4!, { s10, s11 }
+    vmla.f32        s18, s2, s8
+    vmla.f32        s19, s3, s8
+
+    vmla.f32        s20, s0, s9
+    vmla.f32        s21, s1, s9
+    vmla.f32        s22, s2, s9
+    vmla.f32        s23, s3, s9
+
+    vmla.f32        s24, s0, s10
+    vmla.f32        s25, s1, s10
+    vmla.f32        s26, s2, s10
+    vmla.f32        s27, s3, s10
+
+    vmla.f32        s28, s0, s11
+    vmla.f32        s29, s1, s11
+    vmla.f32        s30, s2, s11
+    vmla.f32        s31, s3, s11
+
+    b .non_linear_loop
+
+.store:
+    {% for col in (0..3) %}
+        mov         r8, r3
+        {% for reg in (0..3) %}
+            fsts        s{{col|times:4|plus:reg|plus:16}}, [ r8 ]
+            {% if reg < 3 %}
+                add         r8, r8, r4
+            {% endif %}
+        {% endfor %}
+        {% if col < 3 %}
+            add r3, r3, r5
+        {% endif %}
+    {% endfor %}
+
+    mov         r0,     #0
+    b   .return
+
+.load_tile:
+    vldmia       r3!, { s16-s31 }
+    b .non_linear_loop
+
+.q_scale:
+.q_shl:
+.q_shr:
+    b   .unsupported
+
+.return:
+    vpop        { s16-s31 }
+    pop         { r4-r12 }
+
+    bx          lr
+
diff --git a/vendor/tract-linalg-0.22.1/arm32/armvfpv2/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm32/armvfpv2/dispatcher.tmpliq
new file mode 100644
index 000000000..5386a420d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm32/armvfpv2/dispatcher.tmpliq
@@ -0,0 +1,32 @@
+// vim: ft=arm
+
+.non_linear:
+
+.non_linear_loop_entry:
+    sub     r0, #20
+
+.non_linear_loop:
+    add     r0, #20
+    ldm     r0, { r2, r3, r4, r5, r6 }
+
+    cmp     r2, #{{ jump_table | size }}
+    movgt   r2, #{{ jump_table | size }}
+    cmp     r2, #0
+    movlt   r2, #{{ jump_table | size }}
+
+    add     pc, pc, r2, LSL#2
+    nop     // pc in Rn above is start of the add instruction + 8, hence a nop is needed
+            // This is A32 asm, for T32/Thump2 use nop.w and b.w to avoid problems.
+{% for j in jump_table %}
+        b .{{j}}
+{% endfor %}
+    b .unsupported
+
+.unsupported:
+    mov         r0,     #1
+    b           .return
+
+.done:
+    mov         r0,     #0
+    b           .return
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x1.tmpl b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x1.tmpl
new file mode 100644
index 000000000..8ff4125d4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x1.tmpl
@@ -0,0 +1,533 @@
+// vim: ft=arm
+.text
+.align 4
+
+/* Z: 32x1
+ z0[0] ..  z0[15] z1[0] .. z1[15]
+*/
+    
+
+.global {{G}}apple_amx_mmm_f16_64x1_{{suffix}}
+{{G}}apple_amx_mmm_f16_64x1_{{suffix}}:
+
+{{ AMX_SET }}
+
+    // set x1 to a 128 bytes aligned block for loads
+    mov x1, sp
+    lsr x1, x1, #7
+    lsl x1, x1, #7
+    sub x1, x1, 128
+
+{% include "dispatcher.tmpliq" %}
+
+.leaky_relu:
+.q_scale:
+.q_shl:
+.q_shr:
+    b .unsupported
+
+.add_mat_mul:
+
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x4, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    orr x4, x4, {{ 0|setting:62 }}  // load a pair of A
+
+    mov x5, {{ 0|setting:43 }}      // f16
+    orr x5, x5, {{ 0|setting:38 }}  // Broadcast Y
+
+    orr x6, x5, {{ 0|setting:20 }}  // z offset
+    orr x6, x6, {{ 0|setting:16 }}  // x offset
+
+    cmp         x3, #32
+    blt         .packed_packed_loop_1
+
+    mov x9, {{0|setting:32}}        // Y broadcast offset += 1
+
+    .packed_packed_loop_32:
+        mov x7, x5
+        mov x8, x6
+        {% amx ldy x2 %}
+        {% for k in (0..31) %}
+            {% amx ldx x4 %}
+            add x4, x4, 128
+            {% amx vecfp x7 %}
+            {% amx vecfp x8 %}
+            add x7, x7, x9
+            add x8, x8, x9
+        {% endfor %}
+        add x2, x2, #64
+        sub x3, x3, #32
+        cmp x3, #32
+    bge .packed_packed_loop_32
+
+    cmp x3, #0
+    beq .non_linear_loop
+
+    .packed_packed_loop_1:
+        ldr w7, [x2], #2
+        str w7, [x1]
+        {% amx ldx x4 %}
+        {% amx ldy x1 %}
+        {% amx vecfp x5 %}
+        {% amx vecfp x6 %}
+        add x4, x4, 128
+        subs x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+.clear:
+    // top left
+    eor x2, x2, x2
+    orr x2, x2, {{ 0|setting:27 }}
+    orr x2, x2, {{ 0|setting:28 }}
+    orr x2, x2, {{ 0|setting:29 }}  // Z = 0
+    {% amx fma32 x2 %}
+
+    // top right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    {% amx fma32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 3
+    {% amx fma32 x2 %}
+
+    // bottom left
+    eor x2, x2, {{ 0|setting:20 }}  // Z row = 2
+    {% amx fma32 x2 %}
+
+    b .non_linear_loop
+
+.per_col_sub:
+
+    // performs a unary neg on Z
+    eor x2, x2, x2                      // X[0] = Z[0]
+    // extr[hxyz] is suport confusing
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    {% amx extrx x2 %}
+    {% amx fms16 x4 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    add x4, x4, {{0|setting:20}}    // next Z row
+    {% amx extrx x2 %}              // extr[hxyz] is confusing
+    {% amx fms16 x4 %}
+
+    // continue
+
+.per_col_add:
+    ldr         x2, [x0, #8]
+
+    // broadcast value to x0
+    ld1         { v0.h }[0], [x2]
+    dup         v0.8h, v0.h[0]
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    sub         x1, x1, #64
+
+    {% amx ldx x1 %} // load into x0 by default
+
+    mov x2, {{ 0|setting:28 }}      // z += y
+    {% amx fma16 x2 %}
+
+    orr x2, x2, {{ 0|setting:20 }}  // target is now z1
+    {% amx fma16 x2 %}
+
+    b .non_linear_loop
+
+.per_col_sub_flipped:
+    ldr         x2, [x0, #8]
+
+    // broadcast value to x0
+    ld1         { v0.h }[0], [x2]
+    dup         v0.8h, v0.h[0]
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    sub x1, x1, #64
+
+    {% amx ldx x1 %} // load into x0 by default
+
+    mov x2, {{ 0|setting:28 }}      // z += y
+    {% amx fms16 x2 %}
+
+    orr x2, x2, {{ 0|setting:20 }}  // target is now z1
+    {% amx fms16 x2 %}
+
+    b .non_linear_loop
+
+.per_row_sub_flipped:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2]
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    mov x2, {{ 0|setting:63 }}      // vector mode
+    orr x2, x2, {{ 0|setting:29 }}  // z -= y
+
+    // top left
+    {% amx fms16 x2 %}
+
+    // bottom left
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fms16 x2 %}
+
+    b .non_linear_loop
+
+.per_row_sub:
+    // performs a unary neg on Z
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    {% amx extrx x2 %}
+    {% amx fms16 x4 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    add x4, x4, {{0|setting:20}}    // next Z row
+    {% amx extrx x2 %}
+    {% amx fms16 x4 %}
+
+    // continue
+
+.per_row_add:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2]
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    mov x2, {{ 0|setting:63 }}      // vector mode
+    orr x2, x2, {{ 0|setting:29 }}  // z += y
+
+    // top left
+    {% amx fma16 x2 %}
+
+    // bottom left
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fma16 x2 %}
+
+    b .non_linear_loop
+
+.per_row_min:
+    mov x2, 5
+    b .per_row_min_max
+.per_row_max:
+    mov x2, 7
+.per_row_min_max:
+    ldr         x5, [x0, #8]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x5], #64
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x5]
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    sub x1, x1, #64
+
+    orr x5, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldx x5 %}
+
+    lsl x2, x2, 47                  // max(x,z) (or min)
+    orr x2, x2, {{ 0|setting:44 }}  // f32
+    {% amx vecfp x2 %}
+
+    orr x2, x2, {{ 0|setting:16 }}  // x1
+    orr x2, x2, {{ 0|setting:20 }}  // z1
+    {% amx vecfp x2 %}
+
+    b .non_linear_loop
+
+.per_col_min:
+    mov x2, 5
+    b .per_col_min_max
+.per_col_max:
+    mov x2, 7
+.per_col_min_max:
+    ldr         x4, [x0, #8]
+
+    // broadcast value to x0
+    ld1         { v0.h }[0], [x4]
+    dup         v0.8h, v0.h[0]
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    sub         x1, x1, #64
+
+    {% amx ldx x1 %}
+
+    lsl x2, x2, 47                  // max(x,z) (or min)
+    orr x2, x2, {{ 0|setting:43 }}  // f32
+
+    {% amx vecfp x2 %}
+    orr x2, x2, {{ 0|setting:20 }}  // z offset
+    {% amx vecfp x2 %}
+
+    b .non_linear_loop
+
+.per_col_mul:
+    ldr         x4, [x0, #8]
+
+    // broadcast value to y0
+    ld1         { v0.h }[0], [x4]
+    dup         v0.8h, v0.h[0]
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    st1         { v0.8h }, [x1], #16
+    sub         x1, x1, #64
+
+    {% amx ldy x1 %}
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+    {% amx extrx x2 %}
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+    {% amx fma16 x4 %}
+    orr x2, x2, {{ 0|setting:20 }}      // Z1
+    {% amx extrx x2 %}
+    orr x4, x4, {{ 0|setting:20 }}      // Z1
+    {% amx fma16 x4 %}
+
+    b .non_linear_loop
+
+.per_row_mul:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2]
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}      // pair
+    {% amx ldy x2 %}
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+    {% amx extrx x2 %}
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+    {% amx fma16 x4 %}
+    orr x2, x2, {{ 0|setting:20 }}      // Z1
+    {% amx extrx x2 %}
+    orr x4, x4, {{ 0|setting:20 }}      // Z1
+    orr x4, x4, {{ 0|setting:6 }}       // Y1
+    {% amx fma16 x4 %}
+
+    b .non_linear_loop
+
+.scalar_sub:
+    // performs a unary neg on Z, then go to scalar_add
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+    {% amx extrx x2 %}
+    {% amx fms16 x4 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    add x4, x4, {{0|setting:20}}    // next Z row
+    {% amx extrx x2 %}
+    {% amx fms16 x4 %}
+
+    // continue on purpose
+
+.scalar_add:
+    ldr         w5, [x0, #8]
+
+    fmov        h0, w5
+    dup         v0.8h, v0.h[0]
+    dup         v1.8h, v0.h[0]
+    dup         v2.8h, v0.h[0]
+    dup         v3.8h, v0.h[0]
+
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    mov x2, {{ 0|setting:28 }}          // Z+=X 
+    {% amx fma16 x2 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    {% amx fma16 x2 %}
+    b .non_linear_loop
+
+.scalar_sub_flipped:
+    ldr         w5, [x0, #8]
+    fmov        s0, w5
+    dup         v0.8h, v0.h[0]
+    dup         v1.8h, v0.h[0]
+    dup         v2.8h, v0.h[0]
+    dup         v3.8h, v0.h[0]
+
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    mov x2, {{ 0|setting:28 }}          // Z-=X 
+    {% amx fms16 x2 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    {% amx fms16 x2 %}
+    b .non_linear_loop
+
+.scalar_mul:
+    ldr         w5, [x0, #8]
+    fmov        h0, w5
+    dup         v0.8h, v0.h[0]
+    dup         v1.8h, v0.h[0]
+    dup         v2.8h, v0.h[0]
+    dup         v3.8h, v0.h[0]
+
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldy x1 %}
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+
+    {% amx extrx x2 %}
+    {% amx fma16 x4 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    add x4, x4, {{0|setting:20}}    // next Z row
+    {% amx extrx x2 %}
+    {% amx fma16 x4 %}
+
+    b .non_linear_loop
+
+.scalar_min:
+    mov x2, 5
+    b .scalar_min_max
+.scalar_max:
+    mov x2, 7
+.scalar_min_max:
+    ldr         w5, [x0, #8]
+    fmov        h0, w5
+    dup         v0.8h, v0.h[0]
+    dup         v1.8h, v0.h[0]
+    dup         v2.8h, v0.h[0]
+    dup         v3.8h, v0.h[0]
+
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    lsl x2, x2, 47
+    orr x2, x2, {{ 0|setting:43 }} // f16
+
+    {% amx vecfp x2 %}
+    add x2, x2, {{ 0|setting:20}} // next Z
+    {% amx vecfp x2 %}
+
+    b .non_linear_loop
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    {% for neon in (0..7) %}
+       {% for lane in (0..7) %}
+           ld1 { v{{neon}}.h }[{{lane}}], [x5], x6
+       {% endfor %}
+    {% endfor %}
+    mov x8, x1
+    st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x8], #64
+    st1 { v4.8h, v5.8h, v6.8h, v7.8h }, [x8], #64
+
+    orr x8, x1, {{ 0|setting:62 }}          // pair
+    {% amx ldy x8 %}
+
+    eor x2, x2, x2
+    orr x2, x2, {{ 0|setting:63 }}  // vector mode
+    orr x2, x2, {{ 0|setting:29 }}  // perform Z0+=Y0
+    {% amx fma16 x2 %}
+    orr x2, x2, {{ 0|setting:20 }}  // Z1
+    orr x2, x2, 64                  // offset Y
+    {% amx fma16 x2 %}
+    
+    b .non_linear_loop
+
+.add_row_col_products:
+    ldp         x5, x6, [x0, #8]            // a base ptr, b base ptr
+
+    ld1         { v0.h }[0], [x6]
+    st1         { v0.h }[0], [x1]
+    {% amx ldy x1 %}
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldx x2 %}
+
+    mov x2, {{ 0|setting:43 }}      // f16
+    orr x2, x2, {{ 0|setting:38 }}  // Broadcast Y
+    {% amx vecfp x2 %}
+
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:16 }}  // X offset
+    {% amx vecfp x2 %}
+
+    b .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    ands        x8, x5, 0x7f
+    bne         .store_generic
+    cmp         x6, 4
+    bne         .store_generic
+    cmp         x7, 4
+    bne         .store_generic
+ 
+    orr x5, x5, {{ 0|setting:62 }}          // pair
+    {% amx stz x5 %}
+    b .non_linear_loop
+
+ .store_generic:
+
+    orr x8, x1, {{ 0|setting:62 }}          // pair
+    {% amx stz x8 %}
+
+    mov x8, x1
+    ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x8], #64
+    ld1 { v4.8h, v5.8h, v6.8h, v7.8h }, [x8], #64
+    {% for neon in (0..7) %}
+       {% for lane in (0..7) %}
+           st1 { v{{neon}}.h }[{{lane}}], [x5], x6
+       {% endfor %}
+    {% endfor %}
+    
+    b .non_linear_loop
+
+.load_tile:
+    ldr  x2, [x0, #16]                      // row major ptr
+    orr  x2, x2, {{0|setting:62}}           // load pairs
+    {% amx ldz x2 %}
+    b .non_linear_loop
+
+.return:
+{{ AMX_CLR }}
+ret
diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x32.tmpl b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x32.tmpl
new file mode 100644
index 000000000..5c5bcea19
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f16_64x32.tmpl
@@ -0,0 +1,658 @@
+// vim: ft=arm
+.text
+.align 4
+
+/* Z: 64x32 tile. each Z reg is f16x32
+    Z0
+    Z2
+    ...
+    Z62
+    
+    Z1
+    Z3
+    S63
+*/
+    
+
+.global {{G}}apple_amx_mmm_f16_64x32_{{suffix}}
+{{G}}apple_amx_mmm_f16_64x32_{{suffix}}:
+
+{{ AMX_SET }}
+
+    // set x1 to a 128 bytes aligned block for loads
+    mov x1, sp
+    lsr x1, x1, #7
+    lsl x1, x1, #7
+    sub x1, x1, 128
+
+{% include "dispatcher.tmpliq" %}
+
+.leaky_relu:
+.q_scale:
+.q_shl:
+.q_shr:
+    b .unsupported
+
+.add_mat_mul:
+
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x4, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    orr         x4, x4, {{0|setting:62}}    // load pairs (A)
+
+    eor         x5, x5, x5                  // top left
+
+    orr         x7, x5, {{ 0|setting:20 }}
+    orr         x7, x7, {{ 0|setting:6 }}   // bottom left
+
+    .packed_packed_loop_1:
+    {% amx ldx x2 %}
+    {% amx ldy x4 %}
+    add x2, x2, 64
+    add x4, x4, 128
+
+    {% amx fma16 x5 %}
+    {% amx fma16 x7 %}
+
+    subs x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+.clear:
+    // top left
+    eor x2, x2, x2
+    orr x2, x2, {{ 0|setting:27 }}
+    orr x2, x2, {{ 0|setting:28 }}
+    orr x2, x2, {{ 0|setting:29 }}  // Z = 0
+    {% amx fma32 x2 %}
+
+    // top right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    {% amx fma32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 3
+    {% amx fma32 x2 %}
+
+    // bottom left
+    eor x2, x2, {{ 0|setting:20 }}  // Z row = 2
+    {% amx fma32 x2 %}
+
+    mov     x3, #16
+    str     x3, [x1]
+
+    b .non_linear_loop
+
+.per_col_sub:
+
+    // performs a unary neg on Z
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    mov x6, 64
+    .per_col_sub_loop:
+        {% amx extrx x2 %}
+        {% amx fms16 x4 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+        add x4, x4, {{0|setting:20}}    // next Z row
+    subs x6, x6, 1
+    bne .per_col_sub_loop
+
+    // continue
+
+.per_col_add:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldx x1 %}
+
+    mov x2, {{ 0|setting:28 }}      // z += y
+
+    // top left
+    {% amx fma16 x2 %}
+
+    // bottom left
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 2
+    {% amx fma16 x2 %}
+
+    b .non_linear_loop
+
+.per_col_sub_flipped:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2]
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+
+    {% amx ldx x1 %}
+
+    mov x2, {{ 0|setting:28 }}      // z += y
+
+    {% amx fms16 x2 %}
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    {% amx fms16 x2 %}
+
+    b .non_linear_loop
+
+.per_row_sub_flipped:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2]
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    mov x2, {{ 0|setting:29 }}      // z += y
+
+    // top left
+    {% amx fms16 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 3
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fms16 x2 %}
+
+    b .non_linear_loop
+
+.per_row_sub:
+    // performs a unary neg on Z
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    mov x6, 64
+    .per_row_sub_loop:
+        {% amx extrx x2 %}
+        {% amx fms16 x4 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+        add x4, x4, {{0|setting:20}}    // next Z row
+    subs x6, x6, 1
+    bne .per_row_sub_loop
+
+    // continue
+
+.per_row_add:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1], #64
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2]
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    mov x2, {{ 0|setting:29 }}      // z += y
+
+    // top left
+    {% amx fma16 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fma16 x2 %}
+
+    b .non_linear_loop
+
+.per_row_min:
+    mov x2, 5
+    b .per_row_min_max
+.per_row_max:
+    mov x2, 7
+.per_row_min_max:
+    ldr         x5, [x0, #8]
+
+    add x6, x5, 64
+
+    lsl x2, x2, 47                  // max(x,z) (or min)
+    orr x2, x2, {{ 0|setting:43 }}  // f16
+
+    orr x8, x2, {{ 0|setting:20 }}  // bottom left
+
+    mov x4, 32
+    .loop_per_row_max:
+        // top half
+        ld1         { v0.h }[0], [x5], #2
+        dup         v0.8h, v0.h[0]
+        dup         v1.8h, v0.h[0]
+        dup         v2.8h, v0.h[0]
+        dup         v3.8h, v0.h[0]
+        st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+
+        {% amx ldx x1 %}
+        {% amx vecfp x2 %}
+
+        add x2, x2, {{ 0|setting:21 }}
+
+        // bottom half
+        ld1         { v0.h }[0], [x6], #2
+        dup         v0.8h, v0.h[0]
+        dup         v1.8h, v0.h[0]
+        dup         v2.8h, v0.h[0]
+        dup         v3.8h, v0.h[0]
+        st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+
+        {% amx ldx x1 %}
+        {% amx vecfp x8 %}
+
+        add x8, x8, {{ 0|setting:21 }}
+
+    subs x4, x4, 1
+    bne .loop_per_row_max
+
+    b .non_linear_loop
+
+.per_col_min:
+    mov x2, 5
+    b .per_col_min_max
+.per_col_max:
+    mov x2, 7
+.per_col_min_max:
+    ldr         x4, [x0, #8]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x4]
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldx x1 %}
+
+    lsl x2, x2, 47                  // max(x,z) (or min)
+    orr x2, x2, {{ 0|setting:43 }}  // f16
+
+    mov x4, 64
+    .loop_per_col_max:
+        {% amx vecfp x2 %}
+        add x2, x2, {{ 0|setting:20 }}
+    subs x4, x4, 1
+    bne .loop_per_col_max
+
+    b .non_linear_loop
+
+.per_col_mul:
+    ldr         x4, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x4]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldy x1 %}
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+
+    mov x6, 64
+    .loop_per_col_mul:
+        {% amx extrx x2 %}
+        {% amx fma16 x4 %}
+        add x2, x2, {{0|setting:20}}
+        add x4, x4, {{0|setting:20}}
+    subs x6, x6, 1
+    bne .loop_per_col_mul
+
+    b .non_linear_loop
+
+.per_row_mul:
+    ldr         x14, [x0, #8]
+    add         x15, x14, 64
+
+    // extrx
+    eor x2, x2, x2                      // X[0] = Z[0] (top left)
+
+    eor x4, x4, x4
+    orr x4, x4, {{0|setting:20}}        // X[0] = Z[1] (bottom left)
+
+    // fma16
+    eor x6, x6, x6
+    orr x6, x6, {{0|setting:63}}        // vector mode
+    orr x6, x6, {{0|setting:27}}        // Z=X*Y       Z[0]=X[0]*Y[0]
+
+    orr x8, x6, {{0|setting:20}}        // Z[1]
+
+    mov x10, 32
+    .loop_per_row_mul:
+        // top
+        ld1         { v0.h }[0], [x14], #2
+        dup         v0.8h, v0.h[0]
+        dup         v1.8h, v0.h[0]
+        dup         v2.8h, v0.h[0]
+        dup         v3.8h, v0.h[0]
+        st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+
+        {% amx ldy x1 %}
+        {% amx extrx x2 %}
+        {% amx fma16 x6 %}
+
+        add x2, x2, {{ 0|setting:21 }}
+        add x6, x6, {{ 0|setting:21 }}
+
+        // bottom
+        ld1         { v0.h }[0], [x15], #2
+        dup         v0.8h, v0.h[0]
+        dup         v1.8h, v0.h[0]
+        dup         v2.8h, v0.h[0]
+        dup         v3.8h, v0.h[0]
+        st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+
+        {% amx ldy x1 %}
+        {% amx extrx x4 %}
+        {% amx fma16 x8 %}
+
+        add x4, x4, {{ 0|setting:21 }}
+        add x8, x8, {{ 0|setting:21 }}
+
+    subs x10, x10, 1
+    bne .loop_per_row_mul
+
+    b .non_linear_loop
+
+.scalar_sub:
+    // performs a unary neg on Z, then go to scalar_add
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    mov x6, 64
+    .scalar_sub_loop:
+        {% amx extrx x2 %}
+        {% amx fms16 x4 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+        add x4, x4, {{0|setting:20}}    // next Z row
+    subs x6, x6, 1
+    bne .scalar_sub_loop
+
+    // continue on purpose
+
+.scalar_add:
+    ldr         w5, [x0, #8]
+
+    fmov        h0, w5
+    dup         v0.8h, v0.h[0]
+    dup         v1.8h, v0.h[0]
+    dup         v2.8h, v0.h[0]
+    dup         v3.8h, v0.h[0]
+
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    mov x2, {{ 0|setting:28 }}          // Z+=X 
+    {% amx fma16 x2 %}
+    add x2, x2, {{0|setting:20}}    // Z1
+    {% amx fma16 x2 %}
+    b .non_linear_loop
+
+.scalar_sub_flipped:
+    ldr         w5, [x0, #8]
+
+    fmov        h0, w5
+    dup         v0.8h, v0.h[0]
+    dup         v1.8h, v0.h[0]
+    dup         v2.8h, v0.h[0]
+    dup         v3.8h, v0.h[0]
+
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldx x1 %}    // load 32 values
+
+    mov x2, {{ 0|setting:28 }}          // Z-=X 
+    {% amx fms16 x2 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    {% amx fms16 x2 %}
+    b .non_linear_loop
+
+.scalar_mul:
+    ldr         w5, [x0, #8]
+
+    fmov        h0, w5
+    dup         v0.8h, v0.h[0]
+    dup         v1.8h, v0.h[0]
+    dup         v2.8h, v0.h[0]
+    dup         v3.8h, v0.h[0]
+
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldy x1 %}    // load 32 values
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+
+    mov x6, 64
+    .scalar_mul_loop:
+        {% amx extrx x2 %}
+        {% amx fma16 x4 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+        add x4, x4, {{0|setting:20}}    // next Z row
+    subs x6, x6, 1
+    bne .scalar_mul_loop
+
+    b .non_linear_loop
+
+.scalar_min:
+    mov x2, 5
+    b .scalar_min_max
+.scalar_max:
+    mov x2, 7
+.scalar_min_max:
+    ldr         w5, [x0, #8]
+
+    fmov        h0, w5
+    dup         v0.8h, v0.h[0]
+    dup         v1.8h, v0.h[0]
+    dup         v2.8h, v0.h[0]
+    dup         v3.8h, v0.h[0]
+
+    st1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    lsl x2, x2, 47
+    orr x2, x2, {{ 0|setting:43 }} // f32
+
+    mov x3, 64
+    .loop_scalar_max:
+        add x2, x2, {{ 0|setting:20}} // next Z
+        {% amx vecfp x2 %}
+        subs x3, x3, 1
+        bne .loop_scalar_max
+
+    b .non_linear_loop
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    mov x3, 0                               // x3 is the row
+    .loop_load:
+        // z reg is (row % 32) * 2 + (row / 32)
+        and x9, x3, 0x1f
+        lsl x9, x9, 1
+        lsr x10, x3, 5
+        add x9, x9, x10
+
+        mov x4, x5
+        {% for neon in (0..3) %}
+            {% for lane in (0..7) %}
+                ld1 { v{{neon}}.h }[{{lane}}], [x4], x7
+            {% endfor %}
+        {% endfor %}
+
+        st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+        {% amx ldy x1 %}
+
+        lsl x2, x9, 20                  // Z register to update
+        orr x2, x2, {{ 0|setting:63 }}  // vector mode
+        orr x2, x2, {{ 0|setting:29 }}  // perform Z+=Y
+        {% amx fma16 x2 %}
+
+        add x5, x5, x6
+        add x3, x3, 1
+        cmp x3, 64
+    bne .loop_load
+
+    /*
+    mov x3, 0                               // x3 is the row
+    .loop_load:
+        and x9, x3, 0xf                     // x9 = row % 16
+        lsl x9, x9, 2                       // x9 = (row % 16) * 4
+        lsr x10, x3, 4                      // x10 = row / 16 
+        lsl x10, x10, 1                     // x10 = (row / 16) * 2
+        add x9, x9, x10                     // x9 = x9 + x10
+
+        mov x4, x5
+        {% for neon in (0..3) %}
+            {% for lane in (0..3) %}
+                ld1 { v{{neon}}.s }[{{lane}}], [x4], x7
+            {% endfor %}
+        {% endfor %}
+        st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+        {% for neon in (0..3) %}
+            {% for lane in (0..3) %}
+                ld1 { v{{neon}}.s }[{{lane}}], [x4], x7
+            {% endfor %}
+        {% endfor %}
+        st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8]
+
+        mov x2, x1
+        orr x2, x2, {{ 0|setting:62 }} // load 32 values
+        {% amx ldy x2 %}
+
+        lsl x2, x9, 20                  // left Z register to update
+        orr x2, x2, {{ 0|setting:63 }}  // vector mode
+        orr x2, x2, {{ 0|setting:29 }}  // perform Z+=Y
+        {% amx fma32 x2 %}
+
+        add x2, x2, {{0|setting:20}}
+        orr x2, x2, 64                  // offset Y by 16 values
+        {% amx fma32 x2 %}
+
+        add x5, x5, x6
+    add x3, x3, 1
+    cmp x3, 32
+    bne .loop_load
+
+*/
+
+    b .non_linear_loop
+
+.add_row_col_products:
+    ldp         x5, x6, [x0, #8]            // a base ptr, b base ptr
+
+    add x8, x1, 64
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+
+    {% amx ldx x1 %}
+
+    // top
+    eor x2, x2, x2
+    {% amx fma16 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fma16 x2 %}
+
+    b .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x7, 2
+    bne         .store_generic
+    ands        x8, x5, 0x7f
+    bne         .store_generic
+    ands        x8, x6, 0x7f
+    bne         .store_generic
+
+    lsl x8, x6, 5
+    add x8, x8, x5                          // x8 = 32*rsc
+    orr x8, x8, {{ 0|setting:56 }}          // first to x8 is z1
+
+    mov x4, {{0|setting:57}}                // Zreg += 2
+    add x4, x4, x6                          // +rsc
+
+    mov x3, 32
+    .loop_store_direct:
+        {% amx stz x5 %}
+        {% amx stz x8 %}
+        add x5, x5, x4
+        add x8, x8, x4
+    subs x3, x3, 1
+    bne .loop_store_direct
+
+    b .non_linear_loop
+
+.store_generic:
+
+    mov x3, 0                               // row id
+    .loop_store:
+        // z reg is (row % 32) * 2 + (row / 32)
+        and x9, x3, 0x1f
+        lsl x9, x9, 1
+        lsr x10, x3, 5
+        add x9, x9, x10
+
+        lsl x2, x9, 56
+        orr x2, x2, x1
+        {% amx stz x2 %}                            // f16 x 32
+
+        ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x1]
+
+        mov x4, x5
+        {% for neon in (0..3) %}
+            {% for lane in (0..7) %}
+                st1 { v{{neon}}.h }[{{lane}}], [x4], x7
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x6
+
+        add x3, x3, 1
+        cmp x3, 64
+    bne .loop_store
+    b .non_linear_loop
+
+.load_tile:
+    ldr  x2, [x0, #16]                      // row major ptr
+    orr  x3, x2, {{0|setting:56}}
+    add  x3, x3, #2048
+    
+    mov  x4, {{0|setting:57}}               // z+=2
+    add  x4, x4, #64
+
+    mov x8, 32
+    .loop_load_tile:
+        {% amx ldz x2 %}
+        {% amx ldz x3 %}
+        add x2, x2, x4
+        add x3, x3, x4
+    subs x8, x8, 1
+    bne .loop_load_tile
+
+    b .non_linear_loop
+   
+.return:
+{{ AMX_CLR }}
+ret
diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x1.tmpl b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x1.tmpl
new file mode 100644
index 000000000..132283f7f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x1.tmpl
@@ -0,0 +1,533 @@
+// vim: ft=arm
+.text
+.align 4
+
+/* Z: 32x1
+ z0[0] ..  z0[15] z1[0] .. z1[15]
+*/
+    
+
+.global {{G}}apple_amx_mmm_f32_32x1_{{suffix}}
+{{G}}apple_amx_mmm_f32_32x1_{{suffix}}:
+
+{{ AMX_SET }}
+
+    // set x1 to a 128 bytes aligned block for loads
+    mov x1, sp
+    lsr x1, x1, #7
+    lsl x1, x1, #7
+    sub x1, x1, 128
+
+{% include "dispatcher.tmpliq" %}
+
+.leaky_relu:
+.q_scale:
+.q_shl:
+.q_shr:
+    b .unsupported
+
+.add_mat_mul:
+
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x4, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    orr x4, x4, {{ 0|setting:62 }}  // load a pair of A
+
+    mov x5, {{ 0|setting:44 }}      // f32
+    orr x5, x5, {{ 0|setting:38 }}  // Broadcast Y
+
+    orr x6, x5, {{ 0|setting:20 }}  // z offset
+    orr x6, x6, {{ 0|setting:16 }}  // x offset
+
+    cmp         x3, #16
+    blt         .packed_packed_loop_1
+
+    mov x9, {{0|setting:32}}        // Y broadcast offset += 1
+
+    .packed_packed_loop_16:
+        mov x7, x5
+        mov x8, x6
+        {% amx ldy x2 %}
+        {% for k in (0..15) %}
+            {% amx ldx x4 %}
+            add x4, x4, 128
+            {% amx vecfp x7 %}
+            {% amx vecfp x8 %}
+            add x7, x7, x9
+            add x8, x8, x9
+        {% endfor %}
+        add x2, x2, #64
+        sub x3, x3, #16
+        cmp x3, #16
+    bge .packed_packed_loop_16
+
+    cmp x3, #0
+    beq .non_linear_loop
+
+    .packed_packed_loop_1:
+        ldr w7, [x2], #4
+        str w7, [x1]
+        {% amx ldx x4 %}
+        {% amx ldy x1 %}
+        {% amx vecfp x5 %}
+        {% amx vecfp x6 %}
+        add x4, x4, 128
+        subs x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+.clear:
+    // top left
+    eor x2, x2, x2
+    orr x2, x2, {{ 0|setting:27 }}
+    orr x2, x2, {{ 0|setting:28 }}
+    orr x2, x2, {{ 0|setting:29 }}  // Z = 0
+    {% amx fma32 x2 %}
+
+    // top right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    {% amx fma32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 3
+    {% amx fma32 x2 %}
+
+    // bottom left
+    eor x2, x2, {{ 0|setting:20 }}  // Z row = 2
+    {% amx fma32 x2 %}
+
+    b .non_linear_loop
+
+.per_col_sub:
+
+    // performs a unary neg on Z
+    eor x2, x2, x2                      // X[0] = Z[0]
+    // extr[hxyz] is suport confusing
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    {% amx extrx x2 %}
+    {% amx fms32 x4 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    add x4, x4, {{0|setting:20}}    // next Z row
+    {% amx extrx x2 %}              // extr[hxyz] is confusing
+    {% amx fms32 x4 %}
+
+    // continue
+
+.per_col_add:
+    ldr         x2, [x0, #8]
+
+    // broadcast value to x0
+    ld1         { v0.s }[0], [x2]
+    dup         v0.4s, v0.s[0]
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    sub         x1, x1, #64
+
+    {% amx ldx x1 %} // load into x0 by default
+
+    mov x2, {{ 0|setting:28 }}      // z += y
+    {% amx fma32 x2 %}
+
+    orr x2, x2, {{ 0|setting:20 }}  // target is now z1
+    {% amx fma32 x2 %}
+
+    b .non_linear_loop
+
+.per_col_sub_flipped:
+    ldr         x2, [x0, #8]
+
+    // broadcast value to x0
+    ld1         { v0.s }[0], [x2]
+    dup         v0.4s, v0.s[0]
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    sub x1, x1, #64
+
+    {% amx ldx x1 %} // load into x0 by default
+
+    mov x2, {{ 0|setting:28 }}      // z += y
+    {% amx fms32 x2 %}
+
+    orr x2, x2, {{ 0|setting:20 }}  // target is now z1
+    {% amx fms32 x2 %}
+
+    b .non_linear_loop
+
+.per_row_sub_flipped:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    mov x2, {{ 0|setting:63 }}      // vector mode
+    orr x2, x2, {{ 0|setting:29 }}  // z -= y
+
+    // top left
+    {% amx fms32 x2 %}
+
+    // bottom left
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fms32 x2 %}
+
+    b .non_linear_loop
+
+.per_row_sub:
+    // performs a unary neg on Z
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    {% amx extrx x2 %}
+    {% amx fms32 x4 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    add x4, x4, {{0|setting:20}}    // next Z row
+    {% amx extrx x2 %}
+    {% amx fms32 x4 %}
+
+    // continue
+
+.per_row_add:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    mov x2, {{ 0|setting:63 }}      // vector mode
+    orr x2, x2, {{ 0|setting:29 }}  // z += y
+
+    // top left
+    {% amx fma32 x2 %}
+
+    // bottom left
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fma32 x2 %}
+
+    b .non_linear_loop
+
+.per_row_min:
+    mov x2, 5
+    b .per_row_min_max
+.per_row_max:
+    mov x2, 7
+.per_row_min_max:
+    ldr         x5, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x5, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldx x5 %}
+
+    lsl x2, x2, 47                  // max(x,z) (or min)
+    orr x2, x2, {{ 0|setting:44 }}  // f32
+    {% amx vecfp x2 %}
+
+    orr x2, x2, {{ 0|setting:16 }}  // x1
+    orr x2, x2, {{ 0|setting:20 }}  // z1
+    {% amx vecfp x2 %}
+
+    b .non_linear_loop
+
+.per_col_min:
+    mov x2, 5
+    b .per_col_min_max
+.per_col_max:
+    mov x2, 7
+.per_col_min_max:
+    ldr         x4, [x0, #8]
+
+    // broadcast value to x0
+    ld1         { v0.s }[0], [x4]
+    dup         v0.4s, v0.s[0]
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    sub         x1, x1, #64
+
+    {% amx ldx x1 %}
+
+    lsl x2, x2, 47                  // max(x,z) (or min)
+    orr x2, x2, {{ 0|setting:44 }}  // f32
+
+    {% amx vecfp x2 %}
+    orr x2, x2, {{ 0|setting:20 }}  // z offset
+    {% amx vecfp x2 %}
+
+    b .non_linear_loop
+
+.per_col_mul:
+    ldr         x4, [x0, #8]
+
+    // broadcast value to y0
+    ld1         { v0.s }[0], [x4]
+    dup         v0.4s, v0.s[0]
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    st1         { v0.4s }, [x1], #16
+    sub         x1, x1, #64
+
+    {% amx ldy x1 %}
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+    {% amx extrx x2 %}
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+    {% amx fma32 x4 %}
+    orr x2, x2, {{ 0|setting:20 }}      // Z1
+    {% amx extrx x2 %}
+    orr x4, x4, {{ 0|setting:20 }}      // Z1
+    {% amx fma32 x4 %}
+
+    b .non_linear_loop
+
+.per_row_mul:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}      // pair
+    {% amx ldy x2 %}
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+    {% amx extrx x2 %}
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+    {% amx fma32 x4 %}
+    orr x2, x2, {{ 0|setting:20 }}      // Z1
+    {% amx extrx x2 %}
+    orr x4, x4, {{ 0|setting:20 }}      // Z1
+    orr x4, x4, {{ 0|setting:6 }}       // Y1
+    {% amx fma32 x4 %}
+
+    b .non_linear_loop
+
+.scalar_sub:
+    // performs a unary neg on Z, then go to scalar_add
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+    {% amx extrx x2 %}
+    {% amx fms32 x4 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    add x4, x4, {{0|setting:20}}    // next Z row
+    {% amx extrx x2 %}
+    {% amx fms32 x4 %}
+
+    // continue on purpose
+
+.scalar_add:
+    ldr         w5, [x0, #8]
+
+    fmov        s0, w5
+    dup         v0.4s, v0.s[0]
+    dup         v1.4s, v0.s[0]
+    dup         v2.4s, v0.s[0]
+    dup         v3.4s, v0.s[0]
+
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    mov x2, {{ 0|setting:28 }}          // Z+=X 
+    {% amx fma32 x2 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    {% amx fma32 x2 %}
+    b .non_linear_loop
+
+.scalar_sub_flipped:
+    ldr         w5, [x0, #8]
+    fmov        s0, w5
+    dup         v0.4s, v0.s[0]
+    dup         v1.4s, v0.s[0]
+    dup         v2.4s, v0.s[0]
+    dup         v3.4s, v0.s[0]
+
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    mov x2, {{ 0|setting:28 }}          // Z-=X 
+    {% amx fms32 x2 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    {% amx fms32 x2 %}
+    b .non_linear_loop
+
+.scalar_mul:
+    ldr         w5, [x0, #8]
+    fmov        s0, w5
+    dup         v0.4s, v0.s[0]
+    dup         v1.4s, v0.s[0]
+    dup         v2.4s, v0.s[0]
+    dup         v3.4s, v0.s[0]
+
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldy x1 %}    // load 16 values
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+
+    {% amx extrx x2 %}
+    {% amx fma32 x4 %}
+    add x2, x2, {{0|setting:20}}    // next Z row
+    add x4, x4, {{0|setting:20}}    // next Z row
+    {% amx extrx x2 %}
+    {% amx fma32 x4 %}
+
+    b .non_linear_loop
+
+.scalar_min:
+    mov x2, 5
+    b .scalar_min_max
+.scalar_max:
+    mov x2, 7
+.scalar_min_max:
+    ldr         w5, [x0, #8]
+    fmov        s0, w5
+    dup         v0.4s, v0.s[0]
+    dup         v1.4s, v0.s[0]
+    dup         v2.4s, v0.s[0]
+    dup         v3.4s, v0.s[0]
+
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    lsl x2, x2, 47
+    orr x2, x2, {{ 0|setting:44 }} // f32
+
+    {% amx vecfp x2 %}
+    add x2, x2, {{ 0|setting:20}} // next Z
+    {% amx vecfp x2 %}
+
+    b .non_linear_loop
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    {% for neon in (0..7) %}
+       {% for lane in (0..3) %}
+           ld1 { v{{neon}}.s }[{{lane}}], [x5], x6
+       {% endfor %}
+    {% endfor %}
+    mov x8, x1
+    st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8], #64
+    st1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x8], #64
+
+    orr x8, x1, {{ 0|setting:62 }}          // pair
+    {% amx ldy x8 %}
+
+    eor x2, x2, x2
+    orr x2, x2, {{ 0|setting:63 }}  // vector mode
+    orr x2, x2, {{ 0|setting:29 }}  // perform Z0+=Y0
+    {% amx fma32 x2 %}
+    orr x2, x2, {{ 0|setting:20 }}  // Z1
+    orr x2, x2, 64                  // offset Y by 16 values
+    {% amx fma32 x2 %}
+    
+    b .non_linear_loop
+
+.add_row_col_products:
+    ldp         x5, x6, [x0, #8]            // a base ptr, b base ptr
+
+    ld1         { v0.s }[0], [x6]
+    st1         { v0.s }[0], [x1]
+    {% amx ldy x1 %}
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldx x2 %}
+
+    mov x2, {{ 0|setting:44 }}      // f32
+    orr x2, x2, {{ 0|setting:38 }}  // Broadcast Y
+    {% amx vecfp x2 %}
+
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:16 }}  // X offset
+    {% amx vecfp x2 %}
+
+    b .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    ands        x8, x5, 0x7f
+    bne         .store_generic
+    cmp         x6, 4
+    bne         .store_generic
+    cmp         x7, 4
+    bne         .store_generic
+ 
+    orr x5, x5, {{ 0|setting:62 }}          // pair
+    {% amx stz x5 %}
+    b .non_linear_loop
+
+ .store_generic:
+
+    orr x8, x1, {{ 0|setting:62 }}          // pair
+    {% amx stz x8 %}
+
+    mov x8, x1
+    ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8], #64
+    ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x8], #64
+    {% for neon in (0..7) %}
+       {% for lane in (0..3) %}
+           st1 { v{{neon}}.s }[{{lane}}], [x5], x6
+       {% endfor %}
+    {% endfor %}
+    
+    b .non_linear_loop
+
+.load_tile:
+    ldr  x2, [x0, #16]                      // row major ptr
+    orr  x2, x2, {{0|setting:62}}           // load pairs
+    {% amx ldz x2 %}
+    b .non_linear_loop
+
+.return:
+{{ AMX_CLR }}
+ret
diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x32.tmpl b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x32.tmpl
new file mode 100644
index 000000000..940efc74c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/apple_amx_mmm_f32_32x32.tmpl
@@ -0,0 +1,764 @@
+// vim: ft=arm
+.text
+.align 4
+
+/* Z: 32x32
+ z0[0] ..  z0[15] z1[0] .. z1[15]
+ z4[0] ..  z4[15] z5[0] .. z5[15]
+  ..
+z60[0] .. z60[15] z61[0] .. z61[15]
+
+ z2[0] ..  z2[15] z3[0] .. z3[15]
+ z5[0] ..  z5[15] z6[0] .. z6[15]
+  ..
+z62[0] .. z62[15] z63[0] .. z63[15]
+*/
+    
+
+.global {{G}}apple_amx_mmm_f32_32x32_{{suffix}}
+{{G}}apple_amx_mmm_f32_32x32_{{suffix}}:
+
+{{ AMX_SET }}
+
+    // set x1 to a 128 bytes aligned block for loads
+    mov x1, sp
+    lsr x1, x1, #7
+    lsl x1, x1, #7
+    sub x1, x1, 128
+
+{% include "dispatcher.tmpliq" %}
+
+.leaky_relu:
+.q_scale:
+.q_shl:
+.q_shr:
+    b .unsupported
+
+.add_mat_mul:
+
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x4, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    orr         x4, x4, {{0|setting:62}}    // load pairs (A)
+    orr         x2, x2, {{0|setting:62}}    // load pairs (B)
+
+    eor         x5, x5, x5                  // top left
+
+    orr         x6, x5, {{ 0|setting:20 }}  // Z row = 1
+    orr         x6, x6, {{ 0|setting:16 }}  // top right
+
+    orr         x7, x5, {{ 0|setting:21 }}
+    orr         x7, x7, {{ 0|setting:6 }}   // bottom left
+
+    orr         x8, x7, x6                  // bottom right
+
+    .packed_packed_loop_1:
+    {% amx ldx x2 %}
+    {% amx ldy x4 %}
+    add x2, x2, 128
+    add x4, x4, 128
+
+    {% amx fma32 x5 %}
+    {% amx fma32 x6 %}
+    {% amx fma32 x7 %}
+    {% amx fma32 x8 %}
+
+    subs x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+.clear:
+    // top left
+    eor x2, x2, x2
+    orr x2, x2, {{ 0|setting:27 }}
+    orr x2, x2, {{ 0|setting:28 }}
+    orr x2, x2, {{ 0|setting:29 }}  // Z = 0
+    {% amx fma32 x2 %}
+
+    // top right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    {% amx fma32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 3
+    {% amx fma32 x2 %}
+
+    // bottom left
+    eor x2, x2, {{ 0|setting:20 }}  // Z row = 2
+    {% amx fma32 x2 %}
+
+    b .non_linear_loop
+
+.per_col_sub:
+
+    // performs a unary neg on Z
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    mov x6, 64
+    .per_col_sub_loop:
+        {% amx extrx x2 %}
+        {% amx fms32 x4 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+        add x4, x4, {{0|setting:20}}    // next Z row
+    subs x6, x6, 1
+    bne .per_col_sub_loop
+
+    // continue
+
+.per_col_add:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x1, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldx x1 %}
+
+    mov x2, {{ 0|setting:28 }}      // z += y
+
+    // top left
+    {% amx fma32 x2 %}
+
+    // bottom left
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 2
+    {% amx fma32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:16 }}  // X offset
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 3
+    {% amx fma32 x2 %}
+
+    // top right
+    eor x2, x2, {{ 0|setting:21 }}  // Z row = 1
+    {% amx fma32 x2 %}
+
+    b .non_linear_loop
+
+.per_col_sub_flipped:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x1, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldx x1 %}
+
+    mov x2, {{ 0|setting:28 }}      // z += y
+
+    // top left
+    {% amx fms32 x2 %}
+
+    // bottom left
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 2
+    {% amx fms32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:16 }}  // X offset
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 3
+    {% amx fms32 x2 %}
+
+    // top right
+    eor x2, x2, {{ 0|setting:21 }}  // Z row = 1
+    {% amx fms32 x2 %}
+
+
+    b .non_linear_loop
+
+.per_row_sub_flipped:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    mov x2, {{ 0|setting:29 }}      // z += y
+
+    // top left
+    {% amx fms32 x2 %}
+
+    // top right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    {% amx fms32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 3
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fms32 x2 %}
+
+    // bottom left
+    eor x2, x2, {{ 0|setting:20 }}  // Z row = 2
+    {% amx fms32 x2 %}
+
+    b .non_linear_loop
+
+.per_row_sub:
+    // performs a unary neg on Z
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    mov x6, 64
+    .per_row_sub_loop:
+        {% amx extrx x2 %}
+        {% amx fms32 x4 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+        add x4, x4, {{0|setting:20}}    // next Z row
+    subs x6, x6, 1
+    bne .per_row_sub_loop
+
+    // continue
+
+.per_row_add:
+    ldr         x2, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    mov x2, {{ 0|setting:29 }}      // z += y
+
+    // top left
+    {% amx fma32 x2 %}
+
+    // top right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    {% amx fma32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 3
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fma32 x2 %}
+
+    // bottom left
+    eor x2, x2, {{ 0|setting:20 }}  // Z row = 2
+    {% amx fma32 x2 %}
+
+    b .non_linear_loop
+
+.per_row_min:
+    mov x2, 5
+    b .per_row_min_max
+.per_row_max:
+    mov x2, 7
+.per_row_min_max:
+    ldr         x5, [x0, #8]
+
+    add x6, x5, 64
+
+    lsl x2, x2, 47                  // max(x,z) (or min)
+    orr x2, x2, {{ 0|setting:44 }}  // f32
+    orr x3, x2, {{ 0|setting:20 }}  // right half: z offset
+
+    orr x8, x2, {{ 0|setting:21 }}  // bottom left
+    orr x9, x3, {{ 0|setting:21 }}  // bottom right
+
+    mov x4, 16
+    .loop_per_row_max:
+        // top half
+        ld1         { v0.s }[0], [x5], #4
+        dup         v0.4s, v0.s[0]
+        dup         v1.4s, v0.s[0]
+        dup         v2.4s, v0.s[0]
+        dup         v3.4s, v0.s[0]
+        st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+
+        {% amx ldx x1 %}
+        {% amx vecfp x2 %}
+        {% amx vecfp x3 %}
+
+        add x2, x2, {{ 0|setting:22 }}
+        add x3, x3, {{ 0|setting:22 }}
+
+        // bottom half
+        ld1         { v0.s }[0], [x6], #4
+        dup         v0.4s, v0.s[0]
+        dup         v1.4s, v0.s[0]
+        dup         v2.4s, v0.s[0]
+        dup         v3.4s, v0.s[0]
+        st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+
+        {% amx ldx x1 %}
+        {% amx vecfp x8 %}
+        {% amx vecfp x9 %}
+
+        add x8, x8, {{ 0|setting:22 }}
+        add x9, x9, {{ 0|setting:22 }}
+
+    subs x4, x4, 1
+    bne .loop_per_row_max
+
+    b .non_linear_loop
+
+.per_col_min:
+    mov x2, 5
+    b .per_col_min_max
+.per_col_max:
+    mov x2, 7
+.per_col_min_max:
+    ldr         x4, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x4], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x4]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x3, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldx x3 %}
+
+    lsl x2, x2, 47                  // max(x,z) (or min)
+    orr x2, x2, {{ 0|setting:44 }}  // f32
+
+    orr x3, x2, {{ 0|setting:16 }}  // right half: x offset
+    orr x3, x3, {{ 0|setting:20 }}  // right half: z offset
+
+    mov x4, 32
+    .loop_per_col_max:
+        {% amx vecfp x2 %}
+        {% amx vecfp x3 %}
+        add x2, x2, {{ 0|setting:21 }}
+        add x3, x3, {{ 0|setting:21 }}
+    subs x4, x4, 1
+    bne .loop_per_col_max
+
+    b .non_linear_loop
+
+.per_col_mul:
+    ldr         x4, [x0, #8]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x4], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x4]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}      // load a pair
+    {% amx ldy x2 %}
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    eor x3, x3, x3
+    orr x3, x3, {{0|setting:20 }}       // Z[1]
+    orr x3, x3, {{0|setting:16 }}       // X[1]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+
+    mov x5, {{ 0|setting:63 }}          // vector mode
+    orr x5, x5, {{ 0|setting:27 }}      // Z=X*Y
+    orr x5, x5, {{ 0|setting:20 }}      // Z right
+    orr x5, x5, {{ 0|setting:16 }}      // X[1] (right)
+    orr x5, x5, {{ 0|setting:6 }}       // Y[1] (right)
+
+    mov x6, 32
+    .loop_per_col_mul:
+        {% amx extrx x2 %}
+        {% amx extrx x3 %}
+        {% amx fma32 x4 %}
+        {% amx fma32 x5 %}
+        add x2, x2, {{0|setting:21}}
+        add x3, x3, {{0|setting:21}}
+        add x4, x4, {{0|setting:21}}
+        add x5, x5, {{0|setting:21}}
+    subs x6, x6, 1
+    bne .loop_per_col_mul
+
+    b .non_linear_loop
+
+.per_row_mul:
+    ldr         x14, [x0, #8]
+    add         x15, x14, 64
+
+    // extrx
+    eor x2, x2, x2                      // X[0] = Z[0] (top left)
+
+    eor x3, x3, x3
+    orr x3, x3, {{0|setting:20 }}       // Z[1]
+    orr x3, x3, {{0|setting:16 }}       // X[1] = Z[1] (top right)
+
+    eor x4, x4, x4
+    orr x4, x4, {{0|setting:21}}        // X[0] = Z[2] (bottom left)
+
+    orr x5, x4, {{0|setting:20}}
+    orr x5, x5, {{0|setting:16}}        // X[1] = Z[3] (bottom right)
+
+    // fma32
+    eor x6, x6, x6
+    orr x6, x6, {{0|setting:63}}        // vector mode
+    orr x6, x6, {{0|setting:27}}        // Z=X*Y       Z[0]=X[0]*Y[0]
+
+    orr x7, x6, {{0|setting:20}}        // Z[1]
+    orr x7, x7, {{0|setting:16}}        // X[1]        Z[1] = X[1]*Y[0]
+
+    orr x8, x6, {{0|setting:21}}        // Z[2]
+    orr x8, x8, {{0|setting:21}}        // Z[2]
+
+    orr x9, x8, {{0|setting:20}}        // Z[3]
+    orr x9, x9, {{0|setting:16}}        // X[1]
+
+    mov x10, 16
+    .loop_per_row_mul:
+        // top
+        ld1         { v0.s }[0], [x14], #4
+        dup         v0.4s, v0.s[0]
+        dup         v1.4s, v0.s[0]
+        dup         v2.4s, v0.s[0]
+        dup         v3.4s, v0.s[0]
+        st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+
+        {% amx ldy x1 %}
+        {% amx extrx x2 %}
+        {% amx extrx x3 %}
+        {% amx fma32 x6 %}
+        {% amx fma32 x7 %}
+
+        add x2, x2, {{ 0|setting:22 }}
+        add x3, x3, {{ 0|setting:22 }}
+        add x6, x6, {{ 0|setting:22 }}
+        add x7, x7, {{ 0|setting:22 }}
+
+        // bottom
+        ld1         { v0.s }[0], [x15], #4
+        dup         v0.4s, v0.s[0]
+        dup         v1.4s, v0.s[0]
+        dup         v2.4s, v0.s[0]
+        dup         v3.4s, v0.s[0]
+        st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+
+        {% amx ldy x1 %}
+        {% amx extrx x4 %}
+        {% amx extrx x5 %}
+        {% amx fma32 x8 %}
+        {% amx fma32 x9 %}
+
+        add x4, x4, {{ 0|setting:22 }}
+        add x5, x5, {{ 0|setting:22 }}
+        add x8, x8, {{ 0|setting:22 }}
+        add x9, x9, {{ 0|setting:22 }}
+
+    subs x10, x10, 1
+    bne .loop_per_row_mul
+
+    b .non_linear_loop
+
+.scalar_sub:
+    // performs a unary neg on Z, then go to scalar_add
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:28 }}
+    orr x4, x4, {{ 0|setting:27 }}      // Z=-X
+
+    mov x6, 64
+    .scalar_sub_loop:
+        {% amx extrx x2 %}
+        {% amx fms32 x4 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+        add x4, x4, {{0|setting:20}}    // next Z row
+    subs x6, x6, 1
+    bne .scalar_sub_loop
+
+    // continue on purpose
+
+.scalar_add:
+    ldr         w5, [x0, #8]
+
+    fmov        s0, w5
+    dup         v0.4s, v0.s[0]
+    dup         v1.4s, v0.s[0]
+    dup         v2.4s, v0.s[0]
+    dup         v3.4s, v0.s[0]
+
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    mov x2, {{ 0|setting:28 }}          // Z+=X 
+    {% for chunk in (0..3) %}
+        {% amx fma32 x2 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+    {% endfor %}
+    b .non_linear_loop
+
+.scalar_sub_flipped:
+    ldr         w5, [x0, #8]
+    fmov        s0, w5
+    dup         v0.4s, v0.s[0]
+    dup         v1.4s, v0.s[0]
+    dup         v2.4s, v0.s[0]
+    dup         v3.4s, v0.s[0]
+
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    mov x2, {{ 0|setting:28 }}          // Z-=X 
+    {% for chunk in (0..3) %}
+        {% amx fms32 x2 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+    {% endfor %}
+    b .non_linear_loop
+
+.scalar_mul:
+    ldr         w5, [x0, #8]
+    fmov        s0, w5
+    dup         v0.4s, v0.s[0]
+    dup         v1.4s, v0.s[0]
+    dup         v2.4s, v0.s[0]
+    dup         v3.4s, v0.s[0]
+
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldy x1 %}    // load 16 values
+
+    eor x2, x2, x2                      // X[0] = Z[0]
+
+    mov x4, {{ 0|setting:63 }}          // vector mode
+    orr x4, x4, {{ 0|setting:27 }}      // Z=X*Y
+
+    mov x6, 64
+    .scalar_mul_loop:
+        {% amx extrx x2 %}
+        {% amx fma32 x4 %}
+        add x2, x2, {{0|setting:20}}    // next Z row
+        add x4, x4, {{0|setting:20}}    // next Z row
+    subs x6, x6, 1
+    bne .scalar_mul_loop
+
+    b .non_linear_loop
+
+.scalar_min:
+    mov x2, 5
+    b .scalar_min_max
+.scalar_max:
+    mov x2, 7
+.scalar_min_max:
+    ldr         w5, [x0, #8]
+    fmov        s0, w5
+    dup         v0.4s, v0.s[0]
+    dup         v1.4s, v0.s[0]
+    dup         v2.4s, v0.s[0]
+    dup         v3.4s, v0.s[0]
+
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    {% amx ldx x1 %}    // load 16 values
+
+    lsl x2, x2, 47
+    orr x2, x2, {{ 0|setting:44 }} // f32
+
+    mov x3, 64
+    .loop_scalar_max:
+        add x2, x2, {{ 0|setting:20}} // next Z
+        {% amx vecfp x2 %}
+        subs x3, x3, 1
+        bne .loop_scalar_max
+
+    b .non_linear_loop
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    add x8, x1, 64
+
+    mov x3, 0                               // x3 is the row
+    .loop_load:
+        and x9, x3, 0xf                     // x9 = row % 16
+        lsl x9, x9, 2                       // x9 = (row % 16) * 4
+        lsr x10, x3, 4                      // x10 = row / 16 
+        lsl x10, x10, 1                     // x10 = (row / 16) * 2
+        add x9, x9, x10                     // x9 = x9 + x10
+
+        mov x4, x5
+        {% for neon in (0..3) %}
+            {% for lane in (0..3) %}
+                ld1 { v{{neon}}.s }[{{lane}}], [x4], x7
+            {% endfor %}
+        {% endfor %}
+        st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+        {% for neon in (0..3) %}
+            {% for lane in (0..3) %}
+                ld1 { v{{neon}}.s }[{{lane}}], [x4], x7
+            {% endfor %}
+        {% endfor %}
+        st1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8]
+
+        mov x2, x1
+        orr x2, x2, {{ 0|setting:62 }} // load 32 values
+        {% amx ldy x2 %}
+
+        lsl x2, x9, 20                  // left Z register to update
+        orr x2, x2, {{ 0|setting:63 }}  // vector mode
+        orr x2, x2, {{ 0|setting:29 }}  // perform Z+=Y
+        {% amx fma32 x2 %}
+
+        add x2, x2, {{0|setting:20}}
+        orr x2, x2, 64                  // offset Y by 16 values
+        {% amx fma32 x2 %}
+
+        add x5, x5, x6
+    add x3, x3, 1
+    cmp x3, 32
+    bne .loop_load
+
+    b .non_linear_loop
+
+.add_row_col_products:
+    ldp         x5, x6, [x0, #8]            // a base ptr, b base ptr
+
+    add x8, x1, 64
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x5]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldy x2 %}
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x6], #64
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1], #64
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
+    st1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+    sub x1, x1, #64
+
+    orr x2, x1, {{ 0|setting:62 }}  // load a pair
+    {% amx ldx x2 %}
+
+    // top left
+    eor x2, x2, x2
+    {% amx fma32 x2 %}
+
+    // top right
+    orr x2, x2, {{ 0|setting:20 }}  // Z row = 1
+    orr x2, x2, {{ 0|setting:16 }}  // X offset
+    {% amx fma32 x2 %}
+
+    // bottom right
+    orr x2, x2, {{ 0|setting:21 }}  // Z row = 3
+    orr x2, x2, {{ 0|setting:6 }}   // Y offset
+    {% amx fma32 x2 %}
+
+    // bottom left
+    eor x2, x2, {{ 0|setting:20 }}  // Z row = 2
+    eor x2, x2, {{ 0|setting:16 }}  // X offset <-
+    {% amx fma32 x2 %}
+
+    b .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x7, 4
+    bne         .store_generic
+    ands        x8, x5, 0x7f
+    bne         .store_generic
+    ands        x8, x6, 0x7f
+    bne         .store_generic
+
+    orr x5, x5, {{ 0|setting:62 }}          // pair
+    lsl x8, x6, 4
+    add x8, x8, x5                          // x8 = 16*rsc
+    orr x8, x8, {{ 0|setting:57 }}          // first to x8 is z2
+
+    mov x4, {{0|setting:58}}                // Zreg += 4
+    add x4, x4, x6                          // +rsc
+
+    mov x3, 16
+    .loop_store_direct:
+        {% amx stz x5 %}
+        {% amx stz x8 %}
+        add x5, x5, x4
+        add x8, x8, x4
+    subs x3, x3, 1
+    bne .loop_store_direct
+
+    b .non_linear_loop
+
+.store_generic:
+    
+    add x8, x1, 64
+
+    mov x3, 0                               // row id
+    .loop_store:
+        and x9, x3, 0xf                     // x9 = row % 16
+        lsl x9, x9, 2                       // x9 = (row % 16) * 4
+        lsr x10, x3, 4                      // x10 = row / 16 
+        lsl x10, x10, 1                     // x10 = (row / 16) * 2
+        add x9, x9, x10                     // x9 = x9 + x10
+
+        lsl x2, x9, 56
+        orr x2, x2, {{ 0|setting:62 }}
+        orr x2, x2, x1
+        {% amx stz x2 %}
+        ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x1]
+
+        mov x4, x5
+        {% for neon in (0..3) %}
+            {% for lane in (0..3) %}
+                st1 { v{{neon}}.s }[{{lane}}], [x4], x7
+            {% endfor %}
+        {% endfor %}
+        ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x8]
+        {% for neon in (0..3) %}
+            {% for lane in (0..3) %}
+                st1 { v{{neon}}.s }[{{lane}}], [x4], x7
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x6
+
+    add x3, x3, 1
+    cmp x3, 32
+    bne .loop_store
+
+    b .non_linear_loop
+
+.load_tile:
+    ldr  x2, [x0, #16]                      // row major ptr
+    orr  x2, x2, {{0|setting:62}}           // load pairs
+    mov  x3, x2
+    orr  x3, x3, {{0|setting:57}}
+    add  x3, x3, #2048
+    
+    mov  x4, {{0|setting:58}}               // z+=4
+    add  x4, x4, #128
+
+    mov x8, 16
+    .loop_load_tile:
+        {% amx ldz x2 %}
+        {% amx ldz x3 %}
+        add x2, x2, x4
+        add x3, x3, x4
+    subs x8, x8, 1
+    bne .loop_load_tile
+
+    b .non_linear_loop
+
+.return:
+{{ AMX_CLR }}
+ret
diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm64/apple_amx/dispatcher.tmpliq
new file mode 100644
index 000000000..150db4683
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/dispatcher.tmpliq
@@ -0,0 +1,37 @@
+// vim: ft=arm
+
+.non_linear:
+    sub         x0, x0, 40
+
+.non_linear_loop:
+    add         x0, x0, 40
+    ldr         x2, [x0]
+
+    mov         x4, #{{ jump_table | size }}
+
+    cmp         x2, #{{ jump_table | size }}
+    csel        x2, x2, x4, lt
+    cmp         x2, #0
+    csel        x2, x4, x2, lt
+
+    adr         x3, .jmp_table
+    add         x3, x3, x2, LSL#2
+    br          x3
+
+.jmp_table:
+{% for j in jump_table %}
+    b   .{{j}}
+{% endfor %}
+    b   .unsupported
+
+    add x0, x2, #4000
+    b .return
+
+.unsupported:
+    mov         x0, #1
+    b           .return
+
+.done:
+    mov         x0, 0
+    b           .return
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/apple_amx/instructions.rs b/vendor/tract-linalg-0.22.1/arm64/apple_amx/instructions.rs
new file mode 100644
index 000000000..5912b742c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/apple_amx/instructions.rs
@@ -0,0 +1,191 @@
+use liquid::model::KString;
+use liquid::partials::PartialCompiler;
+use liquid::{ParserBuilder, ValueView};
+use liquid_core::{
+    Display_filter, Expression, Filter, FilterParameters, FilterReflection, FromFilterParameters,
+    ParseFilter, ParseTag, Renderable, Runtime, TagReflection, Value,
+};
+
+pub fn register<C: PartialCompiler>(parser: ParserBuilder<C>) -> ParserBuilder<C> {
+    parser.tag(AmxTag).filter(LeftShift).filter(Setting).filter(Unsigned)
+}
+
+pub fn globals() -> Vec<(KString, Value)> {
+    vec![
+        ("AMX_SET".to_string().into(), Value::scalar(amx_nop_op_imm5(17, 0))),
+        ("AMX_CLR".to_string().into(), Value::scalar(amx_nop_op_imm5(17, 1))),
+    ]
+}
+
+fn amx_nop_op_imm5(op: usize, imm5: usize) -> String {
+    format!("nop\nnop\nnop\n.word 0x{:x}\n", (0x201000 + (op << 5) + imm5))
+}
+
+fn amx_nop_op_gpr(op: usize, gpr: usize) -> String {
+    format!(".word 0x{:x}", (0x201000 + (op << 5) + gpr))
+}
+
+#[derive(Copy, Clone)]
+struct AmxTag;
+
+impl ParseTag for AmxTag {
+    fn reflection(&self) -> &dyn liquid_core::TagReflection {
+        self
+    }
+
+    fn parse(
+        &self,
+        mut arguments: liquid_core::TagTokenIter,
+        _options: &liquid_core::Language,
+    ) -> liquid_core::Result<Box<dyn liquid_core::Renderable>> {
+        let op = arguments.expect_next("expects op and gpr")?.as_str().to_string();
+        let gpr = arguments
+            .expect_next("expects op and gpr")?
+            .as_str()
+            .trim_start_matches('x')
+            .parse::<usize>()
+            .unwrap();
+        let op_id = [
+            "ldx", "ldy", "stx", "sty", "ldz", "stz", "ldzi", "stzi", "extrx", "extry", "fma64",
+            "fms64", "fma32", "fms32", "mac16", "fma16", "fms16", "setclr", "vecint", "vecfp",
+            "matint", "matfp", "genlut",
+        ]
+        .iter()
+        .position(|x| x == &op)
+        .unwrap();
+        Ok(Box::new(RenderedAmxTag(format!(
+            "{} \t\t\t\t// AMX {op} x{gpr}\n",
+            amx_nop_op_gpr(op_id, gpr)
+        ))))
+    }
+}
+
+impl TagReflection for AmxTag {
+    fn tag(&self) -> &str {
+        "amx"
+    }
+
+    fn description(&self) -> &str {
+        "translate to an Apple AMX instruction"
+    }
+}
+
+#[derive(Clone, Debug)]
+struct RenderedAmxTag(String);
+
+impl Renderable for RenderedAmxTag {
+    fn render_to(
+        &self,
+        writer: &mut dyn std::io::Write,
+        _runtime: &dyn liquid_core::Runtime,
+    ) -> liquid_core::Result<()> {
+        writer.write_all(self.0.as_bytes()).unwrap();
+        Ok(())
+    }
+}
+
+#[derive(Debug, FilterParameters)]
+struct ShiftArgs {
+    #[parameter(description = "The number to shift the input by.")]
+    operand: Expression,
+}
+
+#[derive(Clone, ParseFilter, FilterReflection)]
+#[filter(
+    name = "lsl",
+    description = "Shift left a number by the given operand.",
+    parameters(ShiftArgs),
+    parsed(LeftShiftFilter)
+)]
+struct LeftShift;
+
+#[derive(Debug, FromFilterParameters, Display_filter)]
+#[name = "lsl"]
+struct LeftShiftFilter {
+    #[parameters]
+    args: ShiftArgs,
+}
+
+impl Filter for LeftShiftFilter {
+    fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> liquid_core::Result<Value> {
+        let args = self.args.evaluate(runtime)?;
+
+        let operand = args
+            .operand
+            .as_scalar()
+            .ok_or_else(|| invalid_argument("operand", "Number expected"))?;
+
+        let result = input
+            .as_scalar()
+            .unwrap()
+            .to_integer()
+            .and_then(|i| operand.to_integer().map(|o| Value::scalar(i << o)))
+            .ok_or_else(|| invalid_argument("operand", "Integer expected"))?;
+
+        Ok(result)
+    }
+}
+
+#[derive(Clone, ParseFilter, FilterReflection)]
+#[filter(
+    name = "setting",
+    description = "Set the bit deigned by the operand.",
+    parameters(ShiftArgs),
+    parsed(SettingFilter)
+)]
+struct Setting;
+
+#[derive(Debug, FromFilterParameters, Display_filter)]
+#[name = "setting"]
+struct SettingFilter {
+    #[parameters]
+    args: ShiftArgs,
+}
+
+impl Filter for SettingFilter {
+    fn evaluate(&self, input: &dyn ValueView, runtime: &dyn Runtime) -> liquid_core::Result<Value> {
+        let args = self.args.evaluate(runtime)?;
+
+        let operand = args
+            .operand
+            .as_scalar()
+            .ok_or_else(|| invalid_argument("operand", "Number expected"))?;
+
+        let result = input
+            .as_scalar()
+            .unwrap()
+            .to_integer()
+            .and_then(|i| operand.to_integer().map(|o| Value::scalar(i | (1 << o))))
+            .ok_or_else(|| invalid_argument("operand", "Integer expected"))?;
+
+        Ok(result)
+    }
+}
+
+fn invalid_argument<S>(argument: S, cause: S) -> liquid::Error
+where
+    S: Into<liquid_core::model::KString>,
+{
+    liquid_core::Error::with_msg("Invalid argument")
+        .context("argument", argument)
+        .context("cause", cause)
+}
+
+#[derive(Clone, ParseFilter, FilterReflection)]
+#[filter(name = "u", description = "unsigned number", parsed(UnsignedFilter))]
+pub struct Unsigned;
+
+#[derive(Debug, Default, Display_filter)]
+#[name = "float16"]
+struct UnsignedFilter;
+
+impl Filter for UnsignedFilter {
+    fn evaluate(
+        &self,
+        input: &dyn ValueView,
+        _runtime: &dyn Runtime,
+    ) -> liquid_core::Result<Value> {
+        let input = input.as_scalar().unwrap().to_integer().unwrap() as u64;
+        Ok(input.to_string().to_value())
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_leaky_relu_f16_8n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_leaky_relu_f16_8n.tmpl
new file mode 100644
index 000000000..b0d13dc34
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_leaky_relu_f16_8n.tmpl
@@ -0,0 +1,71 @@
+// vim: ft=arm
+
+// no preservation either for v0-v7 and v16-v31
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_leaky_relu_f16_8n_{{suffix}}
+{{G}}arm64fp16_leaky_relu_f16_8n_{{suffix}}:
+
+    cmp         x1, #0
+    beq         .return
+
+    mov         v31.h[0], w2
+    dup         v31.8h, v31.h[0]
+    mov         x2, x0
+    
+    cmp         x1, #64
+    blt         .loop
+
+    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x2], #64
+.loop4:
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64
+
+    fmul        v20.8h, v16.8h, v31.8h
+    fmul        v21.8h, v17.8h, v31.8h
+    fmul        v22.8h, v18.8h, v31.8h
+    fmul        v23.8h, v19.8h, v31.8h
+
+    fcmge       v24.8h, v16.8h, #0.0
+    fcmge       v25.8h, v17.8h, #0.0
+    fcmge       v26.8h, v18.8h, #0.0
+    fcmge       v27.8h, v19.8h, #0.0
+
+    bsl         v24.16b, v16.16b, v20.16b
+    bsl         v25.16b, v17.16b, v21.16b
+    bsl         v26.16b, v18.16b, v22.16b
+    bsl         v27.16b, v19.16b, v23.16b
+
+    st1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x0], #64
+
+    and         v16.16b, v0.16b, v0.16b
+    and         v17.16b, v1.16b, v1.16b
+    and         v18.16b, v2.16b, v2.16b
+    and         v19.16b, v3.16b, v3.16b
+
+    subs        x1, x1, #32
+    cmp         x1, #64
+    bge         .loop4
+
+    cmp         x1, #0
+    beq         .return
+
+.loop:
+    ld1         { v16.8h }, [x0]
+
+    fmul        v17.8h, v16.8h, v31.8h
+    fcmge       v18.8h, v16.8h, #0.0
+    bsl         v18.16b, v16.16b, v17.16b
+    
+    st1         { v18.8h }, [x0], #16
+
+    subs        x1, x1, #8
+    bne         .loop
+
+.return:
+    ret
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq
new file mode 100644
index 000000000..6bddb35f9
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_col.tmpliq
@@ -0,0 +1,41 @@
+// vim: ft=arm
+
+.{{label}}:
+    ldr         x2, [x0, #8]
+
+{% capture mr_over_8 %}{{ mr | divided_by: 8}}{%endcapture%}
+{% capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_8}}{%endcapture%}
+
+{% capture loads %}{{cols | divided_by: 8}}{% endcapture %}
+
+{%if cols == "1" %}
+        ld1         {v0.h}[0], [ x2 ]
+{% elsif cols == "3" %}
+        ld1         {v0.s}[0], [ x2 ], #4
+        ld1         {v0.h}[2], [ x2 ]
+{% elsif cols == "4" %}
+        ldr         d0, [ x2 ]
+{% elsif cols == "6" %}
+        ld1         {v0.d}[0], [ x2 ], #8
+        ld1         {v0.s}[2], [ x2 ]
+{% else %}
+    {% for reg in (1..loads) %}
+        ldr         q{{reg |minus:1}}, [ x2 ], #16
+    {% endfor %}
+{% endif %}
+
+// mr:{{mr}} {{ loads }} {{cols}}
+
+{% for col in (1..cols) %}
+    dup v3.8h, v{{col| minus: 1|divided_by:8}}.h[{{col| minus: 1|modulo:8}}]
+    {% for row in (1..mr_over_8) %}
+        {% capture acc %}{{ col|minus:1|times:mr_over_8|plus:row|minus:1|plus:from }}{% endcapture %}
+        {% if flipped %}
+            {{op}} v{{acc}}.8h, v{{acc}}.8h, v3.8h
+        {% else %}
+            {{op}} v{{acc}}.8h, v3.8h, v{{acc}}.8h
+        {% endif %}
+    {% endfor %}
+{% endfor %}
+
+b           .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq
new file mode 100644
index 000000000..f756344a7
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_per_row.tmpliq
@@ -0,0 +1,25 @@
+// vim: ft=arm
+
+.{{label}}:
+    ldr         x2, [x0, #8]
+
+{% capture mr_over_8 %}{{ mr | divided_by: 8 }}{%endcapture%}
+{% capture mr_over_8_min_1 %}{{ mr | divided_by: 8 | minus: 1 }}{%endcapture%}
+
+{% for reg in (0..mr_over_8_min_1) %}
+    ldr         q{{reg}}, [ x2 ], #16
+{% endfor %}
+
+{% if flipped %}
+    {% for acc in (from..to) %}
+        {% capture other%}{{acc | minus: from | modulo: mr_over_8}}{%endcapture%}
+        {{op}} v{{acc}}.8h, v{{acc}}.8h, v{{other}}.8h
+    {% endfor %}
+{% else %}
+    {% for acc in (from..to) %}
+        {% capture other%}{{acc | minus: from | modulo: mr_over_8}}{%endcapture%}
+        {{op}} v{{acc}}.8h, v{{other}}.8h, v{{acc}}.8h
+    {% endfor %}
+{% endif %}
+
+b           .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq
new file mode 100644
index 000000000..1916c2698
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_8h_scalar.tmpliq
@@ -0,0 +1,18 @@
+// vim: ft=arm
+
+.{{label}}:
+    add         x2, x0, #8
+    ld1         {v0.h}[0], [ x2 ]
+    dup         v0.8h, v0.h[0]
+    {% if flipped %}
+        {% for reg in (from..to) %}
+            {{op}}       v{{reg}}.8h, v{{reg}}.8h, v0.8h
+        {% endfor %}
+    {% else %}
+        {% for reg in (from..to) %}
+            {{op}}       v{{reg}}.8h, v0.8h, v{{reg}}.8h
+        {% endfor %}
+    {% endif %}
+
+    b           .non_linear_loop
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/cortex_a53.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/cortex_a53.tmpli
new file mode 100644
index 000000000..410816dff
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/cortex_a53.tmpli
@@ -0,0 +1,65 @@
+    fmla        v16.4s, v0.4s, v8.s[0]
+    ldr         x5, [x1, #128]
+    fmla        v17.4s, v1.4s, v8.s[0]
+    ldr         x6, [x1, #136]
+    fmla        v18.4s, v2.4s, v8.s[0]
+    ldr         x7, [x1, #144]
+    fmla        v19.4s, v3.4s, v8.s[0]
+    ldr         x9, [x1, #152]
+    ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [ x1 ], #64
+
+    fmla        v20.4s, v4.4s, v8.s[0]
+    ldr         x10, [x1, #96]
+    fmla        v21.4s, v5.4s, v8.s[0]
+    ldr         x11, [x1, #104]
+    fmla        v22.4s, v6.4s, v8.s[0]
+    ldr         x12, [x1, #112]
+    fmla        v23.4s, v7.4s, v8.s[0]
+    ldr         x13, [x1, #120]
+
+    ld1         {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [ x1 ]
+
+    fmla        v24.4s, v0.4s, v8.s[0]
+    ldr         x14, [x1, #128]
+    fmla        v25.4s, v1.4s, v8.s[0]
+    ldr         x15, [x1, #136]
+    fmla        v26.4s, v2.4s, v8.s[0]
+    ldr         x20, [x1, #144]
+    fmla        v27.4s, v3.4s, v8.s[0]
+    ldr         x21, [x1, #152]
+    fmla        v28.4s, v4.4s, v8.s[0]
+    ldr         x22, [x1, #160]
+    fmla        v29.4s, v5.4s, v8.s[0]
+    ldr         x23, [x1, #168]
+    fmla        v30.4s, v6.4s, v8.s[0]
+    ldr         x24, [x1, #176]
+    fmla        v31.4s, v7.4s, v8.s[0]
+    ldr         x25, [x1, #184]
+
+    ld1         {{ v8.s }}[0], [ x2 ], #4
+
+    prfm        pldl1keep, [x1, #1024]
+    prfm        pldl1keep, [x1, #1088]
+    prfm        pldl1keep, [x1, #1152]
+    prfm        pldl1keep, [x1, #1216]
+    prfm        pldl1keep, [x2, #256]
+
+    ins         v0.d[0], x5
+    ins         v1.d[0], x7
+    ins         v2.d[0], x10
+    ins         v3.d[0], x12
+    ins         v4.d[0], x14
+    ins         v5.d[0], x20
+    ins         v6.d[0], x22
+    ins         v7.d[0], x24
+
+    ins         v0.d[1], x6
+    ins         v1.d[1], x9
+    ins         v2.d[1], x11
+    ins         v3.d[1], x13
+    ins         v4.d[1], x15
+    ins         v5.d[1], x21
+    ins         v6.d[1], x23
+    ins         v7.d[1], x25
+
+    add         x1, x1, #192
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/naive.tmpli
new file mode 100644
index 000000000..367339ef5
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop1/naive.tmpli
@@ -0,0 +1,32 @@
+    ld1         {{ v9.8h, v10.8h, v11.8h, v12.8h }}, [x1], #64
+    ld1         {{ v13.8h, v14.8h, v15.8h }}, [x1], #48
+
+    fmla        v16.8h, v0.8h, v8.h[0]
+    fmla        v17.8h, v1.8h, v8.h[0]
+    fmla        v18.8h, v2.8h, v8.h[0]
+    fmla        v19.8h, v3.8h, v8.h[0]
+    fmla        v20.8h, v4.8h, v8.h[0]
+    fmla        v21.8h, v5.8h, v8.h[0]
+    fmla        v22.8h, v6.8h, v8.h[0]
+    fmla        v23.8h, v7.8h, v8.h[0]
+    fmla        v24.8h, v9.8h, v8.h[0]
+    ld1         {{ v9.8h }}, [ x1 ], #16
+    ld1         {{ v0.8h, v1.8h, v2.8h, v3.8h }}, [x1], #64
+    ld1         {{ v4.8h, v5.8h, v6.8h, v7.8h }}, [x1], #64
+    fmla        v25.8h, v10.8h, v8.h[0]
+    fmla        v26.8h, v11.8h, v8.h[0]
+    fmla        v27.8h, v12.8h, v8.h[0]
+    fmla        v28.8h, v13.8h, v8.h[0]
+    fmla        v29.8h, v14.8h, v8.h[0]
+    fmla        v30.8h, v15.8h, v8.h[0]
+
+    fmla        v31.8h, v9.8h, v8.h[0]
+
+    ld1         {{ v8.h }}[0], [ x2 ], #2
+
+    prfm        pldl1keep, [x1, #1024]
+    prfm        pldl1keep, [x1, #1088]
+    prfm        pldl1keep, [x1, #1152]
+    prfm        pldl1keep, [x1, #1216]
+    prfm        pldl1keep, [x2, #256]
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli
new file mode 100644
index 000000000..821ed3f5c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli
@@ -0,0 +1,85 @@
+    ld1         {{ v9.4s, v10.4s, v11.4s }}, [x1], #48
+
+    fmla        v16.8h, v0.8h, v8.h[0]
+    ldr         w8, [x2], #4
+    fmla        v17.8h, v1.8h, v8.h[0]
+    ldr         d12, [x1], #8
+    fmla        v18.8h, v2.8h, v8.h[0]
+    ldr         x12, [x1], #8
+    fmla        v19.8h, v3.8h, v8.h[0]
+    ldr         d13, [x1], #8
+    fmla        v20.8h, v4.8h, v8.h[0]
+    ldr         x13, [x1], #8
+    fmla        v21.8h, v5.8h, v8.h[0]
+    ldr         d14, [x1], #8
+    fmla        v22.8h, v6.8h, v8.h[0]
+    ldr         x14, [x1], #8
+    fmla        v23.8h, v7.8h, v8.h[0]
+    ldr         d15, [x1], #8
+    fmla        v24.8h, v9.8h, v8.h[0]
+    ldr         x15, [x1], #8
+
+    ld1         {{ v0.8h, v1.8h, v2.8h, v3.8h }}, [x1], #64
+    ins         v8.s[1], w8
+    ld1         {{ v4.8h, v5.8h, v6.8h, v7.8h }}, [x1], #64
+
+    fmla        v25.8h, v10.8h, v8.h[0]
+    ins         v12.d[1], x12
+    fmla        v26.8h, v11.8h, v8.h[0]
+    ins         v13.d[1], x13
+    fmla        v27.8h, v12.8h, v8.h[0]
+    ins         v14.d[1], x14
+    fmla        v28.8h, v13.8h, v8.h[0]
+    ins         v15.d[1], x15
+
+    ld1         {{ v9.8h, v10.8h, v11.8h, v12.8h }}, [x1], #64
+
+    fmla        v29.8h, v14.8h, v8.h[0]
+    ldr         d13, [x1], #8
+    fmla        v30.8h, v15.8h, v8.h[0]
+    ldr         x13, [x1], #8
+    fmla        v31.8h, v0.8h, v8.h[0]
+    ldr         d14, [x1], #8
+
+    fmla        v16.8h, v1.8h, v8.h[2]
+    ldr         x14, [x1], #8
+    fmla        v17.8h, v2.8h, v8.h[2]
+    ldr         d15, [x1], #8
+    fmla        v18.8h, v3.8h, v8.h[2]
+    ldr         x15, [x1], #8
+    fmla        v19.8h, v4.8h, v8.h[2]
+
+    ld1         {{ v0.8h }}, [x1], #16
+
+    fmla        v20.8h, v5.8h, v8.h[2]
+    ldr         d1, [x1], #8
+    fmla        v21.8h, v6.8h, v8.h[2]
+    ldr         x10, [x1], #8
+
+    fmla        v22.8h, v7.8h, v8.h[2]
+
+    fmla        v23.8h, v9.8h, v8.h[2]
+    ins         v13.d[1], x13
+    fmla        v24.8h, v10.8h, v8.h[2]
+    ins         v14.d[1], x14
+    fmla        v25.8h, v11.8h, v8.h[2]
+    ins         v15.d[1], x15
+
+    fmla        v26.8h, v12.8h, v8.h[2]
+    prfm        pldl1keep, [x1, #1024]
+    fmla        v27.8h, v13.8h, v8.h[2]
+    ins         v1.d[1], x10
+    fmla        v28.8h, v14.8h, v8.h[2]
+    prfm        pldl1keep, [x1, #1088]
+    fmla        v29.8h, v15.8h, v8.h[2]
+    prfm        pldl1keep, [x1, #1152]
+    fmla        v30.8h, v0.8h, v8.h[2]
+    prfm        pldl1keep, [x1, #1216]
+    fmla        v31.8h, v1.8h, v8.h[2]
+    prfm        pldl1keep, [x2, #256]
+
+    ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64
+    ins         v8.h[0], v8.h[3]
+    ld1         {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64
+
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1_core.tmpl
new file mode 100644
index 000000000..4ba821d4d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_128x1_core.tmpl
@@ -0,0 +1,203 @@
+// vim: ft=arm
+
+// C tile regs: v16 to v31, no need to preserve
+
+// no preservation either for v0-v7...
+// v8..v15 are callee-preserved
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_mmm_f16_128x1_{{core}}_{{suffix}}
+{{G}}arm64fp16_mmm_f16_128x1_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+    sub         x3, x3, #1
+
+
+    ld1         { v8.h }[0], [ x2 ], #2
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ld1         { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64
+
+    cmp         x3, #0
+    beq         .packed_packed_loop_1_last
+
+    cmp         x3, #4
+    blt        .packed_packed_loop_1
+
+{% capture packed_packed_loop1 %}
+    {% include "arm64fp16_mmm_f16_128x1/loop1/naive.tmpli" %}
+{% endcapture %}
+
+{% capture packed_packed_loop2 %}
+    {% include "arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli" %}
+{% endcapture %}
+
+.p2align 4
+.packed_packed_loop_4:
+    {{ packed_packed_loop2 }}
+    {{ packed_packed_loop2 }}
+
+    sub         x3, x3, #4
+    cmp         x3, #4
+    bge         .packed_packed_loop_4
+
+    cmp         x3, #0
+    beq         .packed_packed_loop_1_last
+
+.p2align 4
+.packed_packed_loop_1:
+    {{ packed_packed_loop1 }}
+
+    subs        x3, x3, #1
+    bne         .packed_packed_loop_1
+
+// last loop can't read beyond actual input as it's likely not packed and padded
+.packed_packed_loop_1_last:
+    ld1         { v9.8h, v10.8h, v11.8h, v12.8h }, [x1], #64
+    ld1         { v13.8h, v14.8h, v15.8h }, [x1], #48
+
+    fmla        v16.8h, v0.8h, v8.h[0]
+    fmla        v17.8h, v1.8h, v8.h[0]
+    ld1         { v0.8h }, [ x1 ]
+    fmla        v18.8h, v2.8h, v8.h[0]
+    fmla        v19.8h, v3.8h, v8.h[0]
+    fmla        v20.8h, v4.8h, v8.h[0]
+    fmla        v21.8h, v5.8h, v8.h[0]
+    fmla        v22.8h, v6.8h, v8.h[0]
+    fmla        v23.8h, v7.8h, v8.h[0]
+
+    fmla        v24.8h, v9.8h, v8.h[0]
+    fmla        v25.8h, v10.8h, v8.h[0]
+    fmla        v26.8h, v11.8h, v8.h[0]
+    fmla        v27.8h, v12.8h, v8.h[0]
+    fmla        v28.8h, v13.8h, v8.h[0]
+    fmla        v29.8h, v14.8h, v8.h[0]
+    fmla        v30.8h, v15.8h, v8.h[0]
+    fmla        v31.8h, v0.8h, v8.h[0]
+
+    b           .non_linear_loop
+
+{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%}
+{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:128, from:16, to:31%}
+{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:128, from:16, to:31%}
+{% include "arm64fp16_mmm_load_tile.tmpliq" from:16, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]           // c base ptr, rsc
+    cmp         x6, #2
+    beq         .do_per_row_add
+
+    {% for reg in (16..31) %}
+        {% for lane in (0..7) %}
+            ld1 {v0.h}[{{lane}}], [ x5 ], x6
+        {% endfor %}
+        fadd v{{reg}}.8h, v{{reg}}.8h, v0.8h
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.do_per_row_add:
+    ld1     {v0.8h-v3.8h}, [x5], #64
+    ld1     {v4.8h-v7.8h}, [x5], #64
+    ld1     {v8.8h-v11.8h}, [x5], #64
+    ld1     {v12.8h-v15.8h}, [x5], #64
+
+    {% for r in (0..15) %}
+        fadd v{{r| plus: 16}}.8h, v{{r | plus: 16}}.8h, v{{r}}.8h
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x3, [x0, #16]
+    ldr     x2, [x0, #8]
+
+    ld1         {v8.h}[0], [ x3 ]
+
+    {% for r in (0..7) %}
+        ldr     q{{r}}, [x2], #16
+    {% endfor %}
+
+    fmla        v16.8h, v0.8h, v8.h[0]
+    ldr         q0, [x2], #16
+    fmla        v17.8h, v1.8h, v8.h[0] 
+    ldr         q1, [x2], #16
+    fmla        v18.8h, v2.8h, v8.h[0] 
+    ldr         q2, [x2], #16
+    fmla        v19.8h, v3.8h, v8.h[0] 
+    ldr         q3, [x2], #16
+    fmla        v20.8h, v4.8h, v8.h[0] 
+    ldr         q4, [x2], #16
+    fmla        v21.8h, v5.8h, v8.h[0] 
+    ldr         q5, [x2], #16
+    fmla        v22.8h, v6.8h, v8.h[0] 
+    ldr         q6, [x2], #16
+    fmla        v23.8h, v7.8h, v8.h[0] 
+    ldr         q7, [x2], #16
+
+    fmla        v24.8h, v0.8h, v8.h[0]
+    fmla        v25.8h, v1.8h, v8.h[0] 
+    fmla        v26.8h, v2.8h, v8.h[0] 
+    fmla        v27.8h, v3.8h, v8.h[0] 
+    fmla        v28.8h, v4.8h, v8.h[0] 
+    fmla        v29.8h, v5.8h, v8.h[0] 
+    fmla        v30.8h, v6.8h, v8.h[0] 
+    fmla        v31.8h, v7.8h, v8.h[0] 
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]                // c base ptr, rsc$
+
+    cmp         x6, #2
+    beq         .store_strides_contig
+
+    {% for reg in (16..31) %}
+        {% for lane in (0..7) %}
+            st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6
+        {% endfor %}
+    {% endfor %}
+    b           .non_linear_loop
+
+.store_strides_contig:
+
+    {% for reg in (16..31) %}
+        st1 { v{{reg}}.8h }, [ x5 ], #16
+    {% endfor %}
+    b           .non_linear_loop
+
+.return:
+
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop1/naive.tmpli
new file mode 100644
index 000000000..a55fe12e8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop1/naive.tmpli
@@ -0,0 +1,21 @@
+
+fmla        v16.8h, v0.8h, v4.h[0]
+fmla        v17.8h, v1.8h, v4.h[0]
+fmla        v18.8h, v0.8h, v4.h[1]
+fmla        v19.8h, v1.8h, v4.h[1]
+fmla        v20.8h, v0.8h, v4.h[2]
+fmla        v21.8h, v1.8h, v4.h[2]
+fmla        v22.8h, v0.8h, v4.h[3]
+fmla        v23.8h, v1.8h, v4.h[3]
+
+fmla        v24.8h, v0.8h, v4.h[4]
+fmla        v25.8h, v1.8h, v4.h[4]
+fmla        v26.8h, v0.8h, v4.h[5]
+fmla        v27.8h, v1.8h, v4.h[5]
+fmla        v28.8h, v0.8h, v4.h[6]
+fmla        v29.8h, v1.8h, v4.h[6]
+fmla        v30.8h, v0.8h, v4.h[7]
+fmla        v31.8h, v1.8h, v4.h[7]
+
+ld1         {{ v0.8h, v1.8h }}, [x1], #32
+ld1         {{ v4.8h }}, [x2], #16
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli
new file mode 100644
index 000000000..3fef68ae7
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli
@@ -0,0 +1,54 @@
+fmla        v16.8h, v0.8h, v4.h[0]
+ldr         d2, [x1], #8
+fmla        v17.8h, v1.8h, v4.h[0]
+ldr         d6, [x2], #8
+fmla        v18.8h, v0.8h, v4.h[1]
+ldr         x5, [x1], #8
+fmla        v19.8h, v1.8h, v4.h[1]
+ldr         x7, [x2], #8
+fmla        v20.8h, v0.8h, v4.h[2]
+ldr         d3, [x1], #8
+fmla        v21.8h, v1.8h, v4.h[2]
+fmla        v22.8h, v0.8h, v4.h[3]
+ldr         x6, [x1], #8
+fmla        v23.8h, v1.8h, v4.h[3]
+
+fmla        v24.8h, v0.8h, v4.h[4]
+fmla        v25.8h, v1.8h, v4.h[4]
+fmla        v26.8h, v0.8h, v4.h[5]
+fmla        v27.8h, v1.8h, v4.h[5]
+fmla        v28.8h, v0.8h, v4.h[6]
+ins         v2.d[1], x5
+fmla        v29.8h, v1.8h, v4.h[6]
+ins         v6.d[1], x7
+fmla        v30.8h, v0.8h, v4.h[7]
+ins         v3.d[1], x6
+fmla        v31.8h, v1.8h, v4.h[7]
+
+fmla        v16.8h, v2.8h, v6.h[0]
+ldr         d0, [x1], #8
+fmla        v17.8h, v3.8h, v6.h[0]
+ldr         d4, [x2], #8
+fmla        v18.8h, v2.8h, v6.h[1]
+ldr         x5, [x1], #8
+fmla        v19.8h, v3.8h, v6.h[1]
+ldr         x7, [x2], #8
+fmla        v20.8h, v2.8h, v6.h[2]
+ldr         d1, [x1], #8
+fmla        v21.8h, v3.8h, v6.h[2]
+fmla        v22.8h, v2.8h, v6.h[3]
+ldr         x6, [x1], #8
+fmla        v23.8h, v3.8h, v6.h[3]
+
+fmla        v24.8h, v2.8h, v6.h[4]
+fmla        v25.8h, v3.8h, v6.h[4]
+fmla        v26.8h, v2.8h, v6.h[5]
+fmla        v27.8h, v3.8h, v6.h[5]
+fmla        v28.8h, v2.8h, v6.h[6]
+ins         v0.d[1], x5
+fmla        v29.8h, v3.8h, v6.h[6]
+ins         v4.d[1], x7
+fmla        v30.8h, v2.8h, v6.h[7]
+ins         v1.d[1], x6
+fmla        v31.8h, v3.8h, v6.h[7]
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8_core.tmpl
new file mode 100644
index 000000000..a523751dc
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_16x8_core.tmpl
@@ -0,0 +1,174 @@
+// vim: ft=arm
+
+// x20..x27 are used, callee-preserved
+
+// C tile regs: v16 to v31, (scratch)
+// 
+//      v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0]
+//      v16[1] v18[1] 
+//      v16[2] v18[2] 
+//      v16[3] v18[3]
+//                     
+//      v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0]
+//      v17[1] v19[1] 
+//      v17[2] v19[2] 
+//      v17[3] v19[3] 
+
+// v8 is used, d8 (lower half) must preserved
+// v0-v7 (scratch registers)
+//  packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+//  packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_mmm_f16_16x8_{{core}}_{{suffix}}
+{{G}}arm64fp16_mmm_f16_16x8_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    str         q8, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+.packed_packed:
+    ld1         { v0.4s, v1.4s }, [ x1 ], #32
+    ld1         { v4.4s }, [ x2 ], #16
+
+{% capture packed_packed_loop1 %}
+    {% include "arm64fp16_mmm_f16_16x8/loop1/naive.tmpli" %}
+{% endcapture %}
+
+{% capture packed_packed_loop2 %}
+    {% if core == "a55" %}
+        {% include "arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli" %}
+    {% else %}
+        {{ packed_packed_loop1 }}
+        {{ packed_packed_loop1 }}
+    {% endif %}
+{% endcapture %}
+
+    cmp         x3, #4
+    blt         .packed_packed_loop_1
+
+.p2align 4
+.packed_packed_loop_4:
+    {{ packed_packed_loop2 }}
+    {{ packed_packed_loop2 }}
+
+    sub x3, x3, #4
+    cmp x3, #4
+    bge .packed_packed_loop_4
+
+
+    cmp x3, #0
+    beq .non_linear_loop
+
+.p2align 4
+.packed_packed_loop_1:
+    {{ packed_packed_loop1 }}
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%}
+{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:16, from:16, to:31 %}
+{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:16, from:16, to:31 %}
+{% include "arm64fp16_mmm_load_tile.tmpliq" from:16, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    {% for col in (8..15) %}
+        mov x4, x5
+        {% for reg in (0..1) %}
+            {% for lane in (0..7) %}
+                ld1 {v0.h}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:2 | plus: reg}}.8h, v{{col | times:2 | plus: reg}}.8h, v0.8h
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x2, [x0, #8]
+    ldr     x3, [x0, #16]
+
+    ld1         { v0.4s, v1.4s }, [ x2 ], #32
+    ld1         { v4.4s }, [ x3 ], #16
+
+    fmla        v16.8h, v0.8h, v4.h[0]
+    fmla        v17.8h, v1.8h, v4.h[0]
+    fmla        v18.8h, v0.8h, v4.h[1]
+    fmla        v19.8h, v1.8h, v4.h[1]
+    fmla        v20.8h, v0.8h, v4.h[2]
+    fmla        v21.8h, v1.8h, v4.h[2]
+    fmla        v22.8h, v0.8h, v4.h[3]
+    fmla        v23.8h, v1.8h, v4.h[3]
+
+    fmla        v24.8h, v0.8h, v4.h[4]
+    fmla        v25.8h, v1.8h, v4.h[4]
+    fmla        v26.8h, v0.8h, v4.h[5]
+    fmla        v27.8h, v1.8h, v4.h[5]
+    fmla        v28.8h, v0.8h, v4.h[6]
+    fmla        v29.8h, v1.8h, v4.h[6]
+    fmla        v30.8h, v0.8h, v4.h[7]
+    fmla        v31.8h, v1.8h, v4.h[7]
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x6, #2
+    bne         .store_strides_generic
+
+    {% for col in (8..15) %}
+        str q{{col | times:2 }}, [ x5 ]
+        str q{{col | times:2 | plus: 1}}, [ x5, #16 ]
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_strides_generic:
+
+    {% for col in (8..15) %}
+        mov x4, x5
+        {% for reg in (0..1) %}
+            {% for lane in (0..7) %}
+                st1 { v{{col | times:2 | plus: reg}}.h }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+    ldr         q8, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop1/naive.tmpli
new file mode 100644
index 000000000..fa0b84887
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop1/naive.tmpli
@@ -0,0 +1,21 @@
+
+fmla        v16.8h, v0.8h, v4.h[0]
+fmla        v17.8h, v1.8h, v4.h[0]
+fmla        v18.8h, v2.8h, v4.h[0]
+fmla        v19.8h, v3.8h, v4.h[0]
+fmla        v20.8h, v0.8h, v4.h[1]
+fmla        v21.8h, v1.8h, v4.h[1]
+fmla        v22.8h, v2.8h, v4.h[1]
+fmla        v23.8h, v3.8h, v4.h[1]
+
+fmla        v24.8h, v0.8h, v4.h[2]
+fmla        v25.8h, v1.8h, v4.h[2]
+fmla        v26.8h, v2.8h, v4.h[2]
+fmla        v27.8h, v3.8h, v4.h[2]
+fmla        v28.8h, v0.8h, v4.h[3]
+fmla        v29.8h, v1.8h, v4.h[3]
+fmla        v30.8h, v2.8h, v4.h[3]
+fmla        v31.8h, v3.8h, v4.h[3]
+
+ld1         {{ v0.8h, v1.8h, v2.8h, v3.8h }}, [ x1 ], #64
+ldr         d4, [x2], #8
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli
new file mode 100644
index 000000000..2e64319d0
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli
@@ -0,0 +1,71 @@
+// mul a: v0, v1, v2, v3 b: v4
+// load a: v5(d5/x5), v6(d6,x6), v7(d7,x7), v8(d8, x8)
+// load b: v9 as d9
+
+fmla        v16.8h, v0.8h, v4.h[0]
+ldr         d5, [x1], #8
+fmla        v17.8h, v1.8h, v4.h[0]
+ldr         d9, [x2], #8
+fmla        v18.8h, v2.8h, v4.h[0]
+ldr         x5, [x1], #8
+fmla        v19.8h, v3.8h, v4.h[0]
+fmla        v20.8h, v0.8h, v4.h[1]
+ldr         d6, [x1], #8
+fmla        v21.8h, v1.8h, v4.h[1]
+ldr         x6, [x1], #8
+fmla        v22.8h, v2.8h, v4.h[1]
+ldr         d7, [x1], #8
+fmla        v23.8h, v3.8h, v4.h[1]
+ldr         x7, [x1], #8
+
+fmla        v24.8h, v0.8h, v4.h[2]
+ldr         d8, [x1], #8
+fmla        v25.8h, v1.8h, v4.h[2]
+ldr         x8, [x1], #8
+fmla        v26.8h, v2.8h, v4.h[2]
+ins         v5.d[1], x5
+fmla        v27.8h, v3.8h, v4.h[2]
+ins         v6.d[1], x6
+fmla        v28.8h, v0.8h, v4.h[3]
+ins         v7.d[1], x7
+fmla        v29.8h, v1.8h, v4.h[3]
+ins         v8.d[1], x8
+fmla        v30.8h, v2.8h, v4.h[3]
+ins         v9.d[1], x9
+fmla        v31.8h, v3.8h, v4.h[3]
+
+// mul a: v5, v6, v7, v8 b: v9
+// load a: v0(d0/x5), v1(d1,x6), v2(d2,x7), v3(d3, x8)
+// load b: v4 as d4
+
+fmla        v16.8h, v5.8h, v9.h[0]
+ldr         d0, [x1], #8
+fmla        v17.8h, v6.8h, v9.h[0]
+ldr         d4, [x2], #8
+fmla        v18.8h, v7.8h, v9.h[0]
+ldr         x5, [x1], #8
+fmla        v19.8h, v8.8h, v9.h[0]
+fmla        v20.8h, v5.8h, v9.h[1]
+ldr         d1, [x1], #8
+fmla        v21.8h, v6.8h, v9.h[1]
+ldr         x6, [x1], #8
+fmla        v22.8h, v7.8h, v9.h[1]
+ldr         d2, [x1], #8
+fmla        v23.8h, v8.8h, v9.h[1]
+ldr         x7, [x1], #8
+
+fmla        v24.8h, v5.8h, v9.h[2]
+ldr         d3, [x1], #8
+fmla        v25.8h, v6.8h, v9.h[2]
+ldr         x8, [x1], #8
+fmla        v26.8h, v7.8h, v9.h[2]
+ins         v0.d[1], x5
+fmla        v27.8h, v8.8h, v9.h[2]
+ins         v1.d[1], x6
+fmla        v28.8h, v5.8h, v9.h[3]
+ins         v2.d[1], x7
+fmla        v29.8h, v6.8h, v9.h[3]
+ins         v3.d[1], x8
+fmla        v30.8h, v7.8h, v9.h[3]
+ins         v4.d[1], x9
+fmla        v31.8h, v8.8h, v9.h[3]
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4_core.tmpl
new file mode 100644
index 000000000..11cb30b87
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x4_core.tmpl
@@ -0,0 +1,165 @@
+// vim: ft=arm
+
+// x20..x27 are used, callee-preserved
+
+// C tile regs: v16 to v31, (scratch)
+
+// v8 is used, d8 (lower half) must preserved
+// v0-v7 (scratch registers)
+//  packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+//  packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_mmm_f16_32x4_{{core}}_{{suffix}}
+{{G}}arm64fp16_mmm_f16_32x4_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ldr         d4, [x2], #8
+
+{% capture packed_packed_loop1 %}
+    {% include "arm64fp16_mmm_f16_32x4/loop1/naive.tmpli" %}
+{% endcapture %}
+
+{% capture packed_packed_loop2 %}
+    {% if core == "a55" %}
+        {% include "arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli" %}
+    {% else %}
+        {{ packed_packed_loop1 }}
+        {{ packed_packed_loop1 }}
+    {% endif %}
+{% endcapture %}
+
+    cmp         x3, #4
+    blt         .packed_packed_loop_1
+
+.p2align 4
+.packed_packed_loop_4:
+    {{ packed_packed_loop2 }}
+    {{ packed_packed_loop2 }}
+
+    sub x3, x3, #4
+    cmp x3, #4
+    bge .packed_packed_loop_4
+
+    cmp x3, #0
+    beq .non_linear_loop
+
+.p2align 4
+.packed_packed_loop_1:
+    {{ packed_packed_loop1 }}
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b   .non_linear_loop
+
+{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%}
+{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:32, from:16, to:31 %}
+{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:32, from:16, to:31 %}
+{% include "arm64fp16_mmm_load_tile.tmpliq" from:16, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    {% for col in (0..3) %}
+        mov x4, x5
+        {% for reg in (0..3) %}
+            {% for lane in (0..7) %}
+                ld1 {v0.h}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:4 | plus: 16| plus: reg}}.8h, v{{col | times:4 | plus: 16 | plus: reg}}.8h, v0.8h
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x2, [x0, #8]
+    ldr     x3, [x0, #16]
+
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x2 ]
+    ldr         d4, [x3]
+
+    fmla        v16.8h, v0.8h, v4.h[0]
+    fmla        v17.8h, v1.8h, v4.h[0]
+    fmla        v18.8h, v2.8h, v4.h[0]
+    fmla        v19.8h, v3.8h, v4.h[0]
+    fmla        v20.8h, v0.8h, v4.h[1]
+    fmla        v21.8h, v1.8h, v4.h[1]
+    fmla        v22.8h, v2.8h, v4.h[1]
+    fmla        v23.8h, v3.8h, v4.h[1]
+
+    fmla        v24.8h, v0.8h, v4.h[2]
+    fmla        v25.8h, v1.8h, v4.h[2]
+    fmla        v26.8h, v2.8h, v4.h[2]
+    fmla        v27.8h, v3.8h, v4.h[2]
+    fmla        v28.8h, v0.8h, v4.h[3]
+    fmla        v29.8h, v1.8h, v4.h[3]
+    fmla        v30.8h, v2.8h, v4.h[3]
+    fmla        v31.8h, v3.8h, v4.h[3]
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x6, #2
+    bne           .store_strides_generic
+
+    {% for col in (0..3) %}
+        str q{{col | times:4 | plus:16 | plus: 0}}, [ x5 ]
+        str q{{col | times:4 | plus:16 | plus: 1}}, [ x5, #16 ]
+        str q{{col | times:4 | plus:16 | plus: 2}}, [ x5, #32 ]
+        str q{{col | times:4 | plus:16 | plus: 3}}, [ x5, #48 ]
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_strides_generic:
+
+    {% for col in (0..3) %}
+        mov x4, x5
+        {% for reg in (0..3) %}
+            {% for lane in (0..7) %}
+                st1 { v{{col | times:4 | plus: 16 | plus: reg}}.h }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+    ldp         d8, d9, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x6.core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x6.core.tmpl
new file mode 100644
index 000000000..b66d1d399
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_32x6.core.tmpl
@@ -0,0 +1,148 @@
+// vim: ft=arm
+
+// C tile regs: v16 to v31, no need to preserve
+
+// no preservation either for v0-v7...
+// v8..v15 are callee-preserved
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_mmm_f16_32x6_{{core}}_{{suffix}}
+{{G}}arm64fp16_mmm_f16_32x6_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldp         x2, x4, [x0, #24]   // b, packing
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+.p2align 4
+.packed_packed_loop_1:
+    ld1         { v7.8h }, [ x2 ]
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64
+    add         x2, x2, 12
+
+{% for row in (0..3) %}
+    {% for col in (0..5) %}
+        fmla        v{{ col|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}]
+    {% endfor %}
+    /*
+    {% for col in (0..1) %}
+        fmla        v{{ col|plus:4|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v6.h[{{col}}]
+    {% endfor %}
+    */
+{% endfor %}
+
+    subs        x3, x3, #1
+    bne         .packed_packed_loop_1
+
+    b           .non_linear_loop
+
+{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:8, to:31%}
+{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:32, from:8, to:31%}
+{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:32, from:8, to:31%}
+{% include "arm64fp16_mmm_load_tile.tmpliq" from:8, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    {% for col in (0..5) %}
+        mov x4, x5
+        {% for reg in (0..3) %}
+            {% for lane in (0..7) %}
+                ld1 {v0.h}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:4 | plus: 8| plus: reg}}.8h, v{{col | times:4 | plus: 8 | plus: reg}}.8h, v0.8h
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.do_per_row_add:
+    ld1     {v0.8h-v3.8h}, [x5], #64
+    ld1     {v4.8h-v7.8h}, [x5], #64
+
+    {% for r in (0..7) %}
+        fadd v{{r| plus: 24}}.8h, v{{r | plus: 24}}.8h, v{{r}}.8h
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldp         x2, x3, [x0, #8]
+
+    ld1         { v7.d }[0], [ x3 ], #8
+    ld1         { v7.s }[2], [ x3 ], #4
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x2 ], #64
+
+{% for row in (0..3) %}
+    {% for col in (0..5) %}
+        fmla        v{{ col|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}]
+    {% endfor %}
+{% endfor %}
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]                // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]               // csc, item_size
+
+    cmp         x6, #2
+    beq         .store_strides_contig
+
+    {% for col in (0..5) %}
+        mov x4, x5
+        {% for reg in (0..3) %}
+            {% for lane in (0..7) %}
+                st1 { v{{col | times:4 | plus: 8 | plus: reg}}.h }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+    b           .non_linear_loop
+
+.store_strides_contig:
+
+    {% for col in (0..5) %}
+        mov x4, x5
+        {% for r in (0..3) %}
+            st1 { v{{col | times:4 | plus: 8 | plus: r}}.8h }, [ x4 ], 16
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x1.core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x1.core.tmpl
new file mode 100644
index 000000000..b12e9237d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x1.core.tmpl
@@ -0,0 +1,264 @@
+// vim: ft=arm
+
+// C tile regs: v16 to v31, no need to preserve
+
+// no preservation either for v0-v7...
+// v8..v15 are callee-preserved
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_mmm_f16_64x1_{{core}}_{{suffix}}
+{{G}}arm64fp16_mmm_f16_64x1_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldp         x2, x4, [x0, #24]   // b, packing
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    cmp         x4, #1
+    beq         .q4f16se
+    
+    cmp         x4, #2
+    beq         .q4f16
+
+    
+
+.p2align 4
+.packed_packed_loop_1:
+    ld1         { v8.h }[0], [ x2 ], #2
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64
+    ld1         { v4.8h, v5.8h, v6.8h, v7.8h }, [ x1 ], #64
+
+    fmla        v24.8h, v0.8h, v8.h[0]
+    fmla        v25.8h, v1.8h, v8.h[0]
+    fmla        v26.8h, v2.8h, v8.h[0]
+    fmla        v27.8h, v3.8h, v8.h[0]
+    fmla        v28.8h, v4.8h, v8.h[0]
+    fmla        v29.8h, v5.8h, v8.h[0]
+    fmla        v30.8h, v6.8h, v8.h[0]
+    fmla        v31.8h, v7.8h, v8.h[0]
+    subs        x3, x3, #1
+    bne         .packed_packed_loop_1
+
+    b           .non_linear_loop
+
+.p2align 8
+.q40f16_const:
+    .byte 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc
+    .byte 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45, 0x46, 0x47
+
+.q4f16se:
+    adr      x4, .q40f16_const
+    movi     v15.16b, 15
+    ld1      {v13.16b}, [ x4 ]
+    eor      v12.16b, v12.16b, v12.16b
+
+.q4f16se_outerloop:
+{% for i in (0..7) %}
+    eor      v{{i|plus:16}}.16b, v{{i|plus:16}}.16b, v{{i|plus:16}}.16b
+{% endfor %}
+    mov         x4, #32
+
+.p2align 4
+.q4f16se_innerloop:
+        ld1      { v9.16b-v10.16b }, [x1], #32
+        ld1      { v8.h }[0], [ x2 ], #2
+
+        and      v0.16b, v9.16b, v15.16b
+        ushr     v2.16b, v9.16b, 4
+
+        and      v4.16b, v10.16b, v15.16b
+        ushr     v6.16b, v10.16b, 4
+
+        tbl      v0.16b, { v13.16b }, v0.16b
+        tbl      v2.16b, { v13.16b }, v2.16b
+        tbl      v4.16b, { v13.16b }, v4.16b
+        tbl      v6.16b, { v13.16b }, v6.16b
+
+        zip2     v1.16b, v12.16b, v0.16b
+        zip2     v3.16b, v12.16b, v2.16b
+        zip2     v5.16b, v12.16b, v4.16b
+        zip2     v7.16b, v12.16b, v6.16b
+
+        zip1     v0.16b, v12.16b, v0.16b
+        zip1     v2.16b, v12.16b, v2.16b
+        zip1     v4.16b, v12.16b, v4.16b
+        zip1     v6.16b, v12.16b, v6.16b
+
+{% for i in (0..7) %}
+        fmla        v{{ i|plus: 16 }}.8h, v{{i}}.8h, v8.h[0]
+{% endfor %}
+
+    subs        x4, x4, #1
+    bne         .q4f16se_innerloop
+
+    // scales
+    ld1         { v0.8h-v3.8h }, [ x1 ], #64
+    ld1         { v4.8h-v7.8h }, [ x1 ], #64
+
+{% for i in (0..7) %}
+       fmla     v{{i|plus:24}}.8h, v{{i}}.8h, v{{i|plus:16}}.8h
+{% endfor %}
+
+    subs        x3, x3, #32
+    bne         .q4f16se_outerloop
+
+    b           .non_linear_loop
+    
+.q4f16:
+    adr      x4, .q40f16_const
+    movi     v15.16b, 15
+    ld1      {v13.16b}, [ x4 ]
+    eor      v12.16b, v12.16b, v12.16b
+
+.q4f16_outerloop:
+    // scales
+    ld1         { v16.8h-v19.8h }, [ x1 ], #64
+    ld1         { v20.8h-v23.8h }, [ x1 ], #64
+    mov         x4, #32
+
+.p2align 4
+.q4f16_innerloop:
+        ld1      { v9.16b-v10.16b }, [x1], #32
+        ld1      { v8.h }[0], [ x2 ], #2
+
+        and      v0.16b, v9.16b, v15.16b
+        ushr     v2.16b, v9.16b, 4
+
+        and      v4.16b, v10.16b, v15.16b
+        ushr     v6.16b, v10.16b, 4
+
+        tbl      v0.16b, { v13.16b }, v0.16b
+        tbl      v2.16b, { v13.16b }, v2.16b
+        tbl      v4.16b, { v13.16b }, v4.16b
+        tbl      v6.16b, { v13.16b }, v6.16b
+
+        zip2     v1.16b, v12.16b, v0.16b
+        zip2     v3.16b, v12.16b, v2.16b
+        zip2     v5.16b, v12.16b, v4.16b
+        zip2     v7.16b, v12.16b, v6.16b
+
+        zip1     v0.16b, v12.16b, v0.16b
+        zip1     v2.16b, v12.16b, v2.16b
+        zip1     v4.16b, v12.16b, v4.16b
+        zip1     v6.16b, v12.16b, v6.16b
+
+{% for i in (0..7) %}
+       fmul     v{{i}}.8h, v{{i}}.8h, v{{i|plus:16}}.8h
+{% endfor %}
+
+{% for i in (0..7) %}
+        fmla        v{{ i|plus: 24 }}.8h, v{{i}}.8h, v8.h[0]
+{% endfor %}
+
+    subs        x4, x4, #1
+    bne         .q4f16_innerloop
+
+    subs        x3, x3, #32
+    bne         .q4f16_outerloop
+
+    b           .non_linear_loop
+
+{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:24, to:31%}
+{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:64, from:24, to:31%}
+{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:64, from:24, to:31%}
+{% include "arm64fp16_mmm_load_tile.tmpliq" from:24, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]           // c base ptr, rsc
+    cmp         x6, #2
+    beq         .do_per_row_add
+
+    {% for reg in (24..31) %}
+        {% for lane in (0..7) %}
+            ld1 {v0.h}[{{lane}}], [ x5 ], x6
+        {% endfor %}
+        fadd v{{reg}}.8h, v{{reg}}.8h, v0.8h
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.do_per_row_add:
+    ld1     {v0.8h-v3.8h}, [x5], #64
+    ld1     {v4.8h-v7.8h}, [x5], #64
+
+    {% for r in (0..7) %}
+        fadd v{{r| plus: 24}}.8h, v{{r | plus: 24}}.8h, v{{r}}.8h
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x3, [x0, #16]
+    ldr     x2, [x0, #8]
+
+    ld1         {v8.h}[0], [ x3 ]
+
+    {% for r in (0..7) %}
+        ldr     q{{r}}, [x2], #16
+    {% endfor %}
+
+    fmla        v24.8h, v0.8h, v8.h[0]
+    fmla        v25.8h, v1.8h, v8.h[0] 
+    fmla        v26.8h, v2.8h, v8.h[0] 
+    fmla        v27.8h, v3.8h, v8.h[0] 
+    fmla        v28.8h, v4.8h, v8.h[0] 
+    fmla        v29.8h, v5.8h, v8.h[0] 
+    fmla        v30.8h, v6.8h, v8.h[0] 
+    fmla        v31.8h, v7.8h, v8.h[0] 
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]                // c base ptr, rsc$
+
+    cmp         x6, #2
+    beq         .store_strides_contig
+
+    {% for reg in (24..31) %}
+        {% for lane in (0..7) %}
+            st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6
+        {% endfor %}
+    {% endfor %}
+    b           .non_linear_loop
+
+.store_strides_contig:
+
+    {% for reg in (24..31) %}
+        st1 { v{{reg}}.8h }, [ x5 ], #16
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x3.core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x3.core.tmpl
new file mode 100644
index 000000000..9e949531a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_64x3.core.tmpl
@@ -0,0 +1,165 @@
+// vim: ft=arm
+
+// C tile regs: v16 to v31, no need to preserve
+
+// no preservation either for v0-v7...
+// v8..v15 are callee-preserved
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_mmm_f16_64x3_{{core}}_{{suffix}}
+{{G}}arm64fp16_mmm_f16_64x3_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldp         x2, x4, [x0, #24]   // b, packing
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+.p2align 4
+.packed_packed_loop_1:
+    ld1         { v7.4s }, [ x2 ]
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64
+    ld1         { v4.8h, v5.8h, v6.8h }, [ x1 ], #48
+    add         x2, x2, #6
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:8}}.8h, v0.8h, v7.h[{{ col }}]
+{% endfor %}
+
+    ld1         { v0.8h }, [ x1 ], #16
+
+{% for row in (1..6) %}
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}]
+    {% endfor %}
+{% endfor %}
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:15}}.8h, v0.8h, v7.h[{{ col }}]
+{% endfor %}
+
+    subs        x3, x3, #1
+    bne         .packed_packed_loop_1
+
+    b           .non_linear_loop
+
+{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:8, to:31%}
+{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:64, from:8, to:31%}
+{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:64, from:8, to:31%}
+{% include "arm64fp16_mmm_load_tile.tmpliq" from:8, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    {% for col in (0..2) %}
+        mov x4, x5
+        {% for reg in (0..7) %}
+            {% for lane in (0..7) %}
+                ld1 {v0.h}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:8 | plus: 8| plus: reg}}.8h, v{{col | times:8 | plus: 8 | plus: reg}}.8h, v0.8h
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.do_per_row_add:
+    ld1     {v0.8h-v3.8h}, [x5], #64
+    ld1     {v4.8h-v7.8h}, [x5], #64
+
+    {% for r in (0..7) %}
+        fadd v{{r| plus: 24}}.8h, v{{r | plus: 24}}.8h, v{{r}}.8h
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldp         x2, x3, [x0, #8]
+
+    ld1         { v7.s }[0], [ x3 ], #4
+    ld1         { v7.h }[2], [ x3 ], #2
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x2 ], #64
+    ld1         { v4.8h, v5.8h, v6.8h }, [ x2 ], #48
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:8}}.8h, v0.8h, v7.h[{{ col }}]
+{% endfor %}
+
+    ld1         { v0.8h }, [ x2 ], #16
+
+{% for row in (1..6) %}
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}]
+    {% endfor %}
+{% endfor %}
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:15}}.8h, v0.8h, v7.h[{{ col }}]
+{% endfor %}
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]                // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]               // csc, item_size
+
+    cmp         x6, #2
+    beq         .store_strides_contig
+
+    {% for col in (0..2) %}
+        mov x4, x5
+        {% for reg in (0..7) %}
+            {% for lane in (0..7) %}
+                st1 { v{{col | times:8 | plus: 8 | plus: reg}}.h }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+    b           .non_linear_loop
+
+.store_strides_contig:
+
+    {% for col in (0..2) %}
+        mov x4, x5
+        {% for r in (0..7) %}
+            st1 { v{{col | times:8 | plus: 8 | plus: r}}.8h }, [ x4 ], 16
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_cols.tmpliq
new file mode 100644
index 000000000..6e9552bf1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_cols.tmpliq
@@ -0,0 +1,9 @@
+// vim: ft=arm
+
+{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_min", op:"fmin", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_max", op:"fmax", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_mul", op:"fmul", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_add", op:"fadd", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_sub", op:"fsub", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_col.tmpliq" label:"per_col_sub_flipped", op:"fsub", mr:mr, from:from, to:to, flipped: true%}
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_rows.tmpliq
new file mode 100644
index 000000000..477b8db1f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_per_rows.tmpliq
@@ -0,0 +1,9 @@
+// vim: ft=arm
+
+{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_min", op:"fmin", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_max", op:"fmax", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_mul", op:"fmul", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_add", op:"fadd", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_sub", op:"fsub", mr:mr, from:from, to:to %}
+{% include "arm64fp16_mmm_8h_per_row.tmpliq" label:"per_row_sub_flipped", op:"fsub", mr:mr, from:from, to:to, flipped: true%}
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_scalars.tmpliq
new file mode 100644
index 000000000..a448fe387
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_f16_scalars.tmpliq
@@ -0,0 +1,36 @@
+// vim: ft=arm
+
+{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_min", op:"fmin", from:from, to:to %}
+{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_max", op:"fmax", from:from, to:to %}
+{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_mul", op:"fmul", from:from, to:to %}
+{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_add", op:"fadd", from:from, to:to %}
+{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_sub", op:"fsub", from:from, to:to %}
+{% include "arm64fp16_mmm_8h_scalar.tmpliq" label:"scalar_sub_flipped", op:"fsub", from:from, to:to, flipped:true %}
+
+.clear:
+{% for r in (from..to) %}
+    eor         v{{r}}.8b, v{{r}}.8b, v{{r}}.8b
+{% endfor %}
+    b .non_linear_loop
+
+.leaky_relu:
+    add         x2, x0, #8
+    ld1         {v4.s}[0], [ x2 ]
+    dup         v4.8h, v4.h[0]
+
+    // bsl cond/dst, then, else
+    // fcmge dst, src, #0.0
+    {% for r in (from..to) %}
+        fmul  v0.8h, v{{r}}.8h, v4.8h
+        fcmge v1.8h, v{{r}}.8h, #0.0
+        bsl   v1.16b, v{{r}}.16b, v0.16b
+        and   v{{r}}.16b, v1.16b, v1.16b
+    {% endfor %}
+
+    b .non_linear_loop
+
+
+.q_scale:
+.q_shl:
+.q_shr:
+    b .unsupported
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_load_tile.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_load_tile.tmpliq
new file mode 100644
index 000000000..ac920b368
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_mmm_load_tile.tmpliq
@@ -0,0 +1,10 @@
+// vim: ft=arm
+
+.load_tile:
+    ldr         x2, [ x0, #8 ]
+    {% for reg in (from..to) %}
+        ld1         { v{{reg}}.4s }, [ x2 ], #16
+    {% endfor %}
+
+    b           .non_linear_loop
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl
new file mode 100644
index 000000000..7d0e76ef3
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_sigmoid_f16_8n.tmpl
@@ -0,0 +1,131 @@
+// vim: ft=arm
+
+// no preservation either for v0-v7 and v16-v31
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_sigmoid_f16_8n_{{suffix}}
+{{G}}arm64fp16_sigmoid_f16_8n_{{suffix}}:
+
+    cmp         x1, #0
+    beq         .return
+
+    adr         x2, .coeffs_num
+    ld1         { v0.8h }, [x2]
+    dup         v5.8h, v0.h[0]              // v5 <- low, broadcasted
+    dup         v6.8h, v0.h[1]              // v6 <- high, broadcasted
+    dup         v7.8h, v0.h[7]              // v7 <- half, broadcasted
+
+    cmp         x1, #32
+    blt         .loop
+
+.loop4:
+    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x0]
+
+    fmax        v16.8h, v16.8h, v5.8h
+    fmax        v17.8h, v17.8h, v5.8h
+    fmax        v18.8h, v18.8h, v5.8h
+    fmax        v19.8h, v19.8h, v5.8h
+
+    fmin        v16.8h, v16.8h, v6.8h
+    fmin        v17.8h, v17.8h, v6.8h
+    fmin        v18.8h, v18.8h, v6.8h
+    fmin        v19.8h, v19.8h, v6.8h       // v16 <- x
+
+    fmul        v20.8h, v16.8h, v16.8h
+    fmul        v21.8h, v17.8h, v17.8h
+    fmul        v22.8h, v18.8h, v18.8h
+    fmul        v23.8h, v19.8h, v19.8h      // v20 <- x2
+
+    dup         v28.8h, v0.h[3]
+    fmla        v28.8h, v20.8h, v0.h[2]
+    dup         v29.8h, v0.h[3]
+    fmla        v29.8h, v21.8h, v0.h[2]
+    dup         v30.8h, v0.h[3]
+    fmla        v30.8h, v22.8h, v0.h[2]
+    dup         v31.8h, v0.h[3]
+    fmla        v31.8h, v23.8h, v0.h[2]
+
+    dup         v24.8h, v0.h[4]
+    fmla        v24.8h, v20.8h, v28.8h
+    dup         v25.8h, v0.h[4]
+    fmla        v25.8h, v21.8h, v29.8h
+    dup         v26.8h, v0.h[4]
+    fmla        v26.8h, v22.8h, v30.8h
+    dup         v27.8h, v0.h[4]
+    fmla        v27.8h, v23.8h, v31.8h
+
+    fmul        v16.8h, v16.8h, v24.8h
+    fmul        v17.8h, v17.8h, v25.8h
+    fmul        v18.8h, v18.8h, v26.8h
+    fmul        v19.8h, v19.8h, v27.8h      // v16 <- numerator
+
+    dup         v24.8h, v0.h[6]
+    dup         v25.8h, v0.h[6]
+    dup         v26.8h, v0.h[6]
+    dup         v27.8h, v0.h[6]
+    fmla        v24.8h, v20.8h, v0.h[5]
+    fmla        v25.8h, v21.8h, v0.h[5]
+    fmla        v26.8h, v22.8h, v0.h[5]
+    fmla        v27.8h, v23.8h, v0.h[5]      // v24 <- denum
+
+    fdiv        v16.8h, v16.8h, v24.8h
+    fdiv        v17.8h, v17.8h, v25.8h
+    fdiv        v18.8h, v18.8h, v26.8h
+    fdiv        v19.8h, v19.8h, v27.8h
+
+    fadd        v16.8h, v16.8h, v7.8h
+    fadd        v17.8h, v17.8h, v7.8h
+    fadd        v18.8h, v18.8h, v7.8h
+    fadd        v19.8h, v19.8h, v7.8h
+
+    st1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x0], #64
+
+    subs        x1, x1, #32
+    cmp         x1, #32
+    bge         .loop4
+
+    cmp         x1, #0
+    beq         .return
+
+.loop:
+    ld1         { v16.8h }, [x0]
+
+    fmax        v16.8h, v16.8h, v5.8h
+    fmin        v16.8h, v16.8h, v6.8h       // v16 <- x
+    fmul        v20.8h, v16.8h, v16.8h      // v20 <- x2
+
+    dup         v28.8h, v0.h[3]
+    fmla        v28.8h, v20.8h, v0.h[2]
+    dup         v24.8h, v0.h[4]
+    fmla        v24.8h, v20.8h, v28.8h
+    fmul        v16.8h, v16.8h, v24.8h      // v16 <- numerator
+
+    dup         v24.8h, v0.h[6]
+    fmla        v24.8h, v20.8h, v0.h[5]      // v24 <- denum
+
+    fdiv        v16.8h, v16.8h, v24.8h
+    fadd        v16.8h, v16.8h, v7.8h
+    
+    st1         { v16.8h }, [x0], #16
+
+    subs        x1, x1, #8
+    bne         .loop
+
+.return:
+    ret
+
+.coeffs_num:
+    {{ -6.92 | float16 }}
+    {{ 6.92 | float16 }}
+    {{ -0.0000124702 | float16 }}
+    {{ 0.00400222 | float16 }}
+
+    {{ 0.249895 | float16 }}
+    {{ 0.098734 | float16 }}
+    {{ 1.0 | float16 }}
+    {{ 0.5 | float16 }}
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_tanh_f16_8n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_tanh_f16_8n.tmpl
new file mode 100644
index 000000000..cd01f0455
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/arm64fp16_tanh_f16_8n.tmpl
@@ -0,0 +1,124 @@
+// vim: ft=arm
+
+// no preservation either for v0-v7 and v16-v31
+
+.text
+.align 4
+
+{% if needs_pragma == true %}
+.cpu generic+fp+simd+fp16
+{% endif %}
+.global {{G}}arm64fp16_tanh_f16_8n_{{suffix}}
+{{G}}arm64fp16_tanh_f16_8n_{{suffix}}:
+
+    cmp         x1, #0
+    beq         .return
+
+    adr         x2, .coeffs_num
+    ld1         { v0.8h }, [x2]
+    dup         v5.8h, v0.h[0]              // v5 <- low, broadcasted
+    dup         v6.8h, v0.h[1]              // v6 <- high, broadcasted
+
+    cmp         x1, #32
+    blt         .loop
+
+.loop4:
+    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x0]
+
+    fmax        v16.8h, v16.8h, v5.8h
+    fmax        v17.8h, v17.8h, v5.8h
+    fmax        v18.8h, v18.8h, v5.8h
+    fmax        v19.8h, v19.8h, v5.8h
+
+    fmin        v16.8h, v16.8h, v6.8h
+    fmin        v17.8h, v17.8h, v6.8h
+    fmin        v18.8h, v18.8h, v6.8h
+    fmin        v19.8h, v19.8h, v6.8h       // v16 <- x
+
+    fmul        v20.8h, v16.8h, v16.8h
+    fmul        v21.8h, v17.8h, v17.8h
+    fmul        v22.8h, v18.8h, v18.8h
+    fmul        v23.8h, v19.8h, v19.8h      // v20 <- x2
+
+    dup         v24.8h, v0.h[3]
+    fmla        v24.8h, v20.8h, v0.h[2]
+    dup         v25.8h, v0.h[3]
+    fmla        v25.8h, v21.8h, v0.h[2]
+    dup         v26.8h, v0.h[3]
+    fmla        v26.8h, v22.8h, v0.h[2]
+    dup         v27.8h, v0.h[3]
+    fmla        v27.8h, v23.8h, v0.h[2]
+
+    fmul        v16.8h, v16.8h, v24.8h
+    fmul        v17.8h, v17.8h, v25.8h
+    fmul        v18.8h, v18.8h, v26.8h
+    fmul        v19.8h, v19.8h, v27.8h      // v16 <- numerator
+
+    dup         v28.8h, v0.h[5]
+    fmla        v28.8h, v20.8h, v0.h[4]
+    dup         v29.8h, v0.h[5]
+    fmla        v29.8h, v21.8h, v0.h[4]
+    dup         v30.8h, v0.h[5]
+    fmla        v30.8h, v22.8h, v0.h[4]
+    dup         v31.8h, v0.h[5]
+    fmla        v31.8h, v23.8h, v0.h[4]
+
+    dup         v24.8h, v0.h[6]
+    fmla        v24.8h, v20.8h, v28.8h
+    dup         v25.8h, v0.h[6]
+    fmla        v25.8h, v21.8h, v29.8h
+    dup         v26.8h, v0.h[6]
+    fmla        v26.8h, v22.8h, v30.8h
+    dup         v27.8h, v0.h[6]
+    fmla        v27.8h, v23.8h, v31.8h      // v24 <- denum
+
+    fdiv        v16.8h, v16.8h, v24.8h
+    fdiv        v17.8h, v17.8h, v25.8h
+    fdiv        v18.8h, v18.8h, v26.8h
+    fdiv        v19.8h, v19.8h, v27.8h
+
+    st1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x0], #64
+
+    subs        x1, x1, #32
+    cmp         x1, #32
+    bge         .loop4
+
+    cmp         x1, #0
+    beq         .return
+
+.loop:
+    ld1         { v16.8h }, [x0]
+
+    fmax        v16.8h, v16.8h, v5.8h
+    fmin        v16.8h, v16.8h, v6.8h       // v16 <- x
+    fmul        v20.8h, v16.8h, v16.8h      // v20 <- x2
+
+    dup         v24.8h, v0.h[3]
+    fmla        v24.8h, v20.8h, v0.h[2]
+    fmul        v16.8h, v16.8h, v24.8h      // v16 <- numerator
+
+    dup         v28.8h, v0.h[5]
+    fmla        v28.8h, v20.8h, v0.h[4]
+    dup         v24.8h, v0.h[6]
+    fmla        v24.8h, v20.8h, v28.8h      // v24 <- denum
+
+    fdiv        v16.8h, v16.8h, v24.8h
+    
+    st1         { v16.8h }, [x0], #16
+
+    subs        x1, x1, #8
+    bne         .loop
+
+.return:
+    ret
+
+.coeffs_num:
+    {{ -3.84 | float16 }}
+    {{ 3.84 | float16 }}
+    {{ 0.082654955 | float16 }}              // alpha
+    {{ 0.99963124 | float16 }}
+
+    {{ 0.0065383179 | float16 }}             // beta
+    {{ 0.41401828 | float16 }}   
+    {{ 1.0 | float16 }}
+    {{ 0 | float16 }}                        // padding
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dispatcher.tmpliq
new file mode 100644
index 000000000..150db4683
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dispatcher.tmpliq
@@ -0,0 +1,37 @@
+// vim: ft=arm
+
+.non_linear:
+    sub         x0, x0, 40
+
+.non_linear_loop:
+    add         x0, x0, 40
+    ldr         x2, [x0]
+
+    mov         x4, #{{ jump_table | size }}
+
+    cmp         x2, #{{ jump_table | size }}
+    csel        x2, x2, x4, lt
+    cmp         x2, #0
+    csel        x2, x4, x2, lt
+
+    adr         x3, .jmp_table
+    add         x3, x3, x2, LSL#2
+    br          x3
+
+.jmp_table:
+{% for j in jump_table %}
+    b   .{{j}}
+{% endfor %}
+    b   .unsupported
+
+    add x0, x2, #4000
+    b .return
+
+.unsupported:
+    mov         x0, #1
+    b           .return
+
+.done:
+    mov         x0, 0
+    b           .return
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_no_pragma.S b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_no_pragma.S
new file mode 100644
index 000000000..3af092cfc
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_no_pragma.S
@@ -0,0 +1,13 @@
+// vim: ft=arm
+
+// serves as a canary build file to figure out which flag combination will accept half precision fmla 
+
+.text
+.align 4
+
+// .cpu generic+fp+simd+fp16
+.global foo
+foo:
+    fmla        v16.8h, v0.8h, v8.h[0]
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_pragma.S b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_pragma.S
new file mode 100644
index 000000000..6fb61053e
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64fp16/dummy_fmla_pragma.S
@@ -0,0 +1,13 @@
+// vim: ft=arm
+
+// serves as a canary build file to figure out which flag combination will accept half precision fmla 
+
+.text
+.align 4
+
+.cpu generic+fp+simd+fp16
+.global foo
+foo:
+    fmla        v16.8h, v0.8h, v8.h[0]
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_col.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_col.tmpliq
new file mode 100644
index 000000000..a50fa7be9
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_col.tmpliq
@@ -0,0 +1,36 @@
+// vim: ft=arm
+
+.{{label}}:
+    ldr         x2, [x0, #8]
+
+{% capture mr_over_4 %}{{ mr | divided_by: 4}}{%endcapture%}
+{% capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_4}}{%endcapture%}
+
+{% capture loads %}{{cols | divided_by:4}}{% endcapture %}
+
+{%if cols == "1" %}
+        ld1         {v0.s}[0], [ x2 ]
+{% elsif cols == "3" %}
+        ld1         {v0.d}[0], [ x2 ], #8
+        ld1         {v0.s}[2], [ x2 ]
+{% else %}
+    {% for reg in (1..loads) %}
+        ldr         q{{reg |minus:1}}, [ x2 ], #16
+    {% endfor %}
+{% endif %}
+
+// {{mr}} {{cols}}
+
+{% for col in (1..cols) %}
+    dup v3.4s, v{{col| minus: 1|divided_by:4}}.s[{{col| minus: 1|modulo:4}}]
+    {% for row in (1..mr_over_4) %}
+        {% capture acc %}{{ col|minus:1|times:mr_over_4|plus:row|minus:1|plus:from }}{% endcapture %}
+        {% if flipped %}
+            {{op}} v{{acc}}.4s, v{{acc}}.4s, v3.4s
+        {% else %}
+            {{op}} v{{acc}}.4s, v3.4s, v{{acc}}.4s
+        {% endif %}
+    {% endfor %}
+{% endfor %}
+
+b           .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_row.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_row.tmpliq
new file mode 100644
index 000000000..1db62f2b7
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_per_row.tmpliq
@@ -0,0 +1,25 @@
+// vim: ft=arm
+
+.{{label}}:
+    ldr         x2, [x0, #8]
+
+{% capture mr_over_4 %}{{ mr | divided_by: 4}}{%endcapture%}
+{% capture mr_over_4_min_1 %}{{ mr | divided_by: 4 | minus: 1}}{%endcapture%}
+
+{% for reg in (0..mr_over_4_min_1) %}
+    ldr         q{{reg}}, [ x2 ], #16
+{% endfor %}
+
+{% if flipped %}
+    {% for acc in (from..to) %}
+        {% capture other%}{{acc | minus: from | modulo: mr_over_4}}{%endcapture%}
+        {{op}} v{{acc}}.4s, v{{acc}}.4s, v{{other}}.4s
+    {% endfor %}
+{% else %}
+    {% for acc in (from..to) %}
+        {% capture other%}{{acc | minus: from | modulo: mr_over_4}}{%endcapture%}
+        {{op}} v{{acc}}.4s, v{{other}}.4s, v{{acc}}.4s
+    {% endfor %}
+{% endif %}
+
+b           .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_scalar.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_scalar.tmpliq
new file mode 100644
index 000000000..db2e10143
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_4s_scalar.tmpliq
@@ -0,0 +1,18 @@
+// vim: ft=arm
+
+.{{label}}:
+    add         x2, x0, #8
+    ld1         {v0.s}[0], [ x2 ]
+    dup         v0.4s, v0.s[0]
+    {% if flipped %}
+        {% for reg in (from..to) %}
+            {{op}}       v{{reg}}.4s, v{{reg}}.4s, v0.4s
+        {% endfor %}
+    {% else %}
+        {% for reg in (from..to) %}
+            {{op}}       v{{reg}}.4s, v0.4s, v{{reg}}.4s
+        {% endfor %}
+    {% endif %}
+
+    b           .non_linear_loop
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli
new file mode 100644
index 000000000..04deaee6c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli
@@ -0,0 +1,69 @@
+fmla        v8.4s, v0.4s, v4.s[0]
+ldr         w4, [x1], #4
+fmla        v9.4s, v1.4s, v4.s[0]
+ldr         w20, [x2], #4
+fmla        v10.4s, v2.4s, v4.s[0]
+ldr         w5, [x1], #4
+
+fmla        v11.4s, v0.4s, v4.s[1]
+ldr         w21, [x2], #4
+fmla        v12.4s, v1.4s, v4.s[1]
+ldr         w6, [x1], #4
+fmla        v13.4s, v2.4s, v4.s[1]
+ldr         w22, [x2], #4
+
+fmla        v14.4s, v0.4s, v4.s[2]
+ldr         w7, [x1], #4
+fmla        v15.4s, v1.4s, v4.s[2]
+ldr         w23, [x2], #4
+fmla        v16.4s, v2.4s, v4.s[2]
+ldr         w8, [x1], #4
+fmla        v17.4s, v0.4s, v4.s[3]
+ldr         w24, [x2], #4
+fmla        v18.4s, v1.4s, v4.s[3]
+ldr         w9, [x1], #4
+fmla        v19.4s, v2.4s, v4.s[3]
+ldr         w25, [x2], #4
+
+fmla        v20.4s, v0.4s, v5.s[0]
+ldr         w10, [x1], #4
+fmla        v21.4s, v1.4s, v5.s[0]
+ldr         w26, [x2], #4
+fmla        v22.4s, v2.4s, v5.s[0]
+ldr         w11, [x1], #4
+fmla        v23.4s, v0.4s, v5.s[1]
+ldr         w27, [x2], #4
+fmla        v24.4s, v1.4s, v5.s[1]
+ldr         w12, [x1], #4
+fmla        v25.4s, v2.4s, v5.s[1]
+
+fmla        v26.4s, v0.4s, v5.s[2]
+ldr         w13, [x1], #4
+fmla        v27.4s, v1.4s, v5.s[2]
+fmla        v28.4s, v2.4s, v5.s[2]
+ldr         w14, [x1], #4
+fmla        v29.4s, v0.4s, v5.s[3]
+fmla        v30.4s, v1.4s, v5.s[3]
+ldr         w15, [x1], #4
+fmla        v31.4s, v2.4s, v5.s[3]
+
+ins         v0.s[0], w4
+ins         v1.s[0], w8
+ins         v2.s[0], w12
+ins         v4.s[0], w20
+ins         v5.s[0], w24
+ins         v0.s[1], w5
+ins         v1.s[1], w9
+ins         v2.s[1], w13
+ins         v4.s[1], w21
+ins         v5.s[1], w25
+ins         v0.s[2], w6
+ins         v1.s[2], w10
+ins         v2.s[2], w14
+ins         v4.s[2], w22
+ins         v5.s[2], w26
+ins         v0.s[3], w7
+ins         v1.s[3], w11
+ins         v2.s[3], w15
+ins         v4.s[3], w23
+ins         v5.s[3], w27
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli
new file mode 100644
index 000000000..f97e2527c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli
@@ -0,0 +1,82 @@
+fmla        v8.4s, v0.4s, v4.s[0]
+    ldr         w4, [x1]
+fmla        v9.4s, v1.4s, v4.s[0]
+        ldr         w20, [x2], #4
+fmla        v10.4s, v2.4s, v4.s[0]
+    ldr         w5, [x1, #4]
+
+fmla        v11.4s, v0.4s, v4.s[1]
+        ldr         w21, [x2], #4
+fmla        v12.4s, v1.4s, v4.s[1]
+    ldr         w6, [x1, #8]
+fmla        v13.4s, v2.4s, v4.s[1]
+        ldr         w22, [x2], #4
+
+fmla        v14.4s, v0.4s, v4.s[2]
+    ldr         w7, [x1, #12]
+fmla        v15.4s, v1.4s, v4.s[2]
+        ldr         w23, [x2], #4
+fmla        v16.4s, v2.4s, v4.s[2]
+    ldr         w8, [x1, #16]
+fmla        v17.4s, v0.4s, v4.s[3]
+        ldr         w24, [x2], #4
+fmla        v18.4s, v1.4s, v4.s[3]
+    ldr         w9, [x1, #20]
+fmla        v19.4s, v2.4s, v4.s[3]
+        ldr         w25, [x2], #4
+
+fmla        v20.4s, v0.4s, v5.s[0]
+    ldr         w10, [x1, #24]
+fmla        v21.4s, v1.4s, v5.s[0]
+        ldr         w26, [x2], #4
+fmla        v22.4s, v2.4s, v5.s[0]
+    ldr         w11, [x1, #28]
+fmla        v23.4s, v0.4s, v5.s[1]
+        ldr         w27, [x2], #4
+fmla        v24.4s, v1.4s, v5.s[1]
+    ldr         w12, [x1, #32]
+fmla        v25.4s, v2.4s, v5.s[1]
+    ldr         w13, [x1, #36]
+
+fmla        v26.4s, v0.4s, v5.s[2]
+    ldr         w14, [x1, #40]
+fmla        v27.4s, v1.4s, v5.s[2]
+    ldr         w15, [x1, #44]
+fmla        v28.4s, v2.4s, v5.s[2]
+    prfm        pldl1keep, [x1, #512]
+fmla        v29.4s, v0.4s, v5.s[3]
+    add         x1, x1, #48
+fmla        v30.4s, v1.4s, v5.s[3]
+    prfm        pldl1keep, [x2, #384]
+fmla        v31.4s, v2.4s, v5.s[3]
+
+    ins         v0.s[0], w4
+
+    ins         v1.s[0], w8
+    ins         v2.s[0], w12
+
+        ins         v4.s[0], w20
+        ins         v5.s[0], w24
+
+    ins         v0.s[1], w5
+    ins         v1.s[1], w9
+
+    ins         v2.s[1], w13
+        ins         v4.s[1], w21
+
+        ins         v5.s[1], w25
+    ins         v0.s[2], w6
+
+    ins         v1.s[2], w10
+    ins         v2.s[2], w14
+
+        ins         v4.s[2], w22
+        ins         v5.s[2], w26
+
+    ins         v0.s[3], w7
+    ins         v1.s[3], w11
+
+    ins         v2.s[3], w15
+        ins         v4.s[3], w23
+        ins         v5.s[3], w27
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli
new file mode 100644
index 000000000..a380bcccf
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli
@@ -0,0 +1,60 @@
+fmla        v8.4s, v0.4s, v4.s[0]
+    ldr         x4, [x1]
+fmla        v9.4s, v1.4s, v4.s[0]
+        ldr         x20, [x2]
+fmla        v10.4s, v2.4s, v4.s[0]
+    ldr         x5, [x1, #8]
+
+fmla        v11.4s, v0.4s, v4.s[1]
+        ldr         x21, [x2, #8]
+fmla        v12.4s, v1.4s, v4.s[1]
+    ldr         x6, [x1, #16]
+fmla        v13.4s, v2.4s, v4.s[1]
+        ldr         x22, [x2, #16]
+
+fmla        v14.4s, v0.4s, v4.s[2]
+    ldr         x7, [x1, #24]
+fmla        v15.4s, v1.4s, v4.s[2]
+        ldr         x23, [x2, #24]
+fmla        v16.4s, v2.4s, v4.s[2]
+    ldr         x8, [x1, #32]
+fmla        v17.4s, v0.4s, v4.s[3]
+fmla        v18.4s, v1.4s, v4.s[3]
+    ldr         x9, [x1, #40]
+fmla        v19.4s, v2.4s, v4.s[3]
+
+fmla        v20.4s, v0.4s, v5.s[0]
+fmla        v21.4s, v1.4s, v5.s[0]
+fmla        v22.4s, v2.4s, v5.s[0]
+fmla        v23.4s, v0.4s, v5.s[1]
+fmla        v24.4s, v1.4s, v5.s[1]
+fmla        v25.4s, v2.4s, v5.s[1]
+
+fmla        v26.4s, v0.4s, v5.s[2]
+fmla        v27.4s, v1.4s, v5.s[2]
+fmla        v28.4s, v2.4s, v5.s[2]
+    prfm        pldl1keep, [x1, #512]
+fmla        v29.4s, v0.4s, v5.s[3]
+    add         x1, x1, #48
+fmla        v30.4s, v1.4s, v5.s[3]
+    prfm        pldl1keep, [x2, #384]
+fmla        v31.4s, v2.4s, v5.s[3]
+    add         x2, x2, #32
+
+
+    ins         v0.d[0], x4
+    ins         v2.d[0], x8
+
+        ins         v4.d[0], x20
+        ins         v5.d[0], x22
+
+    ins         v0.d[1], x5
+    ins         v2.d[1], x9
+
+        ins         v4.d[1], x21
+    ins         v1.d[0], x6
+
+    ins         v1.d[1], x7
+
+        ins         v5.d[1], x23
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli
new file mode 100644
index 000000000..67be44dd0
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli
@@ -0,0 +1,34 @@
+fmla        v8.4s, v0.4s, v4.s[0]
+fmla        v9.4s, v1.4s, v4.s[0]
+fmla        v10.4s, v2.4s, v4.s[0]
+
+fmla        v11.4s, v0.4s, v4.s[1]
+fmla        v12.4s, v1.4s, v4.s[1]
+fmla        v13.4s, v2.4s, v4.s[1]
+
+fmla        v14.4s, v0.4s, v4.s[2]
+fmla        v15.4s, v1.4s, v4.s[2]
+fmla        v16.4s, v2.4s, v4.s[2]
+
+fmla        v17.4s, v0.4s, v4.s[3]
+fmla        v18.4s, v1.4s, v4.s[3]
+fmla        v19.4s, v2.4s, v4.s[3]
+
+fmla        v20.4s, v0.4s, v5.s[0]
+fmla        v21.4s, v1.4s, v5.s[0]
+fmla        v22.4s, v2.4s, v5.s[0]
+
+fmla        v23.4s, v0.4s, v5.s[1]
+fmla        v24.4s, v1.4s, v5.s[1]
+fmla        v25.4s, v2.4s, v5.s[1]
+
+fmla        v26.4s, v0.4s, v5.s[2]
+fmla        v27.4s, v1.4s, v5.s[2]
+fmla        v28.4s, v2.4s, v5.s[2]
+
+fmla        v29.4s, v0.4s, v5.s[3]
+fmla        v30.4s, v1.4s, v5.s[3]
+fmla        v31.4s, v2.4s, v5.s[3]
+
+ld1         {{ v0.4s, v1.4s, v2.4s }}, [x1], #48
+ld1         {{ v4.4s, v5.4s }}, [x2], #32
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli
new file mode 100644
index 000000000..f1ba56c93
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli
@@ -0,0 +1,107 @@
+// mul a: v0, v1, v2, b: v4, v5
+// load a: d3/x23, d6/x26, d7/x27
+// load b: x4, x5, x6, x7
+
+fmla        v8.4s,  v0.4s, v4.s[0] 
+ldr         d3, [x1], #8
+fmla        v9.4s,  v1.4s, v4.s[0]
+ldr         x4, [x2], #8
+fmla        v10.4s, v2.4s, v4.s[0]
+ldr         x23, [x1], #8
+fmla        v11.4s, v0.4s, v4.s[1]
+ldr         x5, [x2], #8
+fmla        v12.4s, v1.4s, v4.s[1]
+ldr         d6, [x1], #8
+fmla        v13.4s, v2.4s, v4.s[1]
+ldr         x6, [x2], #8
+fmla        v14.4s, v0.4s, v4.s[2]
+ldr         x26, [x1], #8
+fmla        v15.4s, v1.4s, v4.s[2]
+ldr         x7, [x2], #8
+fmla        v16.4s, v2.4s, v4.s[2]
+ldr         d7, [x1], #8
+fmla        v17.4s, v0.4s, v4.s[3]
+ldr         x27, [x1], #8
+fmla        v18.4s, v1.4s, v4.s[3]
+
+fmla        v19.4s, v2.4s, v4.s[3]
+fmla        v20.4s, v0.4s, v5.s[0]
+
+// ins b: v4 <- x4/x5
+// ins a: d3/x23, d6/x26, d7/x27
+
+ins         v4.d[0], x4
+fmla        v21.4s, v1.4s, v5.s[0]
+ins         v4.d[1], x5
+fmla        v22.4s, v2.4s, v5.s[0]
+fmla        v23.4s, v0.4s, v5.s[1]
+
+fmla        v24.4s, v1.4s, v5.s[1]
+fmla        v25.4s, v2.4s, v5.s[1]
+fmla        v26.4s, v0.4s, v5.s[2]
+fmla        v27.4s, v1.4s, v5.s[2]
+fmla        v28.4s, v2.4s, v5.s[2]
+fmla        v29.4s, v0.4s, v5.s[3]
+ins         v3.d[1], x23
+fmla        v30.4s, v1.4s, v5.s[3]
+ins         v6.d[1], x26
+fmla        v31.4s, v2.4s, v5.s[3]
+ins         v7.d[1], x27
+
+// mul a: v3, v6, v7, b: v4, v5
+// ins b, v5 <- x6, x7
+// load a: d0/x20, d1/x21, d2/x22
+// load b: x4, x5
+
+fmla        v8.4s,  v3.4s, v4.s[0] 
+ins         v5.d[0], x6
+fmla        v9.4s,  v6.4s, v4.s[0]
+ins         v5.d[1], x7
+fmla        v10.4s, v7.4s, v4.s[0]
+ldr         d0, [x1], #8
+fmla        v11.4s, v3.4s, v4.s[1]
+ldr         x4, [x2], #8
+fmla        v12.4s, v6.4s, v4.s[1]
+ldr         x20, [x1], #8
+fmla        v13.4s, v7.4s, v4.s[1]
+ldr         x5, [x2], #8
+fmla        v14.4s, v3.4s, v4.s[2]
+ldr         d1, [x1], #8
+fmla        v15.4s, v6.4s, v4.s[2]
+ldr         x6, [x2], #8
+fmla        v16.4s, v7.4s, v4.s[2]
+ldr         x21, [x1], #8
+fmla        v17.4s, v3.4s, v4.s[3]
+ldr         x7, [x2], #8
+
+// load b: x6, x7
+fmla        v18.4s, v6.4s, v4.s[3]
+ldr         d2, [x1], #8
+fmla        v19.4s, v7.4s, v4.s[3]
+ldr         x22, [x1], #8
+fmla        v20.4s, v3.4s, v5.s[0]
+fmla        v21.4s, v6.4s, v5.s[0]
+fmla        v22.4s, v7.4s, v5.s[0]
+fmla        v23.4s, v3.4s, v5.s[1]
+fmla        v24.4s, v6.4s, v5.s[1]
+fmla        v25.4s, v7.4s, v5.s[1]
+
+// ins a: d0/x20, d1/x21, d2/x22
+fmla        v26.4s, v3.4s, v5.s[2]
+ins         v0.d[1], x20
+fmla        v27.4s, v6.4s, v5.s[2]
+ins         v1.d[1], x21
+fmla        v28.4s, v7.4s, v5.s[2]
+ins         v2.d[1], x22
+
+// ins b: v4 <- x4, x5
+fmla        v29.4s, v3.4s, v5.s[3]
+ins         v4.d[0], x4
+fmla        v30.4s, v6.4s, v5.s[3]
+ins         v4.d[1], x5
+fmla        v31.4s, v7.4s, v5.s[3]
+
+// ins b: v5 <- x6, x7
+ins         v5.d[0], x6
+ins         v5.d[1], x7
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8_core.tmpl
new file mode 100644
index 000000000..0c5657456
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_12x8_core.tmpl
@@ -0,0 +1,163 @@
+// vim: ft=arm
+
+// C tile regs: 
+// - x19-x29 to preserve (but x19, x28, x29 not used) 
+// - d8..d15 to preserve
+// - v16 to v31, no need to preserve
+//
+// v8  v11 v14 v17 v20 v23 v26 v29
+// v9  v12 v15 v18 v21 v24 v27 v30
+// v10 v13 v16 v19 v22 v25 v28 v31
+
+// no preservation for v0-v7:
+// packed A buffering (2x8 values): rotating over v0..v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_mmm_f32_12x8_{{core}}_{{suffix}}
+{{G}}arm64simd_mmm_f32_12x8_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    ld1         { v0.4s, v1.4s, v2.4s }, [ x1 ], #48
+    ld1         { v4.4s, v5.4s }, [ x2 ], #32
+
+{% capture packed_packed_loop1 %}
+    {% if core == "a53" %}
+        {% include "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli" %}
+    {% else %}
+        {% include "arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli" %}
+    {% endif %}
+{% endcapture %}
+
+{% capture packed_packed_loop2 %}
+    {% if core == "a55" %}
+        {% include "arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli" %}
+    {% else %}
+        {{ packed_packed_loop1 }}
+        {{ packed_packed_loop1 }}
+    {% endif %}
+{% endcapture %}
+
+    cmp         x3, #4
+    blt         .packed_packed_loop_1
+
+.p2align 4
+.packed_packed_loop_4:
+    {{ packed_packed_loop2 }}
+    {{ packed_packed_loop2 }}
+
+    sub x3, x3, #4
+    cmp x3, #4
+    bge .packed_packed_loop_4
+
+    cmp x3, #0
+    beq .non_linear_loop
+
+.p2align 4
+.packed_packed_loop_1:
+    {{ packed_packed_loop1 }}
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+{% include "arm64simd_mmm_f32_scalars.tmpliq" from:8, to:31%}
+{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:12, from:8, to:31 %}
+{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:12, from:8, to:31 %}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:8, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8 ]           // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    {% for col in (0..7) %}
+        mov x4, x5
+        {% for reg in (0..2) %}
+            {% for lane in (0..3) %}
+                ld1 {v0.s}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:3 | plus: 8| plus: reg}}.4s, v{{col | times:3 | plus: 8 | plus: reg}}.4s, v0.4s
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x2, [x0, #8]
+    ldr     x3, [x0, #16]
+
+    ld1         { v0.4s, v1.4s, v2.4s }, [ x2 ]
+    ld1         { v4.4s, v5.4s }, [ x3 ]
+
+    {% for col in (0..7) %}
+        {% for reg in (0..2) %}
+            fmla v{{col | times:3 | plus: 8 | plus: reg}}.4s, v{{reg}}.4s, v{{col| divided_by:4 | plus: 4}}.s[{{col| modulo: 4}}]
+        {% endfor %}
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x6, #4
+    bne         .store_strides_generic
+
+    {% for col in (0..7) %}
+        str q{{col | times:3 | plus: 8 }}, [ x5 ]
+        str q{{col | times:3 | plus: 9}}, [ x5, #16 ]
+        str q{{col | times:3 | plus: 10}}, [ x5, #32 ]
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_strides_generic:
+    {% for col in (0..7) %}
+        mov x4, x5
+        {% for reg in (0..2) %}
+            {% for lane in (0..3) %}
+                st1 { v{{col | times:3 | plus: 8 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli
new file mode 100644
index 000000000..2ae7a54eb
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli
@@ -0,0 +1,45 @@
+fmla        v16.4s, v0.4s, v4.s[0]
+ldr         x5, [x1]
+fmla        v17.4s, v1.4s, v4.s[0]
+ldr         x6, [x1, #8]
+fmla        v18.4s, v2.4s, v4.s[0]
+ldr         x7, [x1, #16]
+fmla        v19.4s, v3.4s, v4.s[0]
+ldr         x8, [x1, #24]
+fmla        v20.4s, v0.4s, v4.s[1]
+ldr         x9, [x1, #32]
+fmla        v21.4s, v1.4s, v4.s[1]
+ldr         x10, [x1, #40]
+fmla        v22.4s, v2.4s, v4.s[1]
+ldr         x11, [x1, #48]
+fmla        v23.4s, v3.4s, v4.s[1]
+ldr         x12, [x1, #56]
+
+fmla        v24.4s, v0.4s, v4.s[2]
+ldr         x24, [x2]
+fmla        v25.4s, v1.4s, v4.s[2]
+ldr         x25, [x2, #8]
+fmla        v26.4s, v2.4s, v4.s[2]
+add         x1, x1, #64
+fmla        v27.4s, v3.4s, v4.s[2]
+add         x2, x2, #16
+fmla        v28.4s, v0.4s, v4.s[3]
+prfm        pldl1keep, [x1, #256]
+fmla        v29.4s, v1.4s, v4.s[3]
+prfm        pldl1keep, [x2, #256]
+fmla        v30.4s, v2.4s, v4.s[3]
+prfm        pldl1keep, [x1, #256]
+fmla        v31.4s, v3.4s, v4.s[3]
+
+ins         v0.d[0], x5
+ins         v2.d[0], x9
+ins         v1.d[0], x7
+ins         v3.d[0], x11
+ins         v4.d[0], x24
+
+ins         v0.d[1], x6
+ins         v2.d[1], x10
+ins         v1.d[1], x8
+ins         v3.d[1], x12
+ins         v4.d[1], x25
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli
new file mode 100644
index 000000000..637466515
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli
@@ -0,0 +1,21 @@
+
+fmla        v16.4s, v0.4s, v4.s[0]
+fmla        v17.4s, v1.4s, v4.s[0]
+fmla        v18.4s, v2.4s, v4.s[0]
+fmla        v19.4s, v3.4s, v4.s[0]
+fmla        v20.4s, v0.4s, v4.s[1]
+fmla        v21.4s, v1.4s, v4.s[1]
+fmla        v22.4s, v2.4s, v4.s[1]
+fmla        v23.4s, v3.4s, v4.s[1]
+
+fmla        v24.4s, v0.4s, v4.s[2]
+fmla        v25.4s, v1.4s, v4.s[2]
+fmla        v26.4s, v2.4s, v4.s[2]
+fmla        v27.4s, v3.4s, v4.s[2]
+fmla        v28.4s, v0.4s, v4.s[3]
+fmla        v29.4s, v1.4s, v4.s[3]
+fmla        v30.4s, v2.4s, v4.s[3]
+fmla        v31.4s, v3.4s, v4.s[3]
+
+ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [ x1 ], #64
+ld1         {{ v4.4s }}, [ x2 ], #16
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli
new file mode 100644
index 000000000..c0b2f502c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli
@@ -0,0 +1,73 @@
+// mul a: v0, v1, v2, v3 b: v4
+// load a: v5(d5/x5), v6(d6,x6), v7(d7,x7), v8(d8, x8)
+// load b: v9(d9/x9)
+
+fmla        v16.4s, v0.4s, v4.s[0]
+ldr         d5, [x1], #8
+fmla        v17.4s, v1.4s, v4.s[0]
+ldr         d9, [x2], #8
+fmla        v18.4s, v2.4s, v4.s[0]
+ldr         x5, [x1], #8
+fmla        v19.4s, v3.4s, v4.s[0]
+ldr         x9, [x2], #8
+fmla        v20.4s, v0.4s, v4.s[1]
+ldr         d6, [x1], #8
+fmla        v21.4s, v1.4s, v4.s[1]
+ldr         x6, [x1], #8
+fmla        v22.4s, v2.4s, v4.s[1]
+ldr         d7, [x1], #8
+fmla        v23.4s, v3.4s, v4.s[1]
+ldr         x7, [x1], #8
+
+fmla        v24.4s, v0.4s, v4.s[2]
+ldr         d8, [x1], #8
+fmla        v25.4s, v1.4s, v4.s[2]
+ldr         x8, [x1], #8
+fmla        v26.4s, v2.4s, v4.s[2]
+ins         v5.d[1], x5
+fmla        v27.4s, v3.4s, v4.s[2]
+ins         v6.d[1], x6
+fmla        v28.4s, v0.4s, v4.s[3]
+ins         v7.d[1], x7
+fmla        v29.4s, v1.4s, v4.s[3]
+ins         v8.d[1], x8
+fmla        v30.4s, v2.4s, v4.s[3]
+ins         v9.d[1], x9
+fmla        v31.4s, v3.4s, v4.s[3]
+
+// mul a: v5, v6, v7, v8 b: v9
+// load a: v0(d0/x5), v1(d1,x6), v2(d2,x7), v3(d3, x8)
+// load b: v4(d4/x9)
+
+fmla        v16.4s, v5.4s, v9.s[0]
+ldr         d0, [x1], #8
+fmla        v17.4s, v6.4s, v9.s[0]
+ldr         d4, [x2], #8
+fmla        v18.4s, v7.4s, v9.s[0]
+ldr         x5, [x1], #8
+fmla        v19.4s, v8.4s, v9.s[0]
+ldr         x9, [x2], #8
+fmla        v20.4s, v5.4s, v9.s[1]
+ldr         d1, [x1], #8
+fmla        v21.4s, v6.4s, v9.s[1]
+ldr         x6, [x1], #8
+fmla        v22.4s, v7.4s, v9.s[1]
+ldr         d2, [x1], #8
+fmla        v23.4s, v8.4s, v9.s[1]
+ldr         x7, [x1], #8
+
+fmla        v24.4s, v5.4s, v9.s[2]
+ldr         d3, [x1], #8
+fmla        v25.4s, v6.4s, v9.s[2]
+ldr         x8, [x1], #8
+fmla        v26.4s, v7.4s, v9.s[2]
+ins         v0.d[1], x5
+fmla        v27.4s, v8.4s, v9.s[2]
+ins         v1.d[1], x6
+fmla        v28.4s, v5.4s, v9.s[3]
+ins         v2.d[1], x7
+fmla        v29.4s, v6.4s, v9.s[3]
+ins         v3.d[1], x8
+fmla        v30.4s, v7.4s, v9.s[3]
+ins         v4.d[1], x9
+fmla        v31.4s, v8.4s, v9.s[3]
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4_core.tmpl
new file mode 100644
index 000000000..cb8b6e533
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_16x4_core.tmpl
@@ -0,0 +1,174 @@
+// vim: ft=arm
+
+// C tile regs: v16 to v31, (scratch)
+// - x19-x29 to preserve (but x19, x28, x29 not used) 
+// - d8..d15 to preserve
+// - v16 to v31, no need to preserve
+
+// v8 is used, d8 (lower half) must preserved
+// v0-v7 (scratch registers)
+//  packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+//  packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_mmm_f32_16x4_{{core}}_{{suffix}}
+{{G}}arm64simd_mmm_f32_16x4_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ld1         { v4.4s }, [ x2 ], #16
+
+{% capture packed_packed_loop1 %}
+    {% if core == "a53" %}
+        {% include "arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli" %}
+    {% else %}
+        {% include "arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli" %}
+    {% endif %}
+{% endcapture %}
+
+{% capture packed_packed_loop2 %}
+    {% if core == "a55" %}
+        {% include "arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli" %}
+    {% else %}
+        {{ packed_packed_loop1 }}
+        {{ packed_packed_loop1 }}
+    {% endif %}
+{% endcapture %}
+
+    cmp         x3, #4
+    blt         .packed_packed_loop_1
+
+.p2align 4
+.packed_packed_loop_4:
+    {{ packed_packed_loop2 }}
+    {{ packed_packed_loop2 }}
+
+    sub x3, x3, #4
+    cmp x3, #4
+    bge .packed_packed_loop_4
+
+    cmp x3, #0
+    beq .non_linear_loop
+
+.p2align 4
+.packed_packed_loop_1:
+    {{ packed_packed_loop1 }}
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b   .non_linear_loop
+
+{% include "arm64simd_mmm_f32_scalars.tmpliq" from:16, to:31%}
+{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:16, from:16, to:31 %}
+{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:16, from:16, to:31 %}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    {% for col in (0..3) %}
+        mov x4, x5
+        {% for reg in (0..3) %}
+            {% for lane in (0..3) %}
+                ld1 {v0.s}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:4 | plus: 16| plus: reg}}.4s, v{{col | times:4 | plus: 16 | plus: reg}}.4s, v0.4s
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x2, [x0, #8]
+    ldr     x3, [x0, #16]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x2 ]
+    ld1         { v4.4s }, [ x3 ]
+
+    fmla        v16.4s, v0.4s, v4.s[0]
+    fmla        v17.4s, v1.4s, v4.s[0]
+    fmla        v18.4s, v2.4s, v4.s[0]
+    fmla        v19.4s, v3.4s, v4.s[0]
+    fmla        v20.4s, v0.4s, v4.s[1]
+    fmla        v21.4s, v1.4s, v4.s[1]
+    fmla        v22.4s, v2.4s, v4.s[1]
+    fmla        v23.4s, v3.4s, v4.s[1]
+
+    fmla        v24.4s, v0.4s, v4.s[2]
+    fmla        v25.4s, v1.4s, v4.s[2]
+    fmla        v26.4s, v2.4s, v4.s[2]
+    fmla        v27.4s, v3.4s, v4.s[2]
+    fmla        v28.4s, v0.4s, v4.s[3]
+    fmla        v29.4s, v1.4s, v4.s[3]
+    fmla        v30.4s, v2.4s, v4.s[3]
+    fmla        v31.4s, v3.4s, v4.s[3]
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x6, #4
+    bne           .store_strides_generic
+
+    {% for col in (0..3) %}
+        str q{{col | times:4 | plus:16 | plus: 0}}, [ x5 ]
+        str q{{col | times:4 | plus:16 | plus: 1}}, [ x5, #16 ]
+        str q{{col | times:4 | plus:16 | plus: 2}}, [ x5, #32 ]
+        str q{{col | times:4 | plus:16 | plus: 3}}, [ x5, #48 ]
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_strides_generic:
+
+    {% for col in (0..3) %}
+        mov x4, x5
+        {% for reg in (0..3) %}
+            {% for lane in (0..3) %}
+                st1 { v{{col | times:4 | plus: 16 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/loop2/cortex_a55.tmpli
new file mode 100644
index 000000000..b27363856
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/loop2/cortex_a55.tmpli
@@ -0,0 +1,73 @@
+// mul a: v0, v1, v2, v3, v4, v5 b: v7
+// load a: v5(d5/x5), v6(d6,x6), v7(d7,x7), v8(d8, x8)
+// load b: v9(d9/x9)
+
+fmla        v16.4s, v0.4s, v4.s[0]
+ldr         d5, [x1], #8
+fmla        v17.4s, v1.4s, v4.s[0]
+ldr         d9, [x2], #8
+fmla        v18.4s, v2.4s, v4.s[0]
+ldr         x5, [x1], #8
+fmla        v19.4s, v3.4s, v4.s[0]
+ldr         x9, [x2], #8
+fmla        v20.4s, v0.4s, v4.s[1]
+ldr         d6, [x1], #8
+fmla        v21.4s, v1.4s, v4.s[1]
+ldr         x6, [x1], #8
+fmla        v22.4s, v2.4s, v4.s[1]
+ldr         d7, [x1], #8
+fmla        v23.4s, v3.4s, v4.s[1]
+ldr         x7, [x1], #8
+
+fmla        v24.4s, v0.4s, v4.s[2]
+ldr         d8, [x1], #8
+fmla        v25.4s, v1.4s, v4.s[2]
+ldr         x8, [x1], #8
+fmla        v26.4s, v2.4s, v4.s[2]
+ins         v5.d[1], x5
+fmla        v27.4s, v3.4s, v4.s[2]
+ins         v6.d[1], x6
+fmla        v28.4s, v0.4s, v4.s[3]
+ins         v7.d[1], x7
+fmla        v29.4s, v1.4s, v4.s[3]
+ins         v8.d[1], x8
+fmla        v30.4s, v2.4s, v4.s[3]
+ins         v9.d[1], x9
+fmla        v31.4s, v3.4s, v4.s[3]
+
+// mul a: v5, v6, v7, v8 b: v9
+// load a: v0(d0/x5), v1(d1,x6), v2(d2,x7), v3(d3, x8)
+// load b: v4(d4/x9)
+
+fmla        v16.4s, v5.4s, v9.s[0]
+ldr         d0, [x1], #8
+fmla        v17.4s, v6.4s, v9.s[0]
+ldr         d4, [x2], #8
+fmla        v18.4s, v7.4s, v9.s[0]
+ldr         x5, [x1], #8
+fmla        v19.4s, v8.4s, v9.s[0]
+ldr         x9, [x2], #8
+fmla        v20.4s, v5.4s, v9.s[1]
+ldr         d1, [x1], #8
+fmla        v21.4s, v6.4s, v9.s[1]
+ldr         x6, [x1], #8
+fmla        v22.4s, v7.4s, v9.s[1]
+ldr         d2, [x1], #8
+fmla        v23.4s, v8.4s, v9.s[1]
+ldr         x7, [x1], #8
+
+fmla        v24.4s, v5.4s, v9.s[2]
+ldr         d3, [x1], #8
+fmla        v25.4s, v6.4s, v9.s[2]
+ldr         x8, [x1], #8
+fmla        v26.4s, v7.4s, v9.s[2]
+ins         v0.d[1], x5
+fmla        v27.4s, v8.4s, v9.s[2]
+ins         v1.d[1], x6
+fmla        v28.4s, v5.4s, v9.s[3]
+ins         v2.d[1], x7
+fmla        v29.4s, v6.4s, v9.s[3]
+ins         v3.d[1], x8
+fmla        v30.4s, v7.4s, v9.s[3]
+ins         v4.d[1], x9
+fmla        v31.4s, v8.4s, v9.s[3]
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli
new file mode 100644
index 000000000..6742473ea
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli
@@ -0,0 +1,63 @@
+
+fmla        v8.4s, v0.4s, v7.s[0]
+    ldr         x4, [x1]
+fmla        v9.4s, v1.4s, v7.s[0]
+    ldr         x5, [x1, #8]
+fmla        v10.4s, v2.4s, v7.s[0]
+    ldr         x6, [x1, #16]
+fmla        v11.4s, v3.4s, v7.s[0]
+    ldr         x7, [x1, #24]
+fmla        v12.4s, v4.4s, v7.s[0]
+    ldr         x8, [x1, #32]
+fmla        v13.4s, v5.4s, v7.s[0]
+    ldr         x9, [x1, #40]
+
+fmla        v14.4s, v0.4s, v7.s[1]
+    ldr         x10, [x1, #48]
+fmla        v15.4s, v1.4s, v7.s[1]
+    ldr         x11, [x1, #56]
+fmla        v16.4s, v2.4s, v7.s[1]
+    ldr         x12, [x1, #64]
+fmla        v17.4s, v3.4s, v7.s[1]
+    ldr         x13, [x1, #72]
+fmla        v18.4s, v4.4s, v7.s[1]
+    ldr         x14, [x1, #80]
+fmla        v19.4s, v5.4s, v7.s[1]
+    ldr         x15, [x1, #88]
+
+fmla        v20.4s, v0.4s, v7.s[2]
+        ldr         x20, [x2]
+fmla        v21.4s, v1.4s, v7.s[2]
+        ldr         x21, [x2, #8]
+fmla        v22.4s, v2.4s, v7.s[2]
+    add         x1, x1, #96
+fmla        v23.4s, v3.4s, v7.s[2]
+        add         x2, x2, #16
+fmla        v24.4s, v4.4s, v7.s[2]
+    prfm        pldl1keep, [x1, #256]
+fmla        v25.4s, v5.4s, v7.s[2]
+        prfm        pldl1keep, [x2, #256]
+
+fmla        v26.4s, v0.4s, v7.s[3]
+    prfm        pldl1keep, [x1, #320]
+fmla        v27.4s, v1.4s, v7.s[3]
+fmla        v28.4s, v2.4s, v7.s[3]
+fmla        v29.4s, v3.4s, v7.s[3]
+fmla        v30.4s, v4.4s, v7.s[3]
+fmla        v31.4s, v5.4s, v7.s[3]
+
+ins         v0.d[0], x4
+ins         v1.d[0], x6
+ins         v2.d[0], x8
+ins         v3.d[0], x10
+ins         v4.d[0], x12
+ins         v5.d[0], x14
+ins         v7.d[0], x20
+
+ins         v0.d[1], x5
+ins         v1.d[1], x7
+ins         v2.d[1], x9
+ins         v3.d[1], x11
+ins         v4.d[1], x13
+ins         v5.d[1], x15
+ins         v7.d[1], x21
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli
new file mode 100644
index 000000000..93307bf80
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli
@@ -0,0 +1,53 @@
+fmla        v8.4s, v0.4s, v7.s[0]
+fmla        v14.4s, v0.4s, v7.s[1]
+        prfm        pldl1keep, [x2, #256]
+fmla        v20.4s, v0.4s, v7.s[2]
+fmla        v26.4s, v0.4s, v7.s[3]
+    ldr         d0, [x1], #8
+fmla        v9.4s, v1.4s, v7.s[0]
+    ldr         x5, [x1], #8
+fmla        v15.4s, v1.4s, v7.s[1]
+        ldr         x20, [x2], #8
+fmla        v21.4s, v1.4s, v7.s[2]
+        ldr         x21, [x2], #8
+fmla        v27.4s, v1.4s, v7.s[3]
+    ldr         d1, [x1], #8
+fmla        v10.4s, v2.4s, v7.s[0]
+    ldr         x7, [x1], #8
+fmla        v16.4s, v2.4s, v7.s[1]
+    prfm        pldl1keep, [x1, #256]
+fmla        v22.4s, v2.4s, v7.s[2]
+    prfm        pldl1keep, [x1, #320]
+fmla        v28.4s, v2.4s, v7.s[3]
+    ldr         d2, [x1], #8
+fmla        v11.4s, v3.4s, v7.s[0]
+    ldr         x9, [x1], #8
+fmla        v17.4s, v3.4s, v7.s[1]
+    ins         v0.d[1], x5
+fmla        v23.4s, v3.4s, v7.s[2]
+    ins         v1.d[1], x7
+fmla        v29.4s, v3.4s, v7.s[3]
+    ldr         d3, [x1], #8
+fmla        v12.4s, v4.4s, v7.s[0]
+    ldr         x11, [x1], #8
+fmla        v18.4s, v4.4s, v7.s[1]
+    ins         v2.d[1], x9
+fmla        v24.4s, v4.4s, v7.s[2]
+fmla        v30.4s, v4.4s, v7.s[3]
+    ldr         d4, [x1], #8
+fmla        v13.4s, v5.4s, v7.s[0]
+    ldr         x13, [x1], #8
+fmla        v19.4s, v5.4s, v7.s[1]
+    ldr         x14, [x1], #8
+fmla        v25.4s, v5.4s, v7.s[2]
+    ldr         x15, [x1], #8
+fmla        v31.4s, v5.4s, v7.s[3]
+
+ins         v7.d[0], x20
+ins         v7.d[1], x21
+
+ins         v5.d[0], x14
+ins         v5.d[1], x15
+
+ins         v3.d[1], x11
+ins         v4.d[1], x13
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli
new file mode 100644
index 000000000..7b12d7443
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli
@@ -0,0 +1,31 @@
+fmla        v8.4s, v0.4s, v7.s[0]
+fmla        v9.4s, v1.4s, v7.s[0]
+fmla        v10.4s, v2.4s, v7.s[0]
+fmla        v11.4s, v3.4s, v7.s[0]
+fmla        v12.4s, v4.4s, v7.s[0]
+fmla        v13.4s, v5.4s, v7.s[0]
+
+fmla        v14.4s, v0.4s, v7.s[1]
+fmla        v15.4s, v1.4s, v7.s[1]
+fmla        v16.4s, v2.4s, v7.s[1]
+fmla        v17.4s, v3.4s, v7.s[1]
+fmla        v18.4s, v4.4s, v7.s[1]
+fmla        v19.4s, v5.4s, v7.s[1]
+
+fmla        v20.4s, v0.4s, v7.s[2]
+fmla        v21.4s, v1.4s, v7.s[2]
+fmla        v22.4s, v2.4s, v7.s[2]
+fmla        v23.4s, v3.4s, v7.s[2]
+fmla        v24.4s, v4.4s, v7.s[2]
+fmla        v25.4s, v5.4s, v7.s[2]
+
+fmla        v26.4s, v0.4s, v7.s[3]
+fmla        v27.4s, v1.4s, v7.s[3]
+fmla        v28.4s, v2.4s, v7.s[3]
+fmla        v29.4s, v3.4s, v7.s[3]
+fmla        v30.4s, v4.4s, v7.s[3]
+fmla        v31.4s, v5.4s, v7.s[3]
+
+ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [ x1 ], #64
+ld1         {{ v4.4s, v5.4s }}, [ x1 ], #32
+ld1         {{ v7.4s }}, [ x2 ], #16
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4_core.tmpl
new file mode 100644
index 000000000..f77fbd3fd
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_24x4_core.tmpl
@@ -0,0 +1,185 @@
+// vim: ft=arm
+
+// x20..x27 are used, callee-preserved
+
+// C tile regs: v8 to v31, (scratch)
+// - x19-x29 to preserve (but x19, x28, x29 not used) 
+// - d8..d15 to preserve
+// - v16 to v31, no need to preserve
+
+// v8 is used, d8 (lower half) must preserved
+// v0-v7 (scratch registers)
+//  packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+//  packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_mmm_f32_24x4_{{core}}_{{suffix}}
+{{G}}arm64simd_mmm_f32_24x4_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ld1         { v4.4s, v5.4s }, [ x1 ], #32
+    ld1         { v7.4s }, [ x2 ], #16
+
+{% capture packed_packed_loop1 %}
+    {% if core == "a53" %}
+        {% include "arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli" %}
+    {% elsif core == "a55" %}
+        {% include "arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli" %}
+    {% else %}
+        {% include "arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli" %}
+    {% endif %}
+{% endcapture %}
+
+    cmp         x3, #4
+    blt         .packed_packed_loop_1
+
+.p2align 4
+.packed_packed_loop_4:
+    {{ packed_packed_loop1 }}
+    {{ packed_packed_loop1 }}
+    {{ packed_packed_loop1 }}
+    {{ packed_packed_loop1 }}
+
+    sub x3, x3, #4
+    cmp x3, #4
+    bge .packed_packed_loop_4
+
+    cmp x3, #0
+    beq .non_linear_loop
+
+.p2align 4
+.packed_packed_loop_1:
+    {{ packed_packed_loop1 }}
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b   .non_linear_loop
+
+{% include "arm64simd_mmm_f32_scalars.tmpliq" from:8, to:31 %}
+{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:24, from:8, to:31 %}
+{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:24, from:8, to:31 %}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:8, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    {% for col in (0..3) %}
+        mov x4, x5
+        {% for reg in (0..5) %}
+            {% for lane in (0..3) %}
+                ld1 {v0.s}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:6 | plus: 8 | plus: reg}}.4s, v{{col | times:6 | plus: 8 | plus: reg}}.4s, v0.4s
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x2, [x0, #8]
+    ldr     x3, [x0, #16]
+
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x2 ], #64
+    ld1         { v7.4s }, [ x3 ]
+    ld1         { v4.4s, v5.4s }, [ x2 ]
+
+    fmla        v8.4s, v0.4s, v7.s[0]
+    fmla        v9.4s, v1.4s, v7.s[0]
+    fmla        v10.4s, v2.4s, v7.s[0]
+    fmla        v11.4s, v3.4s, v7.s[0]
+    fmla        v12.4s, v4.4s, v7.s[0]
+    fmla        v13.4s, v5.4s, v7.s[0]
+
+    fmla        v14.4s, v0.4s, v7.s[1]
+    fmla        v15.4s, v1.4s, v7.s[1]
+    fmla        v16.4s, v2.4s, v7.s[1]
+    fmla        v17.4s, v3.4s, v7.s[1]
+    fmla        v18.4s, v4.4s, v7.s[1]
+    fmla        v19.4s, v5.4s, v7.s[1]
+
+    fmla        v20.4s, v0.4s, v7.s[2]
+    fmla        v21.4s, v1.4s, v7.s[2]
+    fmla        v22.4s, v2.4s, v7.s[2]
+    fmla        v23.4s, v3.4s, v7.s[2]
+    fmla        v24.4s, v4.4s, v7.s[2]
+    fmla        v25.4s, v5.4s, v7.s[2]
+
+    fmla        v26.4s, v0.4s, v7.s[3]
+    fmla        v27.4s, v1.4s, v7.s[3]
+    fmla        v28.4s, v2.4s, v7.s[3]
+    fmla        v29.4s, v3.4s, v7.s[3]
+    fmla        v30.4s, v4.4s, v7.s[3]
+    fmla        v31.4s, v5.4s, v7.s[3]
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x6, #4
+    bne           .store_strides_generic
+
+    {% for col in (0..3) %}
+        str q{{col | times:6 | plus:8 | plus: 0}}, [ x5 ]
+        str q{{col | times:6 | plus:8 | plus: 1}}, [ x5, #16 ]
+        str q{{col | times:6 | plus:8 | plus: 2}}, [ x5, #32 ]
+        str q{{col | times:6 | plus:8 | plus: 3}}, [ x5, #48 ]
+        str q{{col | times:6 | plus:8 | plus: 4}}, [ x5, #64 ]
+        str q{{col | times:6 | plus:8 | plus: 5}}, [ x5, #80 ]
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_strides_generic:
+
+    {% for col in (0..3) %}
+        mov x4, x5
+        {% for reg in (0..5) %}
+            {% for lane in (0..3) %}
+                st1 { v{{col | times:6 | plus:8 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x1_core.tmpl
new file mode 100644
index 000000000..336248a3c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x1_core.tmpl
@@ -0,0 +1,403 @@
+// vim: ft=arm
+
+// C tile regs:
+// - x19-x29 to preserve (but x19, x28, x29 not used) 
+// - d8..d15 to preserve
+// - v16 to v31, no need to preserve
+// 
+//      v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0]
+//      v16[1] v18[1] 
+//      v16[2] v18[2] 
+//      v16[3] v18[3]
+//                     
+//      v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0]
+//      v17[1] v19[1] 
+//      v17[2] v19[2] 
+//      v17[3] v19[3] 
+
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_mmm_f32_32x1_{{core}}_{{suffix}}
+{{G}}arm64simd_mmm_f32_32x1_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldp         x2, x4, [x0, #24]   // b, packing
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    cmp         x4, #1
+    beq         .q4f16se
+    cmp         x4, #2
+    beq         .q4f32se
+    cmp         x4, #3
+    beq         .f16f16
+    cmp         x4, #4
+    beq         .f32f16
+    cmp         x4, #5
+    beq         .f16f32
+
+    sub         x3, x3, #1
+
+.p2align 4
+.packed_packed_loop_1:
+    ld1         { v8.s }[0], [ x2 ], #4
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ld1         { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64
+
+    fmla        v24.4s, v0.4s, v8.s[0]
+    fmla        v25.4s, v1.4s, v8.s[0]
+    fmla        v26.4s, v2.4s, v8.s[0]
+    fmla        v27.4s, v3.4s, v8.s[0]
+    fmla        v28.4s, v4.4s, v8.s[0]
+    fmla        v29.4s, v5.4s, v8.s[0]
+    fmla        v30.4s, v6.4s, v8.s[0]
+    fmla        v31.4s, v7.4s, v8.s[0]
+
+    subs        x3, x3, #1
+    bge         .packed_packed_loop_1
+
+    b           .non_linear_loop
+
+.p2align 8
+.q40f16_const:
+    .byte 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc
+    .byte 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45, 0x46, 0x47
+
+.q4f16se:
+    adr      x4, .q40f16_const
+    movi     v15.16b, 15
+    ld1      {v13.16b}, [ x4 ]
+    eor      v12.16b, v12.16b, v12.16b
+
+.q4f16se_outerloop:
+{% for i in (0..7) %}
+    eor      v{{i|plus:16}}.16b, v{{i|plus:16}}.16b, v{{i|plus:16}}.16b
+{% endfor %}
+    mov         x4, #32
+
+.p2align 4
+.q4f16se_innerloop:
+        ld1      { v10.16b }, [ x1 ], #16
+        ld1      { v11.h }[0], [ x2 ], #2
+
+        and      v9.16b, v10.16b, v15.16b
+        ushr     v10.16b, v10.16b, 4
+
+        tbl      v9.16b, { v13.16b }, v9.16b
+        tbl      v10.16b, { v13.16b }, v10.16b
+
+        zip1     v0.16b, v12.16b, v9.16b
+        zip2     v2.16b, v12.16b, v9.16b
+        zip1     v4.16b, v12.16b, v10.16b
+        zip2     v6.16b, v12.16b, v10.16b
+
+        fcvtl    v11.4s, v11.4h
+
+        fcvtl2   v1.4s, v0.8h
+        fcvtl2   v3.4s, v2.8h
+        fcvtl2   v5.4s, v4.8h
+        fcvtl2   v7.4s, v6.8h
+        fcvtl    v0.4s, v0.4h
+        fcvtl    v2.4s, v2.4h
+        fcvtl    v4.4s, v4.4h
+        fcvtl    v6.4s, v6.4h
+
+{% for i in (0..7) %}
+        fmla        v{{ i|plus: 16 }}.4s, v{{i}}.4s, v11.s[0]
+{% endfor %}
+
+    subs        x4, x4, #1
+    bne         .q4f16se_innerloop
+
+    // scales
+    ld1         { v0.8h-v3.8h }, [ x1 ], #64
+
+    fcvtl       v4.4s, v0.4h
+    fcvtl2      v5.4s, v0.8h
+    fcvtl       v6.4s, v1.4h
+    fcvtl2      v7.4s, v1.8h
+    fcvtl       v8.4s, v2.4h
+    fcvtl2      v9.4s, v2.8h
+    fcvtl       v10.4s, v3.4h
+    fcvtl2      v11.4s, v3.8h
+
+{% for i in (0..7) %}
+       fmla     v{{i|plus:24}}.4s, v{{i|plus:4}}.4s, v{{i|plus:16}}.4s
+{% endfor %}
+
+    subs        x3, x3, #32
+    bne         .q4f16se_outerloop
+
+    b           .non_linear_loop
+
+.q4f32se:
+    adr      x4, .q40f16_const
+    movi     v15.16b, 15
+    ld1      {v13.16b}, [ x4 ]
+    eor      v12.16b, v12.16b, v12.16b
+
+.q4f32se_outerloop:
+{% for i in (0..7) %}
+    eor      v{{i|plus:16}}.16b, v{{i|plus:16}}.16b, v{{i|plus:16}}.16b
+{% endfor %}
+    mov         x4, #32
+
+.p2align 4
+.q4f32se_innerloop:
+        ld1      { v10.16b }, [ x1 ], #16
+        ld1      { v11.s }[0], [ x2 ], #4
+
+        and      v9.16b, v10.16b, v15.16b
+        ushr     v10.16b, v10.16b, 4
+
+        tbl      v9.16b, { v13.16b }, v9.16b
+        tbl      v10.16b, { v13.16b }, v10.16b
+
+        zip1     v0.16b, v12.16b, v9.16b
+        zip2     v2.16b, v12.16b, v9.16b
+        zip1     v4.16b, v12.16b, v10.16b
+        zip2     v6.16b, v12.16b, v10.16b
+
+        fcvtl2   v1.4s, v0.8h
+        fcvtl2   v3.4s, v2.8h
+        fcvtl2   v5.4s, v4.8h
+        fcvtl2   v7.4s, v6.8h
+        fcvtl    v0.4s, v0.4h
+        fcvtl    v2.4s, v2.4h
+        fcvtl    v4.4s, v4.4h
+        fcvtl    v6.4s, v6.4h
+
+{% for i in (0..7) %}
+        fmla        v{{ i|plus: 16 }}.4s, v{{i}}.4s, v11.s[0]
+{% endfor %}
+
+    subs        x4, x4, #1
+    bne         .q4f32se_innerloop
+
+    // scales
+    ld1         { v0.8h-v3.8h }, [ x1 ], #64
+
+    fcvtl       v4.4s, v0.4h
+    fcvtl2      v5.4s, v0.8h
+    fcvtl       v6.4s, v1.4h
+    fcvtl2      v7.4s, v1.8h
+    fcvtl       v8.4s, v2.4h
+    fcvtl2      v9.4s, v2.8h
+    fcvtl       v10.4s, v3.4h
+    fcvtl2      v11.4s, v3.8h
+
+{% for i in (0..7) %}
+       fmla     v{{i|plus:24}}.4s, v{{i|plus:4}}.4s, v{{i|plus:16}}.4s
+{% endfor %}
+
+    subs        x3, x3, #32
+    bne         .q4f32se_outerloop
+
+    b           .non_linear_loop
+
+.p2align 4
+.f16f16:
+    sub         x3, x3, #1
+.f16f16_loop:
+    ld1         { v9.h }[0], [ x2 ], #2
+    ld1         { v10.8h-v13.8h }, [ x1 ], #64
+
+    fcvtl       v8.4s, v9.4h
+    {% for reg in (0..3) %}
+        fcvtl       v{{reg|times:2}}.4s, v{{reg|plus:10}}.4h
+        fcvtl2      v{{reg|times:2|plus:1}}.4s, v{{reg|plus:10}}.8h
+    {% endfor %}
+
+    fmla        v24.4s, v0.4s, v8.s[0]
+    fmla        v25.4s, v1.4s, v8.s[0]
+    fmla        v26.4s, v2.4s, v8.s[0]
+    fmla        v27.4s, v3.4s, v8.s[0]
+    fmla        v28.4s, v4.4s, v8.s[0]
+    fmla        v29.4s, v5.4s, v8.s[0]
+    fmla        v30.4s, v6.4s, v8.s[0]
+    fmla        v31.4s, v7.4s, v8.s[0]
+
+    subs        x3, x3, #1
+    bge         .f16f16_loop
+
+    b           .non_linear_loop
+
+.p2align 4
+.f32f16:
+    sub         x3, x3, #1
+.f32f16_loop:
+    ld1         { v9.h }[0], [ x2 ], #2
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ld1         { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64
+
+    fcvtl       v8.4s, v9.4h
+
+    fmla        v24.4s, v0.4s, v8.s[0]
+    fmla        v25.4s, v1.4s, v8.s[0]
+    fmla        v26.4s, v2.4s, v8.s[0]
+    fmla        v27.4s, v3.4s, v8.s[0]
+    fmla        v28.4s, v4.4s, v8.s[0]
+    fmla        v29.4s, v5.4s, v8.s[0]
+    fmla        v30.4s, v6.4s, v8.s[0]
+    fmla        v31.4s, v7.4s, v8.s[0]
+
+    subs        x3, x3, #1
+    bge         .f32f16_loop
+
+    b           .non_linear_loop
+
+.p2align 4
+.f16f32:
+    sub         x3, x3, #1
+.f16f32_loop:
+    ld1         { v8.s }[0], [ x2 ], #4
+    ld1         { v10.8h-v13.8h }, [ x1 ], #64
+
+    {% for reg in (0..3) %}
+        fcvtl       v{{reg|times:2}}.4s, v{{reg|plus:10}}.4h
+        fcvtl2      v{{reg|times:2|plus:1}}.4s, v{{reg|plus:10}}.8h
+    {% endfor %}
+
+    fmla        v24.4s, v0.4s, v8.s[0]
+    fmla        v25.4s, v1.4s, v8.s[0]
+    fmla        v26.4s, v2.4s, v8.s[0]
+    fmla        v27.4s, v3.4s, v8.s[0]
+    fmla        v28.4s, v4.4s, v8.s[0]
+    fmla        v29.4s, v5.4s, v8.s[0]
+    fmla        v30.4s, v6.4s, v8.s[0]
+    fmla        v31.4s, v7.4s, v8.s[0]
+
+    subs        x3, x3, #1
+    bge         .f16f32_loop
+
+    b           .non_linear_loop
+
+{% include "arm64simd_mmm_f32_scalars.tmpliq" from:24, to:31%}
+{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:32, from:24, to:31%}
+{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:32, from:24, to:31%}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:24, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]           // c base ptr, rsc
+    cmp         x6, #4
+    beq         .do_per_row_add
+
+    {% for reg in (24..31) %}
+        {% for lane in (0..3) %}
+            ld1 {v0.s}[{{lane}}], [ x5 ], x6
+        {% endfor %}
+        fadd v{{reg}}.4s, v{{reg}}.4s, v0.4s
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.do_per_row_add:
+    ld1     {v0.4s-v3.4s}, [x5], #64
+    ld1     {v4.4s-v7.4s}, [x5], #64
+
+    {% for r in (0..7) %}
+        fadd v{{r| plus: 24}}.4s, v{{r | plus: 24}}.4s, v{{r}}.4s
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x3, [x0, #16]
+    ldr     x2, [x0, #8]
+
+    ld1         {v8.s}[0], [ x3 ]
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x2 ], #64
+    ld1         { v4.4s, v5.4s, v6.4s, v7.4s }, [ x2 ], #64
+
+    fmla        v24.4s, v0.4s, v8.s[0]
+    fmla        v25.4s, v1.4s, v8.s[0]
+    fmla        v26.4s, v2.4s, v8.s[0]
+    fmla        v27.4s, v3.4s, v8.s[0]
+    fmla        v28.4s, v4.4s, v8.s[0]
+    fmla        v29.4s, v5.4s, v8.s[0]
+    fmla        v30.4s, v6.4s, v8.s[0]
+    fmla        v31.4s, v7.4s, v8.s[0]
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]                // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]               // csc, item_size
+
+    cmp         x8, #2
+    beq         .store_f16
+
+    cmp         x6, #4
+    beq         .store_strides_contig
+
+    {% for reg in (24..31) %}
+        {% for lane in (0..3) %}
+            st1 { v{{reg}}.s }[{{lane}}], [ x5 ], x6
+        {% endfor %}
+    {% endfor %}
+    b           .non_linear_loop
+
+.store_strides_contig:
+
+    {% for reg in (24..31) %}
+        st1 { v{{reg}}.4s }, [ x5 ], #16
+    {% endfor %}
+    b           .non_linear_loop
+
+.store_f16:
+    {% for reg in (0..3) %}
+        fcvtn  v{{reg}}.4h, v{{reg|times:2|plus:24}}.4s
+        fcvtn2 v{{reg}}.8h, v{{reg|times:2|plus:25}}.4s
+    {% endfor %}
+
+    cmp         x6, #2
+    beq         .store_strides_contig_f16
+
+    {% for reg in (0..3) %}
+        {% for lane in (0..7) %}
+            st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6
+        {% endfor %}
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_strides_contig_f16:
+
+    {% for reg in (0..3) %}
+        st1 { v{{reg}}.8h }, [ x5 ], #16
+    {% endfor %}
+    b           .non_linear_loop
+
+.return:
+
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x3_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x3_core.tmpl
new file mode 100644
index 000000000..e434b2b5e
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_32x3_core.tmpl
@@ -0,0 +1,307 @@
+// vim: ft=arm
+
+// C tile regs: v16 to v31, no need to preserve
+
+// no preservation either for v0-v7...
+// v8..v15 are callee-preserved
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.global {{G}}arm64simd_mmm_f32_32x3_{{core}}_{{suffix}}
+{{G}}arm64simd_mmm_f32_32x3_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldp         x2, x4, [x0, #24]   // b, packing
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    cmp         x4, #1
+    beq         .f32f16
+    cmp         x4, #2
+    beq         .f16f32
+    cmp         x4, #3
+    beq         .f16f16
+
+.p2align 4
+.packed_packed_loop_1:
+    ld1         { v7.4s }, [ x2 ]
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ld1         { v4.4s, v5.4s, v6.4s }, [ x1 ], #48
+    add         x2, x2, #12
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:8}}.4s, v0.4s, v7.s[{{ col }}]
+{% endfor %}
+
+    ld1         { v0.4s }, [ x1 ], #16
+
+{% for row in (1..6) %}
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:8|plus:row}}.4s, v{{row}}.4s, v7.s[{{col}}]
+    {% endfor %}
+{% endfor %}
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:15}}.4s, v0.4s, v7.s[{{ col }}]
+{% endfor %}
+
+    subs        x3, x3, #1
+    bne         .packed_packed_loop_1
+
+    b           .non_linear_loop
+
+.p2align 4
+.f32f16:
+    ld1         { v7.4h }, [ x2 ]
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ld1         { v4.4s, v5.4s, v6.4s }, [ x1 ], #48
+    fcvtl       v7.4s, v7.4h
+    add         x2, x2, #6
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:8}}.4s, v0.4s, v7.s[{{ col }}]
+{% endfor %}
+
+    ld1         { v0.4s }, [ x1 ], #16
+
+{% for row in (1..6) %}
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:8|plus:row}}.4s, v{{row}}.4s, v7.s[{{col}}]
+    {% endfor %}
+{% endfor %}
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:15}}.4s, v0.4s, v7.s[{{ col }}]
+{% endfor %}
+
+    subs        x3, x3, #1
+    bne         .f32f16
+
+    b           .non_linear_loop
+
+.p2align 4
+.f16f32:
+    ld1         { v7.4s }, [ x2 ]
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64
+    add         x2, x2, #12
+
+    fcvtl       v4.4s, v0.4h
+    fcvtl2      v5.4s, v0.8h
+    fcvtl       v6.4s, v1.4h
+    fcvtl2      v0.4s, v1.8h
+
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:8}}.4s, v4.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:9}}.4s, v5.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:10}}.4s, v6.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:11}}.4s, v0.4s, v7.s[{{col}}]
+    {% endfor %}
+
+    fcvtl       v4.4s, v2.4h
+    fcvtl2      v5.4s, v2.8h
+    fcvtl       v6.4s, v3.4h
+    fcvtl2      v1.4s, v3.8h
+    
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:12}}.4s, v4.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:13}}.4s, v5.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:14}}.4s, v6.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:15}}.4s, v1.4s, v7.s[{{col}}]
+    {% endfor %}
+
+    subs        x3, x3, #1
+    bne         .f16f32
+
+    b           .non_linear_loop
+
+.p2align 4
+.f16f16:
+    ld1         { v7.4h }, [ x2 ]
+    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64
+    add         x2, x2, #6
+
+    fcvtl       v7.4s, v7.4h
+
+    fcvtl       v4.4s, v0.4h
+    fcvtl2      v5.4s, v0.8h
+    fcvtl       v6.4s, v1.4h
+    fcvtl2      v0.4s, v1.8h
+
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:8}}.4s, v4.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:9}}.4s, v5.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:10}}.4s, v6.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:11}}.4s, v0.4s, v7.s[{{col}}]
+    {% endfor %}
+
+    fcvtl       v4.4s, v2.4h
+    fcvtl2      v5.4s, v2.8h
+    fcvtl       v6.4s, v3.4h
+    fcvtl2      v1.4s, v3.8h
+    
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:12}}.4s, v4.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:13}}.4s, v5.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:14}}.4s, v6.4s, v7.s[{{col}}]
+        fmla        v{{ col|times:8|plus:15}}.4s, v1.4s, v7.s[{{col}}]
+    {% endfor %}
+
+    subs        x3, x3, #1
+    bne         .f16f16
+
+    b           .non_linear_loop
+
+
+{% include "arm64simd_mmm_f32_scalars.tmpliq" from:8, to:31%}
+{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:32, from:8, to:31%}
+{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:32, from:8, to:31%}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:8, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    {% for col in (0..2) %}
+        mov x4, x5
+        {% for reg in (0..7) %}
+            {% for lane in (0..3) %}
+                ld1 {v0.s}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:8 | plus: 8| plus: reg}}.4s, v{{col | times:8 | plus: 8 | plus: reg}}.4s, v0.4s
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldp         x2, x3, [x0, #8]
+
+    ld1         { v7.d }[0], [ x3 ], #8
+    ld1         { v7.s }[2], [ x3 ], #4
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x2 ], #64
+    ld1         { v4.4s, v5.4s, v6.4s }, [ x2 ], #48
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:8}}.4s, v0.4s, v7.s[{{ col }}]
+{% endfor %}
+
+    ld1         { v0.4s }, [ x2 ], #16
+
+{% for row in (1..6) %}
+    {% for col in (0..2) %}
+        fmla        v{{ col|times:8|plus:8|plus:row}}.4s, v{{row}}.4s, v7.s[{{col}}]
+    {% endfor %}
+{% endfor %}
+
+{% for col in (0..2) %}
+    fmla        v{{ col|times:8|plus:15}}.4s, v0.4s, v7.s[{{ col }}]
+{% endfor %}
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]                // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]               // csc, item_size
+
+    cmp         x8, #2
+    beq         .store_f16
+
+    cmp         x6, #4
+    beq         .store_strides_contig
+
+
+    {% for col in (0..2) %}
+        mov x4, x5
+        {% for reg in (0..7) %}
+            {% for lane in (0..3) %}
+                st1 { v{{col | times:8 | plus: 8 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+    b           .non_linear_loop
+
+.store_strides_contig:
+
+    {% for col in (0..2) %}
+        mov x4, x5
+        {% for r in (0..7) %}
+            st1 { v{{col | times:8 | plus: 8 | plus: r}}.4s }, [ x4 ], 16
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_f16:
+
+    cmp         x6, #2
+    beq         .store_strides_contig_f16
+
+    {% for col in (0..2) %}
+        {% for reg in (0..3) %}
+            fcvtn  v{{reg}}.4h, v{{col|times:4|plus:reg|times:2|plus:8}}.4s
+            fcvtn2 v{{reg}}.8h, v{{col|times:4|plus:reg|times:2|plus:9}}.4s
+        {% endfor %}
+
+        mov x4, x5
+        {% for reg in (0..3) %}
+            {% for lane in (0..7) %}
+                st1 { v{{reg}}.h }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+
+    {% endfor %}
+
+
+    b           .non_linear_loop
+
+.store_strides_contig_f16:
+
+    {% for col in (0..2) %}
+        {% for reg in (0..3) %}
+            fcvtn  v{{reg}}.4h, v{{col|times:4|plus:reg|times:2|plus:8}}.4s
+            fcvtn2 v{{reg}}.8h, v{{col|times:4|plus:reg|times:2|plus:9}}.4s
+        {% endfor %}
+
+        mov x4, x5
+        {% for reg in (0..3) %}
+            st1 { v{{reg}}.4s }, [ x4 ], #16
+        {% endfor %}
+        add x5, x5, x7
+
+    {% endfor %}
+    b           .non_linear_loop
+
+
+.return:
+
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli
new file mode 100644
index 000000000..410816dff
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli
@@ -0,0 +1,65 @@
+    fmla        v16.4s, v0.4s, v8.s[0]
+    ldr         x5, [x1, #128]
+    fmla        v17.4s, v1.4s, v8.s[0]
+    ldr         x6, [x1, #136]
+    fmla        v18.4s, v2.4s, v8.s[0]
+    ldr         x7, [x1, #144]
+    fmla        v19.4s, v3.4s, v8.s[0]
+    ldr         x9, [x1, #152]
+    ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [ x1 ], #64
+
+    fmla        v20.4s, v4.4s, v8.s[0]
+    ldr         x10, [x1, #96]
+    fmla        v21.4s, v5.4s, v8.s[0]
+    ldr         x11, [x1, #104]
+    fmla        v22.4s, v6.4s, v8.s[0]
+    ldr         x12, [x1, #112]
+    fmla        v23.4s, v7.4s, v8.s[0]
+    ldr         x13, [x1, #120]
+
+    ld1         {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [ x1 ]
+
+    fmla        v24.4s, v0.4s, v8.s[0]
+    ldr         x14, [x1, #128]
+    fmla        v25.4s, v1.4s, v8.s[0]
+    ldr         x15, [x1, #136]
+    fmla        v26.4s, v2.4s, v8.s[0]
+    ldr         x20, [x1, #144]
+    fmla        v27.4s, v3.4s, v8.s[0]
+    ldr         x21, [x1, #152]
+    fmla        v28.4s, v4.4s, v8.s[0]
+    ldr         x22, [x1, #160]
+    fmla        v29.4s, v5.4s, v8.s[0]
+    ldr         x23, [x1, #168]
+    fmla        v30.4s, v6.4s, v8.s[0]
+    ldr         x24, [x1, #176]
+    fmla        v31.4s, v7.4s, v8.s[0]
+    ldr         x25, [x1, #184]
+
+    ld1         {{ v8.s }}[0], [ x2 ], #4
+
+    prfm        pldl1keep, [x1, #1024]
+    prfm        pldl1keep, [x1, #1088]
+    prfm        pldl1keep, [x1, #1152]
+    prfm        pldl1keep, [x1, #1216]
+    prfm        pldl1keep, [x2, #256]
+
+    ins         v0.d[0], x5
+    ins         v1.d[0], x7
+    ins         v2.d[0], x10
+    ins         v3.d[0], x12
+    ins         v4.d[0], x14
+    ins         v5.d[0], x20
+    ins         v6.d[0], x22
+    ins         v7.d[0], x24
+
+    ins         v0.d[1], x6
+    ins         v1.d[1], x9
+    ins         v2.d[1], x11
+    ins         v3.d[1], x13
+    ins         v4.d[1], x15
+    ins         v5.d[1], x21
+    ins         v6.d[1], x23
+    ins         v7.d[1], x25
+
+    add         x1, x1, #192
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/naive.tmpli
new file mode 100644
index 000000000..c65deb967
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop1/naive.tmpli
@@ -0,0 +1,32 @@
+    ld1         {{ v9.4s, v10.4s, v11.4s, v12.4s }}, [x1], #64
+    ld1         {{ v13.4s, v14.4s, v15.4s }}, [x1], #48
+
+    fmla        v16.4s, v0.4s, v8.s[0]
+    fmla        v17.4s, v1.4s, v8.s[0]
+    fmla        v18.4s, v2.4s, v8.s[0]
+    fmla        v19.4s, v3.4s, v8.s[0]
+    fmla        v20.4s, v4.4s, v8.s[0]
+    fmla        v21.4s, v5.4s, v8.s[0]
+    fmla        v22.4s, v6.4s, v8.s[0]
+    fmla        v23.4s, v7.4s, v8.s[0]
+    fmla        v24.4s, v9.4s, v8.s[0]
+    ld1         {{ v9.4s }}, [ x1 ], #16
+    ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64
+    ld1         {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64
+    fmla        v25.4s, v10.4s, v8.s[0]
+    fmla        v26.4s, v11.4s, v8.s[0]
+    fmla        v27.4s, v12.4s, v8.s[0]
+    fmla        v28.4s, v13.4s, v8.s[0]
+    fmla        v29.4s, v14.4s, v8.s[0]
+    fmla        v30.4s, v15.4s, v8.s[0]
+
+    fmla        v31.4s, v9.4s, v8.s[0]
+
+    ld1         {{ v8.s }}[0], [ x2 ], #4
+
+    prfm        pldl1keep, [x1, #1024]
+    prfm        pldl1keep, [x1, #1088]
+    prfm        pldl1keep, [x1, #1152]
+    prfm        pldl1keep, [x1, #1216]
+    prfm        pldl1keep, [x2, #256]
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli
new file mode 100644
index 000000000..2a5f06603
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli
@@ -0,0 +1,85 @@
+    ld1         {{ v9.4s, v10.4s, v11.4s }}, [x1], #48
+
+    fmla        v16.4s, v0.4s, v8.s[0]
+    ldr         x8, [x2], #8
+    fmla        v17.4s, v1.4s, v8.s[0]
+    ldr         d12, [x1], #8
+    fmla        v18.4s, v2.4s, v8.s[0]
+    ldr         x12, [x1], #8
+    fmla        v19.4s, v3.4s, v8.s[0]
+    ldr         d13, [x1], #8
+    fmla        v20.4s, v4.4s, v8.s[0]
+    ldr         x13, [x1], #8
+    fmla        v21.4s, v5.4s, v8.s[0]
+    ldr         d14, [x1], #8
+    fmla        v22.4s, v6.4s, v8.s[0]
+    ldr         x14, [x1], #8
+    fmla        v23.4s, v7.4s, v8.s[0]
+    ldr         d15, [x1], #8
+    fmla        v24.4s, v9.4s, v8.s[0]
+    ldr         x15, [x1], #8
+
+    ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64
+    ins         v8.d[1], x8
+    ld1         {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64
+
+    fmla        v25.4s, v10.4s, v8.s[0]
+    ins         v12.d[1], x12
+    fmla        v26.4s, v11.4s, v8.s[0]
+    ins         v13.d[1], x13
+    fmla        v27.4s, v12.4s, v8.s[0]
+    ins         v14.d[1], x14
+    fmla        v28.4s, v13.4s, v8.s[0]
+    ins         v15.d[1], x15
+
+    ld1         {{ v9.4s, v10.4s, v11.4s, v12.4s }}, [x1], #64
+
+    fmla        v29.4s, v14.4s, v8.s[0]
+    ldr         d13, [x1], #8
+    fmla        v30.4s, v15.4s, v8.s[0]
+    ldr         x13, [x1], #8
+    fmla        v31.4s, v0.4s, v8.s[0]
+    ldr         d14, [x1], #8
+
+    fmla        v16.4s, v1.4s, v8.s[2]
+    ldr         x14, [x1], #8
+    fmla        v17.4s, v2.4s, v8.s[2]
+    ldr         d15, [x1], #8
+    fmla        v18.4s, v3.4s, v8.s[2]
+    ldr         x15, [x1], #8
+    fmla        v19.4s, v4.4s, v8.s[2]
+
+    ld1         {{ v0.4s }}, [x1], #16
+
+    fmla        v20.4s, v5.4s, v8.s[2]
+    ldr         d1, [x1], #8
+    fmla        v21.4s, v6.4s, v8.s[2]
+    ldr         x10, [x1], #8
+
+    fmla        v22.4s, v7.4s, v8.s[2]
+
+    fmla        v23.4s, v9.4s, v8.s[2]
+    ins         v13.d[1], x13
+    fmla        v24.4s, v10.4s, v8.s[2]
+    ins         v14.d[1], x14
+    fmla        v25.4s, v11.4s, v8.s[2]
+    ins         v15.d[1], x15
+
+    fmla        v26.4s, v12.4s, v8.s[2]
+    prfm        pldl1keep, [x1, #1024]
+    fmla        v27.4s, v13.4s, v8.s[2]
+    ins         v1.d[1], x10
+    fmla        v28.4s, v14.4s, v8.s[2]
+    prfm        pldl1keep, [x1, #1088]
+    fmla        v29.4s, v15.4s, v8.s[2]
+    prfm        pldl1keep, [x1, #1152]
+    fmla        v30.4s, v0.4s, v8.s[2]
+    prfm        pldl1keep, [x1, #1216]
+    fmla        v31.4s, v1.4s, v8.s[2]
+    prfm        pldl1keep, [x2, #256]
+
+    ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64
+    ins         v8.s[0], v8.s[3]
+    ld1         {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64
+
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/naive.tmpli
new file mode 100644
index 000000000..cba3dadc5
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1/loop2/naive.tmpli
@@ -0,0 +1,66 @@
+// load a: v9, v10, v11, v12, v13, v14, v15
+// load a: v0, v1, v2, v3, v4, v4, v6, v7
+
+    ld1         {{ v9.4s, v10.4s, v11.4s, v12.4s }}, [x1], #64
+    ld1         {{ v13.4s, v14.4s, v15.4s }}, [x1], #48
+
+    fmla        v16.4s, v0.4s, v8.s[0]
+    fmla        v17.4s, v1.4s, v8.s[0]
+    fmla        v18.4s, v2.4s, v8.s[0]
+    fmla        v19.4s, v3.4s, v8.s[0]
+
+    ld1         {{ v0.4s, v1.4s }}, [x1], #32
+
+    fmla        v20.4s, v4.4s, v8.s[0]
+    fmla        v21.4s, v5.4s, v8.s[0]
+
+    ld1         {{  v2.4s, v3.4s, v4.4s, v5.4s }}, [x1], #64
+    fmla        v22.4s, v6.4s, v8.s[0]
+    fmla        v23.4s, v7.4s, v8.s[0]
+
+    ld1         {{  v6.4s, v7.4s }}, [x1], #32
+
+    fmla        v24.4s, v9.4s, v8.s[0]
+    fmla        v25.4s, v10.4s, v8.s[0]
+    fmla        v26.4s, v11.4s, v8.s[0]
+    fmla        v27.4s, v12.4s, v8.s[0]
+    fmla        v28.4s, v13.4s, v8.s[0]
+    fmla        v29.4s, v14.4s, v8.s[0]
+    fmla        v30.4s, v15.4s, v8.s[0]
+
+    ld1         {{ v9.4s, v10.4s, v11.4s, v12.4s }}, [x1], #64
+    ld1         {{ v13.4s, v14.4s, v15.4s }}, [x1], #48
+
+    fmla        v31.4s, v0.4s, v8.s[0]
+    ld1         {{ v8.s }}[0], [ x2 ], #4
+
+    fmla        v16.4s, v1.4s, v8.s[0]
+    ld1         {{ v0.4s, v1.4s }}, [x1], #32
+    fmla        v17.4s, v2.4s, v8.s[0]
+    fmla        v18.4s, v3.4s, v8.s[0]
+    fmla        v19.4s, v4.4s, v8.s[0]
+
+    fmla        v20.4s, v5.4s, v8.s[0]
+    fmla        v21.4s, v6.4s, v8.s[0]
+    fmla        v22.4s, v7.4s, v8.s[0]
+    fmla        v23.4s, v9.4s, v8.s[0]
+
+    fmla        v24.4s, v10.4s, v8.s[0]
+    fmla        v25.4s, v11.4s, v8.s[0]
+    fmla        v26.4s, v12.4s, v8.s[0]
+    fmla        v27.4s, v13.4s, v8.s[0]
+    fmla        v28.4s, v14.4s, v8.s[0]
+    fmla        v29.4s, v15.4s, v8.s[0]
+    fmla        v30.4s, v0.4s, v8.s[0]
+    fmla        v31.4s, v1.4s, v8.s[0]
+    ld1         {{ v8.s }}[0], [ x2 ], #4
+
+    ld1         {{ v0.4s, v1.4s, v2.4s, v3.4s }}, [x1], #64
+    ld1         {{ v4.4s, v5.4s, v6.4s, v7.4s }}, [x1], #64
+
+    prfm        pldl1keep, [x1, #1024]
+    prfm        pldl1keep, [x1, #1088]
+    prfm        pldl1keep, [x1, #1152]
+    prfm        pldl1keep, [x1, #1216]
+    prfm        pldl1keep, [x2, #256]
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1_core.tmpl
new file mode 100644
index 000000000..1f0c6ce41
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_64x1_core.tmpl
@@ -0,0 +1,225 @@
+// vim: ft=arm
+
+// C tile regs:
+// - x19-x29 to preserve (but x19, x28, x29 not used) 
+// - d8..d15 to preserve
+// - v16 to v31, no need to preserve
+// 
+//      v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0]
+//      v16[1] v18[1] 
+//      v16[2] v18[2] 
+//      v16[3] v18[3]
+//                     
+//      v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0]
+//      v17[1] v19[1] 
+//      v17[2] v19[2] 
+//      v17[3] v19[3] 
+
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_mmm_f32_64x1_{{core}}_{{suffix}}
+{{G}}arm64simd_mmm_f32_64x1_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+    sub         x3, x3, #1
+
+
+    ld1         { v8.s }[0], [ x2 ], #4
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
+    ld1         { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64
+
+    cmp         x3, #0
+    beq         .packed_packed_loop_1_last
+
+    cmp         x3, #4
+    blt        .packed_packed_loop_1
+
+{% capture packed_packed_loop1 %}
+    {% if core == "a53" %}
+        {% include "arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli" %}
+    {% else %}
+        {% include "arm64simd_mmm_f32_64x1/loop1/naive.tmpli" %}
+    {% endif %}
+{% endcapture %}
+
+{% capture packed_packed_loop2 %}
+    {% if core == "a53" %}
+        {{ packed_packed_loop1 }}
+        {{ packed_packed_loop1 }}
+    {% elsif core == "a55" %}
+        {% include "arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli" %}
+    {% else %}
+        {% include "arm64simd_mmm_f32_64x1/loop2/naive.tmpli" %}
+    {% endif %}
+{% endcapture %}
+
+.p2align 4
+.packed_packed_loop_4:
+    {{ packed_packed_loop2 }}
+    {{ packed_packed_loop2 }}
+
+    sub         x3, x3, #4
+    cmp         x3, #4
+    bge         .packed_packed_loop_4
+
+    cmp         x3, #0
+    beq         .packed_packed_loop_1_last
+
+.p2align 4
+.packed_packed_loop_1:
+    {{ packed_packed_loop1 }}
+
+    subs        x3, x3, #1
+    bne         .packed_packed_loop_1
+
+// last loop can't read beyond actual input as it's likely not packed and padded
+.packed_packed_loop_1_last:
+    ld1         { v9.4s, v10.4s, v11.4s, v12.4s }, [x1], #64
+    ld1         { v13.4s, v14.4s, v15.4s }, [x1], #48
+
+    fmla        v16.4s, v0.4s, v8.s[0]
+    fmla        v17.4s, v1.4s, v8.s[0]
+    ld1         { v0.4s }, [ x1 ]
+    fmla        v18.4s, v2.4s, v8.s[0]
+    fmla        v19.4s, v3.4s, v8.s[0]
+    fmla        v20.4s, v4.4s, v8.s[0]
+    fmla        v21.4s, v5.4s, v8.s[0]
+    fmla        v22.4s, v6.4s, v8.s[0]
+    fmla        v23.4s, v7.4s, v8.s[0]
+
+    fmla        v24.4s, v9.4s, v8.s[0]
+    fmla        v25.4s, v10.4s, v8.s[0]
+    fmla        v26.4s, v11.4s, v8.s[0]
+    fmla        v27.4s, v12.4s, v8.s[0]
+    fmla        v28.4s, v13.4s, v8.s[0]
+    fmla        v29.4s, v14.4s, v8.s[0]
+    fmla        v30.4s, v15.4s, v8.s[0]
+    fmla        v31.4s, v0.4s, v8.s[0]
+
+    b           .non_linear_loop
+
+{% include "arm64simd_mmm_f32_scalars.tmpliq" from:16, to:31%}
+{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:64, from:16, to:31%}
+{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:64, from:16, to:31%}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]           // c base ptr, rsc
+    cmp         x6, #4
+    beq         .do_per_row_add
+
+    {% for reg in (16..31) %}
+        {% for lane in (0..3) %}
+            ld1 {v0.s}[{{lane}}], [ x5 ], x6
+        {% endfor %}
+        fadd v{{reg}}.4s, v{{reg}}.4s, v0.4s
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.do_per_row_add:
+    ld1     {v0.4s-v3.4s}, [x5], #64
+    ld1     {v4.4s-v7.4s}, [x5], #64
+    ld1     {v8.4s-v11.4s}, [x5], #64
+    ld1     {v12.4s-v15.4s}, [x5], #64
+
+    {% for r in (0..15) %}
+        fadd v{{r| plus: 16}}.4s, v{{r | plus: 16}}.4s, v{{r}}.4s
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x3, [x0, #16]
+    ldr     x2, [x0, #8]
+
+    ld1         {v8.s}[0], [ x3 ]
+
+    {% for r in (0..7) %}
+        ldr     q{{r}}, [x2], #16
+    {% endfor %}
+
+    fmla        v16.4s, v0.4s, v8.s[0]
+    ldr         q0, [x2], #16
+    fmla        v17.4s, v1.4s, v8.s[0] 
+    ldr         q1, [x2], #16
+    fmla        v18.4s, v2.4s, v8.s[0] 
+    ldr         q2, [x2], #16
+    fmla        v19.4s, v3.4s, v8.s[0] 
+    ldr         q3, [x2], #16
+    fmla        v20.4s, v4.4s, v8.s[0] 
+    ldr         q4, [x2], #16
+    fmla        v21.4s, v5.4s, v8.s[0] 
+    ldr         q5, [x2], #16
+    fmla        v22.4s, v6.4s, v8.s[0] 
+    ldr         q6, [x2], #16
+    fmla        v23.4s, v7.4s, v8.s[0] 
+    ldr         q7, [x2], #16
+
+    fmla        v24.4s, v0.4s, v8.s[0]
+    fmla        v25.4s, v1.4s, v8.s[0] 
+    fmla        v26.4s, v2.4s, v8.s[0] 
+    fmla        v27.4s, v3.4s, v8.s[0] 
+    fmla        v28.4s, v4.4s, v8.s[0] 
+    fmla        v29.4s, v5.4s, v8.s[0] 
+    fmla        v30.4s, v6.4s, v8.s[0] 
+    fmla        v31.4s, v7.4s, v8.s[0] 
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]                // c base ptr, rsc$
+
+    cmp         x6, #4
+    beq         .store_strides_contig
+
+    {% for reg in (16..31) %}
+        {% for lane in (0..3) %}
+            st1 { v{{reg}}.s }[{{lane}}], [ x5 ], x6
+        {% endfor %}
+    {% endfor %}
+    b           .non_linear_loop
+
+.store_strides_contig:
+
+    {% for reg in (16..31) %}
+        st1 { v{{reg}}.4s }, [ x5 ], #16
+    {% endfor %}
+    b           .non_linear_loop
+
+.return:
+
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli
new file mode 100644
index 000000000..9b3035b21
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli
@@ -0,0 +1,25 @@
+ld1         {{ v2.4s, v3.4s }}, [x1], #32
+ld1         {{ v6.4s, v7.4s }}, [x2], #32
+
+fmla        v16.4s, v0.4s, v4.s[0]
+fmla        v17.4s, v1.4s, v4.s[0]
+fmla        v18.4s, v0.4s, v4.s[1]
+fmla        v19.4s, v1.4s, v4.s[1]
+fmla        v20.4s, v0.4s, v4.s[2]
+fmla        v21.4s, v1.4s, v4.s[2]
+fmla        v22.4s, v0.4s, v4.s[3]
+fmla        v23.4s, v1.4s, v4.s[3]
+
+fmla        v24.4s, v0.4s, v5.s[0]
+fmla        v25.4s, v1.4s, v5.s[0]
+fmla        v26.4s, v0.4s, v5.s[1]
+fmla        v27.4s, v1.4s, v5.s[1]
+fmla        v28.4s, v0.4s, v5.s[2]
+fmla        v29.4s, v1.4s, v5.s[2]
+fmla        v30.4s, v0.4s, v5.s[3]
+fmla        v31.4s, v1.4s, v5.s[3]
+
+and         v0.16b, v2.16b, v2.16b
+and         v1.16b, v3.16b, v3.16b
+and         v4.16b, v6.16b, v6.16b
+and         v5.16b, v7.16b, v7.16b
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli
new file mode 100644
index 000000000..ac5bdc5bb
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli
@@ -0,0 +1,51 @@
+
+fmla        v16.4s, v0.4s, v4.s[0]
+ldr         w5, [x1], #4
+fmla        v17.4s, v1.4s, v4.s[0]
+ldr         w20, [x2], #4
+fmla        v18.4s, v0.4s, v4.s[1]
+ldr         w6, [x1], #4
+fmla        v20.4s, v1.4s, v4.s[1]
+ldr         w21, [x2], #4
+fmla        v20.4s, v0.4s, v4.s[2]
+ldr         w7, [x1], #4
+fmla        v21.4s, v1.4s, v4.s[2]
+ldr         w22, [x2], #4
+fmla        v22.4s, v0.4s, v4.s[3]
+ldr         w8, [x1], #4
+fmla        v23.4s, v1.4s, v4.s[3]
+ldr         w23, [x2], #4
+
+fmla        v24.4s, v0.4s, v5.s[0]
+ldr         w9, [x1], #4
+fmla        v25.4s, v1.4s, v5.s[0]
+ldr         w24, [x2], #4
+fmla        v26.4s, v0.4s, v5.s[1]
+ldr         w10, [x1], #4
+fmla        v27.4s, v1.4s, v5.s[1]
+ldr         w25, [x2], #4
+fmla        v28.4s, v0.4s, v5.s[2]
+ldr         w11, [x1], #4
+fmla        v29.4s, v1.4s, v5.s[2]
+ldr         w26, [x2], #4
+fmla        v30.4s, v0.4s, v5.s[3]
+ldr         w12, [x1], #4
+fmla        v31.4s, v1.4s, v5.s[3]
+ldr         w27, [x2], #4
+
+ins         v0.s[0], w5
+ins         v4.s[0], w20
+ins         v1.s[0], w9
+ins         v5.s[0], w24
+ins         v0.s[2], w7
+ins         v4.s[2], w22
+ins         v1.s[2], w11
+ins         v5.s[2], w26
+ins         v0.s[1], w6
+ins         v4.s[1], w21
+ins         v1.s[1], w10
+ins         v5.s[1], w25
+ins         v0.s[3], w8
+ins         v4.s[3], w23
+ins         v1.s[3], w12
+ins         v5.s[3], w27
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli
new file mode 100644
index 000000000..3afc78c7b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli
@@ -0,0 +1,54 @@
+fmla        v16.4s, v0.4s, v4.s[0]
+ldr         w5, [x1], #4
+fmla        v17.4s, v1.4s, v4.s[0]
+ldr         w20, [x2], #4
+fmla        v18.4s, v0.4s, v4.s[1]
+ldr         w6, [x1], #4
+fmla        v19.4s, v1.4s, v4.s[1]
+ldr         w21, [x2], #4
+fmla        v20.4s, v0.4s, v4.s[2]
+ldr         w7, [x1], #4
+fmla        v21.4s, v1.4s, v4.s[2]
+ldr         w22, [x2], #4
+fmla        v22.4s, v0.4s, v4.s[3]
+ldr         w8, [x1], #4
+fmla        v23.4s, v1.4s, v4.s[3]
+ldr         w23, [x2], #4
+
+fmla        v24.4s, v0.4s, v5.s[0]
+ldr         w9, [x1], #4
+fmla        v25.4s, v1.4s, v5.s[0]
+ldr         w24, [x2], #4
+fmla        v26.4s, v0.4s, v5.s[1]
+ldr         w10, [x1], #4
+fmla        v27.4s, v1.4s, v5.s[1]
+ldr         w25, [x2], #4
+fmla        v28.4s, v0.4s, v5.s[2]
+ldr         w11, [x1], #4
+fmla        v29.4s, v1.4s, v5.s[2]
+ldr         w26, [x2], #4
+fmla        v30.4s, v0.4s, v5.s[3]
+ldr         w12, [x1], #4
+fmla        v31.4s, v1.4s, v5.s[3]
+ldr         w27, [x2], #4
+
+prfm        pldl1keep, [x1, #256]
+prfm        pldl1keep, [x2, #256]
+
+ins         v0.s[0], w5
+ins         v4.s[0], w20
+ins         v1.s[0], w9
+ins         v5.s[0], w24
+ins         v0.s[2], w7
+ins         v4.s[2], w22
+ins         v1.s[2], w11
+ins         v5.s[2], w26
+ins         v0.s[1], w6
+ins         v4.s[1], w21
+ins         v1.s[1], w10
+ins         v5.s[1], w25
+ins         v0.s[3], w8
+ins         v4.s[3], w23
+ins         v1.s[3], w12
+ins         v5.s[3], w27
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli
new file mode 100644
index 000000000..e3822d347
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli
@@ -0,0 +1,35 @@
+
+fmla        v16.4s, v0.4s, v4.s[0]
+ldr         x5, [x1], #8
+fmla        v17.4s, v1.4s, v4.s[0]
+ldr         x9, [x2], #8
+fmla        v18.4s, v0.4s, v4.s[1]
+ldr         x6, [x1], #8
+fmla        v19.4s, v1.4s, v4.s[1]
+ldr         x10, [x2], #8
+fmla        v20.4s, v0.4s, v4.s[2]
+ldr         x7, [x1], #8
+fmla        v21.4s, v1.4s, v4.s[2]
+ldr         x11, [x2], #8
+fmla        v22.4s, v0.4s, v4.s[3]
+ldr         x8, [x1], #8
+fmla        v23.4s, v1.4s, v4.s[3]
+ldr         x12, [x2], #8
+
+fmla        v24.4s, v0.4s, v5.s[0]
+fmla        v25.4s, v1.4s, v5.s[0]
+fmla        v26.4s, v0.4s, v5.s[1]
+fmla        v27.4s, v1.4s, v5.s[1]
+fmla        v28.4s, v0.4s, v5.s[2]
+fmla        v29.4s, v1.4s, v5.s[2]
+fmla        v30.4s, v0.4s, v5.s[3]
+fmla        v31.4s, v1.4s, v5.s[3]
+
+ins         v2.d[0], x5
+ins         v6.d[0], x9
+ins         v3.d[0], x7
+ins         v7.d[0], x11
+ins         v2.d[1], x6
+ins         v6.d[1], x10
+ins         v3.d[1], x8
+ins         v7.d[1], x12
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli
new file mode 100644
index 000000000..11081e84f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli
@@ -0,0 +1,43 @@
+
+fmla        v16.4s, v0.4s, v4.s[0]
+ldr         x5, [x1], #8
+fmla        v17.4s, v1.4s, v4.s[0]
+ldr         x9, [x2], #8
+fmla        v18.4s, v0.4s, v4.s[1]
+ldr         x6, [x1], #8
+fmla        v19.4s, v1.4s, v4.s[1]
+ldr         x10, [x2], #8
+fmla        v20.4s, v0.4s, v4.s[2]
+ldr         x7, [x1], #8
+fmla        v21.4s, v1.4s, v4.s[2]
+ldr         x11, [x2], #8
+fmla        v22.4s, v0.4s, v4.s[3]
+ldr         x8, [x1], #8
+fmla        v23.4s, v1.4s, v4.s[3]
+ldr         x12, [x2], #8
+
+fmla        v24.4s, v0.4s, v5.s[0]
+prfm        pldl1keep, [x1, #256]
+fmla        v25.4s, v1.4s, v5.s[0]
+prfm        pldl1keep, [x1, #320]
+fmla        v26.4s, v0.4s, v5.s[1]
+prfm        pldl1keep, [x1, #384]
+fmla        v27.4s, v1.4s, v5.s[1]
+prfm        pldl1keep, [x1, #448]
+fmla        v28.4s, v0.4s, v5.s[2]
+prfm        pldl1keep, [x2, #256]
+fmla        v29.4s, v1.4s, v5.s[2]
+prfm        pldl1keep, [x2, #320]
+fmla        v30.4s, v0.4s, v5.s[3]
+prfm        pldl1keep, [x2, #384]
+fmla        v31.4s, v1.4s, v5.s[3]
+prfm        pldl1keep, [x2, #448]
+
+ins         v0.d[0], x5
+ins         v4.d[0], x9
+ins         v1.d[0], x7
+ins         v5.d[0], x11
+ins         v0.d[1], x6
+ins         v4.d[1], x10
+ins         v1.d[1], x8
+ins         v5.d[1], x12
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli
new file mode 100644
index 000000000..14abb2a87
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli
@@ -0,0 +1,21 @@
+
+fmla        v16.4s, v0.4s, v4.s[0]
+fmla        v17.4s, v1.4s, v4.s[0]
+fmla        v18.4s, v0.4s, v4.s[1]
+fmla        v19.4s, v1.4s, v4.s[1]
+fmla        v20.4s, v0.4s, v4.s[2]
+fmla        v21.4s, v1.4s, v4.s[2]
+fmla        v22.4s, v0.4s, v4.s[3]
+fmla        v23.4s, v1.4s, v4.s[3]
+
+fmla        v24.4s, v0.4s, v5.s[0]
+fmla        v25.4s, v1.4s, v5.s[0]
+fmla        v26.4s, v0.4s, v5.s[1]
+fmla        v27.4s, v1.4s, v5.s[1]
+fmla        v28.4s, v0.4s, v5.s[2]
+fmla        v29.4s, v1.4s, v5.s[2]
+fmla        v30.4s, v0.4s, v5.s[3]
+fmla        v31.4s, v1.4s, v5.s[3]
+
+ld1         {{ v0.4s, v1.4s }}, [x1], #32
+ld1         {{ v4.4s, v5.4s }}, [x2], #32
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli
new file mode 100644
index 000000000..5235ac6c2
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli
@@ -0,0 +1,41 @@
+ld1         {{ v2.4s, v3.4s }}, [x1], #32
+ld1         {{ v6.4s, v7.4s }}, [x2], #32
+
+fmla        v16.4s, v0.4s, v4.s[0]
+fmla        v17.4s, v1.4s, v4.s[0]
+fmla        v18.4s, v0.4s, v4.s[1]
+fmla        v19.4s, v1.4s, v4.s[1]
+fmla        v20.4s, v0.4s, v4.s[2]
+fmla        v21.4s, v1.4s, v4.s[2]
+fmla        v22.4s, v0.4s, v4.s[3]
+fmla        v23.4s, v1.4s, v4.s[3]
+
+fmla        v24.4s, v0.4s, v5.s[0]
+fmla        v25.4s, v1.4s, v5.s[0]
+fmla        v26.4s, v0.4s, v5.s[1]
+fmla        v27.4s, v1.4s, v5.s[1]
+fmla        v28.4s, v0.4s, v5.s[2]
+fmla        v29.4s, v1.4s, v5.s[2]
+fmla        v30.4s, v0.4s, v5.s[3]
+fmla        v31.4s, v1.4s, v5.s[3]
+
+ld1         {{ v0.4s, v1.4s }}, [x1], #32
+ld1         {{ v4.4s, v5.4s }}, [x2], #32
+
+fmla        v16.4s, v2.4s, v6.s[0]
+fmla        v17.4s, v3.4s, v6.s[0]
+fmla        v18.4s, v2.4s, v6.s[1]
+fmla        v19.4s, v3.4s, v6.s[1]
+fmla        v20.4s, v2.4s, v6.s[2]
+fmla        v21.4s, v3.4s, v6.s[2]
+fmla        v22.4s, v2.4s, v6.s[3]
+fmla        v23.4s, v3.4s, v6.s[3]
+
+fmla        v24.4s, v2.4s, v7.s[0]
+fmla        v25.4s, v3.4s, v7.s[0]
+fmla        v26.4s, v2.4s, v7.s[1]
+fmla        v27.4s, v3.4s, v7.s[1]
+fmla        v28.4s, v2.4s, v7.s[2]
+fmla        v29.4s, v3.4s, v7.s[2]
+fmla        v30.4s, v2.4s, v7.s[3]
+fmla        v31.4s, v3.4s, v7.s[3]
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli
new file mode 100644
index 000000000..7f8759688
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli
@@ -0,0 +1,60 @@
+fmla        v16.4s, v0.4s, v4.s[0]
+ldr         d2, [x1], #8
+fmla        v17.4s, v1.4s, v4.s[0]
+ldr         d6, [x2], #8
+fmla        v18.4s, v0.4s, v4.s[1]
+ldr         x5, [x1], #8
+fmla        v19.4s, v1.4s, v4.s[1]
+ldr         x7, [x2], #8
+fmla        v20.4s, v0.4s, v4.s[2]
+ldr         d3, [x1], #8
+fmla        v21.4s, v1.4s, v4.s[2]
+ldr         d7, [x2], #8
+fmla        v22.4s, v0.4s, v4.s[3]
+ldr         x6, [x1], #8
+fmla        v23.4s, v1.4s, v4.s[3]
+ldr         x8, [x2], #8
+
+fmla        v24.4s, v0.4s, v5.s[0]
+fmla        v25.4s, v1.4s, v5.s[0]
+fmla        v26.4s, v0.4s, v5.s[1]
+fmla        v27.4s, v1.4s, v5.s[1]
+fmla        v28.4s, v0.4s, v5.s[2]
+ins         v2.d[1], x5
+fmla        v29.4s, v1.4s, v5.s[2]
+ins         v6.d[1], x7
+fmla        v30.4s, v0.4s, v5.s[3]
+ins         v3.d[1], x6
+fmla        v31.4s, v1.4s, v5.s[3]
+ins         v7.d[1], x8
+
+fmla        v16.4s, v2.4s, v6.s[0]
+ldr         d0, [x1], #8
+fmla        v17.4s, v3.4s, v6.s[0]
+ldr         d4, [x2], #8
+fmla        v18.4s, v2.4s, v6.s[1]
+ldr         x5, [x1], #8
+fmla        v19.4s, v3.4s, v6.s[1]
+ldr         x7, [x2], #8
+fmla        v20.4s, v2.4s, v6.s[2]
+ldr         d1, [x1], #8
+fmla        v21.4s, v3.4s, v6.s[2]
+ldr         d5, [x2], #8
+fmla        v22.4s, v2.4s, v6.s[3]
+ldr         x6, [x1], #8
+fmla        v23.4s, v3.4s, v6.s[3]
+ldr         x8, [x2], #8
+
+fmla        v24.4s, v2.4s, v7.s[0]
+fmla        v25.4s, v3.4s, v7.s[0]
+fmla        v26.4s, v2.4s, v7.s[1]
+fmla        v27.4s, v3.4s, v7.s[1]
+fmla        v28.4s, v2.4s, v7.s[2]
+ins         v0.d[1], x5
+fmla        v29.4s, v3.4s, v7.s[2]
+ins         v4.d[1], x7
+fmla        v30.4s, v2.4s, v7.s[3]
+ins         v1.d[1], x6
+fmla        v31.4s, v3.4s, v7.s[3]
+ins         v5.d[1], x8
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8_core.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8_core.tmpl
new file mode 100644
index 000000000..8bef26cc5
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_8x8_core.tmpl
@@ -0,0 +1,182 @@
+// vim: ft=arm
+
+// C tile regs: v16 to v31, (scratch)
+// - x19-x29 to preserve (but x19, x28, x29 not used) 
+// - d8..d15 to preserve
+// - v16 to v31, no need to preserve
+// 
+//      v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0]
+//      v16[1] v18[1] 
+//      v16[2] v18[2] 
+//      v16[3] v18[3]
+//                     
+//      v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0]
+//      v17[1] v19[1] 
+//      v17[2] v19[2] 
+//      v17[3] v19[3] 
+
+// v0-v7 (scratch registers)
+//  packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+//  packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_mmm_f32_8x8_{{core}}_{{suffix}}
+{{G}}arm64simd_mmm_f32_8x8_{{core}}_{{suffix}}:
+
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+    
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldr         x2, [x0, #24]       // b
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+.packed_packed:
+    ld1         { v0.4s, v1.4s }, [ x1 ], #32
+    ld1         { v4.4s, v5.4s }, [ x2 ], #32
+
+{% capture packed_packed_loop1 %}
+    {% if core == "a53" %}
+        {% include "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli" %}
+    {% else %}
+        {% include "arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli" %}
+    {% endif %}
+{% endcapture %}
+
+{% capture packed_packed_loop2 %}
+    {% if core == "a55" %}
+        {% include "arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli" %}
+    {% else %}
+        {{ packed_packed_loop1 }}
+        {{ packed_packed_loop1 }}
+    {% endif %}
+{% endcapture %}
+
+    cmp         x3, #4
+    blt         .packed_packed_loop_1
+
+.p2align 4
+.packed_packed_loop_4:
+    {{ packed_packed_loop2 }}
+    {{ packed_packed_loop2 }}
+
+    sub x3, x3, #4
+    cmp x3, #4
+    bge .packed_packed_loop_4
+
+
+    cmp x3, #0
+    beq .non_linear_loop
+
+.p2align 4
+.packed_packed_loop_1:
+    {{ packed_packed_loop1 }}
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+{% include "arm64simd_mmm_f32_scalars.tmpliq" from:16, to:31%}
+{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:8, from:16, to:31 %}
+{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:8, from:16, to:31 %}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    {% for col in (8..15) %}
+        mov x4, x5
+        {% for reg in (0..1) %}
+            {% for lane in (0..3) %}
+                ld1 {v0.s}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            fadd v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x2, [x0, #8]
+    ldr     x3, [x0, #16]
+
+    ld1         { v0.4s, v1.4s }, [ x2 ], #32
+    ld1         { v4.4s, v5.4s }, [ x3 ], #32
+
+    fmla        v16.4s, v0.4s, v4.s[0]
+    fmla        v17.4s, v1.4s, v4.s[0]
+    fmla        v18.4s, v0.4s, v4.s[1]
+    fmla        v19.4s, v1.4s, v4.s[1]
+    fmla        v20.4s, v0.4s, v4.s[2]
+    fmla        v21.4s, v1.4s, v4.s[2]
+    fmla        v22.4s, v0.4s, v4.s[3]
+    fmla        v23.4s, v1.4s, v4.s[3]
+
+    fmla        v24.4s, v0.4s, v5.s[0]
+    fmla        v25.4s, v1.4s, v5.s[0]
+    fmla        v26.4s, v0.4s, v5.s[1]
+    fmla        v27.4s, v1.4s, v5.s[1]
+    fmla        v28.4s, v0.4s, v5.s[2]
+    fmla        v29.4s, v1.4s, v5.s[2]
+    fmla        v30.4s, v0.4s, v5.s[3]
+    fmla        v31.4s, v1.4s, v5.s[3]
+
+    b           .non_linear_loop
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x6, #4
+    bne         .store_strides_generic
+
+    {% for col in (8..15) %}
+        str q{{col | times:2 }}, [ x5 ]
+        str q{{col | times:2 | plus: 1}}, [ x5, #16 ]
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_strides_generic:
+
+    {% for col in (8..15) %}
+        mov x4, x5
+        {% for reg in (0..1) %}
+            {% for lane in (0..3) %}
+                st1 { v{{col | times:2 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_cols.tmpliq
new file mode 100644
index 000000000..ce1ffe1f1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_cols.tmpliq
@@ -0,0 +1,9 @@
+// vim: ft=arm
+
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_min", op:"fmin", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_max", op:"fmax", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_mul", op:"fmul", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_add", op:"fadd", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_sub", op:"fsub", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_sub_flipped", op:"fsub", mr:mr, from:from, to:to, flipped: true%}
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_rows.tmpliq
new file mode 100644
index 000000000..c518a6b4e
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_per_rows.tmpliq
@@ -0,0 +1,9 @@
+// vim: ft=arm
+
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_min", op:"fmin", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_max", op:"fmax", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_mul", op:"fmul", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_add", op:"fadd", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_sub", op:"fsub", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_sub_flipped", op:"fsub", mr:mr, from:from, to:to, flipped: true%}
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_scalars.tmpliq
new file mode 100644
index 000000000..cc053df02
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_f32_scalars.tmpliq
@@ -0,0 +1,36 @@
+// vim: ft=arm
+
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_min", op:"fmin", from:from, to:to %}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_max", op:"fmax", from:from, to:to %}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_mul", op:"fmul", from:from, to:to %}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_add", op:"fadd", from:from, to:to %}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_sub", op:"fsub", from:from, to:to %}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_sub_flipped", op:"fsub", from:from, to:to, flipped:true %}
+
+
+.clear:
+{% for r in (from..to) %}
+    eor         v{{r}}.8b, v{{r}}.8b, v{{r}}.8b
+{% endfor %}
+    b .non_linear_loop
+
+.leaky_relu:
+    add         x2, x0, #8
+    ld1         {v4.s}[0], [ x2 ]
+    dup         v4.4s, v4.s[0]
+
+    // bsl cond/dst, then, else
+    // fcmge dst, src, #0.0
+    {% for r in (from..to) %}
+        fmul  v0.4s, v{{r}}.4s, v4.4s
+        fcmge v1.4s, v{{r}}.4s, #0.0
+        bsl   v1.16b, v{{r}}.16b, v0.16b
+        and   v{{r}}.16b, v1.16b, v1.16b
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale:
+.q_shl:
+.q_shr:
+    b .unsupported
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_64x1.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_64x1.tmpl
new file mode 100644
index 000000000..45192c8a5
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_64x1.tmpl
@@ -0,0 +1,180 @@
+// vim: ft=arm
+
+// C tile regs: 
+// - x19-x29 to preserve (but x19, x28, x29 not used) 
+// - d8..d15 to preserve
+// - v16 to v31, no need to preserve
+
+// no preservation either for v0-v7...
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_mmm_i32_64x1_{{suffix}}
+{{G}}arm64simd_mmm_i32_64x1_{{suffix}}:
+
+/*
+    prfm        pldl1keep, [x1]
+    prfm        pldl1keep, [x2]
+*/
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldp         x2, x4, [x0, #24]   // b, packing
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    cmp         x4, #1
+    beq         .packed_packed_loop_1_i8i8
+
+.packed_packed_loop_1:
+    ld1         {v9.s}[0], [ x2 ], 4
+
+    ld1	        { v0.4s-v3.4s }, [ x1 ], #64
+    ld1	        { v4.4s-v7.4s }, [ x1 ], #64
+    {% for reg in (0..3) %}
+        mla      v{{reg | times: 2 | plus: 16 }}.4s, v{{reg | times:2}}.4s, v9.s[0]
+        mla      v{{reg | times: 2 | plus: 17 }}.4s, v{{reg | times:2 | plus:1}}.4s, v9.s[0]
+    {% endfor %}
+
+    ld1	        { v0.4s-v3.4s }, [ x1 ], #64
+    ld1	        { v4.4s-v7.4s }, [ x1 ], #64
+    {% for reg in (0..3) %}
+        mla      v{{reg | times: 2 | plus: 24 }}.4s, v{{reg | times:2}}.4s, v9.s[0]
+        mla      v{{reg | times: 2 | plus: 25 }}.4s, v{{reg | times:2 | plus:1}}.4s, v9.s[0]
+    {% endfor %}
+
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1
+    
+    b .non_linear_loop
+
+.packed_packed_loop_1_i8i8:
+    ld1         {v9.b}[0], [ x2 ], 1
+    sshll       v9.8h, v9.8b, 0
+
+    ld1	        { v0.8b-v3.8b }, [ x1 ], #32
+    ld1	        { v4.8b-v7.8b }, [ x1 ], #32
+
+    {% for reg in (0..7) %}
+        sshll       v10.8h, v{{reg}}.8b, 0
+        smlal       v{{reg | times: 2 | plus: 16 }}.4s, v10.4h, v9.h[0]
+        smlal2      v{{reg | times: 2 | plus: 17 }}.4s, v10.8h, v9.h[0]
+    {% endfor %}
+
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1_i8i8
+
+    b .non_linear_loop
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    cmp         x8, #4
+    beq         non_linear_addc_i32
+
+    {% for reg in (16..31) %}
+        {% for lane in (0..3) %}
+            ld1 {v0.b}[{{lane}}], [ x5 ], x6
+        {% endfor %}
+        sshll v0.8h, v0.8b, 0
+        sshll v0.4s, v0.4h, 0
+        add v{{reg}}.4s, v{{reg}}.4s, v0.4s
+    {% endfor %}
+
+    b           .non_linear_loop
+
+non_linear_addc_i32:
+    {% for reg in (16..31) %}
+        {% for lane in (0..3) %}
+            ld1 {v0.s}[{{lane}}], [ x5 ], x6
+        {% endfor %}
+        add v{{reg}}.4s, v{{reg}}.4s, v0.4s
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x2, [x0, #8]
+    ldr     x3, [x0, #16]
+
+    ld1         { v15.s }[0], [ x3 ]
+    xtn         v15.4h, v15.4s
+
+    ld1         { v0.4s-v3.4s }, [ x2 ], #64
+    ld1         { v4.4s-v7.4s }, [ x2 ], #64
+
+    {% for reg in (0..7) %}
+        xtn         v{{reg}}.4h, v{{reg}}.4s
+        smlal        v{{reg|plus: 16}}.4s, v{{reg}}.4h, v15.h[0]
+    {% endfor %}
+
+    ld1         { v0.4s-v3.4s }, [ x2 ], #64
+    ld1         { v4.4s-v7.4s }, [ x2 ], #64
+
+    {% for reg in (0..7) %}
+        xtn         v{{reg}}.4h, v{{reg}}.4s
+        smlal        v{{reg|plus: 24}}.4s, v{{reg}}.4h, v15.h[0]
+    {% endfor %}
+
+    b           .non_linear_loop
+
+{% include "arm64simd_mmm_i32_scalars.tmpliq" from:16, to:31 %}
+{% include "arm64simd_mmm_i32_per_rows.tmpliq" mr:64, from:16, to:31 %}
+{% include "arm64simd_mmm_i32_per_cols.tmpliq" mr:64, from:16, to:31 %}
+{% include "arm64simd_mmm_i32_scale_q16_q31.tmpliq" %}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %}
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x8, #4
+    beq         .store_strides_i32
+
+    {% for reg in (16..31) %}
+        {% for lane in (0..3) %}
+            st1 { v{{reg}}.b }[{{lane | times: 4}}], [ x5 ], x6
+        {% endfor %}
+    {% endfor %}
+
+    b   .non_linear_loop
+
+.store_strides_i32:
+    {% for reg in (16..31) %}
+        {% for lane in (0..3) %}
+            st1 { v{{reg}}.s }[{{lane}}], [ x5 ], x6
+        {% endfor %}
+    {% endfor %}
+
+    b   .non_linear_loop
+
+.return:
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+    
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_8x8.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_8x8.tmpl
new file mode 100644
index 000000000..5aae3dc91
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_8x8.tmpl
@@ -0,0 +1,234 @@
+// vim: ft=arm
+
+// C tile regs:
+// - x19-x29 to preserve (but x19, x28, x29 not used) 
+// - d8..d15 to preserve
+// - v16 to v31, no need to preserve
+// 
+//      v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0]
+//      v16[1] v18[1] 
+//      v16[2] v18[2] 
+//      v16[3] v18[3]
+//                     
+//      v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0]
+//      v17[1] v19[1] 
+//      v17[2] v19[2] 
+//      v17[3] v19[3] 
+
+// no preservation either for v0-v7...
+// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
+// packed B buffering (2x8 values): alternating v4, v5 with v6, v7
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_mmm_i32_8x8_{{suffix}}
+{{G}}arm64simd_mmm_i32_8x8_{{suffix}}:
+
+/*
+    prfm        pldl1keep, [x1]
+    prfm        pldl1keep, [x2]
+*/
+    stp         x20, x21, [sp, #-16]!
+    stp         x22, x23, [sp, #-16]!
+    stp         x24, x25, [sp, #-16]!
+    stp         x26, x27, [sp, #-16]!
+
+    stp         d8, d9, [sp, #-16]!
+    stp         d10, d11, [sp, #-16]!
+    stp         d12, d13, [sp, #-16]!
+    stp         d14, d15, [sp, #-16]!
+
+{% include "dispatcher.tmpliq" %}
+
+.add_mat_mul:
+    ldp         x2, x4, [x0, #24]   // b, packing
+    ldp         x3, x1, [x0, #8]    // k, a
+
+    cmp         x3, #0
+    beq         .non_linear_loop
+
+    cmp         x4, #1
+    beq         .packed_packed_loop_1_i8i8
+
+.packed_packed_loop_1:
+
+    ld1	        { v0.4s, v1.4s }, [ x1 ], #32
+    ld1	        { v4.4s, v5.4s }, [ x2 ], #32
+
+    mla         v16.4s, v0.4s, v4.s[0]
+    mla         v17.4s, v1.4s, v4.s[0]
+    mla         v18.4s, v0.4s, v4.s[1]
+    mla         v19.4s, v1.4s, v4.s[1]
+
+    mla         v20.4s, v0.4s, v4.s[2]
+    mla         v21.4s, v1.4s, v4.s[2]
+    mla         v22.4s, v0.4s, v4.s[3]
+    mla         v23.4s, v1.4s, v4.s[3]
+
+    mla         v24.4s, v0.4s, v5.s[0]
+    mla         v25.4s, v1.4s, v5.s[0]
+    mla         v26.4s, v0.4s, v5.s[1]
+    mla         v27.4s, v1.4s, v5.s[1]
+
+    mla         v28.4s, v0.4s, v5.s[2]
+    mla         v29.4s, v1.4s, v5.s[2]
+    mla         v30.4s, v0.4s, v5.s[3]
+    mla         v31.4s, v1.4s, v5.s[3]
+
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1
+
+    b .non_linear_loop
+
+.packed_packed_loop_1_i8i8:
+
+    ld1	        { v0.8b }, [ x1 ], #8
+    sshll       v0.8h, v0.8b, 0
+    ld1         { v4.8b }, [ x2 ], #8
+    sshll        v4.8h, v4.8b, 0
+
+    smlal        v16.4s, v0.4h, v4.h[0]
+    smlal2       v17.4s, v0.8h, v4.h[0]
+    smlal        v18.4s, v0.4h, v4.h[1]
+    smlal2       v19.4s, v0.8h, v4.h[1]
+    smlal        v20.4s, v0.4h, v4.h[2]
+    smlal2       v21.4s, v0.8h, v4.h[2]
+    smlal        v22.4s, v0.4h, v4.h[3]
+    smlal2       v23.4s, v0.8h, v4.h[3]
+
+    smlal        v24.4s, v0.4h, v4.h[4]
+    smlal2       v25.4s, v0.8h, v4.h[4]
+    smlal        v26.4s, v0.4h, v4.h[5]
+    smlal2       v27.4s, v0.8h, v4.h[5]
+    smlal        v28.4s, v0.4h, v4.h[6]
+    smlal2       v29.4s, v0.8h, v4.h[6]
+    smlal        v30.4s, v0.4h, v4.h[7]
+    smlal2       v31.4s, v0.8h, v4.h[7]
+
+    subs        x3, x3, #1
+    bne .packed_packed_loop_1_i8i8
+
+    b .non_linear_loop
+
+{% include "arm64simd_mmm_i32_scalars.tmpliq" from:16, to:31%}
+{% include "arm64simd_mmm_i32_per_rows.tmpliq" mr:8, from:16, to:31%}
+{% include "arm64simd_mmm_i32_per_cols.tmpliq" mr:8, from:16, to:31%}
+{% include "arm64simd_mmm_load_tile.tmpliq" from:16, to:31 %}
+
+.add_unicast:
+    ldp         x5, x6, [x0, #8]
+    ldp         x7, x8, [x0, #24]
+
+    cmp         x8, #4
+    beq         non_linear_addc_i32
+
+    {% for col in (8..15) %}
+        mov x4, x5
+        {% for reg in (0..1) %}
+            {% for lane in (0..3) %}
+                ld1 {v0.b}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            sshll v0.8h, v0.8b, 0
+            sshll v0.4s, v0.4h, 0
+            add v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+non_linear_addc_i32:
+    {% for col in (8..15) %}
+        mov x4, x5
+        {% for reg in (0..1) %}
+            {% for lane in (0..3) %}
+                ld1 {v0.s}[{{lane}}], [ x4 ], x6
+            {% endfor %}
+            add v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.add_row_col_products:
+    ldr     x2, [x0, #8]
+    ldr     x3, [x0, #16]
+
+    ld1         { v0.4s, v1.4s }, [ x2 ]
+    ld1         { v4.4s, v5.4s }, [ x3 ]
+
+    xtn         v0.4h, v0.4s
+    xtn         v1.4h, v1.4s
+    xtn         v4.4h, v4.4s
+    xtn         v5.4h, v5.4s
+
+    smlal        v16.4s, v0.4h, v4.h[0]
+    smlal        v17.4s, v1.4h, v4.h[0]
+    smlal        v18.4s, v0.4h, v4.h[1]
+    smlal        v19.4s, v1.4h, v4.h[1]
+    smlal        v20.4s, v0.4h, v4.h[2]
+    smlal        v21.4s, v1.4h, v4.h[2]
+    smlal        v22.4s, v0.4h, v4.h[3]
+    smlal        v23.4s, v1.4h, v4.h[3]
+
+    smlal        v24.4s, v0.4h, v5.h[0]
+    smlal        v25.4s, v1.4h, v5.h[0]
+    smlal        v26.4s, v0.4h, v5.h[1]
+    smlal        v27.4s, v1.4h, v5.h[1]
+    smlal        v28.4s, v0.4h, v5.h[2]
+    smlal        v29.4s, v1.4h, v5.h[2]
+    smlal        v30.4s, v0.4h, v5.h[3]
+    smlal        v31.4s, v1.4h, v5.h[3]
+
+    b           .non_linear_loop
+
+    {% include "arm64simd_mmm_i32_scale_q16_q31.tmpliq" %}
+
+.store:
+    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
+    ldp         x7, x8, [x0, #24]           // csc, item_size
+
+    cmp         x8, #4
+    beq         .store_strides_i32
+
+    {% for col in (8..15) %}
+        mov x4, x5
+        {% for reg in (0..1) %}
+            {% for lane in (0..3) %}
+                st1 { v{{col | times:2 | plus: reg}}.b }[{{lane|times:4}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.store_strides_i32:
+    {% for col in (8..15) %}
+        mov x4, x5
+        {% for reg in (0..1) %}
+            {% for lane in (0..3) %}
+                st1 { v{{col | times:2 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
+            {% endfor %}
+        {% endfor %}
+        add x5, x5, x7
+    {% endfor %}
+
+    b           .non_linear_loop
+
+.return:
+    ldp         d14, d15, [sp], #16
+    ldp         d12, d13, [sp], #16
+    ldp         d10, d11, [sp], #16
+    ldp         d8, d9, [sp], #16
+
+    ldp         x26, x27, [sp], #16
+    ldp         x24, x25, [sp], #16
+    ldp         x22, x23, [sp], #16
+    ldp         x20, x21, [sp], #16
+
+    ret
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_cols.tmpliq
new file mode 100644
index 000000000..d770611b1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_cols.tmpliq
@@ -0,0 +1,8 @@
+// vim: ft=arm
+
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_min", op:"smin", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_max", op:"smax", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_mul", op:"mul", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_add", op:"add", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_sub", op:"sub", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_col.tmpliq" label:"per_col_sub_flipped", op:"sub", mr:mr, from:from, to:to, flipped: true %}
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_rows.tmpliq
new file mode 100644
index 000000000..12fdf9d00
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_per_rows.tmpliq
@@ -0,0 +1,8 @@
+// vim: ft=arm
+
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_min", op:"smin", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_max", op:"smax", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_mul", op:"mul", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_add", op:"add", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_sub", op:"sub", mr:mr, from:from, to:to %}
+{% include "arm64simd_mmm_4s_per_row.tmpliq" label:"per_row_sub_flipped", op:"sub", mr:mr, from:from, to:to, flipped: true %}
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scalars.tmpliq
new file mode 100644
index 000000000..9bf2f8264
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scalars.tmpliq
@@ -0,0 +1,31 @@
+// vim: ft=arm
+
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_min", op:"smin", from:from, to:to%}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_max", op:"smax", from:from, to:to%}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_mul", op:"mul", from:from, to:to%}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_add", op:"add", from:from, to:to%}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_sub", op:"sub", from:from, to:to%}
+{% include "arm64simd_mmm_4s_scalar.tmpliq" label:"scalar_sub_flipped", op:"sub", from:from, to:to, flipped:true%}
+
+.clear:
+{% for r in (from..to) %}
+    eor         v{{r}}.8b, v{{r}}.8b, v{{r}}.8b
+{% endfor %}
+    b .non_linear_loop
+
+.leaky_relu:
+    add         x2, x0, #8
+    ld1         {v4.s}[0], [ x2 ]
+    dup         v4.4s, v4.s[0]
+
+    // bsl cond/dst, then, else
+    // fcmge dst, src, #0.0
+    {% for r in (from..to) %}
+        mul   v0.4s, v{{r}}.4s, v4.4s
+        cmge  v1.4s, v{{r}}.4s, #0
+        bsl   v1.16b, v{{r}}.16b, v0.16b
+        and   v{{r}}.16b, v1.16b, v1.16b
+    {% endfor %}
+
+    b .non_linear_loop
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scale_q16_q31.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scale_q16_q31.tmpliq
new file mode 100644
index 000000000..fec2f539d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_i32_scale_q16_q31.tmpliq
@@ -0,0 +1,267 @@
+
+// vim: ft=arm
+
+.q_scale:
+    ldp     x5, x6, [x0, #8]            // x5: shift, x6: policy
+    add     x2, x0, #24
+    ld1r    { v2.4s }, [x2]             // v2.4s <- multiplier
+
+    mov     w3, #1
+    ins     v4.d[0], x3
+    dup     v4.2d, v4.d[0]              // v4.2d <- 1
+
+    add     x5, x5, #32                 // add 32 to shift
+    neg     x5, x5                      // broadcast shift
+    ins     v1.d[0], x5
+    dup     v1.2d, v1.d[0]              // v1.2s <- -(shift + 32)
+
+    cmp     x6, 1
+    beq     .q_scale_rounding_zero
+    cmp     x6, 2
+    beq     .q_scale_rounding_away
+    cmp     x6, 3
+    beq     .q_scale_rounding_minus_inf
+    cmp     x6, 4
+    beq     .q_scale_rounding_plus_inf
+    cmp     x6, 5
+    beq     .q_scale_rounding_even
+    cmp     x6, 6
+    beq     .q_scale_rounding_odd
+
+    b .unsupported
+
+.q_scale_rounding_zero:
+        // rust: signum * ((abs + nudge2) >> shift
+        // asm: signum * (2*abs - 1) >>r (shift + 1)
+
+    {% for q in (16..31) %}
+        cmlt        v0.4s, v{{q}}.4s, #0
+        abs         v{{q}}.4s, v{{q}}.4s
+        sqdmull     v8.2d, v{{q}}.2s, v2.2s
+        sqdmull2    v9.2d, v{{q}}.4s, v2.4s     //mul without shift and store results in v8 and v9
+
+        sub         v8.2d, v8.2d, v4.2d
+        sqrshl      v8.2d, v8.2d, v1.2d
+
+        sub         v9.2d, v9.2d, v4.2d
+        sqrshl      v9.2d, v9.2d, v1.2d
+
+        uzp1         v{{q}}.4s, v8.4s, v9.4s    //combine back
+
+        neg         v3.4s, v{{q}}.4s
+        bit         v{{q}}.16b, v3.16b, v0.16b
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_away: // signum * (abs >> (shift-1) + 1 >> 1)
+
+    {% for q in (16..31) %}
+        cmlt        v0.4s, v{{q}}.4s, #0
+        abs         v{{q}}.4s, v{{q}}.4s
+        sqdmull     v8.2d, v{{q}}.2s, v2.2s
+        sqdmull2    v9.2d, v{{q}}.4s, v2.4s     //mul without shift and store results in v8 and v9
+
+        sqrshl      v8.2d, v8.2d, v1.2d
+        sqrshl      v9.2d, v9.2d, v1.2d
+
+        uzp1         v{{q}}.4s, v8.4s, v9.4s    //combine back
+
+        neg         v3.4s, v{{q}}.4s
+        bit         v{{q}}.16b, v3.16b, v0.16b
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_minus_inf: // val >> shift
+
+    {% for q in (16..31) %}
+        sqdmull     v8.2d, v{{q}}.2s, v2.2s
+        sqdmull2    v9.2d, v{{q}}.4s, v2.4s     //mul without shift and store results in v8 and v9
+
+        sub         v8.2d, v8.2d, v4.2d
+        sqrshl      v8.2d, v8.2d, v1.2d
+
+        sub         v9.2d, v9.2d, v4.2d
+        sqrshl      v9.2d, v9.2d, v1.2d
+
+        uzp1         v{{q}}.4s, v8.4s, v9.4s    //combine back
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_plus_inf: // (val >> shift-1)+1 >>1
+
+    {% for q in (16..31) %}
+        sqdmull     v8.2d, v{{q}}.2s, v2.2s
+        sqdmull2    v9.2d, v{{q}}.4s, v2.4s     //mul without shift and store results in v8 and v9
+
+        sqrshl      v8.2d, v8.2d, v1.2d
+        sqrshl      v9.2d, v9.2d, v1.2d
+
+        uzp1         v{{q}}.4s, v8.4s, v9.4s    //combine back
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_even: // signum * ((abs >> shift-1) + (abs & 0x1) - 1 >> 1)
+
+    {% for q in (16..31) %}
+        cmlt        v0.4s, v{{q}}.4s, #0
+        abs         v{{q}}.4s, v{{q}}.4s
+        sqdmull     v8.2d, v{{q}}.2s, v2.2s
+        sqdmull2    v9.2d, v{{q}}.4s, v2.4s     //mul without shift and store results in v8 and v9
+
+        sqshl       v3.2d, v8.2d, v1.2d         // abs >> shift - 1
+        and         v3.16b, v3.16b, v4.16b      // abs & 0x1
+        sub         v3.2d, v3.2d, v4.2d         //nudge : -1 if we want to round down, 0 if up
+
+        add         v8.2d, v8.2d, v3.2d
+        sqrshl      v8.2d, v8.2d, v1.2d
+
+        sqshl       v3.2d, v9.2d, v1.2d
+        and         v3.16b, v3.16b, v4.16b
+        sub         v3.2d, v3.2d, v4.2d         //nudge : -1 if we want to round down, 0 if up
+
+        add         v9.2d, v9.2d, v3.2d
+        sqrshl      v9.2d, v9.2d, v1.2d
+
+        uzp1         v{{q}}.4s, v8.4s, v9.4s    //combine back
+
+        neg         v3.4s, v{{q}}.4s
+        bit         v{{q}}.16b, v3.16b, v0.16b
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_scale_rounding_odd: // signum * ((abs >> shift-1) - (abs & 0x1) >> 1)
+
+    {% for q in (16..31) %}
+        cmlt        v0.4s, v{{q}}.4s, #0
+        abs         v{{q}}.4s, v{{q}}.4s
+        sqdmull     v8.2d, v{{q}}.2s, v2.2s
+        sqdmull2    v9.2d, v{{q}}.4s, v2.4s     //mul without shift and store results in v8 and v9
+
+        sqshl       v3.2d, v8.2d, v1.2d
+        and         v3.16b, v3.16b, v4.16b      //nudge : -1 if we want to round down, 0 if up
+
+        sub         v8.2d, v8.2d, v3.2d
+        sqrshl      v8.2d, v8.2d, v1.2d
+
+        sqshl       v3.2d, v9.2d, v1.2d
+        and         v3.16b, v3.16b, v4.16b      //nudge : -1 if we want to round down, 0 if up
+
+        sub         v9.2d, v9.2d, v3.2d
+        sqrshl      v9.2d, v9.2d, v1.2d
+
+        uzp1        v{{q}}.4s, v8.4s, v9.4s    //combine back
+
+        neg         v3.4s, v{{q}}.4s
+        bit         v{{q}}.16b, v3.16b, v0.16b
+    {% endfor %}
+
+    b .non_linear_loop
+
+.q_shl:
+    ldr     x5, [x0, #8]                // x5: shift
+    ins     v1.s[0], w5
+    dup     v1.4s, v1.s[0]              // v1.4s <- shift
+
+    {% for q in (16..31) %}
+        sqrshl      v{{q}}.4s, v{{q}}.4s, v1.4s
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr:
+    ldp     x5, x6, [x0, #8]            // x5: shift, x6: policy
+
+    mov     w3, #1
+    ins     v4.s[0], w3
+    dup     v4.4s, v4.s[0]              // v4.4d <- 1
+
+    neg     w5, w5                      // broadcast shift
+    ins     v1.s[0], w5
+    dup     v1.4s, v1.s[0]              // v1.4s <- -shift
+
+    cmp     x6, 1
+    beq     .q_shr_rounding_zero
+    cmp     x6, 2
+    beq     .q_shr_rounding_away
+    cmp     x6, 3
+    beq     .q_shr_rounding_minus_inf
+    cmp     x6, 4
+    beq     .q_shr_rounding_plus_inf
+    cmp     x6, 5
+    beq     .q_shr_rounding_even
+    cmp     x6, 6
+    beq     .q_shr_rounding_odd
+
+    b .unsupported
+
+.q_shr_rounding_zero:
+    // asm: signum * (abs >>r shift)
+    {% for q in (16..31) %}
+        cmlt        v0.4s, v{{q}}.4s, #0
+        abs         v{{q}}.4s, v{{q}}.4s
+
+        sub         v{{q}}.4s, v{{q}}.4s, v4.4s
+        sqrshl      v{{q}}.4s, v{{q}}.4s, v1.4s
+
+        neg         v3.4s, v{{q}}.4s
+        bit         v{{q}}.16b, v3.16b, v0.16b
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_away:
+    {% for q in (16..31) %}
+        cmlt        v0.4s, v{{q}}.4s, #0
+        abs         v{{q}}.4s, v{{q}}.4s
+
+        sqrshl      v{{q}}.4s, v{{q}}.4s, v1.4s
+
+        neg         v3.4s, v{{q}}.4s
+        bit         v{{q}}.16b, v3.16b, v0.16b
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_minus_inf:
+    {% for q in (16..31) %}
+        sqneg       v{{q}}.4s, v{{q}}.4s
+        sqrshl      v{{q}}.4s, v{{q}}.4s, v1.4s
+        sqneg       v{{q}}.4s, v{{q}}.4s
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_plus_inf:
+    {% for q in (16..31) %}
+        sqrshl      v{{q}}.4s, v{{q}}.4s, v1.4s
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_even:
+    // sqrshl is round(+inf), sqshl trauncates
+    // we look at parity of result by truncation: if it's odd, we have nothing more to do, we go towards +inf
+    // if it's even, we need to nudge towards 0 by adding -1
+    // => nudge = (x >>l shift) & 0x1 - 1 (>>l is sqshl)
+    // => result is (x + nudge) >>r shift (with sqrshl)
+    {% for q in (16..31) %}
+        sqshl       v3.4s, v{{q}}.4s, v1.4s // trunc
+        and         v3.16b, v3.16b, v4.16b
+        sub         v3.4s, v3.4s, v4.4s
+        add         v{{q}}.4s, v{{q}}.4s, v3.4s
+
+        sqrshl      v{{q}}.4s, v{{q}}.4s, v1.4s
+    {% endfor %}
+    b .non_linear_loop
+
+.q_shr_rounding_odd:
+    // here: nudge is -((x >>l shift) & 0x1)
+    {% for q in (16..31) %}
+        sqshl       v3.4s, v{{q}}.4s, v1.4s // trunc
+        and         v3.16b, v3.16b, v4.16b
+        neg         v3.4s, v3.4s
+        add         v{{q}}.4s, v{{q}}.4s, v3.4s
+
+        sqrshl      v{{q}}.4s, v{{q}}.4s, v1.4s
+    {% endfor %}
+    b .non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_load_tile.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_load_tile.tmpliq
new file mode 100644
index 000000000..ac920b368
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_mmm_load_tile.tmpliq
@@ -0,0 +1,10 @@
+// vim: ft=arm
+
+.load_tile:
+    ldr         x2, [ x0, #8 ]
+    {% for reg in (from..to) %}
+        ld1         { v{{reg}}.4s }, [ x2 ], #16
+    {% endfor %}
+
+    b           .non_linear_loop
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_sigmoid_f32_4n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_sigmoid_f32_4n.tmpl
new file mode 100644
index 000000000..84b927e3b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_sigmoid_f32_4n.tmpl
@@ -0,0 +1,206 @@
+// vim: ft=arm
+
+// no preservation either for v0-v7 and v16-v31
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_sigmoid_f32_4n_{{suffix}}
+{{G}}arm64simd_sigmoid_f32_4n_{{suffix}}:
+
+    cmp         x1, #0
+    beq         .return
+
+    adr         x2, .coeffs_num
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    dup         v5.4s, v0.s[0]              // v5 <- low, broadcasted
+    dup         v6.4s, v0.s[1]              // v6 <- high, broadcasted
+    dup         v7.4s, v3.s[1]              // v7 <- 0.5, broadcasted
+
+    cmp         x1, #16
+    blt         .loop
+
+.loop4:
+    ld1         { v16.4s, v17.4s, v18.4s, v19.4s }, [x0]
+
+    fmax        v16.4s, v16.4s, v5.4s
+    fmax        v17.4s, v17.4s, v5.4s
+    fmax        v18.4s, v18.4s, v5.4s
+    fmax        v19.4s, v19.4s, v5.4s
+
+    fmin        v16.4s, v16.4s, v6.4s
+    fmin        v17.4s, v17.4s, v6.4s
+    fmin        v18.4s, v18.4s, v6.4s
+    fmin        v19.4s, v19.4s, v6.4s       // v16 <- x
+
+    fmul        v20.4s, v16.4s, v16.4s
+    fmul        v21.4s, v17.4s, v17.4s
+    fmul        v22.4s, v18.4s, v18.4s
+    fmul        v23.4s, v19.4s, v19.4s      // v20 <- x2
+
+    dup         v24.4s, v0.s[3]
+    fmla        v24.4s, v20.4s, v0.s[2]
+    dup         v25.4s, v0.s[3]
+    fmla        v25.4s, v21.4s, v0.s[2]
+    dup         v26.4s, v0.s[3]
+    fmla        v26.4s, v22.4s, v0.s[2]
+    dup         v27.4s, v0.s[3]
+    fmla        v27.4s, v23.4s, v0.s[2]
+
+    dup         v28.4s, v1.s[0]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v29.4s, v1.s[0]
+    fmla        v29.4s, v21.4s, v25.4s
+    dup         v30.4s, v1.s[0]
+    fmla        v30.4s, v22.4s, v26.4s
+    dup         v31.4s, v1.s[0]
+    fmla        v31.4s, v23.4s, v27.4s
+
+    dup         v24.4s, v1.s[1]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v25.4s, v1.s[1]
+    fmla        v25.4s, v21.4s, v29.4s
+    dup         v26.4s, v1.s[1]
+    fmla        v26.4s, v22.4s, v30.4s
+    dup         v27.4s, v1.s[1]
+    fmla        v27.4s, v23.4s, v31.4s
+
+    dup         v28.4s, v1.s[2]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v29.4s, v1.s[2]
+    fmla        v29.4s, v21.4s, v25.4s
+    dup         v30.4s, v1.s[2]
+    fmla        v30.4s, v22.4s, v26.4s
+    dup         v31.4s, v1.s[2]
+    fmla        v31.4s, v23.4s, v27.4s
+
+    dup         v24.4s, v1.s[3]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v25.4s, v1.s[3]
+    fmla        v25.4s, v21.4s, v29.4s
+    dup         v26.4s, v1.s[3]
+    fmla        v26.4s, v22.4s, v30.4s
+    dup         v27.4s, v1.s[3]
+    fmla        v27.4s, v23.4s, v31.4s
+
+    dup         v28.4s, v2.s[0]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v29.4s, v2.s[0]
+    fmla        v29.4s, v21.4s, v25.4s
+    dup         v30.4s, v2.s[0]
+    fmla        v30.4s, v22.4s, v26.4s
+    dup         v31.4s, v2.s[0]
+    fmla        v31.4s, v23.4s, v27.4s
+
+    fmul        v16.4s, v16.4s, v28.4s
+    fmul        v17.4s, v17.4s, v29.4s
+    fmul        v18.4s, v18.4s, v30.4s
+    fmul        v19.4s, v19.4s, v31.4s      // v16 <- numerator
+
+    dup         v24.4s, v2.s[2]
+    fmla        v24.4s, v20.4s, v2.s[1]
+    dup         v25.4s, v2.s[2]
+    fmla        v25.4s, v21.4s, v2.s[1]
+    dup         v26.4s, v2.s[2]
+    fmla        v26.4s, v22.4s, v2.s[1]
+    dup         v27.4s, v2.s[2]
+    fmla        v27.4s, v23.4s, v2.s[1]
+
+    dup         v28.4s, v2.s[3]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v29.4s, v2.s[3]
+    fmla        v29.4s, v21.4s, v25.4s
+    dup         v30.4s, v2.s[3]
+    fmla        v30.4s, v22.4s, v26.4s
+    dup         v31.4s, v2.s[3]
+    fmla        v31.4s, v23.4s, v27.4s
+
+    dup         v24.4s, v3.s[0]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v25.4s, v3.s[0]
+    fmla        v25.4s, v21.4s, v29.4s
+    dup         v26.4s, v3.s[0]
+    fmla        v26.4s, v22.4s, v30.4s
+    dup         v27.4s, v3.s[0]
+    fmla        v27.4s, v23.4s, v31.4s  // v24 denum
+
+    fdiv        v16.4s, v16.4s, v24.4s
+    fdiv        v17.4s, v17.4s, v25.4s
+    fdiv        v18.4s, v18.4s, v26.4s
+    fdiv        v19.4s, v19.4s, v27.4s
+
+    fadd        v16.4s, v16.4s, v7.4s
+    fadd        v17.4s, v17.4s, v7.4s
+    fadd        v18.4s, v18.4s, v7.4s
+    fadd        v19.4s, v19.4s, v7.4s
+
+    st1         { v16.4s, v17.4s, v18.4s, v19.4s }, [x0], #64
+
+    subs        x1, x1, #16
+    cmp         x1, #16
+    bge         .loop4
+
+    cmp         x1, #0
+    beq         .return
+
+.loop:
+    ld1         { v16.4s }, [x0]
+
+    fmax        v16.4s, v16.4s, v5.4s
+    fmin        v16.4s, v16.4s, v6.4s       // v16 <- x
+    fmul        v20.4s, v16.4s, v16.4s      // v20 <- x2
+
+    dup         v24.4s, v0.s[3]
+    fmla        v24.4s, v20.4s, v0.s[2]
+    dup         v28.4s, v1.s[0]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v24.4s, v1.s[1]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v28.4s, v1.s[2]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v24.4s, v1.s[3]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v28.4s, v2.s[0]
+    fmla        v28.4s, v20.4s, v24.4s
+    fmul        v16.4s, v16.4s, v28.4s      // v16 <- numerator
+
+    dup         v24.4s, v2.s[2]
+    fmla        v24.4s, v20.4s, v2.s[1]
+    dup         v28.4s, v2.s[3]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v24.4s, v3.s[0]
+    fmla        v24.4s, v20.4s, v28.4s      // v24 <- denum
+
+    fdiv        v16.4s, v16.4s, v24.4s
+    fadd        v16.4s, v16.4s, v7.4s
+
+    st1         { v16.4s }, [x0], #16
+
+    subs        x1, x1, #4
+    bne         .loop
+
+.return:
+    ret
+
+.coeffs_num:
+    .float -18.6                    // low
+    .float 18.6                     // high
+    .float -4.433153405e-18         // alpha_13
+    .float 1.169974371e-14
+
+    .float -1.875289645e-11
+    .float 4.257889523e-8
+    .float 0.00004811817576
+    .float 0.008163842030
+
+    .float 0.2499999971
+    .float 3.922935744e-6           // beta_6
+    .float 0.001524872358
+    .float 0.1159886749
+
+    .float 1.0
+    .float 0.5                      //              
+    .float 0.0                      // padding
+    .float 0.0
+
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_tanh_f32_4n.tmpl b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_tanh_f32_4n.tmpl
new file mode 100644
index 000000000..dc88569ac
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/arm64simd_tanh_f32_4n.tmpl
@@ -0,0 +1,198 @@
+// vim: ft=arm
+
+// no preservation either for v0-v7 and v16-v31
+
+.text
+.align 4
+
+.cpu generic+fp+simd
+.global {{G}}arm64simd_tanh_f32_4n_{{suffix}}
+{{G}}arm64simd_tanh_f32_4n_{{suffix}}:
+
+    cmp         x1, #0
+    beq         .return
+
+    adr         x2, .coeffs_num
+    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
+    dup         v5.4s, v0.s[0]              // v5 <- low, broadcasted
+    dup         v6.4s, v0.s[1]              // v6 <- high, broadcasted
+
+    cmp         x1, #16
+    blt         .loop
+
+.loop4:
+    ld1         { v16.4s, v17.4s, v18.4s, v19.4s }, [x0]
+
+    fmax        v16.4s, v16.4s, v5.4s
+    fmax        v17.4s, v17.4s, v5.4s
+    fmax        v18.4s, v18.4s, v5.4s
+    fmax        v19.4s, v19.4s, v5.4s
+
+    fmin        v16.4s, v16.4s, v6.4s
+    fmin        v17.4s, v17.4s, v6.4s
+    fmin        v18.4s, v18.4s, v6.4s
+    fmin        v19.4s, v19.4s, v6.4s       // v16 <- x
+
+    fmul        v20.4s, v16.4s, v16.4s
+    fmul        v21.4s, v17.4s, v17.4s
+    fmul        v22.4s, v18.4s, v18.4s
+    fmul        v23.4s, v19.4s, v19.4s      // v20 <- x2
+
+    dup         v24.4s, v0.s[3]
+    fmla        v24.4s, v20.4s, v0.s[2]
+    dup         v25.4s, v0.s[3]
+    fmla        v25.4s, v21.4s, v0.s[2]
+    dup         v26.4s, v0.s[3]
+    fmla        v26.4s, v22.4s, v0.s[2]
+    dup         v27.4s, v0.s[3]
+    fmla        v27.4s, v23.4s, v0.s[2]
+
+    dup         v28.4s, v1.s[0]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v29.4s, v1.s[0]
+    fmla        v29.4s, v21.4s, v25.4s
+    dup         v30.4s, v1.s[0]
+    fmla        v30.4s, v22.4s, v26.4s
+    dup         v31.4s, v1.s[0]
+    fmla        v31.4s, v23.4s, v27.4s
+
+    dup         v24.4s, v1.s[1]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v25.4s, v1.s[1]
+    fmla        v25.4s, v21.4s, v29.4s
+    dup         v26.4s, v1.s[1]
+    fmla        v26.4s, v22.4s, v30.4s
+    dup         v27.4s, v1.s[1]
+    fmla        v27.4s, v23.4s, v31.4s
+
+    dup         v28.4s, v1.s[2]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v29.4s, v1.s[2]
+    fmla        v29.4s, v21.4s, v25.4s
+    dup         v30.4s, v1.s[2]
+    fmla        v30.4s, v22.4s, v26.4s
+    dup         v31.4s, v1.s[2]
+    fmla        v31.4s, v23.4s, v27.4s
+
+    dup         v24.4s, v1.s[3]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v25.4s, v1.s[3]
+    fmla        v25.4s, v21.4s, v29.4s
+    dup         v26.4s, v1.s[3]
+    fmla        v26.4s, v22.4s, v30.4s
+    dup         v27.4s, v1.s[3]
+    fmla        v27.4s, v23.4s, v31.4s
+
+    dup         v28.4s, v2.s[0]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v29.4s, v2.s[0]
+    fmla        v29.4s, v21.4s, v25.4s
+    dup         v30.4s, v2.s[0]
+    fmla        v30.4s, v22.4s, v26.4s
+    dup         v31.4s, v2.s[0]
+    fmla        v31.4s, v23.4s, v27.4s
+
+    fmul        v16.4s, v16.4s, v28.4s
+    fmul        v17.4s, v17.4s, v29.4s
+    fmul        v18.4s, v18.4s, v30.4s
+    fmul        v19.4s, v19.4s, v31.4s      // v16 <- numerator
+
+    dup         v24.4s, v2.s[2]
+    fmla        v24.4s, v20.4s, v2.s[1]
+    dup         v25.4s, v2.s[2]
+    fmla        v25.4s, v21.4s, v2.s[1]
+    dup         v26.4s, v2.s[2]
+    fmla        v26.4s, v22.4s, v2.s[1]
+    dup         v27.4s, v2.s[2]
+    fmla        v27.4s, v23.4s, v2.s[1]
+
+    dup         v28.4s, v2.s[3]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v29.4s, v2.s[3]
+    fmla        v29.4s, v21.4s, v25.4s
+    dup         v30.4s, v2.s[3]
+    fmla        v30.4s, v22.4s, v26.4s
+    dup         v31.4s, v2.s[3]
+    fmla        v31.4s, v23.4s, v27.4s
+
+    dup         v24.4s, v3.s[0]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v25.4s, v3.s[0]
+    fmla        v25.4s, v21.4s, v29.4s
+    dup         v26.4s, v3.s[0]
+    fmla        v26.4s, v22.4s, v30.4s
+    dup         v27.4s, v3.s[0]
+    fmla        v27.4s, v23.4s, v31.4s  // v24 denum
+
+    fdiv        v16.4s, v16.4s, v24.4s
+    fdiv        v17.4s, v17.4s, v25.4s
+    fdiv        v18.4s, v18.4s, v26.4s
+    fdiv        v19.4s, v19.4s, v27.4s
+
+    st1         { v16.4s, v17.4s, v18.4s, v19.4s }, [x0], #64
+
+    subs        x1, x1, #16
+    cmp         x1, #16
+    bge         .loop4
+
+    cmp         x1, #0
+    beq         .return
+
+.loop:
+    ld1         { v16.4s }, [x0]
+
+    fmax        v16.4s, v16.4s, v5.4s
+    fmin        v16.4s, v16.4s, v6.4s       // v16 <- x
+    fmul        v20.4s, v16.4s, v16.4s      // v20 <- x2
+
+    dup         v24.4s, v0.s[3]
+    fmla        v24.4s, v20.4s, v0.s[2]
+    dup         v28.4s, v1.s[0]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v24.4s, v1.s[1]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v28.4s, v1.s[2]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v24.4s, v1.s[3]
+    fmla        v24.4s, v20.4s, v28.4s
+    dup         v28.4s, v2.s[0]
+    fmla        v28.4s, v20.4s, v24.4s
+    fmul        v16.4s, v16.4s, v28.4s      // v16 <- numerator
+
+    dup         v24.4s, v2.s[2]
+    fmla        v24.4s, v20.4s, v2.s[1]
+    dup         v28.4s, v2.s[3]
+    fmla        v28.4s, v20.4s, v24.4s
+    dup         v24.4s, v3.s[0]
+    fmla        v24.4s, v20.4s, v28.4s      // v24 <- denum
+
+    fdiv        v16.4s, v16.4s, v24.4s
+
+    st1         { v16.4s }, [x0], #16
+
+    subs        x1, x1, #4
+    bne         .loop
+
+.return:
+    ret
+
+.coeffs_num:
+    .float -8.9                     // low
+    .float 8.9                      // high
+    .float -8.488492677e-14         // alpha_13
+    .float 5.277853000e-11
+
+    .float -2.022500419e-8
+    .float 0.00001115424833
+    .float 0.003103950131
+    .float 0.1308400453
+
+    .float 0.9999999934
+    .float 0.0002546136580          // beta_6
+    .float 0.02449515379
+    .float 0.4641733162
+
+    .float 1.0
+    .float 0                        // padding
+    .float 0                        // padding
+    .float 0                        // padding
diff --git a/vendor/tract-linalg-0.22.1/arm64/arm64simd/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/arm64/arm64simd/dispatcher.tmpliq
new file mode 100644
index 000000000..150db4683
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/arm64/arm64simd/dispatcher.tmpliq
@@ -0,0 +1,37 @@
+// vim: ft=arm
+
+.non_linear:
+    sub         x0, x0, 40
+
+.non_linear_loop:
+    add         x0, x0, 40
+    ldr         x2, [x0]
+
+    mov         x4, #{{ jump_table | size }}
+
+    cmp         x2, #{{ jump_table | size }}
+    csel        x2, x2, x4, lt
+    cmp         x2, #0
+    csel        x2, x4, x2, lt
+
+    adr         x3, .jmp_table
+    add         x3, x3, x2, LSL#2
+    br          x3
+
+.jmp_table:
+{% for j in jump_table %}
+    b   .{{j}}
+{% endfor %}
+    b   .unsupported
+
+    add x0, x2, #4000
+    b .return
+
+.unsupported:
+    mov         x0, #1
+    b           .return
+
+.done:
+    mov         x0, 0
+    b           .return
+
diff --git a/vendor/tract-linalg-0.22.1/benches/arm32neon.rs b/vendor/tract-linalg-0.22.1/benches/arm32neon.rs
new file mode 100644
index 000000000..4c5101d02
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/arm32neon.rs
@@ -0,0 +1,179 @@
+#![feature(asm)]
+#![allow(dead_code, non_upper_case_globals, unused_macros, non_snake_case, unused_assignments)]
+
+use std::time::Instant;
+
+macro_rules! r2 { ($($stat:stmt)*) => { $( $stat )* $( $stat )* } }
+macro_rules! r4 { ($($stat:stmt)*) => { r2!(r2!($($stat)*)) }}
+macro_rules! r8 { ($($stat:stmt)*) => { r4!(r2!($($stat)*)) }}
+macro_rules! r16 { ($($stat:stmt)*) => { r4!(r4!($($stat)*)) }}
+macro_rules! r32 { ($($stat:stmt)*) => { r8!(r4!($($stat)*)) }}
+macro_rules! r64 { ($($stat:stmt)*) => { r8!(r8!($($stat)*)) }}
+macro_rules! r128 { ($($stat:stmt)*) => { r8!(r16!($($stat)*)) }}
+macro_rules! r1024 { ($($stat:stmt)*) => { r8!(r128!($($stat)*)) }}
+macro_rules! r4096 { ($($stat:stmt)*) => { r4!(r1024!($($stat)*)) }}
+
+const _F32: [f32; 1024] = [12.; 1024];
+const F32: *const f32 = _F32.as_ptr();
+
+/*
+fn ruin_cache() {
+let _a = (0..1000000).collect::<Vec<i32>>();
+}
+*/
+
+macro_rules! b {
+    ($f: block, $inner_loop: expr, $measures: expr) => {{
+        let mut values = Vec::with_capacity($measures);
+        for _ in 0..$measures {
+            //       ruin_cache();
+            let start = Instant::now();
+            for _ in 0..$inner_loop {
+                unsafe { $f };
+            }
+            values.push(start.elapsed());
+        }
+        values.sort();
+        values[$measures / 2].as_nanos() as f64 / 1e9 / $inner_loop as f64
+    }};
+}
+
+fn main() {
+    let cycle = b!(
+        {
+            r1024!(asm!("orr r0, r0, r0", out("r0") _));
+        },
+        1000,
+        1000
+    ) / 1024.;
+    let indep_fmla = b!(
+        {
+            r8!(asm!("
+                vmla.f32 q0, q0, q0
+                vmla.f32 q1, q1, q1
+                vmla.f32 q2, q2, q2
+                vmla.f32 q3, q3, q3
+                vmla.f32 q4, q4, q4
+                vmla.f32 q5, q5, q5
+                vmla.f32 q6, q6, q6
+                vmla.f32 q7, q7, q7
+                 ", out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _));
+        },
+        1000,
+        1000
+    ) / 64.;
+    eprintln!("rcp tp: indep fmla: {}", indep_fmla / cycle);
+    let dep_accu_fmla = b!(
+        {
+            r16!(asm!("
+                vmla.f32 q15, q0, q0
+                vmla.f32 q15, q1, q1
+                vmla.f32 q15, q2, q2
+                vmla.f32 q15, q3, q3
+                vmla.f32 q15, q4, q4
+                vmla.f32 q15, q5, q5
+                vmla.f32 q15, q6, q6
+                vmla.f32 q15, q7, q7
+                vmla.f32 q15, q8, q8
+                vmla.f32 q15, q9, q9
+                vmla.f32 q15, q10, q10
+                vmla.f32 q15, q11, q11
+                vmla.f32 q15, q12, q12
+                vmla.f32 q15, q13, q13
+                vmla.f32 q15, q14, q14
+                 ", out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _,
+                 out("q8") _, out("q9") _, out("q10") _, out("q11") _, out("q12") _, out("q13") _, out("q14") _, out("q15") _));
+        },
+        1000,
+        1000
+    ) / 16.
+        / 15.;
+    eprintln!("rcp tp: accu-dep fmla: {}", dep_accu_fmla / cycle);
+    let load_s_using_vld1_64 = b!(
+        {
+            let mut p = F32;
+            r16!(asm!("
+                vld1.64         {{d0-d3}}, [{0}]!
+                vld1.64         {{d4-d7}}, [{0}]!
+                vld1.64         {{d8-d11}}, [{0}]!
+                vld1.64         {{d12-d15}}, [{0}]!
+                vld1.64         {{d16-d19}}, [{0}]!
+                vld1.64         {{d20-d23}}, [{0}]!
+                vld1.64         {{d24-d27}}, [{0}]!
+                vld1.64         {{d28-d31}}, [{0}]!
+                 ", 
+                 inout(reg) p,
+                 out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _,
+                 out("q8") _, out("q9") _, out("q10") _, out("q11") _, out("q12") _, out("q13") _, out("q14") _, out("q15") _));
+        },
+        1000,
+        1000
+    ) / 16.
+        / 64.; // each line load 8 s
+    eprintln!("rcp tp: load s using vld1_64 ia {}", load_s_using_vld1_64 / cycle);
+    let load_s_using_vldm_q = b!(
+        {
+            let mut p = F32;
+            r16!(asm!("
+                vldm            {0}!, {{q0-q3}}
+                vldm            {0}!, {{q4-q7}}
+                vldm            {0}!, {{q8-q11}}
+                vldm            {0}!, {{q12-q15}}
+                 ", 
+                 inout(reg) p,
+                 out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _,
+                 out("q8") _, out("q9") _, out("q10") _, out("q11") _, out("q12") _, out("q13") _, out("q14") _, out("q15") _));
+        },
+        1000,
+        1000
+    ) / 16.
+        / 64.;
+    eprintln!("rcp tp: load s using vldmia q: {}", load_s_using_vldm_q / cycle);
+    let load = b!(
+        {
+            let mut p = F32;
+            r16!(asm!("
+                vldr.64  d0, [{0}]
+                vldr.64  d1, [{0}, #8]
+                vldr.64  d2, [{0}, #16]
+                vldr.64  d3, [{0}, #24]
+                vldr.64  d4, [{0}, #32]
+                vldr.64  d5, [{0}, #40]
+                vldr.64  d6, [{0}, #48]
+                vldr.64  d7, [{0}, #56]
+                vldr.64  d8, [{0}, #64]
+                vldr.64  d9, [{0}, #72]
+                vldr.64  d10, [{0}, #80]
+                vldr.64  d11, [{0}, #88]
+                vldr.64  d12, [{0}, #96]
+                vldr.64  d13, [{0}, #104]
+                vldr.64  d14, [{0}, #112]
+                vldr.64  d15, [{0}, #120]
+                vldr.64  d16, [{0}, #128]
+                vldr.64  d17, [{0}, #136]
+                vldr.64  d18, [{0}, #144]
+                vldr.64  d19, [{0}, #152]
+                vldr.64  d20, [{0}, #160]
+                vldr.64  d21, [{0}, #168]
+                vldr.64  d22, [{0}, #176]
+                vldr.64  d23, [{0}, #184]
+                vldr.64  d24, [{0}, #192]
+                vldr.64  d25, [{0}, #200]
+                vldr.64  d26, [{0}, #208]
+                vldr.64  d27, [{0}, #216]
+                vldr.64  d28, [{0}, #224]
+                vldr.64  d29, [{0}, #232]
+                vldr.64  d30, [{0}, #240]
+                vldr.64  d31, [{0}, #248]
+                add {0}, #256
+                 ", 
+                 inout(reg) p,
+                 out("q0") _, out("q1") _, out("q2") _, out("q3") _, out("q4") _, out("q5") _, out("q6") _, out("q7") _,
+                 out("q8") _, out("q9") _, out("q10") _, out("q11") _, out("q12") _, out("q13") _, out("q14") _, out("q15") _));
+        },
+        1000,
+        1000
+    ) / 16.
+        / 64.;
+    eprintln!("rcp tp: load s using vldr d + imm: {}", load / cycle);
+}
diff --git a/vendor/tract-linalg-0.22.1/benches/arm64.rs b/vendor/tract-linalg-0.22.1/benches/arm64.rs
new file mode 100644
index 000000000..c153dbdb4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/arm64.rs
@@ -0,0 +1,77 @@
+use std::time::Instant;
+
+use tract_data::prelude::*;
+use tract_linalg::LADatum;
+use tract_linalg::frame::mmm::FusedSpec;
+use tract_linalg::frame::mmm::MatMatMulKer;
+
+fn ruin_cache() {
+    let _a = (0..1000000).collect::<Vec<i32>>();
+}
+
+fn bench_to_nanos<T: LADatum + Copy + num_traits::Zero, K: MatMatMulKer<T>>(
+    k: usize,
+    loops: usize,
+) -> f64 {
+    let item_size = T::datum_type().size_of();
+    let a = Tensor::zero_aligned::<T>(
+        &[(k + K::end_padding_packed_a()) * K::mr()],
+        K::alignment_bytes_packed_a(),
+    )
+    .unwrap();
+    let b = Tensor::zero_aligned::<T>(
+        &[(k + K::end_padding_packed_b()) * K::nr()],
+        K::alignment_bytes_packed_b(),
+    )
+    .unwrap();
+    let mut c = Tensor::zero::<T>(&[K::mr() * K::nr()]).unwrap();
+    let ref a = InputStoreKer::Packed { ptr: unsafe { a.as_ptr_unchecked::<u8>() as _ } };
+    let ref b = InputStoreKer::Packed { ptr: unsafe { b.as_ptr_unchecked::<u8>() as _ } };
+    let ref c = OutputStoreKer {
+        ptr: unsafe { c.as_ptr_mut_unchecked::<u8>() as _ },
+        item_size,
+        col_byte_stride: (item_size * K::mr()) as isize,
+        row_byte_stride: item_size as isize,
+    };
+    let ref linear = LinearSpec::Mul { k };
+    let op = MatMatMulKerSpec { a, b, c, linear, non_linear: std::ptr::null() };
+    let mut values = Vec::with_capacity(loops);
+    for _ in 0..loops {
+        ruin_cache();
+        let start = Instant::now();
+        K::kernel(&op);
+        values.push(start.elapsed());
+    }
+    values.sort();
+    values[loops / 2].as_nanos() as f64
+}
+
+fn model<T: Datum + Copy + num_traits::Zero, K: MatMatMulKer<T>>() -> (f64, f64) {
+    let x = 1000;
+    let zp = bench_to_nanos::<T, K>(0, 10000);
+    let y = bench_to_nanos::<T, K>(x, 1000);
+    let slope = (y - zp) / x as f64;
+    (slope, zp)
+}
+
+fn as_match_line<T: Datum + Copy + num_traits::Zero, K: MatMatMulKer<T>>() {
+    let coeffs = model::<T, K>();
+    println!(
+        "({:?}, {}, {}) => {} * k + {},",
+        K::name(),
+        K::mr(),
+        K::nr(),
+        (coeffs.0 * 1000.).round(),
+        (coeffs.1 * 1000.).round()
+    );
+}
+
+fn main() {
+    use tract_linalg::arm64::*;
+    as_match_line::<f32, MatMatMulF32x16x4>();
+    as_match_line::<f32, MatMatMulF32x12x8>();
+    as_match_line::<f32, MatMatMulF32x8x8>();
+    as_match_line::<f32, MatMatMulF32x16x4A53>();
+    as_match_line::<f32, MatMatMulF32x12x8A53>();
+    as_match_line::<f32, MatMatMulF32x8x8A53>();
+}
diff --git a/vendor/tract-linalg-0.22.1/benches/arm64simd.rs b/vendor/tract-linalg-0.22.1/benches/arm64simd.rs
new file mode 100644
index 000000000..1dd244ff1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/arm64simd.rs
@@ -0,0 +1,926 @@
+#![allow(dead_code, non_upper_case_globals, unused_macros, non_snake_case, unused_assignments)]
+
+use std::arch::asm;
+
+mod nano;
+
+#[repr(C, align(8))]
+struct Floats([f32; 4096]);
+const _F32: Floats = Floats([12.; 4096]);
+const F32: *const f32 = (&_F32) as *const Floats as *const f32;
+
+lazy_static::lazy_static! {
+    static ref TICK: f64 = unsafe { b8192!(asm!("orr x20, x20, x20", out("x20") _)) };
+}
+
+pub unsafe fn armv8(filter: Option<&str>) {
+    macro_rules! s32 {
+        ($label: literal, $n: expr, $stmt:block) => {
+            if $label.contains(filter.unwrap_or("")) {
+                println!("{:40} {:.2}", $label, b32!($stmt) / $n as f64 / *TICK);
+            }
+        };
+    }
+
+    macro_rules! s128 {
+        ($label: literal, $n: expr, $stmt:block) => {
+            if $label.contains(filter.unwrap_or("")) {
+                println!("{:40} {:.2}", $label, b128!($stmt) / $n as f64 / *TICK);
+            }
+        };
+    }
+
+    macro_rules! s1024 {
+        ($label: literal, $n: expr, $stmt:block) => {
+            if $label.contains(filter.unwrap_or("")) {
+                println!("{:40} {:.2}", $label, b1024!($stmt) / $n as f64 / *TICK);
+            }
+        };
+    }
+
+    macro_rules! s8192 {
+        ($label: literal, $n: expr, $stmt:block) => {
+            if $label.contains(filter.unwrap_or("")) {
+                println!("{:40} {:.2}", $label, b8192!($stmt) / $n as f64 / *TICK);
+            }
+        };
+    }
+
+    s128!("nop", 1, { asm!("nop") });
+    s128!("vands", 4, {
+        asm!("  and v0.16b, v1.16b, v1.16b
+                and v2.16b, v3.16b, v3.16b
+                and v4.16b, v5.16b, v5.16b
+                and v6.16b, v7.16b, v7.16b ",
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        )
+    });
+    s128!("fmax", 4, {
+        asm!("  fmax v0.4s, v1.4s, v1.4s
+                fmax v2.4s, v3.4s, v3.4s
+                fmax v4.4s, v5.4s, v5.4s
+                fmax v6.4s, v7.4s, v7.4s ",
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        )
+    });
+    s128!("fmax_with_dep", 1, { asm!("fmax v0.4s, v0.4s, v0.4s", out("v0") _) });
+    s128!("fmla", 16, {
+        asm!(" fmla v0.4s, v0.4s, v0.4s
+               fmla v1.4s, v1.4s, v1.4s
+               fmla v2.4s, v2.4s, v2.4s
+               fmla v3.4s, v3.4s, v3.4s
+               fmla v4.4s, v4.4s, v4.4s
+               fmla v5.4s, v5.4s, v5.4s
+               fmla v6.4s, v6.4s, v6.4s
+               fmla v7.4s, v7.4s, v7.4s
+               fmla v8.4s, v8.4s, v8.4s
+               fmla v9.4s, v9.4s, v9.4s
+               fmla v10.4s,v10.4s,v10.4s
+               fmla v11.4s,v11.4s,v11.4s
+               fmla v12.4s,v12.4s,v12.4s
+               fmla v13.4s,v13.4s,v13.4s
+               fmla v14.4s,v14.4s,v14.4s
+               fmla v15.4s,v15.4s,v15.4s ",
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        )
+    });
+
+    s128!("fmla_with_dep", 1, { asm!("fmla v0.4s, v0.4s, v0.4s", out("v0") _) });
+    s32!("w_load", 64, {
+        let mut p = F32;
+        r8!(asm!("ldr w20, [{0}]
+                   ldr w21, [{0}]
+                   ldr w22, [{0}]
+                   ldr w23, [{0}]
+                   ldr w24, [{0}]
+                   ldr w25, [{0}]
+                   ldr w26, [{0}]
+                   ldr w27, [{0}]",
+        inout(reg) p,
+        out("x20") _, out("x21") _, out("x22") _, out("x23") _,
+        out("x24") _, out("x25") _, out("x26") _, out("x27") _,
+        ));
+    });
+    s32!("x_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+           ldr x20, [{0}]
+           ldr x21, [{0}]
+           ldr x22, [{0}]
+           ldr x23, [{0}]
+           ldr x24, [{0}]
+           ldr x25, [{0}]
+           ldr x26, [{0}]
+           ldr x27, [{0}]
+           ",
+        inout(reg) p,
+        out("x20") _, out("x21") _, out("x22") _, out("x23") _,
+        out("x24") _, out("x25") _, out("x26") _, out("x27") _,
+        ));
+    });
+    s32!("d_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+       ldr d20, [{0}]
+       ldr d21, [{0}]
+       ldr d22, [{0}]
+       ldr d23, [{0}]
+       ldr d24, [{0}]
+       ldr d25, [{0}]
+       ldr d26, [{0}]
+       ldr d27, [{0}]
+       ",
+        inout(reg) p,
+        out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+        out("v24") _, out("v25") _, out("v26") _, out("v27") _,
+        ));
+    });
+    s32!("s_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+       ld1 {{v20.s}}[0], [{0}]
+       ld1 {{v21.s}}[0], [{0}]
+       ld1 {{v22.s}}[0], [{0}]
+       ld1 {{v23.s}}[0], [{0}]
+       ld1 {{v24.s}}[0], [{0}]
+       ld1 {{v25.s}}[0], [{0}]
+       ld1 {{v26.s}}[0], [{0}]
+       ld1 {{v27.s}}[0], [{0}]
+       ",
+        inout(reg) p,
+        out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+        out("v24") _, out("v25") _, out("v26") _, out("v27") _,
+        ));
+    });
+    s32!("d_load_as_v", 64, {
+        let mut p = F32;
+        r8!(asm!("
+       ld1 {{v20.d}}[0], [{0}]
+       ld1 {{v21.d}}[0], [{0}]
+       ld1 {{v22.d}}[0], [{0}]
+       ld1 {{v23.d}}[0], [{0}]
+       ld1 {{v24.d}}[0], [{0}]
+       ld1 {{v25.d}}[0], [{0}]
+       ld1 {{v26.d}}[0], [{0}]
+       ld1 {{v27.d}}[0], [{0}]
+       ",
+        inout(reg) p,
+        out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+        out("v24") _, out("v25") _, out("v26") _, out("v27") _,
+        ));
+    });
+    s32!("v_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+       ld1 {{v20.4s}}, [{0}]
+       ld1 {{v21.4s}}, [{0}]
+       ld1 {{v22.4s}}, [{0}]
+       ld1 {{v23.4s}}, [{0}]
+       ld1 {{v24.4s}}, [{0}]
+       ld1 {{v25.4s}}, [{0}]
+       ld1 {{v26.4s}}, [{0}]
+       ld1 {{v27.4s}}, [{0}]
+       ",
+        inout(reg) p,
+        out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+        out("v24") _, out("v25") _, out("v26") _, out("v27") _,
+        ));
+    });
+    s32!("v2_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     ld1 {{v0.4s, v1.4s}}, [{0}]
+                     ld1 {{v2.4s, v3.4s}}, [{0}]
+                     ld1 {{v4.4s, v5.4s}}, [{0}]
+                     ld1 {{v6.4s, v7.4s}}, [{0}]
+                     ld1 {{v8.4s, v9.4s}}, [{0}]
+                     ld1 {{v10.4s, v11.4s}}, [{0}]
+                     ld1 {{v12.4s, v13.4s}}, [{0}]
+                     ld1 {{v14.4s, v15.4s}}, [{0}]
+       ",
+        inout(reg) p,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("v3_load", 32, {
+        let mut p = F32;
+        r8!(asm!("
+           ld1 {{v0.4s, v1.4s, v2.4s}}, [{0}]
+           ld1 {{v3.4s, v4.4s, v5.4s}}, [{0}]
+           ld1 {{v6.4s, v7.4s, v8.4s}}, [{0}]
+           ld1 {{v9.4s, v10.4s, v11.4s}}, [{0}]
+       ",
+        inout(reg) p,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        ));
+    });
+    s32!("v4_load", 32, {
+        let mut p = F32;
+        r8!(asm!("
+           ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{0}]
+           ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{0}]
+           ld1 {{v8.4s, v9.4s, v10.4s, v11.4s}}, [{0}]
+           ld1 {{v12.4s, v13.4s, v14.4s, v15.4s}}, [{0}]
+       ",
+        inout(reg) p,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("ins_32b", 64, {
+        r8!(asm!("
+           ins v8.s[0], w20
+           ins v9.s[0], w20
+           ins v10.s[0], w20
+           ins v11.s[0], w20
+           ins v12.s[0], w20
+           ins v13.s[0], w20
+           ins v14.s[0], w20
+           ins v15.s[0], w20
+       ",
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("ins_32b_same_lane", 128, {
+        r8!(asm!("
+           ins         v0.s[0], w20
+           ins         v1.s[0], w20
+           ins         v4.s[0], w20
+           ins         v5.s[0], w20
+           ins         v0.s[1], w20
+           ins         v1.s[1], w20
+           ins         v4.s[1], w20
+           ins         v5.s[1], w20
+           ins         v0.s[2], w20
+           ins         v1.s[2], w20
+           ins         v4.s[2], w20
+           ins         v5.s[2], w20
+           ins         v0.s[3], w20
+           ins         v1.s[3], w20
+           ins         v4.s[3], w20
+           ins         v5.s[3], w20
+       ",
+        out("v0") _, out("v1") _, out("v4") _, out("v5") _,
+        ));
+    });
+    s32!("ins_64b", 64, {
+        r8!(asm!("
+           ins v8.d[0], x20
+           ins v9.d[0], x20
+           ins v10.d[0], x20
+           ins v11.d[0], x20
+           ins v12.d[0], x20
+           ins v13.d[0], x20
+           ins v14.d[0], x20
+           ins v15.d[0], x20
+       ",
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("ins_64b_same_v", 64, {
+        r8!(asm!("
+                     ins v8.d[0], x20
+                     ins v8.d[1], x20
+                     ins v8.d[0], x20
+                     ins v8.d[1], x20
+                     ins v8.d[0], x20
+                     ins v8.d[1], x20
+                     ins v8.d[0], x20
+                     ins v8.d[1], x20
+                     ",
+        out("v8") _,
+        ));
+    });
+    s32!("ins_64b_from_v", 64, {
+        r8!(asm!("
+                     ins v8.d[0], v9.d[0]
+                     ins v8.d[1], v9.d[0]
+                     ins v8.d[0], v9.d[1]
+                     ins v8.d[1], v9.d[1]
+                     ins v8.d[0], v9.d[0]
+                     ins v8.d[1], v9.d[0]
+                     ins v8.d[0], v9.d[1]
+                     ins v8.d[1], v9.d[1]
+                     ",
+        out("v8") _,
+        ));
+    });
+    s32!("fmla_with_prfm", 64, {
+        let mut p = F32;
+        r8!(asm!("
+           prfm pldl1keep, [{0}, #256]
+           fmla v0.4s, v0.4s, v0.4s
+           prfm pldl1keep, [{0}, #320]
+           fmla v1.4s, v1.4s, v1.4s
+           prfm pldl1keep, [{0}, #384]
+           fmla v2.4s, v2.4s, v2.4s
+           prfm pldl1keep, [{0}, #448]
+           fmla v3.4s, v3.4s, v3.4s
+           prfm pldl1keep, [{0}, #512]
+           fmla v4.4s, v4.4s, v4.4s
+           prfm pldl1keep, [{0}, #576]
+           fmla v5.4s, v5.4s, v5.4s
+           prfm pldl1keep, [{0}, #640]
+           fmla v6.4s, v6.4s, v6.4s
+           prfm pldl1keep, [{0}, #704]
+           fmla v7.4s, v7.4s, v7.4s
+           prfm pldl1keep, [{0}, #768]
+           ",
+        inout(reg) p,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        ));
+    });
+    s32!("fmla_with_w_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+           ldr w20, [{0}]
+           fmla v0.4s, v0.4s, v0.4s
+           ldr w21, [{0}]
+           fmla v1.4s, v1.4s, v1.4s
+           ldr w22, [{0}]
+           fmla v2.4s, v2.4s, v2.4s
+           ldr w23, [{0}]
+           fmla v3.4s, v3.4s, v3.4s
+           ldr w24, [{0}]
+           fmla v4.4s, v4.4s, v4.4s
+           ldr w25, [{0}]
+           fmla v5.4s, v5.4s, v5.4s
+           ldr w26, [{0}]
+           fmla v6.4s, v6.4s, v6.4s
+           ldr w27, [{0}]
+           fmla v7.4s, v7.4s, v7.4s
+           ",
+        inout(reg) p,
+        out("x20") _, out("x21") _, out("x22") _, out("x23") _,
+        out("x24") _, out("x25") _, out("x26") _, out("x27") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        ));
+    });
+    s32!("fmla_with_w_load_inc", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     ldr w20, [{0}], #4
+                     fmla v0.4s, v0.4s, v0.4s
+                     ldr w21, [{0}], #4
+                     fmla v1.4s, v1.4s, v1.4s
+                     ldr w22, [{0}], #4
+                     fmla v2.4s, v2.4s, v2.4s
+                     ldr w23, [{0}], #4
+                     fmla v3.4s, v3.4s, v3.4s
+                     ldr w24, [{0}], #4
+                     fmla v4.4s, v4.4s, v4.4s
+                     ldr w25, [{0}], #4
+                     fmla v5.4s, v5.4s, v5.4s
+                     ldr w26, [{0}], #4
+                     fmla v6.4s, v6.4s, v6.4s
+                     ldr w27, [{0}], #4
+                     fmla v7.4s, v7.4s, v7.4s
+                     ",
+        inout(reg) p,
+        out("x20") _, out("x21") _, out("x22") _, out("x23") _,
+        out("x24") _, out("x25") _, out("x26") _, out("x27") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        ));
+    });
+    s32!("fmla_with_w_load_inc_alt", 64, {
+        let mut p = F32;
+        let mut q = F32;
+        r8!(asm!("
+                     ldr w20, [{0}], #4
+                     fmla v0.4s, v0.4s, v0.4s
+                     ldr w21, [{1}], #4
+                     fmla v1.4s, v1.4s, v1.4s
+                     ldr w22, [{0}], #4
+                     fmla v2.4s, v2.4s, v2.4s
+                     ldr w23, [{1}], #4
+                     fmla v3.4s, v3.4s, v3.4s
+                     ldr w24, [{0}], #4
+                     fmla v4.4s, v4.4s, v4.4s
+                     ldr w25, [{1}], #4
+                     fmla v5.4s, v5.4s, v5.4s
+                     ldr w26, [{0}], #4
+                     fmla v6.4s, v6.4s, v6.4s
+                     ldr w27, [{1}], #4
+                     fmla v7.4s, v7.4s, v7.4s
+                     ",
+        inout(reg) p, inout(reg) q,
+        out("x20") _, out("x21") _, out("x22") _, out("x23") _,
+        out("x24") _, out("x25") _, out("x26") _, out("x27") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        ));
+    });
+    s32!("fmla_with_w_load_offset", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     ldr w20, [{0}]
+                     fmla v0.4s, v0.4s, v0.4s
+                     ldr w21, [{0}, #4]
+                     fmla v1.4s, v1.4s, v1.4s
+                     ldr w22, [{0}, #8]
+                     fmla v2.4s, v2.4s, v2.4s
+                     ldr w23, [{0}, #12]
+                     fmla v3.4s, v3.4s, v3.4s
+                     ldr w24, [{0}, #16]
+                     fmla v4.4s, v4.4s, v4.4s
+                     ldr w25, [{0}, #20]
+                     fmla v5.4s, v5.4s, v5.4s
+                     ldr w26, [{0}, #24]
+                     fmla v6.4s, v6.4s, v6.4s
+                     ldr w27, [{0}, #28]
+                     fmla v7.4s, v7.4s, v7.4s
+                     ",
+        inout(reg) p,
+        out("x20") _, out("x21") _, out("x22") _, out("x23") _,
+        out("x24") _, out("x25") _, out("x26") _, out("x27") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        ));
+    });
+    s32!("fmla_with_x_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     fmla v0.4s, v0.4s, v0.4s
+                     ldr x20, [{0}]
+                     fmla v1.4s, v1.4s, v1.4s
+                     ldr x21, [{0}]
+                     fmla v2.4s, v2.4s, v2.4s
+                     ldr x22, [{0}]
+                     fmla v3.4s, v3.4s, v3.4s
+                     ldr x23, [{0}]
+                     fmla v4.4s, v4.4s, v4.4s
+                     ldr x24, [{0}]
+                     fmla v5.4s, v5.4s, v5.4s
+                     ldr x25, [{0}]
+                     fmla v6.4s, v6.4s, v6.4s
+                     ldr x26, [{0}]
+                     fmla v7.4s, v7.4s, v7.4s
+                     ldr x27, [{0}]
+                     ",
+        inout(reg) p,
+        out("x20") _, out("x21") _, out("x22") _, out("x23") _,
+        out("x24") _, out("x25") _, out("x26") _, out("x27") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        ));
+    });
+    s32!("fmla_with_s_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     ldr s16, [{0}]
+                     fmla v0.4s, v0.4s, v0.4s
+                     ldr s17, [{0}]
+                     fmla v1.4s, v1.4s, v1.4s
+                     ldr s18, [{0}]
+                     fmla v2.4s, v2.4s, v2.4s
+                     ldr s19, [{0}]
+                     fmla v3.4s, v3.4s, v3.4s
+                     ldr s20, [{0}]
+                     fmla v4.4s, v4.4s, v4.4s
+                     ldr s21, [{0}]
+                     fmla v5.4s, v5.4s, v5.4s
+                     ldr s22, [{0}]
+                     fmla v6.4s, v6.4s, v6.4s
+                     ldr s23, [{0}]
+                     fmla v7.4s, v7.4s, v7.4s
+                     ",
+        inout(reg) p,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("fmla_with_d_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     ldr d16, [{0}]
+                     fmla v0.4s, v0.4s, v0.4s
+                     ldr d17, [{0}]
+                     fmla v1.4s, v1.4s, v1.4s
+                     ldr d18, [{0}]
+                     fmla v2.4s, v2.4s, v2.4s
+                     ldr d19, [{0}]
+                     fmla v3.4s, v3.4s, v3.4s
+                     ldr d20, [{0}]
+                     fmla v4.4s, v4.4s, v4.4s
+                     ldr d21, [{0}]
+                     fmla v5.4s, v5.4s, v5.4s
+                     ldr d22, [{0}]
+                     fmla v6.4s, v6.4s, v6.4s
+                     ldr d23, [{0}]
+                     fmla v7.4s, v7.4s, v7.4s
+                     ",
+        inout(reg) p,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        out("v16") _, out("v17") _, out("v18") _, out("v19") _,
+        out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+        ));
+    });
+    s32!("fmla_with_d_load_as_v", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     fmla v0.4s, v0.4s, v0.4s
+                     ld1 {{ v9.d }}[0], [{0}]
+                     fmla v1.4s, v1.4s, v1.4s
+                     ld1 {{ v10.d }}[0], [{0}]
+                     fmla v2.4s, v2.4s, v2.4s
+                     ld1 {{ v11.d }}[0], [{0}]
+                     fmla v3.4s, v3.4s, v3.4s
+                     ld1 {{ v12.d }}[0], [{0}]
+                     fmla v4.4s, v4.4s, v4.4s
+                     ld1 {{ v13.d }}[0], [{0}]
+                     fmla v5.4s, v5.4s, v5.4s
+                     ld1 {{ v14.d }}[0], [{0}]
+                     fmla v6.4s, v6.4s, v6.4s
+                     ld1 {{ v15.d }}[0], [{0}]
+                     fmla v7.4s, v7.4s, v7.4s
+                     ld1 {{ v16.d }}[0], [{0}]
+                     ",
+        inout(reg) p,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("fmla_with_v_load", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     fmla v0.4s, v0.4s, v0.4s
+                     ld1 {{ v9.4s }}, [{0}]
+                     fmla v1.4s, v1.4s, v1.4s
+                     ld1 {{ v10.4s }}, [{0}]
+                     fmla v2.4s, v2.4s, v2.4s
+                     ld1 {{ v11.4s }}, [{0}]
+                     fmla v3.4s, v3.4s, v3.4s
+                     ld1 {{ v12.4s }}, [{0}]
+                     fmla v4.4s, v4.4s, v4.4s
+                     ld1 {{ v13.4s }}, [{0}]
+                     fmla v5.4s, v5.4s, v5.4s
+                     ld1 {{ v14.4s }}, [{0}]
+                     fmla v6.4s, v6.4s, v6.4s
+                     ld1 {{ v15.4s }}, [{0}]
+                     fmla v7.4s, v7.4s, v7.4s
+                     ld1 {{ v16.4s }}, [{0}]
+                     ",
+        inout(reg) p,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("fmla_with_ins_32b", 64, {
+        r8!(asm!("
+                     fmla v0.4s, v0.4s, v0.4s
+                     ins v8.s[0], w20
+                     fmla v1.4s, v1.4s, v1.4s
+                     ins v9.s[0], w20
+                     fmla v2.4s, v2.4s, v2.4s
+                     ins v10.s[0], w20
+                     fmla v3.4s, v3.4s, v3.4s
+                     ins v11.s[0], w20
+                     fmla v4.4s, v4.4s, v4.4s
+                     ins v12.s[0], w20
+                     fmla v5.4s, v5.4s, v5.4s
+                     ins v13.s[0], w20
+                     fmla v6.4s, v6.4s, v6.4s
+                     ins v14.s[0], w20
+                     fmla v7.4s, v7.4s, v7.4s
+                     ins v15.s[0], w20
+                     ",
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        out("x20") _,
+        ));
+    });
+    s32!("fmla_with_ins_64b", 64, {
+        r8!(asm!("
+                     fmla v0.4s, v0.4s, v0.4s
+                     ins v8.d[0], x20
+                     fmla v1.4s, v1.4s, v1.4s
+                     ins v9.d[0], x20
+                     fmla v2.4s, v2.4s, v2.4s
+                     ins v10.d[0], x20
+                     fmla v3.4s, v3.4s, v3.4s
+                     ins v11.d[0], x20
+                     fmla v4.4s, v4.4s, v4.4s
+                     ins v12.d[0], x20
+                     fmla v5.4s, v5.4s, v5.4s
+                     ins v13.d[0], x20
+                     fmla v6.4s, v6.4s, v6.4s
+                     ins v14.d[0], x20
+                     fmla v7.4s, v7.4s, v7.4s
+                     ins v15.d[0], x20
+                     ",
+        out("x20") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("fmla_with_ins_64b_cross_parity", 64, {
+        r8!(asm!("
+                     fmla v0.4s, v0.4s, v0.4s
+                     ins v9.d[0], x20
+                     fmla v1.4s, v1.4s, v1.4s
+                     ins v10.d[0], x20
+                     fmla v2.4s, v2.4s, v2.4s
+                     ins v11.d[0], x20
+                     fmla v3.4s, v6.4s, v3.4s
+                     ins v12.d[0], x20
+                     fmla v4.4s, v4.4s, v4.4s
+                     ins v13.d[0], x20
+                     fmla v5.4s, v5.4s, v5.4s
+                     ins v14.d[0], x20
+                     fmla v6.4s, v6.4s, v6.4s
+                     ins v15.d[0], x20
+                     fmla v7.4s, v7.4s, v7.4s
+                     ins v8.d[0], x20
+                     ",
+        out("x20") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("ins_32b_with_load_s", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     ldr s0, [{0}]
+                     ins v8.d[0], x20
+                     ldr s1, [{0}]
+                     ins v9.d[0], x20
+                     ldr s2, [{0}]
+                     ins v10.d[0], x20
+                     ldr s3, [{0}]
+                     ins v11.d[0], x20
+                     ldr s4, [{0}]
+                     ins v12.d[0], x20
+                     ldr s5, [{0}]
+                     ins v13.d[0], x20
+                     ldr s6, [{0}]
+                     ins v14.d[0], x20
+                     ldr s7, [{0}]
+                     ins v15.d[0], x20
+                     ",
+        inout(reg) p,
+        out("x20") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+    s32!("ins_32b_with_load_s_cross_parity", 64, {
+        let mut p = F32;
+        r8!(asm!("
+                     ldr s0, [{0}]
+                     ins v9.d[0], x20
+                     ldr s1, [{0}]
+                     ins v10.d[0], x20
+                     ldr s2, [{0}]
+                     ins v11.d[0], x20
+                     ldr s3, [{0}]
+                     ins v12.d[0], x20
+                     ldr s4, [{0}]
+                     ins v13.d[0], x20
+                     ldr s5, [{0}]
+                     ins v14.d[0], x20
+                     ldr s6, [{0}]
+                     ins v15.d[0], x20
+                     ldr s7, [{0}]
+                     ins v8.d[0], x20
+                     ",
+        inout(reg) p,
+        out("x20") _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        ));
+    });
+}
+
+fn has_asimdhp() -> bool {
+    std::fs::read_to_string("/proc/cpuinfo").unwrap().contains("asimdhp")
+}
+
+#[target_feature(enable = "fp16")]
+pub unsafe fn asimdhp(filter: Option<&str>) {
+    macro_rules! s32 {
+        ($label: literal, $n: expr, $stmt:block) => {
+            if $label.contains(filter.unwrap_or("")) {
+                println!("{:40} {:.2}", $label, b32!($stmt) / $n as f64 / *TICK);
+            }
+        };
+    }
+
+    s32!("fmlahp", 16, {
+        asm!(" fmla v0.8h, v0.8h, v0.8h
+               fmla v1.8h, v1.8h, v1.8h
+               fmla v2.8h, v2.8h, v2.8h
+               fmla v3.8h, v3.8h, v3.8h
+               fmla v4.8h, v4.8h, v4.8h
+               fmla v5.8h, v5.8h, v5.8h
+               fmla v6.8h, v6.8h, v6.8h
+               fmla v7.8h, v7.8h, v7.8h
+               fmla v8.8h, v8.8h, v8.8h
+               fmla v9.8h, v9.8h, v9.8h
+               fmla v10.8h,v10.8h,v10.8h
+               fmla v11.8h,v11.8h,v11.8h
+               fmla v12.8h,v12.8h,v12.8h
+               fmla v13.8h,v13.8h,v13.8h
+               fmla v14.8h,v14.8h,v14.8h
+               fmla v15.8h,v15.8h,v15.8h ",
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        )
+    });
+
+    s32!("fcvt", 16, {
+        asm!(" fcvtn v0.4h,  v0.4s
+               fcvtn v1.4h,  v1.4s 
+               fcvtn v2.4h,  v2.4s 
+               fcvtn v3.4h,  v3.4s 
+               fcvtn v4.4h,  v4.4s 
+               fcvtn v5.4h,  v5.4s 
+               fcvtn v6.4h,  v6.4s 
+               fcvtn v7.4h,  v7.4s 
+               fcvtn v8.4h,  v8.4s 
+               fcvtn v9.4h,  v9.4s 
+               fcvtn v10.4h, v10.4s
+               fcvtn v11.4h, v11.4s
+               fcvtn v12.4h, v12.4s
+               fcvtn v13.4h, v13.4s
+               fcvtn v14.4h, v14.4s
+               fcvtn v15.4h, v15.4s",
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        )
+    });
+
+    s32!("fcvt2", 16, {
+        asm!(" fcvtn2 v0.8h,  v0.4s
+               fcvtn2 v1.8h,  v1.4s 
+               fcvtn2 v2.8h,  v2.4s 
+               fcvtn2 v3.8h,  v3.4s 
+               fcvtn2 v4.8h,  v4.4s 
+               fcvtn2 v5.8h,  v5.4s 
+               fcvtn2 v6.8h,  v6.4s 
+               fcvtn2 v7.8h,  v7.4s 
+               fcvtn2 v8.8h,  v8.4s 
+               fcvtn2 v9.8h,  v9.4s 
+               fcvtn2 v10.8h, v10.4s
+               fcvtn2 v11.8h, v11.4s
+               fcvtn2 v12.8h, v12.4s
+               fcvtn2 v13.8h, v13.4s
+               fcvtn2 v14.8h, v14.4s
+               fcvtn2 v15.8h, v15.4s",
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        )
+    });
+
+    s32!("fmlahp_with_dep", 1, { asm!("fmla v0.8h, v0.8h, v0.8h", out("v0") _) });
+    s32!("fcvtn_with_dep", 1, { asm!("fcvtn v0.4h, v0.4s", out("v0") _) });
+    s32!("fcvtn2_with_dep", 1, { asm!("fcvtn2 v0.8h, v0.4s", out("v0") _) });
+}
+
+macro_rules! ksimd {
+    ($filter: expr, $vector_size: expr, $geo: literal, $n: expr, $path: literal) => {
+        kloop!($filter, $vector_size, $geo, $n, "arm64simd", $path)
+    }
+}
+
+macro_rules! kfp16 {
+    ($filter: expr, $vector_size: expr, $geo: literal, $n: expr, $path: literal) => {
+        kloop!($filter, $vector_size, $geo, $n, "arm64fp16", $path)
+    }
+}
+
+macro_rules! kloop {
+    ($filter: expr, $vector_size: expr, $geo: literal, $n: expr, $dir: literal, $path: literal) => {
+        let label = $path.split("/").last().unwrap().split_once(".").unwrap().0;
+        let full_label = format!("{:8} {:40}", $geo, label);
+        if full_label.contains($filter.unwrap_or("")) {
+            let time = b2!({
+                let mut p = F32;
+                let mut q = F32;
+                r4!(asm!(include_str!(concat!("../arm64/", $dir, "/", $path)),
+                inout("x1") p, inout("x2") q, out("x3") _,
+                out("x4") _, out("x5") _, out("x6") _, out("x7") _,
+                out("x8") _, out("x9") _, out("x10") _, out("x11") _,
+                out("x12") _, out("x13") _, out("x14") _, out("x15") _,
+                out("x20") _, out("x21") _, out("x22") _, out("x23") _,
+                out("x24") _, out("x25") _, out("x26") _, out("x27") _,
+                out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+                out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+                out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+                out("v16") _, out("v17") _, out("v18") _, out("v19") _,
+                out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+                out("v24") _, out("v25") _, out("v26") _, out("v27") _,
+                out("v28") _, out("v29") _, out("v30") _, out("v31") _,
+                ));
+            }) / 4.;
+            println!("{} {:3.0}% ({:0.2}/{} cy)", full_label, $n as f64 / $vector_size as f64 / time * 100. * *TICK, time / *TICK, $n as f64 / $vector_size as f64);
+        }
+    }
+}
+
+unsafe fn f32_8x8(f: Option<&str>) {
+    ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/naive.tmpli");
+    ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/broken_chains.tmpli");
+    ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_no_preload.tmpli");
+    ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_x_preload.tmpli");
+    ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_no_preload.tmpli");
+    ksimd!(f, 4, "8x8x1xf32", 64, "arm64simd_mmm_f32_8x8/packed_packed_loop1/ldr_w_preload.tmpli");
+    ksimd!(f, 4, "8x8x2xf32", 128, "arm64simd_mmm_f32_8x8/packed_packed_loop2/broken_chains.tmpli");
+    ksimd!(f, 4, "8x8x2xf32", 128, "arm64simd_mmm_f32_8x8/packed_packed_loop2/cortex_a55.tmpli");
+}
+
+unsafe fn f32_12x8(f: Option<&str>) {
+    ksimd!(f, 4, "12x8x1xf32", 96, "arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli");
+    ksimd!(f, 4, "12x8x1xf32", 96, "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_no_preload.tmpli");
+    ksimd!(f, 4, "12x8x1xf32", 96, "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_w_preload.tmpli");
+    ksimd!(f, 4, "12x8x1xf32", 96, "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli");
+    ksimd!(f, 4, "12x8x2xf32", 192, "arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli");
+}
+
+unsafe fn f32_16x4(f: Option<&str>) {
+    ksimd!(f, 4, "16x4x1xf32", 64, "arm64simd_mmm_f32_16x4/packed_packed_loop1/naive.tmpli");
+    ksimd!(f, 4, "16x4x1xf32", 64, "arm64simd_mmm_f32_16x4/packed_packed_loop1/cortex_a53.tmpli");
+    ksimd!(f, 4, "16x4x2xf32", 128, "arm64simd_mmm_f32_16x4/packed_packed_loop2/cortex_a55.tmpli");
+}
+
+unsafe fn f32_24x4(f: Option<&str>) {
+    ksimd!(f, 4, "24x4x1xf32", 96, "arm64simd_mmm_f32_24x4/packed_packed_loop1/naive.tmpli");
+    ksimd!(f, 4, "24x4x1xf32", 96, "arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a53.tmpli");
+    ksimd!(f, 4, "24x4x1xf32", 96, "arm64simd_mmm_f32_24x4/packed_packed_loop1/cortex_a55.tmpli");
+}
+
+unsafe fn f32_64x1(f: Option<&str>) {
+    ksimd!(f, 4, "64x1x1xf32", 64, "arm64simd_mmm_f32_64x1/loop1/naive.tmpli");
+    ksimd!(f, 4, "64x1x1xf32", 64, "arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli");
+    ksimd!(f, 4, "64x1x2xf32", 128, "arm64simd_mmm_f32_64x1/loop2/naive.tmpli");
+    ksimd!(f, 4, "64x1x2xf32", 128, "arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli");
+}
+
+// RUSTFLAGS="-C target-feature=+fp16" cargo +nightly dinghy -d khadas-paris bench --bench arm64simd
+#[target_feature(enable = "fp16")]
+unsafe fn f16_16x8(f: Option<&str>) {
+    kfp16!(f, 8, "16x8x1xf16", 128, "arm64fp16_mmm_f16_16x8/loop1/naive.tmpli");
+    kfp16!(f, 8, "16x8x2xf16", 256, "arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli");
+    kfp16!(f, 8, "32x4x1xf16", 128, "arm64fp16_mmm_f16_32x4/loop1/naive.tmpli");
+    kfp16!(f, 8, "32x4x2xf16", 256, "arm64fp16_mmm_f16_32x4/loop2/cortex_a55.tmpli");
+}
+
+fn main() {
+    println!("freq {:.2}GHz\n", 1e-9 / *TICK);
+
+    let filter = std::env::args().skip(1).filter(|a| a != "--bench").next();
+    unsafe {
+        armv8(filter.as_deref());
+        if has_asimdhp() {
+            asimdhp(filter.as_deref());
+        }
+        f32_8x8(filter.as_deref());
+        f32_12x8(filter.as_deref());
+        f32_16x4(filter.as_deref());
+        f32_24x4(filter.as_deref());
+        f32_64x1(filter.as_deref());
+        f16_16x8(filter.as_deref());
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/benches/intel.rs b/vendor/tract-linalg-0.22.1/benches/intel.rs
new file mode 100644
index 000000000..d45dcf086
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/intel.rs
@@ -0,0 +1,200 @@
+#![allow(dead_code)]
+use std::time::Instant;
+
+use tract_data::prelude::*;
+use tract_linalg::frame::mmm::*;
+
+
+fn ruin_cache() {
+    // return;
+    let _a = (0..1000000).collect::<Vec<i32>>();
+}
+
+pub fn reference<T, K>(mr: usize, k: usize, nr: usize) -> Vec<f32>
+where
+    T: Datum + Copy + num_traits::Zero + tract_linalg::LADatum,
+    K: MatMatMulKer<T>,
+{
+    let mut vi = vec![0.0; k * nr];
+
+    for m in 0..mr {
+        for n in 0..nr {
+            for _ in 0..k {
+                let a: f32 = 1.0;
+                let b = 1.0;
+                let offset = { n + m * nr };
+                vi[offset] += a * b;
+            }
+        }
+    }
+    vi
+}
+
+fn bench_to_nanos<
+    T: Datum + Copy + num_traits::Zero + tract_linalg::LADatum,
+    K: MatMatMulKer<T>,
+>(
+    loops: usize,
+    m: usize,
+    n: usize,
+    k: usize,
+) -> f64 {
+    let kernel = K::mmm();
+
+    let mut a = Tensor::zero_aligned::<T>(
+        &[(k + K::end_padding_packed_a()) * m],
+        K::alignment_bytes_packed_a(),
+    )
+    .unwrap();
+
+    let mut v = a.to_array_view_mut::<f32>().unwrap();
+    v += 1.0;
+    let mut b = Tensor::zero_aligned::<T>(
+        &[(k + K::end_padding_packed_b()) * n],
+        K::alignment_bytes_packed_b(),
+    )
+    .unwrap();
+
+    let mut v = b.to_array_view_mut::<f32>().unwrap();
+    v += 1.0;
+    let mut c = Tensor::zero::<T>(&[n, m]).unwrap();
+
+    let ops = unsafe {
+        [
+            FusedSpec::AddMatMul {
+                k,
+                a: kernel.a_packed(4, k).wrap(&a.view()),
+                b: kernel.b_packed(4, k).wrap(&b.view()),
+            },
+            // FusedSpec::AddUnicast(kernel.c_view(1, 0).wrap(&c.view_mut())),
+            FusedSpec::Store(kernel.c_view(1, 0).wrap(&c.view_mut())),
+        ]
+    };
+
+    let mut values = Vec::with_capacity(loops);
+
+    for _ in 0..loops {
+        ruin_cache();
+        let start = Instant::now();
+        unsafe { kernel.run(m, n, &ops).unwrap() };
+        values.push(start.elapsed());
+    }
+
+    eprintln!("{:?} -> {:?}", values.first().unwrap(), values.last().unwrap());
+
+    values.sort();
+    values[loops / 2].as_nanos() as f64
+}
+
+fn model<T: Datum + Copy + num_traits::Zero + tract_linalg::LADatum, K: MatMatMulKer<T>>(
+) -> (f64, f64) {
+    let x = 1000;
+    let zp = bench_to_nanos::<T, K>(1000, K::mr() * 4, K::nr() * 4, 0);
+    let y = bench_to_nanos::<T, K>(1000, K::mr() * 4, K::nr() * 4, x);
+    let slope = (y - zp) / x as f64;
+    (slope, zp)
+}
+
+fn as_match_line<T: Datum + Copy + num_traits::Zero + tract_linalg::LADatum, K: MatMatMulKer<T>>() {
+    let coeffs = model::<T, K>();
+    println!("({:?}, {}, {}) => {} * k + {}", K::name(), K::mr(), K::nr(), (coeffs.0), (coeffs.1),);
+}
+
+fn main() {
+    
+    let core_id = core_affinity::get_core_ids().unwrap()[0];
+    core_affinity::set_for_current(core_id);
+    // as_match_line::<f32, fma_mmm_f32_64x1>();
+    // as_match_line::<f32, avx512_mmm_f32_128x1>();
+    // as_match_line::<f32, avx512_mmm_f32_16x1>();
+    // as_match_line::<f32, fma_mmm_f32_40x2>();
+    // as_match_line::<f32, fma_mmm_f32_32x3>();
+    // as_match_line::<f32, fma_mmm_f32_24x4>();
+    // as_match_line::<f32, fma_mmm_f32_16x5>();
+    // as_match_line::<f32, fma_mmm_f32_16x6>();
+    // as_match_line::<f32, fma_mmm_f32_8x8>();
+
+    // mmv_perf_m();
+    mmm_perf_batch_size();
+}
+
+// for mmv
+fn mmv_perf_m() {
+    use tract_linalg::x86_64_fma::mmm::*;
+    let core_id = core_affinity::get_core_ids().unwrap()[0];
+    core_affinity::set_for_current(core_id);
+    fn bench<T: Datum + Copy + num_traits::Zero + tract_linalg::LADatum, K: MatMatMulKer<T>>(
+        m: usize,
+    ) {
+        let val = bench_to_nanos::<T, K>(1000, m, 1, 100) / (m * 100) as f64;
+        print!("{val}\t");
+    }
+
+    print!("N\t");
+    print!("fma_mmm_f32_64x1\t");
+    print!("avx512_mmm_f32_128x1\t");
+    print!("avx512_mmm_f32_16x1\t");
+    println!();
+    for n in 1..=128 {
+        eprintln!("{n}");
+        print!("{n}\t");
+        bench::<f32, fma_mmm_f32_64x1>(n);
+        bench::<f32, avx512_mmm_f32_128x1>(n);
+        bench::<f32, avx512_mmm_f32_16x1>(n);
+        println!();
+    }
+}
+
+// output a csv file with the perf of the kernels wrt batch size
+fn mmm_perf_batch_size() {
+    use tract_linalg::x86_64_fma::mmm::*;
+    let core_id = core_affinity::get_core_ids().unwrap()[0];
+    core_affinity::set_for_current(core_id);
+    fn bench<T: Datum + Copy + num_traits::Zero + tract_linalg::LADatum, K: MatMatMulKer<T>>(
+        n: usize,
+    ) {
+        let val =
+            bench_to_nanos::<T, K>(1000, K::mr() * 4, n, 100) / (K::mr() * 4 * 100 * n) as f64;
+        print!("{val}\t");
+    }
+
+    print!("N\t");
+    print!("fma_mmm_f32_8x8\t");
+    print!("fma_mmm_f32_16x6\t");
+    print!("fma_mmm_f32_16x5\t");
+    print!("fma_mmm_f32_24x4\t");
+    print!("fma_mmm_f32_32x3\t");
+    print!("fma_mmm_f32_40x2\t");
+    print!("fma_mmm_f32_64x1\t");
+    print!("avx512_mmm_f32_128x1\t");
+    print!("avx512_mmm_f32_16x1\t");
+    print!("avx512_mmm_f32_16x12\t");
+    print!("avx512_mmm_f32_16x8\t");
+    print!("avx512_mmm_f32_32x6\t");
+    print!("avx512_mmm_f32_32x5\t");
+    print!("avx512_mmm_f32_48x4\t");
+    print!("avx512_mmm_f32_64x3\t");
+    print!("avx512_mmm_f32_80x2\t");
+    println!();
+    for n in 1..=128 {
+        eprintln!("{n}");
+        print!("{n}\t");
+        bench::<f32, fma_mmm_f32_8x8>(n);
+        bench::<f32, fma_mmm_f32_16x6>(n);
+        bench::<f32, fma_mmm_f32_16x5>(n);
+        bench::<f32, fma_mmm_f32_24x4>(n);
+        bench::<f32, fma_mmm_f32_32x3>(n);
+        bench::<f32, fma_mmm_f32_40x2>(n);
+        bench::<f32, fma_mmm_f32_64x1>(n);
+        bench::<f32, avx512_mmm_f32_128x1>(n);
+        bench::<f32, avx512_mmm_f32_16x1>(n);
+        bench::<f32, avx512_mmm_f32_16x12>(n);
+        bench::<f32, avx512_mmm_f32_16x8>(n);
+        bench::<f32, avx512_mmm_f32_32x6>(n);
+        bench::<f32, avx512_mmm_f32_32x5>(n);
+        bench::<f32, avx512_mmm_f32_48x4>(n);
+        bench::<f32, avx512_mmm_f32_64x3>(n);
+        bench::<f32, avx512_mmm_f32_80x2>(n);
+        println!();
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/benches/leaky_relu.rs b/vendor/tract-linalg-0.22.1/benches/leaky_relu.rs
new file mode 100644
index 000000000..31873fc68
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/leaky_relu.rs
@@ -0,0 +1,63 @@
+use criterion::*;
+use tract_data::prelude::*;
+
+use tract_linalg::element_wise::ElementWiseKer;
+
+fn leaky_relu_f16(c: &mut Criterion) {
+    let mut group = c.benchmark_group("leaky_relu_f16");
+    group.throughput(Throughput::Elements(1024));
+    let mut input = unsafe { Tensor::uninitialized_aligned::<f16>(&[1024], 16).unwrap() };
+    let input = input.as_slice_mut::<f16>().unwrap();
+    let alpha = f16::from_f32(0.1);
+    group.bench_function("rust", |b| b.iter(|| rust_fp16(input, alpha)));
+    group.bench_function("rust_with_f16", |b| b.iter(|| unsafe { rust_with_fp16(input, alpha) }));
+    group.bench_function("linalg", |b| b.iter(|| linalg16(input, alpha)));
+    group.bench_function("linalg-asm", |b| b.iter(|| tract_linalg::arm64::arm64fp16_leaky_relu_f16_16n::run(input, alpha)));
+}
+
+#[inline(never)]
+fn rust_fp16(input: &mut [f16], alpha: f16) {
+    for x in input {
+        *x = if *x > f16::ZERO { *x } else { *x * alpha }
+    }
+}
+
+#[target_feature(enable = "fp16")]
+#[inline(never)]
+unsafe fn rust_with_fp16(input: &mut [f16], alpha: f16) {
+    for x in input {
+        *x = if *x > f16::ZERO { *x } else { *x * alpha }
+    }
+}
+
+#[inline(never)]
+fn linalg16(input: &mut [f16], alpha: f16) {
+    (tract_linalg::ops().leaky_relu_f16)().run_with_params(input, alpha).unwrap();
+}
+
+fn leaky_relu_f32(c: &mut Criterion) {
+    let mut group = c.benchmark_group("leaky_relu_f32");
+    group.throughput(Throughput::Elements(1024));
+    let mut input = unsafe { Tensor::uninitialized_aligned::<f32>(&[1024], 16).unwrap() };
+    let input = input.as_slice_mut::<f32>().unwrap();
+    let alpha = 0.1f32;
+    group.bench_function("rust", |b| b.iter(|| rust_fp32(input, alpha)));
+    group.bench_function("linalg", |b| b.iter(|| linalg32(input, alpha)));
+    group.bench_function("linalg-asm", |b| b.iter(|| tract_linalg::arm64::arm64simd_leaky_relu_f32_8n::run(input, alpha)));
+}
+
+#[inline(never)]
+fn rust_fp32(input: &mut [f32], alpha: f32) {
+    for x in input {
+        *x = if *x > 0.0 { *x } else { *x * alpha }
+    }
+}
+
+#[inline(never)]
+fn linalg32(input: &mut [f32], alpha: f32) {
+    (tract_linalg::ops().leaky_relu_f32)().run_with_params(input, alpha).unwrap();
+}
+
+
+criterion_group!(benches, leaky_relu_f32, leaky_relu_f16);
+criterion_main!(benches);
diff --git a/vendor/tract-linalg-0.22.1/benches/mat_vec.rs b/vendor/tract-linalg-0.22.1/benches/mat_vec.rs
new file mode 100644
index 000000000..2c63bfc92
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/mat_vec.rs
@@ -0,0 +1,46 @@
+use criterion::*;
+use tract_data::internal::*;
+use tract_linalg::mmm::{AsInputValue, FusedSpec};
+
+use DatumType::F32;
+
+fn mat_vec_mul(c: &mut Criterion) {
+    let mut group = c.benchmark_group("mat_vec_mul");
+    unsafe {
+        {
+            let (m, k) = &(768usize, 256usize);
+            group.throughput(Throughput::Elements((m * k) as u64));
+            group.bench_with_input(
+                BenchmarkId::from_parameter(format!("{m}x{k}")),
+                &(m, k),
+                |be, &(&m, &k)| {
+                    let mmm = tract_linalg::ops().mmm(F32, Some(m), Some(k), Some(1)).unwrap();
+                    let packing = &mmm.packings()[0];
+                    let a = Tensor::zero::<f32>(&[m, k]).unwrap();
+                    let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+                    let b = Tensor::zero::<f32>(&[k, 1]).unwrap();
+                    let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+                    let mut c = Tensor::zero::<f32>(&[m]).unwrap();
+                    be.iter(move || {
+                        mmm.run(
+                            m,
+                            1,
+                            &[
+                                FusedSpec::AddMatMul {
+                                    a: AsInputValue::Borrowed(&*pa),
+                                    b: AsInputValue::Borrowed(&*pb),
+                                    packing: 0,
+                                },
+                                FusedSpec::Store(mmm.c_view(Some(0), Some(0)).wrap(&c.view_mut())),
+                            ],
+                        )
+                    });
+                },
+            );
+        }
+    }
+    group.finish();
+}
+
+criterion_group!(benches, mat_vec_mul);
+criterion_main!(benches);
diff --git a/vendor/tract-linalg-0.22.1/benches/mm_for_asr_am.rs b/vendor/tract-linalg-0.22.1/benches/mm_for_asr_am.rs
new file mode 100644
index 000000000..b350bed87
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/mm_for_asr_am.rs
@@ -0,0 +1,37 @@
+use criterion::*;
+
+mod utils;
+use utils::*;
+
+fn all(c: &mut Criterion) {
+    // packed_packed: co, ci, n
+//    direct_conv(c, "asr_2M", 24, 5, 40, 200, 1); // lda
+    packed_packed(c, "asr_2M", 256, 200, 24); // tdnn1
+//    direct_conv(c, "asr_2M", 24, 3, 256, 256, 1); // tdnn2
+//    direct_conv(c, "asr_2M", 24, 3, 256, 256, 3); // tdnn3
+    packed_packed(c, "asr_2M", 256, 256, 8); // fastlstm1 and 2 (input) x 8 (4 prod x 2 layers)
+    packed_packed(c, "asr_2M", 256, 128, 1); // fastlstm1 and 2 (hidden) x 64 (4 prod x 2 layers x 8 loops)
+    packed_packed(c, "asr_2M", 256, 256, 1); // fastlstm1 and 2 (rp) x 16 (2 layers x 8 loops)
+//    direct_conv(c, "asr_2M", 8, 3, 256, 256, 1); // tdnn4, tdd5 (x2)
+    packed_packed(c, "asr_2M", 1690, 256, 8); // output
+
+    // 8M
+    packed_packed(c, "asr_8M", 512, 200, 24); // tdnn1
+    packed_packed(c, "asr_8M", 512, 512, 24); // tdnn2
+    packed_packed(c, "asr_8M", 512, 256, 1); // fastlstm1 and 2 (four parts, rec mat*vec)
+    packed_vec(c, "asr_8M", 512, 256, 1); // fastlstm1 and 2 (four parts, rec mat*vec)
+
+    // pseudo 15M
+    packed_packed(c, "asr_pseudo15M", 768, 200, 24); // tdnn1
+    packed_packed(c, "asr_pseudo15M", 768, 2304, 24); // tdnn2
+    packed_packed(c, "asr_pseudo15M", 768, 2304, 8); // tdnn3,4,5
+    packed_packed(c, "asr_pseudo15M", 768, 768, 8); // fastlstm1 and 2 (four parts, rec mat*mat)
+    packed_packed(c, "asr_pseudo15M", 768, 384, 1); // fastlstm1 and 2 (four parts, rec mat*vec)
+    packed_vec(c, "asr_pseudo15M", 768, 384, 1); // fastlstm1 and 2 (four parts, rec mat*vec)
+
+    // 15M
+    packed_vec(c, "asr_15M", 768, 256, 1); // fastlstm1 and 2 (four parts, rec mat*vec)
+}
+
+criterion_group!(benches, all);
+criterion_main!(benches);
diff --git a/vendor/tract-linalg-0.22.1/benches/mm_for_inception.rs b/vendor/tract-linalg-0.22.1/benches/mm_for_inception.rs
new file mode 100644
index 000000000..218d85f0a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/mm_for_inception.rs
@@ -0,0 +1,45 @@
+extern crate criterion;
+use criterion::*;
+use tract_data::internal::*;
+use tract_linalg::mmm::{AsInputValue, FusedSpec};
+
+use DatumType::F32;
+
+fn mat_mul_smmm(be: &mut criterion::Bencher, &(m, k, n): &(usize, usize, usize)) {
+    unsafe {
+        let mmm = tract_linalg::ops().mmm(F32, Some(m), Some(k), Some(n)).unwrap();
+        let a = Tensor::zero::<f32>(&[m, k]).unwrap();
+        let b = Tensor::zero::<f32>(&[k, n]).unwrap();
+        let packing = &mmm.packings()[0];
+        let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+        let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+
+        let mut c = Tensor::zero::<f32>(&[m, n]).unwrap();
+        be.iter(move || {
+            mmm.run(
+                m,
+                n,
+                &[
+                    FusedSpec::AddMatMul {
+                        a: AsInputValue::Borrowed(&*pa),
+                        b: AsInputValue::Borrowed(&*pb),
+                        packing: 0,
+                    },
+                    FusedSpec::Store(mmm.c_view(Some(0), Some(1)).wrap(&c.view_mut())),
+                ],
+            )
+        });
+    }
+}
+
+fn mat_mul_prepacked(c: &mut Criterion, m: usize, k: usize, n: usize) {
+    let mut group = c.benchmark_group("mat_mul_prepacked");
+    group.bench_function("smmm", |be| mat_mul_smmm(be, &(m, k, n)));
+}
+
+fn s64x288x21609(c: &mut Criterion) {
+    mat_mul_prepacked(c, 64, 288, 21609)
+}
+
+criterion::criterion_group!(benches, s64x288x21609);
+criterion::criterion_main!(benches);
diff --git a/vendor/tract-linalg-0.22.1/benches/mm_for_wavenet_hw.rs b/vendor/tract-linalg-0.22.1/benches/mm_for_wavenet_hw.rs
new file mode 100644
index 000000000..060db5381
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/mm_for_wavenet_hw.rs
@@ -0,0 +1,12 @@
+use criterion::*;
+
+mod utils;
+use utils::*;
+
+fn s16x60x8(c: &mut Criterion) {
+    packed_packed(c, "wavenet", 32, 32, 8); // postproc
+    packed_packed(c, "wavenet", 16, 60, 8);
+}
+
+criterion_group!(benches, s16x60x8);
+criterion_main!(benches);
diff --git a/vendor/tract-linalg-0.22.1/benches/sigmoid.rs b/vendor/tract-linalg-0.22.1/benches/sigmoid.rs
new file mode 100644
index 000000000..c9868b654
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/sigmoid.rs
@@ -0,0 +1,22 @@
+#[macro_use]
+extern crate criterion;
+extern crate tract_linalg;
+use criterion::Criterion;
+
+fn ssigmoid(c: &mut Criterion, n: usize) {
+    c.bench_function(&format!("ssigmoid_{n}"), move |be| {
+        let mut s = (0..n).map(|i| i as f32 / 10.0).collect::<Vec<f32>>();
+        let op = &(tract_linalg::ops().sigmoid_f32)();
+        be.iter(|| op.run(&mut s));
+    });
+}
+
+fn bs(c: &mut Criterion) {
+    ssigmoid(c, 4);
+    ssigmoid(c, 8);
+    ssigmoid(c, 128);
+    ssigmoid(c, 1024);
+}
+
+criterion_group!(benches, bs);
+criterion_main!(benches);
diff --git a/vendor/tract-linalg-0.22.1/benches/softmax.rs b/vendor/tract-linalg-0.22.1/benches/softmax.rs
new file mode 100644
index 000000000..87ce17360
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/softmax.rs
@@ -0,0 +1,110 @@
+use criterion::*;
+use tract_data::prelude::*;
+use tract_linalg::element_wise::ElementWiseKer;
+use tract_linalg::generic::reduce::softmax_l2::SSoftMaxL2;
+use tract_linalg::reduce::{MapReduceKer, ReduceKer};
+
+#[inline(never)]
+fn loop1_f32_naive(slice: &mut [f32]) -> f32 {
+    let mut max = f32::MIN;
+    for x in &*slice {
+        if *x > max {
+            max = *x;
+        }
+    }
+    max
+}
+
+#[inline(never)]
+fn loop2_f32(slice: &mut [f32], max: f32) -> f32 {
+    let mut sum = 0.;
+    for x in slice.iter_mut() {
+        *x = (*x - max).exp();
+        sum += *x;
+    }
+    sum
+}
+
+#[inline(never)]
+fn loop3_f32(slice: &mut [f32], sum: f32) {
+    let recip = sum.recip();
+    for x in slice {
+        *x *= recip;
+    }
+}
+
+#[inline(never)]
+fn rust_f32(slice: &mut [f32]) {
+    let max = loop1_f32_naive(slice);
+    let sum = loop2_f32(slice, max);
+    loop3_f32(slice, sum);
+}
+
+fn softmax_f32(c: &mut Criterion) {
+    let mut group = c.benchmark_group("softmax_f32");
+    group.throughput(Throughput::Elements(1500));
+    let mut input = unsafe { Tensor::uninitialized_aligned::<f32>(&[1500], 16).unwrap() };
+    let input = input.as_slice_mut::<f32>().unwrap();
+    group.bench_function("rust", |b| b.iter(|| rust_f32(input)));
+    group.bench_function("loop1/naive", |b| b.iter(|| loop1_f32_naive(input)));
+    group.bench_function("loop1/generic", |b| {
+        b.iter(|| tract_linalg::generic::reduce::max::SMax4::red().run(input))
+    });
+    #[cfg(target_arch = "x86_64")]
+    group.bench_function("loop1/iasm", |b| {
+        b.iter(|| {
+            tract_linalg::x86_64_fma::max::x86_64_fma_max_f32_32n::red().run(input).unwrap();
+        })
+    });
+    #[cfg(target_arch = "aarch64")]
+    group.bench_function("loop1/intr", |b| {
+        b.iter(|| {
+            tract_linalg::arm64::arm64simd_max_f32_16n::red().run(input).unwrap();
+        })
+    });
+    group.bench_function("loop2/naive", |b| b.iter(|| loop2_f32(input, 1.0)));
+    group.bench_function("loop2/generic", |b| {
+        b.iter(|| SSoftMaxL2::red().run_with_params(input, 10.))
+    });
+    #[cfg(target_arch = "x86_64")]
+    group.bench_function("loop2/iasm", |b| {
+        b.iter(|| {
+            tract_linalg::x86_64_fma::softmax::x86_64_fma_softmax2_fastcompact_f32_32n::red()
+                .run_with_params(input, 10.)
+                .unwrap()
+        });
+    });
+    #[cfg(target_arch = "aarch64")]
+    group.bench_function("loop2/iasm", |b| {
+        b.iter(|| {
+            tract_linalg::arm64::arm64simd_softmax2_fastcompact_f32_16n::red()
+                .run_with_params(input, 0.21)
+                .unwrap()
+        });
+    });
+    group.bench_function("loop3/naive", |b| b.iter(|| loop3_f32(input, 0.21)));
+    group.bench_function("loop3/generic", |b| {
+        b.iter(|| {
+            tract_linalg::generic::by_scalar::SMulByScalar4::ew().run_with_params(input, 0.21)
+        })
+    });
+    #[cfg(target_arch = "x86_64")]
+    group.bench_function("loop3/iasm", |b| {
+        b.iter(|| {
+            tract_linalg::x86_64_fma::by_scalar::x86_64_avx_f32_mul_by_scalar_32n::ew()
+                .run_with_params(input, 0.21)
+                .unwrap()
+        });
+    });
+    #[cfg(target_arch = "aarch64")]
+    group.bench_function("loop3/iasm", |b| {
+        b.iter(|| {
+            tract_linalg::arm64::arm64simd_mul_by_scalar_f32_16n::ew()
+                .run_with_params(input, 0.21)
+                .unwrap()
+        });
+    });
+}
+
+criterion_group!(benches, softmax_f32);
+criterion_main!(benches);
diff --git a/vendor/tract-linalg-0.22.1/benches/utils.rs b/vendor/tract-linalg-0.22.1/benches/utils.rs
new file mode 100644
index 000000000..8d513a5dd
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/utils.rs
@@ -0,0 +1,92 @@
+#![allow(dead_code)]
+use criterion::*;
+use tract_data::internal::*;
+use tract_linalg::mmm::{FusedSpec, MMMInputValue, MatMatMul};
+
+use tract_linalg::mmm::AsInputValue;
+use DatumType::*;
+
+pub fn packed_packed(c: &mut Criterion, name: &str, m: usize, k: usize, n: usize) {
+    let mut group = c.benchmark_group(format!("{name}/packed_packed"));
+    group.throughput(Throughput::Elements((m * k * n) as u64));
+    let id = format!("{m}x{k}x{n}");
+    group.bench_with_input(BenchmarkId::new("f32/cold", &id), &(F32, m, k, n, true), mat_mat);
+    group.bench_with_input(BenchmarkId::new("f32/hot", &id), &(F32, m, k, n, false), mat_mat);
+    group.bench_with_input(BenchmarkId::new("i8/cold", &id), &(I8, m, k, n, true), mat_mat);
+    group.bench_with_input(BenchmarkId::new("i8/hot", &id), &(I8, m, k, n, false), mat_mat);
+}
+
+pub fn packed_vec(c: &mut Criterion, name: &str, m: usize, k: usize, n: usize) {
+    assert_eq!(n, 1);
+    let mut group = c.benchmark_group(format!("{name}/packed_vec"));
+    group.throughput(Throughput::Elements((m * k * n) as u64));
+    let id = format!("{m}x{k}x{n}");
+    group.bench_with_input(BenchmarkId::new("f32/cold", &id), &(F32, m, k, n, true), mat_mat);
+    group.bench_with_input(BenchmarkId::new("f32/hot", &id), &(F32, m, k, n, false), mat_mat);
+    group.bench_with_input(BenchmarkId::new("i8/cold", &id), &(I8, m, k, n, true), mat_mat);
+    group.bench_with_input(BenchmarkId::new("i8/hot", &id), &(I8, m, k, n, false), mat_mat);
+}
+
+pub fn ruin_cache() {
+    let _a = (0..1000000).collect::<Vec<i32>>();
+}
+
+#[allow(clippy::too_many_arguments)]
+unsafe fn run(
+    m: usize,
+    _k: usize,
+    n: usize,
+    be: &mut Bencher,
+    mmm: &dyn MatMatMul,
+    a: &dyn MMMInputValue,
+    b: &dyn MMMInputValue,
+    cold: bool,
+) {
+    let mut scratch = unsafe { mmm.allocate_scratch_space() };
+    be.iter_custom(move |iters| {
+        let mut dur = std::time::Duration::default();
+        for _ in 0..iters {
+            if cold {
+                ruin_cache();
+            }
+            let instant = std::time::Instant::now();
+            unsafe {
+                mmm.run_with_scratch_space(
+                    m,
+                    n,
+                    scratch.as_mut(),
+                    &[FusedSpec::AddMatMul {
+                        a: AsInputValue::Borrowed(a),
+                        b: AsInputValue::Borrowed(b),
+                        packing: 0,
+                    }],
+                )
+                .unwrap()
+            };
+            let time = instant.elapsed();
+            dur += time;
+        }
+        dur
+    });
+}
+
+fn mat_mat(be: &mut Bencher, params: &(DatumType, usize, usize, usize, bool)) {
+    let (dt, m, k, n, _) = *params;
+    let mm = tract_linalg::ops().mmm(dt, Some(m), Some(k), Some(n)).unwrap();
+    mat_mat_with_mm(be, &*mm, params)
+}
+
+pub fn mat_mat_with_mm(
+    be: &mut Bencher,
+    mmm: &dyn MatMatMul,
+    &(dt, m, k, n, cold): &(DatumType, usize, usize, usize, bool),
+) {
+    let a = Tensor::zero_dt(dt, &[m, k]).unwrap();
+    let b = Tensor::zero_dt(dt, &[k, n]).unwrap();
+    let packing = &mmm.packings()[0];
+    let pa = packing.0.prepare_one(&a, 1, 0).unwrap();
+    let pb = packing.1.prepare_one(&b, 0, 1).unwrap();
+    unsafe {
+        run(m, k, n, be, mmm, &*pa, &*pb, cold);
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/benches/virtual_im2col.rs b/vendor/tract-linalg-0.22.1/benches/virtual_im2col.rs
new file mode 100644
index 000000000..8e7309042
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/virtual_im2col.rs
@@ -0,0 +1,47 @@
+use criterion::measurement::WallTime;
+use criterion::*;
+use tract_data::internal::*;
+
+#[allow(dead_code)]
+#[path = "../tests/virtual_im2col.rs"]
+mod virtual_im2col;
+use virtual_im2col::ConvProblem;
+
+fn conv(
+    c: &mut BenchmarkGroup<WallTime>,
+    ci: usize,
+    h: usize,
+    w: usize,
+    co: usize,
+    kh: usize,
+    kw: usize,
+) {
+    // CHW HWIO
+    let input = Tensor::zero::<f32>(&[ci, h, w]).unwrap();
+    let filters = Tensor::zero::<f32>(&[kh, kw, ci, co]).unwrap();
+    let mut cv = ConvProblem { input, filters, lazy_im2col: false };
+    c.bench_function("eager", |b| {
+        b.iter(|| {
+            cv.tract().unwrap();
+        })
+    });
+    cv.lazy_im2col = true;
+    c.bench_function("lazy", |b| {
+        b.iter(|| {
+            cv.tract().unwrap();
+        })
+    });
+}
+
+fn ex1(c: &mut Criterion) {
+    let mut c = c.benchmark_group("ex1");
+    conv(&mut c, 32, 256, 256, 32, 3, 3);
+}
+
+fn big(c: &mut Criterion) {
+    let mut c = c.benchmark_group("big");
+    conv(&mut c, 1, 1024, 1024, 99, 3, 3);
+}
+
+criterion_group!(benches, ex1, big);
+criterion_main!(benches);
diff --git a/vendor/tract-linalg-0.22.1/benches/x86_64.rs b/vendor/tract-linalg-0.22.1/benches/x86_64.rs
new file mode 100644
index 000000000..2da68f812
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/benches/x86_64.rs
@@ -0,0 +1,242 @@
+#![allow(dead_code, non_upper_case_globals, unused_macros, non_snake_case, unused_assignments)]
+
+use std::arch::asm;
+
+mod nano;
+
+#[repr(C, align(64))]
+struct Floats([f32; 256 * 1024 * 64]);
+const _F32: Floats = Floats([12.; 256 * 1024 * 64]);
+const F32: *const f32 = (&_F32) as *const Floats as *const f32;
+
+lazy_static::lazy_static! {
+    static ref TICK: f64 = unsafe { b8192!(asm!("or rax, rax", out("rax") _)) };
+}
+
+macro_rules! kloop {
+    ($filter: expr, $geo: literal, $n: expr, $path: literal, $ww: expr, $u: expr, $arch: expr) => {
+        let label = $path.split("/").last().unwrap().split_once(".").unwrap().0;
+        let full_label = format!("{:8} {:40}", $geo, label);
+		let repeats = 32;
+		let ks = 256;
+        if full_label.contains($filter.unwrap_or("")) {
+            let time = b1!({
+
+				let mut p = F32;
+				let mut q = F32;
+				let mut k = ks;
+				let mut r = repeats;
+				asm!(
+					concat!(r#"
+2:
+      mov rax, r9
+      mov rcx, r10
+      mov r8, r12
+3:
+    "#, include_str!(			concat!("../x86_64/", $arch, "/", $path)), "\n sub r8, ", $u, r#"
+jnz 3b
+
+sub r11, 1
+jnz 2b
+"#),
+					inout("r9") p, inout("r10") q, inout("r12") k, inout("r11") r, out("rax") _, out("rcx") _,
+					out("r8") _,
+					out("zmm0") _, out("zmm1") _, out("zmm2") _, out("zmm3") _,
+					out("zmm4") _, out("zmm5") _, out("zmm6") _, out("zmm7") _,
+					out("zmm8") _, out("zmm9") _, out("zmm10") _, out("zmm11") _,
+					out("zmm12") _, out("zmm13") _, out("zmm14") _, out("zmm15") _,
+					out("zmm20") _, out("zmm21") _, out("zmm22") _, out("zmm23") _,
+					out("zmm24") _, out("zmm25") _, out("zmm26") _, out("zmm27") _,
+                    out("zmm28") _,  out("zmm29") _,  out("zmm30") _,  out("zmm31") _,
+				);
+            });
+
+			// We have k=1024 * 64 but some tests step twice per iteration
+			let iterations = (ks * repeats / $u);
+			// Those that step twice process twice as many elements per iteration
+			let elems_per_iteration = $n * $u;
+
+			let time_per_iteration = time / iterations  as f64;
+
+			let total_floats = elems_per_iteration * iterations;
+			let flops = total_floats as f64 / time;
+
+			let total_time_ms = time * 1e6;
+			let fmas_per_iteration = ($n as f64 / $ww as f64) * $u as f64;
+			let ticks_per_iteration = time_per_iteration / *TICK;
+            println!("{} {:3.5} {:3.0}% ({:>5.2 }/{:3 } cy) {:.2} GFLOP/s", full_label, total_time_ms, fmas_per_iteration / ticks_per_iteration * 100., ticks_per_iteration, fmas_per_iteration, flops / 1e9 );
+        }
+    };
+
+	($filter: expr, $geo: literal, $n: expr, $path: literal, $ww: expr) => {
+		kloop!($filter, $geo, $n, $path, $ww, 1, "fma")
+	};
+	($filter: expr, $geo: literal, $n: expr, $path: literal, $ww: expr, $u: expr) => {
+		kloop!($filter, $geo, $n, $path, $ww, $u, "fma")
+	};
+}
+
+unsafe fn packed_packed_1x12(f: Option<&str>) {
+    println!("-- 1x12 kernels");
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "1x12x1", (16 * 1 * 12), "1x12/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_1x8(f: Option<&str>) {
+    println!("-- 1x8 kernels");
+    kloop!(f, "1x8x1", (8 * 8), "8x8/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "1x8x2", (8 * 8), "8x8/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "1x8x1", (16 * 1 * 8), "8x8/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_2x6(f: Option<&str>) {
+    println!("-- 2x6 kernels");
+    kloop!(f, "2x6x1", (16 * 6), "2x6/packed_packed_loop1/original.tmpli", 8);
+    kloop!(f, "2x6x2", (16 * 6), "2x6/packed_packed_loop1/original-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "2x6x1", (16 * 2 * 6), "2x6/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "2x6x2", (16 * 2 * 6), "2x6/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_2x5(f: Option<&str>) {
+    println!("-- 2x5 kernels");
+    kloop!(f, "2x5x1", (16 * 5), "2x5/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "2x5x2", (16 * 5), "2x5/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "2x5x1", (32 * 5), "2x5/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "2x5x2", (32 * 5), "2x5/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_3x4(f: Option<&str>) {
+    println!("-- 3x4 kernels");
+    kloop!(f, "3x4x1", (24 * 4), "3x4/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "3x4x2", (24 * 4), "3x4/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "3x4x1", (16 * 3 * 4), "3x4/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "3x4x2", (16 * 3 * 4), "3x4/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_4x3(f: Option<&str>) {
+    println!("-- 4x3 kernels");
+    kloop!(f, "4x3x1", (32 * 3), "4x3/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "4x3x2", (32 * 3), "4x3/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "4x3x1", (16 * 4 * 3), "4x3/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "4x3x2", (16 * 4 * 3), "4x3/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_5x2(f: Option<&str>) {
+    println!("-- 5x2 kernels");
+    kloop!(f, "5x2x1", (40 * 2), "5x2/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "5x2x1", (40 * 2), "5x2/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "5x2x1", (16 * 5 * 2), "5x2/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "5x2x2", (16 * 5 * 2), "5x2/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_6x2(f: Option<&str>) {
+    println!("-- 6x2 kernels");
+    kloop!(f, "6x2x1", (48 * 2), "6x2/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "6x2x2", (48 * 2), "6x2/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "6x2x1", (16 * 6 * 2), "6x2/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "6x2x2", (16 * 6 * 2), "6x2/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_8x2(f: Option<&str>) {
+    println!("-- 8x2 kernels");
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "8x2x1", (16 * 8 * 2), "8x2/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_8x1(f: Option<&str>) {
+    println!("-- 8x1 kernels");
+    kloop!(f, "8x1x1", (64 * 1), "8x1/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "8x1x2", (64 * 1), "8x1/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "8x1x1", (16 * 8 * 1), "8x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "8x1x2", (16 * 8 * 1), "8x1/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_6x1(f: Option<&str>) {
+    println!("-- 6x1 kernels");
+    kloop!(f, "6x1x1", (48 * 1), "6x1/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "6x1x2", (48 * 1), "6x1/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "6x1x1", (16 * 6 * 1), "6x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "6x1x2", (16 * 6 * 1), "6x1/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_7x1(f: Option<&str>) {
+    println!("-- 7x1 kernels");
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "7x1x1", (16 * 7 * 1), "7x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "7x1x2", (16 * 7 * 1), "7x1/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_1x1(f: Option<&str>) {
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "1x1x1", (16 * 1 * 1), "1x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "1x1x2", (16 * 1 * 1), "1x1/packed_packed_loop1/unroll.tmpli", 16, 2, "avx512");
+        kloop!(f, "1x1x4", (16 * 1 * 1), "1x1/packed_packed_loop1/unroll-4.tmpli", 16, 4, "avx512");
+        kloop!(f, "1x1x8", (16 * 1 * 1), "1x1/packed_packed_loop1/unroll-8.tmpli", 16, 8, "avx512");
+        kloop!(f, "1x1x16", (16 * 1 * 1), "1x1/packed_packed_loop1/unroll-16.tmpli", 16, 16, "avx512");
+    }
+    println!();
+}
+
+unsafe fn packed_packed_10x1(f: Option<&str>) {
+    println!("-- 10x1 kernels");
+    kloop!(f, "10x1x1", (80 * 1), "10x1/packed_packed_loop1/avx.tmpli", 8);
+    kloop!(f, "10x1x2", (80 * 1), "10x1/packed_packed_loop1/avx-unroll.tmpli", 8, 2);
+    if std::is_x86_feature_detected!("avx512f") {
+        kloop!(f, "10x1x1", (16 * 10 * 1), "10x1/packed_packed_loop1/avx-512.tmpli", 16, 1, "avx512");
+        kloop!(f, "10x1x2", (16 * 10 * 1), "10x1/packed_packed_loop1/avx-512-unroll.tmpli", 16, 2, "avx512");
+    }
+    println!();
+}
+
+fn main() {
+    let filter = std::env::args().skip(1).find(|a| a != "--bench");
+    unsafe {
+        packed_packed_1x1(filter.as_deref());
+        packed_packed_1x12(filter.as_deref());
+        packed_packed_1x8(filter.as_deref());
+        packed_packed_2x6(filter.as_deref());
+        packed_packed_2x5(filter.as_deref());
+        packed_packed_3x4(filter.as_deref());
+        packed_packed_4x3(filter.as_deref());
+        packed_packed_5x2(filter.as_deref());
+        packed_packed_6x2(filter.as_deref());
+        packed_packed_8x2(filter.as_deref());
+        packed_packed_6x1(filter.as_deref());
+        packed_packed_7x1(filter.as_deref());
+        packed_packed_8x1(filter.as_deref());
+        packed_packed_10x1(filter.as_deref());
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/build.rs b/vendor/tract-linalg-0.22.1/build.rs
new file mode 100644
index 000000000..cc7f4449a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/build.rs
@@ -0,0 +1,374 @@
+#![allow(clippy::box_default)]
+
+use liquid_core::Runtime;
+use liquid_core::{Display_filter, Filter, FilterReflection, ParseFilter};
+use liquid_core::{Value, ValueView};
+
+use std::{env, ffi, fs, path};
+
+#[path = "arm64/apple_amx/instructions.rs"]
+mod apple_amx_instructions;
+
+fn var(k: &str) -> String {
+    env::var(k).unwrap()
+}
+
+fn use_masm() -> bool {
+    env::var("CARGO_CFG_TARGET_ENV") == Ok("msvc".to_string()) && var("HOST").contains("-windows-")
+}
+
+fn include_amx() -> bool {
+    let arch = var("CARGO_CFG_TARGET_ARCH");
+    let os = var("CARGO_CFG_TARGET_OS");
+    os == "macos"
+        || (env::var("CARGO_FEATURE_APPLE_AMX_IOS").is_ok() && os == "ios" && arch == "aarch64")
+}
+
+fn jump_table() -> Vec<String> {
+    println!("cargo:rerun-if-changed=src/frame/mmm/fuse.rs");
+    std::fs::read_to_string("src/frame/mmm/fuse.rs")
+        .unwrap()
+        .lines()
+        .filter(|l| l.contains("// jump_to:"))
+        .map(|l| l.split("jump_to:").nth(1).unwrap().to_owned())
+        .collect()
+}
+
+#[derive(Clone, Debug)]
+struct ConfigForHalf {
+    extra_flags: Vec<String>,
+    needs_pragma: bool,
+}
+
+impl ConfigForHalf {
+    fn new(extra_flags: Vec<String>, needs_pragma: bool) -> ConfigForHalf {
+        ConfigForHalf { extra_flags, needs_pragma }
+    }
+
+    fn all() -> Vec<ConfigForHalf> {
+        let mut configs = vec![];
+        for extra_flags in
+            [vec![], vec!["-march=armv8.2-a".to_string()], vec!["-mcpu=cortex-a55".to_string()]]
+        {
+            for needs_pragma in [false, true] {
+                configs.push(ConfigForHalf::new(extra_flags.clone(), needs_pragma))
+            }
+        }
+        configs
+    }
+
+    fn cc(&self) -> cc::Build {
+        let mut cc = cc::Build::new();
+        for flag in &self.extra_flags {
+            cc.flag(flag);
+        }
+        cc
+    }
+
+    fn works(&self) -> bool {
+        let filename = if self.needs_pragma {
+            "arm64/arm64fp16/dummy_fmla_pragma.S"
+        } else {
+            "arm64/arm64fp16/dummy_fmla_no_pragma.S"
+        };
+        self.cc().static_flag(true).file(filename).try_compile("dummy").is_ok()
+    }
+
+    pub fn probe() -> Option<ConfigForHalf> {
+        Self::all().iter().find(|c| c.works()).cloned()
+    }
+}
+
+fn main() {
+    let target = var("TARGET");
+    let arch = var("CARGO_CFG_TARGET_ARCH");
+    let os = var("CARGO_CFG_TARGET_OS");
+    let out_dir = path::PathBuf::from(var("OUT_DIR"));
+
+    let suffix = env!("CARGO_PKG_VERSION").replace(['-', '.'], "_");
+    make_extern_kernel_decl_macro(&out_dir, &suffix);
+
+    match arch.as_ref() {
+        "x86_64" => {
+            let mut files = preprocess_files("x86_64/fma", &[], &suffix, false);
+            files.extend(preprocess_files("x86_64/avx512", &[], &suffix, false));
+
+            if os == "windows" {
+                if use_masm() {
+                    let mut lib_exe = cc::windows_registry::find(&target, "lib.exe")
+                        .expect("Could not find lib.exe");
+                    lib_exe
+                        .arg(format!("/out:{}", out_dir.join("x86_64_fma.lib").to_str().unwrap()));
+                    for f in files {
+                        let mut obj = f.clone();
+                        obj.set_extension("o");
+                        let mut ml_exe = cc::windows_registry::find(&target, "ml64.exe")
+                            .expect("Could not find ml64.exe");
+                        if !ml_exe
+                            .arg("/Fo")
+                            .arg(&obj)
+                            .arg("/c")
+                            .arg(&f)
+                            .status()
+                            .unwrap()
+                            .success()
+                        {
+                            for (i, l) in std::fs::read_to_string(&f).unwrap().lines().enumerate() {
+                                println!("{i:8} {l}");
+                            }
+                            panic!();
+                        }
+                        lib_exe.arg(obj);
+                    }
+                    assert!(lib_exe.status().unwrap().success());
+                    println!("cargo:rustc-link-search=native={}", out_dir.to_str().unwrap());
+                    println!("cargo:rustc-link-lib=static=x86_64_fma");
+                } else {
+                    cc::Build::new()
+                        .files(files)
+                        .flag("-mfma")
+                        .flag("-mf16c")
+                        .static_flag(true)
+                        .compile("x86_64_fma");
+
+                    // clang at least (dunno about gcc) outputs .asm files in the
+                    // root directory that we need to clean up so we don't pollute
+                    // the build output/working directory
+                    let _ = fs::remove_file("fma_mmm_f32_16x6.asm");
+                    let _ = fs::remove_file("fma_mmm_i32_8x8.asm");
+                    let _ = fs::remove_file("fma_sigmoid_f32.asm");
+                    let _ = fs::remove_file("fma_tanh_f32.asm");
+                }
+            } else {
+                cc::Build::new().files(files).flag("-mfma").static_flag(true).compile("x86_64_fma");
+            }
+        }
+        "arm" | "armv7" => {
+            let files = preprocess_files("arm32/armvfpv2", &[], &suffix, false);
+            cc::Build::new()
+                .files(files)
+                .flag("-marm")
+                .flag("-mfpu=vfp")
+                .static_flag(true)
+                .compile("armvfpv2");
+            let files = preprocess_files(
+                "arm32/armv7neon",
+                &[("core", vec!["cortexa7", "cortexa9", "generic"])],
+                &suffix,
+                false,
+            );
+            cc::Build::new()
+                .files(files)
+                .flag("-marm")
+                .flag("-mfpu=neon")
+                .static_flag(true)
+                .compile("armv7neon");
+        }
+        "aarch64" => {
+            let files = preprocess_files(
+                "arm64/arm64simd",
+                &[("core", vec!["a53", "a55", "gen"])],
+                &suffix,
+                false,
+            );
+            cc::Build::new().files(files).static_flag(true).compile("arm64simd");
+            if include_amx() {
+                let files = preprocess_files("arm64/apple_amx", &[], &suffix, false);
+                cc::Build::new().files(files).static_flag(true).compile("appleamx");
+            }
+            if std::env::var("CARGO_FEATURE_NO_FP16").is_err() {
+                let config =
+                    ConfigForHalf::probe().expect("No configuration found for fp16 support");
+                let files = preprocess_files(
+                    "arm64/arm64fp16",
+                    &[("core", vec!["a55", "gen"])],
+                    &suffix,
+                    config.needs_pragma,
+                );
+                config.cc().files(files).static_flag(true).compile("arm64fp16")
+            }
+        }
+        _ => {}
+    }
+}
+
+type Variant = (&'static str, Vec<&'static str>);
+
+fn preprocess_files(
+    input: impl AsRef<path::Path>,
+    variants: &[Variant],
+    suffix: &str,
+    needs_pragma: bool,
+) -> Vec<path::PathBuf> {
+    let out_dir = path::PathBuf::from(var("OUT_DIR"));
+    let mut files = vec![];
+    let dir_entries = {
+        let mut dir_entries: Vec<fs::DirEntry> =
+            input.as_ref().read_dir().unwrap().map(|f| f.unwrap()).collect();
+        dir_entries.sort_by_key(|a| a.path());
+        dir_entries
+    };
+    for f in dir_entries {
+        if f.path().extension() == Some(ffi::OsStr::new("tmpl")) {
+            let tmpl_file = f.path().file_name().unwrap().to_str().unwrap().to_owned();
+            let concerned_variants: Vec<&Variant> =
+                variants.iter().filter(|v| tmpl_file.contains(v.0)).collect();
+            let expanded_variants = concerned_variants.iter().map(|pair| pair.1.len()).product();
+            for v in 0..expanded_variants {
+                let mut tmpl_file = tmpl_file.clone();
+                let mut id = v;
+                let mut globals = vec![];
+                for variable in variants {
+                    let key = variable.0;
+                    let value = variable.1[id % variable.1.len()];
+                    globals.push((key, value));
+                    tmpl_file = tmpl_file.replace(key, value);
+                    id /= variable.1.len();
+                }
+                let mut file = out_dir.join(tmpl_file);
+                file.set_extension("S");
+                preprocess_file(f.path(), &file, &globals, suffix, needs_pragma);
+                files.push(file);
+            }
+        }
+    }
+    files
+}
+
+fn strip_comments(s: String, msvc: bool) -> String {
+    if msvc {
+        s.lines().map(|line| line.replace("//", ";")).collect::<Vec<String>>().join("\n")
+    } else {
+        s
+    }
+}
+
+fn preprocess_file(
+    template: impl AsRef<path::Path>,
+    output: impl AsRef<path::Path>,
+    variants: &[(&'static str, &'static str)],
+    suffix: &str,
+    needs_pragma: bool,
+) {
+    println!("cargo:rerun-if-changed={}", template.as_ref().to_string_lossy());
+    let family = var("CARGO_CFG_TARGET_FAMILY");
+    let os = var("CARGO_CFG_TARGET_OS");
+
+    // We also check to see if we're on a windows host, if we aren't, we won't be
+    // able to use the Microsoft assemblers,
+    let msvc = use_masm();
+    println!("cargo:rerun-if-changed={}", template.as_ref().to_string_lossy());
+    let mut input = fs::read_to_string(&template).unwrap();
+    input = strip_comments(input, msvc);
+    let l = if os == "macos" {
+        "L"
+    } else if family == "windows" {
+        ""
+    } else {
+        ".L"
+    }
+    .to_owned();
+    let long = if msvc { "dd" } else { ".long" };
+    let g = if os == "macos" || os == "ios" { "_" } else { "" };
+    // note: use .align with bytes instead of p2align since they both use direct bytes.
+    let align = if msvc { "align" } else { ".align" };
+    let mut globals = liquid::object!({
+        "msvc": msvc,
+        "needs_pragma": needs_pragma,
+        "family": family,
+        "os": os,
+        "L": l,
+        "G": g,
+        "suffix": suffix,
+        "long": long,
+        "jump_table": jump_table(),
+        "align": align,
+        "offset": if msvc { "offset" } else { "rip + "},
+    });
+    for (k, v) in variants {
+        globals.insert(k.to_string().into(), liquid::model::Value::scalar(*v));
+    }
+    let partials = load_partials(template.as_ref().parent().unwrap(), msvc);
+    let mut parser = liquid::ParserBuilder::with_stdlib()
+        .partials(liquid::partials::LazyCompiler::new(partials))
+        .filter(F16);
+    if include_amx() {
+        parser = apple_amx_instructions::register(parser);
+        globals.extend(apple_amx_instructions::globals());
+    }
+    if let Err(e) = parser
+        .build()
+        .and_then(|p| p.parse(&input))
+        .and_then(|r| r.render_to(&mut fs::File::create(&output).unwrap(), &globals))
+    {
+        eprintln!("Processing {}", template.as_ref().to_string_lossy());
+        eprintln!("{e}");
+        panic!()
+    }
+}
+
+fn load_partials(p: &path::Path, msvc: bool) -> liquid::partials::InMemorySource {
+    let mut mem = liquid::partials::InMemorySource::new();
+    for f in walkdir::WalkDir::new(p) {
+        let f = f.unwrap();
+        if f.path().is_dir() {
+            continue;
+        }
+
+        let ext = f.path().extension().map(|s| s.to_string_lossy()).unwrap_or("".into());
+        let text = std::fs::read_to_string(f.path()).unwrap_or_else(|_| panic!("file {f:?}"));
+        let text = match ext.as_ref() {
+            "tmpli" => Some(text.replace("{{", "{").replace("}}", "}")),
+            "tmpliq" => Some(text),
+            _ => None,
+        };
+        if let Some(text) = text {
+            let text = strip_comments(text, msvc);
+            let key =
+                f.path().strip_prefix(p).unwrap().to_str().unwrap().to_owned().replace('\\', "/");
+            println!("cargo:rerun-if-changed={}", f.path().to_string_lossy().replace('\\', "/"));
+
+            mem.add(key, text);
+        }
+    }
+    mem
+}
+
+fn make_extern_kernel_decl_macro(out_dir: &path::Path, suffix: &str) {
+    let macro_decl = r#"
+    macro_rules! extern_kernel {
+        (fn $name: ident($($par_name:ident : $par_type: ty ),*) -> $rv: ty) => {
+            paste! {
+                unsafe extern "C" { pub fn [<$name _ _suffix>]($(par_name: $par_type),*) -> $rv; }
+                pub use [<$name _ _suffix>] as $name;
+            }
+        }
+    }"#
+    .replace("_suffix", suffix);
+    std::fs::write(out_dir.join("extern_kernel_macro.rs"), macro_decl).unwrap();
+}
+
+#[derive(Clone, ParseFilter, FilterReflection)]
+#[filter(
+    name = "float16",
+    description = "Write a float16 constant with the .float16 directive in gcc, or as short in clang",
+    parsed(F16Filter)
+)]
+pub struct F16;
+
+#[derive(Debug, Default, Display_filter)]
+#[name = "float16"]
+struct F16Filter;
+
+impl Filter for F16Filter {
+    fn evaluate(
+        &self,
+        input: &dyn ValueView,
+        _runtime: &dyn Runtime,
+    ) -> liquid_core::Result<Value> {
+        let input: f32 = input.as_scalar().unwrap().to_float().unwrap() as f32;
+        let value = half::f16::from_f32(input);
+        let bits = value.to_bits();
+        Ok(format!(".short {bits}").to_value())
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm32.rs b/vendor/tract-linalg-0.22.1/src/arm32.rs
new file mode 100644
index 000000000..29c0a94a4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm32.rs
@@ -0,0 +1,101 @@
+use std::{env, fs};
+pub mod armv7neon;
+mod armvfpv2;
+mod cortex_a7;
+mod cortex_a9;
+use armv7neon::*;
+
+use crate::frame::element_wise::ElementWiseKer;
+
+use crate::Ops;
+
+fn has_neon_cpuinfo() -> std::io::Result<bool> {
+    let cpu_info = fs::read_to_string("/proc/cpuinfo")?;
+    let neon = cpu_info.split("\n").any(|line| {
+        line.starts_with("Features") && (line.contains("neon") || line.contains("asimd"))
+    });
+    Ok(neon)
+}
+
+fn cpu_part() -> Option<usize> {
+    fs::read_to_string("/proc/cpuinfo").ok().and_then(|cpuinfo| {
+        cpuinfo
+            .lines()
+            .find(|line| line.starts_with("CPU part"))
+            .and_then(|s| s.trim().split_whitespace().last())
+            .and_then(|s| s.strip_prefix("0x"))
+            .and_then(|s| usize::from_str_radix(s, 16).ok())
+    })
+}
+
+fn has_neon() -> bool {
+    if let Ok(v) = env::var("TRACT_CPU_ARM32_NEON") {
+        return v == "true" || v == "1";
+    }
+    has_neon_cpuinfo().unwrap_or(false)
+}
+
+pub fn plug(ops: &mut Ops) {
+    if has_neon() {
+        log::info!("armv7neon activated (smmm, ssigmoid), stanh)");
+        armv7neon::plug(ops);
+
+        let cpu = cpu_part().unwrap_or(0);
+
+        fn prefer_8x4(_m: Option<usize>, _k: Option<usize>, n: Option<usize>) -> bool {
+            n.map(|n| n % 4 == 0 && n % 6 != 0 && n <= 12).unwrap_or(false)
+        }
+
+        let cost_managed_impls = vec![
+            armv7neon_mmm_f32_8x4_cortexa7.mmm(),
+            armv7neon_mmm_f32_8x6_cortexa7.mmm(),
+            armv7neon_mmm_f32_8x4_cortexa9.mmm(),
+            armv7neon_mmm_f32_8x6_cortexa9.mmm(),
+            armv7neon_mmm_f32_8x4_generic.mmm(),
+            armv7neon_mmm_f32_8x6_generic.mmm(),
+            crate::generic::mmm::generic_f32_4x4.mmm(),
+        ];
+        ops.mmv_f32 = match cpu {
+            0xc07 => Box::new(|_, _| armv7neon::armv7neon_mmm_f32_32x1_cortexa7.mmm()),
+            0xc09 => Box::new(|_, _| armv7neon::armv7neon_mmm_f32_32x1_cortexa9.mmm()),
+            _ => Box::new(|_, _| armv7neon::armv7neon_mmm_f32_32x1_generic.mmm()),
+        };
+
+        ops.mmm_f32 = match cpu {
+            0xc07 => {
+                let model = cortex_a7::model();
+                Box::new(move |m, k, n| model.pick(&cost_managed_impls, m, k, n))
+            }
+            0xc09 => {
+                let model = cortex_a9::model();
+                Box::new(move |m, k, n| model.pick(&cost_managed_impls, m, k, n))
+            }
+            _ => Box::new(|m, k, n| {
+                if prefer_8x4(m, k, n) {
+                    armv7neon::armv7neon_mmm_f32_8x4_generic.mmm()
+                } else {
+                    armv7neon::armv7neon_mmm_f32_8x6_generic.mmm()
+                }
+            }),
+        };
+        ops.qmmm_i32 = Box::new(|_, _, _| armv7neon::armv7neon_mmm_i32_8x4.mmm());
+        ops.qmmv_i32 = Box::new(|_, _| armv7neon::armv7neon_mmm_i32_32x1.mmm());
+        ops.sigmoid_f32 = Box::new(|| armv7neon_sigmoid_f32_4n::ew());
+        ops.tanh_f32 = Box::new(|| armv7neon_tanh_f32_4n::ew());
+    } else {
+        armvfpv2::plug(ops);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn may_have_neon() {
+        println!("Has neon ? {:?}", has_neon());
+        if let Ok(neon) = env::var("TRACT_CPU_EXPECT_ARM32_NEON") {
+            assert_eq!(neon == "true", has_neon());
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm32/armv7neon.rs b/vendor/tract-linalg-0.22.1/src/arm32/armv7neon.rs
new file mode 100644
index 000000000..3ca3ac441
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm32/armv7neon.rs
@@ -0,0 +1,46 @@
+use crate::frame::mmm::ImplementationQuality::ManuallyOptimized;
+use crate::pack::PackedFormat;
+use crate::Ops;
+
+const NEON: fn() -> bool = || crate::arm32::has_neon();
+
+MMMExternKernel!(armv7neon_mmm_f32_8x4_cortexa7 <f32>( 8, 4)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_8x4_cortexa9 <f32>( 8, 4)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_8x4_generic  <f32>( 8, 4)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_8x6_cortexa7 <f32>( 8, 6)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_8x6_cortexa9 <f32>( 8, 6)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_8x6_generic  <f32>( 8, 6)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_8x1_generic  <f32>( 8, 1)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_32x1_cortexa7<f32>(32, 1)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_32x1_cortexa9<f32>(32, 1)@(16, 4) where(NEON) quality(ManuallyOptimized));
+MMMExternKernel!(armv7neon_mmm_f32_32x1_generic <f32>(32, 1)@(16, 4) where(NEON) quality(ManuallyOptimized));
+
+MMMExternKernel!(armv7neon_mmm_i32_8x4<i32>(8, 4)@(32, 4) where(NEON)
+  packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 8, 32), PackedFormat::new(DatumType::I8, 4, 32));
+  quality(ManuallyOptimized)
+  store(i8)
+);
+
+MMMExternKernel!(armv7neon_mmm_i32_32x1<i32>(32, 1)@(32, 4) where(NEON)
+  packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 32, 32), PackedFormat::new(DatumType::I8, 1, 4));
+  quality(ManuallyOptimized)
+  store(i8)
+);
+
+pub fn plug(ops: &mut Ops) {
+    ops.mmm_impls.extend_from_slice(&[
+        armv7neon_mmm_f32_8x4_cortexa7.mmm(),
+        armv7neon_mmm_f32_8x4_cortexa9.mmm(),
+        armv7neon_mmm_f32_8x4_generic.mmm(),
+        armv7neon_mmm_f32_8x6_cortexa7.mmm(),
+        armv7neon_mmm_f32_8x6_cortexa9.mmm(),
+        armv7neon_mmm_f32_8x6_generic.mmm(),
+        armv7neon_mmm_f32_8x1_generic.mmm(),
+        armv7neon_mmm_f32_32x1_cortexa7.mmm(),
+        armv7neon_mmm_f32_32x1_cortexa9.mmm(),
+        armv7neon_mmm_f32_32x1_generic.mmm(),
+    ]);
+}
+
+sigmoid_impl!(f32, armv7neon_sigmoid_f32_4n, 4, 4, crate::arm32::has_neon());
+tanh_impl!(f32, armv7neon_tanh_f32_4n, 4, 4, crate::arm32::has_neon());
diff --git a/vendor/tract-linalg-0.22.1/src/arm32/armvfpv2.rs b/vendor/tract-linalg-0.22.1/src/arm32/armvfpv2.rs
new file mode 100644
index 000000000..46e75f774
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm32/armvfpv2.rs
@@ -0,0 +1,11 @@
+use crate::frame::mmm::ImplementationQuality::ManuallyOptimized;
+use crate::frame::mmm::*;
+use crate::Ops;
+
+MMMExternKernel!(armvfpv2_mmm_f32_4x4<f32>(4, 4)@(4, 4) quality(ManuallyOptimized));
+
+pub fn plug(ops: &mut Ops) {
+    log::info!("armvfpv2 activated for smmm");
+    ops.mmm_f32 = Box::new(|_, _, _| armvfpv2_mmm_f32_4x4.mmm());
+    ops.mmm_impls.push(armvfpv2_mmm_f32_4x4.mmm());
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.rs b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.rs
new file mode 100644
index 000000000..f50aabdcc
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.rs
@@ -0,0 +1,16 @@
+use crate::frame::mmm::CostModel;
+        pub fn model() -> CostModel<'static> {
+            CostModel {
+                big_product_mkn_threshold: 4193728.0,
+                big_product_kernel_choice: "armv7neon_mmm_f32_8x6_cortexa7",
+                kernels: &["armv7neon_mmm_f32_8x4_cortexa7", "armv7neon_mmm_f32_8x4_cortexa9", "armv7neon_mmm_f32_8x4_generic", "armv7neon_mmm_f32_8x6_cortexa7", "armv7neon_mmm_f32_8x6_cortexa9", "armv7neon_mmm_f32_8x6_generic", "generic_f32_4x4"],
+                mrs: &[4, 8],
+                nrs: &[4, 6],
+                feat_norm_mean: &[4.589878771602424, 4.5739692460187005, 4.598167981532298, 13.762015999153403, 1.5038983903420524, 0.749874245472837, 3.465165995975855, 0.8777665995975855, 1.5022635814889336, 0.7570422535211268, 2.482142857142857, 0.8333752515090543],
+                feat_norm_stddev: &[1.2587312982588519, 1.2603116830524392, 1.2581181647300588, 1.3169322340874257, 1.1192637768418767, 0.43308528195884044, 2.2762097127791114, 0.32755518043295856, 1.1069539235554247, 0.42886977033219037, 1.7067987601825914, 0.37264049924995035],
+                w1: &[0.06765510141849518, 0.024555781856179237, -0.8821254968643188, -0.004870870150625706, -0.10525479167699814, 0.1827959418296814, 0.1633400171995163, -0.2377464473247528, -0.17880690097808838, 0.19097138941287994, 0.04676022008061409, -0.11329511553049088, 0.4089120030403137, -0.3100685477256775, -0.1652061492204666, -0.19124962389469147, -0.03810987249016762, -0.00785011239349842, 0.09714752435684204, -0.11142419278621674, 0.19261880218982697, -0.2893339991569519, -0.19540216028690338, 0.39759594202041626, -0.00619965186342597, -0.8473111391067505, 0.343344122171402, -0.12575943768024445, 0.029266485944390297, -0.02900734543800354, -0.019343264400959015, 0.08306540548801422, -0.1927606761455536, 0.23312175273895264, 0.2576882541179657, -0.35881471633911133, -0.27300119400024414, -0.2995607852935791, -0.7934547662734985, -0.9349930286407471, -0.011614155024290085, -0.12521372735500336, 0.011371670290827751, 0.05779163911938667, 0.17875070869922638, -0.23169392347335815, -0.09749509394168854, 0.07436174154281616, 0.24035069346427917, -0.1262669861316681, 0.3874961733818054, -0.11149000376462936, 0.03639678284525871, 0.17740628123283386, 0.03768332302570343, -0.20480288565158844, -0.1955408751964569, 0.44144806265830994, 0.3628064692020416, -0.2537013292312622, 0.019405143335461617, 0.06186319515109062, 0.5196826457977295, 0.3010406494140625, 0.04013144597411156, 0.03517461195588112, -0.037290964275598526, 0.009919736534357071, -0.3135205805301666, 0.4654330909252167, 0.46720823645591736, 0.29665476083755493, 0.09099660068750381, -0.7376689314842224, -0.07840575277805328, -0.5192644000053406, 0.019796665757894516, -0.021734869107604027, 0.13953897356987, -0.04154204577207565, 0.10942933708429337, -0.13621817529201508, -0.04218055680394173, 0.09188657253980637, -0.16021296381950378, -0.19393481314182281, 0.3737955689430237, 0.08288388699293137, -0.08280416578054428, -0.13087297976016998, -0.09470323473215103, 0.2779513895511627, 0.03663017228245735, 0.36601993441581726, 0.8102841377258301, 0.6883901953697205, -0.33066609501838684, -0.34960171580314636, 0.923985481262207, 0.5853908061981201, 0.07039576023817062, -0.11843020468950272, -0.06797836720943451, 0.0974433571100235, -0.4707315266132355, 0.37827417254447937, 0.15521520376205444, -0.7403592467308044, -0.25005313754081726, 0.596679151058197, -0.7277861833572388, -0.6915309429168701, -0.0050544412806630135, -0.12311484664678574, 0.04149714484810829, 0.05289606750011444, 0.2448417991399765, -0.47261708974838257, -0.3535511791706085, 0.4614925682544708, 0.9230178594589233, -0.5351396799087524, 0.8224894404411316, 0.37244901061058044, -0.08826857805252075, -0.0452042818069458, 0.0035054143518209457, 0.09203510731458664, 0.08918709307909012, -0.0694250762462616, -0.053435735404491425, 0.1012222170829773, 0.3401939570903778, -0.38458573818206787, 0.3040490746498108, 0.7614821791648865, -0.17064380645751953, 0.22403603792190552, 0.08646601438522339, -0.08289062976837158, -0.20126193761825562, 0.2795524299144745, 0.13253425061702728, -0.07332615554332733, 0.2151418924331665, 0.16798575222492218, 0.003749655559659004, 0.2437056005001068, -0.09098415076732635, 0.18923071026802063, 0.07854695618152618, -0.25417080521583557, 0.15693743526935577, -0.30657434463500977, -0.19041943550109863, 0.26519766449928284, 0.24278832972049713, -0.18357035517692566, -0.015992645174264908, 0.43973660469055176, 0.02785446122288704, 0.3032245934009552, -0.021606506779789925, -0.2682349383831024, -0.10395143181085587, 0.050348248332738876, 0.12892353534698486, -0.10498340427875519, -0.027477847412228584, 0.09730125963687897, -0.16150422394275665, -0.21831916272640228, 0.10376061499118805, -0.25544440746307373, 0.031593386083841324, 0.11986788362264633, 0.22690074145793915, -0.3509098291397095, -0.1881190538406372, -0.04210145026445389, 0.6883101463317871, -0.07829979062080383, 0.4657376706600189, 0.9263871908187866, 0.08322961628437042, 0.04429711028933525, -0.08905605971813202, -0.06788893789052963, -0.056182388216257095, -0.04881853610277176, -0.04854113608598709, 0.15449045598506927, 0.32911357283592224, -0.5772383809089661, -0.00027374469209462404, -0.2995521128177643, -0.027322502806782722, 0.5023694038391113, 0.045783523470163345, -0.4035968780517578, 0.053967904299497604, 0.00014662329340353608, 0.021607715636491776, -0.028252260759472847, -0.05918470770120621, -0.1273883581161499, 0.0679078996181488, 0.25051605701446533, -0.0745333656668663, 0.18680104613304138, -0.12048312276601791, 0.013110226020216942, -0.07659415900707245, 0.2906968295574188, 0.3136366307735443, -0.47699007391929626, 0.02583535574376583, -0.15701107680797577, 0.045304182916879654, 0.23456838726997375, -0.06186807528138161, 0.3926846981048584, -0.13252438604831696, -0.16362214088439941, 0.013557562604546547, -0.09991434961557388, 0.09150815010070801, -0.006477471441030502, 0.2915862202644348, 0.5867642164230347, -0.37984445691108704, 0.033169880509376526, 0.024414243176579475, -0.0384003147482872, -0.06395144015550613, 0.07380940765142441, -0.025898484513163567, 0.03951931372284889, -0.2343142330646515, 0.27318838238716125, 0.1105947494506836, 0.290696382522583, -0.17851489782333374, -0.17699271440505981, -0.210996612906456, -0.10575137287378311, 0.15886521339416504, 0.10631759464740753, 0.22946283221244812, -0.3170112073421478, -0.49773311614990234, -0.10753292590379715, -0.1114523783326149, -0.10953730344772339, 0.4754663109779358, 0.20793643593788147, 0.021392812952399254, -0.0691467821598053, 0.03368104621767998, -0.017844771966338158, 0.1657843142747879, -0.5556477904319763, -1.108074426651001, -0.822117805480957, -0.06053074076771736, -0.4072379469871521, 0.09109722077846527, -0.5544739961624146, -0.13978064060211182, -0.36262163519859314, 0.20034632086753845, 0.050625383853912354, 0.1497042030096054, -0.18745489418506622, 0.0894727036356926, 0.00417149206623435, 0.2228451371192932, 0.00852279644459486, -0.028313757851719856, 0.04104698821902275, -0.0874263271689415, 0.19788521528244019, -0.019343160092830658, -0.03962515667080879, 0.2092486023902893, -0.44425246119499207, -0.48542261123657227, -0.04222029820084572, 0.7616084218025208, 0.512810468673706, -0.17871123552322388, 0.5459727644920349, -0.13069608807563782, 0.09155352413654327, 0.11548610031604767, -0.15368784964084625, 0.038799818605184555, -0.049028217792510986, -0.03215758875012398, -0.050522346049547195, 0.1663637012243271, -0.15482299029827118, -0.9425870180130005, -0.7017998695373535, 0.04315050691366196, -0.019968662410974503, 0.03749818727374077, -0.07611791789531708, 0.32011789083480835, -0.6925904750823975, -0.49334919452667236, 0.23214411735534668, 1.1447347402572632, -0.6757001876831055, 0.7940422296524048, 0.40169182419776917, -0.018513813614845276, 0.048821814358234406, -0.016693273559212685, 0.008068449795246124, 0.04566117003560066, -0.09829569607973099, -0.026971371844410896, 0.05381541699171066, -0.3659301698207855, 0.3473235070705414, 0.14521746337413788, 0.11228122562170029, -0.041056130081415176, -0.11228874325752258, 0.006667478010058403, 0.15931302309036255, -0.30010080337524414, 0.3464723229408264, 0.4476386308670044, -0.3498152494430542, 0.2616507112979889, -0.19995814561843872, 0.10946320742368698, 0.4034257233142853, -0.08651446551084518, 0.018647747114300728, 0.11572548002004623, -0.100877545773983, -0.16341210901737213, 0.2377898246049881, 0.3417612910270691, -0.49084869027137756, -0.02805873565375805, -0.09811390936374664, 0.17161016166210175, 0.3627470135688782, -0.08954513072967529, 0.06629404425621033, 0.012786897830665112, 0.01578289456665516, -0.32630467414855957, 0.4854920506477356, 0.12709765136241913, -0.4909423291683197, -0.3745254874229431, -0.6513142585754395, -0.040075208991765976, -0.569782018661499, -0.009953420609235764, 0.04735071584582329, 0.0230120699852705, -0.07381311058998108, -0.06293600797653198, 0.20196016132831573, 0.26551517844200134, -0.42071688175201416, 0.28809165954589844, 0.19747501611709595, -0.5686206221580505, -0.5285986661911011, 0.02009684592485428, 0.11322621256113052, -0.1082596555352211, -0.0856761634349823, -0.04493662342429161, -0.6179490089416504, -0.1442672610282898, 0.028762176632881165, 0.12426868081092834, -0.5771384835243225, 0.1608373522758484, 0.004147801548242569, -0.047590240836143494, 0.10347189754247665, 0.11780986934900284, -0.08490656316280365, -0.0746934711933136, 0.15699702501296997, 0.1298881322145462, -0.14411042630672455, -0.08601037412881851, 0.2997709810733795, -0.05418943241238594, -0.1772651970386505, 0.04576871916651726, -0.13510753214359283, -0.057203926146030426, 0.18647770583629608, 0.0055348677560687065, -0.12238732725381851, -0.11199415475130081, 0.43077343702316284, 0.1349855363368988, 0.21327465772628784, 0.05924845486879349, 0.12549948692321777, -0.060076650232076645, 0.23921678960323334, 0.02152605727314949, -0.1352948695421219, 0.09325127303600311, -0.14411674439907074, 0.010495728813111782, 0.11577513813972473, -0.07580242305994034, 0.42641204595565796, -0.5557231903076172, -0.12044595927000046, 0.024152765050530434, -0.14175696671009064, 0.024960221722722054, 0.10017693042755127, -0.07402117550373077, 0.09156208485364914, 0.455565482378006, 0.424320250749588, -0.07668061554431915, 0.10318724811077118, -0.32521969079971313, -0.2653461694717407, -0.03919212520122528, 0.12909358739852905, -0.17091549932956696, 0.07353391498327255, 0.11510979384183884, -0.23758216202259064, -0.3059186339378357, -0.046047650277614594, 0.17527209222316742, 0.19020265340805054, -0.20766229927539825, -0.23476286232471466, -0.14011070132255554, 0.1085173636674881, -0.020777594298124313, 0.014691418968141079, 0.21648286283016205, -0.21576255559921265, 0.28203028440475464, 0.6320008635520935, -0.23609709739685059, 0.16072526574134827, 0.30149686336517334, -0.05675647035241127, -0.018186205998063087, -0.1844293773174286, 0.13510139286518097, 0.05780869722366333, 0.07202577590942383, 0.07459436357021332, 0.18700383603572845, -0.09449177235364914, 0.057188909500837326, 0.21453143656253815, -0.30002379417419434, -0.12217795103788376, 0.03723505884408951, -0.18360234797000885, -0.029992947354912758, 0.10999765247106552, 0.09575961530208588, -0.36028456687927246, -0.4311397075653076, 0.5812231302261353],
+                b1: &[0.3801889419555664, -0.5001883506774902, 0.19484910368919373, 0.6488791704177856, 0.38620173931121826, 0.8780303597450256, -0.1126403734087944, 0.021730314940214157, -0.7806469202041626, -0.04312174394726753, 0.3102167546749115, 0.9241658449172974, 0.8900863528251648, -0.2938256561756134, -0.5012822151184082, -0.00329477502964437, 0.5169500708580017, 0.4563848376274109, -0.4903448224067688, 0.27919942140579224, -0.4288303554058075, -0.1836952418088913, -0.09118890762329102, 0.5528226494789124, -0.19896377623081207, 0.33588215708732605, 0.07895006239414215, 0.07812929153442383, 0.6203332543373108, 0.8427650332450867, -0.684628427028656, 0.5408275723457336, -0.5548633933067322, -0.49557214975357056, 0.7953769564628601, -0.4109633266925812, -0.6270897388458252, -0.43285393714904785, -0.7562689781188965, -0.7167727947235107],
+                w2: &[0.15592391788959503, 0.25119924545288086, -0.499594122171402, -0.5441639423370361, -0.11186911165714264, -0.6334478855133057, 0.28880706429481506, -0.592946469783783, 0.7188563942909241, -0.49322614073753357, -0.1398385912179947, -0.1868145614862442, 0.9288992881774902, -0.07525540888309479, 0.2288437783718109, 0.09932874143123627, 0.2782813012599945, -0.12644614279270172, -0.14151062071323395, 0.38845404982566833, 0.2691279947757721, -0.9148958921432495, 0.19230225682258606, 0.6098687052726746, -0.24782557785511017, -0.6989489197731018, -0.30721813440322876, -0.4890380799770355, -0.43724432587623596, -0.38428765535354614, -0.6491377353668213, -0.28134995698928833, -0.36228886246681213, -0.05963568389415741, 0.5086851119995117, 0.4664144814014435, 0.3797634541988373, 0.5596290826797485, -0.1977449357509613, 0.6540879607200623, -0.24533972144126892, 0.6865915656089783, -0.18364377319812775, 0.0013501447392627597, -0.4037604331970215, -0.287411093711853, -0.43570032715797424, -0.4085054099559784, 0.7341827750205994, -0.29973891377449036, -0.18240050971508026, -0.23446109890937805, 0.7225431799888611, 0.008502814918756485, 0.04582007974386215, 0.03352205455303192, 0.12457727640867233, -0.2019437849521637, -0.1299249827861786, -0.09946829080581665, 0.40665051341056824, -0.6841736435890198, -0.523845911026001, 0.21656402945518494, 0.6046024560928345, -0.6393186450004578, -0.3965637981891632, -0.7872777581214905, -0.13687947392463684, -0.19312888383865356, -0.5453231930732727, -0.21912647783756256, 0.011589044705033302, 0.2665385603904724, 0.3249806761741638, 0.293254017829895, 0.1047254130244255, 0.4246895909309387, -0.0033608688972890377, 0.4066942632198334, 0.06138676777482033, 0.382074236869812, 0.0787188857793808, -0.28631800413131714, -0.3500039279460907, -0.1490340679883957, -0.14991725981235504, -0.180477574467659, 0.15140952169895172, -0.35168370604515076, 0.38904908299446106, -0.11262823641300201, -0.18404939770698547, 0.5045862197875977, 0.23344825208187103, 0.6740546226501465, -0.054060351103544235, -0.47260594367980957, 0.287933886051178, 0.28975099325180054, 0.2366262525320053, -0.1751112937927246, -0.15358465909957886, -0.062381260097026825, 0.45881521701812744, -0.12647950649261475, 0.45258036255836487, -0.21084383130073547, -0.15994171798229218, -0.4229416847229004, -0.18642400205135345, -0.2506699860095978, 0.20604389905929565, 0.16662882268428802, -0.23073841631412506, 0.045810505747795105, 0.33520498871803284, 0.37685254216194153, 0.11563336104154587, 0.22259201109409332, -0.010484708473086357, -0.45855188369750977, 0.24794596433639526, 0.33667632937431335, 0.20378778874874115, 0.4198003113269806, 0.23384596407413483, 0.23601709306240082, -0.509751558303833, 0.5694931149482727, -0.08933047205209732, 0.037133198231458664, 0.20635388791561127, -0.2857131361961365, -0.4278101921081543, -0.26602792739868164, 0.1998632550239563, 0.4324374794960022, -0.13389578461647034, 0.11837134510278702, -0.17028754949569702, 0.37928706407546997, 0.10062910616397858, -0.04736608266830444, -0.04692180082201958, 0.6633663773536682, -0.3517492711544037, 0.2055688351392746, 0.44142597913742065, 0.42460545897483826, 0.4567111134529114, 0.3061029016971588, -0.16390416026115417, -0.3541538417339325, 0.2544074058532715, -0.18162837624549866, -0.21904821693897247, -0.2520917057991028, -0.07266020774841309, -0.23432950675487518, -0.1989256739616394, 0.09460597485303879, -0.24563294649124146, 0.9719013571739197, 0.2578149139881134, 0.26680076122283936, -0.39480605721473694, 0.22382304072380066, -0.4284250736236572, 0.4294125437736511, -0.04923247918486595, 0.5011574625968933, 0.1887599676847458, -0.02984841726720333, -0.16428305208683014, -0.33957910537719727, -0.16184143722057343, 0.37313663959503174, -0.11775537580251694, -0.34507161378860474, -0.24848994612693787, 0.3492432236671448, -0.2122095823287964, -0.022055158391594887, 0.07298140972852707, 0.36230477690696716, -0.2514148950576782, 0.11675992608070374, 0.4010731875896454, 0.31790846586227417, 0.0585796944797039, 0.30878275632858276, 0.5536429286003113, -0.061644136905670166, -0.06381722539663315, -0.1873038411140442, -0.24746698141098022, -0.3139619529247284, -0.19278131425380707, -0.48264867067337036, 0.5122742056846619, 0.09536745399236679, 0.17870695888996124, 0.18145892024040222, 0.2471739798784256, -0.16399677097797394, -0.18874068558216095, 0.21305255591869354, -0.6930050253868103, -0.4031701982021332, 0.5250658392906189, 0.4295860230922699, -0.464653879404068, -0.026941847056150436, -0.08213993161916733, 0.34638163447380066, -0.15401627123355865, 0.021148433908820152, 0.19726167619228363, -0.25100240111351013, 3.085673233726993e-05, 0.16563303768634796, -0.008333534933626652, -0.02890022285282612, -0.284770667552948, 0.3429299592971802, 0.6073935627937317, -0.10915102809667587, 0.3420248329639435, 0.07347360253334045, 0.18400518596172333, 0.2084905058145523, 0.3218590021133423, 0.16883575916290283, -0.6880696415901184, -0.37455135583877563, 0.04792584478855133, -0.04572531208395958, -0.17001567780971527, -0.12369263172149658, -0.3716808259487152, -0.04167286679148674, 0.04307235777378082, -0.1655367612838745, -0.47902533411979675, -0.21886907517910004, 0.4065888226032257, 0.30626556277275085, 0.25965678691864014, 0.07168732583522797, -0.17138782143592834, -0.6293558478355408, -0.6350710988044739, 0.25923609733581543, 0.5668261647224426, -0.030662082135677338, -0.7059182524681091, -0.25901535153388977, 0.25449642539024353, -0.3232290744781494, 0.42758384346961975, 0.7120643258094788, 0.023215001448988914, -0.40807682275772095, 0.1332295536994934, -0.33705568313598633, 0.1038941740989685, 0.39904412627220154, -0.567590057849884, -0.26575762033462524, 0.7635160088539124, -0.38967835903167725, -0.08988548815250397, 0.4150312840938568, -0.540441632270813, 0.33467426896095276, -0.03507159277796745, 0.00720902718603611, 0.6702240109443665, 0.2707512676715851],
+                b2: &[0.3580038547515869, 0.06861710548400879, -0.04651366174221039, 0.24638813734054565, 0.1557426154613495, -0.40271297097206116, -0.405432790517807],
+            }
+        }
diff --git a/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.txt b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.txt
new file mode 100644
index 000000000..8cfc6928a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a7.txt
@@ -0,0 +1,1701 @@
+armv7neon_mmm_f32_8x4_cortexa7 16 128 8 0.000019373978862224142
+armv7neon_mmm_f32_8x6_generic 24 4 18 0.000005589467629481233
+armv7neon_mmm_f32_8x6_cortexa7 24 32 5 0.0000068541067994687505
+armv7neon_mmm_f32_8x6_cortexa7 24 128 12 0.000038981217414944424
+generic_f32_4x4 4 32 3 0.000001790047741390679
+armv7neon_mmm_f32_8x4_generic 17 32 12 0.000013250293625322834
+armv7neon_mmm_f32_8x4_cortexa7 9 4 9 0.00000392359813080775
+armv7neon_mmm_f32_8x6_generic 16 128 18 0.00003780372998360008
+armv7neon_mmm_f32_8x6_generic 7 32 7 0.000004508038841829227
+armv7neon_mmm_f32_8x4_cortexa9 25 128 4 0.00001979557034383245
+armv7neon_mmm_f32_8x4_cortexa7 24 128 5 0.00002916219359816615
+generic_f32_4x4 8 4 7 0.000002792935337190097
+armv7neon_mmm_f32_8x6_generic 25 32 5 0.000008513973930688824
+armv7neon_mmm_f32_8x4_generic 8 128 4 0.000004984706018353683
+armv7neon_mmm_f32_8x6_cortexa7 23 4 6 0.0000024972411854271217
+armv7neon_mmm_f32_8x6_cortexa9 17 32 18 0.000018034380706056615
+generic_f32_4x4 9 32 5 0.000007787097193216308
+armv7neon_mmm_f32_8x4_cortexa7 7 32 11 0.000005445588694072235
+armv7neon_mmm_f32_8x4_cortexa9 7 4 3 0.0000011467255960994079
+armv7neon_mmm_f32_8x6_cortexa7 7 128 6 0.0000071177868474168225
+armv7neon_mmm_f32_8x6_generic 15 128 5 0.000013291532044598022
+armv7neon_mmm_f32_8x6_cortexa7 25 4 18 0.000007704060547781454
+armv7neon_mmm_f32_8x4_generic 15 128 11 0.000028366457109510148
+armv7neon_mmm_f32_8x6_cortexa7 7 4 12 0.000002031241642569242
+generic_f32_4x4 12 32 9 0.000011111678190760999
+armv7neon_mmm_f32_8x6_cortexa9 24 32 17 0.000018402463967012353
+armv7neon_mmm_f32_8x4_cortexa7 8 4 12 0.0000019146048075747953
+armv7neon_mmm_f32_8x4_cortexa7 23 4 8 0.00000383750508652125
+armv7neon_mmm_f32_8x4_cortexa7 25 4 7 0.0000052043882186278224
+armv7neon_mmm_f32_8x4_generic 16 32 12 0.000008739868831484192
+generic_f32_4x4 12 4 8 0.00000347003352464419
+armv7neon_mmm_f32_8x6_cortexa7 25 32 19 0.00003152139986100901
+armv7neon_mmm_f32_8x6_cortexa9 25 32 18 0.000023769189598361993
+armv7neon_mmm_f32_8x4_cortexa7 15 128 5 0.00001982738009364039
+armv7neon_mmm_f32_8x6_generic 9 128 11 0.000025754948633914126
+armv7neon_mmm_f32_8x4_cortexa7 15 4 4 0.0000016751657126978973
+armv7neon_mmm_f32_8x4_cortexa9 24 32 8 0.000009220363478862854
+generic_f32_4x4 3 32 13 0.000005627827228507873
+armv7neon_mmm_f32_8x6_cortexa9 23 128 13 0.00006012344410992782
+armv7neon_mmm_f32_8x6_cortexa7 24 128 6 0.00001973568156590405
+armv7neon_mmm_f32_8x4_generic 17 128 8 0.000027722972081041346
+armv7neon_mmm_f32_8x4_generic 9 128 4 0.000009605985891246275
+armv7neon_mmm_f32_8x4_cortexa7 15 32 3 0.000003754036592781278
+armv7neon_mmm_f32_8x4_cortexa7 25 128 8 0.00003835708390551408
+armv7neon_mmm_f32_8x4_cortexa9 9 32 4 0.0000035373293731924376
+generic_f32_4x4 7 4 11 0.000004044788183106201
+armv7neon_mmm_f32_8x6_cortexa9 7 32 12 0.000004744792047894666
+armv7neon_mmm_f32_8x4_cortexa9 23 4 8 0.0000038749254387558
+generic_f32_4x4 9 32 8 0.0000075635927841880176
+armv7neon_mmm_f32_8x4_cortexa7 9 4 12 0.000003727778947810718
+armv7neon_mmm_f32_8x6_cortexa7 23 32 5 0.0000067956400405037536
+generic_f32_4x4 8 4 5 0.000002723326536848814
+armv7neon_mmm_f32_8x6_cortexa9 15 32 5 0.000004690163698982028
+armv7neon_mmm_f32_8x4_cortexa9 9 32 3 0.0000037317693401172363
+armv7neon_mmm_f32_8x6_cortexa7 25 128 13 0.00007831443093305969
+armv7neon_mmm_f32_8x4_cortexa7 24 32 12 0.00001331331475399187
+armv7neon_mmm_f32_8x6_cortexa7 24 128 19 0.00007802209851667727
+armv7neon_mmm_f32_8x4_generic 16 128 11 0.00002788579513287149
+armv7neon_mmm_f32_8x4_cortexa7 17 128 4 0.000014706021218529391
+armv7neon_mmm_f32_8x6_generic 8 4 13 0.0000023378593097707485
+armv7neon_mmm_f32_8x6_generic 7 4 7 0.0000019186954240278283
+armv7neon_mmm_f32_8x4_cortexa9 16 4 8 0.000002428666090877155
+armv7neon_mmm_f32_8x6_generic 24 128 13 0.00005734829975884668
+generic_f32_4x4 8 128 11 0.000021025833076703056
+armv7neon_mmm_f32_8x6_cortexa9 17 4 19 0.00000803059286526004
+armv7neon_mmm_f32_8x6_cortexa7 25 4 7 0.00000571019568400475
+armv7neon_mmm_f32_8x4_cortexa9 7 128 3 0.000005476007735837036
+generic_f32_4x4 9 32 9 0.000011314156862861827
+armv7neon_mmm_f32_8x4_cortexa9 8 32 3 0.000002175330632998311
+armv7neon_mmm_f32_8x4_cortexa7 9 128 13 0.00003871137553728846
+armv7neon_mmm_f32_8x6_cortexa7 16 32 19 0.00001594300556874623
+armv7neon_mmm_f32_8x4_cortexa7 17 4 13 0.000007095518187604073
+armv7neon_mmm_f32_8x4_generic 17 4 12 0.00000511222434537087
+armv7neon_mmm_f32_8x4_generic 23 32 3 0.000005240662819589997
+armv7neon_mmm_f32_8x6_generic 7 4 19 0.000003459698540347226
+armv7neon_mmm_f32_8x6_cortexa9 7 128 19 0.000027297846707037145
+armv7neon_mmm_f32_8x4_cortexa7 23 4 9 0.000005799914306902319
+armv7neon_mmm_f32_8x4_generic 23 4 4 0.0000021622122978664557
+armv7neon_mmm_f32_8x4_cortexa9 17 128 12 0.00004402054920694197
+armv7neon_mmm_f32_8x6_cortexa9 15 4 18 0.000004616359494672357
+armv7neon_mmm_f32_8x4_cortexa9 8 4 4 0.0000010041412858292361
+armv7neon_mmm_f32_8x4_cortexa9 8 32 13 0.000006496433067761439
+generic_f32_4x4 11 128 3 0.000011040772633263298
+armv7neon_mmm_f32_8x6_generic 24 4 11 0.000004523362892147976
+generic_f32_4x4 9 4 4 0.0000021034125372056796
+generic_f32_4x4 8 4 9 0.000003722094027081023
+generic_f32_4x4 4 4 8 0.0000014952745701485973
+armv7neon_mmm_f32_8x4_generic 16 128 3 0.000009908089584838082
+armv7neon_mmm_f32_8x4_generic 25 32 7 0.000012421086220512836
+armv7neon_mmm_f32_8x6_cortexa7 8 4 5 0.0000013403828133224992
+armv7neon_mmm_f32_8x4_cortexa7 23 32 4 0.00000503241852364444
+armv7neon_mmm_f32_8x6_cortexa9 7 32 6 0.000002618507739501653
+armv7neon_mmm_f32_8x4_cortexa7 7 4 13 0.0000031793808774197963
+armv7neon_mmm_f32_8x4_generic 16 4 7 0.00000282397574390546
+armv7neon_mmm_f32_8x4_cortexa9 17 128 4 0.00001500697741030371
+armv7neon_mmm_f32_8x6_cortexa7 24 4 6 0.000002233395192000468
+armv7neon_mmm_f32_8x6_cortexa9 15 128 6 0.0000138021942440989
+armv7neon_mmm_f32_8x4_cortexa9 15 128 11 0.000030164013207782058
+generic_f32_4x4 7 4 5 0.0000028577946255424533
+armv7neon_mmm_f32_8x4_cortexa9 16 128 5 0.000020020455876347578
+armv7neon_mmm_f32_8x6_cortexa9 16 128 6 0.000013512716817189273
+armv7neon_mmm_f32_8x4_cortexa7 16 4 11 0.0000037996360400635168
+armv7neon_mmm_f32_8x4_generic 7 32 4 0.0000021000959358733572
+armv7neon_mmm_f32_8x4_cortexa9 9 4 11 0.000004001256873073548
+armv7neon_mmm_f32_8x6_generic 24 128 7 0.00003837839936999645
+armv7neon_mmm_f32_8x4_cortexa9 15 128 4 0.00001034155567067011
+armv7neon_mmm_f32_8x4_generic 7 128 9 0.000014585897754194483
+armv7neon_mmm_f32_8x4_cortexa9 7 128 11 0.000015558039194528556
+armv7neon_mmm_f32_8x4_generic 25 32 11 0.000018056248294844393
+armv7neon_mmm_f32_8x6_cortexa7 9 32 6 0.0000044289694676171784
+armv7neon_mmm_f32_8x4_cortexa9 9 32 7 0.000006775203907440964
+armv7neon_mmm_f32_8x4_cortexa7 17 32 4 0.000004906536483038752
+armv7neon_mmm_f32_8x6_generic 25 128 13 0.00007697771420855431
+armv7neon_mmm_f32_8x6_generic 15 32 13 0.000012236156128465641
+armv7neon_mmm_f32_8x4_generic 9 32 11 0.000009363916532723868
+armv7neon_mmm_f32_8x6_cortexa9 16 32 7 0.000008475723903784643
+armv7neon_mmm_f32_8x4_cortexa9 24 4 8 0.000003400276973989579
+armv7neon_mmm_f32_8x4_generic 8 32 9 0.000004816590922050574
+armv7neon_mmm_f32_8x6_cortexa9 7 32 18 0.0000068452889719178265
+armv7neon_mmm_f32_8x4_cortexa9 9 32 5 0.000006726253331965845
+generic_f32_4x4 11 32 13 0.000015033889948226767
+armv7neon_mmm_f32_8x4_generic 17 4 11 0.000005556840437336695
+generic_f32_4x4 13 32 7 0.000010313627379766583
+generic_f32_4x4 9 4 3 0.0000023801388628702307
+armv7neon_mmm_f32_8x4_generic 8 128 3 0.00000520201390212873
+armv7neon_mmm_f32_8x4_generic 8 32 12 0.000004612565409735615
+armv7neon_mmm_f32_8x4_cortexa9 16 128 4 0.000010106142800191435
+armv7neon_mmm_f32_8x4_cortexa7 8 4 5 0.0000016424950539860917
+armv7neon_mmm_f32_8x4_generic 8 32 8 0.000003246701797046072
+armv7neon_mmm_f32_8x6_cortexa7 17 128 7 0.00003957490867985703
+armv7neon_mmm_f32_8x4_cortexa7 7 32 12 0.000005449789255433999
+armv7neon_mmm_f32_8x4_cortexa7 15 32 5 0.000006774977020301673
+armv7neon_mmm_f32_8x6_generic 7 4 13 0.0000027111208216897315
+armv7neon_mmm_f32_8x4_cortexa7 16 32 5 0.000006561637517767873
+generic_f32_4x4 3 128 9 0.000011017761032513612
+armv7neon_mmm_f32_8x4_cortexa7 23 128 5 0.00002948816149020999
+armv7neon_mmm_f32_8x4_cortexa7 17 32 11 0.00001417097516045665
+armv7neon_mmm_f32_8x6_cortexa7 16 4 12 0.000002812978829286788
+armv7neon_mmm_f32_8x6_cortexa9 23 32 5 0.0000068076361854689625
+armv7neon_mmm_f32_8x4_cortexa7 24 4 3 0.0000025622772317463136
+armv7neon_mmm_f32_8x4_cortexa9 16 32 13 0.000012465423813216206
+generic_f32_4x4 4 32 11 0.000004083209759238612
+armv7neon_mmm_f32_8x4_generic 7 32 5 0.000003626955080135103
+armv7neon_mmm_f32_8x6_cortexa7 23 128 6 0.000020078583004924116
+armv7neon_mmm_f32_8x6_cortexa9 25 128 19 0.00010584384897256878
+armv7neon_mmm_f32_8x6_generic 17 4 12 0.000004107213127611975
+armv7neon_mmm_f32_8x6_cortexa7 16 4 17 0.000004432708536915881
+armv7neon_mmm_f32_8x6_cortexa7 16 32 18 0.00001183035782221589
+armv7neon_mmm_f32_8x4_cortexa9 25 4 3 0.000003166221225598654
+armv7neon_mmm_f32_8x4_cortexa9 24 128 9 0.00004426766902863704
+armv7neon_mmm_f32_8x6_generic 25 4 18 0.000007554578118985422
+armv7neon_mmm_f32_8x6_cortexa7 15 32 5 0.000004708546886703234
+armv7neon_mmm_f32_8x6_cortexa9 9 128 6 0.000013657373073808522
+armv7neon_mmm_f32_8x6_cortexa7 25 32 7 0.00001623345709716141
+armv7neon_mmm_f32_8x4_cortexa7 9 32 5 0.000006618675517469529
+armv7neon_mmm_f32_8x4_cortexa7 7 32 13 0.000007030082349548267
+armv7neon_mmm_f32_8x4_cortexa9 17 4 7 0.0000040753817592878874
+armv7neon_mmm_f32_8x6_generic 15 4 5 0.0000019991656588201214
+generic_f32_4x4 12 128 5 0.00002108090993489815
+armv7neon_mmm_f32_8x6_cortexa9 25 128 17 0.00007984886358906369
+generic_f32_4x4 8 128 5 0.000014230862544269676
+armv7neon_mmm_f32_8x6_generic 24 128 18 0.000056743692014886694
+armv7neon_mmm_f32_8x4_cortexa9 8 32 8 0.0000034133387591211906
+armv7neon_mmm_f32_8x4_generic 9 128 12 0.000027800478110813017
+generic_f32_4x4 7 32 7 0.000005509675967097076
+armv7neon_mmm_f32_8x6_cortexa9 16 32 13 0.000012310372306220252
+armv7neon_mmm_f32_8x4_generic 15 128 13 0.00003762533200230424
+generic_f32_4x4 4 128 5 0.000007369710897637148
+armv7neon_mmm_f32_8x6_generic 16 128 6 0.000012947918354537283
+armv7neon_mmm_f32_8x4_cortexa7 8 32 3 0.000002138296476186818
+armv7neon_mmm_f32_8x4_generic 9 128 11 0.000028058363419186727
+armv7neon_mmm_f32_8x4_cortexa7 7 128 13 0.00002008308819140427
+armv7neon_mmm_f32_8x6_cortexa9 15 128 12 0.000027068411466415602
+armv7neon_mmm_f32_8x6_generic 8 128 12 0.000012940488826817833
+armv7neon_mmm_f32_8x4_generic 24 32 7 0.00000940385598649166
+armv7neon_mmm_f32_8x6_cortexa7 15 128 19 0.00005293187157460067
+armv7neon_mmm_f32_8x4_cortexa9 25 128 3 0.000020446918539621497
+armv7neon_mmm_f32_8x4_cortexa7 17 4 9 0.000005534537134563046
+armv7neon_mmm_f32_8x6_cortexa9 15 128 19 0.00005366369135659099
+armv7neon_mmm_f32_8x4_generic 25 4 4 0.000002522928321866339
+armv7neon_mmm_f32_8x6_generic 16 4 19 0.000005280876227150452
+armv7neon_mmm_f32_8x6_generic 23 128 12 0.000038470533759474454
+armv7neon_mmm_f32_8x4_generic 16 32 3 0.00000367665371967417
+armv7neon_mmm_f32_8x6_cortexa7 7 32 12 0.000004702345831209106
+armv7neon_mmm_f32_8x6_generic 16 32 11 0.000008300619599261807
+armv7neon_mmm_f32_8x4_cortexa9 23 4 5 0.000004136206632272795
+armv7neon_mmm_f32_8x4_cortexa9 23 32 7 0.000010130368574328868
+armv7neon_mmm_f32_8x4_generic 7 128 12 0.000014642209408823472
+armv7neon_mmm_f32_8x4_generic 16 4 3 0.0000018668229452058654
+armv7neon_mmm_f32_8x4_cortexa9 24 128 8 0.00002933769331717579
+armv7neon_mmm_f32_8x4_cortexa9 15 32 5 0.0000068812833008684075
+armv7neon_mmm_f32_8x6_generic 16 4 17 0.000004317183704154257
+armv7neon_mmm_f32_8x6_cortexa9 17 32 7 0.000012443561351360363
+generic_f32_4x4 13 128 7 0.000028155411441347644
+armv7neon_mmm_f32_8x6_cortexa9 25 4 12 0.000005357355009390057
+armv7neon_mmm_f32_8x6_cortexa7 9 128 18 0.00003939231793264078
+generic_f32_4x4 9 128 13 0.00004157559429631316
+armv7neon_mmm_f32_8x6_cortexa9 23 32 11 0.000012894467716591857
+armv7neon_mmm_f32_8x6_generic 17 32 19 0.000023157961942761285
+armv7neon_mmm_f32_8x6_generic 23 128 13 0.00005762004261565962
+armv7neon_mmm_f32_8x6_generic 25 32 12 0.000015419463508057784
+armv7neon_mmm_f32_8x6_generic 23 32 19 0.000023607314480173466
+generic_f32_4x4 11 4 9 0.0000056154966368798884
+armv7neon_mmm_f32_8x4_generic 23 4 3 0.000002512455070432762
+armv7neon_mmm_f32_8x6_generic 17 128 12 0.000038205761728052566
+armv7neon_mmm_f32_8x6_cortexa9 25 4 6 0.0000029441583370270357
+armv7neon_mmm_f32_8x6_cortexa7 8 128 5 0.00000718614713533821
+armv7neon_mmm_f32_8x6_cortexa9 9 32 11 0.000008580295552398777
+armv7neon_mmm_f32_8x4_generic 17 4 8 0.0000036027501506567706
+armv7neon_mmm_f32_8x4_cortexa9 24 128 12 0.000043681280972474085
+armv7neon_mmm_f32_8x6_cortexa7 23 128 19 0.00007892740320659124
+armv7neon_mmm_f32_8x4_cortexa7 7 4 7 0.000001867957586629068
+armv7neon_mmm_f32_8x4_cortexa7 17 128 3 0.000015105393976853567
+armv7neon_mmm_f32_8x6_cortexa9 16 32 5 0.000004737870196682113
+armv7neon_mmm_f32_8x6_cortexa7 17 32 11 0.000012513663992372357
+armv7neon_mmm_f32_8x4_cortexa9 8 4 13 0.000002615891904894993
+generic_f32_4x4 9 128 8 0.00002092560518618388
+armv7neon_mmm_f32_8x6_cortexa9 24 4 13 0.000006118924275544352
+armv7neon_mmm_f32_8x4_cortexa9 23 128 12 0.00004443207751479167
+armv7neon_mmm_f32_8x4_cortexa9 9 32 13 0.000012776770370453443
+armv7neon_mmm_f32_8x4_cortexa7 7 128 4 0.000005416694010439176
+generic_f32_4x4 3 32 3 0.0000017607189056083412
+generic_f32_4x4 8 32 13 0.00000985693332598433
+armv7neon_mmm_f32_8x4_generic 15 128 9 0.00002827090625785394
+armv7neon_mmm_f32_8x6_generic 17 4 5 0.0000026659420735671337
+armv7neon_mmm_f32_8x6_cortexa9 7 4 19 0.000003496314614706576
+generic_f32_4x4 9 32 12 0.00001103802239472411
+armv7neon_mmm_f32_8x6_cortexa7 24 32 19 0.00002361513173387037
+armv7neon_mmm_f32_8x6_cortexa9 24 32 13 0.00001816124851845242
+armv7neon_mmm_f32_8x6_cortexa7 24 4 19 0.000007820954390784165
+armv7neon_mmm_f32_8x4_cortexa9 16 4 9 0.0000037661558813876353
+armv7neon_mmm_f32_8x6_generic 25 4 13 0.000007941364297025098
+armv7neon_mmm_f32_8x6_cortexa7 23 128 7 0.0000397790896264826
+armv7neon_mmm_f32_8x4_cortexa7 23 32 12 0.000014064248486107495
+armv7neon_mmm_f32_8x6_cortexa9 25 32 17 0.000024451766821636896
+armv7neon_mmm_f32_8x4_generic 16 32 13 0.000011863952520137507
+armv7neon_mmm_f32_8x6_cortexa9 15 4 13 0.000004662604065748643
+armv7neon_mmm_f32_8x4_cortexa7 16 128 13 0.00003851383301865242
+armv7neon_mmm_f32_8x4_cortexa9 17 4 3 0.0000024871767077518544
+armv7neon_mmm_f32_8x6_generic 9 128 18 0.000038275747204244
+armv7neon_mmm_f32_8x4_cortexa7 23 4 7 0.000004236340851810833
+armv7neon_mmm_f32_8x4_generic 24 32 12 0.000012855667391841598
+armv7neon_mmm_f32_8x6_cortexa9 15 32 17 0.000012917112497463966
+armv7neon_mmm_f32_8x4_generic 23 128 11 0.000042177094013740186
+armv7neon_mmm_f32_8x4_cortexa7 16 128 12 0.000028624335558632763
+armv7neon_mmm_f32_8x6_cortexa7 23 4 17 0.000006797176857939227
+armv7neon_mmm_f32_8x4_generic 25 4 3 0.0000031059316600251953
+armv7neon_mmm_f32_8x6_cortexa7 23 4 19 0.000008441504854510506
+armv7neon_mmm_f32_8x6_cortexa7 23 128 17 0.00005957716872417335
+armv7neon_mmm_f32_8x4_generic 24 32 3 0.000005255472766158138
+armv7neon_mmm_f32_8x6_cortexa7 7 32 6 0.0000025969460967210575
+generic_f32_4x4 12 128 9 0.00003118081783563989
+armv7neon_mmm_f32_8x6_cortexa7 24 32 7 0.000012316039376583595
+armv7neon_mmm_f32_8x4_cortexa9 9 128 11 0.00002984202658897139
+armv7neon_mmm_f32_8x6_generic 24 4 12 0.00000390465698063092
+armv7neon_mmm_f32_8x6_generic 24 128 5 0.000019825223712031248
+generic_f32_4x4 11 128 12 0.00003129955987835105
+armv7neon_mmm_f32_8x4_generic 15 32 8 0.000006474371340367591
+armv7neon_mmm_f32_8x6_cortexa7 25 128 17 0.00007864370121172857
+armv7neon_mmm_f32_8x4_cortexa7 7 4 9 0.000002506847196975368
+armv7neon_mmm_f32_8x6_cortexa9 25 4 11 0.000005933707367961421
+armv7neon_mmm_f32_8x4_cortexa9 16 4 5 0.000002787654264318038
+armv7neon_mmm_f32_8x6_generic 17 32 13 0.00001761805014832438
+armv7neon_mmm_f32_8x4_cortexa9 7 32 4 0.0000021905945506812955
+armv7neon_mmm_f32_8x6_cortexa9 17 128 11 0.00004027130474721357
+generic_f32_4x4 7 32 9 0.000007912119634032119
+armv7neon_mmm_f32_8x4_generic 23 4 13 0.0000073828945636745854
+generic_f32_4x4 12 128 11 0.00003129061011102201
+armv7neon_mmm_f32_8x6_cortexa7 9 4 5 0.0000019755012039429016
+armv7neon_mmm_f32_8x6_cortexa9 8 4 13 0.000002405709930798169
+armv7neon_mmm_f32_8x6_cortexa9 24 4 7 0.0000044131675061571695
+armv7neon_mmm_f32_8x4_cortexa7 25 128 3 0.00002004605546076412
+armv7neon_mmm_f32_8x6_cortexa7 25 32 11 0.000016504451247846665
+generic_f32_4x4 11 4 3 0.0000024035209703323816
+generic_f32_4x4 5 128 9 0.00002116685987655445
+armv7neon_mmm_f32_8x4_generic 9 32 9 0.000009310383786805071
+armv7neon_mmm_f32_8x6_cortexa7 15 32 12 0.00000858295982977344
+generic_f32_4x4 9 4 9 0.0000055122591860013075
+armv7neon_mmm_f32_8x4_cortexa7 24 128 4 0.000014582241975346975
+armv7neon_mmm_f32_8x6_cortexa7 7 32 11 0.0000047102983594351624
+armv7neon_mmm_f32_8x6_cortexa7 17 32 5 0.000006703902999940797
+armv7neon_mmm_f32_8x4_cortexa9 9 32 12 0.000009569569775041666
+generic_f32_4x4 11 32 4 0.000004102122527868448
+armv7neon_mmm_f32_8x4_cortexa7 23 128 11 0.00004393294052475175
+armv7neon_mmm_f32_8x6_cortexa7 17 128 18 0.00005867315609945908
+armv7neon_mmm_f32_8x6_generic 15 4 19 0.000005888722731281036
+armv7neon_mmm_f32_8x4_cortexa9 25 128 12 0.000058416478283637004
+armv7neon_mmm_f32_8x6_cortexa7 15 128 18 0.00003981402002143062
+armv7neon_mmm_f32_8x6_generic 17 32 11 0.000012156545175144957
+armv7neon_mmm_f32_8x6_cortexa9 17 128 5 0.000020505815042923473
+armv7neon_mmm_f32_8x4_generic 23 4 8 0.0000037989035755978144
+armv7neon_mmm_f32_8x4_cortexa9 23 128 4 0.000015133679307570648
+generic_f32_4x4 12 32 3 0.000004382513702192395
+generic_f32_4x4 12 4 5 0.000003824564176783666
+generic_f32_4x4 5 32 7 0.000005434572832921092
+generic_f32_4x4 3 4 8 0.0000017566767353451639
+armv7neon_mmm_f32_8x6_cortexa7 7 32 13 0.0000067229229930212575
+armv7neon_mmm_f32_8x4_generic 8 4 13 0.0000025649388423337978
+generic_f32_4x4 11 32 12 0.000011236073634740423
+armv7neon_mmm_f32_8x6_generic 25 4 17 0.000008196880964751204
+armv7neon_mmm_f32_8x6_cortexa9 7 32 11 0.000004753650782271882
+armv7neon_mmm_f32_8x4_cortexa9 7 128 4 0.000005529569938400554
+generic_f32_4x4 13 4 13 0.000009131495505295274
+armv7neon_mmm_f32_8x6_cortexa9 9 4 7 0.00000312421222650566
+armv7neon_mmm_f32_8x6_generic 17 32 7 0.000011970222410677647
+armv7neon_mmm_f32_8x4_cortexa7 7 4 3 0.0000011523161892903704
+armv7neon_mmm_f32_8x6_cortexa7 23 128 18 0.00005908293678660039
+armv7neon_mmm_f32_8x4_cortexa9 23 4 13 0.000007508665249371487
+armv7neon_mmm_f32_8x4_cortexa9 7 32 8 0.000003875838102020609
+armv7neon_mmm_f32_8x6_cortexa7 24 32 11 0.00001256332295885272
+armv7neon_mmm_f32_8x6_cortexa9 23 128 6 0.000020319056425196668
+armv7neon_mmm_f32_8x4_generic 23 32 4 0.00000488125169723422
+armv7neon_mmm_f32_8x6_generic 24 32 18 0.00001705594140661088
+armv7neon_mmm_f32_8x4_generic 7 4 3 0.000001143118878212571
+armv7neon_mmm_f32_8x4_generic 24 4 3 0.000002536821660220204
+armv7neon_mmm_f32_8x6_cortexa9 7 4 17 0.000002840883149941229
+armv7neon_mmm_f32_8x6_cortexa7 17 32 13 0.000018113836070828896
+armv7neon_mmm_f32_8x6_cortexa7 24 4 5 0.000002893802636827323
+armv7neon_mmm_f32_8x4_cortexa9 7 128 8 0.00001055308814900546
+armv7neon_mmm_f32_8x4_cortexa9 15 4 4 0.0000017017415253459248
+armv7neon_mmm_f32_8x6_generic 9 128 17 0.00003847119775045202
+armv7neon_mmm_f32_8x4_cortexa9 17 128 7 0.00003001254835210308
+generic_f32_4x4 7 128 3 0.0000075145364744355526
+armv7neon_mmm_f32_8x6_cortexa9 24 128 13 0.00005963153865876477
+armv7neon_mmm_f32_8x6_generic 16 4 11 0.0000031955847464919697
+armv7neon_mmm_f32_8x6_generic 24 128 17 0.000057273707239946334
+armv7neon_mmm_f32_8x6_cortexa9 16 128 17 0.00004016422558583101
+armv7neon_mmm_f32_8x4_generic 24 4 7 0.000003971324732226714
+armv7neon_mmm_f32_8x6_cortexa9 25 32 11 0.0000166679830493271
+armv7neon_mmm_f32_8x4_generic 7 128 3 0.000005169798410902207
+armv7neon_mmm_f32_8x6_cortexa7 15 4 11 0.000003482445418673263
+armv7neon_mmm_f32_8x6_cortexa7 8 128 6 0.0000069109869984566186
+armv7neon_mmm_f32_8x4_cortexa9 24 32 13 0.000018445950950001155
+armv7neon_mmm_f32_8x6_generic 8 4 11 0.0000018570966900607452
+armv7neon_mmm_f32_8x6_cortexa9 24 4 6 0.000002228168146536169
+armv7neon_mmm_f32_8x6_generic 8 128 7 0.000013146479087947972
+armv7neon_mmm_f32_8x6_cortexa9 25 32 12 0.000016067383961246167
+armv7neon_mmm_f32_8x4_cortexa9 16 32 5 0.000006662778599544951
+armv7neon_mmm_f32_8x6_generic 9 4 11 0.0000031451298378122295
+armv7neon_mmm_f32_8x4_generic 16 32 7 0.000006438918321306648
+armv7neon_mmm_f32_8x6_cortexa9 7 32 7 0.000004657116192635584
+armv7neon_mmm_f32_8x6_generic 17 32 12 0.00001176030032709091
+armv7neon_mmm_f32_8x4_generic 15 4 8 0.0000028163110410039656
+armv7neon_mmm_f32_8x4_cortexa9 7 4 9 0.0000025260788973778824
+generic_f32_4x4 8 4 11 0.000003795516523798154
+armv7neon_mmm_f32_8x6_generic 15 32 6 0.0000044168833936544535
+armv7neon_mmm_f32_8x4_cortexa7 8 128 9 0.000014761720958613642
+armv7neon_mmm_f32_8x6_cortexa7 9 32 7 0.000008410754850495816
+armv7neon_mmm_f32_8x4_cortexa7 16 4 13 0.000004684953060253782
+armv7neon_mmm_f32_8x4_generic 16 32 5 0.000006354175934259964
+armv7neon_mmm_f32_8x4_generic 24 4 4 0.0000019148396982311507
+armv7neon_mmm_f32_8x4_generic 17 128 4 0.000014107689106847269
+armv7neon_mmm_f32_8x6_cortexa9 24 32 11 0.000012681731796133347
+generic_f32_4x4 3 4 5 0.0000017351550450721446
+armv7neon_mmm_f32_8x4_cortexa7 16 4 9 0.0000037512996244278985
+armv7neon_mmm_f32_8x6_cortexa9 23 32 13 0.000018600736039744996
+generic_f32_4x4 7 32 13 0.000010363567629711427
+generic_f32_4x4 7 128 13 0.000028209372228861907
+armv7neon_mmm_f32_8x6_generic 24 32 13 0.000017489604886914003
+armv7neon_mmm_f32_8x4_cortexa7 24 32 8 0.00000906737294441132
+armv7neon_mmm_f32_8x6_generic 15 32 5 0.000004544284306242495
+armv7neon_mmm_f32_8x6_cortexa9 16 128 19 0.00005301309753082383
+armv7neon_mmm_f32_8x6_cortexa7 15 128 11 0.000026895954754680612
+armv7neon_mmm_f32_8x4_cortexa9 15 32 12 0.000009927381424295026
+armv7neon_mmm_f32_8x4_cortexa9 16 128 12 0.00002922630339731252
+armv7neon_mmm_f32_8x4_cortexa9 24 4 11 0.000005481062579336857
+armv7neon_mmm_f32_8x6_generic 7 128 18 0.000019821467429857938
+armv7neon_mmm_f32_8x6_cortexa9 7 32 5 0.000002575926161215081
+armv7neon_mmm_f32_8x6_cortexa7 15 32 7 0.000008595071585263579
+armv7neon_mmm_f32_8x4_cortexa9 17 32 4 0.000005001583148868544
+armv7neon_mmm_f32_8x4_cortexa7 25 32 4 0.000006334740769256929
+armv7neon_mmm_f32_8x4_cortexa7 25 128 7 0.00003905527355458101
+generic_f32_4x4 4 4 5 0.0000016159570864189527
+armv7neon_mmm_f32_8x4_cortexa9 17 128 11 0.000044522329607090446
+armv7neon_mmm_f32_8x6_cortexa9 8 128 18 0.0000200309695734825
+armv7neon_mmm_f32_8x6_generic 24 32 19 0.000022973011330361666
+generic_f32_4x4 3 4 13 0.000002984204583865987
+armv7neon_mmm_f32_8x6_cortexa9 15 4 5 0.000002015871636210361
+generic_f32_4x4 7 4 9 0.000003999946420645267
+armv7neon_mmm_f32_8x4_generic 7 4 9 0.0000024834088306212943
+armv7neon_mmm_f32_8x4_cortexa7 25 128 9 0.00005782975703195741
+armv7neon_mmm_f32_8x4_generic 25 32 8 0.000011779346411467398
+armv7neon_mmm_f32_8x6_cortexa9 25 32 7 0.000016399753838712434
+armv7neon_mmm_f32_8x6_generic 24 4 6 0.000002204607070591687
+armv7neon_mmm_f32_8x6_cortexa7 9 32 17 0.000012402833540807695
+armv7neon_mmm_f32_8x4_cortexa7 25 128 13 0.00007672799293438395
+armv7neon_mmm_f32_8x6_generic 24 128 12 0.0000379228420372398
+armv7neon_mmm_f32_8x4_cortexa7 16 128 3 0.00001030544963732427
+armv7neon_mmm_f32_8x4_generic 15 128 3 0.000009885955298583161
+armv7neon_mmm_f32_8x4_generic 15 4 12 0.0000039769457516235065
+armv7neon_mmm_f32_8x6_generic 23 4 11 0.000004727342544050073
+armv7neon_mmm_f32_8x6_cortexa7 15 4 12 0.0000032858094110604857
+armv7neon_mmm_f32_8x4_cortexa7 16 4 3 0.0000018824484779381146
+armv7neon_mmm_f32_8x6_cortexa9 23 4 7 0.000004580846920018228
+generic_f32_4x4 11 4 8 0.00000379091798312253
+armv7neon_mmm_f32_8x6_cortexa9 23 32 18 0.00001843667314281391
+armv7neon_mmm_f32_8x6_generic 8 32 18 0.0000060026095209057905
+armv7neon_mmm_f32_8x4_cortexa7 8 32 7 0.00000356603711659208
+armv7neon_mmm_f32_8x6_generic 15 128 18 0.00003865113024109526
+armv7neon_mmm_f32_8x6_generic 7 32 5 0.0000025012501670870884
+generic_f32_4x4 12 128 3 0.00001108291043110956
+armv7neon_mmm_f32_8x6_cortexa9 9 128 17 0.00004019255842003589
+armv7neon_mmm_f32_8x4_cortexa9 15 128 3 0.000010489952376847188
+armv7neon_mmm_f32_8x4_cortexa9 7 4 5 0.0000018283099982388463
+armv7neon_mmm_f32_8x6_cortexa7 23 32 13 0.000018410568030938504
+armv7neon_mmm_f32_8x6_generic 15 4 17 0.000004759242853957918
+armv7neon_mmm_f32_8x6_cortexa7 17 128 5 0.00002027503703993735
+armv7neon_mmm_f32_8x4_generic 7 4 12 0.000002526741586689988
+armv7neon_mmm_f32_8x6_cortexa9 17 4 13 0.000006256702206265106
+armv7neon_mmm_f32_8x6_cortexa7 16 128 13 0.000039449772577428465
+generic_f32_4x4 5 32 4 0.0000028896370502300425
+armv7neon_mmm_f32_8x6_cortexa9 23 4 6 0.000002496256395769302
+armv7neon_mmm_f32_8x4_generic 17 4 9 0.000005469161034223508
+armv7neon_mmm_f32_8x4_generic 17 4 13 0.0000070099392970628585
+armv7neon_mmm_f32_8x6_generic 24 4 17 0.000006212264617752923
+armv7neon_mmm_f32_8x6_cortexa7 23 32 18 0.000018244504523002484
+armv7neon_mmm_f32_8x4_cortexa7 17 32 3 0.000005314636591625876
+armv7neon_mmm_f32_8x6_generic 23 4 13 0.00000640173388890707
+armv7neon_mmm_f32_8x6_cortexa7 25 4 19 0.00001046601238496823
+armv7neon_mmm_f32_8x4_cortexa7 9 128 5 0.000019669882407282677
+generic_f32_4x4 5 4 3 0.0000017330186385522962
+armv7neon_mmm_f32_8x6_generic 9 4 13 0.000004297017223755962
+armv7neon_mmm_f32_8x4_cortexa9 9 128 7 0.0000201239354954616
+armv7neon_mmm_f32_8x4_cortexa7 8 4 7 0.0000016739241733780589
+armv7neon_mmm_f32_8x6_cortexa7 7 4 6 0.000001271018362845475
+generic_f32_4x4 13 32 9 0.000014839920846066823
+armv7neon_mmm_f32_8x4_generic 25 4 9 0.000007058701573468786
+armv7neon_mmm_f32_8x4_cortexa7 7 32 5 0.000003729557835063842
+armv7neon_mmm_f32_8x6_cortexa9 7 128 5 0.000007173464418699434
+armv7neon_mmm_f32_8x6_cortexa7 24 128 13 0.00005881823134405547
+armv7neon_mmm_f32_8x6_cortexa9 17 32 12 0.000012231167345130436
+armv7neon_mmm_f32_8x4_cortexa7 8 4 4 0.0000009861752945826601
+armv7neon_mmm_f32_8x4_cortexa9 23 128 3 0.00001548775471015623
+armv7neon_mmm_f32_8x4_generic 8 128 9 0.000014166892351007583
+armv7neon_mmm_f32_8x4_cortexa9 25 32 3 0.000007055926590582058
+armv7neon_mmm_f32_8x6_cortexa7 8 32 6 0.0000023905642959740767
+armv7neon_mmm_f32_8x4_generic 24 4 12 0.000004705581936932163
+generic_f32_4x4 13 32 13 0.000019466458499229495
+armv7neon_mmm_f32_8x6_cortexa7 8 128 12 0.000013354945269855691
+generic_f32_4x4 4 32 8 0.000002784564616216757
+generic_f32_4x4 11 4 13 0.000007235980721358389
+armv7neon_mmm_f32_8x4_cortexa9 15 4 9 0.000004211132231298468
+armv7neon_mmm_f32_8x4_generic 23 4 7 0.000004197975758631663
+armv7neon_mmm_f32_8x4_generic 24 4 13 0.000006669346538195287
+generic_f32_4x4 9 4 11 0.000005605492408222339
+armv7neon_mmm_f32_8x4_cortexa9 7 4 7 0.0000018866714926916992
+armv7neon_mmm_f32_8x6_generic 7 4 12 0.00000203404671528183
+armv7neon_mmm_f32_8x6_cortexa9 16 32 12 0.000008156581758904169
+armv7neon_mmm_f32_8x6_generic 9 4 19 0.000005498641044027106
+generic_f32_4x4 7 4 8 0.000002762174951002869
+armv7neon_mmm_f32_8x4_cortexa9 16 4 3 0.0000019059695727093323
+armv7neon_mmm_f32_8x4_generic 25 4 12 0.00000651099979096713
+armv7neon_mmm_f32_8x6_cortexa7 24 128 5 0.000020414031398736494
+armv7neon_mmm_f32_8x6_cortexa7 7 128 18 0.000020361101695155362
+generic_f32_4x4 11 128 5 0.00002125700072345171
+armv7neon_mmm_f32_8x6_cortexa7 17 32 18 0.000017837908064150246
+armv7neon_mmm_f32_8x6_cortexa7 25 32 13 0.00002390699968855355
+armv7neon_mmm_f32_8x4_generic 23 4 5 0.0000040641271507242515
+generic_f32_4x4 8 4 3 0.0000017865876036353628
+armv7neon_mmm_f32_8x6_cortexa9 16 4 17 0.00000442887972122732
+armv7neon_mmm_f32_8x4_cortexa9 25 32 8 0.000012384930380190053
+generic_f32_4x4 5 4 5 0.000002798692518373374
+armv7neon_mmm_f32_8x6_cortexa9 8 128 6 0.0000070012080134920035
+armv7neon_mmm_f32_8x6_cortexa7 17 32 6 0.000006327458516647519
+armv7neon_mmm_f32_8x4_cortexa7 24 32 7 0.0000097071140215999
+armv7neon_mmm_f32_8x6_generic 23 32 12 0.000012023256652212629
+armv7neon_mmm_f32_8x6_generic 9 32 11 0.000008252674922417165
+armv7neon_mmm_f32_8x6_generic 9 32 6 0.000004271117675286997
+armv7neon_mmm_f32_8x4_cortexa7 7 4 4 0.0000011809046933853917
+armv7neon_mmm_f32_8x6_cortexa7 15 32 19 0.00001661331304395181
+armv7neon_mmm_f32_8x4_generic 25 128 5 0.00003728262011412803
+armv7neon_mmm_f32_8x6_cortexa9 17 128 19 0.00007942199229108003
+generic_f32_4x4 11 128 9 0.000031521930244918895
+armv7neon_mmm_f32_8x6_cortexa7 25 4 6 0.000002951526384640025
+armv7neon_mmm_f32_8x6_generic 25 128 18 0.00007559930226918245
+armv7neon_mmm_f32_8x6_generic 24 32 7 0.000011956301096424158
+armv7neon_mmm_f32_8x4_cortexa7 24 4 4 0.000001942280219822285
+armv7neon_mmm_f32_8x4_cortexa7 16 4 5 0.000002773541103005484
+generic_f32_4x4 13 4 7 0.000005120137461912069
+armv7neon_mmm_f32_8x4_generic 9 4 9 0.000003880248068324046
+armv7neon_mmm_f32_8x4_cortexa9 8 4 5 0.0000016579504301247745
+armv7neon_mmm_f32_8x6_cortexa7 7 32 18 0.000006779881862785529
+armv7neon_mmm_f32_8x4_generic 24 32 9 0.000013438884904462624
+armv7neon_mmm_f32_8x4_generic 17 32 11 0.000013721051068287207
+armv7neon_mmm_f32_8x4_cortexa7 23 32 3 0.000005392987682793705
+armv7neon_mmm_f32_8x6_cortexa9 17 4 18 0.000005996181221285945
+armv7neon_mmm_f32_8x4_generic 23 128 12 0.000041706025206470595
+armv7neon_mmm_f32_8x4_generic 25 128 7 0.00003743774855697319
+generic_f32_4x4 3 4 12 0.000002379029629256848
+armv7neon_mmm_f32_8x6_cortexa9 24 128 18 0.00005914765901143539
+armv7neon_mmm_f32_8x4_cortexa7 23 32 5 0.00000981049054917298
+armv7neon_mmm_f32_8x6_cortexa7 16 4 19 0.000005412079670267658
+armv7neon_mmm_f32_8x4_cortexa7 8 32 4 0.0000019274117652778804
+armv7neon_mmm_f32_8x6_generic 17 128 6 0.000019306808921524692
+armv7neon_mmm_f32_8x6_cortexa7 7 128 12 0.000013761467144605992
+armv7neon_mmm_f32_8x6_cortexa7 16 32 7 0.000008404964853562951
+armv7neon_mmm_f32_8x6_cortexa9 9 32 7 0.000008475033766481017
+armv7neon_mmm_f32_8x6_cortexa9 15 128 5 0.000013894752583522413
+armv7neon_mmm_f32_8x6_generic 16 128 17 0.000038424998460484644
+armv7neon_mmm_f32_8x4_cortexa7 8 32 9 0.00000497069436204451
+armv7neon_mmm_f32_8x6_generic 9 32 5 0.000004444687551155496
+armv7neon_mmm_f32_8x4_cortexa7 24 32 5 0.00000958372113096483
+armv7neon_mmm_f32_8x6_cortexa9 24 4 12 0.000003962114122866607
+armv7neon_mmm_f32_8x4_generic 15 4 11 0.000004231967658632821
+armv7neon_mmm_f32_8x6_cortexa7 24 32 13 0.000017973816622821935
+armv7neon_mmm_f32_8x4_cortexa9 23 32 13 0.000019157368846217136
+armv7neon_mmm_f32_8x4_generic 16 32 9 0.00000912682021206548
+armv7neon_mmm_f32_8x6_generic 16 32 5 0.000004588157612379825
+armv7neon_mmm_f32_8x6_generic 16 4 13 0.0000041677198842138695
+armv7neon_mmm_f32_8x6_generic 15 128 17 0.00003886891043603058
+armv7neon_mmm_f32_8x6_cortexa9 7 128 11 0.000013961383587786538
+armv7neon_mmm_f32_8x4_generic 7 128 5 0.000009860600106626588
+armv7neon_mmm_f32_8x4_cortexa9 16 4 12 0.0000033867538649589902
+armv7neon_mmm_f32_8x6_cortexa9 17 32 19 0.000024087066037173788
+armv7neon_mmm_f32_8x6_generic 9 128 5 0.000013189504068813325
+armv7neon_mmm_f32_8x6_cortexa9 8 32 18 0.000006222642744442069
+armv7neon_mmm_f32_8x6_cortexa7 24 32 12 0.000011846425063097506
+generic_f32_4x4 5 4 13 0.000004993325421206121
+armv7neon_mmm_f32_8x6_generic 8 128 11 0.000013157428320463883
+generic_f32_4x4 7 32 4 0.0000029484328830194284
+armv7neon_mmm_f32_8x4_cortexa9 7 32 9 0.000005475212517453128
+armv7neon_mmm_f32_8x4_cortexa9 9 128 13 0.00003949164898061412
+armv7neon_mmm_f32_8x4_cortexa9 23 4 11 0.000005947124934955378
+armv7neon_mmm_f32_8x6_generic 23 32 11 0.00001239533186039336
+armv7neon_mmm_f32_8x6_generic 7 32 17 0.000006661670995420206
+armv7neon_mmm_f32_8x4_cortexa7 7 4 8 0.0000018776318580783529
+armv7neon_mmm_f32_8x6_generic 8 32 5 0.0000025552685069830137
+armv7neon_mmm_f32_8x4_cortexa9 9 32 9 0.000009774693013502346
+generic_f32_4x4 13 128 5 0.000027995490696525238
+armv7neon_mmm_f32_8x4_cortexa9 8 4 11 0.0000021722237770380927
+armv7neon_mmm_f32_8x4_generic 23 128 9 0.0000419391925884448
+armv7neon_mmm_f32_8x4_cortexa9 25 32 7 0.000013031732285626503
+armv7neon_mmm_f32_8x4_cortexa7 25 32 12 0.000017966835523846677
+armv7neon_mmm_f32_8x4_cortexa7 16 32 11 0.000009494335431932482
+armv7neon_mmm_f32_8x6_cortexa7 17 128 19 0.0000783675036460944
+armv7neon_mmm_f32_8x6_cortexa7 17 4 7 0.0000044328101817797105
+generic_f32_4x4 3 32 8 0.000003088179719444515
+generic_f32_4x4 12 4 11 0.000005408469300584859
+armv7neon_mmm_f32_8x4_cortexa7 23 32 13 0.000018862031131309997
+armv7neon_mmm_f32_8x4_generic 23 32 11 0.000014024445889402156
+generic_f32_4x4 4 128 11 0.00001075990813733738
+armv7neon_mmm_f32_8x6_generic 24 4 7 0.000004301779173383875
+armv7neon_mmm_f32_8x6_cortexa9 25 128 12 0.000052850505948183045
+armv7neon_mmm_f32_8x4_generic 15 32 4 0.0000034946744383909553
+armv7neon_mmm_f32_8x4_generic 17 4 3 0.0000024363476636608272
+armv7neon_mmm_f32_8x4_cortexa7 16 128 4 0.000009891301569266948
+armv7neon_mmm_f32_8x6_cortexa9 25 4 13 0.000008105640345047328
+armv7neon_mmm_f32_8x4_cortexa9 16 4 4 0.000001487225488304516
+generic_f32_4x4 5 32 9 0.000007777979342543209
+armv7neon_mmm_f32_8x6_cortexa7 8 4 19 0.0000029885727357209406
+generic_f32_4x4 8 32 9 0.000007585061806416855
+armv7neon_mmm_f32_8x4_cortexa7 17 128 13 0.000057643034032365925
+armv7neon_mmm_f32_8x4_generic 7 128 4 0.000005216038661083481
+armv7neon_mmm_f32_8x4_cortexa7 16 32 12 0.000009049743188132303
+armv7neon_mmm_f32_8x4_cortexa9 17 4 8 0.000003682177568678712
+armv7neon_mmm_f32_8x4_cortexa9 17 32 11 0.000014400917572892256
+armv7neon_mmm_f32_8x4_cortexa9 15 4 12 0.0000040529055101759995
+armv7neon_mmm_f32_8x4_cortexa7 15 4 11 0.000004271052070252677
+armv7neon_mmm_f32_8x6_cortexa7 16 128 5 0.000013791269931666714
+armv7neon_mmm_f32_8x6_generic 8 32 7 0.000004325189484240455
+generic_f32_4x4 3 128 7 0.000007522472627920054
+armv7neon_mmm_f32_8x6_cortexa9 9 32 6 0.000004463650285453612
+generic_f32_4x4 11 32 7 0.000007948963253145964
+armv7neon_mmm_f32_8x4_cortexa9 24 128 4 0.000014890215129529216
+armv7neon_mmm_f32_8x6_cortexa9 16 4 19 0.000005414025318591343
+armv7neon_mmm_f32_8x6_cortexa7 17 4 6 0.00000238128254631235
+armv7neon_mmm_f32_8x4_cortexa9 24 4 5 0.000003916194410369112
+armv7neon_mmm_f32_8x4_cortexa9 8 128 9 0.000015067666888020707
+armv7neon_mmm_f32_8x6_cortexa7 23 32 11 0.000012775953874196187
+generic_f32_4x4 8 32 3 0.0000030910360857362123
+armv7neon_mmm_f32_8x4_cortexa7 7 32 3 0.0000021042104056045423
+armv7neon_mmm_f32_8x6_cortexa9 25 32 5 0.000008813546121201035
+armv7neon_mmm_f32_8x6_cortexa7 25 4 17 0.000008369215065652083
+generic_f32_4x4 3 32 9 0.000004340286298208673
+armv7neon_mmm_f32_8x4_generic 17 4 5 0.000003911088956924729
+generic_f32_4x4 9 4 13 0.000007067880206932553
+armv7neon_mmm_f32_8x4_cortexa7 8 128 13 0.00001944164572907436
+armv7neon_mmm_f32_8x4_cortexa7 25 4 8 0.000004598061889383109
+armv7neon_mmm_f32_8x4_generic 15 32 5 0.000006570904917754489
+armv7neon_mmm_f32_8x4_cortexa9 7 4 4 0.000001199774667010064
+armv7neon_mmm_f32_8x4_cortexa9 16 32 9 0.000009580069903775671
+armv7neon_mmm_f32_8x6_generic 17 4 19 0.000007879961865942272
+armv7neon_mmm_f32_8x4_cortexa7 23 128 13 0.00005805408197282095
+armv7neon_mmm_f32_8x6_generic 25 32 13 0.000023213828889712974
+armv7neon_mmm_f32_8x6_cortexa9 23 4 17 0.000006797095061554012
+armv7neon_mmm_f32_8x6_cortexa9 24 32 6 0.000006232031189380453
+armv7neon_mmm_f32_8x4_cortexa7 15 128 11 0.00002955574134898906
+armv7neon_mmm_f32_8x6_cortexa9 24 4 11 0.000004626561901203258
+armv7neon_mmm_f32_8x6_cortexa9 8 4 7 0.0000018344586562526734
+armv7neon_mmm_f32_8x6_generic 15 4 18 0.000004518763751153566
+armv7neon_mmm_f32_8x6_cortexa9 16 128 11 0.000027030003785723685
+armv7neon_mmm_f32_8x4_cortexa7 9 4 4 0.0000015888326511808313
+armv7neon_mmm_f32_8x4_cortexa7 7 4 12 0.0000025479152659232096
+armv7neon_mmm_f32_8x4_cortexa7 23 128 7 0.000029563915293732792
+armv7neon_mmm_f32_8x4_cortexa9 25 4 13 0.000009216194666264275
+generic_f32_4x4 4 4 7 0.0000016464048874437295
+armv7neon_mmm_f32_8x4_cortexa7 7 128 11 0.00001525304959159917
+armv7neon_mmm_f32_8x4_cortexa9 9 4 9 0.000003956900236836743
+generic_f32_4x4 4 128 8 0.000007233357504684728
+armv7neon_mmm_f32_8x4_cortexa9 16 128 9 0.000029612198853914824
+armv7neon_mmm_f32_8x4_generic 15 128 12 0.000028154522908512787
+armv7neon_mmm_f32_8x6_cortexa9 8 128 5 0.000007227187608771384
+armv7neon_mmm_f32_8x4_cortexa9 25 128 7 0.000039766026339765396
+generic_f32_4x4 9 128 5 0.00002114949804637606
+armv7neon_mmm_f32_8x4_generic 17 32 4 0.000004758199768605913
+armv7neon_mmm_f32_8x6_cortexa9 7 4 13 0.0000027386176506007003
+generic_f32_4x4 7 32 8 0.000005375358081458721
+armv7neon_mmm_f32_8x6_cortexa9 8 4 18 0.000002220900812833037
+armv7neon_mmm_f32_8x4_cortexa9 16 128 11 0.000029669482734966696
+armv7neon_mmm_f32_8x6_cortexa7 8 32 18 0.000006161276860676508
+armv7neon_mmm_f32_8x4_cortexa7 8 32 8 0.000003353396080834466
+armv7neon_mmm_f32_8x6_generic 16 128 13 0.000038174416411990654
+armv7neon_mmm_f32_8x6_generic 15 32 19 0.00001612698422472605
+armv7neon_mmm_f32_8x4_cortexa9 9 128 3 0.000010402587362113396
+armv7neon_mmm_f32_8x6_cortexa9 9 4 11 0.0000032196424297717835
+armv7neon_mmm_f32_8x4_cortexa9 8 128 11 0.000015086238127499093
+armv7neon_mmm_f32_8x6_generic 16 128 19 0.00005061640609152132
+armv7neon_mmm_f32_8x4_generic 16 128 9 0.00002782092751958622
+generic_f32_4x4 4 32 4 0.000001643238458184986
+armv7neon_mmm_f32_8x6_cortexa7 17 4 11 0.0000046002558984855774
+generic_f32_4x4 5 4 9 0.0000039003390490700075
+generic_f32_4x4 9 128 7 0.000021241983402647673
+generic_f32_4x4 7 4 13 0.000005155057897108874
+armv7neon_mmm_f32_8x6_generic 17 4 17 0.000006303527455540143
+armv7neon_mmm_f32_8x4_generic 16 4 13 0.000004620655887935485
+armv7neon_mmm_f32_8x4_cortexa7 8 128 5 0.000010060854723538398
+armv7neon_mmm_f32_8x4_cortexa9 16 32 11 0.000009653502461801779
+armv7neon_mmm_f32_8x6_generic 23 128 5 0.00001973212006224306
+armv7neon_mmm_f32_8x4_generic 8 4 7 0.000001657225281211141
+armv7neon_mmm_f32_8x6_generic 16 4 12 0.0000027740993216529004
+armv7neon_mmm_f32_8x6_cortexa9 15 128 18 0.00004029185998372536
+armv7neon_mmm_f32_8x4_cortexa9 25 32 4 0.000006456592320044242
+armv7neon_mmm_f32_8x6_cortexa7 25 128 12 0.000052118173447989496
+armv7neon_mmm_f32_8x4_cortexa7 25 32 9 0.000018537854346104505
+armv7neon_mmm_f32_8x6_cortexa7 16 4 7 0.000003147284440858119
+armv7neon_mmm_f32_8x4_generic 17 128 5 0.000028048493049013062
+armv7neon_mmm_f32_8x6_cortexa7 23 4 18 0.000006335447404352516
+armv7neon_mmm_f32_8x6_cortexa9 24 4 19 0.000007813810057423332
+armv7neon_mmm_f32_8x4_cortexa7 17 128 8 0.000028915520714406696
+armv7neon_mmm_f32_8x4_cortexa7 17 128 7 0.000029363768223094683
+generic_f32_4x4 3 32 11 0.0000043748460947431475
+armv7neon_mmm_f32_8x4_cortexa9 9 128 5 0.0000200735894165016
+armv7neon_mmm_f32_8x4_cortexa7 17 4 3 0.000002458111098809617
+generic_f32_4x4 5 128 11 0.0000211987993849004
+generic_f32_4x4 12 32 7 0.00000782487986470221
+armv7neon_mmm_f32_8x4_generic 7 128 11 0.000014659497491826627
+armv7neon_mmm_f32_8x6_cortexa7 17 128 11 0.00003969667780371537
+armv7neon_mmm_f32_8x6_generic 7 4 6 0.0000012612776856940052
+armv7neon_mmm_f32_8x6_cortexa9 7 4 6 0.0000012711169589040185
+armv7neon_mmm_f32_8x6_cortexa7 8 4 13 0.0000024259808837385263
+armv7neon_mmm_f32_8x4_cortexa7 24 128 9 0.000043285670969312205
+armv7neon_mmm_f32_8x6_generic 15 128 19 0.00005126058311511034
+generic_f32_4x4 13 128 11 0.00004170340313011054
+armv7neon_mmm_f32_8x6_cortexa9 8 128 12 0.000013510136481590433
+armv7neon_mmm_f32_8x6_generic 23 32 17 0.00001815155326925384
+armv7neon_mmm_f32_8x4_generic 24 32 8 0.000008758373656325976
+armv7neon_mmm_f32_8x4_generic 15 32 13 0.000012566353080909321
+armv7neon_mmm_f32_8x4_generic 15 4 9 0.000004138286673960469
+armv7neon_mmm_f32_8x4_cortexa7 16 32 4 0.000003365053985432853
+armv7neon_mmm_f32_8x4_cortexa7 9 128 3 0.0000101978289798725
+generic_f32_4x4 11 32 8 0.000007681371193157391
+generic_f32_4x4 7 128 9 0.00002127278439974851
+armv7neon_mmm_f32_8x4_cortexa7 24 4 13 0.000006759425188306811
+armv7neon_mmm_f32_8x4_cortexa7 24 32 9 0.000013896132257150766
+armv7neon_mmm_f32_8x4_cortexa9 25 32 9 0.000018829874971593756
+armv7neon_mmm_f32_8x4_cortexa7 9 32 4 0.000003474275688786252
+armv7neon_mmm_f32_8x6_cortexa7 8 32 7 0.0000044833206937891445
+generic_f32_4x4 3 32 7 0.0000030714723992174605
+armv7neon_mmm_f32_8x6_cortexa9 17 4 11 0.000004585922035285582
+armv7neon_mmm_f32_8x4_cortexa9 24 128 5 0.000029781338716311416
+armv7neon_mmm_f32_8x6_generic 15 32 18 0.000012214298327316169
+armv7neon_mmm_f32_8x6_generic 25 32 6 0.000007957229153899586
+armv7neon_mmm_f32_8x6_cortexa9 9 32 18 0.000012307833924250695
+armv7neon_mmm_f32_8x6_generic 8 32 13 0.000006159219915111089
+armv7neon_mmm_f32_8x4_cortexa7 8 128 3 0.000005401956819775254
+armv7neon_mmm_f32_8x4_generic 15 32 9 0.000009577293347035024
+armv7neon_mmm_f32_8x4_cortexa9 25 128 13 0.00007830393224178603
+generic_f32_4x4 4 4 13 0.0000026249296920626683
+armv7neon_mmm_f32_8x6_cortexa9 8 32 6 0.000002404882860716393
+armv7neon_mmm_f32_8x6_cortexa9 9 32 5 0.0000045921033389599396
+armv7neon_mmm_f32_8x6_generic 17 128 11 0.000038492804993535863
+armv7neon_mmm_f32_8x4_generic 17 32 9 0.000013624172849176604
+armv7neon_mmm_f32_8x4_cortexa9 24 4 9 0.000005393334776799241
+armv7neon_mmm_f32_8x4_cortexa9 23 32 8 0.000009723809855547939
+armv7neon_mmm_f32_8x4_cortexa9 25 4 4 0.0000025900920077984026
+armv7neon_mmm_f32_8x6_generic 23 4 5 0.0000027608446978317408
+armv7neon_mmm_f32_8x4_generic 8 4 12 0.0000019060496583230645
+armv7neon_mmm_f32_8x4_cortexa7 15 32 8 0.000006678781571694655
+generic_f32_4x4 4 4 9 0.0000021081574199754717
+armv7neon_mmm_f32_8x4_cortexa7 17 128 9 0.00004346458158147509
+armv7neon_mmm_f32_8x4_generic 25 4 8 0.000004544403472760322
+armv7neon_mmm_f32_8x4_generic 24 128 4 0.000013981279265161887
+armv7neon_mmm_f32_8x6_cortexa7 25 4 12 0.000005353769509302435
+armv7neon_mmm_f32_8x4_cortexa9 15 4 8 0.000002871100263594495
+armv7neon_mmm_f32_8x6_cortexa7 9 4 12 0.0000030500030063725167
+armv7neon_mmm_f32_8x4_cortexa9 15 128 7 0.000020348668200781455
+generic_f32_4x4 4 4 3 0.0000011380263710773134
+armv7neon_mmm_f32_8x4_generic 17 32 7 0.000009461004385359903
+armv7neon_mmm_f32_8x6_cortexa7 16 4 11 0.000003292952568894936
+armv7neon_mmm_f32_8x4_generic 9 128 7 0.0000189264627670604
+armv7neon_mmm_f32_8x4_cortexa7 9 32 12 0.000009404582356065284
+armv7neon_mmm_f32_8x6_cortexa7 24 4 11 0.000004645745440552646
+armv7neon_mmm_f32_8x4_generic 7 4 8 0.0000018628381186813514
+generic_f32_4x4 13 32 12 0.000014451033360231452
+armv7neon_mmm_f32_8x6_generic 7 128 11 0.00001335338354211472
+armv7neon_mmm_f32_8x4_cortexa9 24 32 5 0.000009736881086291142
+armv7neon_mmm_f32_8x4_generic 9 32 8 0.000006244209351860461
+armv7neon_mmm_f32_8x4_cortexa7 23 32 7 0.000009963814618687657
+armv7neon_mmm_f32_8x6_cortexa7 24 128 7 0.000039514548121680776
+armv7neon_mmm_f32_8x4_cortexa7 17 32 5 0.000009649055437118146
+armv7neon_mmm_f32_8x6_generic 7 128 7 0.000013256390141337729
+armv7neon_mmm_f32_8x6_cortexa7 9 128 17 0.00003956707654525446
+armv7neon_mmm_f32_8x4_generic 9 128 8 0.000018708339822835962
+generic_f32_4x4 9 4 7 0.00000399084452417743
+armv7neon_mmm_f32_8x4_generic 15 128 8 0.000018936209183566976
+armv7neon_mmm_f32_8x6_cortexa7 9 32 19 0.00001616084997042782
+armv7neon_mmm_f32_8x6_generic 16 128 5 0.00001333369244237612
+armv7neon_mmm_f32_8x4_cortexa7 9 32 3 0.0000036715654135840626
+armv7neon_mmm_f32_8x4_cortexa9 23 32 5 0.000009965038150728467
+armv7neon_mmm_f32_8x6_cortexa7 9 128 7 0.000026494406670274787
+armv7neon_mmm_f32_8x4_cortexa9 24 128 13 0.00005851957886711075
+armv7neon_mmm_f32_8x6_cortexa7 9 4 18 0.000004273474311217914
+generic_f32_4x4 5 32 12 0.000007635355402928665
+armv7neon_mmm_f32_8x4_cortexa9 7 128 7 0.000010528249080204253
+armv7neon_mmm_f32_8x6_cortexa9 24 32 19 0.00002386884326098149
+armv7neon_mmm_f32_8x6_generic 8 4 7 0.0000017826372979170567
+armv7neon_mmm_f32_8x4_generic 7 4 5 0.000001794625708774814
+armv7neon_mmm_f32_8x6_cortexa7 9 4 6 0.000001801617764943881
+armv7neon_mmm_f32_8x4_generic 25 32 13 0.00002352372957802219
+armv7neon_mmm_f32_8x4_generic 24 128 13 0.00005496909042425587
+armv7neon_mmm_f32_8x6_cortexa9 23 32 17 0.000018875847839557392
+generic_f32_4x4 12 32 4 0.000003926824640983302
+armv7neon_mmm_f32_8x6_cortexa9 7 128 18 0.000020631769324589798
+armv7neon_mmm_f32_8x6_cortexa9 8 128 19 0.000026726033464302463
+armv7neon_mmm_f32_8x6_cortexa7 7 4 5 0.0000012346138819052426
+armv7neon_mmm_f32_8x4_generic 17 128 7 0.00002816390176070489
+armv7neon_mmm_f32_8x6_generic 9 32 19 0.000015684300494904117
+armv7neon_mmm_f32_8x4_cortexa7 9 128 4 0.000010001875449763685
+armv7neon_mmm_f32_8x4_generic 23 32 13 0.000018251850356188044
+generic_f32_4x4 13 4 3 0.000003021647059953821
+armv7neon_mmm_f32_8x6_cortexa9 25 128 7 0.000053220906812986594
+armv7neon_mmm_f32_8x4_cortexa9 16 128 8 0.00001967608213206112
+armv7neon_mmm_f32_8x6_generic 24 32 5 0.000006620165371496042
+armv7neon_mmm_f32_8x4_cortexa7 9 4 7 0.0000028674611086712135
+armv7neon_mmm_f32_8x4_cortexa9 17 4 11 0.000005653151614630126
+generic_f32_4x4 13 4 4 0.000002594385628155042
+armv7neon_mmm_f32_8x6_generic 25 128 5 0.000026094659734105824
+armv7neon_mmm_f32_8x4_generic 17 32 3 0.000005161144602046728
+armv7neon_mmm_f32_8x4_cortexa9 24 32 11 0.000014220571969428708
+generic_f32_4x4 4 128 3 0.000004015357652724343
+armv7neon_mmm_f32_8x4_cortexa9 8 128 5 0.00001027176965651363
+armv7neon_mmm_f32_8x6_cortexa7 15 4 17 0.000004874430261781061
+armv7neon_mmm_f32_8x4_cortexa7 15 4 12 0.0000040169005856724945
+armv7neon_mmm_f32_8x4_cortexa7 8 32 12 0.00000477050855780904
+armv7neon_mmm_f32_8x6_cortexa7 9 4 7 0.0000031516329281183407
+armv7neon_mmm_f32_8x4_generic 7 32 11 0.000005308351521432454
+armv7neon_mmm_f32_8x4_cortexa7 25 32 5 0.000012670365676368953
+generic_f32_4x4 5 128 5 0.000014294465011666321
+armv7neon_mmm_f32_8x6_cortexa7 7 32 17 0.000006818696484359277
+armv7neon_mmm_f32_8x4_cortexa9 24 32 7 0.00000986925847989351
+generic_f32_4x4 4 128 13 0.000014106152963483687
+armv7neon_mmm_f32_8x4_cortexa9 16 128 3 0.000010512285712675924
+generic_f32_4x4 8 128 4 0.000007240602279573043
+armv7neon_mmm_f32_8x4_cortexa9 7 128 9 0.00001548686209510202
+armv7neon_mmm_f32_8x4_cortexa9 7 32 3 0.000002138015694680355
+armv7neon_mmm_f32_8x4_generic 24 32 13 0.000017536918553280824
+armv7neon_mmm_f32_8x6_cortexa9 23 4 12 0.0000044455569065126315
+armv7neon_mmm_f32_8x4_cortexa7 8 4 13 0.0000026000896469710663
+armv7neon_mmm_f32_8x4_cortexa7 25 32 8 0.000012179917551949525
+generic_f32_4x4 5 32 5 0.000005390504735926875
+generic_f32_4x4 11 128 13 0.00004175553018581371
+armv7neon_mmm_f32_8x4_cortexa7 7 32 4 0.0000021521595449855865
+armv7neon_mmm_f32_8x4_cortexa9 8 4 9 0.000002148291671877107
+armv7neon_mmm_f32_8x6_cortexa7 24 4 18 0.000005654885858199984
+armv7neon_mmm_f32_8x4_generic 25 128 9 0.00005533756635099798
+armv7neon_mmm_f32_8x4_cortexa9 9 128 9 0.00002978120797293901
+armv7neon_mmm_f32_8x6_cortexa7 8 32 12 0.000004281164608278993
+armv7neon_mmm_f32_8x4_generic 9 4 13 0.000004935869563311462
+armv7neon_mmm_f32_8x4_cortexa7 23 128 3 0.000015184873420309865
+generic_f32_4x4 12 4 12 0.000004912969841843077
+armv7neon_mmm_f32_8x6_generic 16 4 7 0.0000030460066806302122
+armv7neon_mmm_f32_8x4_cortexa7 9 4 13 0.000004992355989646887
+armv7neon_mmm_f32_8x6_cortexa9 15 4 12 0.0000032832021366886516
+armv7neon_mmm_f32_8x6_cortexa9 17 4 7 0.000004416231510276644
+armv7neon_mmm_f32_8x4_generic 9 128 5 0.000018874961280340985
+armv7neon_mmm_f32_8x4_cortexa7 25 4 9 0.000007146809416343016
+armv7neon_mmm_f32_8x6_cortexa9 23 4 5 0.000002791403862004092
+generic_f32_4x4 3 128 11 0.000011052312539972672
+armv7neon_mmm_f32_8x4_cortexa7 16 32 13 0.000012274313788252517
+generic_f32_4x4 5 4 8 0.000002691374011509211
+armv7neon_mmm_f32_8x6_generic 8 4 12 0.0000016391318699699268
+generic_f32_4x4 8 4 12 0.0000034590196765442797
+armv7neon_mmm_f32_8x6_generic 17 128 18 0.00005685255235859907
+armv7neon_mmm_f32_8x6_cortexa9 17 128 6 0.00002017521356859541
+armv7neon_mmm_f32_8x6_generic 25 128 6 0.00002553797390337986
+armv7neon_mmm_f32_8x6_cortexa9 25 128 5 0.000027210481604244293
+armv7neon_mmm_f32_8x4_cortexa7 15 128 7 0.000019945561157727263
+armv7neon_mmm_f32_8x4_generic 7 4 11 0.000002550958711536068
+generic_f32_4x4 13 4 8 0.000004699429428545618
+armv7neon_mmm_f32_8x4_cortexa9 9 4 5 0.0000028517696924614755
+armv7neon_mmm_f32_8x4_cortexa9 8 4 3 0.0000012050980961045702
+armv7neon_mmm_f32_8x6_cortexa9 24 128 17 0.000059842842681725297
+armv7neon_mmm_f32_8x6_cortexa9 25 128 6 0.000026691322740044253
+generic_f32_4x4 9 32 3 0.000004333031052520466
+armv7neon_mmm_f32_8x6_cortexa7 9 32 11 0.000008502744568312225
+armv7neon_mmm_f32_8x6_cortexa9 16 32 11 0.000008643328393303817
+armv7neon_mmm_f32_8x6_cortexa9 16 4 11 0.00000327884229270897
+armv7neon_mmm_f32_8x4_generic 8 128 5 0.000009661930895932827
+armv7neon_mmm_f32_8x4_cortexa9 15 4 5 0.0000029989007242676373
+armv7neon_mmm_f32_8x4_cortexa7 23 32 11 0.000014477721812785248
+armv7neon_mmm_f32_8x6_cortexa7 9 4 13 0.000004411946245984795
+armv7neon_mmm_f32_8x6_generic 7 128 12 0.000013345094425498999
+armv7neon_mmm_f32_8x4_cortexa9 24 128 11 0.00004426398857587651
+armv7neon_mmm_f32_8x6_generic 25 128 7 0.00005086677641643256
+armv7neon_mmm_f32_8x6_generic 16 32 17 0.00001197546406643872
+generic_f32_4x4 4 32 13 0.000005196854104063007
+armv7neon_mmm_f32_8x4_cortexa7 16 4 7 0.000002852179426317928
+armv7neon_mmm_f32_8x6_cortexa7 23 4 7 0.000004598943528457281
+armv7neon_mmm_f32_8x6_generic 17 32 6 0.000006120980761955415
+armv7neon_mmm_f32_8x6_cortexa7 17 32 17 0.00001829474178461942
+generic_f32_4x4 7 4 12 0.000003918843035175344
+armv7neon_mmm_f32_8x6_generic 9 4 7 0.0000030530125579888694
+armv7neon_mmm_f32_8x6_cortexa9 23 4 13 0.000006537462226861253
+generic_f32_4x4 12 32 11 0.000011229611140019307
+armv7neon_mmm_f32_8x4_cortexa7 7 32 9 0.000005392896316953727
+armv7neon_mmm_f32_8x4_cortexa9 15 4 7 0.00000310030600994376
+armv7neon_mmm_f32_8x4_cortexa9 23 32 3 0.000005486033134258231
+armv7neon_mmm_f32_8x6_cortexa7 25 4 11 0.000005954018792233075
+armv7neon_mmm_f32_8x4_generic 23 128 8 0.000027951206051686114
+armv7neon_mmm_f32_8x6_generic 23 4 12 0.000004342351795004267
+armv7neon_mmm_f32_8x4_cortexa9 8 32 5 0.0000035943022440195
+armv7neon_mmm_f32_8x6_cortexa9 7 4 18 0.0000028075381670446466
+armv7neon_mmm_f32_8x4_generic 8 32 13 0.0000061830339053399435
+armv7neon_mmm_f32_8x6_generic 15 32 7 0.000008321528944284549
+armv7neon_mmm_f32_8x6_cortexa7 16 128 6 0.00001332317233161335
+armv7neon_mmm_f32_8x4_generic 9 4 12 0.000003689645307663438
+armv7neon_mmm_f32_8x4_cortexa9 16 32 7 0.000006751619359417928
+generic_f32_4x4 7 4 4 0.0000016418721244643219
+armv7neon_mmm_f32_8x4_generic 7 128 13 0.000019287387814897854
+armv7neon_mmm_f32_8x6_cortexa9 23 4 11 0.000004831555332090134
+armv7neon_mmm_f32_8x4_cortexa9 8 128 8 0.00001009639921913338
+armv7neon_mmm_f32_8x6_generic 23 32 5 0.000006582337877689147
+armv7neon_mmm_f32_8x4_generic 25 128 3 0.00001920626106575073
+armv7neon_mmm_f32_8x4_generic 7 32 8 0.000003711210711678028
+armv7neon_mmm_f32_8x6_cortexa7 16 128 19 0.00005218314273770285
+armv7neon_mmm_f32_8x6_cortexa9 7 128 12 0.000013936034942170719
+armv7neon_mmm_f32_8x6_generic 17 32 5 0.000006487710283772798
+armv7neon_mmm_f32_8x4_generic 8 128 7 0.000009698770362126403
+armv7neon_mmm_f32_8x4_generic 15 4 7 0.0000030411777158900867
+armv7neon_mmm_f32_8x6_generic 25 128 19 0.00010155326542007438
+armv7neon_mmm_f32_8x6_cortexa7 9 128 5 0.000013650595324888383
+armv7neon_mmm_f32_8x4_cortexa9 9 32 11 0.00000982080901853953
+generic_f32_4x4 9 32 11 0.00001141647780993558
+generic_f32_4x4 8 32 11 0.000007662846153151327
+armv7neon_mmm_f32_8x6_generic 8 128 17 0.000019396378758926843
+armv7neon_mmm_f32_8x6_cortexa9 8 4 17 0.00000248907607573393
+generic_f32_4x4 13 128 12 0.000041173556961397094
+armv7neon_mmm_f32_8x4_generic 25 32 3 0.0000067405543706379265
+armv7neon_mmm_f32_8x6_cortexa9 16 4 13 0.000004284450398155638
+armv7neon_mmm_f32_8x6_generic 7 32 13 0.000006564286527582003
+generic_f32_4x4 9 4 12 0.000005239644799087451
+generic_f32_4x4 3 128 12 0.00001105053431182824
+armv7neon_mmm_f32_8x4_cortexa9 16 32 12 0.000009204841041049037
+armv7neon_mmm_f32_8x4_cortexa9 8 4 7 0.0000016950881959591487
+armv7neon_mmm_f32_8x4_cortexa9 8 4 8 0.00000148340332010083
+armv7neon_mmm_f32_8x4_cortexa9 24 32 4 0.000004874669124633379
+armv7neon_mmm_f32_8x4_cortexa9 16 128 7 0.00002009832072993214
+armv7neon_mmm_f32_8x4_generic 16 4 5 0.000002739180418018979
+armv7neon_mmm_f32_8x4_cortexa9 8 4 12 0.000001953874992361998
+armv7neon_mmm_f32_8x4_generic 15 4 4 0.0000016638026379811712
+armv7neon_mmm_f32_8x4_cortexa9 25 128 8 0.000039087913140379676
+armv7neon_mmm_f32_8x6_cortexa9 17 4 12 0.000004212564422068874
+armv7neon_mmm_f32_8x6_cortexa9 8 32 5 0.00000263061064210809
+armv7neon_mmm_f32_8x4_generic 8 4 5 0.000001623245576042727
+armv7neon_mmm_f32_8x6_cortexa7 15 128 6 0.000013618693064061564
+armv7neon_mmm_f32_8x6_cortexa7 7 4 18 0.000002804843066747126
+generic_f32_4x4 3 4 11 0.0000023913102926543824
+armv7neon_mmm_f32_8x6_generic 16 4 6 0.000001638807649221633
+armv7neon_mmm_f32_8x6_generic 8 128 6 0.0000067051548997103826
+armv7neon_mmm_f32_8x6_cortexa9 16 4 6 0.0000016568160988445508
+armv7neon_mmm_f32_8x6_cortexa9 16 128 7 0.000026851544464649677
+armv7neon_mmm_f32_8x4_cortexa9 8 128 12 0.000014872272678272856
+armv7neon_mmm_f32_8x6_generic 24 128 19 0.00007580588220559739
+generic_f32_4x4 3 128 5 0.0000074971724012246544
+generic_f32_4x4 3 128 8 0.000007539035525450043
+armv7neon_mmm_f32_8x6_cortexa9 24 128 7 0.00004003350952160301
+armv7neon_mmm_f32_8x4_cortexa7 23 32 9 0.000014356067971382847
+armv7neon_mmm_f32_8x6_cortexa9 15 32 18 0.000012709917183710011
+generic_f32_4x4 9 128 3 0.000011011983185480526
+armv7neon_mmm_f32_8x4_generic 15 128 7 0.000019154850070723174
+generic_f32_4x4 7 32 12 0.000007829205768353918
+generic_f32_4x4 12 4 3 0.000002428328614613921
+armv7neon_mmm_f32_8x4_cortexa7 8 128 4 0.000005191807529366239
+armv7neon_mmm_f32_8x4_cortexa9 17 128 3 0.00001540571905602067
+armv7neon_mmm_f32_8x4_cortexa9 17 128 9 0.000044358855138628255
+armv7neon_mmm_f32_8x4_generic 8 32 4 0.0000018712794095957982
+armv7neon_mmm_f32_8x6_cortexa9 9 128 7 0.000026855539948208353
+armv7neon_mmm_f32_8x6_cortexa7 7 128 13 0.000020286298173428065
+armv7neon_mmm_f32_8x4_cortexa7 7 32 7 0.0000037916453247576294
+armv7neon_mmm_f32_8x4_cortexa7 16 128 11 0.000029071525948432538
+armv7neon_mmm_f32_8x4_generic 8 128 8 0.000009479495707619602
+armv7neon_mmm_f32_8x4_cortexa7 9 32 13 0.000012575930597830614
+armv7neon_mmm_f32_8x6_cortexa7 7 4 7 0.0000019563323222872665
+armv7neon_mmm_f32_8x4_cortexa7 15 128 3 0.000010282742767020747
+armv7neon_mmm_f32_8x4_generic 17 128 9 0.00004166868490626211
+armv7neon_mmm_f32_8x6_cortexa9 15 32 11 0.000008848994549844927
+armv7neon_mmm_f32_8x4_cortexa7 16 128 7 0.000019694781274526964
+armv7neon_mmm_f32_8x6_cortexa9 7 128 17 0.00002067047797610865
+armv7neon_mmm_f32_8x4_cortexa7 16 4 8 0.000002421339765366324
+armv7neon_mmm_f32_8x6_cortexa9 25 4 19 0.000010458910160208884
+armv7neon_mmm_f32_8x4_cortexa7 17 4 12 0.000005174031688170145
+armv7neon_mmm_f32_8x4_cortexa7 8 128 7 0.000010096523777664552
+armv7neon_mmm_f32_8x4_cortexa7 9 32 11 0.000009664173576551376
+armv7neon_mmm_f32_8x4_cortexa9 15 32 13 0.000013172771066560177
+armv7neon_mmm_f32_8x6_cortexa7 15 32 6 0.0000045760506363113526
+armv7neon_mmm_f32_8x4_generic 16 128 7 0.000018904285841254675
+armv7neon_mmm_f32_8x6_cortexa9 24 32 7 0.000012425961839460377
+armv7neon_mmm_f32_8x4_cortexa7 8 128 8 0.00000988276098193371
+armv7neon_mmm_f32_8x4_cortexa7 25 32 13 0.000024332756642022888
+armv7neon_mmm_f32_8x6_cortexa9 8 4 12 0.000001657247635058824
+armv7neon_mmm_f32_8x4_cortexa7 8 4 9 0.000002131892294337394
+generic_f32_4x4 8 4 8 0.000002499295427359815
+armv7neon_mmm_f32_8x6_cortexa9 16 128 5 0.000013927438070206447
+generic_f32_4x4 3 4 7 0.0000017546173861422715
+armv7neon_mmm_f32_8x4_cortexa7 23 128 12 0.000043445426131070976
+armv7neon_mmm_f32_8x4_generic 8 128 12 0.00001395887153959496
+armv7neon_mmm_f32_8x4_cortexa9 7 4 11 0.0000025905119707065483
+generic_f32_4x4 12 128 4 0.00001060459652081298
+armv7neon_mmm_f32_8x6_generic 17 128 13 0.00005710733590612565
+armv7neon_mmm_f32_8x4_cortexa7 7 4 11 0.000002572252868831556
+armv7neon_mmm_f32_8x6_generic 15 128 11 0.000026019339889436572
+generic_f32_4x4 9 4 5 0.000003906378066863749
+armv7neon_mmm_f32_8x6_cortexa9 15 4 17 0.00000485956993190203
+armv7neon_mmm_f32_8x6_generic 9 32 17 0.000012038137584352969
+armv7neon_mmm_f32_8x4_cortexa9 8 32 11 0.000005081279026778461
+armv7neon_mmm_f32_8x4_cortexa7 25 128 11 0.000057856501014562726
+armv7neon_mmm_f32_8x6_cortexa9 25 32 19 0.00003186559880573022
+armv7neon_mmm_f32_8x6_cortexa9 8 32 12 0.000004319717163614961
+armv7neon_mmm_f32_8x6_cortexa9 8 32 19 0.000008329624592686573
+armv7neon_mmm_f32_8x6_cortexa9 9 32 13 0.000012409078932821099
+armv7neon_mmm_f32_8x6_cortexa9 17 128 12 0.00003982138220281054
+armv7neon_mmm_f32_8x4_cortexa9 8 32 4 0.0000019683045582115826
+armv7neon_mmm_f32_8x6_cortexa9 17 128 13 0.000059714513355137016
+armv7neon_mmm_f32_8x6_cortexa9 25 32 13 0.00002415767938328582
+armv7neon_mmm_f32_8x4_cortexa7 7 128 8 0.000010344849965132014
+armv7neon_mmm_f32_8x6_cortexa9 8 128 11 0.000013785904590501993
+armv7neon_mmm_f32_8x4_cortexa7 24 32 13 0.000018144082756384124
+armv7neon_mmm_f32_8x6_generic 15 4 13 0.000004567420998191548
+armv7neon_mmm_f32_8x4_generic 9 32 13 0.000012173282994755266
+generic_f32_4x4 4 4 4 0.0000009923563076021443
+armv7neon_mmm_f32_8x4_cortexa9 24 128 7 0.000029905631027460932
+armv7neon_mmm_f32_8x4_cortexa7 9 128 8 0.0000195010462183809
+armv7neon_mmm_f32_8x6_cortexa9 9 4 5 0.000001915573042137183
+generic_f32_4x4 7 128 12 0.000021189012747949428
+armv7neon_mmm_f32_8x4_generic 23 32 9 0.000013896926880805095
+armv7neon_mmm_f32_8x4_generic 8 128 13 0.000018647991490366943
+armv7neon_mmm_f32_8x4_cortexa7 15 32 11 0.00000998072291720506
+generic_f32_4x4 3 128 4 0.0000040235161328678346
+armv7neon_mmm_f32_8x4_generic 9 128 3 0.000009799742651068634
+armv7neon_mmm_f32_8x6_generic 25 4 11 0.000005815747914439624
+armv7neon_mmm_f32_8x4_cortexa9 9 4 12 0.0000037593832702220446
+armv7neon_mmm_f32_8x4_generic 8 32 7 0.0000034630101008340834
+armv7neon_mmm_f32_8x4_cortexa9 24 4 7 0.000004049582371373125
+armv7neon_mmm_f32_8x6_generic 9 32 12 0.000008052850558834548
+armv7neon_mmm_f32_8x4_cortexa7 25 4 3 0.000003139298464286286
+armv7neon_mmm_f32_8x4_generic 15 4 3 0.0000018347526374757481
+armv7neon_mmm_f32_8x6_cortexa9 25 4 18 0.000007705086964181997
+generic_f32_4x4 5 128 13 0.00002796513369706675
+armv7neon_mmm_f32_8x4_cortexa9 23 4 12 0.0000055210994766811515
+generic_f32_4x4 7 128 4 0.000007400928983118127
+generic_f32_4x4 12 128 8 0.000020684245069264504
+armv7neon_mmm_f32_8x4_generic 9 4 7 0.0000028402738019775784
+generic_f32_4x4 13 4 9 0.000007094831203377165
+armv7neon_mmm_f32_8x4_cortexa9 17 4 4 0.000002109614550782161
+generic_f32_4x4 9 32 7 0.00000788121798038347
+armv7neon_mmm_f32_8x4_cortexa9 17 32 9 0.000014308883462633102
+armv7neon_mmm_f32_8x4_generic 16 128 12 0.00002742841574783841
+armv7neon_mmm_f32_8x6_generic 9 32 13 0.000011930679710766873
+armv7neon_mmm_f32_8x4_cortexa7 24 128 7 0.000029306857678894273
+armv7neon_mmm_f32_8x4_cortexa9 15 4 11 0.0000043086270265983256
+generic_f32_4x4 11 4 11 0.000005690020384668188
+generic_f32_4x4 11 4 5 0.000003971103754939711
+armv7neon_mmm_f32_8x4_generic 23 128 7 0.000028363425957340315
+armv7neon_mmm_f32_8x4_cortexa7 25 32 7 0.000012820527346969179
+armv7neon_mmm_f32_8x6_generic 17 4 13 0.0000061204912028192775
+armv7neon_mmm_f32_8x4_cortexa7 23 4 13 0.000007468189054284558
+generic_f32_4x4 4 32 7 0.0000029448449923973884
+armv7neon_mmm_f32_8x4_cortexa9 25 32 12 0.000018265077104449187
+armv7neon_mmm_f32_8x6_cortexa9 17 32 5 0.000006709520484710961
+armv7neon_mmm_f32_8x4_cortexa9 23 128 7 0.00003016688393762355
+armv7neon_mmm_f32_8x4_cortexa9 16 128 13 0.000039189527740613905
+armv7neon_mmm_f32_8x4_cortexa7 16 128 5 0.000019619696312768993
+armv7neon_mmm_f32_8x4_generic 17 128 13 0.00005528191915660002
+armv7neon_mmm_f32_8x6_generic 17 128 19 0.00007601141239533076
+armv7neon_mmm_f32_8x4_generic 8 32 5 0.0000034293993184402013
+generic_f32_4x4 9 32 4 0.000004039130783975973
+armv7neon_mmm_f32_8x4_cortexa9 23 32 9 0.000014579239774429512
+armv7neon_mmm_f32_8x6_cortexa7 8 128 11 0.000013612858256767989
+armv7neon_mmm_f32_8x6_cortexa9 15 32 12 0.000008658441674303882
+armv7neon_mmm_f32_8x4_generic 9 128 13 0.00003711806630993475
+armv7neon_mmm_f32_8x6_generic 17 4 11 0.000004486220214719747
+armv7neon_mmm_f32_8x6_generic 24 32 17 0.00001769011948694583
+armv7neon_mmm_f32_8x6_generic 9 4 18 0.00000416889219962654
+armv7neon_mmm_f32_8x6_cortexa7 23 32 19 0.000024297125708617196
+armv7neon_mmm_f32_8x6_cortexa9 9 32 19 0.000016308328501596863
+generic_f32_4x4 5 32 13 0.000010150287138446791
+generic_f32_4x4 11 128 8 0.000021041592663498075
+armv7neon_mmm_f32_8x4_generic 17 32 8 0.000009030738946480375
+armv7neon_mmm_f32_8x4_generic 25 4 5 0.0000050194271140758065
+armv7neon_mmm_f32_8x4_generic 8 128 11 0.000014188042949903446
+generic_f32_4x4 13 32 11 0.000014981513303829195
+armv7neon_mmm_f32_8x4_cortexa9 8 128 4 0.000005303802744401548
+armv7neon_mmm_f32_8x6_cortexa7 17 4 5 0.0000027556390651666856
+armv7neon_mmm_f32_8x6_generic 25 128 12 0.000050521454291978235
+armv7neon_mmm_f32_8x6_cortexa9 16 4 7 0.0000031344346513242154
+armv7neon_mmm_f32_8x6_cortexa7 16 32 5 0.000004750660751815082
+armv7neon_mmm_f32_8x4_cortexa9 17 128 8 0.000029514324154126036
+armv7neon_mmm_f32_8x4_cortexa7 8 4 8 0.000001465122696586523
+armv7neon_mmm_f32_8x6_cortexa9 8 128 17 0.00002029296708644369
+armv7neon_mmm_f32_8x4_generic 25 32 12 0.00001736483445253461
+generic_f32_4x4 3 32 5 0.0000030443235090952762
+generic_f32_4x4 7 128 11 0.000021331736800406107
+armv7neon_mmm_f32_8x4_generic 24 128 3 0.000014605163771206865
+armv7neon_mmm_f32_8x4_cortexa9 15 32 8 0.000006785141126690698
+armv7neon_mmm_f32_8x6_cortexa7 23 4 11 0.000004845198031614994
+armv7neon_mmm_f32_8x6_cortexa7 8 4 18 0.0000022246057256413265
+armv7neon_mmm_f32_8x4_generic 23 32 12 0.000013609481500575639
+armv7neon_mmm_f32_8x6_generic 7 128 17 0.00001990485288299381
+armv7neon_mmm_f32_8x6_cortexa7 25 128 19 0.00010434796032564001
+armv7neon_mmm_f32_8x6_generic 15 4 7 0.0000032121848403447733
+armv7neon_mmm_f32_8x4_generic 25 32 9 0.00001793284879597201
+armv7neon_mmm_f32_8x4_cortexa7 25 4 5 0.0000050795046947410985
+armv7neon_mmm_f32_8x4_cortexa7 17 32 12 0.000013704665363244266
+generic_f32_4x4 5 32 3 0.0000030334097589492826
+armv7neon_mmm_f32_8x4_cortexa9 17 128 5 0.000030021544003020483
+armv7neon_mmm_f32_8x4_cortexa7 24 4 9 0.000005360352425382144
+armv7neon_mmm_f32_8x6_cortexa7 9 32 18 0.000012173297962511017
+armv7neon_mmm_f32_8x4_cortexa7 15 4 13 0.000005358787865343933
+armv7neon_mmm_f32_8x6_cortexa9 17 32 11 0.000012635537581820717
+armv7neon_mmm_f32_8x4_generic 9 32 12 0.000009105324476823614
+armv7neon_mmm_f32_8x4_cortexa7 17 128 11 0.00004382978043849
+armv7neon_mmm_f32_8x6_generic 17 4 6 0.0000023082534746308964
+armv7neon_mmm_f32_8x6_cortexa7 8 128 17 0.000020059146620211786
+armv7neon_mmm_f32_8x4_cortexa7 15 4 7 0.000003068689058039639
+armv7neon_mmm_f32_8x4_cortexa9 15 4 13 0.000005392948403824124
+armv7neon_mmm_f32_8x4_cortexa7 15 4 8 0.0000028443546860500008
+armv7neon_mmm_f32_8x4_cortexa9 24 4 3 0.000002583780394094933
+generic_f32_4x4 13 4 11 0.0000072216668884748755
+armv7neon_mmm_f32_8x6_cortexa7 25 4 13 0.00000811613473861232
+armv7neon_mmm_f32_8x6_generic 7 32 18 0.000006623134760255213
+armv7neon_mmm_f32_8x4_cortexa7 16 4 4 0.0000014656631347323804
+armv7neon_mmm_f32_8x6_cortexa9 24 128 12 0.00003980745750816077
+armv7neon_mmm_f32_8x6_cortexa9 17 32 6 0.00000638262644761529
+generic_f32_4x4 8 128 12 0.000020674001160178604
+armv7neon_mmm_f32_8x6_cortexa9 9 128 5 0.000013783853077105059
+armv7neon_mmm_f32_8x4_cortexa7 23 4 11 0.00000590971284549428
+armv7neon_mmm_f32_8x6_generic 17 128 7 0.00003951740163826464
+armv7neon_mmm_f32_8x6_cortexa7 23 128 12 0.00003972987011474437
+generic_f32_4x4 11 32 9 0.000011454015846057979
+armv7neon_mmm_f32_8x4_generic 7 128 8 0.000009944618306958537
+armv7neon_mmm_f32_8x4_cortexa7 9 128 12 0.00002904338895016975
+generic_f32_4x4 7 32 3 0.000003062772103932835
+armv7neon_mmm_f32_8x6_generic 7 32 19 0.000008594590275392258
+armv7neon_mmm_f32_8x4_cortexa9 24 4 4 0.0000019652194228578485
+armv7neon_mmm_f32_8x6_generic 25 32 17 0.00002354000930758944
+armv7neon_mmm_f32_8x4_cortexa7 24 4 12 0.000004776517171804484
+generic_f32_4x4 9 128 12 0.00003113614108946529
+armv7neon_mmm_f32_8x4_generic 25 4 13 0.000009058320096239628
+armv7neon_mmm_f32_8x4_cortexa9 15 32 7 0.000007013841485260632
+armv7neon_mmm_f32_8x6_cortexa7 8 32 13 0.000006373539570632082
+armv7neon_mmm_f32_8x6_cortexa9 7 128 7 0.000013849670236184067
+armv7neon_mmm_f32_8x6_cortexa7 16 4 5 0.0000021140389285040373
+armv7neon_mmm_f32_8x6_cortexa7 15 4 19 0.000006018452488388481
+armv7neon_mmm_f32_8x4_cortexa9 25 4 11 0.000007297177417716392
+armv7neon_mmm_f32_8x4_cortexa7 7 128 3 0.0000053704412970208095
+armv7neon_mmm_f32_8x6_cortexa7 25 32 6 0.000008216173586447742
+armv7neon_mmm_f32_8x6_cortexa9 23 32 6 0.000006530580171598388
+armv7neon_mmm_f32_8x6_cortexa9 8 4 11 0.0000019099336489036645
+armv7neon_mmm_f32_8x4_cortexa7 8 4 3 0.0000011915557137944658
+armv7neon_mmm_f32_8x4_cortexa9 7 32 13 0.000007135759350139172
+armv7neon_mmm_f32_8x4_cortexa9 15 128 8 0.000020152163526097688
+armv7neon_mmm_f32_8x6_generic 16 128 12 0.000025531348127989936
+armv7neon_mmm_f32_8x4_cortexa9 9 32 8 0.00000655179781932232
+armv7neon_mmm_f32_8x4_generic 25 4 7 0.000005150952172982353
+armv7neon_mmm_f32_8x6_generic 23 128 7 0.00003975509670222853
+generic_f32_4x4 7 128 7 0.000014417564018686427
+armv7neon_mmm_f32_8x6_generic 7 32 6 0.0000025435007870682203
+generic_f32_4x4 12 32 13 0.000014516218594466092
+armv7neon_mmm_f32_8x6_generic 24 4 13 0.000005989131584497686
+armv7neon_mmm_f32_8x6_cortexa7 24 4 13 0.000006130318576111218
+armv7neon_mmm_f32_8x4_cortexa7 24 4 8 0.0000033735678137031244
+armv7neon_mmm_f32_8x6_cortexa9 15 32 6 0.000004614375157776428
+armv7neon_mmm_f32_8x4_cortexa7 17 128 5 0.000029428735157155848
+generic_f32_4x4 11 32 3 0.000004361903012775443
+armv7neon_mmm_f32_8x6_cortexa9 24 128 11 0.00004051475238084589
+armv7neon_mmm_f32_8x6_cortexa7 8 4 11 0.0000019289152314594922
+armv7neon_mmm_f32_8x4_generic 7 32 12 0.00000529445196979877
+armv7neon_mmm_f32_8x4_cortexa9 25 4 9 0.0000071851687447521495
+armv7neon_mmm_f32_8x4_cortexa9 7 128 13 0.0000205036178463644
+armv7neon_mmm_f32_8x4_cortexa7 25 128 4 0.00001951695951825782
+generic_f32_4x4 11 32 11 0.0000115439545889861
+armv7neon_mmm_f32_8x6_cortexa7 23 32 7 0.000012514048765991971
+generic_f32_4x4 13 32 8 0.000009849544291047146
+armv7neon_mmm_f32_8x4_cortexa9 24 4 13 0.0000068036126797526975
+armv7neon_mmm_f32_8x6_cortexa9 23 128 7 0.00004044892329500272
+armv7neon_mmm_f32_8x4_cortexa9 24 32 9 0.000014128824207386303
+armv7neon_mmm_f32_8x6_cortexa9 15 4 19 0.000006004743734908054
+armv7neon_mmm_f32_8x6_generic 16 128 7 0.000025803175203093092
+armv7neon_mmm_f32_8x4_generic 24 4 9 0.000005289440520085159
+armv7neon_mmm_f32_8x4_generic 7 128 7 0.000009924053412179527
+armv7neon_mmm_f32_8x6_cortexa7 15 4 5 0.0000020760458197512403
+armv7neon_mmm_f32_8x4_cortexa9 23 32 12 0.000014302742013508701
+armv7neon_mmm_f32_8x6_cortexa9 23 128 18 0.00006018418795895432
+armv7neon_mmm_f32_8x4_cortexa7 7 128 12 0.000015239535080520215
+armv7neon_mmm_f32_8x6_generic 15 4 6 0.0000018544842241934815
+armv7neon_mmm_f32_8x6_cortexa7 25 32 18 0.000023497641993495917
+generic_f32_4x4 9 4 8 0.0000037065692747303202
+armv7neon_mmm_f32_8x6_generic 9 4 17 0.0000044060023407811705
+armv7neon_mmm_f32_8x4_generic 17 4 7 0.000004004011518049962
+armv7neon_mmm_f32_8x4_generic 16 4 9 0.0000037009767742197034
+armv7neon_mmm_f32_8x4_generic 16 4 11 0.0000037573621028760724
+armv7neon_mmm_f32_8x6_cortexa9 23 4 18 0.000006335024379033378
+generic_f32_4x4 7 32 11 0.00000797109252276388
+generic_f32_4x4 8 32 8 0.00000506136423984456
+armv7neon_mmm_f32_8x6_cortexa9 16 32 18 0.000011962958224342696
+armv7neon_mmm_f32_8x4_generic 9 32 3 0.0000035695396503977124
+armv7neon_mmm_f32_8x4_cortexa7 15 128 9 0.000029527917733416504
+armv7neon_mmm_f32_8x4_cortexa7 16 32 3 0.0000037787003081727403
+armv7neon_mmm_f32_8x4_cortexa9 17 4 5 0.000003983936158752065
+generic_f32_4x4 5 4 7 0.0000028427432000736736
+armv7neon_mmm_f32_8x6_generic 24 128 6 0.000019285772023622517
+armv7neon_mmm_f32_8x6_cortexa9 24 128 6 0.000020079389293074595
+armv7neon_mmm_f32_8x4_cortexa7 24 128 11 0.000043650740679119
+armv7neon_mmm_f32_8x6_cortexa7 15 4 6 0.0000019214162046321274
+armv7neon_mmm_f32_8x6_generic 24 128 11 0.000039928070800596723
+armv7neon_mmm_f32_8x6_cortexa7 17 128 13 0.00005915137603637775
+armv7neon_mmm_f32_8x6_cortexa7 9 128 12 0.00002647047584440748
+generic_f32_4x4 8 32 7 0.000005396919284647475
+armv7neon_mmm_f32_8x4_cortexa7 16 32 7 0.000006638663857963081
+armv7neon_mmm_f32_8x4_cortexa7 9 128 9 0.000029246794882198228
+armv7neon_mmm_f32_8x4_generic 15 32 12 0.000009467451463683956
+armv7neon_mmm_f32_8x6_generic 23 128 6 0.00001954577461053118
+armv7neon_mmm_f32_8x4_cortexa9 7 32 7 0.000003851842872992441
+armv7neon_mmm_f32_8x6_cortexa9 16 32 6 0.000004321654529645647
+generic_f32_4x4 13 32 4 0.00000517924119222049
+armv7neon_mmm_f32_8x4_generic 8 4 9 0.000002104797317503441
+armv7neon_mmm_f32_8x4_cortexa7 24 128 3 0.000015233751299574878
+armv7neon_mmm_f32_8x6_cortexa9 16 128 18 0.000039798911987149584
+armv7neon_mmm_f32_8x6_cortexa7 23 128 13 0.00005946225660703637
+generic_f32_4x4 7 32 5 0.000005462565009156774
+armv7neon_mmm_f32_8x6_generic 8 128 19 0.000025872976344928823
+generic_f32_4x4 5 4 12 0.0000037781682890084256
+armv7neon_mmm_f32_8x4_cortexa9 7 4 13 0.000003202055925058074
+armv7neon_mmm_f32_8x6_cortexa9 8 32 13 0.000006422608725593242
+armv7neon_mmm_f32_8x6_generic 9 128 7 0.000025813127546183672
+armv7neon_mmm_f32_8x4_generic 23 128 3 0.000014644120134860135
+armv7neon_mmm_f32_8x4_generic 8 4 3 0.0000011807711987459403
+armv7neon_mmm_f32_8x6_cortexa7 16 128 18 0.00003920283874886212
+armv7neon_mmm_f32_8x4_cortexa9 15 32 3 0.0000038264927174656245
+armv7neon_mmm_f32_8x6_cortexa7 25 32 12 0.000015882492018463845
+armv7neon_mmm_f32_8x6_generic 15 128 13 0.00003967007811496026
+armv7neon_mmm_f32_8x6_generic 25 4 12 0.00000523795399145237
+armv7neon_mmm_f32_8x4_cortexa9 16 4 7 0.0000028765969548151297
+armv7neon_mmm_f32_8x4_cortexa7 7 4 5 0.0000018096325702613645
+armv7neon_mmm_f32_8x6_generic 25 4 19 0.00001025642311454773
+armv7neon_mmm_f32_8x6_generic 9 128 6 0.000013045121912181557
+armv7neon_mmm_f32_8x6_cortexa7 15 32 11 0.000008770690105208661
+armv7neon_mmm_f32_8x6_generic 23 128 17 0.00005984628206960245
+armv7neon_mmm_f32_8x4_cortexa9 8 32 9 0.000005054839401749921
+generic_f32_4x4 9 32 13 0.000014812319931001716
+armv7neon_mmm_f32_8x4_cortexa9 8 32 7 0.0000036300075704923473
+generic_f32_4x4 12 128 12 0.00003080051055479753
+armv7neon_mmm_f32_8x4_generic 9 4 4 0.0000015775072922596973
+generic_f32_4x4 11 128 11 0.00003163786674887252
+generic_f32_4x4 11 128 7 0.000021315035021180117
+armv7neon_mmm_f32_8x4_generic 24 128 11 0.00004302931660486898
+armv7neon_mmm_f32_8x6_cortexa7 7 128 7 0.000013660184612749007
+armv7neon_mmm_f32_8x6_generic 15 4 12 0.0000032030364372184837
+armv7neon_mmm_f32_8x6_cortexa7 15 128 17 0.00004014988958130008
+armv7neon_mmm_f32_8x4_cortexa7 17 32 7 0.000009763008233938806
+armv7neon_mmm_f32_8x4_generic 23 4 11 0.000005850624187247901
+armv7neon_mmm_f32_8x6_cortexa9 25 4 7 0.000005691323284198507
+armv7neon_mmm_f32_8x6_generic 9 4 5 0.0000018995170310061328
+generic_f32_4x4 11 32 5 0.000007873582158405535
+armv7neon_mmm_f32_8x6_generic 17 4 7 0.0000043144070272553465
+armv7neon_mmm_f32_8x6_cortexa9 7 32 13 0.000006785328240517207
+armv7neon_mmm_f32_8x4_generic 25 128 12 0.00005645345647499795
+armv7neon_mmm_f32_8x4_generic 9 4 8 0.0000026364183937099707
+armv7neon_mmm_f32_8x4_generic 9 4 3 0.000001754783422972911
+armv7neon_mmm_f32_8x6_cortexa9 24 4 5 0.000002822627399800846
+armv7neon_mmm_f32_8x4_cortexa9 23 4 7 0.000004277460484355386
+armv7neon_mmm_f32_8x4_cortexa9 16 4 13 0.000004703052607837251
+armv7neon_mmm_f32_8x4_cortexa7 9 32 8 0.0000064472411584695615
+generic_f32_4x4 5 128 3 0.000007484402044593335
+armv7neon_mmm_f32_8x6_cortexa7 24 4 12 0.000003960884362652896
+armv7neon_mmm_f32_8x4_cortexa7 7 32 8 0.0000038146920144727004
+armv7neon_mmm_f32_8x4_cortexa9 24 4 12 0.000004810650577018378
+armv7neon_mmm_f32_8x4_cortexa9 9 4 13 0.000005031022350762627
+armv7neon_mmm_f32_8x6_cortexa9 9 128 12 0.000026841942459090627
+generic_f32_4x4 3 4 3 0.0000011084107564139803
+armv7neon_mmm_f32_8x4_generic 23 128 13 0.000057838678203389286
+armv7neon_mmm_f32_8x6_cortexa7 23 128 5 0.000020389937552256748
+armv7neon_mmm_f32_8x4_cortexa7 15 32 13 0.000012977109930123348
+armv7neon_mmm_f32_8x4_generic 16 32 11 0.00000919341672636989
+armv7neon_mmm_f32_8x6_cortexa9 17 128 17 0.000060215116898669664
+armv7neon_mmm_f32_8x4_generic 24 4 8 0.000003324206058446326
+armv7neon_mmm_f32_8x4_cortexa7 17 4 5 0.000003957357480955988
+armv7neon_mmm_f32_8x6_cortexa9 16 32 19 0.000016111976497239984
+generic_f32_4x4 13 4 12 0.000006710960240784156
+armv7neon_mmm_f32_8x6_cortexa9 23 128 5 0.000020639189093056077
+armv7neon_mmm_f32_8x6_generic 24 32 11 0.000012182073684360654
+armv7neon_mmm_f32_8x6_cortexa9 9 128 13 0.00004021781819635084
+armv7neon_mmm_f32_8x4_cortexa9 17 4 12 0.000005225946660532542
+armv7neon_mmm_f32_8x6_cortexa9 15 4 7 0.0000032840412749160575
+armv7neon_mmm_f32_8x4_cortexa9 17 128 13 0.000059220265497108164
+armv7neon_mmm_f32_8x6_cortexa9 7 4 11 0.0000020564874758047424
+generic_f32_4x4 8 128 8 0.000013969303780467269
+armv7neon_mmm_f32_8x4_cortexa7 25 4 12 0.000006595762756379242
+armv7neon_mmm_f32_8x6_generic 16 32 19 0.000015466876363304513
+armv7neon_mmm_f32_8x4_generic 24 4 11 0.0000053731939607204945
+generic_f32_4x4 8 128 13 0.00002774022280530327
+generic_f32_4x4 7 128 8 0.00001428610505820359
+armv7neon_mmm_f32_8x4_cortexa9 15 128 12 0.00002999051242337208
+armv7neon_mmm_f32_8x4_cortexa9 17 32 5 0.00000980239512919155
+armv7neon_mmm_f32_8x4_cortexa9 9 128 8 0.00001992992679699177
+armv7neon_mmm_f32_8x4_generic 25 32 5 0.00001226452231341447
+generic_f32_4x4 11 128 4 0.0000107871041093641
+generic_f32_4x4 13 32 5 0.000010185534520279402
+armv7neon_mmm_f32_8x6_cortexa7 9 32 13 0.000012306258740029597
+generic_f32_4x4 3 4 9 0.0000023591282065930476
+armv7neon_mmm_f32_8x4_cortexa7 15 128 12 0.000029408783696922103
+armv7neon_mmm_f32_8x6_cortexa9 25 4 5 0.000003460717936029046
+armv7neon_mmm_f32_8x6_cortexa9 15 32 7 0.000008659647810573247
+armv7neon_mmm_f32_8x6_generic 15 32 12 0.000008324450366911187
+armv7neon_mmm_f32_8x4_generic 7 4 4 0.0000011733671526145581
+armv7neon_mmm_f32_8x6_generic 8 32 17 0.000006245151171661497
+generic_f32_4x4 4 128 12 0.000010589994729591713
+generic_f32_4x4 13 32 3 0.000005623254183753802
+armv7neon_mmm_f32_8x4_cortexa9 17 32 8 0.000009489997139764281
+armv7neon_mmm_f32_8x6_cortexa7 15 128 12 0.0000267366663985427
+armv7neon_mmm_f32_8x6_generic 8 32 12 0.000004170620267459084
+generic_f32_4x4 12 128 13 0.00004142579938644744
+armv7neon_mmm_f32_8x6_cortexa7 7 4 17 0.0000028372537187841064
+armv7neon_mmm_f32_8x6_cortexa7 17 4 18 0.000005987713490844775
+armv7neon_mmm_f32_8x6_cortexa9 7 4 5 0.0000012358111649104604
+armv7neon_mmm_f32_8x4_generic 7 4 7 0.0000018542946008174326
+armv7neon_mmm_f32_8x4_generic 7 4 13 0.0000031498896105686864
+armv7neon_mmm_f32_8x6_cortexa9 15 4 11 0.0000034645213247919744
+armv7neon_mmm_f32_8x6_cortexa7 23 4 5 0.0000028436717084834024
+armv7neon_mmm_f32_8x4_generic 17 128 3 0.00001456481059501176
+armv7neon_mmm_f32_8x4_cortexa7 17 4 7 0.00000404574535805843
+armv7neon_mmm_f32_8x6_cortexa9 23 4 19 0.000008441880755451333
+armv7neon_mmm_f32_8x6_generic 16 4 5 0.000002037583190427157
+armv7neon_mmm_f32_8x4_cortexa7 17 4 11 0.000005618041346469936
+generic_f32_4x4 8 128 3 0.000007542706448510692
+armv7neon_mmm_f32_8x6_cortexa7 16 4 6 0.0000016599211327997808
+armv7neon_mmm_f32_8x4_generic 24 32 11 0.000013531694734026834
+armv7neon_mmm_f32_8x4_cortexa7 24 128 13 0.00005774273822898698
+generic_f32_4x4 12 32 8 0.000007328921605870887
+armv7neon_mmm_f32_8x4_generic 17 32 5 0.000009344539695515143
+armv7neon_mmm_f32_8x6_cortexa7 25 4 5 0.0000035354392629416224
+armv7neon_mmm_f32_8x4_generic 9 128 9 0.000028070243170981636
+armv7neon_mmm_f32_8x4_generic 7 32 7 0.0000036902878668462197
+generic_f32_4x4 4 32 5 0.000002913735845387932
+armv7neon_mmm_f32_8x4_cortexa7 23 128 9 0.00004401834026329482
+armv7neon_mmm_f32_8x6_cortexa7 15 32 17 0.000012793909984998035
+armv7neon_mmm_f32_8x4_cortexa7 24 32 11 0.000013985445748841752
+armv7neon_mmm_f32_8x4_generic 17 128 12 0.0000426589070381794
+armv7neon_mmm_f32_8x6_generic 9 128 19 0.000053237918402024005
+armv7neon_mmm_f32_8x6_cortexa7 17 4 17 0.000006443148021217296
+armv7neon_mmm_f32_8x4_cortexa7 8 128 12 0.000014559920878726009
+armv7neon_mmm_f32_8x6_cortexa9 8 4 19 0.000002972990505690349
+armv7neon_mmm_f32_8x6_cortexa7 25 128 5 0.000027009599886340098
+armv7neon_mmm_f32_8x6_cortexa7 7 32 5 0.0000025526809022378453
+generic_f32_4x4 13 128 13 0.00005568030788678991
+armv7neon_mmm_f32_8x4_cortexa7 15 32 12 0.000009774179275886764
+armv7neon_mmm_f32_8x4_generic 16 128 8 0.000018479196791574824
+armv7neon_mmm_f32_8x6_generic 25 128 17 0.00007786728690688477
+armv7neon_mmm_f32_8x6_generic 7 32 11 0.000004604319680609591
+armv7neon_mmm_f32_8x4_cortexa9 15 128 5 0.000020252737794258734
+armv7neon_mmm_f32_8x4_cortexa9 25 4 12 0.000006639922481547946
+armv7neon_mmm_f32_8x4_cortexa7 23 128 8 0.000029341008489406463
+armv7neon_mmm_f32_8x4_cortexa7 8 32 5 0.000003533774210652598
+generic_f32_4x4 8 128 7 0.000014303959630243704
+armv7neon_mmm_f32_8x4_cortexa9 25 32 5 0.000012867678837939383
+armv7neon_mmm_f32_8x6_generic 8 128 13 0.000019410057867203144
+armv7neon_mmm_f32_8x4_generic 15 4 5 0.0000029437961456768246
+generic_f32_4x4 12 4 9 0.000005303987106806706
+armv7neon_mmm_f32_8x6_generic 24 4 19 0.000007657205135344466
+armv7neon_mmm_f32_8x4_generic 9 4 11 0.000003929237868686891
+generic_f32_4x4 13 128 9 0.00004175491457338789
+armv7neon_mmm_f32_8x6_cortexa7 9 4 11 0.0000032396587513198363
+armv7neon_mmm_f32_8x6_cortexa7 9 32 5 0.000004609642726421324
+armv7neon_mmm_f32_8x6_cortexa7 8 128 13 0.000019980200207777093
+generic_f32_4x4 8 4 13 0.000004697441883300323
+armv7neon_mmm_f32_8x4_generic 24 128 5 0.000028404231306910303
+armv7neon_mmm_f32_8x4_cortexa7 15 128 8 0.00001974335359333113
+armv7neon_mmm_f32_8x4_generic 8 4 4 0.000000975840577396329
+armv7neon_mmm_f32_8x6_cortexa9 17 4 6 0.000002372991977511254
+armv7neon_mmm_f32_8x6_cortexa7 24 4 17 0.0000063580907262403335
+armv7neon_mmm_f32_8x6_cortexa9 24 4 17 0.00000634338841510274
+armv7neon_mmm_f32_8x6_cortexa7 8 128 19 0.000026457168307267402
+armv7neon_mmm_f32_8x4_cortexa7 15 32 4 0.0000035973058136569266
+armv7neon_mmm_f32_8x6_cortexa7 23 128 11 0.000040137380065831114
+armv7neon_mmm_f32_8x6_generic 8 32 11 0.0000044080486884835774
+armv7neon_mmm_f32_8x6_cortexa7 16 32 13 0.000012184909260827915
+armv7neon_mmm_f32_8x4_generic 24 128 12 0.00004230513222395041
+armv7neon_mmm_f32_8x6_generic 16 32 6 0.000004173665390794717
+armv7neon_mmm_f32_8x4_generic 15 128 5 0.000019049336126167508
+generic_f32_4x4 5 32 11 0.000007837409156830309
+armv7neon_mmm_f32_8x6_cortexa7 15 128 5 0.00001375002565612756
+armv7neon_mmm_f32_8x6_generic 17 128 17 0.00005946391401813929
+generic_f32_4x4 8 128 9 0.00002095153754448952
+generic_f32_4x4 12 4 7 0.000003927725122844973
+armv7neon_mmm_f32_8x6_cortexa9 25 32 6 0.000008303393078126922
+armv7neon_mmm_f32_8x4_generic 17 128 11 0.000043129532556527814
+armv7neon_mmm_f32_8x6_generic 23 128 19 0.00007849962174893098
+armv7neon_mmm_f32_8x4_cortexa9 25 32 11 0.00001896209821400977
+generic_f32_4x4 13 128 4 0.000014088650845963764
+armv7neon_mmm_f32_8x6_generic 17 32 17 0.000017768659574740396
+armv7neon_mmm_f32_8x6_generic 25 32 11 0.000016016335806526446
+generic_f32_4x4 4 128 9 0.000010729958874422145
+armv7neon_mmm_f32_8x6_generic 9 128 12 0.00002573025108782965
+armv7neon_mmm_f32_8x6_cortexa7 9 128 19 0.000052645037711891825
+armv7neon_mmm_f32_8x4_cortexa7 15 32 9 0.000009881132385003031
+armv7neon_mmm_f32_8x6_generic 15 4 11 0.0000033903277250974355
+armv7neon_mmm_f32_8x4_cortexa9 23 4 9 0.000005836030607874567
+armv7neon_mmm_f32_8x6_cortexa9 15 32 13 0.00001271636871027449
+armv7neon_mmm_f32_8x4_cortexa9 7 128 12 0.00001554481747780415
+armv7neon_mmm_f32_8x6_cortexa7 9 128 11 0.00002665664619685107
+armv7neon_mmm_f32_8x6_generic 23 4 19 0.000008276510705477435
+armv7neon_mmm_f32_8x6_cortexa7 9 128 13 0.00003963680038872068
+armv7neon_mmm_f32_8x6_generic 9 4 12 0.0000029664494321281043
+armv7neon_mmm_f32_8x4_cortexa7 9 4 8 0.0000026618049788927614
+armv7neon_mmm_f32_8x6_generic 25 4 5 0.0000034227395786859638
+armv7neon_mmm_f32_8x6_cortexa7 7 4 11 0.00000205419298901934
+armv7neon_mmm_f32_8x4_generic 15 128 4 0.000009727827829578247
+armv7neon_mmm_f32_8x4_generic 23 32 5 0.000009507106099903071
+generic_f32_4x4 13 128 3 0.000014531958964466404
+generic_f32_4x4 8 4 4 0.0000014932684997984591
+armv7neon_mmm_f32_8x6_cortexa7 9 32 12 0.0000083110843565406
+armv7neon_mmm_f32_8x4_cortexa9 9 128 4 0.000010214368441429339
+armv7neon_mmm_f32_8x6_cortexa7 24 128 11 0.000039934073240007294
+armv7neon_mmm_f32_8x4_cortexa9 25 128 9 0.0000592519545970783
+armv7neon_mmm_f32_8x4_cortexa7 17 32 13 0.000018457944384762052
+armv7neon_mmm_f32_8x6_generic 15 32 17 0.000012429374164019118
+armv7neon_mmm_f32_8x6_generic 7 4 5 0.0000012252588749082722
+generic_f32_4x4 4 4 11 0.000002142420120818127
+armv7neon_mmm_f32_8x4_cortexa7 17 32 8 0.000009331391202514006
+armv7neon_mmm_f32_8x6_cortexa9 9 32 12 0.000008401621869943723
+armv7neon_mmm_f32_8x6_cortexa7 16 32 17 0.00001235175359963665
+armv7neon_mmm_f32_8x4_cortexa9 16 4 11 0.000003828421619878369
+armv7neon_mmm_f32_8x4_cortexa9 9 4 8 0.000002691222544669364
+armv7neon_mmm_f32_8x4_cortexa9 15 32 4 0.0000036579305511926493
+armv7neon_mmm_f32_8x6_cortexa7 8 32 11 0.0000045660588709311665
+generic_f32_4x4 5 4 11 0.000003955847289217088
+generic_f32_4x4 7 128 5 0.00001437315927626327
+generic_f32_4x4 7 4 7 0.0000028946154117180987
+generic_f32_4x4 8 32 12 0.000007313658366544497
+armv7neon_mmm_f32_8x4_cortexa7 16 4 12 0.000003356919407698039
+armv7neon_mmm_f32_8x4_cortexa9 17 32 12 0.000013931859029317578
+armv7neon_mmm_f32_8x6_generic 7 128 13 0.000019807746403591995
+armv7neon_mmm_f32_8x6_generic 16 32 12 0.000007859148665693045
+armv7neon_mmm_f32_8x6_cortexa9 7 32 17 0.000006883792927542178
+armv7neon_mmm_f32_8x4_generic 24 32 4 0.000004632821669832301
+armv7neon_mmm_f32_8x6_cortexa7 15 32 18 0.000012578243798367562
+armv7neon_mmm_f32_8x6_cortexa9 17 128 7 0.000040256201289956
+armv7neon_mmm_f32_8x6_cortexa9 25 128 13 0.00007968673747995729
+armv7neon_mmm_f32_8x6_cortexa7 16 32 12 0.000008069060391993281
+armv7neon_mmm_f32_8x6_cortexa9 8 128 13 0.000020267415746870704
+armv7neon_mmm_f32_8x4_cortexa7 23 4 12 0.0000054788039005168164
+armv7neon_mmm_f32_8x4_cortexa7 7 128 9 0.000015183203798286496
+armv7neon_mmm_f32_8x4_cortexa9 25 128 11 0.00005936651957856526
+armv7neon_mmm_f32_8x6_cortexa9 7 128 13 0.00002061155210279855
+armv7neon_mmm_f32_8x6_cortexa9 16 128 12 0.000026606165960514827
+armv7neon_mmm_f32_8x4_cortexa7 24 4 7 0.000004016126966910536
+armv7neon_mmm_f32_8x4_cortexa9 16 32 4 0.000003429114676947513
+armv7neon_mmm_f32_8x4_cortexa7 9 128 7 0.000019732799833911844
+armv7neon_mmm_f32_8x6_generic 17 128 5 0.00001976574761517156
+armv7neon_mmm_f32_8x6_cortexa7 16 128 12 0.00002622611042000424
+armv7neon_mmm_f32_8x4_generic 23 128 4 0.000014300391832012492
+generic_f32_4x4 4 32 9 0.0000040473563381120025
+armv7neon_mmm_f32_8x6_cortexa9 16 4 12 0.00000281228428913529
+armv7neon_mmm_f32_8x6_generic 15 128 7 0.000025996819067612833
+armv7neon_mmm_f32_8x4_generic 25 128 13 0.00007519120583347806
+armv7neon_mmm_f32_8x6_cortexa7 25 128 18 0.00007816972627698026
+armv7neon_mmm_f32_8x6_generic 23 4 7 0.000004478475265472157
+armv7neon_mmm_f32_8x6_cortexa7 23 32 17 0.000018694963726284716
+armv7neon_mmm_f32_8x4_cortexa9 24 128 3 0.000015528002576413032
+armv7neon_mmm_f32_8x4_generic 15 4 13 0.000005299524671067579
+armv7neon_mmm_f32_8x6_cortexa9 8 32 7 0.000004505823387651013
+armv7neon_mmm_f32_8x4_cortexa9 9 4 7 0.0000028943361192655857
+armv7neon_mmm_f32_8x4_generic 9 32 4 0.0000033743539224745525
+armv7neon_mmm_f32_8x4_cortexa9 23 128 5 0.000030175843280997272
+armv7neon_mmm_f32_8x6_generic 8 4 6 0.000001069506404381519
+armv7neon_mmm_f32_8x6_cortexa9 9 32 17 0.000012530842578999487
+armv7neon_mmm_f32_8x6_generic 25 4 7 0.0000055691779651847645
+armv7neon_mmm_f32_8x6_cortexa7 24 32 18 0.000017489152288872757
+armv7neon_mmm_f32_8x4_generic 8 32 3 0.0000020846296384167996
+armv7neon_mmm_f32_8x4_generic 24 128 9 0.000042880640962818326
+armv7neon_mmm_f32_8x6_cortexa7 16 32 6 0.000004280562843189148
+generic_f32_4x4 11 4 7 0.000004035828927778453
+armv7neon_mmm_f32_8x6_cortexa7 17 128 6 0.000019945123788870955
+armv7neon_mmm_f32_8x6_cortexa7 23 4 13 0.000006544590047318178
+generic_f32_4x4 3 128 13 0.00001453817180213847
+armv7neon_mmm_f32_8x6_cortexa9 7 4 7 0.000001955919692684627
+armv7neon_mmm_f32_8x4_generic 8 4 8 0.0000014466616080596564
+armv7neon_mmm_f32_8x4_cortexa9 7 4 12 0.00000257120273391917
+armv7neon_mmm_f32_8x6_cortexa7 23 32 12 0.000012365351347550488
+armv7neon_mmm_f32_8x4_cortexa7 24 128 8 0.000028841373491782905
+generic_f32_4x4 12 4 4 0.0000019842828386500398
+armv7neon_mmm_f32_8x4_cortexa7 25 32 11 0.000018655746399310692
+armv7neon_mmm_f32_8x4_cortexa7 15 4 5 0.0000029733858994116004
+armv7neon_mmm_f32_8x4_cortexa9 8 128 3 0.000005515258544137823
+armv7neon_mmm_f32_8x6_cortexa7 24 4 7 0.000004423529851941163
+generic_f32_4x4 8 32 4 0.0000027872687254075905
+armv7neon_mmm_f32_8x6_generic 17 32 18 0.000017325844518327246
+armv7neon_mmm_f32_8x4_cortexa7 25 4 13 0.000009174920440845347
+armv7neon_mmm_f32_8x4_generic 25 128 8 0.00003833303029711307
+armv7neon_mmm_f32_8x4_generic 16 4 4 0.0000014454956826662534
+armv7neon_mmm_f32_8x4_cortexa7 17 4 4 0.0000020761952577578897
+armv7neon_mmm_f32_8x6_cortexa7 8 32 5 0.0000026618366158480003
+generic_f32_4x4 3 4 4 0.0000011319274511202646
+armv7neon_mmm_f32_8x6_cortexa7 8 32 17 0.000006455239935042666
+armv7neon_mmm_f32_8x4_cortexa7 15 4 3 0.0000018508727527174426
+armv7neon_mmm_f32_8x4_cortexa9 7 128 5 0.000010464038311697658
+armv7neon_mmm_f32_8x4_cortexa7 9 128 11 0.000029303633762621288
+armv7neon_mmm_f32_8x6_generic 15 128 6 0.000013191467223518409
+armv7neon_mmm_f32_8x6_generic 8 32 19 0.00000799220605890635
+armv7neon_mmm_f32_8x4_cortexa7 25 32 3 0.000006944685928632884
+generic_f32_4x4 5 128 7 0.000014344402542266391
+armv7neon_mmm_f32_8x6_generic 25 32 18 0.00002286640455940777
+armv7neon_mmm_f32_8x6_generic 23 4 17 0.000006658878255694999
+armv7neon_mmm_f32_8x4_cortexa9 15 32 11 0.000010153701137788554
+armv7neon_mmm_f32_8x4_cortexa7 8 32 13 0.000006386976234107792
+generic_f32_4x4 9 128 11 0.0000315155364444221
+armv7neon_mmm_f32_8x4_generic 16 128 5 0.00001883838484826069
+armv7neon_mmm_f32_8x6_generic 8 4 18 0.0000021936894870634253
+armv7neon_mmm_f32_8x6_cortexa7 24 32 17 0.00001822762304619792
+armv7neon_mmm_f32_8x6_generic 24 32 6 0.000006012607283304525
+armv7neon_mmm_f32_8x4_cortexa9 7 4 8 0.0000018992159881966028
+generic_f32_4x4 5 32 8 0.000005268936385502091
+armv7neon_mmm_f32_8x6_generic 23 32 13 0.000017886686307111186
+armv7neon_mmm_f32_8x6_generic 23 128 11 0.00004012008266709576
+armv7neon_mmm_f32_8x4_generic 7 32 3 0.000002052664755929161
+armv7neon_mmm_f32_8x4_generic 7 32 13 0.0000068232658761941584
+armv7neon_mmm_f32_8x4_cortexa9 15 4 3 0.0000018728759441468222
+armv7neon_mmm_f32_8x4_cortexa9 17 32 7 0.000009921472462970692
+armv7neon_mmm_f32_8x4_cortexa7 23 4 5 0.000004108135771777712
+armv7neon_mmm_f32_8x6_generic 17 4 18 0.0000058654454576938545
+armv7neon_mmm_f32_8x6_generic 8 32 6 0.0000023314322550231567
+armv7neon_mmm_f32_8x6_cortexa9 24 32 12 0.000011982465286422162
+armv7neon_mmm_f32_8x6_cortexa9 9 4 12 0.000003049708218088651
+armv7neon_mmm_f32_8x6_cortexa7 9 4 19 0.000005629342662833259
+generic_f32_4x4 3 32 12 0.00000437213523735793
+armv7neon_mmm_f32_8x6_generic 23 128 18 0.00005941769593137103
+armv7neon_mmm_f32_8x6_cortexa7 15 4 18 0.000004622352036845354
+armv7neon_mmm_f32_8x6_cortexa9 24 128 19 0.00007950053817997623
+armv7neon_mmm_f32_8x4_generic 16 128 4 0.000009494054447373921
+armv7neon_mmm_f32_8x6_cortexa7 7 128 17 0.000020414604977074606
+armv7neon_mmm_f32_8x4_cortexa9 7 32 12 0.000005535747117413851
+armv7neon_mmm_f32_8x6_cortexa9 17 4 5 0.000002695145363194299
+generic_f32_4x4 4 128 4 0.000003870015760471616
+armv7neon_mmm_f32_8x4_generic 15 32 11 0.000009681542653487903
+armv7neon_mmm_f32_8x6_cortexa7 7 4 13 0.000002737908308165428
+armv7neon_mmm_f32_8x4_cortexa7 9 32 7 0.000006661491356810654
+armv7neon_mmm_f32_8x4_generic 16 4 8 0.0000023864199459892154
+armv7neon_mmm_f32_8x6_generic 24 32 12 0.000011535291893158489
+armv7neon_mmm_f32_8x6_cortexa9 15 128 11 0.00002729544131868298
+armv7neon_mmm_f32_8x4_cortexa7 15 128 13 0.00003933216865420138
+generic_f32_4x4 11 4 4 0.0000021466514853413024
+armv7neon_mmm_f32_8x4_cortexa7 15 32 7 0.000006892235042247499
+armv7neon_mmm_f32_8x4_generic 24 128 7 0.000028572043647170913
+armv7neon_mmm_f32_8x4_cortexa9 9 128 12 0.000029617239836207004
+armv7neon_mmm_f32_8x6_generic 23 32 6 0.000006267673062941748
+armv7neon_mmm_f32_8x6_cortexa9 23 128 17 0.00006059797167110977
+armv7neon_mmm_f32_8x6_cortexa9 9 128 18 0.00004014798753858387
+armv7neon_mmm_f32_8x6_generic 25 32 19 0.00003066922065226773
+armv7neon_mmm_f32_8x6_generic 9 32 7 0.00000813819010478461
+armv7neon_mmm_f32_8x6_generic 8 128 18 0.000019253370491633585
+armv7neon_mmm_f32_8x6_generic 8 4 5 0.0000012747243711687536
+armv7neon_mmm_f32_8x4_cortexa7 25 4 4 0.000002551159854962235
+armv7neon_mmm_f32_8x4_cortexa9 8 128 13 0.000019869699675831398
+armv7neon_mmm_f32_8x6_cortexa9 9 128 19 0.00005342392770358122
+armv7neon_mmm_f32_8x6_cortexa9 7 4 12 0.000002051630860797947
+armv7neon_mmm_f32_8x6_cortexa9 16 128 13 0.00004015173417099066
+armv7neon_mmm_f32_8x6_cortexa9 23 128 12 0.0000403164031404094
+armv7neon_mmm_f32_8x4_generic 23 128 5 0.000028627967286799668
+armv7neon_mmm_f32_8x4_cortexa9 25 4 8 0.000004634829587692317
+armv7neon_mmm_f32_8x4_cortexa7 15 128 4 0.000010126569600218532
+armv7neon_mmm_f32_8x4_cortexa9 23 32 4 0.000005123975664670012
+armv7neon_mmm_f32_8x6_generic 8 4 17 0.0000024242275965441413
+armv7neon_mmm_f32_8x6_cortexa7 25 128 7 0.000052727920949761586
+armv7neon_mmm_f32_8x4_cortexa7 16 32 9 0.000009437089513621594
+armv7neon_mmm_f32_8x4_generic 23 4 9 0.000005734226973840947
+armv7neon_mmm_f32_8x4_cortexa7 17 128 12 0.0000433599443382883
+armv7neon_mmm_f32_8x4_cortexa7 24 32 3 0.000005409349524302138
+armv7neon_mmm_f32_8x6_cortexa7 8 4 7 0.0000018575873013342354
+armv7neon_mmm_f32_8x6_generic 7 128 19 0.000026452799909726964
+armv7neon_mmm_f32_8x4_cortexa9 24 32 12 0.000013540981249723916
+armv7neon_mmm_f32_8x4_cortexa9 23 128 9 0.000044894692860566556
+armv7neon_mmm_f32_8x6_cortexa9 9 4 17 0.000004505536489134122
+armv7neon_mmm_f32_8x6_generic 15 32 11 0.000008507986633036132
+armv7neon_mmm_f32_8x6_cortexa7 7 32 19 0.000008805049700237493
+armv7neon_mmm_f32_8x4_generic 17 4 4 0.0000020583169447652696
+armv7neon_mmm_f32_8x4_generic 7 32 9 0.000005237513207003328
+armv7neon_mmm_f32_8x6_cortexa9 8 4 5 0.0000012863007612366299
+armv7neon_mmm_f32_8x6_cortexa7 16 128 17 0.00003972939922069338
+armv7neon_mmm_f32_8x6_cortexa9 16 32 17 0.000012476036240479964
+generic_f32_4x4 5 128 4 0.000007343673936530293
+armv7neon_mmm_f32_8x6_cortexa7 17 4 13 0.000006264666040125929
+armv7neon_mmm_f32_8x6_cortexa9 25 128 18 0.00007930397959821057
+armv7neon_mmm_f32_8x6_cortexa9 16 4 5 0.0000020584584410478216
+armv7neon_mmm_f32_8x4_cortexa9 7 32 5 0.00000378972657151533
+armv7neon_mmm_f32_8x6_cortexa9 16 4 18 0.000003945842563203367
+armv7neon_mmm_f32_8x4_generic 23 32 7 0.000009665874611953724
+armv7neon_mmm_f32_8x6_cortexa7 25 128 6 0.000026459671946913376
+generic_f32_4x4 7 4 3 0.0000017591858474192314
+armv7neon_mmm_f32_8x6_cortexa7 16 4 18 0.000003943668999181098
+generic_f32_4x4 4 128 7 0.0000074032510257738404
+armv7neon_mmm_f32_8x4_generic 16 128 13 0.0000374668929079076
+armv7neon_mmm_f32_8x4_cortexa9 9 4 3 0.0000017875665004976476
+armv7neon_mmm_f32_8x4_cortexa9 15 128 9 0.00003010940924012135
+armv7neon_mmm_f32_8x6_cortexa9 17 32 13 0.000018295003723366057
+armv7neon_mmm_f32_8x6_cortexa9 9 4 13 0.000004395459186352696
+armv7neon_mmm_f32_8x6_cortexa7 8 4 12 0.0000016616372440035075
+generic_f32_4x4 5 128 12 0.000021001699183635672
+armv7neon_mmm_f32_8x6_generic 25 4 6 0.000002871872138224116
+armv7neon_mmm_f32_8x6_generic 7 128 5 0.000006875620566369796
+generic_f32_4x4 5 4 4 0.000001600585638499612
+armv7neon_mmm_f32_8x4_cortexa9 23 4 4 0.0000022070594896923405
+armv7neon_mmm_f32_8x6_cortexa9 8 32 17 0.0000065161951430999815
+armv7neon_mmm_f32_8x6_cortexa7 17 4 19 0.000008057471663557758
+armv7neon_mmm_f32_8x6_cortexa9 8 128 7 0.000013701107804035602
+armv7neon_mmm_f32_8x4_cortexa9 23 128 11 0.00004502007320686303
+armv7neon_mmm_f32_8x4_cortexa7 8 32 11 0.000004991462488279611
+armv7neon_mmm_f32_8x6_cortexa9 24 32 18 0.00001769207861513086
+armv7neon_mmm_f32_8x4_generic 25 128 4 0.000019508351532460557
+armv7neon_mmm_f32_8x6_cortexa7 23 4 12 0.0000044400455773711846
+armv7neon_mmm_f32_8x6_cortexa9 24 32 5 0.000006843511403564935
+armv7neon_mmm_f32_8x6_cortexa9 25 128 11 0.000053747765759359946
+armv7neon_mmm_f32_8x4_cortexa7 25 4 11 0.000007253341541711691
+armv7neon_mmm_f32_8x6_cortexa9 23 32 19 0.000024549141146277552
+armv7neon_mmm_f32_8x6_cortexa7 16 128 11 0.000026732044849478285
+armv7neon_mmm_f32_8x6_cortexa9 23 128 19 0.0000801357471530611
+generic_f32_4x4 3 32 4 0.0000017966908912230104
+armv7neon_mmm_f32_8x6_cortexa7 24 128 17 0.00005929496892410179
+armv7neon_mmm_f32_8x6_cortexa9 23 32 12 0.000012490651961414628
+armv7neon_mmm_f32_8x4_cortexa7 16 128 9 0.000029084669529414875
+armv7neon_mmm_f32_8x6_cortexa7 7 128 19 0.000026966388966864694
+armv7neon_mmm_f32_8x4_cortexa7 23 32 8 0.000009566852853696024
+armv7neon_mmm_f32_8x4_generic 17 32 13 0.00001785315018140605
+armv7neon_mmm_f32_8x4_cortexa7 23 4 4 0.00000218193596958567
+armv7neon_mmm_f32_8x4_generic 9 32 7 0.000006462010776076977
+armv7neon_mmm_f32_8x4_generic 23 32 8 0.000009259693411977238
+armv7neon_mmm_f32_8x6_generic 15 128 12 0.000025997314252108204
+generic_f32_4x4 12 32 5 0.000007719446523382617
+armv7neon_mmm_f32_8x6_cortexa9 15 128 17 0.000040755037497853366
+armv7neon_mmm_f32_8x4_generic 24 32 5 0.00000927870929356689
+armv7neon_mmm_f32_8x4_cortexa9 23 32 11 0.00001470624762457902
+armv7neon_mmm_f32_8x4_cortexa7 7 128 7 0.000010321518649139075
+armv7neon_mmm_f32_8x4_cortexa9 17 32 3 0.000005401396003956081
+armv7neon_mmm_f32_8x4_cortexa7 25 128 5 0.00003910615015651058
+armv7neon_mmm_f32_8x6_cortexa9 17 128 18 0.00005978117998553773
+armv7neon_mmm_f32_8x6_generic 7 4 11 0.000002036996116605385
+armv7neon_mmm_f32_8x6_cortexa9 15 4 6 0.0000019083928751646085
+armv7neon_mmm_f32_8x6_cortexa9 9 4 19 0.000005613783088587898
+armv7neon_mmm_f32_8x4_cortexa7 24 32 4 0.0000047895509427620865
+armv7neon_mmm_f32_8x4_cortexa9 8 32 12 0.000004854203771648976
+armv7neon_mmm_f32_8x4_generic 25 4 11 0.000007170056956827459
+armv7neon_mmm_f32_8x6_cortexa7 24 32 6 0.000006172583934492533
+armv7neon_mmm_f32_8x6_generic 23 32 18 0.000017730914760808268
+generic_f32_4x4 9 128 4 0.00001072609031817021
+armv7neon_mmm_f32_8x6_generic 16 32 7 0.000008131169126293927
+armv7neon_mmm_f32_8x6_cortexa9 8 32 11 0.000004596930720411251
+generic_f32_4x4 3 128 3 0.000003987497974732173
+armv7neon_mmm_f32_8x6_generic 16 32 18 0.000011518507884431023
+generic_f32_4x4 12 4 13 0.0000067677797185944386
+armv7neon_mmm_f32_8x6_cortexa7 17 4 12 0.00000420578831765641
+armv7neon_mmm_f32_8x6_cortexa7 25 128 11 0.00005298746347142357
+armv7neon_mmm_f32_8x6_cortexa9 17 4 17 0.0000064380265240168815
+armv7neon_mmm_f32_8x4_cortexa9 16 32 3 0.000003850508528044626
+armv7neon_mmm_f32_8x4_cortexa7 15 4 9 0.0000041824910345443945
+armv7neon_mmm_f32_8x6_cortexa7 16 4 13 0.000004283476073931167
+armv7neon_mmm_f32_8x6_cortexa9 15 32 19 0.00001676006214460718
+armv7neon_mmm_f32_8x6_cortexa9 9 4 6 0.0000017882273242368877
+generic_f32_4x4 11 4 12 0.000005385894826689573
+armv7neon_mmm_f32_8x6_generic 25 128 11 0.00005279747695062305
+armv7neon_mmm_f32_8x4_generic 25 128 11 0.000057163651606893397
+armv7neon_mmm_f32_8x6_generic 16 32 13 0.000011809288098621582
+armv7neon_mmm_f32_8x6_cortexa7 16 128 7 0.00002656376898311115
+armv7neon_mmm_f32_8x6_generic 8 4 19 0.0000028980842866481956
+armv7neon_mmm_f32_8x6_cortexa9 25 4 17 0.000008360160748075129
+armv7neon_mmm_f32_8x6_cortexa7 25 32 17 0.00002416762541107772
+armv7neon_mmm_f32_8x4_cortexa7 9 4 11 0.000003971891623914232
+armv7neon_mmm_f32_8x6_cortexa7 8 128 18 0.000019766077162036943
+armv7neon_mmm_f32_8x4_generic 15 32 3 0.0000036533992903578352
+armv7neon_mmm_f32_8x6_cortexa7 25 32 5 0.0000087944366611451
+armv7neon_mmm_f32_8x6_cortexa7 7 128 5 0.000007076691588612229
+armv7neon_mmm_f32_8x6_generic 23 4 6 0.000002427909563551317
+armv7neon_mmm_f32_8x4_cortexa9 17 4 9 0.000005573474441160942
+armv7neon_mmm_f32_8x4_cortexa9 25 4 5 0.000005112325491179266
+generic_f32_4x4 8 32 5 0.000005323391922801219
+armv7neon_mmm_f32_8x6_cortexa7 24 128 18 0.00005856236723783224
+armv7neon_mmm_f32_8x6_generic 23 4 18 0.000006215918837505929
+armv7neon_mmm_f32_8x4_generic 16 32 4 0.000003260594187485516
+armv7neon_mmm_f32_8x4_generic 8 4 11 0.000002124764231070715
+armv7neon_mmm_f32_8x6_cortexa9 15 128 7 0.000027104995655962794
+armv7neon_mmm_f32_8x4_cortexa7 23 4 3 0.000002539994674766333
+armv7neon_mmm_f32_8x4_cortexa9 15 128 13 0.000040120613837028105
+armv7neon_mmm_f32_8x4_generic 9 32 5 0.000006416826225047117
+armv7neon_mmm_f32_8x6_generic 9 128 13 0.000039231938860404315
+armv7neon_mmm_f32_8x4_cortexa7 17 32 9 0.000014080751383917914
+armv7neon_mmm_f32_8x4_generic 24 4 5 0.0000038475341959329615
+armv7neon_mmm_f32_8x6_cortexa7 17 32 19 0.00002385320479583871
+armv7neon_mmm_f32_8x4_cortexa9 25 4 7 0.000005243554539274239
+armv7neon_mmm_f32_8x4_cortexa9 8 128 7 0.000010306643098752432
+armv7neon_mmm_f32_8x4_cortexa7 9 32 9 0.000009612025649062918
+armv7neon_mmm_f32_8x6_generic 7 128 6 0.000006918233017670394
+armv7neon_mmm_f32_8x6_cortexa7 23 32 6 0.000006480642874727307
+armv7neon_mmm_f32_8x4_cortexa7 17 4 8 0.0000036467312614380765
+armv7neon_mmm_f32_8x6_cortexa9 9 4 18 0.000004263423188010347
+armv7neon_mmm_f32_8x4_cortexa9 23 4 3 0.000002565453674896801
+armv7neon_mmm_f32_8x6_cortexa7 17 128 17 0.00005934218053895787
+armv7neon_mmm_f32_8x6_cortexa9 9 128 11 0.00002703084778716899
+armv7neon_mmm_f32_8x6_cortexa7 8 32 19 0.000008254990878887053
+armv7neon_mmm_f32_8x4_cortexa7 24 4 11 0.000005437311620791031
+armv7neon_mmm_f32_8x6_cortexa7 17 32 12 0.000012103957972190128
+armv7neon_mmm_f32_8x6_cortexa7 16 32 11 0.000008575915963542344
+armv7neon_mmm_f32_8x6_cortexa7 15 128 13 0.00003995376024317136
+armv7neon_mmm_f32_8x4_cortexa7 25 128 12 0.00005747997493377876
+armv7neon_mmm_f32_8x6_cortexa7 9 4 17 0.000004513992669521213
+armv7neon_mmm_f32_8x4_cortexa9 7 32 11 0.000005542086596158504
+armv7neon_mmm_f32_8x4_generic 16 32 8 0.00000600791906169857
+armv7neon_mmm_f32_8x6_cortexa9 8 4 6 0.000001079992540189233
+armv7neon_mmm_f32_8x6_cortexa7 15 4 7 0.0000033103105460849674
+armv7neon_mmm_f32_8x6_generic 7 4 18 0.0000027798204465348195
+armv7neon_mmm_f32_8x4_cortexa9 9 4 4 0.0000016086729123039774
+armv7neon_mmm_f32_8x4_cortexa9 17 4 13 0.000007139907020236834
+armv7neon_mmm_f32_8x6_generic 7 4 17 0.000002811022144491351
+armv7neon_mmm_f32_8x4_generic 16 4 12 0.00000330778427406012
+armv7neon_mmm_f32_8x6_cortexa7 17 32 7 0.000012331946306204285
+armv7neon_mmm_f32_8x4_cortexa7 23 128 4 0.000014866985895709814
+armv7neon_mmm_f32_8x6_generic 8 128 5 0.000006930294425648014
+armv7neon_mmm_f32_8x6_cortexa7 7 128 11 0.000013755100434106755
+armv7neon_mmm_f32_8x4_cortexa7 9 4 3 0.0000017694089316929863
+armv7neon_mmm_f32_8x4_cortexa9 25 128 5 0.00003989811331140034
+armv7neon_mmm_f32_8x6_cortexa9 24 4 18 0.000005659337689416026
+armv7neon_mmm_f32_8x6_cortexa7 7 32 7 0.000004615694133689407
+armv7neon_mmm_f32_8x4_generic 23 4 12 0.000005414200630328292
+armv7neon_mmm_f32_8x6_cortexa7 15 128 7 0.00002674534329947021
+armv7neon_mmm_f32_8x4_cortexa9 23 128 13 0.0000596298628444228
+armv7neon_mmm_f32_8x6_cortexa9 7 32 19 0.000008889380829716555
+armv7neon_mmm_f32_8x6_cortexa7 8 4 17 0.0000025025489216272165
+armv7neon_mmm_f32_8x6_cortexa9 24 128 5 0.000020677151329306246
+armv7neon_mmm_f32_8x6_cortexa9 17 32 17 0.000018491383989214777
+armv7neon_mmm_f32_8x4_cortexa7 8 4 11 0.000002146399138479012
+armv7neon_mmm_f32_8x6_cortexa7 7 4 19 0.0000034947036576981197
+armv7neon_mmm_f32_8x6_cortexa7 15 32 13 0.00001261074648998989
+armv7neon_mmm_f32_8x6_cortexa7 17 128 12 0.000039467248529014934
+armv7neon_mmm_f32_8x6_cortexa9 15 128 13 0.0000405486222699268
+armv7neon_mmm_f32_8x6_cortexa7 8 4 6 0.0000010838257676938484
+armv7neon_mmm_f32_8x4_cortexa7 8 128 11 0.00001478367268084039
+generic_f32_4x4 13 4 5 0.000005005970941882336
+armv7neon_mmm_f32_8x4_generic 9 4 5 0.0000027981911123646093
+armv7neon_mmm_f32_8x4_cortexa9 24 32 3 0.000005499754513818659
+armv7neon_mmm_f32_8x4_cortexa9 16 32 8 0.00000632372871965183
+armv7neon_mmm_f32_8x6_generic 25 32 7 0.000015750403152655726
+armv7neon_mmm_f32_8x6_cortexa7 9 128 6 0.000013469652928515112
+generic_f32_4x4 5 128 8 0.000014178354640152658
+armv7neon_mmm_f32_8x4_cortexa9 25 32 13 0.00002471922478461092
+armv7neon_mmm_f32_8x4_generic 15 32 7 0.000006693321105057111
+armv7neon_mmm_f32_8x4_cortexa9 23 128 8 0.000029940232861406133
+armv7neon_mmm_f32_8x4_cortexa7 7 128 5 0.000010257823038892243
+generic_f32_4x4 4 32 12 0.000003911119050209719
+armv7neon_mmm_f32_8x6_cortexa7 8 128 7 0.000013528068455942153
+armv7neon_mmm_f32_8x4_cortexa7 24 4 5 0.000003893712023496801
+armv7neon_mmm_f32_8x4_cortexa7 9 4 5 0.00000282867688987085
+armv7neon_mmm_f32_8x6_generic 9 4 6 0.0000017337687872243595
+armv7neon_mmm_f32_8x4_generic 8 32 11 0.000004839072297522764
+armv7neon_mmm_f32_8x4_cortexa7 16 32 8 0.000006215610136029107
+armv7neon_mmm_f32_8x4_cortexa7 24 128 12 0.00004296408794693615
+armv7neon_mmm_f32_8x6_generic 23 32 7 0.000012139369977001298
+armv7neon_mmm_f32_8x6_cortexa9 23 32 7 0.00001262657347999726
+armv7neon_mmm_f32_8x6_generic 9 32 18 0.000011811540149507637
+generic_f32_4x4 4 4 12 0.0000019950687694365225
+armv7neon_mmm_f32_8x4_cortexa9 17 32 13 0.000018760347719013833
+armv7neon_mmm_f32_8x4_generic 24 128 8 0.000027893007985374452
+armv7neon_mmm_f32_8x6_generic 7 32 12 0.000004596558109391685
+generic_f32_4x4 12 32 12 0.0000107075710637508
+armv7neon_mmm_f32_8x6_generic 16 4 18 0.000003888078280974674
+generic_f32_4x4 12 128 7 0.000021189568041458556
+armv7neon_mmm_f32_8x6_generic 16 128 11 0.000025972024072475202
+armv7neon_mmm_f32_8x6_cortexa7 15 4 13 0.000004683720683139814
+armv7neon_mmm_f32_8x6_cortexa9 7 128 6 0.000007213870080063144
+armv7neon_mmm_f32_8x6_cortexa9 23 128 11 0.00004071170407195971
+armv7neon_mmm_f32_8x6_generic 24 4 5 0.000002793903924978298
+armv7neon_mmm_f32_8x4_cortexa9 15 32 9 0.000010036115634563751
+generic_f32_4x4 9 128 9 0.000031422044792444864
+armv7neon_mmm_f32_8x4_generic 25 32 4 0.000006133898665196312
+generic_f32_4x4 13 128 8 0.00002772292203595808
diff --git a/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.rs b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.rs
new file mode 100644
index 000000000..6ccb6adbe
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.rs
@@ -0,0 +1,16 @@
+use crate::frame::mmm::CostModel;
+        pub fn model() -> CostModel<'static> {
+            CostModel {
+                big_product_mkn_threshold: 4194036.0,
+                big_product_kernel_choice: "armv7neon_mmm_f32_8x6_cortexa9",
+                kernels: &["armv7neon_mmm_f32_8x4_cortexa7", "armv7neon_mmm_f32_8x4_cortexa9", "armv7neon_mmm_f32_8x4_generic", "armv7neon_mmm_f32_8x6_cortexa7", "armv7neon_mmm_f32_8x6_cortexa9", "armv7neon_mmm_f32_8x6_generic", "generic_f32_4x4"],
+                mrs: &[4, 8],
+                nrs: &[4, 6],
+                feat_norm_mean: &[4.582296677813486, 4.595402322442016, 4.571260231028445, 13.748959231283994, 1.5179177668804225, 0.7575757575757576, 3.5337608449641644, 0.8831887338111405, 1.5048409405255878, 0.7526719476926946, 2.489123601156796, 0.8326417704011065],
+                feat_norm_stddev: &[1.2635817489024164, 1.2723436827339079, 1.2620157548883217, 1.3497763942449361, 1.1141159992246472, 0.42854956435545316, 2.2880460409304937, 0.32119525880720723, 1.1154901716833412, 0.43145902105435263, 1.7051378780434328, 0.37329539587896904],
+                w1: &[0.5391961336135864, -0.32089367508888245, 0.203999862074852, -0.10011337697505951, 0.09040801972150803, -0.14198464155197144, 0.031854499131441116, 0.12334256619215012, 0.15339604020118713, -0.20091375708580017, -0.014548280276358128, 0.12154694646596909, 0.31225234270095825, 0.10782113671302795, 0.44618168473243713, 0.8267014026641846, -0.1204405128955841, -0.08261110633611679, -0.052502430975437164, 0.3066086769104004, 0.1493932157754898, -0.14119412004947662, -0.1985343098640442, 0.19361039996147156, -0.4636686146259308, 0.08120443671941757, 0.03210291638970375, 0.17303235828876495, 0.16502155363559723, -0.19771894812583923, -0.11060577630996704, 0.08698348701000214, -0.07793445140123367, 0.32749465107917786, 0.3663202226161957, -0.4629170894622803, -0.1586134433746338, 0.4272242486476898, -0.12016090005636215, -0.17830348014831543, -0.05493386462330818, -0.036517318338155746, 0.01293050218373537, 0.016577009111642838, 0.10738552361726761, -0.3662779629230499, -0.2917434275150299, 0.5752639770507812, 0.11406347155570984, 0.8622727394104004, 0.07158719748258591, 0.29530274868011475, -0.11287810653448105, 0.12262264639139175, 0.02478562481701374, 0.17749948799610138, -0.036227867007255554, 0.10140471905469894, -0.011896232143044472, -0.021761735901236534, 0.06046223267912865, 0.5727048516273499, -0.007826486602425575, 0.3863913118839264, -0.04224887117743492, 0.056023009121418, -0.02467598207294941, 0.0385640449821949, 0.0219524335116148, -0.03437826409935951, -0.2060588151216507, 0.2895224988460541, 0.10751669108867645, 0.00845037866383791, -0.1836385875940323, -0.24757762253284454, -0.09606243669986725, 0.03918633610010147, 0.07913251221179962, 0.06499160826206207, -0.08156774938106537, 0.08835449814796448, 0.13896305859088898, -0.16936920583248138, 0.010146846994757652, -0.42553824186325073, 0.39916151762008667, -0.004584060981869698, -0.10256388038396835, 0.041573416441679, 0.05155385658144951, 0.015019520185887814, 0.09554271399974823, -0.20487457513809204, -0.4146610200405121, -0.773110032081604, 0.3662724494934082, -0.23762361705303192, 0.6974321603775024, 0.8990052938461304, 0.02772649936378002, 0.042197681963443756, -0.0022736566606909037, -0.028843341395258904, -0.4559306204319, 0.6326258778572083, 0.4568879008293152, -0.4892531633377075, -0.032289132475852966, 0.04378330707550049, -0.4118069112300873, 0.2493579089641571, -0.021955665200948715, -0.01538186427205801, -0.21400974690914154, -0.09971866756677628, 0.02185226045548916, -0.18125569820404053, -0.13828244805335999, -0.20846466720104218, -0.10373540222644806, 0.4842098653316498, -0.06586655229330063, 0.03369470313191414, 0.013142148964107037, 0.017437899485230446, 0.15891534090042114, 0.5269678831100464, 0.02546108327805996, -0.004250233061611652, -5.8676625485531986e-05, 0.06777831166982651, -0.14051207900047302, 0.6876491904258728, -0.3455996811389923, 0.0378129817545414, 0.15291574597358704, -0.03829087316989899, -0.05761529877781868, -0.05344394966959953, 0.1421334147453308, -0.3614322543144226, -0.21606910228729248, 0.1558765172958374, 0.14480257034301758, -0.1799984872341156, 0.4238421618938446, -0.08961529284715652, -0.04010967165231705, 0.14250615239143372, -0.0038367861416190863, -0.044531334191560745, -0.08958051353693008, -0.1577986180782318, -0.5795103907585144, -1.1048516035079956, 0.16444185376167297, -0.09989812225103378, -0.26304998993873596, 0.040687527507543564, 0.065303735435009, -0.06267901510000229, 0.08742637187242508, 0.02480895072221756, 0.23719966411590576, -0.09509539604187012, 0.39278310537338257, 0.18978112936019897, 0.11301649361848831, -0.16268616914749146, -0.14119602739810944, -0.04518252611160278, 0.10456270724534988, 0.008367948234081268, 0.004280170891433954, 0.01894286274909973, -0.1547478288412094, 0.197267547249794, 0.20271208882331848, -0.28377917408943176, -0.26751258969306946, 0.15954937040805817, 0.33988064527511597, 0.16848208010196686, 0.11668887734413147, -0.057433612644672394, -0.049777109175920486, 0.00744214653968811, -0.012330793775618076, -0.08413149416446686, -0.2053118497133255, 0.09235486388206482, -0.1354941576719284, 0.41610953211784363, 0.8428494334220886, 0.880882740020752, 0.024029193446040154, -0.08453702926635742, 0.00771496444940567, -0.013013732619583607, -0.23804998397827148, 0.4110376536846161, 0.23720477521419525, -0.13951541483402252, -0.1747516244649887, -0.34215790033340454, 0.014357345178723335, 0.34224632382392883, 0.03783192113041878, 0.01125166192650795, -0.08253959566354752, 0.015717405825853348, -0.22759634256362915, 0.3980898857116699, 0.2427154779434204, -0.3319437801837921, 0.11146843433380127, -0.9666317105293274, -0.12227121740579605, -0.1948898285627365, -0.030186548829078674, 0.0011711223050951958, -0.040062546730041504, -0.16316139698028564, -0.14714862406253815, 0.13224393129348755, -0.0019320327555760741, -0.09674090147018433, 0.3630145490169525, -0.019513679668307304, -0.07729464769363403, -0.34592965245246887, 0.15215164422988892, 0.046678490936756134, 0.06675180792808533, -0.08943335711956024, 0.006386714521795511, 0.10086977481842041, -0.07409387081861496, -0.19604018330574036, -0.042700666934251785, 0.12124726921319962, 0.5694677233695984, 0.25033196806907654, 0.01862989366054535, 0.0053687929175794125, -0.0017405126709491014, -0.01638556271791458, -0.32222822308540344, 0.5348804593086243, 0.5546748042106628, 1.2770946025848389, 0.11648745834827423, -0.058405984193086624, -0.2997635006904602, -0.2040756195783615, 0.15525077283382416, -0.12436354905366898, -0.089121975004673, 0.06441225856542587, 0.2444663643836975, -0.3495825529098511, -0.05243751034140587, 0.08752834796905518, 0.08800745010375977, -0.09807545691728592, -0.3823537230491638, -0.13047000765800476, 0.029333092272281647, 0.11618250608444214, -0.0638590008020401, -0.09598273783922195, -0.07390140742063522, 0.09151650220155716, -0.1700282245874405, 0.23608872294425964, 0.24879834055900574, -0.15922772884368896, -0.33795130252838135, -0.053850702941417694, 0.1014639139175415, -0.05480973795056343, -0.06753639131784439, 0.04606246575713158, -0.07082260400056839, 0.07848796248435974, 0.05011916160583496, -0.05570689216256142, -0.14584510028362274, -0.8908579349517822, -0.5959509611129761, -0.8982105255126953, 0.0788002535700798, -0.03575791418552399, 0.052424680441617966, -0.08019822835922241, 0.10848221182823181, 0.0957408994436264, 0.1457311511039734, -0.1956494003534317, -0.21669772267341614, 0.9854136109352112, -0.23215851187705994, 0.16359730064868927, 0.02025810070335865, -0.08975380659103394, -0.013868067413568497, -0.22188447415828705, 0.020666224882006645, -0.22304703295230865, 0.06407633423805237, 0.19804184138774872, -0.05285267159342766, -0.5510660409927368, -0.8522927761077881, -0.6061599850654602, 0.08484024554491043, -0.08973539620637894, 0.013228937052190304, -0.07834818214178085, 0.02858446165919304, -0.3826225996017456, 0.059726644307374954, 0.1139102503657341, -0.19311848282814026, 0.05770142376422882, 0.22584261000156403, 0.34312352538108826, -0.15085645020008087, 0.34372228384017944, 0.08070214092731476, 0.5744000673294067, -0.08693907409906387, -0.003695777617394924, -0.1334235966205597, 0.06418291479349136, 0.02848576195538044, -0.34958112239837646, -0.3419312834739685, -0.09599799662828445, 0.015022341161966324, 0.03255023807287216, 0.09713662415742874, -0.1730588674545288, 0.1904430240392685, -0.32815566658973694, -0.16749203205108643, 0.35736411809921265, -0.503787100315094, 0.5057004690170288, -0.47198373079299927, 0.11386436969041824, -0.0722493901848793, 0.03358639404177666, 0.005928087048232555, -0.05637047439813614, 0.06552420556545258, -0.07283362001180649, -0.09314802289009094, 0.13586974143981934, -0.5054865479469299, -0.18127793073654175, 0.08853171765804291, -0.13333705067634583, -0.2623322308063507, 0.17757390439510345, 0.04408252611756325, -0.0277855321764946, -0.05175777152180672, 0.40444689989089966, -0.03518976643681526, -0.36402902007102966, -0.019589770585298538, -0.05277400091290474, -0.27273234724998474, -0.07373850792646408, -0.058221735060214996, 0.14292845129966736, -0.005004828795790672, -0.05554938316345215, 0.20361287891864777, -0.30462127923965454, -0.1140812486410141, 0.16081976890563965, -0.07133162021636963, -0.20463652908802032, 0.34733739495277405, 0.17099761962890625, 0.025868643075227737, -0.02960631065070629, -0.02717636525630951, 0.02027258090674877, -0.13165302574634552, 0.36201152205467224, 0.5002728700637817, 0.39691421389579773, -0.04605599492788315, 0.28801581263542175, -1.0140656232833862, -0.5481916666030884, 0.0896061584353447, -0.049390073865652084, 0.08813252300024033, -0.1784677952528, 0.34480658173561096, -0.36402803659439087, 0.16948284208774567, 0.45740315318107605, -0.23747704923152924, 0.580975353717804, -0.24338461458683014, -0.11410018056631088, 0.06431885808706284, -0.0317281149327755, -0.024683356285095215, -0.10083278268575668, 0.024547407403588295, -0.16270779073238373, -0.07757837325334549, 0.19732129573822021, 0.03790999948978424, -0.18804220855236053, 0.8675169348716736, 0.5377629399299622, -0.0036910742055624723, -0.0016441351035609841, -0.030448857694864273, 0.07757671177387238, -0.1475408971309662, 0.613543689250946, 0.30266445875167847, 0.12106148898601532, 0.05485830456018448, -0.04748840630054474, -0.23233623802661896, -0.1949906051158905, 0.05692804977297783, 0.07474583387374878, -0.11879625171422958, 0.07200933247804642, -0.012743310071527958, -0.02546215057373047, -0.3765566349029541, 0.28637346625328064, -0.18051809072494507, 0.5034835934638977, -0.34970414638519287, -0.2386687994003296, -0.03804561868309975, -0.03649319335818291, -0.10303670912981033, 0.1299818456172943, 0.24685724079608917, -0.34168556332588196, -0.086674265563488, 0.32085898518562317, 0.48488491773605347, -0.522548258304596, 0.309568852186203, 0.167385995388031, 0.11308691650629044, 0.14733079075813293, -0.22416195273399353, 0.14763982594013214, -0.07242503017187119, 0.07601745426654816, -0.10375087708234787, -0.03409396857023239, -0.35759225487709045, 0.18936687707901, 0.28248289227485657, 0.26482364535331726, 0.061123836785554886, -0.021603189408779144, -0.13469825685024261, 0.07248867303133011, -0.03464066982269287, 0.06557167321443558, 0.16093865036964417, -0.1718607246875763],
+                b1: &[-0.3893989324569702, -0.2791002690792084, 0.07853052020072937, -0.4629746377468109, -0.7148261070251465, 0.8680436015129089, -0.46459102630615234, 0.0404132716357708, -0.44012945890426636, 0.08434166759252548, 0.32190972566604614, -0.20194832980632782, -0.3781348764896393, -0.23968002200126648, -0.581799328327179, 0.6500483155250549, -0.6192854046821594, 0.5922245383262634, 0.44006091356277466, 0.2982949912548065, 0.6136102676391602, -0.597486138343811, -0.3697699308395386, -0.45241132378578186, 0.60771644115448, -0.3373708128929138, 0.5697194337844849, 0.4784911870956421, -0.49601855874061584, 0.5023709535598755, 0.21592296659946442, -0.45412343740463257, 0.5104787945747375, 0.558862566947937, 0.4729066491127014, -0.5520593523979187, -0.5120576620101929, -0.7157037258148193, 0.12596718966960907, 0.4773174524307251],
+                w2: &[0.1379607617855072, 0.09308824688196182, -0.2596932649612427, 0.4461972713470459, 0.3480601906776428, 0.036684323102235794, 0.4057384729385376, -0.3081648051738739, 0.4561280608177185, 0.2749394178390503, -0.1400817334651947, 0.3145979046821594, -0.16919250786304474, 0.7247185707092285, 0.3479674756526947, -0.7546817064285278, 0.38135531544685364, -0.3939172029495239, -0.038021210581064224, 0.026914050802588463, -0.5281358361244202, 0.39009571075439453, 0.4090450406074524, 0.5053343772888184, -0.23938016593456268, 0.488080233335495, -0.38536468148231506, -0.23763014376163483, 0.2661689519882202, -0.14746293425559998, -0.7541974186897278, 0.27726081013679504, -0.4072169065475464, -0.8030230402946472, -0.386343389749527, 0.6674754619598389, 0.06677238643169403, 0.5055669546127319, -0.44330647587776184, -0.3423362970352173, -0.10948927700519562, 0.11290912330150604, -0.2759379744529724, 0.5522158741950989, -0.5766478776931763, 0.7288797497749329, -0.4967955946922302, -0.5466133952140808, 0.7254890203475952, 0.1274457424879074, 0.3098924458026886, 0.2524661719799042, -0.7162019610404968, 0.19503603875637054, -0.5212412476539612, 0.0968603864312172, 0.4835629463195801, -0.5865079164505005, 0.27647316455841064, 0.1975109577178955, -0.845225989818573, 0.4172143042087555, -0.014424118213355541, -0.24702520668506622, -0.16123531758785248, -0.047759659588336945, -0.09985388815402985, 0.10430619865655899, 0.53556889295578, 0.2595883011817932, 0.11729882657527924, 0.36996161937713623, -0.41997936367988586, -0.3332042694091797, 0.2527308464050293, 0.6039140820503235, 0.35183605551719666, 0.42042237520217896, -0.2265913337469101, -0.06852111965417862, 0.3749903440475464, 0.3698897361755371, -0.43096107244491577, 0.1275794953107834, 0.27926334738731384, -0.3282606303691864, 0.290679931640625, -0.14467079937458038, 0.3357028663158417, -0.0683436468243599, -0.35492125153541565, -0.14275093376636505, -0.1504347324371338, 0.1782987266778946, 0.07464402168989182, -0.2788643538951874, 0.5896115303039551, -0.314520001411438, -0.3235827684402466, -0.2899278700351715, -0.21264874935150146, 0.41862159967422485, 0.3237628936767578, 0.2948566973209381, -0.6101413369178772, -0.025511808693408966, -0.4238346517086029, -0.28283095359802246, 0.32077667117118835, -0.34138476848602295, -0.5257527232170105, 0.24129967391490936, -0.38175472617149353, -0.20559589564800262, -0.11267697811126709, 0.32475054264068604, 0.29545050859451294, 0.0010625360300764441, 0.4097916781902313, -0.3120468556880951, 0.3134985566139221, 0.33620578050613403, -0.27408266067504883, -0.0118736382573843, 0.21356475353240967, -0.6716119647026062, 0.14166241884231567, 0.020748334005475044, 0.27158322930336, -0.27066248655319214, -0.5078546404838562, 0.39642488956451416, 0.4044502079486847, 0.1363500952720642, 0.38089585304260254, -0.18438327312469482, -0.08652642369270325, 0.05718545988202095, -0.5758764743804932, 0.0948563665151596, 0.298057496547699, -0.07299521565437317, -0.24248233437538147, 0.29135069251060486, -0.44556060433387756, 0.6689074039459229, -0.12930674850940704, -0.12669484317302704, 0.1074564978480339, -0.20472179353237152, 0.14787982404232025, -0.13180267810821533, 0.3045596182346344, -0.3345180153846741, -0.3405822217464447, 0.22327540814876556, 0.02809770777821541, 0.17404714226722717, 0.22873322665691376, -0.3915692865848541, -0.39005470275878906, -0.4675980806350708, 0.44798821210861206, -0.31790846586227417, -0.21734853088855743, 0.2172199934720993, -0.3485357165336609, 0.1241735890507698, -0.6933310031890869, -0.09649480134248734, 0.24731965363025665, -0.20421941578388214, 0.13033808767795563, -0.4282769560813904, -0.22173112630844116, 0.08912057429552078, -0.3927532434463501, 0.3523387908935547, 0.36073970794677734, -0.036902282387018204, 0.5880261063575745, -0.29945725202560425, -0.40845751762390137, -0.3265145421028137, 0.370391309261322, -0.3553546965122223, 0.5133077502250671, 0.1800842434167862, -0.34683868288993835, 0.28811708092689514, 0.3033837080001831, -0.4140017628669739, 0.4362258017063141, 0.3689269423484802, 0.3121638596057892, -0.3287503123283386, -0.15226924419403076, -0.17191028594970703, -0.10683685541152954, 0.34219542145729065, 0.34955963492393494, 0.22892920672893524, -0.20123478770256042, -0.3934169411659241, 0.25449705123901367, -0.541163444519043, 0.21640898287296295, 0.19343338906764984, -0.14020974934101105, 0.010480044409632683, -0.24229897558689117, -0.4682120084762573, 0.02336042746901512, 0.039344485849142075, 0.42446646094322205, -0.3173693120479584, 0.23609045147895813, 0.20335273444652557, -0.19347436726093292, -0.05698636546730995, 0.17990583181381226, 0.30915674567222595, 0.3115670382976532, 0.4147215485572815, -0.38558056950569153, -0.12379863113164902, 0.025996098294854164, -0.3010733425617218, 0.03275908902287483, -0.6039671897888184, 0.06267470866441727, -0.012677585706114769, 0.3484704792499542, 0.24301587045192719, -0.40881243348121643, -0.16732162237167358, 0.190901979804039, -0.5619192719459534, 0.30009278655052185, -0.43359509110450745, 0.26643550395965576, 0.5083268880844116, 0.3491555452346802, 0.4731655716896057, 0.6301924586296082, -0.8111121654510498, 0.6473397016525269, -0.001451796037144959, 0.3649038076400757, -0.6002859473228455, -0.41925248503685, 0.05584913119673729, 0.7823511362075806, 0.421135276556015, 0.5779385566711426, -0.49475061893463135, 0.5293950438499451, -0.45432502031326294, -0.680946946144104, -0.3506624102592468, -0.21028658747673035, 0.4775547385215759, 0.25049126148223877, 0.2707470655441284, -0.3469635546207428, 0.5959001779556274, -0.5623777508735657, -0.6334168910980225, 0.4096938669681549, -0.3921370208263397, -0.27649807929992676, 0.4424516260623932, -0.28308066725730896, -0.22009265422821045, -0.386872798204422, 0.5130718350410461, 0.5702601075172424, 0.7469420433044434, -0.09606175124645233, -0.4271978437900543],
+                b2: &[-0.07522959262132645, 0.3644154667854309, -0.25166040658950806, -0.12973527610301971, 0.25026997923851013, -0.2794199585914612, -0.17614373564720154],
+            }
+        }
diff --git a/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.txt b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.txt
new file mode 100644
index 000000000..201cd54ce
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm32/cortex_a9.txt
@@ -0,0 +1,1701 @@
+armv7neon_mmm_f32_8x6_generic 17 128 19 0.00006235573582381347
+armv7neon_mmm_f32_8x4_cortexa7 23 32 3 0.000006021597781788675
+armv7neon_mmm_f32_8x6_cortexa7 17 128 7 0.000041163109831630036
+armv7neon_mmm_f32_8x6_generic 9 4 5 0.0000020753617625129768
+generic_f32_4x4 13 4 3 0.000003220368712907131
+armv7neon_mmm_f32_8x4_cortexa7 9 128 3 0.000011219671010907719
+armv7neon_mmm_f32_8x6_generic 24 4 12 0.00000416032880372066
+armv7neon_mmm_f32_8x6_cortexa9 15 128 12 0.00002134037524275856
+armv7neon_mmm_f32_8x4_cortexa7 16 32 7 0.000007487730700761545
+armv7neon_mmm_f32_8x4_generic 25 128 4 0.000015723210051937644
+armv7neon_mmm_f32_8x6_cortexa9 15 128 13 0.000031729639665247244
+armv7neon_mmm_f32_8x4_cortexa7 17 128 5 0.00003233807616782481
+generic_f32_4x4 5 4 9 0.000004114340189363069
+armv7neon_mmm_f32_8x4_cortexa9 7 4 12 0.0000026805076801341797
+armv7neon_mmm_f32_8x6_cortexa7 8 4 7 0.0000020191503624854738
+generic_f32_4x4 5 32 4 0.0000030555103653558445
+armv7neon_mmm_f32_8x4_cortexa7 8 128 12 0.000016128094616247412
+generic_f32_4x4 13 128 8 0.000029057855790622486
+armv7neon_mmm_f32_8x6_cortexa7 23 128 5 0.00002125083853467516
+generic_f32_4x4 5 128 4 0.000007724128084704853
+generic_f32_4x4 9 128 5 0.000022305019807747277
+armv7neon_mmm_f32_8x6_generic 23 32 11 0.000011268391904458938
+armv7neon_mmm_f32_8x4_generic 7 4 12 0.0000026796237449338948
+armv7neon_mmm_f32_8x4_cortexa9 8 32 11 0.000004490236277399054
+armv7neon_mmm_f32_8x6_cortexa7 23 4 7 0.000005431543985839428
+armv7neon_mmm_f32_8x4_cortexa9 7 32 11 0.000004893302315959357
+armv7neon_mmm_f32_8x6_generic 17 4 6 0.0000024671248311817606
+armv7neon_mmm_f32_8x6_cortexa7 7 4 7 0.0000023786534435590155
+armv7neon_mmm_f32_8x6_cortexa9 23 32 7 0.000011142179078429717
+armv7neon_mmm_f32_8x6_cortexa7 23 128 18 0.00006187753967619513
+armv7neon_mmm_f32_8x4_cortexa9 17 4 13 0.000007583246734362711
+armv7neon_mmm_f32_8x4_cortexa7 25 32 8 0.000013742616132230942
+armv7neon_mmm_f32_8x4_cortexa9 15 128 13 0.00003174823725370825
+armv7neon_mmm_f32_8x6_cortexa7 8 32 17 0.0000068525292311713665
+armv7neon_mmm_f32_8x4_generic 23 4 12 0.000005946433902061308
+armv7neon_mmm_f32_8x4_generic 7 32 12 0.000004913908866329809
+armv7neon_mmm_f32_8x4_generic 9 4 5 0.0000029635862825605913
+armv7neon_mmm_f32_8x6_generic 24 128 7 0.00003097026633557495
+armv7neon_mmm_f32_8x4_generic 24 4 13 0.000007343587112941961
+armv7neon_mmm_f32_8x4_generic 23 4 8 0.000004129384710709428
+generic_f32_4x4 11 128 8 0.00002206478635932785
+armv7neon_mmm_f32_8x4_generic 8 4 9 0.000002203244153911471
+armv7neon_mmm_f32_8x4_cortexa9 17 32 3 0.000004810722960669039
+armv7neon_mmm_f32_8x4_cortexa7 7 128 12 0.000016707563395180685
+armv7neon_mmm_f32_8x6_cortexa7 7 4 12 0.000002638802872509435
+armv7neon_mmm_f32_8x4_cortexa7 17 128 9 0.0000480527815279415
+armv7neon_mmm_f32_8x4_cortexa7 7 128 7 0.000011282871511736955
+armv7neon_mmm_f32_8x4_cortexa9 16 4 5 0.000002910382588603331
+armv7neon_mmm_f32_8x4_generic 17 128 13 0.00004671426916376633
+armv7neon_mmm_f32_8x4_cortexa7 9 128 8 0.00002150638840328972
+armv7neon_mmm_f32_8x4_cortexa9 25 32 12 0.000015994893141833952
+armv7neon_mmm_f32_8x4_cortexa7 23 32 11 0.000016244020387064702
+armv7neon_mmm_f32_8x6_cortexa9 7 4 11 0.000002387141196478666
+armv7neon_mmm_f32_8x6_cortexa7 7 4 5 0.0000013875129728218855
+armv7neon_mmm_f32_8x4_cortexa7 15 32 4 0.0000039730943727200555
+armv7neon_mmm_f32_8x4_cortexa7 24 32 3 0.000006034875853921332
+armv7neon_mmm_f32_8x4_cortexa9 8 4 5 0.0000016776748569042105
+armv7neon_mmm_f32_8x6_cortexa7 16 4 18 0.0000045013629937264245
+armv7neon_mmm_f32_8x4_cortexa9 8 128 8 0.000008024678845849025
+armv7neon_mmm_f32_8x6_cortexa9 25 4 18 0.000008301730396887936
+armv7neon_mmm_f32_8x4_generic 7 32 11 0.000004892898505691065
+armv7neon_mmm_f32_8x4_cortexa9 16 128 4 0.000008030995808345325
+armv7neon_mmm_f32_8x4_cortexa7 16 32 3 0.000004194758783711011
+armv7neon_mmm_f32_8x4_generic 25 32 3 0.000006291923472603681
+generic_f32_4x4 3 32 13 0.000005952451348897524
+armv7neon_mmm_f32_8x6_cortexa9 23 32 19 0.000021623159632105592
+armv7neon_mmm_f32_8x4_generic 9 4 7 0.0000030054177268153855
+armv7neon_mmm_f32_8x4_cortexa9 7 128 3 0.0000044229477614846184
+armv7neon_mmm_f32_8x4_generic 25 32 12 0.0000159741241286288
+armv7neon_mmm_f32_8x6_generic 8 32 5 0.000002351046891403544
+armv7neon_mmm_f32_8x6_generic 8 128 17 0.00001572220287732098
+armv7neon_mmm_f32_8x6_cortexa9 25 4 13 0.00000878947090986632
+armv7neon_mmm_f32_8x6_cortexa7 9 128 18 0.00004088792544048485
+armv7neon_mmm_f32_8x4_generic 9 128 8 0.000015874731663155136
+armv7neon_mmm_f32_8x4_generic 16 128 5 0.000015914646023810615
+armv7neon_mmm_f32_8x6_cortexa7 17 32 18 0.000019117240551030633
+armv7neon_mmm_f32_8x4_cortexa9 7 4 11 0.0000026740706182831685
+armv7neon_mmm_f32_8x4_cortexa7 9 128 13 0.00004271080548146737
+armv7neon_mmm_f32_8x6_generic 16 128 11 0.000020976534312868846
+armv7neon_mmm_f32_8x4_cortexa7 16 128 7 0.00002177826510527881
+armv7neon_mmm_f32_8x4_generic 15 32 7 0.000006163372672989967
+armv7neon_mmm_f32_8x4_generic 9 128 11 0.000023697623425846144
+armv7neon_mmm_f32_8x6_cortexa7 16 4 6 0.0000018480490329829836
+armv7neon_mmm_f32_8x4_cortexa9 25 4 7 0.00000554548266542983
+armv7neon_mmm_f32_8x6_generic 15 128 13 0.00003163198682165344
+armv7neon_mmm_f32_8x6_cortexa7 25 32 18 0.000025184955719578352
+armv7neon_mmm_f32_8x6_generic 8 4 6 0.000001113907974305815
+armv7neon_mmm_f32_8x6_cortexa7 16 32 5 0.0000050492542920704264
+armv7neon_mmm_f32_8x6_generic 8 32 19 0.000007097287986832066
+armv7neon_mmm_f32_8x6_cortexa9 25 32 5 0.000007806390304931795
+armv7neon_mmm_f32_8x6_cortexa9 15 128 11 0.000021399956173572563
+armv7neon_mmm_f32_8x6_generic 16 32 6 0.000003721529767067075
+armv7neon_mmm_f32_8x4_cortexa7 17 32 5 0.000010843745704621824
+armv7neon_mmm_f32_8x4_generic 7 128 7 0.00000843066632668338
+armv7neon_mmm_f32_8x4_generic 23 32 13 0.000016850629087203773
+armv7neon_mmm_f32_8x4_cortexa9 16 32 9 0.000008424770761452617
+armv7neon_mmm_f32_8x4_cortexa9 15 32 13 0.000011652714145510248
+armv7neon_mmm_f32_8x4_generic 23 128 5 0.000023910719482642124
+armv7neon_mmm_f32_8x4_cortexa9 23 128 3 0.000012498528953898262
+armv7neon_mmm_f32_8x4_generic 16 32 8 0.000005593957529271591
+armv7neon_mmm_f32_8x6_cortexa7 7 128 13 0.000021419187824431687
+armv7neon_mmm_f32_8x6_cortexa9 7 128 6 0.000005900049478814555
+armv7neon_mmm_f32_8x4_cortexa7 24 32 11 0.000015824034451655578
+armv7neon_mmm_f32_8x6_cortexa7 16 4 17 0.000004987657461648502
+armv7neon_mmm_f32_8x4_cortexa7 23 32 8 0.000010735439695395534
+armv7neon_mmm_f32_8x4_cortexa9 25 4 5 0.000005392929349101573
+armv7neon_mmm_f32_8x4_generic 25 4 5 0.000005418182338760064
+armv7neon_mmm_f32_8x4_generic 7 128 9 0.000012382199177609924
+armv7neon_mmm_f32_8x4_cortexa9 9 32 9 0.000008579188782470308
+armv7neon_mmm_f32_8x4_generic 8 128 12 0.000011801915167566902
+armv7neon_mmm_f32_8x4_cortexa9 23 4 8 0.0000041152642927900605
+armv7neon_mmm_f32_8x4_generic 7 4 11 0.0000026764041239252324
+generic_f32_4x4 13 128 12 0.00004319526080896362
+generic_f32_4x4 5 32 5 0.000005725142687375275
+generic_f32_4x4 5 128 12 0.00002200938035136394
+armv7neon_mmm_f32_8x4_generic 23 4 5 0.000004369620528795486
+armv7neon_mmm_f32_8x6_generic 9 4 11 0.000003424705713246475
+armv7neon_mmm_f32_8x6_generic 17 32 12 0.000010395428642618952
+generic_f32_4x4 13 32 11 0.000015867743214986118
+armv7neon_mmm_f32_8x6_cortexa9 9 32 17 0.000010850143335646025
+armv7neon_mmm_f32_8x6_cortexa9 23 4 6 0.0000028251957000612876
+armv7neon_mmm_f32_8x6_cortexa9 9 128 7 0.000020914788448206477
+armv7neon_mmm_f32_8x4_cortexa7 15 128 11 0.000032581738066809436
+armv7neon_mmm_f32_8x4_cortexa9 25 128 3 0.000016370587474295946
+armv7neon_mmm_f32_8x4_cortexa7 23 4 4 0.0000024826703257808083
+armv7neon_mmm_f32_8x4_generic 9 128 4 0.000008108848864529598
+armv7neon_mmm_f32_8x6_cortexa9 25 128 5 0.00002136490649839919
+armv7neon_mmm_f32_8x6_generic 23 128 19 0.00006253554336626349
+generic_f32_4x4 12 4 12 0.000005236763691820686
+armv7neon_mmm_f32_8x6_cortexa9 8 4 7 0.0000019309843288986318
+armv7neon_mmm_f32_8x4_cortexa7 15 4 5 0.000003357403897484634
+armv7neon_mmm_f32_8x6_cortexa9 24 4 6 0.0000024072433754401144
+armv7neon_mmm_f32_8x6_cortexa7 25 128 7 0.000054619400677372974
+armv7neon_mmm_f32_8x6_generic 7 4 19 0.000004265433096040261
+generic_f32_4x4 7 4 12 0.0000041235655802886916
+armv7neon_mmm_f32_8x6_generic 17 128 17 0.000046385914441781564
+armv7neon_mmm_f32_8x4_generic 8 32 7 0.0000032122117166780275
+armv7neon_mmm_f32_8x4_cortexa9 16 4 7 0.0000030113926906216977
+generic_f32_4x4 13 128 13 0.00005791151370731608
+armv7neon_mmm_f32_8x4_cortexa7 9 32 9 0.000010771893810221605
+armv7neon_mmm_f32_8x4_cortexa7 23 4 7 0.000004820473947807521
+armv7neon_mmm_f32_8x4_cortexa9 24 4 13 0.00000731015216867438
+generic_f32_4x4 3 4 5 0.0000018607872137198219
+armv7neon_mmm_f32_8x4_generic 15 32 9 0.000008854455084657494
+armv7neon_mmm_f32_8x6_cortexa9 15 32 6 0.000004174355801441308
+armv7neon_mmm_f32_8x6_cortexa9 16 4 5 0.000002250377451226225
+armv7neon_mmm_f32_8x6_cortexa9 16 32 19 0.000013757586520904355
+armv7neon_mmm_f32_8x6_cortexa9 24 128 11 0.000031276431090202235
+armv7neon_mmm_f32_8x4_generic 25 128 7 0.00003164406261833124
+armv7neon_mmm_f32_8x4_generic 24 4 4 0.0000020584132727266
+generic_f32_4x4 7 128 11 0.000022350186275600858
+armv7neon_mmm_f32_8x4_cortexa9 23 4 4 0.0000022753692017814563
+armv7neon_mmm_f32_8x6_cortexa9 17 32 19 0.00002069369907930334
+generic_f32_4x4 4 128 13 0.00001478326569151377
+armv7neon_mmm_f32_8x6_cortexa9 23 32 18 0.00001626278262596922
+armv7neon_mmm_f32_8x6_generic 7 128 18 0.000016667585395966858
+armv7neon_mmm_f32_8x4_cortexa9 8 128 3 0.000004452778028984887
+armv7neon_mmm_f32_8x4_cortexa7 16 128 13 0.000042499574729319346
+generic_f32_4x4 3 32 11 0.000004620575096918176
+generic_f32_4x4 4 4 8 0.0000015603089093623964
+armv7neon_mmm_f32_8x6_cortexa9 15 32 7 0.00000772763531212548
+armv7neon_mmm_f32_8x4_cortexa7 9 4 11 0.000004505536773333099
+armv7neon_mmm_f32_8x6_cortexa9 15 32 18 0.000011416109977905379
+armv7neon_mmm_f32_8x4_cortexa7 24 128 12 0.00004730343110835064
+armv7neon_mmm_f32_8x4_cortexa9 8 4 7 0.000001726865000473268
+armv7neon_mmm_f32_8x4_cortexa9 17 4 12 0.00000554999550214869
+armv7neon_mmm_f32_8x4_cortexa9 24 32 3 0.000004919761516416024
+armv7neon_mmm_f32_8x4_cortexa7 7 4 7 0.0000020327105105148146
+generic_f32_4x4 8 32 11 0.000008129741741591272
+armv7neon_mmm_f32_8x6_cortexa9 17 128 6 0.000015685032106978185
+armv7neon_mmm_f32_8x6_cortexa7 24 4 13 0.0000069628322548194484
+generic_f32_4x4 11 32 7 0.000008418658676221946
+armv7neon_mmm_f32_8x4_cortexa9 23 32 13 0.000016859244394935546
+armv7neon_mmm_f32_8x6_cortexa7 15 4 19 0.000007361880271251662
+armv7neon_mmm_f32_8x6_cortexa7 25 128 19 0.00010862057908423823
+generic_f32_4x4 7 128 5 0.00001508121452285347
+armv7neon_mmm_f32_8x4_generic 8 4 12 0.0000020475145659989298
+armv7neon_mmm_f32_8x6_generic 23 32 6 0.000005742234510888845
+armv7neon_mmm_f32_8x6_cortexa9 8 32 7 0.000003926599359988243
+armv7neon_mmm_f32_8x4_cortexa7 16 32 5 0.00000738840656931522
+generic_f32_4x4 3 128 5 0.000007875053042737703
+armv7neon_mmm_f32_8x6_cortexa9 9 128 5 0.000010860557674890965
+armv7neon_mmm_f32_8x6_generic 15 32 11 0.000007797932920988772
+generic_f32_4x4 8 4 3 0.0000019148150797610994
+armv7neon_mmm_f32_8x4_generic 23 32 9 0.000012836974653040903
+armv7neon_mmm_f32_8x4_cortexa7 25 4 3 0.0000035757992765103294
+armv7neon_mmm_f32_8x4_generic 7 32 8 0.0000034181328210189654
+armv7neon_mmm_f32_8x6_cortexa9 25 32 7 0.000014158102081231813
+armv7neon_mmm_f32_8x6_generic 9 4 18 0.000004486229240869606
+armv7neon_mmm_f32_8x4_cortexa7 8 4 4 0.0000010559381010362017
+armv7neon_mmm_f32_8x4_cortexa9 25 32 3 0.000006292048691193805
+armv7neon_mmm_f32_8x4_cortexa9 24 4 11 0.000005847758703659802
+armv7neon_mmm_f32_8x4_cortexa9 7 128 12 0.00001242120677392976
+armv7neon_mmm_f32_8x4_cortexa7 16 128 9 0.00003210848238315819
+armv7neon_mmm_f32_8x4_cortexa9 24 128 7 0.00002382518082369683
+armv7neon_mmm_f32_8x4_generic 24 32 3 0.0000049198649021802165
+armv7neon_mmm_f32_8x4_cortexa9 25 4 11 0.000007781278370753281
+armv7neon_mmm_f32_8x6_cortexa7 17 32 12 0.000012940679584474003
+armv7neon_mmm_f32_8x6_cortexa7 16 32 17 0.000013190821660739104
+armv7neon_mmm_f32_8x6_cortexa7 25 4 19 0.000012097338566670792
+armv7neon_mmm_f32_8x6_generic 16 32 18 0.000010138504593611297
+armv7neon_mmm_f32_8x4_generic 9 128 5 0.000015947315101595927
+armv7neon_mmm_f32_8x6_cortexa9 23 4 5 0.0000031081830446744932
+armv7neon_mmm_f32_8x6_cortexa9 24 128 7 0.00003103549849311001
+armv7neon_mmm_f32_8x4_cortexa7 25 4 13 0.000010646545231619916
+generic_f32_4x4 9 128 12 0.00003260110763311798
+armv7neon_mmm_f32_8x6_cortexa9 23 4 17 0.000007631985122319121
+armv7neon_mmm_f32_8x4_cortexa9 8 128 13 0.00001574791956845722
+armv7neon_mmm_f32_8x6_cortexa9 23 4 18 0.000007303182081264746
+armv7neon_mmm_f32_8x4_cortexa7 24 128 13 0.00006349168486923089
+armv7neon_mmm_f32_8x4_generic 23 32 11 0.000012994166844502801
+generic_f32_4x4 7 32 4 0.0000031041482961797043
+armv7neon_mmm_f32_8x4_cortexa9 9 32 7 0.000005959447070414145
+armv7neon_mmm_f32_8x4_cortexa7 23 128 11 0.00004845207607490223
+armv7neon_mmm_f32_8x6_cortexa7 17 4 11 0.0000052419075338197185
+generic_f32_4x4 8 4 13 0.00000500809427570026
+armv7neon_mmm_f32_8x4_generic 24 32 7 0.000008731713037367055
+armv7neon_mmm_f32_8x6_cortexa9 25 128 13 0.00006164900543131858
+armv7neon_mmm_f32_8x6_cortexa9 17 4 7 0.000004817790997463295
+armv7neon_mmm_f32_8x6_cortexa9 8 32 19 0.000007141501791335855
+armv7neon_mmm_f32_8x4_cortexa9 16 4 4 0.000001530544742707957
+armv7neon_mmm_f32_8x6_generic 25 4 17 0.000008895187762533163
+armv7neon_mmm_f32_8x4_cortexa7 23 4 3 0.0000028668169166829235
+armv7neon_mmm_f32_8x6_cortexa7 24 128 11 0.00004128096933839809
+generic_f32_4x4 7 4 13 0.000005447538386200191
+armv7neon_mmm_f32_8x6_cortexa9 9 128 17 0.0000311696244066944
+armv7neon_mmm_f32_8x6_cortexa7 25 4 17 0.000009561336620584211
+armv7neon_mmm_f32_8x4_generic 16 128 8 0.000015609467528565242
+armv7neon_mmm_f32_8x6_cortexa9 7 4 6 0.0000015274572782560943
+armv7neon_mmm_f32_8x6_cortexa9 7 4 19 0.000004327266570693739
+generic_f32_4x4 4 128 3 0.0000042186208925339794
+armv7neon_mmm_f32_8x6_cortexa9 16 32 6 0.000003783089782648409
+armv7neon_mmm_f32_8x4_generic 24 4 12 0.000005240725524660556
+armv7neon_mmm_f32_8x6_generic 24 4 5 0.000003075012528242585
+armv7neon_mmm_f32_8x4_cortexa7 16 128 3 0.000011341419957358612
+generic_f32_4x4 13 32 7 0.000010885101829856154
+generic_f32_4x4 8 128 13 0.00002903953507476824
+armv7neon_mmm_f32_8x6_cortexa9 7 32 12 0.000004528441801230068
+armv7neon_mmm_f32_8x4_cortexa9 8 128 9 0.000011968397967338213
+generic_f32_4x4 9 4 11 0.000005879906070039223
+armv7neon_mmm_f32_8x4_generic 9 128 9 0.000023684049431562338
+armv7neon_mmm_f32_8x6_cortexa7 23 128 19 0.00008252903643379173
+armv7neon_mmm_f32_8x6_generic 23 128 7 0.000031381294348434004
+armv7neon_mmm_f32_8x4_generic 15 4 13 0.000005669760296114337
+armv7neon_mmm_f32_8x4_generic 15 4 11 0.000004507200935249084
+armv7neon_mmm_f32_8x4_generic 15 4 3 0.0000019251390208258458
+generic_f32_4x4 9 32 7 0.00000832450452449881
+generic_f32_4x4 9 4 9 0.000005828318767488796
+armv7neon_mmm_f32_8x4_cortexa7 16 4 3 0.000002093301960254744
+armv7neon_mmm_f32_8x4_generic 15 128 12 0.000023812063445324994
+armv7neon_mmm_f32_8x6_cortexa7 7 4 19 0.000004468409864573403
+generic_f32_4x4 8 4 7 0.000002955604897568221
+armv7neon_mmm_f32_8x6_cortexa9 9 32 12 0.000007238338427088594
+armv7neon_mmm_f32_8x6_generic 16 32 12 0.00000694068903036478
+armv7neon_mmm_f32_8x6_cortexa7 25 128 6 0.000027308370544312844
+armv7neon_mmm_f32_8x6_cortexa9 24 4 17 0.000006795821587814609
+armv7neon_mmm_f32_8x6_cortexa7 15 128 12 0.00002803261400606977
+generic_f32_4x4 9 32 8 0.000008044423595514706
+armv7neon_mmm_f32_8x4_generic 23 128 9 0.00003548379543389711
+armv7neon_mmm_f32_8x4_cortexa7 17 32 3 0.000005926615429744039
+armv7neon_mmm_f32_8x4_generic 8 128 7 0.000008235490634332163
+armv7neon_mmm_f32_8x4_cortexa7 9 4 12 0.000004255018272123806
+armv7neon_mmm_f32_8x4_cortexa9 16 32 11 0.000008524184894246237
+armv7neon_mmm_f32_8x4_cortexa9 24 32 4 0.000004264845803875226
+generic_f32_4x4 3 4 11 0.000002555428269031667
+armv7neon_mmm_f32_8x4_cortexa7 24 4 4 0.0000022261643758837446
+armv7neon_mmm_f32_8x4_cortexa7 15 128 3 0.000011306220407671872
+armv7neon_mmm_f32_8x6_generic 8 4 18 0.000002337607386621077
+armv7neon_mmm_f32_8x4_cortexa9 25 4 4 0.0000026788269418985845
+generic_f32_4x4 11 4 7 0.000004308040885452502
+armv7neon_mmm_f32_8x4_cortexa9 15 128 4 0.000008223517905022214
+generic_f32_4x4 5 32 11 0.000008288284199113315
+armv7neon_mmm_f32_8x6_cortexa9 9 4 12 0.0000032467302759009237
+armv7neon_mmm_f32_8x4_generic 24 128 7 0.00002382735987416326
+armv7neon_mmm_f32_8x6_generic 24 32 13 0.000015487434262812744
+armv7neon_mmm_f32_8x6_generic 17 128 13 0.00004629753684559049
+armv7neon_mmm_f32_8x6_cortexa7 24 128 13 0.00006117633525152988
+armv7neon_mmm_f32_8x6_generic 24 4 19 0.000008326315174295047
+armv7neon_mmm_f32_8x6_cortexa7 17 32 7 0.000013248073815175261
+armv7neon_mmm_f32_8x4_cortexa9 8 4 9 0.0000022033600613983073
+generic_f32_4x4 13 4 12 0.000007132280989099253
+armv7neon_mmm_f32_8x4_cortexa7 16 128 4 0.000010902014380146801
+armv7neon_mmm_f32_8x6_cortexa7 9 4 6 0.0000019491119088955803
+generic_f32_4x4 3 4 7 0.0000019042920757312053
+armv7neon_mmm_f32_8x4_generic 7 32 5 0.0000033764800411714497
+armv7neon_mmm_f32_8x6_cortexa7 23 4 19 0.000010215694576435441
+armv7neon_mmm_f32_8x4_generic 17 4 3 0.0000025969094160852644
+generic_f32_4x4 5 4 11 0.0000041349188443565096
+armv7neon_mmm_f32_8x6_cortexa7 9 32 17 0.000013295032359470915
+armv7neon_mmm_f32_8x4_cortexa7 15 128 13 0.00004313466302520057
+armv7neon_mmm_f32_8x4_cortexa9 15 4 8 0.0000030055981545968782
+armv7neon_mmm_f32_8x6_generic 16 4 12 0.0000029570565051151263
+armv7neon_mmm_f32_8x6_cortexa9 8 4 11 0.000002014524477563071
+armv7neon_mmm_f32_8x4_cortexa9 8 4 3 0.0000012065261711370315
+armv7neon_mmm_f32_8x4_cortexa7 7 32 7 0.000004136959156912223
+armv7neon_mmm_f32_8x4_generic 17 4 9 0.000005894535633035915
+generic_f32_4x4 9 4 13 0.00000752174902729491
+armv7neon_mmm_f32_8x6_cortexa9 7 32 5 0.000002364950726746234
+generic_f32_4x4 13 32 4 0.000005521387429490463
+armv7neon_mmm_f32_8x4_generic 8 128 3 0.000004474358471860054
+armv7neon_mmm_f32_8x4_cortexa7 23 4 12 0.0000063640165444956975
+armv7neon_mmm_f32_8x4_generic 7 32 4 0.0000019290673256470733
+armv7neon_mmm_f32_8x4_generic 17 32 12 0.000012180617018748219
+armv7neon_mmm_f32_8x6_generic 17 4 11 0.000004899442085076861
+armv7neon_mmm_f32_8x4_cortexa7 24 128 7 0.000032404931744711066
+armv7neon_mmm_f32_8x4_cortexa9 16 32 12 0.000008099332243893017
+armv7neon_mmm_f32_8x6_generic 9 128 18 0.00003081307981321012
+armv7neon_mmm_f32_8x6_generic 15 128 11 0.00002132277281707897
+armv7neon_mmm_f32_8x6_cortexa7 7 128 6 0.000007540283032400861
+armv7neon_mmm_f32_8x6_generic 25 32 12 0.000013597212519317418
+armv7neon_mmm_f32_8x6_generic 23 4 17 0.000007516814516170969
+armv7neon_mmm_f32_8x6_cortexa7 24 32 11 0.000013420883457543118
+generic_f32_4x4 9 32 11 0.000012087233281631852
+armv7neon_mmm_f32_8x6_cortexa7 24 4 17 0.000007204721001644005
+armv7neon_mmm_f32_8x6_cortexa7 25 32 7 0.000017455571757923716
+armv7neon_mmm_f32_8x4_cortexa9 23 32 11 0.000012992255106920675
+armv7neon_mmm_f32_8x4_cortexa9 9 4 9 0.000004090278120441601
+armv7neon_mmm_f32_8x4_cortexa7 23 32 13 0.000021193261455653347
+armv7neon_mmm_f32_8x6_cortexa7 24 32 19 0.00002532279550192562
+armv7neon_mmm_f32_8x4_generic 25 128 11 0.00004692401023814237
+armv7neon_mmm_f32_8x4_cortexa9 17 32 7 0.000008780640229470475
+armv7neon_mmm_f32_8x4_cortexa7 9 128 9 0.00003233195046796576
+armv7neon_mmm_f32_8x4_cortexa7 23 128 5 0.000032464808470770534
+armv7neon_mmm_f32_8x4_cortexa9 9 4 5 0.00000293936322263601
+armv7neon_mmm_f32_8x6_cortexa7 25 32 6 0.000008732005752739124
+armv7neon_mmm_f32_8x6_cortexa9 8 4 5 0.0000013801093771363888
+armv7neon_mmm_f32_8x6_cortexa7 16 128 6 0.000013837384210811002
+armv7neon_mmm_f32_8x4_generic 23 32 8 0.000008530859801735756
+armv7neon_mmm_f32_8x4_cortexa7 17 4 8 0.000004162931637169246
+armv7neon_mmm_f32_8x6_generic 7 32 13 0.000006254936643865642
+armv7neon_mmm_f32_8x6_cortexa7 9 32 7 0.00000901443584736606
+armv7neon_mmm_f32_8x4_cortexa7 15 4 7 0.0000034347738366310684
+generic_f32_4x4 9 32 13 0.000015744382160735945
+armv7neon_mmm_f32_8x6_generic 9 32 18 0.000010478111403829218
+armv7neon_mmm_f32_8x6_cortexa9 23 128 11 0.00003166377905937062
+armv7neon_mmm_f32_8x4_generic 15 128 7 0.000016210803535229283
+armv7neon_mmm_f32_8x4_cortexa7 23 4 9 0.000006649300631510142
+armv7neon_mmm_f32_8x4_generic 17 32 3 0.000004810545788546653
+armv7neon_mmm_f32_8x6_generic 24 32 12 0.000010144077161084113
+armv7neon_mmm_f32_8x4_cortexa9 15 32 7 0.000006152877670399033
+armv7neon_mmm_f32_8x4_cortexa9 23 4 12 0.000005900735151833346
+armv7neon_mmm_f32_8x4_cortexa7 16 32 4 0.0000037548262311678015
+armv7neon_mmm_f32_8x6_cortexa7 23 4 18 0.000007713970848848978
+armv7neon_mmm_f32_8x6_cortexa7 16 32 6 0.000004565838055826264
+armv7neon_mmm_f32_8x4_cortexa9 25 32 5 0.000011311149331127226
+armv7neon_mmm_f32_8x6_cortexa9 16 128 18 0.00003053013458613619
+armv7neon_mmm_f32_8x6_cortexa7 15 32 17 0.000013972664044561828
+armv7neon_mmm_f32_8x4_generic 16 32 4 0.000003010794297325924
+armv7neon_mmm_f32_8x6_generic 7 128 7 0.000011021195155003848
+generic_f32_4x4 4 32 13 0.000005510533372512351
+armv7neon_mmm_f32_8x4_cortexa9 16 32 7 0.000005969619454408248
+armv7neon_mmm_f32_8x4_cortexa9 24 32 12 0.000011900133300984794
+armv7neon_mmm_f32_8x4_cortexa7 8 128 3 0.000005902896580930324
+armv7neon_mmm_f32_8x4_generic 16 32 3 0.000003439802295093395
+armv7neon_mmm_f32_8x4_cortexa7 17 32 4 0.000005503311716430611
+armv7neon_mmm_f32_8x6_generic 9 32 12 0.000007171024246490819
+armv7neon_mmm_f32_8x6_cortexa7 23 32 5 0.000007315917802939159
+armv7neon_mmm_f32_8x6_cortexa7 23 128 12 0.0000413645417403771
+armv7neon_mmm_f32_8x4_generic 16 32 7 0.000005981518355563633
+generic_f32_4x4 7 4 7 0.000003097228954014147
+armv7neon_mmm_f32_8x6_cortexa7 24 4 18 0.000006499393673193748
+armv7neon_mmm_f32_8x6_cortexa9 24 4 7 0.000004723972074276078
+armv7neon_mmm_f32_8x4_cortexa7 16 32 11 0.00001075379201180745
+generic_f32_4x4 5 32 8 0.000005591832396338314
+armv7neon_mmm_f32_8x6_cortexa7 7 4 17 0.000003519015871119777
+armv7neon_mmm_f32_8x6_generic 25 4 13 0.000008668362849414556
+armv7neon_mmm_f32_8x4_cortexa7 15 128 4 0.000011117502498464982
+armv7neon_mmm_f32_8x6_cortexa7 15 4 17 0.000005776336577667951
+armv7neon_mmm_f32_8x4_cortexa9 8 32 13 0.000005695220879131919
+armv7neon_mmm_f32_8x4_cortexa7 24 4 12 0.000005659072390762546
+armv7neon_mmm_f32_8x4_generic 23 128 4 0.00001202844464043909
+armv7neon_mmm_f32_8x6_generic 17 128 7 0.00003102141310178231
+armv7neon_mmm_f32_8x4_generic 16 128 11 0.00002360551094138009
+armv7neon_mmm_f32_8x4_cortexa7 25 128 9 0.00006388979998675838
+armv7neon_mmm_f32_8x4_generic 9 32 9 0.00000858128920884699
+armv7neon_mmm_f32_8x6_cortexa7 9 4 12 0.0000034111339123867222
+armv7neon_mmm_f32_8x6_cortexa9 17 128 11 0.00003127371093460866
+armv7neon_mmm_f32_8x4_cortexa9 15 128 11 0.00002401650595136234
+armv7neon_mmm_f32_8x6_cortexa9 23 128 12 0.00003137052216487605
+armv7neon_mmm_f32_8x6_cortexa9 16 128 17 0.00003103196066568324
+generic_f32_4x4 7 32 3 0.000003268544296254094
+armv7neon_mmm_f32_8x4_generic 9 4 8 0.0000027781499181131053
+armv7neon_mmm_f32_8x4_cortexa7 16 32 9 0.000010640273784126178
+armv7neon_mmm_f32_8x4_cortexa9 25 4 9 0.00000761410675406794
+generic_f32_4x4 3 4 12 0.00000255471395132772
+generic_f32_4x4 12 32 13 0.00001543431169651167
+armv7neon_mmm_f32_8x6_cortexa9 15 4 7 0.0000037646981265010737
+armv7neon_mmm_f32_8x4_cortexa7 7 4 9 0.0000027630250787792764
+armv7neon_mmm_f32_8x6_cortexa9 15 4 13 0.000005407861539689391
+armv7neon_mmm_f32_8x4_cortexa7 24 4 5 0.000004483592435521624
+armv7neon_mmm_f32_8x6_cortexa9 16 128 5 0.000010995701297253243
+armv7neon_mmm_f32_8x4_cortexa9 7 4 13 0.000003344708907386396
+generic_f32_4x4 3 4 4 0.0000012029323677452384
+armv7neon_mmm_f32_8x6_cortexa7 8 4 5 0.0000014134340225429588
+armv7neon_mmm_f32_8x4_cortexa7 16 4 5 0.000003166142335812665
+armv7neon_mmm_f32_8x6_cortexa9 15 4 17 0.000005534126802188086
+armv7neon_mmm_f32_8x4_generic 17 128 4 0.000011910311848497855
+armv7neon_mmm_f32_8x4_cortexa9 17 128 9 0.00003520115855849476
+armv7neon_mmm_f32_8x6_cortexa7 24 128 17 0.00006131644295051564
+armv7neon_mmm_f32_8x4_generic 8 32 12 0.000004263261337563466
+armv7neon_mmm_f32_8x4_generic 15 128 13 0.000031724034290535855
+generic_f32_4x4 13 128 4 0.000014842273639272524
+armv7neon_mmm_f32_8x6_generic 16 128 17 0.00003115527449889593
+armv7neon_mmm_f32_8x4_cortexa7 25 128 13 0.00008540107215655228
+armv7neon_mmm_f32_8x4_cortexa9 17 4 8 0.000003867693626667699
+armv7neon_mmm_f32_8x6_generic 9 4 6 0.0000018447023821962963
+armv7neon_mmm_f32_8x4_generic 15 4 5 0.0000031342530980756114
+armv7neon_mmm_f32_8x4_cortexa9 25 32 11 0.000016711914928453156
+armv7neon_mmm_f32_8x4_cortexa9 9 4 7 0.0000029971874356879734
+armv7neon_mmm_f32_8x4_cortexa9 17 32 8 0.000008299083390781581
+armv7neon_mmm_f32_8x6_generic 9 128 11 0.000020922916297729766
+armv7neon_mmm_f32_8x6_cortexa9 25 4 11 0.000006455717226984232
+generic_f32_4x4 9 128 13 0.00004380056172414816
+armv7neon_mmm_f32_8x4_generic 9 4 3 0.0000018459881520629983
+generic_f32_4x4 13 32 13 0.000020718460422326147
+generic_f32_4x4 11 128 11 0.00003310679264480642
+armv7neon_mmm_f32_8x4_cortexa7 17 128 12 0.00004762075330394028
+armv7neon_mmm_f32_8x6_cortexa9 25 4 6 0.0000031406906098450826
+armv7neon_mmm_f32_8x4_cortexa7 15 4 3 0.0000020595941028725178
+generic_f32_4x4 4 32 5 0.000003094039509493947
+armv7neon_mmm_f32_8x4_cortexa7 17 4 13 0.000008167970707514618
+armv7neon_mmm_f32_8x6_cortexa9 16 128 7 0.000020852896757305592
+armv7neon_mmm_f32_8x4_generic 24 32 12 0.000011875110028324704
+armv7neon_mmm_f32_8x6_cortexa7 23 32 6 0.0000070036066823672265
+armv7neon_mmm_f32_8x4_generic 24 32 9 0.000012379847822938195
+armv7neon_mmm_f32_8x6_generic 16 32 7 0.000007288146983219525
+armv7neon_mmm_f32_8x4_generic 8 32 4 0.0000017213389522499293
+armv7neon_mmm_f32_8x6_cortexa7 25 32 12 0.00001698633380929768
+armv7neon_mmm_f32_8x4_generic 25 32 8 0.000010843062239811664
+armv7neon_mmm_f32_8x6_generic 16 128 5 0.000010970239776322642
+armv7neon_mmm_f32_8x6_cortexa9 23 32 12 0.00001105222841817834
+armv7neon_mmm_f32_8x4_cortexa7 7 4 12 0.0000028259675178908435
+armv7neon_mmm_f32_8x4_generic 17 32 13 0.000016456626258678596
+armv7neon_mmm_f32_8x6_generic 23 32 19 0.000021489093420466062
+armv7neon_mmm_f32_8x6_cortexa9 9 32 7 0.000007388528782309879
+armv7neon_mmm_f32_8x6_generic 16 128 13 0.0000308218952023933
+armv7neon_mmm_f32_8x4_cortexa9 24 128 4 0.000011799248370378306
+armv7neon_mmm_f32_8x6_cortexa7 15 4 5 0.000002311967551514126
+armv7neon_mmm_f32_8x6_cortexa9 15 32 19 0.000015002244448339703
+armv7neon_mmm_f32_8x4_cortexa7 25 32 4 0.000007113231524121585
+armv7neon_mmm_f32_8x4_generic 9 4 9 0.0000040996991473354
+generic_f32_4x4 11 4 11 0.00000604219727392619
+armv7neon_mmm_f32_8x4_generic 23 128 12 0.0000351784079527385
+armv7neon_mmm_f32_8x4_generic 8 32 8 0.0000029996111096440974
+generic_f32_4x4 3 128 11 0.000011575521961547323
+armv7neon_mmm_f32_8x6_generic 17 4 19 0.000008636925572322338
+armv7neon_mmm_f32_8x6_cortexa7 17 32 17 0.000019651252774271738
+armv7neon_mmm_f32_8x4_cortexa7 25 32 5 0.000014279031192163009
+armv7neon_mmm_f32_8x6_cortexa9 7 128 5 0.000005734808712355398
+armv7neon_mmm_f32_8x4_cortexa9 15 4 5 0.0000031170038028798296
+armv7neon_mmm_f32_8x4_cortexa9 24 128 13 0.000046424963618177915
+armv7neon_mmm_f32_8x6_generic 16 4 5 0.0000022248730615242665
+generic_f32_4x4 8 128 9 0.0000220091711298822
+armv7neon_mmm_f32_8x6_cortexa9 24 32 17 0.000015790292495872178
+armv7neon_mmm_f32_8x6_cortexa7 25 4 12 0.000006064814099731578
+armv7neon_mmm_f32_8x4_generic 9 32 3 0.0000033141535571016215
+armv7neon_mmm_f32_8x4_cortexa7 24 32 7 0.000010955777874109968
+armv7neon_mmm_f32_8x6_generic 7 32 19 0.000008240412026502634
+armv7neon_mmm_f32_8x6_generic 15 128 12 0.000021273134898855332
+armv7neon_mmm_f32_8x4_generic 9 128 12 0.000023427967692188372
+generic_f32_4x4 9 128 4 0.000011233611698379809
+armv7neon_mmm_f32_8x4_generic 23 32 3 0.000004898603928972602
+armv7neon_mmm_f32_8x4_cortexa9 15 4 7 0.000003196236363013327
+generic_f32_4x4 13 128 9 0.000043631376927673965
+armv7neon_mmm_f32_8x6_cortexa7 8 32 13 0.000006769099621081802
+armv7neon_mmm_f32_8x6_cortexa9 16 32 11 0.000007489162129911177
+armv7neon_mmm_f32_8x6_cortexa7 15 4 11 0.000004047240611447739
+armv7neon_mmm_f32_8x6_cortexa9 8 128 12 0.000010521407486118232
+armv7neon_mmm_f32_8x6_generic 24 128 12 0.000030475350289631447
+armv7neon_mmm_f32_8x6_cortexa9 8 4 18 0.0000023963651215663894
+armv7neon_mmm_f32_8x4_generic 7 4 13 0.0000033498033347340617
+armv7neon_mmm_f32_8x6_cortexa7 24 128 5 0.000021225542006705262
+armv7neon_mmm_f32_8x6_generic 8 128 7 0.000010658035873946595
+armv7neon_mmm_f32_8x6_cortexa9 16 128 13 0.00003086535920353521
+armv7neon_mmm_f32_8x6_cortexa7 9 4 13 0.000005015620872516637
+armv7neon_mmm_f32_8x6_cortexa9 8 4 13 0.0000025412882105309308
+armv7neon_mmm_f32_8x4_cortexa9 24 32 11 0.000012564889907233448
+armv7neon_mmm_f32_8x4_generic 7 128 13 0.000016356824884577277
+armv7neon_mmm_f32_8x4_cortexa7 15 32 11 0.00001113088620100476
+armv7neon_mmm_f32_8x6_generic 23 32 18 0.000016159850846463288
+armv7neon_mmm_f32_8x6_generic 15 32 19 0.000014889331973563625
+armv7neon_mmm_f32_8x4_generic 7 128 11 0.000012426167611771446
+generic_f32_4x4 4 4 4 0.000001047996290990507
+armv7neon_mmm_f32_8x4_generic 23 4 9 0.000006177048756338135
+armv7neon_mmm_f32_8x6_cortexa9 16 4 11 0.0000034974057138003183
+armv7neon_mmm_f32_8x6_cortexa9 7 128 12 0.00001128966410991496
+armv7neon_mmm_f32_8x4_cortexa9 9 4 8 0.0000027723235031605195
+armv7neon_mmm_f32_8x4_generic 7 128 12 0.000012422711675746062
+armv7neon_mmm_f32_8x4_cortexa7 25 128 12 0.0000632498211882511
+armv7neon_mmm_f32_8x4_generic 7 128 5 0.000008380542348646574
+armv7neon_mmm_f32_8x4_cortexa9 17 4 3 0.000002593764492859534
+generic_f32_4x4 9 4 3 0.0000025396068006223777
+generic_f32_4x4 9 4 12 0.0000055571336153173335
+armv7neon_mmm_f32_8x4_cortexa7 16 128 8 0.000021301840263398342
+armv7neon_mmm_f32_8x4_generic 9 32 12 0.000008360119586696751
+armv7neon_mmm_f32_8x6_cortexa9 23 32 13 0.00001642300983789001
+armv7neon_mmm_f32_8x4_cortexa7 9 32 13 0.000014115360584179656
+armv7neon_mmm_f32_8x4_generic 7 4 4 0.0000011988761531825325
+generic_f32_4x4 4 32 12 0.000004173516705669169
+armv7neon_mmm_f32_8x6_cortexa7 8 4 17 0.000002748952290655516
+armv7neon_mmm_f32_8x6_cortexa9 15 128 17 0.000031856750398890617
+armv7neon_mmm_f32_8x4_cortexa9 17 128 4 0.00001190476277599742
+armv7neon_mmm_f32_8x6_cortexa7 23 4 17 0.000008057629471932516
+armv7neon_mmm_f32_8x4_cortexa7 24 128 9 0.00004787648599530554
+armv7neon_mmm_f32_8x4_cortexa7 23 4 5 0.000004689549145067684
+armv7neon_mmm_f32_8x6_cortexa9 17 4 18 0.000006445994468840472
+armv7neon_mmm_f32_8x6_generic 9 128 17 0.00003107927056434744
+armv7neon_mmm_f32_8x4_generic 24 128 8 0.000023288445002113136
+armv7neon_mmm_f32_8x6_cortexa9 16 32 17 0.000010725045258389392
+armv7neon_mmm_f32_8x4_cortexa7 17 128 11 0.00004814359255643903
+armv7neon_mmm_f32_8x4_generic 7 4 3 0.0000011794687664451176
+generic_f32_4x4 7 32 5 0.000005808524626140803
+armv7neon_mmm_f32_8x6_generic 24 32 11 0.000010897102444398395
+armv7neon_mmm_f32_8x4_generic 7 4 7 0.0000019250466424784192
+armv7neon_mmm_f32_8x4_cortexa9 7 32 13 0.000006310599463529341
+generic_f32_4x4 4 32 8 0.000002964574479014875
+armv7neon_mmm_f32_8x6_cortexa7 17 4 12 0.000004769238750897463
+armv7neon_mmm_f32_8x6_generic 8 32 17 0.000005580365730578937
+armv7neon_mmm_f32_8x4_cortexa9 7 4 5 0.0000018767262951495324
+armv7neon_mmm_f32_8x4_cortexa9 9 128 3 0.000008376722642527261
+generic_f32_4x4 8 32 8 0.000005427007518436356
+generic_f32_4x4 3 32 7 0.000003269319162898471
+armv7neon_mmm_f32_8x6_generic 23 128 11 0.000031615917676258355
+generic_f32_4x4 11 32 3 0.000004643343538119153
+armv7neon_mmm_f32_8x6_generic 17 4 13 0.000006685914546845586
+armv7neon_mmm_f32_8x4_cortexa9 7 128 13 0.000016356503104634956
+armv7neon_mmm_f32_8x6_generic 8 4 11 0.000001987955413940483
+armv7neon_mmm_f32_8x6_cortexa9 23 32 6 0.0000058061790469852035
+armv7neon_mmm_f32_8x6_cortexa9 17 128 12 0.00003080777419909108
+generic_f32_4x4 3 4 13 0.00000320269810797615
+armv7neon_mmm_f32_8x6_cortexa9 17 32 12 0.000010479324211722284
+armv7neon_mmm_f32_8x4_cortexa9 7 4 4 0.000001192052740790223
+armv7neon_mmm_f32_8x4_cortexa9 24 128 12 0.00003457434886740767
+armv7neon_mmm_f32_8x6_cortexa9 23 32 11 0.000011348364973457056
+armv7neon_mmm_f32_8x4_generic 16 4 12 0.000003656453112482303
+armv7neon_mmm_f32_8x6_cortexa9 16 128 6 0.000010542301986141948
+armv7neon_mmm_f32_8x4_generic 24 128 4 0.000011845830939195392
+armv7neon_mmm_f32_8x6_cortexa7 9 32 18 0.000013032529505137248
+armv7neon_mmm_f32_8x4_generic 24 128 12 0.00003455867526442596
+armv7neon_mmm_f32_8x4_generic 23 128 7 0.00002405539721722316
+armv7neon_mmm_f32_8x4_cortexa9 23 32 4 0.000004484967279906347
+armv7neon_mmm_f32_8x4_cortexa9 17 32 4 0.000004369095064954239
+armv7neon_mmm_f32_8x6_cortexa7 8 4 12 0.000001838051428755667
+armv7neon_mmm_f32_8x4_generic 15 4 8 0.000003030902158049641
+armv7neon_mmm_f32_8x4_cortexa7 8 128 9 0.000016361621231447548
+armv7neon_mmm_f32_8x6_generic 15 4 5 0.000002193007903880869
+armv7neon_mmm_f32_8x6_cortexa9 24 32 18 0.000015047680763476656
+armv7neon_mmm_f32_8x6_generic 25 128 6 0.00002065407414270236
+armv7neon_mmm_f32_8x4_generic 25 32 9 0.00001655596981137993
+armv7neon_mmm_f32_8x4_cortexa7 7 4 8 0.000002046321793305707
+armv7neon_mmm_f32_8x4_generic 24 32 5 0.000008563586932451436
+armv7neon_mmm_f32_8x6_cortexa7 25 32 19 0.00003389329143278319
+generic_f32_4x4 3 4 3 0.0000011908985597580253
+generic_f32_4x4 11 128 13 0.000043852326643092945
+armv7neon_mmm_f32_8x6_cortexa9 7 32 6 0.0000025216564013293366
+generic_f32_4x4 8 128 11 0.00002204318798460829
+generic_f32_4x4 9 32 5 0.00000828023901975494
+armv7neon_mmm_f32_8x6_generic 25 4 11 0.000006352168944400001
+armv7neon_mmm_f32_8x4_cortexa9 24 32 5 0.000008590022043400085
+armv7neon_mmm_f32_8x4_generic 8 32 3 0.0000019449019781287236
+armv7neon_mmm_f32_8x4_cortexa7 24 128 8 0.0000317811111212887
+armv7neon_mmm_f32_8x4_cortexa9 7 128 9 0.000012372639572013476
+armv7neon_mmm_f32_8x6_generic 15 128 19 0.00004250719000509339
+armv7neon_mmm_f32_8x6_cortexa7 17 4 7 0.000005084587454728339
+armv7neon_mmm_f32_8x6_generic 15 32 5 0.000004169790668084077
+generic_f32_4x4 9 128 3 0.000011555287825087273
+armv7neon_mmm_f32_8x4_cortexa7 25 128 7 0.00004317760438423037
+generic_f32_4x4 8 32 12 0.00000779706808746927
+armv7neon_mmm_f32_8x6_generic 8 32 12 0.000003719914023610946
+armv7neon_mmm_f32_8x6_cortexa7 15 128 17 0.000041839968426337905
+armv7neon_mmm_f32_8x6_cortexa7 8 128 13 0.000020713409071600263
+armv7neon_mmm_f32_8x6_cortexa9 24 4 5 0.000003104317520805454
+armv7neon_mmm_f32_8x6_cortexa9 9 4 13 0.000004778161797091213
+armv7neon_mmm_f32_8x4_cortexa9 25 128 12 0.00004644468928643539
+generic_f32_4x4 11 4 9 0.000005973042856611991
+armv7neon_mmm_f32_8x4_cortexa7 23 128 8 0.00003217939318511314
+armv7neon_mmm_f32_8x6_generic 24 4 11 0.00000490822983265492
+armv7neon_mmm_f32_8x4_generic 17 4 12 0.000005542606566473033
+armv7neon_mmm_f32_8x6_cortexa9 16 32 5 0.000004236072973244724
+armv7neon_mmm_f32_8x6_cortexa7 24 32 12 0.000012668891408140166
+armv7neon_mmm_f32_8x6_cortexa7 17 128 19 0.00008180995267293593
+armv7neon_mmm_f32_8x4_cortexa7 8 4 9 0.0000023929041278364927
+armv7neon_mmm_f32_8x4_generic 17 4 11 0.000005996779011118073
+armv7neon_mmm_f32_8x4_generic 23 32 5 0.000008783310902330428
+armv7neon_mmm_f32_8x4_cortexa9 24 4 5 0.000004126054215412369
+armv7neon_mmm_f32_8x4_cortexa7 15 4 4 0.0000018664924896343623
+generic_f32_4x4 12 32 12 0.000011419939741453591
+armv7neon_mmm_f32_8x6_cortexa9 9 128 19 0.00004140314991254094
+armv7neon_mmm_f32_8x6_cortexa7 24 4 19 0.000008976504261442944
+armv7neon_mmm_f32_8x6_cortexa7 23 32 12 0.000013517680029411935
+armv7neon_mmm_f32_8x6_generic 23 128 17 0.00004767212397281204
+armv7neon_mmm_f32_8x6_cortexa7 9 128 12 0.000027472854173113385
+generic_f32_4x4 13 4 4 0.0000027493566687604084
+armv7neon_mmm_f32_8x6_cortexa7 7 32 7 0.000005097398974457722
+armv7neon_mmm_f32_8x4_generic 25 4 3 0.0000033394241283747185
+armv7neon_mmm_f32_8x4_cortexa7 25 4 5 0.000005839267769079541
+generic_f32_4x4 8 4 9 0.0000039467461932377605
+armv7neon_mmm_f32_8x6_cortexa7 8 32 6 0.0000025355396885136015
+armv7neon_mmm_f32_8x4_cortexa7 7 4 13 0.0000035454436358925364
+generic_f32_4x4 12 128 5 0.000022119589677203316
+armv7neon_mmm_f32_8x4_cortexa9 17 32 5 0.000008619398425928128
+generic_f32_4x4 5 32 12 0.00000809048145029521
+armv7neon_mmm_f32_8x6_generic 7 32 12 0.000004497135112528872
+armv7neon_mmm_f32_8x6_cortexa9 17 128 18 0.00004612047264624732
+armv7neon_mmm_f32_8x4_cortexa9 24 128 9 0.00003503924267370847
+armv7neon_mmm_f32_8x6_cortexa9 25 128 18 0.00006126219444229572
+armv7neon_mmm_f32_8x4_cortexa9 7 32 7 0.0000034009841492082025
+armv7neon_mmm_f32_8x6_cortexa7 8 32 18 0.0000065795660386672954
+armv7neon_mmm_f32_8x4_cortexa7 9 4 7 0.0000032333021816508883
+generic_f32_4x4 11 4 5 0.000004260253310204167
+armv7neon_mmm_f32_8x4_cortexa7 15 32 12 0.00001091466040761069
+armv7neon_mmm_f32_8x6_generic 25 128 17 0.00006242813577501301
+armv7neon_mmm_f32_8x6_cortexa7 23 128 17 0.00006230757214971725
+armv7neon_mmm_f32_8x4_cortexa7 9 4 3 0.000001972675027149826
+armv7neon_mmm_f32_8x6_generic 9 32 19 0.000013969471313273173
+armv7neon_mmm_f32_8x4_cortexa7 9 4 4 0.0000017499654922061095
+generic_f32_4x4 8 4 8 0.0000026159902565290668
+armv7neon_mmm_f32_8x6_generic 23 128 12 0.000031313730389866355
+armv7neon_mmm_f32_8x4_cortexa7 15 128 7 0.000021942752305858642
+armv7neon_mmm_f32_8x6_cortexa7 9 32 6 0.000004675388926372666
+armv7neon_mmm_f32_8x6_cortexa7 8 32 5 0.000002772093015202498
+armv7neon_mmm_f32_8x6_cortexa7 9 32 19 0.000017344416985284313
+armv7neon_mmm_f32_8x4_cortexa9 8 128 7 0.000008229649908268804
+generic_f32_4x4 3 128 12 0.000011565133509989783
+generic_f32_4x4 9 32 12 0.000011785850787897203
+armv7neon_mmm_f32_8x6_cortexa9 17 128 13 0.00004651964090971228
+armv7neon_mmm_f32_8x6_cortexa9 25 32 6 0.0000071127343061349165
+armv7neon_mmm_f32_8x6_cortexa7 25 128 13 0.00008178262598187339
+armv7neon_mmm_f32_8x4_cortexa9 25 32 13 0.000021697932623977668
+armv7neon_mmm_f32_8x4_cortexa9 16 4 11 0.0000040619610195960516
+armv7neon_mmm_f32_8x4_generic 23 128 3 0.000012468771697124281
+generic_f32_4x4 11 32 13 0.000015931672245334774
+generic_f32_4x4 8 32 9 0.000008097203894273956
+armv7neon_mmm_f32_8x4_cortexa9 17 4 5 0.000004171816997731455
+armv7neon_mmm_f32_8x6_generic 15 32 17 0.000011434650801767795
+armv7neon_mmm_f32_8x4_cortexa7 25 128 3 0.000022089826731240803
+armv7neon_mmm_f32_8x4_cortexa9 17 32 9 0.000012562108538363274
+generic_f32_4x4 7 128 4 0.000007740927731735862
+armv7neon_mmm_f32_8x6_cortexa7 16 128 17 0.00004105266320070572
+armv7neon_mmm_f32_8x6_generic 17 128 5 0.000016103121774230763
+armv7neon_mmm_f32_8x6_cortexa9 25 128 19 0.0000820798385767341
+armv7neon_mmm_f32_8x4_generic 15 32 12 0.000008721032730753763
+armv7neon_mmm_f32_8x4_generic 25 4 9 0.000007623970700084407
+armv7neon_mmm_f32_8x4_generic 17 32 11 0.000012672883096785458
+armv7neon_mmm_f32_8x6_generic 16 32 19 0.000013691087754459915
+armv7neon_mmm_f32_8x6_generic 9 4 12 0.0000031605677124646883
+generic_f32_4x4 5 4 5 0.0000029759889706969683
+armv7neon_mmm_f32_8x4_generic 16 32 9 0.00000841650213766426
+generic_f32_4x4 8 32 13 0.000010487684258287977
+generic_f32_4x4 4 128 5 0.000007733485289678424
+armv7neon_mmm_f32_8x6_cortexa7 8 4 19 0.0000033370479940342483
+armv7neon_mmm_f32_8x6_cortexa7 16 4 13 0.000004820409587660123
+armv7neon_mmm_f32_8x6_cortexa7 7 128 5 0.000007382676538540774
+armv7neon_mmm_f32_8x6_cortexa9 9 4 7 0.0000034194906166596
+armv7neon_mmm_f32_8x4_generic 24 128 3 0.000012489994751068132
+armv7neon_mmm_f32_8x6_generic 17 4 5 0.0000029369592565703416
+armv7neon_mmm_f32_8x6_cortexa9 7 32 7 0.000004299364469367014
+generic_f32_4x4 8 128 8 0.000014675074170610452
+armv7neon_mmm_f32_8x6_generic 24 32 6 0.000005321602550624656
+generic_f32_4x4 8 128 3 0.000007920826245860217
+armv7neon_mmm_f32_8x6_cortexa7 9 128 17 0.000041149683356221505
+armv7neon_mmm_f32_8x6_generic 15 128 18 0.0000317697933254728
+armv7neon_mmm_f32_8x4_generic 15 32 11 0.000008972113808644295
+armv7neon_mmm_f32_8x6_generic 17 128 11 0.00003128672479118261
+armv7neon_mmm_f32_8x6_generic 16 32 5 0.000004210171223198828
+armv7neon_mmm_f32_8x4_cortexa9 23 32 7 0.000008919434958029703
+armv7neon_mmm_f32_8x6_cortexa7 23 4 11 0.000005630918362501218
+generic_f32_4x4 13 4 13 0.000009734041681688549
+armv7neon_mmm_f32_8x4_cortexa7 23 4 8 0.000004419109488183114
+armv7neon_mmm_f32_8x6_generic 17 128 18 0.00004641763657018708
+armv7neon_mmm_f32_8x6_cortexa7 23 32 7 0.000013604910503058097
+armv7neon_mmm_f32_8x4_cortexa9 16 4 8 0.0000025811126658507737
+armv7neon_mmm_f32_8x6_generic 16 4 17 0.000004664791456959476
+armv7neon_mmm_f32_8x6_cortexa9 15 128 19 0.00004234538403110192
+armv7neon_mmm_f32_8x4_cortexa7 7 128 8 0.000011300292613931975
+generic_f32_4x4 4 32 7 0.000003116445195774716
+armv7neon_mmm_f32_8x4_generic 17 128 3 0.000012383648487655847
+armv7neon_mmm_f32_8x6_cortexa9 24 32 12 0.00001022987009394415
+armv7neon_mmm_f32_8x4_cortexa9 17 32 11 0.000012684638765009217
+armv7neon_mmm_f32_8x6_cortexa9 9 128 18 0.000030893903863542745
+armv7neon_mmm_f32_8x6_cortexa7 9 4 18 0.000004852117278755586
+armv7neon_mmm_f32_8x6_cortexa9 8 128 13 0.00001571685239499597
+armv7neon_mmm_f32_8x6_cortexa9 23 4 7 0.000005166723329090456
+armv7neon_mmm_f32_8x6_cortexa9 7 128 18 0.000016699359662800567
+armv7neon_mmm_f32_8x4_generic 17 32 4 0.0000043747418052874395
+armv7neon_mmm_f32_8x4_cortexa9 25 128 13 0.00006245169757364539
+armv7neon_mmm_f32_8x4_generic 24 128 11 0.00003520805509267024
+armv7neon_mmm_f32_8x4_generic 7 4 8 0.000001940897025671589
+armv7neon_mmm_f32_8x4_cortexa9 24 32 8 0.000008105445830543554
+generic_f32_4x4 3 32 5 0.000003239638778592903
+armv7neon_mmm_f32_8x6_generic 16 128 6 0.000010481759011880018
+armv7neon_mmm_f32_8x6_cortexa9 23 4 11 0.000005380241199288215
+armv7neon_mmm_f32_8x6_cortexa7 24 4 11 0.000005233250260928121
+armv7neon_mmm_f32_8x6_cortexa9 23 4 19 0.000009671598448323651
+armv7neon_mmm_f32_8x6_cortexa7 23 4 6 0.0000029172933842802905
+armv7neon_mmm_f32_8x4_cortexa9 23 32 8 0.000008528926728389173
+armv7neon_mmm_f32_8x4_cortexa9 24 128 8 0.000023207480788900347
+generic_f32_4x4 7 32 11 0.00000844108827118094
+generic_f32_4x4 12 4 5 0.000004095794522926841
+armv7neon_mmm_f32_8x4_generic 25 4 11 0.000007781652426587433
+armv7neon_mmm_f32_8x4_generic 16 32 12 0.000008090083939378293
+armv7neon_mmm_f32_8x4_generic 9 128 7 0.000016030853255705617
+armv7neon_mmm_f32_8x6_generic 15 4 19 0.000006928708092302355
+armv7neon_mmm_f32_8x6_cortexa7 24 128 12 0.00004054881877870741
+generic_f32_4x4 8 4 4 0.0000015677362682744682
+armv7neon_mmm_f32_8x6_generic 7 128 19 0.000021822554568277356
+armv7neon_mmm_f32_8x6_cortexa9 24 4 12 0.00000423613681186459
+armv7neon_mmm_f32_8x4_cortexa7 16 4 4 0.0000016584422934049926
+armv7neon_mmm_f32_8x6_cortexa7 15 4 12 0.0000039842977137122266
+armv7neon_mmm_f32_8x4_cortexa7 7 128 4 0.000005876303683712701
+armv7neon_mmm_f32_8x6_cortexa9 7 4 5 0.000001360389294622995
+armv7neon_mmm_f32_8x4_cortexa7 24 4 9 0.000006182082152017003
+armv7neon_mmm_f32_8x6_generic 7 128 17 0.000016508734849042377
+armv7neon_mmm_f32_8x4_generic 17 128 9 0.000035220470083517575
+generic_f32_4x4 11 32 4 0.000004333160180811584
+armv7neon_mmm_f32_8x6_cortexa7 7 32 12 0.000005349730883357719
+armv7neon_mmm_f32_8x4_generic 8 32 5 0.0000031582408088385206
+armv7neon_mmm_f32_8x6_cortexa9 8 32 6 0.000002149345359409538
+generic_f32_4x4 5 32 13 0.000010767359583748404
+armv7neon_mmm_f32_8x4_generic 17 32 5 0.000008612220700720838
+generic_f32_4x4 5 4 4 0.0000016841544626715604
+armv7neon_mmm_f32_8x4_cortexa9 9 32 8 0.000005740276944412148
+armv7neon_mmm_f32_8x6_cortexa9 23 128 7 0.00003146196425036688
+armv7neon_mmm_f32_8x4_cortexa9 25 128 9 0.00004693016970364154
+armv7neon_mmm_f32_8x4_generic 24 4 8 0.0000036269745603468488
+armv7neon_mmm_f32_8x6_cortexa9 24 32 5 0.000006081990180966961
+armv7neon_mmm_f32_8x6_generic 7 4 6 0.000001511590523783866
+generic_f32_4x4 13 128 7 0.00002944642529063838
+armv7neon_mmm_f32_8x4_cortexa9 8 4 11 0.000002254110071814015
+armv7neon_mmm_f32_8x6_cortexa9 24 128 19 0.00006153515122537746
+armv7neon_mmm_f32_8x4_cortexa9 16 128 13 0.00003108122366900544
+generic_f32_4x4 11 128 9 0.000033046521573243154
+armv7neon_mmm_f32_8x4_generic 7 4 9 0.0000026137777410885136
+armv7neon_mmm_f32_8x6_cortexa7 7 4 13 0.0000034217472919863715
+armv7neon_mmm_f32_8x6_generic 16 4 11 0.000003453597254301285
+armv7neon_mmm_f32_8x4_cortexa7 17 32 11 0.000015933978694713537
+armv7neon_mmm_f32_8x6_cortexa7 23 4 12 0.0000053181832633687915
+armv7neon_mmm_f32_8x6_generic 17 32 6 0.000005447406276972202
+armv7neon_mmm_f32_8x4_cortexa7 7 32 4 0.000002302756052490624
+armv7neon_mmm_f32_8x4_cortexa9 15 4 13 0.000005662932902369066
+armv7neon_mmm_f32_8x4_cortexa7 17 32 8 0.000010470053145450968
+armv7neon_mmm_f32_8x4_generic 17 4 8 0.000003846416592921501
+armv7neon_mmm_f32_8x4_cortexa7 25 32 7 0.000014416885420743293
+armv7neon_mmm_f32_8x6_cortexa9 7 4 7 0.000002316979424271203
+armv7neon_mmm_f32_8x6_cortexa9 16 4 17 0.000004718807435203576
+generic_f32_4x4 11 32 12 0.000011882704283190954
+armv7neon_mmm_f32_8x4_cortexa7 23 32 9 0.000016122379106643026
+armv7neon_mmm_f32_8x6_cortexa7 9 32 11 0.000009102642142770847
+armv7neon_mmm_f32_8x6_generic 25 4 7 0.000006116994655414601
+armv7neon_mmm_f32_8x4_generic 23 4 3 0.000002685832196401119
+armv7neon_mmm_f32_8x6_cortexa9 25 32 11 0.000014409038834226244
+armv7neon_mmm_f32_8x6_generic 23 32 12 0.000010966793413222081
+armv7neon_mmm_f32_8x4_cortexa7 15 4 8 0.0000032350546103453447
+armv7neon_mmm_f32_8x6_cortexa7 15 128 7 0.000027943938585492828
+generic_f32_4x4 7 4 4 0.0000017389973856077342
+armv7neon_mmm_f32_8x6_generic 8 4 12 0.0000017181145762324274
+armv7neon_mmm_f32_8x4_cortexa9 15 4 11 0.000004502163536054048
+armv7neon_mmm_f32_8x6_generic 15 128 6 0.000010890629782885533
+armv7neon_mmm_f32_8x4_cortexa7 23 128 12 0.00004796960186341638
+armv7neon_mmm_f32_8x6_generic 8 32 7 0.000003896823245256928
+armv7neon_mmm_f32_8x6_cortexa9 15 128 7 0.000021306419489905895
+armv7neon_mmm_f32_8x6_cortexa7 7 4 11 0.0000024493622993257104
+armv7neon_mmm_f32_8x6_cortexa7 7 32 19 0.000009918011828071643
+armv7neon_mmm_f32_8x4_cortexa9 24 4 7 0.000004277922454765341
+armv7neon_mmm_f32_8x4_cortexa7 9 4 5 0.0000031825583861368363
+armv7neon_mmm_f32_8x6_cortexa9 8 32 11 0.000004007622996506884
+armv7neon_mmm_f32_8x6_generic 15 4 11 0.0000038091887644203395
+armv7neon_mmm_f32_8x6_cortexa7 23 128 11 0.000041674027523638683
+armv7neon_mmm_f32_8x6_cortexa7 17 4 18 0.0000068571632794369916
+armv7neon_mmm_f32_8x6_cortexa9 9 128 12 0.000020793132831426426
+armv7neon_mmm_f32_8x6_generic 25 32 19 0.000027146627303420713
+armv7neon_mmm_f32_8x6_cortexa7 9 128 19 0.00005467645413703322
+armv7neon_mmm_f32_8x6_cortexa7 8 32 11 0.0000048195055146364755
+armv7neon_mmm_f32_8x4_cortexa7 8 128 5 0.000011083761554457595
+generic_f32_4x4 7 128 12 0.000022151037088381854
+generic_f32_4x4 9 128 11 0.00003295464355640065
+armv7neon_mmm_f32_8x6_cortexa9 25 4 17 0.000009038405830025483
+generic_f32_4x4 8 32 3 0.0000032862084964825317
+armv7neon_mmm_f32_8x4_cortexa9 23 128 12 0.00003536277613858418
+armv7neon_mmm_f32_8x6_cortexa9 16 128 11 0.000021144715462112054
+armv7neon_mmm_f32_8x6_generic 17 32 5 0.000005941189535355633
+armv7neon_mmm_f32_8x6_generic 15 32 13 0.000011297818131567483
+armv7neon_mmm_f32_8x6_generic 16 32 11 0.000007448947403712638
+generic_f32_4x4 5 4 7 0.0000029999149449467176
+armv7neon_mmm_f32_8x4_generic 8 32 13 0.000005690794573889263
+armv7neon_mmm_f32_8x4_cortexa9 23 128 4 0.000012046555463702128
+armv7neon_mmm_f32_8x4_cortexa7 25 128 5 0.00004305932269978789
+armv7neon_mmm_f32_8x6_cortexa7 24 32 7 0.000013181202441221999
+armv7neon_mmm_f32_8x4_cortexa9 23 4 3 0.0000026820954481134975
+generic_f32_4x4 12 4 8 0.0000036441773649361156
+armv7neon_mmm_f32_8x6_cortexa9 24 128 17 0.000046509213530800746
+armv7neon_mmm_f32_8x6_cortexa9 8 4 6 0.0000011575433314376175
+generic_f32_4x4 3 32 4 0.0000018882099872634265
+armv7neon_mmm_f32_8x4_cortexa7 9 32 3 0.000004070589988484743
+armv7neon_mmm_f32_8x4_cortexa7 17 32 9 0.000015851096904703647
+generic_f32_4x4 13 32 12 0.00001535208709935954
+armv7neon_mmm_f32_8x6_cortexa9 25 4 19 0.000011387388616753858
+armv7neon_mmm_f32_8x6_cortexa9 23 128 5 0.00001624764366012842
+armv7neon_mmm_f32_8x4_cortexa9 9 128 12 0.00002346244933254337
+armv7neon_mmm_f32_8x4_cortexa9 25 4 12 0.000007150024568853224
+armv7neon_mmm_f32_8x6_cortexa9 15 32 12 0.000007825900269054534
+armv7neon_mmm_f32_8x4_cortexa7 25 4 12 0.000007707673254391108
+generic_f32_4x4 11 32 5 0.000008363778534607299
+armv7neon_mmm_f32_8x4_cortexa7 17 128 8 0.00003194246552397465
+armv7neon_mmm_f32_8x6_generic 8 32 6 0.0000021066884422501677
+armv7neon_mmm_f32_8x6_cortexa7 16 4 11 0.0000036716657035155366
+armv7neon_mmm_f32_8x4_generic 17 128 7 0.00002384609528431003
+armv7neon_mmm_f32_8x4_cortexa7 9 128 7 0.000021759641210777887
+armv7neon_mmm_f32_8x6_cortexa7 17 32 11 0.000013412538846571892
+generic_f32_4x4 7 32 8 0.000005690920861795519
+armv7neon_mmm_f32_8x6_generic 25 128 19 0.00008258096370056387
+armv7neon_mmm_f32_8x6_cortexa7 15 4 7 0.000003922803920269034
+armv7neon_mmm_f32_8x4_cortexa9 16 4 12 0.0000036509568763084446
+generic_f32_4x4 11 128 4 0.000011287445429820831
+armv7neon_mmm_f32_8x6_cortexa7 15 128 5 0.000014299033646853434
+armv7neon_mmm_f32_8x6_cortexa9 24 128 12 0.00003053734816197179
+armv7neon_mmm_f32_8x6_cortexa9 17 32 6 0.000005510675534687666
+armv7neon_mmm_f32_8x6_cortexa9 9 32 19 0.000014080838768872731
+generic_f32_4x4 13 128 5 0.000029375850896877852
+armv7neon_mmm_f32_8x6_cortexa7 25 32 5 0.000009454087084930695
+armv7neon_mmm_f32_8x4_generic 8 4 8 0.0000015055905623190838
+armv7neon_mmm_f32_8x4_cortexa7 16 4 13 0.00000545158088053187
+armv7neon_mmm_f32_8x6_generic 8 128 6 0.0000054873008191504395
+generic_f32_4x4 5 128 7 0.000015018624971335435
+armv7neon_mmm_f32_8x6_generic 15 128 7 0.00002123110926361701
+armv7neon_mmm_f32_8x4_cortexa7 7 32 3 0.000002283721924125051
+armv7neon_mmm_f32_8x6_generic 9 128 19 0.00004153518147749415
+armv7neon_mmm_f32_8x4_generic 9 32 11 0.000008626343403146056
+armv7neon_mmm_f32_8x6_cortexa9 9 4 6 0.0000018884740207207647
+armv7neon_mmm_f32_8x6_cortexa7 9 128 6 0.000013948466701679792
+generic_f32_4x4 11 4 12 0.000005708922503580876
+armv7neon_mmm_f32_8x6_cortexa7 24 128 6 0.000020527418679269756
+armv7neon_mmm_f32_8x4_cortexa7 8 4 11 0.0000024406051533290663
+armv7neon_mmm_f32_8x4_cortexa9 23 128 9 0.000035493958882244616
+armv7neon_mmm_f32_8x6_cortexa9 17 4 11 0.00000498803165144454
+armv7neon_mmm_f32_8x6_generic 8 32 18 0.000005320856703844532
+armv7neon_mmm_f32_8x4_cortexa9 9 128 4 0.00000810350837903274
+generic_f32_4x4 9 4 8 0.000003892676050216947
+armv7neon_mmm_f32_8x6_cortexa9 24 128 5 0.000016255452753216072
+armv7neon_mmm_f32_8x6_generic 7 4 5 0.0000013389729556607902
+armv7neon_mmm_f32_8x6_generic 17 32 19 0.000020571678410473347
+armv7neon_mmm_f32_8x6_cortexa9 7 32 13 0.0000063039500398145465
+armv7neon_mmm_f32_8x4_cortexa9 17 128 7 0.000023826784148315835
+armv7neon_mmm_f32_8x4_generic 24 4 7 0.000004274685140650307
+armv7neon_mmm_f32_8x6_cortexa9 17 32 17 0.000015955807283560996
+armv7neon_mmm_f32_8x6_cortexa7 17 4 13 0.000007179865477053954
+armv7neon_mmm_f32_8x4_cortexa9 7 32 12 0.00000488792089076881
+generic_f32_4x4 4 128 11 0.000011283150201350866
+armv7neon_mmm_f32_8x6_cortexa7 7 128 17 0.00002152414507851334
+armv7neon_mmm_f32_8x4_generic 25 32 13 0.0000216921434601138
+armv7neon_mmm_f32_8x6_generic 15 32 12 0.000007763276203034865
+generic_f32_4x4 7 128 9 0.00002231642099570999
+armv7neon_mmm_f32_8x4_generic 23 4 7 0.0000044972515990527706
+armv7neon_mmm_f32_8x4_generic 16 128 9 0.00002352868039758383
+armv7neon_mmm_f32_8x4_generic 24 32 8 0.000008105856924464553
+armv7neon_mmm_f32_8x4_cortexa7 25 4 9 0.000008239024137829502
+armv7neon_mmm_f32_8x4_generic 8 4 7 0.000001726186960943308
+armv7neon_mmm_f32_8x6_cortexa7 24 4 12 0.000004476686880431161
+armv7neon_mmm_f32_8x4_generic 8 4 5 0.0000016761254405249265
+armv7neon_mmm_f32_8x6_cortexa9 17 32 18 0.000015401920673930987
+armv7neon_mmm_f32_8x6_cortexa7 8 4 18 0.000002497401986754502
+armv7neon_mmm_f32_8x6_cortexa7 23 32 18 0.000019976606065621178
+armv7neon_mmm_f32_8x4_cortexa9 16 128 5 0.000015920451629989893
+armv7neon_mmm_f32_8x6_cortexa7 16 128 19 0.00005438554195188942
+armv7neon_mmm_f32_8x4_generic 25 32 11 0.00001669591843012648
+armv7neon_mmm_f32_8x6_cortexa9 7 32 18 0.000006518795701369289
+generic_f32_4x4 3 128 7 0.000007891377185866899
+armv7neon_mmm_f32_8x6_cortexa9 9 128 11 0.000021029476636249923
+armv7neon_mmm_f32_8x4_generic 9 32 8 0.00000573930624839856
+armv7neon_mmm_f32_8x4_cortexa9 23 32 9 0.000012851357918328979
+armv7neon_mmm_f32_8x4_cortexa7 25 128 11 0.00006422947640130408
+armv7neon_mmm_f32_8x4_cortexa7 9 128 5 0.000021690553836350107
+armv7neon_mmm_f32_8x4_cortexa7 8 128 4 0.000005680428664405247
+armv7neon_mmm_f32_8x6_cortexa9 17 32 11 0.000010949048608533127
+armv7neon_mmm_f32_8x4_cortexa9 9 32 13 0.00001119516860361631
+armv7neon_mmm_f32_8x4_cortexa9 8 128 11 0.000012017424644185006
+armv7neon_mmm_f32_8x4_cortexa9 17 4 9 0.000005874110633384251
+armv7neon_mmm_f32_8x6_cortexa7 9 4 7 0.0000035812085152963736
+armv7neon_mmm_f32_8x4_cortexa7 8 4 12 0.0000022112784369711703
+armv7neon_mmm_f32_8x6_generic 23 32 13 0.000016306177258992233
+armv7neon_mmm_f32_8x4_cortexa7 9 32 11 0.000010819099493176479
+armv7neon_mmm_f32_8x4_generic 25 128 12 0.00004665173717365099
+armv7neon_mmm_f32_8x4_cortexa9 25 4 8 0.000004903249015406008
+armv7neon_mmm_f32_8x4_generic 15 128 8 0.000016011765752869696
+generic_f32_4x4 4 4 5 0.00000173237663586998
+generic_f32_4x4 7 32 13 0.000010951743398686358
+armv7neon_mmm_f32_8x6_cortexa9 7 32 11 0.000004373180249911381
+armv7neon_mmm_f32_8x6_generic 9 4 7 0.000003340351034825373
+armv7neon_mmm_f32_8x6_cortexa7 8 128 6 0.000007164475272224828
+generic_f32_4x4 8 4 11 0.000003982256860993786
+armv7neon_mmm_f32_8x6_generic 15 128 5 0.000010930509860719574
+armv7neon_mmm_f32_8x4_cortexa9 25 128 11 0.00004712675673590545
+armv7neon_mmm_f32_8x4_cortexa7 7 4 11 0.000002822400310815202
+armv7neon_mmm_f32_8x6_cortexa7 17 32 13 0.000019470812344800868
+armv7neon_mmm_f32_8x4_cortexa7 23 32 4 0.000005619794374416781
+generic_f32_4x4 11 32 8 0.000008140965167056766
+armv7neon_mmm_f32_8x4_cortexa7 17 4 5 0.000004516405012537028
+armv7neon_mmm_f32_8x4_cortexa7 17 128 4 0.00001625158956213561
+armv7neon_mmm_f32_8x6_generic 16 128 18 0.000030484389052945016
+armv7neon_mmm_f32_8x6_cortexa7 17 128 11 0.00004127386103261296
+armv7neon_mmm_f32_8x6_cortexa7 24 32 6 0.000006586776362035912
+armv7neon_mmm_f32_8x4_generic 9 32 5 0.000005900796369564617
+armv7neon_mmm_f32_8x6_cortexa7 7 128 11 0.000014441097326114378
+armv7neon_mmm_f32_8x6_cortexa9 24 4 19 0.000008421665590898334
+armv7neon_mmm_f32_8x4_cortexa7 15 32 9 0.000011049029459002566
+armv7neon_mmm_f32_8x6_cortexa9 17 32 7 0.000010792251610714281
+armv7neon_mmm_f32_8x4_generic 24 32 11 0.000012578955321334167
+armv7neon_mmm_f32_8x6_generic 8 4 17 0.0000025848174603713075
+armv7neon_mmm_f32_8x4_cortexa9 8 128 12 0.000011812308047768464
+generic_f32_4x4 12 128 8 0.00002174331661540877
+armv7neon_mmm_f32_8x4_cortexa7 16 128 12 0.0000317241936699183
+armv7neon_mmm_f32_8x6_cortexa9 9 4 5 0.000002126610313888205
+armv7neon_mmm_f32_8x6_cortexa9 8 32 12 0.0000037644801015301184
+armv7neon_mmm_f32_8x4_cortexa9 9 32 4 0.000003081152195958602
+generic_f32_4x4 7 32 9 0.000008408374041494756
+generic_f32_4x4 4 128 4 0.0000040497405415986515
+armv7neon_mmm_f32_8x4_cortexa9 15 128 7 0.000016202393035594662
+armv7neon_mmm_f32_8x6_cortexa9 25 128 6 0.000020788151245438416
+armv7neon_mmm_f32_8x4_cortexa7 8 128 13 0.000021488179929071295
+armv7neon_mmm_f32_8x4_cortexa7 25 32 9 0.00002091079321932589
+armv7neon_mmm_f32_8x4_cortexa9 8 128 5 0.000008180420598290364
+armv7neon_mmm_f32_8x4_generic 7 128 3 0.000004423621357206741
+armv7neon_mmm_f32_8x4_cortexa9 15 4 9 0.000004394233374561228
+armv7neon_mmm_f32_8x6_generic 7 4 12 0.0000025127271377138368
+armv7neon_mmm_f32_8x4_generic 23 128 8 0.00002364193719899604
+armv7neon_mmm_f32_8x4_cortexa7 7 32 11 0.000005969501269507202
+armv7neon_mmm_f32_8x6_cortexa9 16 4 7 0.0000033362358276857998
+armv7neon_mmm_f32_8x4_generic 25 4 4 0.000002679652210124942
+armv7neon_mmm_f32_8x6_cortexa7 24 128 19 0.00008148701915820504
+armv7neon_mmm_f32_8x4_cortexa7 25 32 12 0.000020301609592643522
+armv7neon_mmm_f32_8x6_generic 25 4 12 0.000005597867792560783
+armv7neon_mmm_f32_8x4_cortexa9 16 4 13 0.0000050370172264265826
+generic_f32_4x4 11 4 8 0.000003987191713644613
+armv7neon_mmm_f32_8x4_cortexa7 7 32 9 0.00000592156011559423
+armv7neon_mmm_f32_8x6_cortexa7 25 4 13 0.000009317851927845776
+armv7neon_mmm_f32_8x6_generic 7 32 6 0.0000025009886657895365
+armv7neon_mmm_f32_8x4_cortexa9 9 32 5 0.0000059031902576708605
+armv7neon_mmm_f32_8x4_cortexa7 25 4 7 0.000005997625417148922
+armv7neon_mmm_f32_8x4_generic 15 128 9 0.000023965770004599912
+armv7neon_mmm_f32_8x4_generic 9 128 3 0.000008336141118552254
+armv7neon_mmm_f32_8x6_cortexa9 15 4 5 0.0000022485015750854945
+armv7neon_mmm_f32_8x4_cortexa7 25 32 13 0.000027508557090202434
+armv7neon_mmm_f32_8x4_cortexa9 7 32 4 0.0000019276370605991633
+armv7neon_mmm_f32_8x6_cortexa7 7 32 13 0.000007512654299558133
+generic_f32_4x4 12 128 12 0.000032295866967880904
+armv7neon_mmm_f32_8x6_generic 8 128 13 0.00001569076394434193
+armv7neon_mmm_f32_8x4_cortexa9 17 128 8 0.000023412675748873913
+armv7neon_mmm_f32_8x4_cortexa9 16 4 3 0.000001964056838467611
+armv7neon_mmm_f32_8x4_generic 16 4 4 0.0000015347326629006993
+armv7neon_mmm_f32_8x4_cortexa9 24 4 4 0.0000020572993016505205
+armv7neon_mmm_f32_8x4_cortexa9 8 32 3 0.0000019507567628415824
+armv7neon_mmm_f32_8x4_cortexa7 8 32 7 0.000004003993118643105
+armv7neon_mmm_f32_8x4_generic 17 4 13 0.000007623324255157955
+armv7neon_mmm_f32_8x6_cortexa9 9 4 11 0.0000035090633204775533
+armv7neon_mmm_f32_8x6_generic 9 4 19 0.00000601244922696809
+armv7neon_mmm_f32_8x6_generic 9 128 7 0.000020875839952630357
+generic_f32_4x4 4 32 11 0.000004326527977488747
+armv7neon_mmm_f32_8x6_cortexa9 25 32 18 0.00002022815888923194
+armv7neon_mmm_f32_8x6_cortexa9 15 4 18 0.000005434062942764287
+armv7neon_mmm_f32_8x4_cortexa9 24 4 3 0.000002704298016252254
+armv7neon_mmm_f32_8x4_generic 24 4 9 0.000005701271915969865
+armv7neon_mmm_f32_8x6_cortexa9 7 128 19 0.000021850407441008384
+armv7neon_mmm_f32_8x4_cortexa9 15 4 3 0.0000019251432800014627
+armv7neon_mmm_f32_8x6_cortexa7 15 4 18 0.000005711033859652051
+armv7neon_mmm_f32_8x6_cortexa7 9 4 17 0.000005102785803942399
+armv7neon_mmm_f32_8x4_generic 8 128 4 0.000004232482987549164
+armv7neon_mmm_f32_8x4_cortexa9 25 128 7 0.00003172200454891407
+armv7neon_mmm_f32_8x4_cortexa9 9 32 12 0.000008361571818242635
+armv7neon_mmm_f32_8x6_cortexa7 24 32 17 0.000019514235657015186
+armv7neon_mmm_f32_8x6_cortexa7 23 128 6 0.000020944404830412158
+armv7neon_mmm_f32_8x6_cortexa9 17 128 5 0.000016134142374025674
+armv7neon_mmm_f32_8x4_cortexa7 8 4 7 0.000001873057019803265
+armv7neon_mmm_f32_8x6_cortexa7 25 128 5 0.00002802974521426182
+armv7neon_mmm_f32_8x6_cortexa9 17 4 17 0.000006958050831218578
+armv7neon_mmm_f32_8x4_cortexa9 15 32 5 0.00000608182468505122
+armv7neon_mmm_f32_8x4_generic 7 128 4 0.0000044404498411270055
+generic_f32_4x4 13 4 5 0.000005350599856702572
+armv7neon_mmm_f32_8x4_generic 16 4 9 0.0000039608523692233575
+armv7neon_mmm_f32_8x6_cortexa9 16 4 12 0.000003013935297522993
+armv7neon_mmm_f32_8x6_generic 15 4 13 0.000005319052004340107
+armv7neon_mmm_f32_8x4_cortexa9 8 4 13 0.0000027408020700732427
+armv7neon_mmm_f32_8x6_generic 8 128 18 0.000015505866042830093
+armv7neon_mmm_f32_8x6_cortexa7 15 32 18 0.000013888471960481304
+armv7neon_mmm_f32_8x4_cortexa7 15 128 8 0.00002174159034943732
+armv7neon_mmm_f32_8x4_generic 9 32 4 0.0000030856158641387716
+armv7neon_mmm_f32_8x4_generic 24 4 5 0.000004129514406774627
+generic_f32_4x4 4 4 7 0.0000017497295508923947
+armv7neon_mmm_f32_8x6_generic 23 32 17 0.00001653018733416135
+generic_f32_4x4 7 128 13 0.000029515913541067617
+armv7neon_mmm_f32_8x6_cortexa9 24 32 19 0.000020360891894208073
+armv7neon_mmm_f32_8x6_cortexa9 9 32 18 0.000010557768641496435
+armv7neon_mmm_f32_8x4_generic 8 4 13 0.000002736501222546797
+armv7neon_mmm_f32_8x4_generic 16 32 11 0.000008543993772071838
+armv7neon_mmm_f32_8x4_cortexa7 7 32 12 0.0000059782618274442185
+armv7neon_mmm_f32_8x6_cortexa7 8 128 11 0.000014095624176931188
+armv7neon_mmm_f32_8x4_cortexa7 7 32 13 0.00000775363354362587
+armv7neon_mmm_f32_8x4_cortexa7 24 128 5 0.000032299734402349505
+armv7neon_mmm_f32_8x6_cortexa9 25 32 12 0.000013697867410484751
+armv7neon_mmm_f32_8x4_cortexa7 24 128 4 0.000016113162651910352
+armv7neon_mmm_f32_8x6_cortexa9 9 32 5 0.0000041006560791439575
+generic_f32_4x4 7 4 11 0.000004296735480110931
+armv7neon_mmm_f32_8x6_generic 8 32 11 0.000003977003783384346
+armv7neon_mmm_f32_8x4_generic 24 32 13 0.000016227668114322495
+armv7neon_mmm_f32_8x6_generic 7 32 5 0.0000023419854212729095
+generic_f32_4x4 3 128 3 0.00000421323412258659
+generic_f32_4x4 8 4 12 0.0000036950676170218278
+armv7neon_mmm_f32_8x4_cortexa7 15 4 12 0.000004609879642718733
+armv7neon_mmm_f32_8x6_generic 8 128 11 0.000010745832002669769
+armv7neon_mmm_f32_8x6_cortexa7 16 32 18 0.000012726640475291294
+armv7neon_mmm_f32_8x6_generic 24 4 13 0.000006477448734654056
+armv7neon_mmm_f32_8x4_cortexa9 23 128 5 0.000023893090361578605
+armv7neon_mmm_f32_8x6_cortexa9 7 128 7 0.000011059489910827611
+armv7neon_mmm_f32_8x4_cortexa7 24 4 13 0.000007916232793484605
+armv7neon_mmm_f32_8x6_generic 24 32 5 0.000006050244864860637
+armv7neon_mmm_f32_8x6_generic 23 4 5 0.000003050740283377829
+armv7neon_mmm_f32_8x4_generic 17 4 7 0.000004317179186717401
+generic_f32_4x4 13 128 3 0.000015316541870867625
+armv7neon_mmm_f32_8x6_generic 25 128 11 0.00004196449161824797
+armv7neon_mmm_f32_8x6_generic 24 32 7 0.000010654786791684403
+armv7neon_mmm_f32_8x6_generic 9 128 13 0.0000310078542141026
+armv7neon_mmm_f32_8x4_cortexa7 17 128 13 0.00006395441906545403
+armv7neon_mmm_f32_8x6_cortexa7 25 4 11 0.000006802751222800881
+armv7neon_mmm_f32_8x6_cortexa7 16 32 12 0.000008630037205832748
+armv7neon_mmm_f32_8x6_cortexa9 8 128 6 0.000005530202507023102
+armv7neon_mmm_f32_8x4_cortexa7 24 4 3 0.000002895093469687887
+armv7neon_mmm_f32_8x6_cortexa7 23 32 13 0.00002011431966689971
+armv7neon_mmm_f32_8x6_cortexa7 8 128 17 0.000020781906415830542
+armv7neon_mmm_f32_8x6_cortexa7 8 128 7 0.00001401297102177983
+generic_f32_4x4 12 128 4 0.000011113783764067
+armv7neon_mmm_f32_8x4_cortexa9 23 128 8 0.000023633711148043694
+armv7neon_mmm_f32_8x6_cortexa7 8 32 7 0.00000473866242690848
+armv7neon_mmm_f32_8x6_cortexa7 23 128 13 0.00006200535049902772
+armv7neon_mmm_f32_8x6_generic 25 128 12 0.000040933940992834904
+armv7neon_mmm_f32_8x6_generic 8 4 7 0.0000019018312390917206
+armv7neon_mmm_f32_8x6_cortexa9 9 128 13 0.0000311113672962118
+armv7neon_mmm_f32_8x6_cortexa7 17 32 6 0.000006712613519894369
+armv7neon_mmm_f32_8x6_generic 17 32 17 0.00001584305104507533
+armv7neon_mmm_f32_8x6_cortexa9 25 4 5 0.000003846834546374256
+armv7neon_mmm_f32_8x4_generic 23 128 11 0.00003563405640596688
+armv7neon_mmm_f32_8x6_cortexa7 7 32 18 0.000007762095957340233
+armv7neon_mmm_f32_8x4_cortexa7 8 4 3 0.000001278724116798178
+armv7neon_mmm_f32_8x4_generic 8 32 9 0.000004428552791795867
+armv7neon_mmm_f32_8x4_cortexa7 9 32 8 0.000007212913220352459
+armv7neon_mmm_f32_8x6_cortexa7 16 32 11 0.000009132197785046325
+armv7neon_mmm_f32_8x6_generic 15 4 6 0.0000021293993921351435
+armv7neon_mmm_f32_8x6_cortexa9 8 32 17 0.000005618722861518267
+generic_f32_4x4 12 32 8 0.000007824605805291677
+generic_f32_4x4 3 4 9 0.0000025426825999668207
+armv7neon_mmm_f32_8x6_cortexa9 15 128 6 0.000010938928121944877
+armv7neon_mmm_f32_8x6_generic 8 128 12 0.000010479271785006985
+armv7neon_mmm_f32_8x6_cortexa7 25 4 18 0.000008843120394707296
+generic_f32_4x4 5 128 8 0.000014867005692635484
+armv7neon_mmm_f32_8x4_generic 25 4 12 0.000007134980758480078
+armv7neon_mmm_f32_8x6_generic 23 4 6 0.0000027550692223444384
+generic_f32_4x4 8 128 4 0.000007594935908642548
+armv7neon_mmm_f32_8x4_generic 24 4 3 0.0000027070026756528944
+armv7neon_mmm_f32_8x6_generic 15 4 7 0.0000036879615500804916
+armv7neon_mmm_f32_8x6_cortexa7 8 4 13 0.0000026670243954904784
+armv7neon_mmm_f32_8x6_generic 17 4 7 0.000004739940833422853
+armv7neon_mmm_f32_8x6_generic 24 128 11 0.00003139196614178293
+armv7neon_mmm_f32_8x6_cortexa9 24 128 13 0.0000461884138564535
+generic_f32_4x4 9 128 9 0.00003291260190607621
+armv7neon_mmm_f32_8x4_cortexa9 7 128 11 0.000012420788787006248
+armv7neon_mmm_f32_8x6_generic 17 128 12 0.00003088346853188957
+armv7neon_mmm_f32_8x4_cortexa7 8 4 13 0.000002968959195086507
+armv7neon_mmm_f32_8x6_cortexa9 24 128 6 0.000015548482246921477
+armv7neon_mmm_f32_8x4_cortexa7 17 32 7 0.000010943141781236415
+armv7neon_mmm_f32_8x6_cortexa7 16 4 12 0.000003160151719580885
+generic_f32_4x4 5 128 5 0.000014996303737029157
+armv7neon_mmm_f32_8x6_generic 25 4 6 0.0000030664133228659144
+armv7neon_mmm_f32_8x4_cortexa9 9 4 11 0.0000041810312425253365
+armv7neon_mmm_f32_8x6_generic 15 4 18 0.000005353071509503547
+armv7neon_mmm_f32_8x6_cortexa7 8 32 12 0.000004562116220290897
+armv7neon_mmm_f32_8x4_cortexa9 8 32 7 0.0000032056568363054847
+armv7neon_mmm_f32_8x4_generic 16 4 11 0.0000040609047361039316
+armv7neon_mmm_f32_8x6_cortexa9 17 4 19 0.00000876633957316096
+armv7neon_mmm_f32_8x6_cortexa7 15 128 18 0.00004180200146350024
+generic_f32_4x4 5 128 9 0.000022192516532888443
+armv7neon_mmm_f32_8x6_cortexa9 24 32 11 0.000010954278251461477
+armv7neon_mmm_f32_8x6_cortexa9 8 32 5 0.000002372663230804753
+armv7neon_mmm_f32_8x6_generic 8 4 5 0.0000013605175071348517
+armv7neon_mmm_f32_8x6_cortexa7 24 4 6 0.0000025054020094278158
+armv7neon_mmm_f32_8x6_generic 24 128 5 0.000016222838915570737
+generic_f32_4x4 12 32 4 0.000004159176762509275
+generic_f32_4x4 13 4 11 0.000007582485226342242
+generic_f32_4x4 9 4 5 0.000004196638831417376
+armv7neon_mmm_f32_8x6_cortexa7 25 32 17 0.000026071317528366354
+armv7neon_mmm_f32_8x4_cortexa9 16 128 8 0.000015686411664235285
+generic_f32_4x4 13 128 11 0.00004380249651194818
+armv7neon_mmm_f32_8x4_cortexa7 24 32 13 0.000020563104321953744
+armv7neon_mmm_f32_8x4_cortexa9 23 4 7 0.0000044861575535086865
+generic_f32_4x4 12 128 3 0.000011613772328414718
+armv7neon_mmm_f32_8x6_cortexa7 16 4 19 0.000006163473976132481
+generic_f32_4x4 7 4 8 0.0000029313315346635237
+armv7neon_mmm_f32_8x4_cortexa9 23 32 12 0.000012545342985454958
+armv7neon_mmm_f32_8x6_cortexa9 7 128 11 0.000011136453503507945
+armv7neon_mmm_f32_8x6_cortexa7 9 128 13 0.000041110907972239005
+armv7neon_mmm_f32_8x4_cortexa9 7 32 8 0.000003418961974304266
+armv7neon_mmm_f32_8x4_cortexa7 24 32 12 0.000015105629538772432
+generic_f32_4x4 7 128 3 0.000007903637217361153
+armv7neon_mmm_f32_8x6_cortexa7 9 4 11 0.000003661168986658601
+generic_f32_4x4 4 128 9 0.00001126621217728165
+armv7neon_mmm_f32_8x4_cortexa7 23 128 3 0.000016760586455833876
+armv7neon_mmm_f32_8x6_cortexa9 16 32 12 0.000007056930252112048
+armv7neon_mmm_f32_8x6_cortexa9 8 4 19 0.000003165350777332514
+armv7neon_mmm_f32_8x6_generic 8 32 13 0.000005498586776368978
+generic_f32_4x4 3 128 13 0.000015227075547579779
+generic_f32_4x4 12 4 13 0.000007217490541690208
+armv7neon_mmm_f32_8x6_cortexa7 16 128 18 0.000040584526180649477
+armv7neon_mmm_f32_8x6_cortexa9 25 4 7 0.000006216705461889991
+armv7neon_mmm_f32_8x4_generic 24 128 9 0.000035021164515684145
+generic_f32_4x4 3 32 12 0.000004609887398147942
+armv7neon_mmm_f32_8x4_generic 25 128 9 0.00004684932933132534
+armv7neon_mmm_f32_8x6_generic 7 4 17 0.0000033628632970040393
+generic_f32_4x4 4 128 12 0.000011148897561634414
+armv7neon_mmm_f32_8x4_generic 8 128 8 0.000008028324048468155
+armv7neon_mmm_f32_8x4_generic 8 4 11 0.0000022515619534309057
+armv7neon_mmm_f32_8x4_cortexa7 9 32 5 0.00000739044702644143
+armv7neon_mmm_f32_8x4_generic 8 128 5 0.000008179707225297848
+armv7neon_mmm_f32_8x4_generic 9 4 4 0.0000016079691037580586
+armv7neon_mmm_f32_8x4_generic 16 128 4 0.00000803169255760427
+armv7neon_mmm_f32_8x4_generic 25 128 3 0.000016355748669331108
+armv7neon_mmm_f32_8x6_cortexa9 24 4 18 0.0000061179448610506165
+armv7neon_mmm_f32_8x6_cortexa9 16 4 19 0.000005803928147585998
+armv7neon_mmm_f32_8x6_cortexa7 17 128 6 0.00002064753338392539
+armv7neon_mmm_f32_8x4_cortexa9 9 4 12 0.000003936841431536775
+armv7neon_mmm_f32_8x4_cortexa7 17 32 13 0.000020815595163068456
+armv7neon_mmm_f32_8x6_cortexa7 23 32 11 0.000013810001606181058
+armv7neon_mmm_f32_8x4_cortexa7 15 32 13 0.00001449569792912252
+armv7neon_mmm_f32_8x4_cortexa7 7 4 3 0.0000012380899483592991
+armv7neon_mmm_f32_8x6_cortexa9 23 128 17 0.00004728149304262184
+armv7neon_mmm_f32_8x4_cortexa7 15 32 3 0.000004158381257275162
+armv7neon_mmm_f32_8x4_cortexa7 9 4 9 0.000004430434184731522
+armv7neon_mmm_f32_8x6_generic 23 128 13 0.00004715382202102794
+armv7neon_mmm_f32_8x6_cortexa9 9 4 19 0.000006116079821760871
+armv7neon_mmm_f32_8x6_cortexa9 7 4 18 0.0000035442600221807646
+armv7neon_mmm_f32_8x4_generic 15 128 5 0.000016261604433183137
+armv7neon_mmm_f32_8x4_cortexa9 8 128 4 0.000004251382345566181
+armv7neon_mmm_f32_8x6_cortexa7 25 128 11 0.000055294824394928404
+armv7neon_mmm_f32_8x6_cortexa9 23 128 13 0.00004716175024845285
+generic_f32_4x4 12 128 11 0.00003283409583810123
+armv7neon_mmm_f32_8x6_cortexa7 15 128 6 0.000014244472752339967
+armv7neon_mmm_f32_8x4_generic 17 32 8 0.000008307634701683482
+armv7neon_mmm_f32_8x4_generic 16 4 13 0.000005034888545956025
+armv7neon_mmm_f32_8x6_cortexa7 9 128 11 0.000027682771225447373
+armv7neon_mmm_f32_8x4_generic 25 128 8 0.000030970064243633
+armv7neon_mmm_f32_8x6_cortexa9 15 32 5 0.0000042126470351483745
+armv7neon_mmm_f32_8x6_cortexa9 8 128 7 0.000010688502492077545
+armv7neon_mmm_f32_8x6_cortexa7 8 32 19 0.000008785622923040567
+generic_f32_4x4 8 32 7 0.000005699367356537332
+armv7neon_mmm_f32_8x4_cortexa9 23 4 13 0.000007982321009915726
+armv7neon_mmm_f32_8x6_cortexa9 7 32 19 0.000008296837727225683
+armv7neon_mmm_f32_8x4_cortexa9 17 128 12 0.00003482723156416195
+armv7neon_mmm_f32_8x4_cortexa7 16 4 12 0.000003962627391393735
+armv7neon_mmm_f32_8x4_generic 24 32 4 0.000004275975584058462
+armv7neon_mmm_f32_8x4_generic 15 4 4 0.00000172768002087017
+generic_f32_4x4 12 128 13 0.000043289852359027704
+armv7neon_mmm_f32_8x6_cortexa7 24 32 18 0.00001876149316286456
+armv7neon_mmm_f32_8x4_cortexa9 23 128 13 0.000047081790611742305
+armv7neon_mmm_f32_8x4_generic 25 4 8 0.000004894765654717258
+generic_f32_4x4 13 32 9 0.000015809325344243507
+armv7neon_mmm_f32_8x6_cortexa9 15 32 11 0.000007869115330631703
+armv7neon_mmm_f32_8x6_cortexa7 16 32 19 0.000017064208112188813
+armv7neon_mmm_f32_8x4_cortexa7 9 32 12 0.000010571719001405112
+armv7neon_mmm_f32_8x6_cortexa9 7 4 12 0.000002545629385927187
+armv7neon_mmm_f32_8x4_cortexa9 24 128 5 0.00002368492009732677
+armv7neon_mmm_f32_8x6_cortexa7 25 4 5 0.000004026698373090156
+generic_f32_4x4 9 32 9 0.000012044684258478282
+armv7neon_mmm_f32_8x6_cortexa9 25 128 12 0.000040909803644855135
+generic_f32_4x4 8 128 12 0.00002170370665519434
+armv7neon_mmm_f32_8x6_generic 23 4 13 0.000007322110702891982
+armv7neon_mmm_f32_8x4_generic 8 128 9 0.000011964404434625917
+armv7neon_mmm_f32_8x6_generic 24 4 6 0.00000234486203657902
+armv7neon_mmm_f32_8x4_generic 16 128 12 0.000023289775015844785
+armv7neon_mmm_f32_8x4_generic 25 32 4 0.00000566239966628999
+armv7neon_mmm_f32_8x4_cortexa7 8 128 7 0.000011133117389377919
+armv7neon_mmm_f32_8x4_cortexa7 23 128 7 0.000032614653457132354
+generic_f32_4x4 11 128 3 0.000011596634661777904
+armv7neon_mmm_f32_8x6_generic 9 32 7 0.000007301874181587425
+armv7neon_mmm_f32_8x6_cortexa7 25 4 6 0.0000032842534990270153
+armv7neon_mmm_f32_8x4_cortexa7 25 4 11 0.000008398472143479583
+armv7neon_mmm_f32_8x4_cortexa9 17 128 3 0.000012367498051511246
+armv7neon_mmm_f32_8x4_cortexa9 15 4 4 0.0000017226474127640693
+armv7neon_mmm_f32_8x6_cortexa9 8 4 12 0.0000017663174571893155
+armv7neon_mmm_f32_8x6_generic 24 32 18 0.000014939528824089691
+armv7neon_mmm_f32_8x4_cortexa9 15 128 3 0.000008424928636646119
+armv7neon_mmm_f32_8x4_cortexa9 25 32 8 0.000010844406621465396
+armv7neon_mmm_f32_8x6_generic 24 128 6 0.00001549274717546449
+armv7neon_mmm_f32_8x4_cortexa9 7 4 8 0.0000019411905486089406
+generic_f32_4x4 5 4 12 0.000003970968097394295
+armv7neon_mmm_f32_8x4_cortexa7 17 128 3 0.000016675513856755485
+armv7neon_mmm_f32_8x6_generic 15 128 17 0.0000319988391989146
+armv7neon_mmm_f32_8x4_generic 8 4 3 0.0000012046114173177262
+armv7neon_mmm_f32_8x4_generic 9 4 13 0.000005271381310779438
+armv7neon_mmm_f32_8x6_generic 25 4 5 0.000003787680148419445
+armv7neon_mmm_f32_8x6_cortexa9 16 4 13 0.00000455965655028581
+armv7neon_mmm_f32_8x6_generic 16 128 12 0.000020507038874403576
+armv7neon_mmm_f32_8x4_cortexa9 7 4 3 0.0000011789464739113475
+armv7neon_mmm_f32_8x4_cortexa7 15 4 9 0.000004718086287004601
+generic_f32_4x4 3 32 8 0.0000032590697277049294
+armv7neon_mmm_f32_8x4_cortexa7 25 32 11 0.000021029318586240916
+armv7neon_mmm_f32_8x4_cortexa9 7 128 7 0.000008425064090676705
+armv7neon_mmm_f32_8x4_cortexa9 23 128 7 0.000024027014834706714
+armv7neon_mmm_f32_8x4_cortexa9 9 128 13 0.000031337531340777375
+generic_f32_4x4 12 32 3 0.0000046491920750409604
+armv7neon_mmm_f32_8x4_cortexa7 9 128 12 0.00003204704023977606
+armv7neon_mmm_f32_8x6_cortexa9 15 128 5 0.000010976146066847136
+armv7neon_mmm_f32_8x6_generic 9 128 5 0.000010821808595127155
+armv7neon_mmm_f32_8x6_cortexa9 16 32 18 0.000010217778748183444
+armv7neon_mmm_f32_8x6_cortexa9 8 32 13 0.000005537991894862071
+armv7neon_mmm_f32_8x6_cortexa7 15 128 11 0.000028075729994494098
+armv7neon_mmm_f32_8x6_cortexa7 24 4 7 0.000004992618899850428
+armv7neon_mmm_f32_8x6_generic 15 32 6 0.000004128781340215273
+armv7neon_mmm_f32_8x6_generic 15 32 18 0.000011342396294923752
+generic_f32_4x4 3 128 9 0.000011614205849736826
+armv7neon_mmm_f32_8x4_generic 7 32 9 0.000004862452767728157
+armv7neon_mmm_f32_8x4_cortexa9 17 4 7 0.000004308345799806778
+armv7neon_mmm_f32_8x6_cortexa9 16 32 13 0.000010598291750058129
+armv7neon_mmm_f32_8x4_generic 17 32 7 0.000008775628241420315
+armv7neon_mmm_f32_8x4_cortexa9 8 4 8 0.00000151354666080862
+armv7neon_mmm_f32_8x6_cortexa7 9 4 19 0.0000064774482704190535
+armv7neon_mmm_f32_8x4_cortexa9 9 4 4 0.000001610951102254284
+armv7neon_mmm_f32_8x4_cortexa7 7 128 3 0.00000588268473785512
+armv7neon_mmm_f32_8x4_cortexa9 9 128 7 0.00001614593088465898
+armv7neon_mmm_f32_8x6_cortexa9 9 128 6 0.000010694366815766805
+armv7neon_mmm_f32_8x6_cortexa9 17 4 5 0.000003000192050393217
+armv7neon_mmm_f32_8x4_cortexa9 8 32 4 0.0000017239720455885444
+armv7neon_mmm_f32_8x6_cortexa7 25 128 18 0.0000814675708223554
+generic_f32_4x4 12 128 7 0.000022191318720751985
+generic_f32_4x4 5 128 13 0.000029341360540777852
+generic_f32_4x4 7 4 5 0.000003063610185404798
+armv7neon_mmm_f32_8x4_cortexa7 23 32 12 0.000015790133040211727
+armv7neon_mmm_f32_8x6_cortexa7 17 128 13 0.00006139081303315256
+armv7neon_mmm_f32_8x4_cortexa9 8 32 12 0.000004268120941798172
+armv7neon_mmm_f32_8x6_generic 16 32 17 0.000010656142728555816
+generic_f32_4x4 13 4 8 0.000004925203028119758
+armv7neon_mmm_f32_8x6_generic 16 4 6 0.0000017414671485444925
+armv7neon_mmm_f32_8x6_generic 23 4 7 0.000005090461887505994
+armv7neon_mmm_f32_8x4_cortexa9 7 128 4 0.000004439163092495918
+armv7neon_mmm_f32_8x4_cortexa9 15 32 4 0.000003199516110418427
+generic_f32_4x4 13 4 9 0.000007521941079552681
+armv7neon_mmm_f32_8x4_generic 9 128 13 0.00003132672472959027
+armv7neon_mmm_f32_8x6_cortexa7 9 128 5 0.000014186965083572584
+armv7neon_mmm_f32_8x4_cortexa9 16 32 13 0.000010950995485777078
+armv7neon_mmm_f32_8x4_cortexa7 24 4 11 0.000006343455120468695
+armv7neon_mmm_f32_8x4_generic 23 32 12 0.000012518582750578088
+armv7neon_mmm_f32_8x4_cortexa9 23 4 5 0.000004357470789736493
+armv7neon_mmm_f32_8x6_generic 9 128 12 0.000020738827544116163
+armv7neon_mmm_f32_8x4_generic 7 128 8 0.000008441257822863349
+armv7neon_mmm_f32_8x6_generic 25 4 18 0.000008206815180054958
+armv7neon_mmm_f32_8x6_cortexa7 16 128 7 0.00002754330318348781
+armv7neon_mmm_f32_8x4_cortexa9 15 128 12 0.000023818850811398474
+armv7neon_mmm_f32_8x6_cortexa7 7 32 5 0.0000027477786822716826
+generic_f32_4x4 4 32 9 0.000004310962413140207
+armv7neon_mmm_f32_8x6_cortexa9 9 32 11 0.000007472405716714093
+generic_f32_4x4 9 4 7 0.0000042110601811121985
+armv7neon_mmm_f32_8x4_cortexa9 25 128 5 0.00003145000193606305
+armv7neon_mmm_f32_8x6_generic 16 128 7 0.000020961637268522957
+armv7neon_mmm_f32_8x6_cortexa7 8 128 18 0.00002061441681055432
+armv7neon_mmm_f32_8x6_cortexa7 15 32 11 0.000009542090957976056
+armv7neon_mmm_f32_8x4_cortexa9 8 32 8 0.0000030204030341535007
+armv7neon_mmm_f32_8x6_cortexa9 16 4 18 0.000004289256443600587
+armv7neon_mmm_f32_8x4_cortexa7 15 32 8 0.000007451057144596197
+armv7neon_mmm_f32_8x6_cortexa7 25 128 12 0.00005431857023175915
+armv7neon_mmm_f32_8x6_generic 7 128 11 0.000011099973931488493
+armv7neon_mmm_f32_8x4_cortexa9 16 128 11 0.000023648803168223167
+armv7neon_mmm_f32_8x6_cortexa9 17 4 12 0.0000044808063412694585
+armv7neon_mmm_f32_8x6_generic 25 32 7 0.000014053446208239492
+armv7neon_mmm_f32_8x6_generic 16 128 19 0.0000411706712809725
+armv7neon_mmm_f32_8x4_generic 16 32 13 0.000010937227005963747
+armv7neon_mmm_f32_8x4_cortexa7 8 32 3 0.000002329652194595725
+armv7neon_mmm_f32_8x4_cortexa9 23 4 11 0.000006308879061820964
+armv7neon_mmm_f32_8x6_cortexa7 16 4 7 0.0000035065261101064537
+armv7neon_mmm_f32_8x6_generic 9 128 6 0.000010599309996894829
+armv7neon_mmm_f32_8x4_cortexa7 24 128 11 0.000048035927729516165
+armv7neon_mmm_f32_8x6_generic 7 32 11 0.000004333152377602563
+armv7neon_mmm_f32_8x4_cortexa7 15 4 11 0.000004824968358151188
+armv7neon_mmm_f32_8x6_cortexa7 7 32 6 0.0000029036965940094368
+generic_f32_4x4 12 4 3 0.0000026010633583656195
+armv7neon_mmm_f32_8x6_cortexa9 8 32 18 0.000005373878070970815
+armv7neon_mmm_f32_8x4_generic 16 128 7 0.000016057054409709098
+armv7neon_mmm_f32_8x4_generic 17 128 5 0.000023713194238003484
+armv7neon_mmm_f32_8x6_generic 24 128 19 0.0000639824281045487
+generic_f32_4x4 5 4 13 0.000005260715034760787
+armv7neon_mmm_f32_8x6_cortexa9 16 128 19 0.000040932625772708764
+armv7neon_mmm_f32_8x4_generic 16 32 5 0.00000587187854127478
+armv7neon_mmm_f32_8x4_generic 15 32 4 0.0000032031359648686487
+armv7neon_mmm_f32_8x4_cortexa9 24 32 13 0.00001619115724351509
+armv7neon_mmm_f32_8x4_generic 15 32 8 0.000005976862649343733
+armv7neon_mmm_f32_8x6_cortexa9 16 32 7 0.000007329062657266122
+armv7neon_mmm_f32_8x4_cortexa9 25 32 7 0.00001147457512424339
+armv7neon_mmm_f32_8x4_cortexa9 24 128 3 0.000012480179872163079
+armv7neon_mmm_f32_8x6_generic 16 4 13 0.000004503173649426117
+armv7neon_mmm_f32_8x6_generic 16 4 19 0.000005734445256719681
+armv7neon_mmm_f32_8x4_cortexa7 24 4 8 0.000003952435244053743
+armv7neon_mmm_f32_8x6_cortexa9 9 32 6 0.000003904765036328731
+armv7neon_mmm_f32_8x6_cortexa7 23 4 13 0.000007856276222270553
+armv7neon_mmm_f32_8x6_cortexa9 15 4 6 0.000002182843988681052
+armv7neon_mmm_f32_8x6_generic 23 4 18 0.000007195841362691159
+armv7neon_mmm_f32_8x4_cortexa7 16 32 12 0.000010255406371017558
+armv7neon_mmm_f32_8x4_cortexa9 25 128 8 0.0000309762014941751
+armv7neon_mmm_f32_8x6_cortexa7 25 128 17 0.00008204444005539351
+armv7neon_mmm_f32_8x6_cortexa7 17 4 6 0.0000026268622339640763
+armv7neon_mmm_f32_8x4_cortexa9 15 32 9 0.000008859604622252251
+generic_f32_4x4 5 32 9 0.000008264557275071169
+generic_f32_4x4 13 32 5 0.000010823964577863127
+generic_f32_4x4 5 4 3 0.000001851289980714991
+armv7neon_mmm_f32_8x4_generic 15 128 4 0.000008225264190077282
+armv7neon_mmm_f32_8x6_generic 17 32 18 0.000015295586544658026
+armv7neon_mmm_f32_8x6_cortexa9 9 4 18 0.000004572833723975051
+armv7neon_mmm_f32_8x4_cortexa7 15 4 13 0.000006075307141015665
+armv7neon_mmm_f32_8x6_cortexa7 17 32 19 0.000025630924279728742
+generic_f32_4x4 7 32 7 0.0000058385014382941885
+armv7neon_mmm_f32_8x4_cortexa9 16 128 7 0.000016054999854639244
+armv7neon_mmm_f32_8x4_cortexa7 25 4 4 0.0000029326008363454604
+armv7neon_mmm_f32_8x6_cortexa7 16 128 12 0.000027204121533574832
+armv7neon_mmm_f32_8x4_generic 15 128 3 0.000008426038232256742
+armv7neon_mmm_f32_8x4_cortexa9 7 32 5 0.0000033516027030490246
+armv7neon_mmm_f32_8x4_cortexa7 16 4 7 0.0000032624014554442194
+armv7neon_mmm_f32_8x4_cortexa9 16 128 3 0.000008462143402947945
+armv7neon_mmm_f32_8x4_generic 25 4 7 0.000005557466852852917
+armv7neon_mmm_f32_8x4_generic 23 32 4 0.000004495607981128392
+armv7neon_mmm_f32_8x4_cortexa9 15 4 12 0.000004291414852711907
+generic_f32_4x4 4 4 13 0.0000027727007811134473
+armv7neon_mmm_f32_8x4_generic 16 4 8 0.0000025796442872451462
+armv7neon_mmm_f32_8x4_cortexa9 23 4 9 0.000006179316744537218
+armv7neon_mmm_f32_8x4_cortexa7 25 32 3 0.000007764249905705397
+armv7neon_mmm_f32_8x6_generic 17 32 7 0.000010687826852062675
+generic_f32_4x4 13 4 7 0.000005399817435719611
+generic_f32_4x4 12 4 4 0.0000021094685699886824
+armv7neon_mmm_f32_8x6_cortexa9 7 4 17 0.0000034025163658270913
+generic_f32_4x4 4 4 9 0.0000022359497586694876
+armv7neon_mmm_f32_8x6_cortexa9 17 128 17 0.00004660020476545699
+armv7neon_mmm_f32_8x4_generic 15 4 12 0.00000428385702551851
+armv7neon_mmm_f32_8x4_generic 9 4 11 0.000004185007110545824
+armv7neon_mmm_f32_8x6_cortexa9 24 128 18 0.000045657972165917084
+armv7neon_mmm_f32_8x6_cortexa9 15 4 19 0.000007027336593004323
+armv7neon_mmm_f32_8x4_cortexa9 9 128 9 0.00002367365046977455
+armv7neon_mmm_f32_8x4_cortexa9 16 4 9 0.000003960113146754688
+armv7neon_mmm_f32_8x4_cortexa7 8 32 9 0.000005561279017749759
+armv7neon_mmm_f32_8x4_cortexa9 23 32 3 0.000004897369194486712
+armv7neon_mmm_f32_8x4_cortexa9 16 32 3 0.0000034377989421061634
+armv7neon_mmm_f32_8x6_generic 25 32 5 0.00000775592482259445
+armv7neon_mmm_f32_8x4_cortexa9 9 128 11 0.000023733495515623523
+armv7neon_mmm_f32_8x4_generic 23 32 7 0.000008935361824557136
+armv7neon_mmm_f32_8x4_cortexa7 7 128 9 0.00001664390737173506
+generic_f32_4x4 5 4 8 0.0000028477215638418112
+armv7neon_mmm_f32_8x6_generic 23 4 19 0.000009557906532901389
+armv7neon_mmm_f32_8x6_cortexa9 16 128 12 0.000020636825667982075
+armv7neon_mmm_f32_8x6_generic 9 32 11 0.000007421682594966975
+armv7neon_mmm_f32_8x4_generic 23 128 13 0.000047353976390946026
+armv7neon_mmm_f32_8x4_cortexa7 17 4 7 0.000004622886247456753
+armv7neon_mmm_f32_8x6_cortexa9 8 128 5 0.000005754129281131009
+armv7neon_mmm_f32_8x4_cortexa7 9 128 11 0.000032319021391833177
+armv7neon_mmm_f32_8x4_cortexa9 24 4 9 0.000005698490674755626
+armv7neon_mmm_f32_8x6_generic 17 4 18 0.000006326121915439696
+armv7neon_mmm_f32_8x4_cortexa9 16 128 12 0.000023196293109823396
+armv7neon_mmm_f32_8x4_cortexa9 25 32 9 0.000016534317998128363
+armv7neon_mmm_f32_8x6_cortexa7 24 4 5 0.0000032362274636792855
+armv7neon_mmm_f32_8x6_cortexa7 9 32 13 0.000013201464669659568
+armv7neon_mmm_f32_8x4_generic 25 128 5 0.000031432252054608495
+armv7neon_mmm_f32_8x4_cortexa7 16 32 8 0.000007004694197132091
+armv7neon_mmm_f32_8x4_cortexa7 25 128 8 0.00004234315278499361
+armv7neon_mmm_f32_8x6_cortexa9 25 128 11 0.00004162498921103509
+armv7neon_mmm_f32_8x4_cortexa9 17 4 11 0.000005990145971795821
+armv7neon_mmm_f32_8x6_generic 7 128 5 0.000005711586442858516
+generic_f32_4x4 5 32 7 0.000005745084475312582
+armv7neon_mmm_f32_8x4_cortexa7 17 4 3 0.0000027778998365925647
+armv7neon_mmm_f32_8x6_generic 7 32 17 0.000006323975084622719
+armv7neon_mmm_f32_8x4_cortexa7 24 32 4 0.0000053627426905310905
+armv7neon_mmm_f32_8x6_cortexa9 7 128 13 0.000016465054999968026
+armv7neon_mmm_f32_8x6_cortexa9 25 32 17 0.000021029276181809713
+armv7neon_mmm_f32_8x6_generic 7 4 18 0.0000035040816774469106
+armv7neon_mmm_f32_8x6_cortexa7 17 128 12 0.00004084848200658872
+armv7neon_mmm_f32_8x6_cortexa7 16 128 11 0.000027707012596319906
+armv7neon_mmm_f32_8x6_cortexa7 8 128 12 0.0000138330433065325
+armv7neon_mmm_f32_8x4_cortexa9 7 32 9 0.000004837549683885907
+armv7neon_mmm_f32_8x4_generic 7 32 7 0.000003409099607922408
+armv7neon_mmm_f32_8x6_cortexa7 16 4 5 0.0000023317089411955496
+armv7neon_mmm_f32_8x6_cortexa7 7 32 17 0.000007581121899800583
+armv7neon_mmm_f32_8x4_cortexa7 16 4 9 0.0000042985642379541856
+armv7neon_mmm_f32_8x6_generic 16 4 7 0.0000032960593218992506
+armv7neon_mmm_f32_8x4_generic 15 32 5 0.0000060704297110345465
+armv7neon_mmm_f32_8x6_cortexa9 24 32 6 0.00000538028834244819
+armv7neon_mmm_f32_8x6_generic 25 32 18 0.0000200993477092098
+armv7neon_mmm_f32_8x6_generic 23 128 6 0.00001591506566691279
+armv7neon_mmm_f32_8x6_generic 16 32 13 0.00001049572423866497
+armv7neon_mmm_f32_8x4_cortexa7 15 32 5 0.000007565304970371118
+generic_f32_4x4 12 32 7 0.000008258205556820225
+armv7neon_mmm_f32_8x4_cortexa7 17 4 9 0.000006349279290785205
+generic_f32_4x4 8 128 5 0.000014938883570524155
+armv7neon_mmm_f32_8x4_generic 15 32 3 0.0000034015953691065083
+armv7neon_mmm_f32_8x6_cortexa7 7 4 18 0.000003692100644999123
+generic_f32_4x4 8 128 7 0.000015006901203342376
+generic_f32_4x4 4 4 3 0.0000012181206340328286
+generic_f32_4x4 12 32 5 0.000008206014399831754
+armv7neon_mmm_f32_8x6_cortexa9 8 128 19 0.000020717968248585576
+generic_f32_4x4 5 32 3 0.000003225094305304002
+armv7neon_mmm_f32_8x6_generic 24 4 18 0.000006021495896771489
+armv7neon_mmm_f32_8x6_cortexa7 16 128 13 0.00004093551587387655
+armv7neon_mmm_f32_8x4_cortexa9 15 128 8 0.00001604448990415154
+armv7neon_mmm_f32_8x6_generic 9 32 13 0.000010651318368732651
+armv7neon_mmm_f32_8x4_cortexa7 8 32 8 0.000003772107419321899
+generic_f32_4x4 9 32 3 0.000004621846936980697
+armv7neon_mmm_f32_8x4_cortexa9 16 32 5 0.000005906329928132943
+armv7neon_mmm_f32_8x6_cortexa9 16 4 6 0.000001808587040184933
+armv7neon_mmm_f32_8x6_cortexa9 9 32 13 0.000010778660548919894
+armv7neon_mmm_f32_8x4_cortexa9 16 32 4 0.0000030105150384631114
+armv7neon_mmm_f32_8x6_generic 17 4 17 0.000006838981837746129
+armv7neon_mmm_f32_8x6_cortexa7 7 128 19 0.000028499453971524397
+armv7neon_mmm_f32_8x6_cortexa7 23 32 17 0.000020319754403004648
+armv7neon_mmm_f32_8x6_generic 9 32 17 0.000010765407372129566
+armv7neon_mmm_f32_8x4_generic 23 4 13 0.000008003285656461567
+armv7neon_mmm_f32_8x4_generic 17 4 4 0.000002161502194915875
+armv7neon_mmm_f32_8x4_generic 23 4 11 0.000006323737070277622
+armv7neon_mmm_f32_8x4_generic 9 4 12 0.00000394067683542442
+armv7neon_mmm_f32_8x4_cortexa7 23 128 4 0.00001636523555605817
+armv7neon_mmm_f32_8x4_cortexa9 17 32 12 0.000012187305627907507
+armv7neon_mmm_f32_8x4_generic 25 32 5 0.000011305023296577953
+armv7neon_mmm_f32_8x4_cortexa7 23 128 13 0.0000641420493251813
+armv7neon_mmm_f32_8x6_cortexa9 17 4 13 0.0000067882601578195375
+armv7neon_mmm_f32_8x6_cortexa9 24 32 13 0.00001554482246687919
+armv7neon_mmm_f32_8x4_cortexa7 15 128 12 0.00003239866815806022
+armv7neon_mmm_f32_8x6_cortexa9 23 4 13 0.000007410621365009766
+armv7neon_mmm_f32_8x4_cortexa9 17 128 5 0.000023718649742852314
+armv7neon_mmm_f32_8x6_cortexa9 15 32 13 0.000011397983577281118
+armv7neon_mmm_f32_8x6_cortexa7 15 32 5 0.0000050262586648579474
+armv7neon_mmm_f32_8x4_cortexa9 16 128 9 0.000023527233248412315
+armv7neon_mmm_f32_8x6_cortexa7 17 4 17 0.000007344312997274551
+armv7neon_mmm_f32_8x6_cortexa7 15 128 13 0.00004174083709909147
+armv7neon_mmm_f32_8x4_cortexa9 24 128 11 0.00003518379984445077
+armv7neon_mmm_f32_8x6_cortexa7 15 4 6 0.0000022388984932072717
+armv7neon_mmm_f32_8x6_generic 17 32 11 0.000010862418538636322
+armv7neon_mmm_f32_8x6_cortexa9 17 128 19 0.00006205059984963943
+armv7neon_mmm_f32_8x6_cortexa7 24 128 7 0.00004110240285622754
+armv7neon_mmm_f32_8x6_generic 25 32 11 0.00001430805053910281
+armv7neon_mmm_f32_8x4_cortexa9 23 128 11 0.00003562503787212338
+armv7neon_mmm_f32_8x4_generic 8 128 13 0.00001577265144197705
+armv7neon_mmm_f32_8x4_cortexa9 15 32 3 0.0000034017274319352206
+generic_f32_4x4 13 32 3 0.000005959878460430609
+armv7neon_mmm_f32_8x4_cortexa9 8 32 9 0.000004432308412032044
+armv7neon_mmm_f32_8x4_cortexa7 8 4 5 0.0000018214760004994945
+armv7neon_mmm_f32_8x4_generic 16 4 5 0.0000029127359283609326
+armv7neon_mmm_f32_8x4_cortexa7 8 32 11 0.000005611833179795977
+armv7neon_mmm_f32_8x4_generic 17 128 12 0.00003481391410825688
+armv7neon_mmm_f32_8x6_cortexa7 17 4 19 0.00000928821589263059
+armv7neon_mmm_f32_8x6_generic 8 4 19 0.00000313513885546113
+generic_f32_4x4 11 32 11 0.000012240954381721343
+armv7neon_mmm_f32_8x4_cortexa9 9 128 8 0.000015834005248731945
+armv7neon_mmm_f32_8x6_generic 25 128 7 0.00004137887977916487
+armv7neon_mmm_f32_8x6_cortexa7 25 32 13 0.000025709475387022286
+armv7neon_mmm_f32_8x6_cortexa7 24 32 5 0.000007312970559712554
+armv7neon_mmm_f32_8x4_cortexa9 7 32 3 0.0000019121426586981612
+armv7neon_mmm_f32_8x4_cortexa9 8 4 12 0.0000020507155861457753
+armv7neon_mmm_f32_8x4_cortexa9 25 128 4 0.000015694825557140796
+armv7neon_mmm_f32_8x6_cortexa9 9 4 17 0.0000048654285121987666
+armv7neon_mmm_f32_8x4_generic 15 32 13 0.000011594181812430167
+generic_f32_4x4 8 4 5 0.0000029234671714958714
+armv7neon_mmm_f32_8x6_generic 25 128 18 0.00006398210071667638
+armv7neon_mmm_f32_8x4_generic 25 32 7 0.000011481869355677313
+armv7neon_mmm_f32_8x6_cortexa7 15 32 13 0.000013835762311238379
+armv7neon_mmm_f32_8x6_generic 7 32 7 0.000004270239691930375
+armv7neon_mmm_f32_8x4_cortexa9 24 4 12 0.0000052513903304867065
+armv7neon_mmm_f32_8x4_cortexa7 7 128 13 0.000022090489564676277
+armv7neon_mmm_f32_8x4_cortexa7 9 32 7 0.0000074510499136902
+armv7neon_mmm_f32_8x4_generic 15 128 11 0.00002405185440078953
+armv7neon_mmm_f32_8x6_cortexa7 24 128 18 0.00006059904552298362
+armv7neon_mmm_f32_8x4_cortexa9 24 4 8 0.0000036249029546823233
+armv7neon_mmm_f32_8x4_generic 17 4 5 0.000004186211638893127
+armv7neon_mmm_f32_8x4_cortexa7 15 128 5 0.00002190540588603946
+armv7neon_mmm_f32_8x6_cortexa7 9 32 5 0.000004913938240718097
+armv7neon_mmm_f32_8x4_cortexa7 25 128 4 0.000021428570118948462
+armv7neon_mmm_f32_8x6_generic 9 4 17 0.000004759335274456197
+armv7neon_mmm_f32_8x4_cortexa9 23 32 5 0.000008792076478618836
+armv7neon_mmm_f32_8x4_cortexa9 17 128 13 0.00004665611088212803
+generic_f32_4x4 11 128 12 0.00003275274568553372
+armv7neon_mmm_f32_8x6_cortexa7 7 128 12 0.000014620372931092478
+armv7neon_mmm_f32_8x6_generic 24 128 13 0.000046299674326571746
+generic_f32_4x4 12 4 9 0.000005632703760211813
+armv7neon_mmm_f32_8x6_cortexa7 15 32 6 0.000004965054605236343
+armv7neon_mmm_f32_8x6_cortexa9 7 128 17 0.000016538122105202102
+armv7neon_mmm_f32_8x6_cortexa9 17 128 7 0.00003112401925909139
+armv7neon_mmm_f32_8x4_cortexa7 7 32 5 0.0000040832664009078
+armv7neon_mmm_f32_8x4_cortexa9 25 4 3 0.000003352094433744027
+armv7neon_mmm_f32_8x4_cortexa7 17 4 4 0.0000023752236037367475
+armv7neon_mmm_f32_8x4_cortexa9 8 4 4 0.000000986252516977325
+armv7neon_mmm_f32_8x6_cortexa7 9 4 5 0.0000022147026524486433
+armv7neon_mmm_f32_8x6_cortexa9 24 4 13 0.000006569367159583456
+armv7neon_mmm_f32_8x6_cortexa9 25 32 13 0.000020818742897252734
+armv7neon_mmm_f32_8x4_cortexa9 15 32 12 0.000008751598473767715
+armv7neon_mmm_f32_8x4_cortexa7 7 4 4 0.000001249360695162341
+armv7neon_mmm_f32_8x6_generic 25 4 19 0.000011234098334898354
+generic_f32_4x4 7 4 9 0.000004262590318570047
+generic_f32_4x4 12 128 9 0.000032724926249473206
+armv7neon_mmm_f32_8x4_cortexa7 23 4 11 0.0000067800221845095914
+armv7neon_mmm_f32_8x4_cortexa7 23 4 13 0.00000857826635505948
+armv7neon_mmm_f32_8x4_generic 7 32 3 0.0000019119582227554076
+armv7neon_mmm_f32_8x6_cortexa9 25 4 12 0.000005714183543712814
+generic_f32_4x4 7 128 8 0.000014963178444176004
+armv7neon_mmm_f32_8x6_generic 23 4 11 0.000005289174016444985
+armv7neon_mmm_f32_8x4_generic 15 4 7 0.000003212917021733202
+armv7neon_mmm_f32_8x4_cortexa9 17 4 4 0.000002157304549786207
+armv7neon_mmm_f32_8x4_cortexa7 25 4 8 0.000005296217018377082
+armv7neon_mmm_f32_8x4_cortexa7 16 4 8 0.000002789622045679376
+armv7neon_mmm_f32_8x4_cortexa9 9 32 11 0.00000862398264754348
+armv7neon_mmm_f32_8x6_generic 16 4 18 0.000004182651246093671
+armv7neon_mmm_f32_8x4_cortexa9 7 4 7 0.0000019258225946178918
+armv7neon_mmm_f32_8x4_generic 7 32 13 0.0000063109264345399555
+armv7neon_mmm_f32_8x6_generic 15 32 7 0.000007649176449323901
+armv7neon_mmm_f32_8x4_cortexa7 9 4 13 0.00000566559429146675
+generic_f32_4x4 11 4 13 0.00000768804413911549
+armv7neon_mmm_f32_8x6_cortexa7 15 128 19 0.000055484906874277455
+armv7neon_mmm_f32_8x6_generic 8 128 19 0.000020785294814215668
+armv7neon_mmm_f32_8x4_cortexa7 23 32 7 0.000011137926376204002
+armv7neon_mmm_f32_8x4_cortexa9 25 32 4 0.000005634291322373054
+armv7neon_mmm_f32_8x4_cortexa7 16 4 11 0.0000043943657333441155
+generic_f32_4x4 3 128 4 0.000004206283247834624
+armv7neon_mmm_f32_8x6_cortexa9 23 32 17 0.000016635289447796483
+armv7neon_mmm_f32_8x6_cortexa7 23 4 5 0.0000032303127776050868
+generic_f32_4x4 7 128 7 0.000015111291354960824
+armv7neon_mmm_f32_8x6_cortexa7 8 128 19 0.000027387915918929853
+armv7neon_mmm_f32_8x4_cortexa7 24 32 9 0.000015681272955767534
+armv7neon_mmm_f32_8x6_cortexa7 15 32 7 0.000009363971820493224
+armv7neon_mmm_f32_8x6_generic 15 4 17 0.000005464908042174986
+generic_f32_4x4 11 4 4 0.000002266320385947867
+armv7neon_mmm_f32_8x4_cortexa7 7 4 5 0.0000019827556786453945
+armv7neon_mmm_f32_8x4_generic 7 4 5 0.0000018753720795687383
+armv7neon_mmm_f32_8x6_cortexa7 23 32 19 0.000026555363914009545
+generic_f32_4x4 12 32 11 0.000011904804943326893
+generic_f32_4x4 9 32 4 0.000004278605538448279
+armv7neon_mmm_f32_8x4_cortexa9 9 4 3 0.0000018374246302226002
+armv7neon_mmm_f32_8x4_generic 17 128 11 0.00003530676669688071
+armv7neon_mmm_f32_8x4_cortexa7 23 32 5 0.000011014184598781204
+armv7neon_mmm_f32_8x4_cortexa7 8 128 8 0.000010889817517963246
+armv7neon_mmm_f32_8x4_generic 24 128 5 0.000023658482563487358
+armv7neon_mmm_f32_8x4_generic 23 4 4 0.0000022865778486536074
+armv7neon_mmm_f32_8x6_cortexa7 16 32 7 0.000008963466531834949
+armv7neon_mmm_f32_8x4_cortexa7 16 128 11 0.00003220406926334493
+armv7neon_mmm_f32_8x4_cortexa9 7 4 9 0.000002612401333065891
+armv7neon_mmm_f32_8x6_generic 24 128 18 0.00004570910076801179
+armv7neon_mmm_f32_8x6_cortexa7 15 32 19 0.00001825949129411215
+armv7neon_mmm_f32_8x4_cortexa7 7 128 11 0.00001669033881154567
+armv7neon_mmm_f32_8x6_cortexa7 17 128 17 0.00006152323359147728
+armv7neon_mmm_f32_8x6_generic 23 4 12 0.0000049613667404571205
+armv7neon_mmm_f32_8x6_cortexa9 17 4 6 0.0000025304413562385254
+armv7neon_mmm_f32_8x6_generic 25 32 13 0.000020634077057855525
+generic_f32_4x4 3 128 8 0.000007895048382688004
+armv7neon_mmm_f32_8x6_generic 7 128 6 0.000005880420229157795
+armv7neon_mmm_f32_8x6_cortexa7 7 4 6 0.0000015435048498918165
+armv7neon_mmm_f32_8x6_cortexa9 15 4 12 0.0000038035548284711065
+armv7neon_mmm_f32_8x4_generic 16 128 13 0.00003105986968794038
+armv7neon_mmm_f32_8x6_generic 17 128 6 0.00001561934075445538
+armv7neon_mmm_f32_8x6_generic 23 32 5 0.000006038048456367719
+generic_f32_4x4 3 4 8 0.0000018887559388867697
+armv7neon_mmm_f32_8x6_cortexa7 17 128 5 0.000021122166031945488
+armv7neon_mmm_f32_8x4_cortexa7 17 4 12 0.000005984255865199454
+armv7neon_mmm_f32_8x6_cortexa9 8 4 17 0.00000262350755833462
+armv7neon_mmm_f32_8x6_generic 24 32 17 0.000015705211312591065
+armv7neon_mmm_f32_8x6_generic 23 32 7 0.00001103425525544337
+armv7neon_mmm_f32_8x6_generic 23 128 18 0.00004698363257869806
+armv7neon_mmm_f32_8x4_cortexa7 9 128 4 0.000010997242917041898
+generic_f32_4x4 12 32 9 0.000011855594903877275
+armv7neon_mmm_f32_8x4_cortexa7 8 32 4 0.000002107136592547487
+armv7neon_mmm_f32_8x4_cortexa7 9 32 4 0.0000038494164534325605
+generic_f32_4x4 8 32 4 0.0000029596097342538777
+armv7neon_mmm_f32_8x6_cortexa9 7 4 13 0.0000033183395006378194
+armv7neon_mmm_f32_8x4_cortexa9 7 128 5 0.00000837292374457436
+armv7neon_mmm_f32_8x6_cortexa7 9 32 12 0.000008879244649291164
+armv7neon_mmm_f32_8x6_cortexa9 23 32 5 0.00000608726564780247
+generic_f32_4x4 4 4 11 0.000002253269465041155
+armv7neon_mmm_f32_8x6_generic 15 4 12 0.000003737614965115773
+armv7neon_mmm_f32_8x4_cortexa7 8 32 13 0.000007179427212160366
+armv7neon_mmm_f32_8x6_generic 7 4 7 0.00000227244051714339
+armv7neon_mmm_f32_8x6_generic 7 128 13 0.000016430484681121403
+armv7neon_mmm_f32_8x4_cortexa9 25 4 13 0.000009867842979190694
+armv7neon_mmm_f32_8x4_cortexa9 24 32 7 0.000008714828341166468
+armv7neon_mmm_f32_8x6_generic 7 128 12 0.00001125899429533851
+armv7neon_mmm_f32_8x4_cortexa7 24 128 3 0.000016779227724494087
+armv7neon_mmm_f32_8x6_cortexa7 25 32 11 0.000017700250695390733
+armv7neon_mmm_f32_8x6_cortexa9 8 128 17 0.00001578057628845984
+armv7neon_mmm_f32_8x6_cortexa9 15 128 18 0.00003176376886582956
+armv7neon_mmm_f32_8x6_generic 7 32 18 0.000006481310483268657
+armv7neon_mmm_f32_8x6_generic 24 32 19 0.000020260301856480532
+armv7neon_mmm_f32_8x6_cortexa7 25 4 7 0.000006556992855989281
+armv7neon_mmm_f32_8x4_cortexa7 9 4 8 0.0000030004666340701197
+armv7neon_mmm_f32_8x6_generic 8 4 13 0.0000025029052985754286
+armv7neon_mmm_f32_8x4_cortexa7 16 32 13 0.00001386814401860724
+armv7neon_mmm_f32_8x4_generic 16 128 3 0.000008461514063719399
+generic_f32_4x4 8 32 5 0.000005666217550078829
+armv7neon_mmm_f32_8x4_generic 17 32 9 0.000012567178399696468
+armv7neon_mmm_f32_8x4_cortexa7 15 128 9 0.00003252514906200416
+armv7neon_mmm_f32_8x6_generic 25 128 13 0.00006413295578186804
+armv7neon_mmm_f32_8x6_generic 7 4 11 0.0000023405037923286433
+generic_f32_4x4 7 32 12 0.000008239110132919408
+armv7neon_mmm_f32_8x6_cortexa9 23 128 6 0.00001597150399238562
+armv7neon_mmm_f32_8x4_generic 25 4 13 0.000009863674729272788
+armv7neon_mmm_f32_8x4_cortexa7 16 128 5 0.00002172651909764639
+armv7neon_mmm_f32_8x6_cortexa7 15 32 12 0.000009468415752921276
+armv7neon_mmm_f32_8x6_cortexa9 24 4 11 0.000004968600626897013
+armv7neon_mmm_f32_8x4_generic 24 128 13 0.00004635289274725685
+armv7neon_mmm_f32_8x6_generic 23 128 5 0.000016196412852382603
+armv7neon_mmm_f32_8x6_cortexa7 16 32 13 0.000013024830178376121
+armv7neon_mmm_f32_8x4_cortexa9 9 32 3 0.0000033151175658832473
+armv7neon_mmm_f32_8x6_generic 7 4 13 0.000003265379138781528
+armv7neon_mmm_f32_8x4_cortexa9 17 32 13 0.000016459192383496957
+armv7neon_mmm_f32_8x6_cortexa7 24 32 13 0.000019272536127977423
+generic_f32_4x4 4 32 3 0.0000019007760051192194
+generic_f32_4x4 9 128 8 0.000021954328754919918
+armv7neon_mmm_f32_8x4_cortexa7 8 32 12 0.000005371382475600821
+generic_f32_4x4 7 4 3 0.000001903414095718297
+armv7neon_mmm_f32_8x6_generic 24 4 7 0.000004666010098240403
+armv7neon_mmm_f32_8x4_cortexa9 16 32 8 0.000005563656358158919
+generic_f32_4x4 3 32 9 0.000004602770059818119
+armv7neon_mmm_f32_8x4_generic 9 32 13 0.000011194068458304182
+armv7neon_mmm_f32_8x4_cortexa9 15 32 11 0.000008943973907120027
+generic_f32_4x4 11 128 7 0.000022328923764966492
+armv7neon_mmm_f32_8x6_generic 8 128 5 0.000005731544283554465
+armv7neon_mmm_f32_8x6_cortexa9 8 128 11 0.00001076946555010939
+armv7neon_mmm_f32_8x4_cortexa7 7 128 5 0.000011229094508072489
+armv7neon_mmm_f32_8x4_cortexa9 15 128 9 0.000023961524252030835
+generic_f32_4x4 11 128 5 0.000022274598932900346
+generic_f32_4x4 5 128 3 0.000007857636386132928
+generic_f32_4x4 4 128 7 0.00000774651849385228
+armv7neon_mmm_f32_8x4_cortexa9 15 128 5 0.000016175167728379695
+armv7neon_mmm_f32_8x6_cortexa7 17 4 5 0.0000031159889361188713
+generic_f32_4x4 12 4 11 0.000005685376438998431
+armv7neon_mmm_f32_8x4_cortexa7 8 32 5 0.000003934735618425319
+generic_f32_4x4 4 32 4 0.0000017328412461691005
+armv7neon_mmm_f32_8x4_cortexa7 8 128 11 0.000016332637899779713
+armv7neon_mmm_f32_8x6_cortexa9 15 32 17 0.000011527836971780865
+armv7neon_mmm_f32_8x4_generic 15 4 9 0.000004398255246018968
+armv7neon_mmm_f32_8x4_cortexa9 15 32 8 0.000005965121319547414
+armv7neon_mmm_f32_8x6_cortexa7 8 4 11 0.0000020902150325449643
+armv7neon_mmm_f32_8x6_cortexa7 15 4 13 0.000005648201530441972
+armv7neon_mmm_f32_8x6_cortexa9 23 128 18 0.00004690308298866514
+armv7neon_mmm_f32_8x6_cortexa7 17 128 18 0.00006100420785992372
+generic_f32_4x4 3 32 3 0.000001876030465879534
+generic_f32_4x4 4 128 8 0.000007586370052411088
+armv7neon_mmm_f32_8x6_cortexa9 8 128 18 0.000015532022218663232
+armv7neon_mmm_f32_8x4_cortexa7 17 4 11 0.000006454744732596089
+armv7neon_mmm_f32_8x6_cortexa9 15 4 11 0.0000038999665208551116
+armv7neon_mmm_f32_8x6_cortexa9 25 128 17 0.00006218409021295646
+armv7neon_mmm_f32_8x6_cortexa7 9 128 7 0.000027592791728379908
+armv7neon_mmm_f32_8x4_cortexa7 17 128 7 0.0000324174579838398
+armv7neon_mmm_f32_8x6_generic 17 32 13 0.00001566115346007812
+generic_f32_4x4 9 4 4 0.0000022019063671444208
+armv7neon_mmm_f32_8x4_cortexa9 17 128 11 0.0000353036171900936
+armv7neon_mmm_f32_8x4_cortexa7 17 32 12 0.0000154170992202297
+generic_f32_4x4 4 4 12 0.0000020915219079462592
+armv7neon_mmm_f32_8x4_generic 16 4 3 0.000001964189345127758
+generic_f32_4x4 11 4 3 0.0000025875707471480883
+armv7neon_mmm_f32_8x4_cortexa7 24 32 8 0.000010240674992864977
+armv7neon_mmm_f32_8x4_cortexa7 24 4 7 0.00000464383472294094
+armv7neon_mmm_f32_8x6_generic 24 128 17 0.00004653762565754179
+armv7neon_mmm_f32_8x4_cortexa9 9 128 5 0.000015984472800286198
+armv7neon_mmm_f32_8x6_cortexa9 17 32 13 0.000015793322899399953
+armv7neon_mmm_f32_8x4_cortexa7 23 128 9 0.00004832080937818461
+armv7neon_mmm_f32_8x6_generic 17 4 12 0.0000043886752215018795
+armv7neon_mmm_f32_8x6_generic 24 4 17 0.000006718970310010203
+armv7neon_mmm_f32_8x6_cortexa9 23 4 12 0.000005053037256901505
+armv7neon_mmm_f32_8x4_generic 25 128 13 0.00006314133455535656
+generic_f32_4x4 9 128 7 0.000022234980219629062
+armv7neon_mmm_f32_8x6_cortexa7 7 128 7 0.000014376478927807395
+armv7neon_mmm_f32_8x6_cortexa7 16 128 5 0.0000143212403106337
+armv7neon_mmm_f32_8x6_generic 25 32 6 0.000007045905360966285
+armv7neon_mmm_f32_8x6_generic 25 32 17 0.000020892213993308648
+armv7neon_mmm_f32_8x6_cortexa9 24 32 7 0.000010711106258433335
+armv7neon_mmm_f32_8x4_cortexa7 15 32 7 0.000007644594488957809
+armv7neon_mmm_f32_8x4_generic 9 32 7 0.0000059612924557323865
+armv7neon_mmm_f32_8x4_cortexa7 24 32 5 0.000010813567327083286
+generic_f32_4x4 11 32 9 0.000012181640310257489
+armv7neon_mmm_f32_8x4_generic 8 32 11 0.0000044856528446547376
+armv7neon_mmm_f32_8x6_cortexa7 8 128 5 0.00000740698393210008
+armv7neon_mmm_f32_8x6_generic 9 4 13 0.00000467415217244372
+armv7neon_mmm_f32_8x4_cortexa9 8 32 5 0.0000031557846756887957
+armv7neon_mmm_f32_8x6_generic 9 32 6 0.000003833293115622011
+generic_f32_4x4 5 128 11 0.000022197811800866368
+armv7neon_mmm_f32_8x4_cortexa7 8 4 8 0.000001630228451378558
+armv7neon_mmm_f32_8x6_cortexa7 7 128 18 0.00002168993830362326
+armv7neon_mmm_f32_8x4_generic 16 4 7 0.000003012988093329698
+armv7neon_mmm_f32_8x6_generic 9 32 5 0.0000040586320817732015
+armv7neon_mmm_f32_8x4_generic 8 128 11 0.000012020324272970633
+armv7neon_mmm_f32_8x4_cortexa9 9 4 13 0.00000525401614258308
+armv7neon_mmm_f32_8x6_cortexa9 17 32 5 0.000005959198344084862
+armv7neon_mmm_f32_8x6_cortexa9 25 128 7 0.00004136188393626292
+armv7neon_mmm_f32_8x6_cortexa7 8 4 6 0.0000011677036715350285
+armv7neon_mmm_f32_8x6_cortexa7 7 32 11 0.000005167860829086469
+armv7neon_mmm_f32_8x4_generic 24 4 11 0.000005847812768700571
+armv7neon_mmm_f32_8x4_generic 17 128 8 0.0000233987853798507
+armv7neon_mmm_f32_8x4_cortexa9 7 128 8 0.000008435297227106425
+armv7neon_mmm_f32_8x4_generic 8 4 4 0.000000981928574441578
+generic_f32_4x4 13 32 8 0.000010469226231367714
+generic_f32_4x4 12 4 7 0.000004168142600643075
+armv7neon_mmm_f32_8x6_generic 25 128 5 0.000021400005561991285
+armv7neon_mmm_f32_8x6_cortexa9 23 128 19 0.00006309986803011645
+armv7neon_mmm_f32_8x6_cortexa7 17 32 5 0.000007188792393610533
+armv7neon_mmm_f32_8x6_cortexa9 7 32 17 0.00000637568429598606
+armv7neon_mmm_f32_8x6_cortexa9 25 32 19 0.000027309010551769435
+armv7neon_mmm_f32_8x4_cortexa7 7 32 8 0.000004145572586004413
+armv7neon_mmm_f32_8x4_cortexa9 24 32 9 0.000012468417684719209
+armv7neon_mmm_f32_8x6_cortexa7 23 128 7 0.00004157442562492112
diff --git a/vendor/tract-linalg-0.22.1/src/arm64.rs b/vendor/tract-linalg-0.22.1/src/arm64.rs
new file mode 100644
index 000000000..f44c38a63
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64.rs
@@ -0,0 +1,383 @@
+#![allow(clippy::excessive_precision)]
+#[cfg(any(target_os = "macos", all(target_os = "ios", feature = "apple-amx-ios")))]
+mod apple_amx;
+mod arm64simd;
+pub mod cortex_a53;
+mod cortex_a55;
+//mod cortex_a72;
+//mod cortex_a73;
+pub use arm64simd::*;
+
+#[cfg(not(feature = "no_fp16"))]
+pub mod arm64fp16;
+#[cfg(not(feature = "no_fp16"))]
+pub use arm64fp16::*;
+
+use crate::f16;
+use crate::{BinOp, DatumType, LinalgRegistry, Ops};
+
+use crate::frame::by_scalar::ByScalarKer;
+use crate::frame::element_wise::ElementWiseKer;
+use crate::frame::reduce::{MapReduceKer, ReduceKer};
+use crate::frame::unicast::UnicastKer;
+
+// https://en.wikipedia.org/wiki/Comparison_of_ARMv8-A_cores
+const PART_A53: &str = "0xd03";
+const PART_A55: &str = "0xd05";
+#[allow(dead_code)]
+const PART_A72: &str = "0xd08";
+#[allow(dead_code)]
+const PART_A73: &str = "0xd09";
+#[allow(dead_code)]
+const PART_A75: &str = "0xd0a";
+#[allow(dead_code)]
+const PART_NEOVERSE_N1: &str = "0xd0c";
+#[allow(dead_code)]
+const PART_NEOVERSE_N2: &str = "0xd49";
+#[allow(dead_code)]
+const PART_NEOVERSE_N3: &str = "0xd8e";
+#[allow(dead_code)]
+const PART_NEOVERSE_V1: &str = "0xd40";
+#[allow(dead_code)]
+const PART_NEOVERSE_V2: &str = "0xd4f";
+#[allow(dead_code)]
+const PART_NEOVERSE_V3: &str = "0xd83";
+
+fn max_cpuid() -> std::io::Result<String> {
+    let cpu_info = std::fs::read_to_string("/proc/cpuinfo")?;
+    let max = cpu_info
+        .lines()
+        .filter(|line| line.starts_with("CPU part"))
+        .map(|line| line.split_whitespace().last().unwrap_or(""))
+        .max();
+    Ok(max.unwrap_or("").to_string())
+}
+
+lazy_static::lazy_static! {
+    static ref KIND: Kind = Kind::choose();
+
+    static ref CPU_FEATURES: Vec<String> = {
+        #[cfg(test)] crate::setup_test_logger();
+        let Ok(cpu_info) = std::fs::read_to_string("/proc/cpuinfo") else {
+            log::warn!("Could not read /proc/cpuinfo. CPU Features detection may be impaired.");
+            return vec!();
+        };
+        if let Some(line) = cpu_info
+            .lines()
+                .find(|line| line.starts_with("Features")) {
+                    line.split_once(':').unwrap().1.split_whitespace().map(|s| s.to_string()).collect()
+                } else {
+                    log::warn!("Could not find \"Features  :\" lines in /proc/cpuinfo. CPU Features detection may be impaired.");
+                    vec!()
+        }
+    };
+
+    static ref HAS_FP16: bool = {
+        CPU_FEATURES.iter().any(|s| &**s == "asimdhp")
+    };
+}
+
+#[cfg(any(target_os = "macos", target_os = "ios"))]
+fn apple_get_syscall(key: &str) -> String {
+    use std::ffi::{c_char, c_void, CStr, CString};
+    use std::ptr::null_mut;
+
+    unsafe extern "C" {
+        fn sysctlbyname(
+            name: *const c_char,
+            oldp: *mut c_void,
+            oldlenp: *mut isize,
+            newp: *mut c_void,
+            newlen: isize,
+        );
+    }
+
+    unsafe {
+        let mut len: isize = 0;
+        let name = CString::new(key).unwrap();
+        sysctlbyname(name.as_ptr(), null_mut(), &mut len, null_mut(), 0);
+        let mut buf = vec![0u8; len as _];
+        sysctlbyname(name.as_ptr(), buf.as_mut_ptr() as _, &mut len, null_mut(), 0);
+        CStr::from_bytes_with_nul(&buf).unwrap().to_string_lossy().into_owned()
+    }
+}
+
+#[cfg(target_os = "macos")]
+pub fn has_amx() -> bool {
+    !apple_get_syscall("machdep.cpu.brand_string").contains("(Virtual)")
+}
+
+#[cfg(target_os = "ios")]
+lazy_static::lazy_static! {
+    static ref IPHONE_MODEL_MAJOR:Option<usize> = {
+        let version = apple_get_syscall("hw.machine");
+        let Some((major, _)) = version.trim_start_matches("iPhone").split_once(",") else { return None };
+        major.parse::<usize>().ok()
+    };
+}
+
+#[cfg(all(target_os = "ios", feature = "apple-amx-ios"))]
+fn has_amx() -> bool {
+    // iPhone12,1 is the one branded "iPhone 11", with Apple A13 bionic, first CPU featuring amx
+    IPHONE_MODEL_MAJOR.map(|it| it >= 12).unwrap_or(false)
+}
+
+#[inline]
+#[cfg(target_os = "ios")]
+pub fn has_fp16() -> bool {
+    // iPhone10,1 is the one branded "iPhone 8", with Apple A11 bionic, first CPU featuring fp16
+    IPHONE_MODEL_MAJOR.map(|it| it >= 10).unwrap_or(false)
+}
+
+#[inline]
+#[cfg(not(target_os = "ios"))]
+pub fn has_fp16() -> bool {
+    cfg!(target_os = "macos")
+        || cfg!(feature_cpu = "fp16")
+        || *KIND == Kind::CortexA55
+        || *KIND == Kind::CortexA75
+        || *HAS_FP16
+}
+
+#[target_feature(enable = "fp16")]
+#[inline]
+pub unsafe fn add_f16(a: f16, b: f16) -> f16 {
+    unsafe {
+        let result: u16;
+        std::arch::asm!(
+        "fadd {0:h}, {1:h}, {2:h}",
+        lateout(vreg) result,
+        in(vreg) a.to_bits(),
+        in(vreg) b.to_bits(),
+        options(pure, nomem, nostack, preserves_flags));
+        f16::from_bits(result)
+    }
+}
+
+#[target_feature(enable = "fp16")]
+#[inline]
+pub unsafe fn mul_f16(a: f16, b: f16) -> f16 {
+    unsafe {
+        let result: u16;
+        std::arch::asm!(
+        "fmul {0:h}, {1:h}, {2:h}",
+        lateout(vreg) result,
+        in(vreg) a.to_bits(),
+        in(vreg) b.to_bits(),
+        options(pure, nomem, nostack, preserves_flags));
+        f16::from_bits(result)
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Copy, Clone)]
+pub enum Kind {
+    Generic,
+    AppleM,
+    Neoverse,
+    CortexA53,
+    CortexA55,
+    CortexA72,
+    CortexA73,
+    CortexA75,
+}
+
+impl Kind {
+    pub fn choose() -> Kind {
+        #[cfg(test)]
+        crate::setup_test_logger();
+        let kind = if let Ok(kind) = std::env::var("TRACT_CPU_AARCH64_KIND") {
+            log::info!("CPU kind forced with TRACT_CPU_AARCH64_KIND: {}", kind);
+            let kind = kind.to_lowercase();
+            if kind.contains("a53") {
+                Kind::CortexA53
+            } else if kind.contains("a55") {
+                Kind::CortexA55
+            } else if kind.contains("a72") {
+                Kind::CortexA72
+            } else if kind.contains("a73") {
+                Kind::CortexA73
+            } else if kind.contains("a75") {
+                Kind::CortexA75
+            } else if kind.contains("neoverse") {
+                Kind::Neoverse
+            } else if kind.contains("applem") {
+                Kind::AppleM
+            } else {
+                Kind::Generic
+            }
+        } else if cfg!(target_os = "macos") {
+            Kind::AppleM
+        } else {
+            let part = if let Ok(part) = std::env::var("TRACT_CPU_AARCH64_OVERRIDE_CPU_PART") {
+                log::info!("CPU part forced with TRACT_CPU_AARCH64_OVERRIDE_CPU_PART: {}", part);
+                part
+            } else if cfg!(target_os = "linux") {
+                let part = max_cpuid().unwrap_or_else(|_| "0x00".to_string());
+                log::info!("CPU part auto detected: {}", part);
+                part
+            } else {
+                log::info!("Unknown CPU part");
+                "0x00".to_string()
+            };
+            match &*part {
+                PART_A53 => Kind::CortexA53,
+                PART_A55 => Kind::CortexA55,
+                PART_A72 => Kind::CortexA72,
+                PART_A73 => Kind::CortexA73,
+                PART_A75 => Kind::CortexA75,
+                PART_NEOVERSE_N1 | PART_NEOVERSE_N2 | PART_NEOVERSE_N3 | PART_NEOVERSE_V1
+                | PART_NEOVERSE_V2 | PART_NEOVERSE_V3 => Kind::Neoverse,
+                _ => Kind::Generic,
+            }
+        };
+        log::info!("CPU optimisation: {:?}", kind);
+        kind
+    }
+}
+
+pub(crate) fn register_all_unicast(registry: &mut LinalgRegistry) {
+    registry
+        .insert((BinOp::Mul, DatumType::F32), Box::new(|| arm64simd_unicast_mul_f32_16n::bin()));
+    registry
+        .insert((BinOp::Mul, DatumType::F16), Box::new(|| arm64fp16_unicast_mul_f16_32n::bin()));
+    registry
+        .insert((BinOp::Add, DatumType::F32), Box::new(|| arm64simd_unicast_add_f32_16n::bin()));
+    registry
+        .insert((BinOp::Add, DatumType::F16), Box::new(|| arm64fp16_unicast_add_f16_32n::bin()));
+    registry
+        .insert((BinOp::Sub, DatumType::F32), Box::new(|| arm64simd_unicast_sub_f32_16n::bin()));
+    registry
+        .insert((BinOp::Sub, DatumType::F16), Box::new(|| arm64fp16_unicast_sub_f16_32n::bin()));
+    registry
+        .insert((BinOp::SubF, DatumType::F32), Box::new(|| arm64simd_unicast_subf_f32_16n::bin()));
+    registry
+        .insert((BinOp::SubF, DatumType::F16), Box::new(|| arm64fp16_unicast_subf_f16_32n::bin()));
+    registry
+        .insert((BinOp::Min, DatumType::F32), Box::new(|| arm64simd_unicast_min_f32_16n::bin()));
+    registry
+        .insert((BinOp::Min, DatumType::F16), Box::new(|| arm64fp16_unicast_min_f16_32n::bin()));
+    registry
+        .insert((BinOp::Max, DatumType::F32), Box::new(|| arm64simd_unicast_max_f32_16n::bin()));
+    registry
+        .insert((BinOp::Max, DatumType::F16), Box::new(|| arm64fp16_unicast_max_f16_32n::bin()));
+}
+
+pub(crate) fn register_all_by_scalar(registry: &mut LinalgRegistry) {
+    registry
+        .insert((BinOp::Mul, DatumType::F32), Box::new(|| arm64simd_mul_by_scalar_f32_16n::bin()));
+    registry
+        .insert((BinOp::Mul, DatumType::F16), Box::new(|| arm64fp16_mul_by_scalar_f16_32n::bin()));
+    registry
+        .insert((BinOp::Add, DatumType::F32), Box::new(|| arm64simd_add_by_scalar_f32_16n::bin()));
+    registry
+        .insert((BinOp::Add, DatumType::F16), Box::new(|| arm64fp16_add_by_scalar_f16_32n::bin()));
+    registry
+        .insert((BinOp::Sub, DatumType::F32), Box::new(|| arm64simd_sub_by_scalar_f32_16n::bin()));
+    registry
+        .insert((BinOp::Sub, DatumType::F16), Box::new(|| arm64fp16_sub_by_scalar_f16_32n::bin()));
+    registry.insert(
+        (BinOp::SubF, DatumType::F32),
+        Box::new(|| arm64simd_subf_by_scalar_f32_16n::bin()),
+    );
+    registry.insert(
+        (BinOp::SubF, DatumType::F16),
+        Box::new(|| arm64fp16_subf_by_scalar_f16_32n::bin()),
+    );
+    registry
+        .insert((BinOp::Min, DatumType::F32), Box::new(|| arm64simd_min_by_scalar_f32_16n::bin()));
+    registry
+        .insert((BinOp::Min, DatumType::F16), Box::new(|| arm64fp16_min_by_scalar_f16_32n::bin()));
+    registry
+        .insert((BinOp::Max, DatumType::F32), Box::new(|| arm64simd_max_by_scalar_f32_16n::bin()));
+    registry
+        .insert((BinOp::Max, DatumType::F16), Box::new(|| arm64fp16_max_by_scalar_f16_32n::bin()));
+}
+
+pub fn plug(ops: &mut Ops) {
+    arm64simd::plug(ops);
+
+    #[cfg(not(feature = "no_fp16"))]
+    if has_fp16() {
+        arm64fp16::plug(ops);
+    }
+
+    ops.qmmm_i32 = Box::new(|_, _, _| arm64simd_mmm_i32_8x8.mmm());
+    ops.qmmv_i32 = Box::new(|_, _| arm64simd_mmm_i32_64x1.mmm());
+    ops.mmv_f32 = match *KIND {
+        Kind::CortexA53 => Box::new(|_, _| arm64simd_mmm_f32_64x1_a53.mmm()),
+        Kind::CortexA55 => Box::new(|_, _| arm64simd_mmm_f32_64x1_a55.mmm()),
+        _ => Box::new(|_, _| arm64simd_mmm_f32_64x1_gen.mmm()),
+    };
+    let model = match *KIND {
+        Kind::CortexA53 => Some(cortex_a53::model()),
+        Kind::CortexA55 => Some(cortex_a55::model()),
+        _ => None,
+    };
+    let impls = ops.mmm_impls.clone();
+    ops.mmm_f32 = if let Some(model) = model {
+        Box::new(move |m, k, n| model.pick(&impls, m, k, n))
+    } else {
+        Box::new(move |_, _, n| {
+            if n.unwrap_or(8) < 8 {
+                arm64simd_mmm_f32_16x4_gen.mmm()
+            } else {
+                arm64simd_mmm_f32_8x8_gen.mmm()
+            }
+        })
+    };
+    #[cfg(feature = "no_fp16")]
+    if has_fp16() {
+        log::warn!(
+            "This is a build with fp16 disabled, while your platform CPU seems to support it."
+        );
+    }
+    #[cfg(not(feature = "no_fp16"))]
+    if has_fp16() {
+        if *KIND == Kind::CortexA55 {
+            log::info!("Cortex-A55 mmm_f16 and mmv_f16 activated");
+            ops.mmm_f16 = Box::new(|_, _, n| {
+                use tract_data::internal::DimLike;
+                if n.unwrap_or(1024).divceil(4) * 4 < n.unwrap_or(1024).divceil(8) * 8 {
+                    arm64fp16_mmm_f16_32x4_a55.mmm()
+                } else {
+                    arm64fp16_mmm_f16_16x8_a55.mmm()
+                }
+            });
+            ops.mmv_f16 = Box::new(|_, _| arm64fp16_mmm_f16_128x1_a55.mmm());
+        } else {
+            log::info!("ARMv8.2 mmm_f16 and mmv_f16 activated");
+            ops.mmm_f16 = Box::new(|_, _, n| {
+                use tract_data::internal::DimLike;
+                if n.unwrap_or(1024).divceil(4) * 4 < n.unwrap_or(1024).divceil(8) * 8 {
+                    arm64fp16_mmm_f16_32x4_gen.mmm()
+                } else {
+                    arm64fp16_mmm_f16_16x8_gen.mmm()
+                }
+            });
+            ops.mmv_f16 = Box::new(|_, _| arm64fp16_mmm_f16_128x1_gen.mmm());
+        }
+    }
+    ops.leaky_relu_f32 = Box::new(|| arm64simd_leaky_relu_f32_8n::ew());
+    ops.sigmoid_f32 = Box::new(|| arm64simd_sigmoid_f32_4n::ew());
+    ops.tanh_f32 = Box::new(|| arm64simd_tanh_f32_4n::ew());
+    ops.max_f32 = Box::new(|| arm64simd_max_f32_16n::red());
+    ops.sum_f32 = Box::new(|| arm64simd_sum_f32_16n::red());
+    ops.mul_by_scalar_f32 = Box::new(|| arm64simd_mul_by_scalar_f32_16n::ew());
+    ops.softmax2_fastcompact_f32 = Box::new(|| arm64simd_softmax2_fastcompact_f32_16n::red());
+    #[cfg(not(feature = "no_fp16"))]
+    if has_fp16() {
+        log::info!("ARMv8.2 tanh_f16 and sigmoid_f16 activated");
+        ops.leaky_relu_f16 = Box::new(|| arm64fp16_leaky_relu_f16_16n::ew());
+        ops.tanh_f16 = Box::new(|| arm64fp16_tanh_f16_8n::ew());
+        ops.sigmoid_f16 = Box::new(|| arm64fp16_sigmoid_f16_8n::ew());
+        ops.max_f16 = Box::new(|| arm64fp16_max_f16_32n::red());
+        ops.sum_f16 = Box::new(|| arm64fp16_sum_f16_32n::red());
+        ops.mul_by_scalar_f16 = Box::new(|| arm64fp16_mul_by_scalar_f16_32n::ew());
+    } else {
+        log::info!("No native fp16 support");
+    }
+    #[cfg(any(target_os = "macos", all(target_os = "ios", feature = "apple-amx-ios")))]
+    {
+        apple_amx::plug(ops);
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/apple_amx.rs b/vendor/tract-linalg-0.22.1/src/arm64/apple_amx.rs
new file mode 100644
index 000000000..512c65322
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/apple_amx.rs
@@ -0,0 +1,32 @@
+use crate::frame::mmm::ImplementationQuality::ManuallyOptimized;
+use crate::mmm::*;
+use crate::Ops;
+use tract_data::prelude::*;
+
+use super::has_amx;
+
+const AMX: fn() -> bool = crate::arm64::has_amx;
+const CAN_FUSE: fn(&FusedSpec) -> bool = |f| !matches!(f, &FusedSpec::LeakyRelu(_));
+
+MMMExternKernel!(apple_amx_mmm_f32_32x32<f32>(32, 32)@(128, 128) where(AMX) can_fuse(CAN_FUSE) quality(ManuallyOptimized));
+MMMExternKernel!(apple_amx_mmm_f32_32x1<f32>(32, 1)@(128, 128) where(AMX) can_fuse(CAN_FUSE) quality(ManuallyOptimized));
+MMMExternKernel!(apple_amx_mmm_f16_64x32<f16>(64, 32)@(128, 128) where(AMX) can_fuse(CAN_FUSE) quality(ManuallyOptimized));
+MMMExternKernel!(apple_amx_mmm_f16_64x1<f16>(64, 1)@(128, 128) where(AMX) can_fuse(CAN_FUSE) quality(ManuallyOptimized));
+
+pub fn plug(ops: &mut Ops) {
+    if has_amx() {
+        log::info!("AMX optimisation activated");
+        ops.mmm_f16 = Box::new(|_, _, _| apple_amx_mmm_f16_64x32.mmm());
+        ops.mmm_f32 = Box::new(|_, _, _| apple_amx_mmm_f32_32x32.mmm());
+        ops.mmv_f16 = Box::new(|_, _| apple_amx_mmm_f16_64x1.mmm());
+        ops.mmv_f32 = Box::new(|_, _| apple_amx_mmm_f32_32x1.mmm());
+        ops.mmm_impls.extend_from_slice(&[
+            apple_amx_mmm_f32_32x32.mmm(),
+            apple_amx_mmm_f32_32x1.mmm(),
+            apple_amx_mmm_f16_64x32.mmm(),
+            apple_amx_mmm_f16_64x1.mmm(),
+        ]);
+    } else {
+        log::info!("No AMX optimisation");
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16.rs
new file mode 100644
index 000000000..a09df594f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16.rs
@@ -0,0 +1,64 @@
+use tract_data::half::f16;
+
+mod by_scalar;
+mod leaky_relu;
+mod max;
+pub mod panel_extract;
+mod sum;
+mod unicast;
+pub use by_scalar::*;
+pub use leaky_relu::*;
+pub use max::*;
+pub use sum::*;
+pub use unicast::*;
+
+use crate::block_quant::PackedBlockQuantFormat;
+use crate::block_quant::Q4_0;
+use crate::frame::mmm::ImplementationQuality::ManuallyOptimized;
+use crate::Ops;
+
+const FP16: fn() -> bool = crate::arm64::has_fp16;
+
+MMMExternKernel!(arm64fp16_mmm_f16_16x8_gen<f16>(16, 8)@(16, 16) where(FP16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64fp16_mmm_f16_16x8_a55<f16>(16, 8)@(16, 16) where(FP16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64fp16_mmm_f16_32x4_gen<f16>(32, 4)@(16, 16) where(FP16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64fp16_mmm_f16_32x4_a55<f16>(32, 4)@(16, 16) where(FP16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64fp16_mmm_f16_128x1_gen<f16>(128,1)@(16, 16) where(FP16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64fp16_mmm_f16_128x1_a55<f16>(128,1)@(16, 16) where(FP16) quality(ManuallyOptimized));
+
+MMMExternKernel!(arm64fp16_mmm_f16_64x3_gen<f16>(64, 3)@(16, 16) where(FP16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64fp16_mmm_f16_32x6_gen<f16>(32, 6)@(16, 16) where(FP16) quality(ManuallyOptimized));
+
+MMMExternKernel! { arm64fp16_mmm_f16_64x1_gen<f16>(64, 1)@(16, 16) where(FP16)
+    packing[1] = q40f16z16se => |k| k.with_packing_a(PackedBlockQuantFormat::new(&Q4_0, 64, 16, true));
+    packing[2] = q40f16z16 => |k| k.with_packing_a(PackedBlockQuantFormat::new(&Q4_0, 64, 16, false));
+    quality(ManuallyOptimized)
+}
+
+pub fn plug(ops: &mut Ops) {
+    panel_extract::plug(ops);
+    ops.mmm_impls.extend_from_slice(&[
+        arm64fp16_mmm_f16_16x8_a55.mmm(),
+        arm64fp16_mmm_f16_16x8_gen.mmm(),
+        arm64fp16_mmm_f16_32x4_a55.mmm(),
+        arm64fp16_mmm_f16_32x4_gen.mmm(),
+        arm64fp16_mmm_f16_128x1_a55.mmm(),
+        arm64fp16_mmm_f16_128x1_gen.mmm(),
+        arm64fp16_mmm_f16_64x3_gen.mmm(),
+        arm64fp16_mmm_f16_32x6_gen.mmm(),
+        arm64fp16_mmm_f16_64x1_gen.mmm(),
+    ]);
+}
+
+tanh_impl!(f16, arm64fp16_tanh_f16_8n, 8, 8, crate::arm64::has_fp16());
+sigmoid_impl!(f16, arm64fp16_sigmoid_f16_8n, 8, 8, crate::arm64::has_fp16());
+
+#[cfg(test)]
+mod test {
+
+    #[test]
+    fn kits() {
+        let mut ops = crate::generic();
+        super::plug(&mut ops);
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/by_scalar.rs
new file mode 100644
index 000000000..e791890bd
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/by_scalar.rs
@@ -0,0 +1,258 @@
+use crate::f16;
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_mul_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                std::arch::asm!("
+            dup v0.8h, v0.h[0]
+            2:
+                ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                fmul v4.8h, v4.8h, v0.8h
+                fmul v5.8h, v5.8h, v0.8h
+                fmul v6.8h, v6.8h, v0.8h
+                fmul v7.8h, v7.8h, v0.8h
+                st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                subs {len}, {len}, 32
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s.to_bits(),
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_add_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                std::arch::asm!("
+            dup v0.8h, v0.h[0]
+            2:
+                ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                fadd v4.8h, v4.8h, v0.8h
+                fadd v5.8h, v5.8h, v0.8h
+                fadd v6.8h, v6.8h, v0.8h
+                fadd v7.8h, v7.8h, v0.8h
+                st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                subs {len}, {len}, 32
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s.to_bits(),
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_sub_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                std::arch::asm!("
+            dup v0.8h, v0.h[0]
+            2:
+                ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                fsub v4.8h, v4.8h, v0.8h
+                fsub v5.8h, v5.8h, v0.8h
+                fsub v6.8h, v6.8h, v0.8h
+                fsub v7.8h, v7.8h, v0.8h
+                st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                subs {len}, {len}, 32
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s.to_bits(),
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_subf_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                std::arch::asm!("
+            dup v0.8h, v0.h[0]
+            2:
+                ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                fsub v4.8h, v0.8h, v4.8h
+                fsub v5.8h, v0.8h, v5.8h
+                fsub v6.8h, v0.8h, v6.8h
+                fsub v7.8h, v0.8h, v7.8h
+                st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                subs {len}, {len}, 32
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s.to_bits(),
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_min_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                std::arch::asm!("
+            dup v0.8h, v0.h[0]
+            2:
+                ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                fmin v4.8h, v4.8h, v0.8h
+                fmin v5.8h, v5.8h, v0.8h
+                fmin v6.8h, v6.8h, v0.8h
+                fmin v7.8h, v7.8h, v0.8h
+                st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                subs {len}, {len}, 32
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s.to_bits(),
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    arm64fp16_max_by_scalar_f16_32n,
+    32,
+    4,
+    f16,
+    fn run(buf: &mut [f16], s: f16) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], s: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                std::arch::asm!("
+            dup v0.8h, v0.h[0]
+            2:
+                ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}]
+                fmax v4.8h, v4.8h, v0.8h
+                fmax v5.8h, v5.8h, v0.8h
+                fmax v6.8h, v6.8h, v0.8h
+                fmax v7.8h, v7.8h, v0.8h
+                st1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                subs {len}, {len}, 32
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s.to_bits(),
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            }
+        }
+        unsafe { run(buf, s) }
+    }
+);
+
+#[cfg(test)]
+mod test_arm64fp16_mul_by_scalar_f16_32n {
+    use super::*;
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_mul_by_scalar_f16_32n,
+        |a, b| a * b
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_add_by_scalar_f16_32n,
+        |a, b| a + b
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_sub_by_scalar_f16_32n,
+        |a, b| a - b
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_subf_by_scalar_f16_32n,
+        |a, b| b - a
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_min_by_scalar_f16_32n,
+        |a, b| a.min(b)
+    );
+    by_scalar_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_max_by_scalar_f16_32n,
+        |a, b| a.max(b)
+    );
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/leaky_relu.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/leaky_relu.rs
new file mode 100644
index 000000000..f4f6204aa
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/leaky_relu.rs
@@ -0,0 +1,56 @@
+use tract_data::internal::f16;
+
+ew_impl_wrap!(
+    f16,
+    arm64fp16_leaky_relu_f16_16n,
+    16,
+    8,
+    f16,
+    #[inline(never)]
+    fn run(buf: &mut [f16], alpha: f16) {
+        assert!(buf.len() % 8 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &mut [f16], alpha: f16) {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                std::arch::asm!("
+                    dup v0.8h, {alpha:v}.h[0]
+                    dup v1.8h, {one:v}.h[0]
+                    2:
+                        ldp q3, q4, [{ptr}]
+
+                        fcmgt v5.8h, v3.8h, #0.0
+                        fcmgt v6.8h, v4.8h, #0.0
+                        bsl   v5.16b, v1.16b, v0.16b
+                        bsl   v6.16b, v1.16b, v0.16b
+                        fmul  v3.8h, v3.8h, v5.8h
+                        fmul  v4.8h, v4.8h, v6.8h
+
+                        stp q3, q4, [{ptr}], #32
+                        subs {len}, {len}, 16
+                        bne 2b
+                ",
+                one = in(vreg) f16::from_f32(1.0f32).to_bits(),
+                alpha = in(vreg) alpha.to_bits(),
+                len = inout(reg) len => _,
+                ptr = inout(reg) ptr => _,
+                out("v0") _,
+                out("v1") _,
+                out("q3") _,
+                out("q4") _,
+                out("q5") _,
+                out("q6") _,
+                );
+            }
+        }
+        unsafe { run(buf, alpha) }
+    }
+);
+
+#[cfg(test)]
+pub mod test_arm64simd_leaky_relu_f16_16n {
+    use super::*;
+    leaky_relu_frame_tests!(crate::arm64::has_fp16(), f16, arm64fp16_leaky_relu_f16_16n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/max.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/max.rs
new file mode 100644
index 000000000..7a7b1033f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/max.rs
@@ -0,0 +1,63 @@
+use tract_data::half::f16;
+
+reduce_impl_wrap!(
+    f16,
+    arm64fp16_max_f16_32n,
+    32,
+    8,
+    (),
+    f16::MIN,
+    #[inline(never)]
+    fn run(buf: &[f16], _: ()) -> f16 {
+        assert!(buf.len() % 32 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &[f16]) -> f16 {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                let mut out: u16;
+                std::arch::asm!("
+                ins v0.h[0], {min:w}
+                dup v0.8h, v0.h[0]
+                dup v1.8h, v0.h[0]
+                dup v2.8h, v0.h[0]
+                dup v3.8h, v0.h[0]
+
+                2:
+                    ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                    fmax v0.8h, v0.8h, v4.8h
+                    fmax v1.8h, v1.8h, v5.8h
+                    fmax v2.8h, v2.8h, v6.8h
+                    fmax v3.8h, v3.8h, v7.8h
+
+                    subs {len}, {len}, 32
+                    bne 2b
+
+                fmax v0.8h, v0.8h, v1.8h
+                fmax v2.8h, v2.8h, v3.8h
+                fmax v0.8h, v0.8h, v2.8h
+                fmaxv h0, v0.8h
+                ",
+                // using v0 as inout triggers https://github.com/rust-lang/rust/issues/120374
+                min = in(reg) f16::MIN.to_bits(),
+                ptr = inout(reg) ptr => _,
+                len = inout(reg) len => _,
+                out("v0") out, out("v1") _, out("v2") _, out("v3") _,
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+                f16::from_bits(out)
+            }
+        }
+        unsafe { run(buf) }
+    },
+    #[inline(never)]
+    fn reduce_two(a: f16, b: f16) -> f16 {
+        a.max(b)
+    }
+);
+
+#[cfg(test)]
+mod test_arm64fp16_max_f16_32n {
+    use super::*;
+    crate::max_frame_tests!(crate::arm64::has_fp16(), f16, arm64fp16_max_f16_32n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/panel_extract.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/panel_extract.rs
new file mode 100644
index 000000000..a96aec077
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/panel_extract.rs
@@ -0,0 +1,94 @@
+use super::FP16;
+use crate::block_quant::{PackedBlockQuantFormat, Q4_0};
+use crate::pack::Packing;
+use crate::Ops;
+use tract_data::internal::*;
+
+pub fn plug(ops: &mut Ops) {
+    ops.panel_extractors.push(packed_64_q40_to_f16.clone());
+}
+
+panel_extractor!(kernel_packed_64_q40_to_f16 as packed_64_q40_to_f16(
+    Box::new(PackedBlockQuantFormat::new(&Q4_0, 64, 16, true)),
+    f16::packing(64).align(16)
+) where(FP16));
+
+#[target_feature(enable = "fp16")]
+unsafe fn kernel_packed_64_q40_to_f16(input: *const u8, output: *mut u8, k: usize) {
+    unsafe {
+        if k == 0 {
+            return;
+        }
+        let lookup_table: [u8; 16] = [
+            0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc, 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45,
+            0x46, 0x47,
+        ];
+        std::arch::asm!("
+    ld1      {{v13.16b}}, [{lookup_table}]
+    movi     v15.16b, 15
+    eor      v12.16b, v12.16b, v12.16b
+
+    2:
+        add     {scales}, {i}, 1024  // scales at end: 32 (cols) * 64 (rows) / 2 (half byte)
+        ld1     {{v16.16b-v19.16b}}, [{scales}], #64
+        ld1     {{v20.16b-v23.16b}}, [{scales}]
+
+        mov     {k2}, 32
+    3:
+        ld1     {{ v9.16b-v10.16b }}, [{i}], #32
+
+        and     v0.16b, v9.16b, v15.16b
+        ushr    v2.16b, v9.16b, 4
+
+        and     v4.16b, v10.16b, v15.16b
+        ushr    v6.16b, v10.16b, 4
+
+        tbl     v0.16b, {{ v13.16b }}, v0.16b
+        tbl     v2.16b, {{ v13.16b }}, v2.16b
+        tbl     v4.16b, {{ v13.16b }}, v4.16b
+        tbl     v6.16b, {{ v13.16b }}, v6.16b
+
+        zip2    v1.16b, v12.16b, v0.16b
+        zip2    v3.16b, v12.16b, v2.16b
+        zip2    v5.16b, v12.16b, v4.16b
+        zip2    v7.16b, v12.16b, v6.16b
+
+        zip1    v0.16b, v12.16b, v0.16b
+        zip1    v2.16b, v12.16b, v2.16b
+        zip1    v4.16b, v12.16b, v4.16b
+        zip1    v6.16b, v12.16b, v6.16b
+
+        fmul    v0.8h, v0.8h, v16.8h
+        fmul    v1.8h, v1.8h, v17.8h
+        fmul    v2.8h, v2.8h, v18.8h
+        fmul    v3.8h, v3.8h, v19.8h
+        fmul    v4.8h, v4.8h, v20.8h
+        fmul    v5.8h, v5.8h, v21.8h
+        fmul    v6.8h, v6.8h, v22.8h
+        fmul    v7.8h, v7.8h, v23.8h
+
+        st1     {{v0.16b-v3.16b}}, [{o}], #64
+        st1     {{v4.16b-v7.16b}}, [{o}], #64
+
+        subs    {k2}, {k2}, #1
+        bne     3b
+
+        add     {i}, {i}, 128 // skip scales
+        subs    {k}, {k}, 32
+        bne     2b
+            ",
+        lookup_table = in(reg) &lookup_table,
+        k = inout(reg) k => _,
+        k2 = out(reg) _,
+        scales = out(reg) _,
+        i = inout(reg) input => _,
+        o = inout(reg) output => _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        out("v16") _, out("v17") _, out("v18") _, out("v19") _,
+        out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+        );
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/sum.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/sum.rs
new file mode 100644
index 000000000..e13139c25
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/sum.rs
@@ -0,0 +1,62 @@
+use crate::num_traits::Zero;
+use tract_data::half::f16;
+
+reduce_impl_wrap!(
+    f16,
+    arm64fp16_sum_f16_32n,
+    32,
+    8,
+    (),
+    f16::zero(),
+    #[inline(never)]
+    fn run(buf: &[f16], _: ()) -> f16 {
+        assert!(buf.len() % 32 == 0);
+        assert!(buf.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(buf: &[f16]) -> f16 {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                let mut out: u16;
+                std::arch::asm!("
+                movi v0.8h, #0
+                movi v1.8h, #0
+                movi v2.8h, #0
+                movi v3.8h, #0
+                2:
+                    ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{ptr}], 64
+                    fadd v0.8h, v0.8h, v4.8h
+                    fadd v1.8h, v1.8h, v5.8h
+                    fadd v2.8h, v2.8h, v6.8h
+                    fadd v3.8h, v3.8h, v7.8h
+
+                    subs {len}, {len}, 32
+                    bne 2b
+
+                fadd v0.8h, v0.8h, v1.8h
+                fadd v2.8h, v2.8h, v3.8h
+                fadd v0.8h, v0.8h, v2.8h
+                faddp v0.8h, v0.8h, v0.8h
+                faddp v0.8h, v0.8h, v0.8h
+                faddp v0.8h, v0.8h, v0.8h
+                ",
+                ptr = inout(reg) ptr => _,
+                len = inout(reg) len => _,
+                out("s0") out, out("v1") _, out("v2") _, out("v3") _,
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+                f16::from_bits(out)
+            }
+        }
+        unsafe { run(buf) }
+    },
+    #[inline(never)]
+    fn reduce_two(a: f16, b: f16) -> f16 {
+        a + b
+    }
+);
+
+#[cfg(test)]
+mod test_arm64fp16_sum_f16_32n {
+    use super::*;
+    crate::sum_frame_tests!(crate::arm64::has_fp16(), f16, arm64fp16_sum_f16_32n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/unicast.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/unicast.rs
new file mode 100644
index 000000000..d57ba1062
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64fp16/unicast.rs
@@ -0,0 +1,271 @@
+use tract_data::half::f16;
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_mul_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                    ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                    fmul v0.8h, v0.8h, v4.8h
+                    fmul v1.8h, v1.8h, v5.8h
+                    fmul v2.8h, v2.8h, v6.8h
+                    fmul v3.8h, v3.8h, v7.8h
+                    st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                    subs {len}, {len}, 32
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_add_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                    ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                    fadd v0.8h, v0.8h, v4.8h
+                    fadd v1.8h, v1.8h, v5.8h
+                    fadd v2.8h, v2.8h, v6.8h
+                    fadd v3.8h, v3.8h, v7.8h
+                    st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                    subs {len}, {len}, 32
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_sub_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                    ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                    fsub v0.8h, v0.8h, v4.8h
+                    fsub v1.8h, v1.8h, v5.8h
+                    fsub v2.8h, v2.8h, v6.8h
+                    fsub v3.8h, v3.8h, v7.8h
+                    st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                    subs {len}, {len}, 32
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_subf_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                    ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                    fsub v0.8h, v4.8h, v0.8h
+                    fsub v1.8h, v5.8h, v1.8h
+                    fsub v2.8h, v6.8h, v2.8h
+                    fsub v3.8h, v7.8h, v3.8h
+                    st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                    subs {len}, {len}, 32
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_min_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                    ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                    fmin v0.8h, v0.8h, v4.8h
+                    fmin v1.8h, v1.8h, v5.8h
+                    fmin v2.8h, v2.8h, v6.8h
+                    fmin v3.8h, v3.8h, v7.8h
+                    st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                    subs {len}, {len}, 32
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    arm64fp16_unicast_max_f16_32n,
+    32,
+    8,
+    #[inline(never)]
+    fn run(a: &mut [f16], b: &[f16]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 32 == 0);
+        assert!(a.len() > 0);
+        #[target_feature(enable = "fp16")]
+        unsafe fn run(a: &mut [f16], b: &[f16]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}]
+                    ld1 {{v4.8h, v5.8h, v6.8h, v7.8h}}, [{b_ptr}], 64
+                    fmax v0.8h, v0.8h, v4.8h
+                    fmax v1.8h, v1.8h, v5.8h
+                    fmax v2.8h, v2.8h, v6.8h
+                    fmax v3.8h, v3.8h, v7.8h
+                    st1 {{v0.8h, v1.8h, v2.8h, v3.8h}}, [{a_ptr}], 64
+                    subs {len}, {len}, 32
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+#[cfg(test)]
+mod test_arm64fp16_unicast_mul_f16_32n {
+    use super::*;
+    use proptest::strategy::Strategy;
+    crate::unicast_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_unicast_mul_f16_32n,
+        |a, b| a * b
+    );
+    crate::unicast_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_unicast_add_f16_32n,
+        |a, b| a + b
+    );
+    crate::unicast_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_unicast_sub_f16_32n,
+        |a, b| a - b
+    );
+    crate::unicast_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_unicast_subf_f16_32n,
+        |a, b| b - a
+    );
+    crate::unicast_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_unicast_min_f16_32n,
+        |a, b| a.min(b)
+    );
+    crate::unicast_frame_tests!(
+        crate::arm64::has_fp16(),
+        f16,
+        arm64fp16_unicast_max_f16_32n,
+        |a, b| a.max(b)
+    );
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd.rs
new file mode 100644
index 000000000..be0c505b9
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd.rs
@@ -0,0 +1,117 @@
+mod by_scalar;
+mod leaky_relu;
+mod max;
+mod panel_extract;
+mod softmax;
+mod sum;
+mod unicast;
+
+pub use by_scalar::*;
+pub use leaky_relu::arm64simd_leaky_relu_f32_8n;
+pub use max::arm64simd_max_f32_16n;
+pub use softmax::arm64simd_softmax2_fastcompact_f32_16n;
+pub use sum::arm64simd_sum_f32_16n;
+pub use unicast::*;
+
+use crate::block_quant::{PackedBlockQuantFormat, Q4_0};
+use crate::frame::mmm::ImplementationQuality::ManuallyOptimized;
+use crate::pack::PackedFormat;
+use crate::Ops;
+
+use super::Kind;
+
+fn a55() -> isize {
+    if *super::KIND == Kind::CortexA55 {
+        1
+    } else {
+        -1
+    }
+}
+
+fn a53() -> isize {
+    if *super::KIND == Kind::CortexA53 {
+        1
+    } else {
+        -1
+    }
+}
+
+MMMExternKernel!(arm64simd_mmm_f32_8x8_a55 <f32>(8,  8)@(16, 16) quality(ManuallyOptimized) boost(a55));
+MMMExternKernel!(arm64simd_mmm_f32_12x8_a55<f32>(12, 8)@(16, 16) quality(ManuallyOptimized) boost(a55));
+MMMExternKernel!(arm64simd_mmm_f32_16x4_a55<f32>(16, 4)@(16, 16) quality(ManuallyOptimized) boost(a55));
+MMMExternKernel!(arm64simd_mmm_f32_24x4_a55<f32>(24, 4)@(16, 16) quality(ManuallyOptimized) boost(a55));
+MMMExternKernel!(arm64simd_mmm_f32_64x1_a55<f32>(64, 1)@(16, 16) quality(ManuallyOptimized) boost(a55));
+
+MMMExternKernel!(arm64simd_mmm_f32_16x4_a53<f32>(16, 4)@(16, 16) quality(ManuallyOptimized) boost(a53));
+MMMExternKernel!(arm64simd_mmm_f32_24x4_a53<f32>(24, 4)@(16, 16) quality(ManuallyOptimized) boost(a53));
+MMMExternKernel!(arm64simd_mmm_f32_8x8_a53 <f32>(8,  8)@(16, 16) quality(ManuallyOptimized) boost(a53));
+MMMExternKernel!(arm64simd_mmm_f32_12x8_a53<f32>(12, 8)@(16, 16) quality(ManuallyOptimized) boost(a53));
+MMMExternKernel!(arm64simd_mmm_f32_64x1_a53<f32>(64, 1)@(16, 16) quality(ManuallyOptimized) boost(a53));
+
+MMMExternKernel!(arm64simd_mmm_f32_16x4_gen<f32>(16, 4)@(16, 16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64simd_mmm_f32_24x4_gen<f32>(24, 4)@(16, 16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64simd_mmm_f32_8x8_gen <f32>(8,  8)@(16, 16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64simd_mmm_f32_12x8_gen<f32>(12, 8)@(16, 16) quality(ManuallyOptimized));
+MMMExternKernel!(arm64simd_mmm_f32_64x1_gen<f32>(64, 1)@(16, 16) quality(ManuallyOptimized));
+
+fn q40p32z16se() -> PackedBlockQuantFormat {
+    PackedBlockQuantFormat::new(&Q4_0, 32, 16, true)
+}
+
+MMMExternKernel!(arm64simd_mmm_f32_32x1_gen<f32>(32, 1)@(16, 16)
+    packing[1] = q40f16 => |k| k.with_packing(q40p32z16se(), f16::packing(1));
+    packing[2] = q40f32 => |k| k.with_packing(q40p32z16se(), f32::packing(1));
+    packing[3] = f16f16 => |k| k.with_packing(f16::packing(32), f16::packing(1));
+    packing[4] = f32f16 => |k| k.with_packing(f32::packing(32), f16::packing(1));
+    packing[5] = f16f32 => |k| k.with_packing(f16::packing(32), f32::packing(1));
+    quality(ManuallyOptimized)
+    store(f16)
+);
+
+MMMExternKernel!(arm64simd_mmm_f32_32x3_gen<f32>(32, 3)@(16, 16)
+    packing[1] = f32f16 => |k| k.with_packing(f32::packing(32), f16::packing(3));
+    packing[2] = f16f32 => |k| k.with_packing(f16::packing(32), f32::packing(3));
+    packing[3] = f16f16 => |k| k.with_packing(f16::packing(32), f16::packing(3));
+    quality(ManuallyOptimized)
+    store(f16)
+);
+
+MMMExternKernel!(arm64simd_mmm_i32_8x8<i32>(8, 8)@(16, 16)
+   packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 8, 16), PackedFormat::new(DatumType::I8, 8, 16));
+   quality(ManuallyOptimized)
+   store(i8)
+);
+
+MMMExternKernel!(arm64simd_mmm_i32_64x1<i32>(64, 1)@(16, 1)
+   packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 64,16), PackedFormat::new(DatumType::I8, 1, 1));
+   quality(ManuallyOptimized)
+   store(i8)
+);
+
+pub fn plug(ops: &mut Ops) {
+    ops.mmm_impls.extend([
+        arm64simd_mmm_f32_12x8_gen.mmm(),
+        arm64simd_mmm_f32_12x8_a53.mmm(),
+        arm64simd_mmm_f32_12x8_a55.mmm(),
+        arm64simd_mmm_f32_8x8_gen.mmm(),
+        arm64simd_mmm_f32_8x8_a53.mmm(),
+        arm64simd_mmm_f32_8x8_a55.mmm(),
+        arm64simd_mmm_f32_16x4_gen.mmm(),
+        arm64simd_mmm_f32_16x4_a53.mmm(),
+        arm64simd_mmm_f32_16x4_a55.mmm(),
+        arm64simd_mmm_f32_24x4_gen.mmm(),
+        arm64simd_mmm_f32_24x4_a53.mmm(),
+        arm64simd_mmm_f32_24x4_a55.mmm(),
+        arm64simd_mmm_f32_32x1_gen.mmm(),
+        arm64simd_mmm_f32_32x3_gen.mmm(),
+        arm64simd_mmm_f32_64x1_gen.mmm(),
+        arm64simd_mmm_f32_64x1_a53.mmm(),
+        arm64simd_mmm_f32_64x1_a55.mmm(),
+        arm64simd_mmm_i32_8x8.mmm(),
+        arm64simd_mmm_i32_64x1.mmm(),
+    ]);
+    panel_extract::plug(ops);
+}
+
+tanh_impl!(f32, arm64simd_tanh_f32_4n, 4, 4, true);
+sigmoid_impl!(f32, arm64simd_sigmoid_f32_4n, 4, 4, true);
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/by_scalar.rs
new file mode 100644
index 000000000..49b2c7550
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/by_scalar.rs
@@ -0,0 +1,202 @@
+by_scalar_impl_wrap!(
+    f32,
+    arm64simd_mul_by_scalar_f32_16n,
+    16,
+    4,
+    f32,
+    fn run(buf: &mut [f32], s: f32) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            std::arch::asm!("
+            dup v0.4s, v0.s[0]
+            2:
+                ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}]
+                fmul v4.4s, v4.4s, v0.4s
+                fmul v5.4s, v5.4s, v0.4s
+                fmul v6.4s, v6.4s, v0.4s
+                fmul v7.4s, v7.4s, v0.4s
+                st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
+                subs {len}, {len}, 16
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s,
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+        }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    arm64simd_add_by_scalar_f32_16n,
+    16,
+    4,
+    f32,
+    fn run(buf: &mut [f32], s: f32) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            std::arch::asm!("
+            dup v0.4s, v0.s[0]
+            2:
+                ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}]
+                fadd v4.4s, v4.4s, v0.4s
+                fadd v5.4s, v5.4s, v0.4s
+                fadd v6.4s, v6.4s, v0.4s
+                fadd v7.4s, v7.4s, v0.4s
+                st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
+                subs {len}, {len}, 16
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s,
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+        }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    arm64simd_sub_by_scalar_f32_16n,
+    16,
+    4,
+    f32,
+    fn run(buf: &mut [f32], s: f32) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            std::arch::asm!("
+            dup v0.4s, v0.s[0]
+            2:
+                ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}]
+                fsub v4.4s, v4.4s, v0.4s
+                fsub v5.4s, v5.4s, v0.4s
+                fsub v6.4s, v6.4s, v0.4s
+                fsub v7.4s, v7.4s, v0.4s
+                st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
+                subs {len}, {len}, 16
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s,
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+        }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    arm64simd_subf_by_scalar_f32_16n,
+    16,
+    4,
+    f32,
+    fn run(buf: &mut [f32], s: f32) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            std::arch::asm!("
+            dup v0.4s, v0.s[0]
+            2:
+                ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}]
+                fsub v4.4s, v0.4s, v4.4s
+                fsub v5.4s, v0.4s, v5.4s
+                fsub v6.4s, v0.4s, v6.4s
+                fsub v7.4s, v0.4s, v7.4s
+                st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
+                subs {len}, {len}, 16
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s,
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+        }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    arm64simd_min_by_scalar_f32_16n,
+    16,
+    4,
+    f32,
+    fn run(buf: &mut [f32], s: f32) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            std::arch::asm!("
+            dup v0.4s, v0.s[0]
+            2:
+                ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}]
+                fmin v4.4s, v4.4s, v0.4s
+                fmin v5.4s, v5.4s, v0.4s
+                fmin v6.4s, v6.4s, v0.4s
+                fmin v7.4s, v7.4s, v0.4s
+                st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
+                subs {len}, {len}, 16
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s,
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+        }
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    arm64simd_max_by_scalar_f32_16n,
+    16,
+    4,
+    f32,
+    fn run(buf: &mut [f32], s: f32) {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            std::arch::asm!("
+            dup v0.4s, v0.s[0]
+            2:
+                ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}]
+                fmax v4.4s, v4.4s, v0.4s
+                fmax v5.4s, v5.4s, v0.4s
+                fmax v6.4s, v6.4s, v0.4s
+                fmax v7.4s, v7.4s, v0.4s
+                st1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
+                subs {len}, {len}, 16
+                bne 2b
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            in("v0") s,
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+        }
+    }
+);
+
+#[cfg(test)]
+mod test_arm64simd_mul_by_scalar_f32_16n {
+    use super::*;
+    by_scalar_frame_tests!(true, f32, arm64simd_mul_by_scalar_f32_16n, |a, b| a * b);
+    by_scalar_frame_tests!(true, f32, arm64simd_add_by_scalar_f32_16n, |a, b| a + b);
+    by_scalar_frame_tests!(true, f32, arm64simd_sub_by_scalar_f32_16n, |a, b| a - b);
+    by_scalar_frame_tests!(true, f32, arm64simd_subf_by_scalar_f32_16n, |a, b| b - a);
+    by_scalar_frame_tests!(true, f32, arm64simd_min_by_scalar_f32_16n, |a, b| a.min(b));
+    by_scalar_frame_tests!(true, f32, arm64simd_max_by_scalar_f32_16n, |a, b| a.max(b));
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/leaky_relu.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/leaky_relu.rs
new file mode 100644
index 000000000..d71666895
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/leaky_relu.rs
@@ -0,0 +1,50 @@
+ew_impl_wrap!(
+    f32,
+    arm64simd_leaky_relu_f32_8n,
+    8,
+    4,
+    f32,
+    #[inline(never)]
+    fn run(buf: &mut [f32], alpha: f32) {
+        assert!(buf.len() % 8 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            std::arch::asm!("
+                dup v0.4s, {alpha:v}.s[0]
+                dup v1.4s, {one:v}.s[0]
+                2:
+                    ldp q3, q4, [{ptr}]
+
+                    fcmgt v5.4s, v3.4s, #0.0
+                    fcmgt v6.4s, v4.4s, #0.0
+                    bsl   v5.16b, v1.16b, v0.16b
+                    bsl   v6.16b, v1.16b, v0.16b
+                    fmul  v3.4s, v3.4s, v5.4s
+                    fmul  v4.4s, v4.4s, v6.4s
+
+                    stp q3, q4, [{ptr}], #32
+                    subs {len}, {len}, 8
+                    bne 2b
+            ",
+            one = in(vreg) 1.0f32,
+            alpha = in(vreg) alpha,
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            out("v0") _,
+            out("v1") _,
+            out("q3") _,
+            out("q4") _,
+            out("q5") _,
+            out("q6") _,
+            );
+        }
+    }
+);
+
+#[cfg(test)]
+pub mod test_arm64simd_leaky_relu_f32_8n {
+    use super::*;
+    leaky_relu_frame_tests!(true, f32, arm64simd_leaky_relu_f32_8n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/max.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/max.rs
new file mode 100644
index 000000000..3c32aa7ea
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/max.rs
@@ -0,0 +1,52 @@
+use std::arch::aarch64::{float32x4_t, vdupq_n_f32, vgetq_lane_f32};
+
+reduce_impl_wrap!(
+    f32,
+    arm64simd_max_f32_16n,
+    16,
+    4,
+    (),
+    f32::MIN,
+    #[inline(never)]
+    fn run(buf: &[f32], _: ()) -> f32 {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        unsafe {
+            let len = buf.len();
+            let ptr = buf.as_ptr();
+            let mut out: float32x4_t = vdupq_n_f32(f32::MIN);
+            std::arch::asm!("
+            and v1.16b, v0.16b, v0.16b
+            and v2.16b, v0.16b, v0.16b
+            and v3.16b, v0.16b, v0.16b
+            2:
+                ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
+                fmax v0.4s, v0.4s, v4.4s
+                fmax v1.4s, v1.4s, v5.4s
+                fmax v2.4s, v2.4s, v6.4s
+                fmax v3.4s, v3.4s, v7.4s
+                subs {len}, {len}, 16
+                bne 2b
+            fmax v0.4s, v0.4s, v1.4s
+            fmax v2.4s, v2.4s, v3.4s
+            fmax v0.4s, v0.4s, v2.4s
+            fmaxv s0, v0.4s
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            inout("v0") out, out("v1") _, out("v2") _, out("v3") _,
+            out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+            vgetq_lane_f32(out, 0)
+        }
+    },
+    #[inline(never)]
+    fn reduce_two(a: f32, b: f32) -> f32 {
+        a.max(b)
+    }
+);
+
+#[cfg(test)]
+mod test_arm64simd_max_f32_16n {
+    use super::*;
+    crate::max_frame_tests!(true, f32, arm64simd_max_f32_16n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/panel_extract.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/panel_extract.rs
new file mode 100644
index 000000000..bbfb22d9f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/panel_extract.rs
@@ -0,0 +1,98 @@
+use crate::pack::Packing;
+use crate::Ops;
+
+pub fn plug(ops: &mut Ops) {
+    ops.panel_extractors.push(packed_32_q40_to_f32.clone());
+}
+
+panel_extractor!(kernel_packed_32_q40_to_f32 as packed_32_q40_to_f32(
+    Box::new(super::q40p32z16se()),
+    f32::packing(32).align(16)
+));
+
+unsafe fn kernel_packed_32_q40_to_f32(input: *const u8, output: *mut u8, k: usize) {
+    unsafe {
+        if k == 0 {
+            return;
+        }
+        let lookup_table: [u8; 16] = [
+            0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc, 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45,
+            0x46, 0x47,
+        ];
+        std::arch::asm!("
+    ld1      {{v13.16b}}, [{lookup_table}]
+    movi     v15.16b, 15
+    eor      v12.16b, v12.16b, v12.16b
+
+    2:
+        add     {scales}, {i}, 512  // scales at end: 32 (cols) * 32 (rows) / 2 (half byte)
+        ld1     {{v0.8h-v3.8h}}, [{scales}]
+
+        fcvtl   v16.4s, v0.4h
+        fcvtl2  v17.4s, v0.8h
+        fcvtl   v18.4s, v1.4h
+        fcvtl2  v19.4s, v1.8h
+        fcvtl   v20.4s, v2.4h
+        fcvtl2  v21.4s, v2.8h
+        fcvtl   v22.4s, v3.4h
+        fcvtl2  v23.4s, v3.8h
+
+        mov     {k2}, 32
+    3:
+        ld1     {{ v9.16b }}, [{i}], #16
+
+        and     v0.16b, v9.16b, v15.16b
+        ushr    v4.16b, v9.16b, 4
+
+        tbl     v0.16b, {{ v13.16b }}, v0.16b
+        tbl     v4.16b, {{ v13.16b }}, v4.16b
+
+        zip2    v2.16b, v12.16b, v0.16b
+        zip2    v6.16b, v12.16b, v4.16b
+
+        zip1    v0.16b, v12.16b, v0.16b
+        zip1    v4.16b, v12.16b, v4.16b
+
+        fcvtl2  v1.4s, v0.8h
+        fcvtl   v0.4s, v0.4h
+        fcvtl2  v3.4s, v2.8h
+        fcvtl   v2.4s, v2.4h
+        fcvtl2  v5.4s, v4.8h
+        fcvtl   v4.4s, v4.4h
+        fcvtl2  v7.4s, v6.8h
+        fcvtl   v6.4s, v6.4h
+
+        fmul    v0.4s, v0.4s, v16.4s
+        fmul    v1.4s, v1.4s, v17.4s
+        fmul    v2.4s, v2.4s, v18.4s
+        fmul    v3.4s, v3.4s, v19.4s
+        fmul    v4.4s, v4.4s, v20.4s
+        fmul    v5.4s, v5.4s, v21.4s
+        fmul    v6.4s, v6.4s, v22.4s
+        fmul    v7.4s, v7.4s, v23.4s
+
+        st1     {{v0.16b-v3.16b}}, [{o}], #64
+        st1     {{v4.16b-v7.16b}}, [{o}], #64
+
+        subs    {k2}, {k2}, #1
+        bne     3b
+
+        add     {i}, {i}, 64 // skip scales
+        subs    {k}, {k}, 32
+        bne     2b
+            ",
+        lookup_table = in(reg) &lookup_table,
+        k = inout(reg) k => _,
+        k2 = out(reg) _,
+        scales = out(reg) _,
+        i = inout(reg) input => _,
+        o = inout(reg) output => _,
+        out("v0") _, out("v1") _, out("v2") _, out("v3") _,
+        out("v4") _, out("v5") _, out("v6") _, out("v7") _,
+        out("v8") _, out("v9") _, out("v10") _, out("v11") _,
+        out("v12") _, out("v13") _, out("v14") _, out("v15") _,
+        out("v16") _, out("v17") _, out("v18") _, out("v19") _,
+        out("v20") _, out("v21") _, out("v22") _, out("v23") _,
+        );
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/softmax.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/softmax.rs
new file mode 100644
index 000000000..a9cfc162d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/softmax.rs
@@ -0,0 +1,110 @@
+map_reduce_impl_wrap!(
+    f32,
+    arm64simd_softmax2_fastcompact_f32_16n,
+    16,
+    4,
+    f32,
+    f32::MIN,
+    0f32,
+    #[inline(never)]
+    fn run(buf: &mut [f32], max: f32) -> f32 {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        let len = buf.len();
+        let ptr = buf.as_ptr();
+        let mut acc;
+        const MLN2: f32 = 0.6931471805f32;
+        const A: f32 = 8388608.0f32;
+        const B: f32 = 1065353216.0f32;
+        const C: f32 = 60801.0f32;
+        const SLOPE: f32 = A / MLN2;
+        const OFFSET: f32 = B - C;
+        unsafe {
+            std::arch::asm!("
+            // v0-v3 sum acc
+            eor v0.16b, v0.16b, v0.16b
+            eor v1.16b, v1.16b, v1.16b
+            eor v2.16b, v2.16b, v2.16b
+            eor v3.16b, v3.16b, v3.16b
+
+            dup v4.4s, v4.s[0] // max
+            dup v5.4s, v5.s[0] // slope
+            dup v6.4s, v6.s[0] // offset
+            eor v7.16b, v7.16b, v7.16b // zero for max
+            2:
+                ld1 {{v8.4s, v9.4s, v10.4s, v11.4s}}, [{ptr}]
+
+                fsub v8.4s, v8.4s, v4.4s
+                fsub v9.4s, v9.4s, v4.4s
+                fsub v10.4s, v10.4s, v4.4s
+                fsub v11.4s, v11.4s, v4.4s
+
+                fmul v8.4s, v8.4s, v5.4s
+                fmul v9.4s, v9.4s, v5.4s
+                fmul v10.4s, v10.4s, v5.4s
+                fmul v11.4s, v11.4s, v5.4s
+
+                fadd v8.4s, v8.4s, v6.4s
+                fadd v9.4s, v9.4s, v6.4s
+                fadd v10.4s, v10.4s, v6.4s
+                fadd v11.4s, v11.4s, v6.4s
+
+                fmax v8.4s, v8.4s, v7.4s
+                fmax v9.4s, v9.4s, v7.4s
+                fmax v10.4s, v10.4s, v7.4s
+                fmax v11.4s, v11.4s, v7.4s
+
+                fcvtnu v8.4s, v8.4s
+                fcvtnu v9.4s, v9.4s
+                fcvtnu v10.4s, v10.4s
+                fcvtnu v11.4s, v11.4s
+
+                fadd v0.4s, v0.4s, v8.4s
+                fadd v1.4s, v1.4s, v9.4s
+                fadd v2.4s, v2.4s, v10.4s
+                fadd v3.4s, v3.4s, v11.4s
+
+                st1 {{v8.4s, v9.4s, v10.4s, v11.4s}}, [{ptr}], 64
+                subs {len}, {len}, 16
+                bne 2b
+
+            fadd v0.4s, v0.4s, v1.4s
+            fadd v2.4s, v2.4s, v3.4s
+            fadd v0.4s, v0.4s, v2.4s
+
+            ext v1.16b, v0.16b, v0.16b, 4
+            ext v2.16b, v0.16b, v0.16b, 8
+            ext v3.16b, v0.16b, v0.16b, 12
+            fadd v0.4s, v0.4s, v1.4s
+            fadd v2.4s, v2.4s, v3.4s
+            fadd v0.4s, v0.4s, v2.4s
+            ",
+            len = inout(reg) len => _,
+            ptr = inout(reg) ptr => _,
+            out("v0") acc,
+            out("v1") _,
+            out("v2") _,
+            out("v3") _,
+            inout("v4") max => _,
+            inout("v5") SLOPE => _,
+            inout("v6") OFFSET => _,
+            out("v7") _,
+            out("v8") _,
+            out("v9") _,
+            out("v10") _,
+            out("v11") _,
+            );
+        }
+        acc
+    },
+    #[inline(never)]
+    fn reduce_two(a: f32, b: f32) -> f32 {
+        a + b
+    }
+);
+
+#[cfg(test)]
+mod test_arm64simd_softmax2_fastcompact_f32_16n {
+    use super::*;
+    crate::softmax_l2_frame_tests!(true, f32, arm64simd_softmax2_fastcompact_f32_16n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/sum.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/sum.rs
new file mode 100644
index 000000000..87116a8f1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/sum.rs
@@ -0,0 +1,59 @@
+use crate::num_traits::Zero;
+
+reduce_impl_wrap!(
+    f32,
+    arm64simd_sum_f32_16n,
+    16,
+    4,
+    (),
+    f32::zero(),
+    #[inline(never)]
+    fn run(buf: &[f32], _: ()) -> f32 {
+        assert!(buf.len() % 16 == 0);
+        assert!(buf.len() > 0);
+        unsafe fn run(buf: &[f32]) -> f32 {
+            unsafe {
+                let len = buf.len();
+                let ptr = buf.as_ptr();
+                let mut out: u32;
+                std::arch::asm!("
+                movi v0.4s, #0
+                movi v1.4s, #0
+                movi v2.4s, #0
+                movi v3.4s, #0
+                2:
+                    ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
+                    fadd v0.4s, v0.4s, v4.4s
+                    fadd v1.4s, v1.4s, v5.4s
+                    fadd v2.4s, v2.4s, v6.4s
+                    fadd v3.4s, v3.4s, v7.4s
+
+                    subs {len}, {len}, 16
+                    bne 2b
+
+                fadd v0.4s, v0.4s, v1.4s
+                fadd v2.4s, v2.4s, v3.4s
+                fadd v0.4s, v0.4s, v2.4s
+                faddp v0.4s, v0.4s, v0.4s
+                faddp v0.4s, v0.4s, v0.4s
+                ",
+                ptr = inout(reg) ptr => _,
+                len = inout(reg) len => _,
+                out("s0") out, out("v1") _, out("v2") _, out("v3") _,
+                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
+                f32::from_bits(out)
+            }
+        }
+        unsafe { run(buf) }
+    },
+    #[inline(never)]
+    fn reduce_two(a: f32, b: f32) -> f32 {
+        a + b
+    }
+);
+
+#[cfg(test)]
+mod test_arm64simd_sum_f32_16n {
+    use super::*;
+    crate::sum_frame_tests!(true, f32, arm64simd_sum_f32_16n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/unicast.rs b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/unicast.rs
new file mode 100644
index 000000000..a7a4d4114
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/arm64simd/unicast.rs
@@ -0,0 +1,233 @@
+unicast_impl_wrap!(
+    f32,
+    arm64simd_unicast_mul_f32_16n,
+    16,
+    4,
+    #[inline(never)]
+    fn run(a: &mut [f32], b: &[f32]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 16 == 0);
+        assert!(a.len() > 0);
+        unsafe fn run(a: &mut [f32], b: &[f32]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}]
+                    ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64
+                    fmul v0.4s, v0.4s, v4.4s
+                    fmul v1.4s, v1.4s, v5.4s
+                    fmul v2.4s, v2.4s, v6.4s
+                    fmul v3.4s, v3.4s, v7.4s
+                    st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64
+                    subs {len}, {len}, 16
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    arm64simd_unicast_add_f32_16n,
+    16,
+    4,
+    #[inline(never)]
+    fn run(a: &mut [f32], b: &[f32]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 16 == 0);
+        assert!(a.len() > 0);
+        unsafe fn run(a: &mut [f32], b: &[f32]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}]
+                    ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64
+                    fadd v0.4s, v0.4s, v4.4s
+                    fadd v1.4s, v1.4s, v5.4s
+                    fadd v2.4s, v2.4s, v6.4s
+                    fadd v3.4s, v3.4s, v7.4s
+                    st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64
+                    subs {len}, {len}, 16
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    arm64simd_unicast_sub_f32_16n,
+    16,
+    4,
+    #[inline(never)]
+    fn run(a: &mut [f32], b: &[f32]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 16 == 0);
+        assert!(a.len() > 0);
+        unsafe fn run(a: &mut [f32], b: &[f32]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}]
+                    ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64
+                    fsub v0.4s, v0.4s, v4.4s
+                    fsub v1.4s, v1.4s, v5.4s
+                    fsub v2.4s, v2.4s, v6.4s
+                    fsub v3.4s, v3.4s, v7.4s
+                    st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64
+                    subs {len}, {len}, 16
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    arm64simd_unicast_subf_f32_16n,
+    16,
+    4,
+    #[inline(never)]
+    fn run(a: &mut [f32], b: &[f32]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 16 == 0);
+        assert!(a.len() > 0);
+        unsafe fn run(a: &mut [f32], b: &[f32]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}]
+                    ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64
+                    fsub v0.4s, v4.4s, v0.4s
+                    fsub v1.4s, v5.4s, v1.4s
+                    fsub v2.4s, v6.4s, v2.4s
+                    fsub v3.4s, v7.4s, v3.4s
+                    st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64
+                    subs {len}, {len}, 16
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    arm64simd_unicast_max_f32_16n,
+    16,
+    4,
+    #[inline(never)]
+    fn run(a: &mut [f32], b: &[f32]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 16 == 0);
+        assert!(a.len() > 0);
+        unsafe fn run(a: &mut [f32], b: &[f32]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}]
+                    ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64
+                    fmax v0.4s, v0.4s, v4.4s
+                    fmax v1.4s, v1.4s, v5.4s
+                    fmax v2.4s, v2.4s, v6.4s
+                    fmax v3.4s, v3.4s, v7.4s
+                    st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64
+                    subs {len}, {len}, 16
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    arm64simd_unicast_min_f32_16n,
+    16,
+    4,
+    #[inline(never)]
+    fn run(a: &mut [f32], b: &[f32]) {
+        assert!(a.len() == b.len());
+        assert!(a.len() % 16 == 0);
+        assert!(a.len() > 0);
+        unsafe fn run(a: &mut [f32], b: &[f32]) {
+            unsafe {
+                let len = a.len();
+                let a_ptr = a.as_ptr();
+                let b_ptr = b.as_ptr();
+                std::arch::asm!("
+                2:
+                    ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}]
+                    ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{b_ptr}], 64
+                    fmin v0.4s, v0.4s, v4.4s
+                    fmin v1.4s, v1.4s, v5.4s
+                    fmin v2.4s, v2.4s, v6.4s
+                    fmin v3.4s, v3.4s, v7.4s
+                    st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [{a_ptr}], 64
+                    subs {len}, {len}, 16
+                    bne 2b
+            ",
+            len = inout(reg) len => _,
+            a_ptr = inout(reg) a_ptr => _,
+            b_ptr = inout(reg) b_ptr => _,
+            out("v0") _, out("v1") _, out("v2") _, out("v3") _,);
+            }
+        }
+        unsafe { run(a, b) }
+    }
+);
+
+#[cfg(test)]
+mod test_arm64simd_unicast_mul_f32_16n {
+    use super::*;
+    use proptest::strategy::Strategy;
+    crate::unicast_frame_tests!(true, f32, arm64simd_unicast_mul_f32_16n, |a, b| a * b);
+    crate::unicast_frame_tests!(true, f32, arm64simd_unicast_add_f32_16n, |a, b| a + b);
+    crate::unicast_frame_tests!(true, f32, arm64simd_unicast_sub_f32_16n, |a, b| a - b);
+    crate::unicast_frame_tests!(true, f32, arm64simd_unicast_subf_f32_16n, |a, b| b - a);
+    crate::unicast_frame_tests!(true, f32, arm64simd_unicast_min_f32_16n, |a, b| a.min(b));
+    crate::unicast_frame_tests!(true, f32, arm64simd_unicast_max_f32_16n, |a, b| a.max(b));
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/cortex_a53.rs b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a53.rs
new file mode 100644
index 000000000..5ccf9b689
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a53.rs
@@ -0,0 +1,16 @@
+use crate::frame::mmm::CostModel;
+        pub fn model() -> CostModel<'static> {
+            CostModel {
+                big_product_mkn_threshold: 4193280.0,
+                big_product_kernel_choice: "arm64simd_mmm_f32_12x8_a53",
+                kernels: &["arm64simd_mmm_f32_12x8_a53", "arm64simd_mmm_f32_12x8_gen", "arm64simd_mmm_f32_16x4_a53", "arm64simd_mmm_f32_16x4_gen", "arm64simd_mmm_f32_24x4_a53", "arm64simd_mmm_f32_24x4_gen", "arm64simd_mmm_f32_8x8_a53", "arm64simd_mmm_f32_8x8_gen", "generic_f32_4x4"],
+                mrs: &[4, 8, 12, 16, 24],
+                nrs: &[4, 8],
+                feat_norm_mean: &[4.592185479105843, 4.595318666792368, 4.579484503710355, 13.76698864960861, 1.5094315895372235, 0.7603118712273642, 3.47170523138833, 0.8752515090543259, 5.487801810865191, 0.9224094567404426, 7.414361167002012, 0.9387575452716298, 11.415367203219317, 0.959758551307847, 1.5074195171026157, 0.750125754527163, 3.47170523138833, 0.875125754527163],
+                feat_norm_stddev: &[1.2629893666668983, 1.2446322895476982, 1.258916587498509, 1.3105293102858375, 1.1063478713873012, 0.4268931127321023, 2.3025561444671223, 0.330433510637837, 3.431728816936762, 0.2675261685447694, 4.624258056138275, 0.23977451171303063, 6.954988241153163, 0.19652499713600946, 1.1207056563030822, 0.4329400731304941, 2.292868878895526, 0.3305762669799629],
+                w1: &[-0.6321063041687012, 0.24184978008270264, -0.4356610178947449, -0.1422707587480545, 0.10410869866609573, 0.09415467828512192, 0.1568029671907425, -0.25644537806510925, -0.37143954634666443, 0.15696385502815247, 0.050514884293079376, -0.07972156256437302, -0.253411203622818, 0.27587205171585083, 0.02698700875043869, -0.07245094329118729, -0.013899300247430801, 0.022088056430220604, 0.2630922496318817, -0.06870237737894058, 0.40947580337524414, 0.22110328078269958, 0.03808840364217758, -0.008957616984844208, -0.11127127707004547, 0.07818343490362167, 0.025474127382040024, -0.09513817727565765, 0.10613243281841278, 0.029441041871905327, 0.0819312185049057, -0.03519295156002045, -0.3130439519882202, 0.4705337882041931, 0.4476615786552429, -0.616556704044342, 0.2223544716835022, -0.23584842681884766, -0.3312308192253113, 0.18874213099479675, -0.033394988626241684, 0.09006354957818985, 0.014722823165357113, 0.0877116397023201, 0.07635975629091263, 0.04284617677330971, -0.029695890843868256, -0.05645013228058815, -0.096514992415905, 0.16431200504302979, 0.11922749876976013, -0.08329842984676361, -0.15593503415584564, 0.33497852087020874, 0.5143201947212219, -0.4143742322921753, -0.07121813297271729, 0.032980211079120636, -0.014759342186152935, -0.10575086623430252, -0.08755142986774445, 0.053559254854917526, 0.2959750294685364, -0.210640087723732, -0.09462635219097137, 0.14600691199302673, 0.22388464212417603, -0.185477152466774, -0.100673608481884, -0.10946766287088394, 0.03957876190543175, -0.10485030710697174, 0.01792730763554573, 0.15610192716121674, -0.14726269245147705, 0.30900657176971436, 0.21081387996673584, -0.06592089682817459, 0.03168980032205582, 0.20096036791801453, 0.021350117400288582, -0.04456694424152374, 0.35106319189071655, 0.04561518132686615, -0.14208926260471344, 0.06227286159992218, -0.20092618465423584, 0.08163813501596451, 0.23094142973423004, -0.0332462415099144, 0.26035502552986145, 0.4639679193496704, 0.11891252547502518, 0.4722647964954376, -0.025709064677357674, 0.1651654839515686, -0.009242026135325432, 0.02252785675227642, 0.13325856626033783, -0.32073062658309937, -0.05948975682258606, -0.07114000618457794, -0.04468341916799545, -0.002579547930508852, 0.2056179940700531, -0.14614446461200714, -0.11110267788171768, 0.09043771028518677, 0.135812908411026, -0.3300320506095886, 0.290109783411026, 0.23399846255779266, -0.04882314056158066, -7.729629578534514e-05, 0.04754950851202011, 0.003435821272432804, 0.1115187332034111, -0.08208155632019043, 0.018088344484567642, -0.01600349321961403, -0.025757616385817528, 0.060233402997255325, -0.08445348590612411, 0.375010222196579, 0.7828134298324585, -0.836024820804596, 0.041282471269369125, -0.07747451961040497, 0.31279265880584717, -0.05552798509597778, -0.03274049609899521, -0.1147448793053627, -0.1660863310098648, 0.390122652053833, 0.29283249378204346, -0.0705522671341896, -0.2927100956439972, 0.038575850427150726, -0.15336857736110687, -0.028894517570734024, -0.06372164189815521, 0.2578844130039215, 0.060502175241708755, -0.14235782623291016, 0.6358739137649536, -0.2645033001899719, 0.01847453974187374, 0.3809853792190552, 0.0059107388369739056, -0.07365082949399948, -0.17490413784980774, 0.26099810004234314, 0.38216090202331543, -0.44192376732826233, -0.1497800052165985, 0.11983825266361237, 0.05704215168952942, -0.09331715852022171, -0.027353238314390182, 0.07132093608379364, 0.013686291873455048, -0.14973664283752441, -0.6386663317680359, -0.42794787883758545, 0.43632233142852783, -0.022474655881524086, 0.011099671013653278, 0.08784982562065125, 0.046248968690633774, 0.011553826741874218, 0.0328642763197422, 0.08678832650184631, 0.3153251111507416, -0.15444470942020416, -0.5339609980583191, 0.10007581859827042, -0.02821769379079342, -0.3091129660606384, -0.6009559631347656, -0.555920422077179, 0.9594710469245911, -0.5884919166564941, -0.08316593617200851, 0.07074970006942749, 0.026868166401982307, 0.03690064698457718, -0.2468167096376419, 0.20655325055122375, 0.2654767632484436, -0.11032287031412125, 0.09603621065616608, 0.12746618688106537, 0.11097392439842224, -0.046335164457559586, 0.2753968834877014, -0.4040895402431488, -0.20803606510162354, 0.29299837350845337, -0.21050886809825897, -0.02308674342930317, 0.32019543647766113, -0.010012545622885227, -0.07219666987657547, 0.03816547617316246, -0.03670865297317505, -0.023583250120282173, -0.2030763179063797, 0.4087490737438202, 0.19682352244853973, -0.061049312353134155, -0.34018784761428833, 0.4121433198451996, -0.10742263495922089, -0.2883375287055969, 0.15564028918743134, -0.014489974826574326, -0.40427249670028687, 0.04029366746544838, -0.46333804726600647, -0.5811125636100769, 0.1686166524887085, -0.08247993886470795, 0.02783152647316456, -0.07444962859153748, -0.11033248156309128, 0.17976728081703186, -0.05866902321577072, -0.037863120436668396, 0.016240332275629044, 0.08362828195095062, 0.04285397008061409, -0.2676204442977905, -0.18113869428634644, 0.10164932906627655, 0.5798585414886475, -0.2936221659183502, -0.16815273463726044, 0.3153108060359955, 0.1320323497056961, 0.29474350810050964, -0.31565147638320923, 0.032277628779411316, 0.5137525796890259, 0.13915763795375824, -0.08313784748315811, 0.0871160700917244, 0.07447603344917297, -0.4863177537918091, 0.022499559447169304, 0.07244526594877243, -0.1484450399875641, -0.08256664127111435, 0.09993510693311691, 0.33980417251586914, -0.5465939044952393, -0.18684262037277222, 0.050183601677417755, 0.015223318710923195, -0.32613685727119446, 0.2532300353050232, 0.21044038236141205, -0.24877160787582397, 0.17659279704093933, -0.14793306589126587, 0.054353710263967514, -0.07312241941690445, 0.04128497466444969, -0.0071349963545799255, -0.17010675370693207, 0.3045605719089508, -0.391606867313385, 0.19206605851650238, 0.10403380542993546, -0.3808597922325134, -0.016270365566015244, -0.09313700348138809, 0.11184006929397583, 0.01242944784462452, -0.03349926695227623, -0.1107369139790535, 0.2315940409898758, 0.03170541673898697, -0.48357459902763367, 0.21056240797042847, -0.25072887539863586, 0.3221265375614166, 0.5108669400215149, -0.6159838438034058, -0.5540208220481873, 0.38405123353004456, 0.1323588639497757, -0.11752784997224808, 0.07821227610111237, 0.0494898185133934, 0.28607267141342163, -0.45723024010658264, -0.5914809703826904, -0.15741930902004242, -0.09551641345024109, -0.769051730632782, -0.2119017094373703, -0.8505933284759521, 0.025818098336458206, 0.11196669936180115, 0.013385393656790257, -0.02640729956328869, -0.061663247644901276, -0.012524818070232868, -0.8237857222557068, -0.40553018450737, -0.06807617098093033, -0.07508324831724167, -0.011943532153964043, 0.07591933757066727, 0.18625806272029877, -0.14417743682861328, 0.0031204342376440763, -0.031199704855680466, -0.037418268620967865, -0.062444642186164856, 0.0434197299182415, -0.12462416291236877, -0.256317675113678, -0.0023087849840521812, 0.20042477548122406, 0.17625926434993744, -0.21970611810684204, 0.1626158505678177, -0.09550918638706207, -0.10577445477247238, -0.17239737510681152, 0.28190216422080994, 0.003485368099063635, -0.24596424400806427, 0.5330491662025452, -0.6179713010787964, -0.19186368584632874, 0.04049135372042656, 0.005797799210995436, 0.10468537360429764, -0.03522713482379913, 0.2554764151573181, -0.6601210832595825, 0.3554987609386444, -0.1528356373310089, -0.2578294575214386, -0.01912580616772175, 0.14837700128555298, 0.28032413125038147, 0.6525465250015259, -0.16390740871429443, -0.12456659972667694, -0.04434182122349739, 0.44120529294013977, -0.06832294911146164, 0.4077378511428833, -0.07938709110021591, 0.23457404971122742, -0.05966708064079285, 0.09640492498874664, 0.7555295825004578, -0.3110663592815399, 0.035311225801706314, 0.25391876697540283, 0.09088675677776337, 0.03320888802409172, -0.1745719611644745, 0.2270633578300476, 0.2851920425891876, -0.07204318791627884, -0.05483328923583031, 0.189837247133255, -0.15304607152938843, -0.08311894536018372, -0.06649994850158691, -0.0776129737496376, 0.11864881962537766, -0.06670717149972916, -0.00406235596165061, -0.6984686255455017, 0.28291743993759155, -0.04160117730498314, -0.09169034659862518, 0.14924104511737823, 0.46138641238212585, -0.29699283838272095, -0.6411864757537842, 0.26037612557411194, 0.21487018465995789, -0.20806393027305603, -0.4174681007862091, 0.1901395320892334, 0.049021925777196884, 0.2822348475456238, -0.03862098604440689, 0.029824024066329002, 0.2657202184200287, -0.43108099699020386, 0.37041717767715454, -0.025845345109701157, -0.09200481325387955, -0.017871620133519173, 0.281535267829895, -0.20838744938373566, -0.400356650352478, 0.4133286476135254, -0.08745774626731873, 0.02171195112168789, 0.4766440987586975, -0.24629971385002136, 0.2504408657550812, -0.5850875973701477, -0.49699774384498596, 0.7086884379386902, -0.479250967502594, 0.6140879392623901, 0.0023341099731624126, -0.06628652662038803, -0.0873338133096695, -0.2862805724143982, 0.28077220916748047, 0.030578527599573135, -0.281633198261261, -0.7042887806892395, -0.03409203886985779, 0.3272986114025116, 0.3397904634475708, -0.7069221138954163, 0.09408266842365265, -0.05243761092424393, -0.20503726601600647, 0.15679042041301727, 0.4723545014858246, -0.39158886671066284, 0.17581138014793396, 0.10779093205928802, -0.013951681554317474, 0.052481986582279205, -0.36543500423431396, 0.29497984051704407, 0.4044850766658783, -0.3766767382621765, -0.07298431545495987, 0.9660398364067078, 0.27753373980522156, -0.11616200953722, 0.05277060344815254, -0.05379771068692207, 0.026094499975442886, -0.011136082001030445, -0.13593854010105133, 0.033518679440021515, 0.6947338581085205, 0.6335914134979248, -0.06526267528533936, 0.019844267517328262, 0.10042254626750946, -0.16847042739391327, -0.15717101097106934, -0.7462965250015259, -0.0653005987405777, 0.057602036744356155, 0.010834889486432076, -0.46870648860931396, -0.1872870922088623, 0.3152116537094116, 0.0731910765171051, -0.13902369141578674, 0.10666802525520325, 0.3094567656517029, -0.926356315612793, -0.38388797640800476, -0.02191060781478882, -0.005548040382564068, -0.20935170352458954, 0.24779647588729858, 0.12304577976465225, -0.2883053123950958, 0.019766222685575485, -0.029659172520041466, 0.06051887571811676, -0.01741836965084076, 0.04409812018275261, 0.011840295046567917, -0.14320705831050873, 0.31673386693000793, -0.069312185049057, -0.00935965683311224, 0.019028477370738983, -0.1078404039144516, -0.12472966313362122, 0.10027194768190384, 0.31244829297065735, -0.10855710506439209, -0.3165830969810486, 0.4076120853424072, 0.05742274224758148, 0.17263729870319366, 0.3141464293003082, -0.13655878603458405, 0.07613589614629745, -0.10808823257684708, -0.19837258756160736, 0.16735948622226715, 0.055960867553949356, 0.005388774909079075, -0.30227115750312805, -0.009724846109747887, -0.11610261350870132, 0.05133519321680069, -0.029441826045513153, 0.06810834258794785, -0.13311177492141724, 0.2196519374847412, 0.19138571619987488, -0.2621391713619232, 0.11996466666460037, -0.05961257219314575, 0.1763487011194229, -0.10918399691581726, -0.14629563689231873, 0.5217060446739197, -0.0012722538085654378, 0.08564157783985138, -0.6640400290489197, -0.41702714562416077, 0.045037489384412766, -0.059789709746837616, -0.05092751979827881, 0.10446680337190628, -0.05335049331188202, 0.0846114456653595, 0.04981796815991402, -0.14310699701309204, 0.01863306201994419, -0.0474325567483902, 0.23124581575393677, -0.6166588068008423, -0.7533295154571533, -1.1133880615234375, -0.1241607666015625, -0.5540894865989685, 0.2806711494922638, -0.4259497821331024, -0.07380827516317368, 0.009988346137106419, 0.3110937178134918, 0.0072226757183671, 0.2422133982181549, -0.351376473903656, -0.5103139877319336, 0.5470908284187317, -0.14952707290649414, -0.005531645845621824, -0.24725599586963654, 0.1639375537633896, 0.07172811776399612, -0.1566568911075592, 0.32833099365234375, 0.06875353306531906, -0.17773276567459106, -0.09706790000200272, -0.019849322736263275, 0.1257631778717041, 0.02103520557284355, 0.12721672654151917, 0.012451020069420338, 0.039879027754068375, 0.17779605090618134, -0.09887054562568665, -0.08146625012159348, 0.05893132835626602, 0.18479469418525696, -0.2479601502418518, -0.26928654313087463, 0.3720027506351471, -0.45930227637290955, 0.3673400282859802, 0.016545426100492477, 0.13507097959518433, -0.006458526011556387, 0.036685895174741745, 0.309455007314682, -0.23917894065380096, -0.11758854985237122, 0.2146540731191635, -0.11578961461782455, 0.006646907888352871, -0.04229713976383209, 0.09812270104885101, 0.06730903685092926, 0.28935620188713074, -0.02212020941078663, 0.007341589778661728, -0.1257125288248062, -0.4639318287372589, 0.41743314266204834, 0.40524497628211975, -0.20389464497566223, 0.1286880075931549, 0.05365758389234543, -0.14487741887569427, 0.1511518359184265, 0.11219878494739532, 0.13080842792987823, -0.175934836268425, -0.08939457684755325, 0.16476190090179443, -0.061722587794065475, 0.15382836759090424, 0.15293729305267334, -0.23814627528190613, -0.778872013092041, 0.2813372313976288, 0.20388194918632507, -0.34535032510757446, -0.014981378801167011, 0.1560390293598175, 0.534339189529419, 0.7075706124305725, -0.20866382122039795, 0.050050001591444016, -0.030285198241472244, 0.430580735206604, 0.06858251988887787, 0.32321590185165405, 0.006104054860770702, 0.11919829249382019, -0.09377042204141617, -0.028785547241568565, 0.489607572555542, -0.321664422750473, 0.020770607516169548, 0.5259214639663696, -0.0682888925075531, 0.10569659620523453, -0.18257132172584534, 0.2565872073173523, 0.2177353799343109, 0.029641704633831978, 0.0678875744342804, 0.1679811030626297, -0.04851052165031433, -0.1633165180683136, -0.007416700944304466, -0.06638842821121216, 0.06177712231874466, -0.0709109827876091, -0.11213518679141998, -0.20582593977451324, 0.7092531323432922, 0.43438467383384705, -0.0060964771546423435, -0.12442151457071304, -0.008676152676343918, 0.21390584111213684, -0.014475004747509956, -0.7601429224014282, 0.15622451901435852, -0.3261253833770752, 0.005610095337033272, -0.5111817121505737, -0.003055301494896412, 0.32741662859916687, -0.022710084915161133, -0.24255472421646118, -0.6487520933151245, 0.08797790110111237, 0.2754897177219391, -0.2213398665189743, -0.17206217348575592, 0.1177680641412735, 0.16599608957767487, -0.19922694563865662, -0.07098120450973511, -0.1628963202238083, 0.03356413170695305, -0.24303652346134186, -0.2067747414112091, 0.1192406490445137, -0.020932691171765327, 0.07735628634691238, 0.24762177467346191, -0.3007707893848419, -0.43011191487312317, -0.07597793638706207, 0.2528873085975647, -0.3795652985572815, 0.14651291072368622, 0.07552091032266617, 0.026706784963607788, -0.11118876934051514, 0.0460294634103775, 0.4268769323825836, 0.32645294070243835, -0.09493713080883026, 0.18892213702201843, 0.17980137467384338, 0.06521839648485184, 0.03702569752931595, 0.05443478748202324, -0.030978504568338394, -0.11806164681911469, -0.20229215919971466, 0.6260767579078674, 0.6068219542503357, -0.060956377536058426, 0.05200914293527603, 0.04499080404639244, -0.09300816804170609, 0.0501115508377552, 0.9676806926727295, -0.12394528090953827, 0.17313909530639648, -0.0274575874209404, 1.0245190858840942, -0.24425312876701355, 0.3827340602874756, 0.270155131816864, -0.7169324159622192],
+                b1: &[-0.518636167049408, 0.7074531316757202, -0.4965735971927643, 0.6063699126243591, -0.3258720934391022, 0.4608336389064789, 0.8324258327484131, -0.6118353605270386, 0.8226121664047241, 0.3534131944179535, -0.43312883377075195, -0.05448569357395172, -0.5826212167739868, 0.8478071689605713, 0.23062080144882202, -0.30911386013031006, -0.5776869058609009, 0.5107449293136597, 0.18762148916721344, 0.2889731228351593, -0.5579098463058472, 0.7818499207496643, 0.7910265922546387, -0.4228874444961548, 0.6197248697280884, -0.4563252627849579, 0.27223169803619385, -0.2859383523464203, -0.4862801730632782, -0.7853735089302063, -0.1534343808889389, -0.5592636466026306, -0.6364999413490295, -0.5210756063461304, 0.3506944477558136, -0.5348182916641235, -0.5098673105239868, 0.45690369606018066, -0.3907462954521179, 0.8493368029594421],
+                w2: &[-0.525189995765686, 0.44041961431503296, -0.4107511341571808, 0.3741440176963806, -0.02630656771361828, 0.27733951807022095, 0.3907228410243988, -0.05409616604447365, 0.3991526663303375, 0.24264170229434967, -0.657869279384613, -0.3758363425731659, -0.5133534669876099, 0.3480457663536072, 0.5088834166526794, 0.0942729115486145, -0.4167974889278412, 0.4895906448364258, 0.17553496360778809, 0.3702719211578369, -0.5372111201286316, -0.1560969352722168, -0.30670106410980225, -0.48799967765808105, 0.4005548357963562, -0.3075137138366699, 0.656658947467804, -0.4914362132549286, -0.36532747745513916, -0.5505443811416626, 0.1328023225069046, -0.3564044237136841, -0.467242956161499, -0.3465808629989624, 0.4501214027404785, -0.4742763936519623, -0.35285890102386475, 0.46182748675346375, -0.28942185640335083, 0.2825036346912384, -0.1725425124168396, -0.17012473940849304, 0.5306965708732605, -0.34125325083732605, 0.21301832795143127, -0.49370092153549194, -0.06135714799165726, 0.5665233135223389, -0.01510544028133154, -0.0015591675182804465, 0.4308379292488098, 0.09525317698717117, 0.06129995733499527, -0.06124228611588478, -0.28377535939216614, -0.038286369293928146, 0.19221894443035126, -0.45041826367378235, -0.4307488799095154, -0.30516454577445984, 0.3670405447483063, -0.1779327690601349, -0.36808863282203674, 0.344722718000412, -0.2691067159175873, 0.5803861021995544, -0.42112261056900024, 0.1169033870100975, 0.35742461681365967, 0.16161565482616425, 0.44920068979263306, 0.2572435438632965, 0.263318806886673, 0.7236857414245605, -0.2759736180305481, 0.37376394867897034, 0.37350600957870483, -0.4067005515098572, 0.18588955700397491, -0.4281120300292969, 0.4204690456390381, -0.448592871427536, 0.11808016151189804, -0.4660882353782654, 0.33337321877479553, -0.11569353938102722, -0.589764416217804, -0.17854063212871552, -0.44001755118370056, 0.7101057767868042, 0.057653751224279404, 0.3937684893608093, 0.257487416267395, -0.38924211263656616, 0.08511713892221451, 0.10950952023267746, 0.0917661041021347, -0.25429144501686096, 0.6342174410820007, -0.15891794860363007, -0.021509289741516113, 0.535305380821228, 0.28721731901168823, -0.32432296872138977, -0.26846611499786377, 0.07051636278629303, -0.12710770964622498, 0.14568471908569336, 0.6293584704399109, 0.4198862612247467, -0.8883509039878845, 0.5271400809288025, 0.17345309257507324, 0.1771862506866455, -0.214192733168602, 0.17817191779613495, 0.44757506251335144, 0.04112042486667633, 0.6819244027137756, -0.7277362942695618, 0.19224950671195984, -0.2905896008014679, 0.5791959762573242, -0.4898945093154907, 0.47323065996170044, -0.40173205733299255, -0.36294564604759216, 0.6861273050308228, -0.2955973744392395, -0.19740070402622223, 0.4044080674648285, -0.11244003474712372, 0.58234703540802, -0.31175708770751953, -0.3454722762107849, 0.12274620682001114, 0.29693669080734253, -0.41234102845191956, -0.1583351045846939, -0.2763107419013977, 0.34174609184265137, -0.7301539182662964, -0.4137580394744873, 0.5135444402694702, -0.19664454460144043, 0.3913029730319977, -0.47720086574554443, 0.2519521415233612, 0.3860025703907013, 0.4073657691478729, 0.06604084372520447, 0.32879960536956787, 0.4341438114643097, 0.4072171449661255, -0.3755425810813904, 0.29250237345695496, 0.4723772704601288, -0.39177075028419495, 0.3535446524620056, -0.5977760553359985, -0.11535356938838959, -0.8606860637664795, 0.3202466070652008, 0.534551203250885, -0.10786011070013046, 0.5766461491584778, -1.0034655332565308, -0.08353354036808014, 0.20165663957595825, -0.8530645370483398, 0.2801732122898102, -0.2713226079940796, 0.460101842880249, 0.5550602078437805, 0.11862986534833908, -0.8431587219238281, -0.41269758343696594, -0.36862486600875854, 0.08385410159826279, 0.1634000688791275, -0.22930988669395447, -0.39085301756858826, 0.8845512270927429, 0.2522968053817749, 0.3779301643371582, 0.3454946279525757, -0.14984408020973206, 0.2937467098236084, 0.3651972711086273, 1.1317671537399292, -0.4535387456417084, 0.07272656261920929, -0.29987066984176636, -0.03405649587512016, 0.1012202724814415, -0.12492970377206802, -0.048626113682985306, -0.3150321841239929, -0.4124220013618469, -0.7775830030441284, 0.25562793016433716, -0.4026365876197815, 0.27681317925453186, -0.3169574439525604, 0.414761483669281, -0.37095436453819275, -0.2815983295440674, 0.6821384429931641, -0.23631460964679718, -0.391885370016098, 0.32081300020217896, 0.029309673234820366, 0.3151959478855133, -0.23872429132461548, -0.2680605947971344, 0.2245175689458847, 0.28024742007255554, -0.5187304615974426, -0.17155316472053528, -0.18662460148334503, 0.44196388125419617, -0.7731465697288513, -0.39956656098365784, 0.4926709830760956, -0.2705640196800232, 0.5851831436157227, -0.28655296564102173, 0.21914565563201904, 0.42291808128356934, 0.3754308521747589, 0.12476411461830139, 0.4564429223537445, 0.41455739736557007, 0.24721866846084595, -0.39062193036079407, 0.47335484623908997, 0.4390261769294739, -0.2776612639427185, 0.36352279782295227, -0.4658246338367462, 0.5458199977874756, 0.2368425875902176, -0.28375834226608276, -0.21349868178367615, -0.12575705349445343, -0.314109742641449, 0.2133757472038269, -0.4604170322418213, -0.5457999110221863, 0.347943514585495, 0.3864844739437103, 0.2128392457962036, 0.06274894624948502, -0.5941122174263, -0.4954967200756073, 0.3897503614425659, 0.6681548953056335, 0.011607992462813854, -0.5754616260528564, -0.4551040530204773, 0.14332124590873718, 0.5475043058395386, 0.35485684871673584, 0.516143798828125, -0.43508225679397583, -0.2927212119102478, -0.38220953941345215, 0.22585861384868622, -0.49666696786880493, -0.47814127802848816, 0.6455125212669373, -0.4184291362762451, 0.5714888572692871, -0.06349734216928482, -0.337534636259079, 0.08359762281179428, -0.6663680672645569, -0.05490731820464134, 0.27789443731307983, 0.44944822788238525, -0.12919825315475464, -0.24064187705516815, 0.3863179683685303, -0.21315856277942657, -0.010893935337662697, -0.49465489387512207, -0.1953386515378952, 0.4405977129936218, -0.362499862909317, -0.15224213898181915, 0.503758430480957, 0.13674911856651306, 0.24574719369411469, -0.2888658046722412, -0.5966756939888, 0.24279867112636566, 0.43060633540153503, -0.2950061857700348, -0.3071616590023041, -0.31878525018692017, 0.5719135999679565, -0.46542906761169434, -0.33102989196777344, 0.2584391236305237, -0.3341030776500702, 0.35185420513153076, -0.5347702503204346, 0.2021929919719696, 0.3747906982898712, 0.3017856478691101, 0.4192887842655182, 0.2290816456079483, 0.26369208097457886, 0.30613088607788086, -0.2766033113002777, 0.48649486899375916, 0.28767234086990356, -0.31826111674308777, 0.47518086433410645, -0.2643313407897949, 0.38674306869506836, -0.20252466201782227, 0.2426745593547821, -0.2963939607143402, 0.35027387738227844, -0.40756842494010925, -0.17158618569374084, 0.6504075527191162, -0.23639068007469177, -0.5520732998847961, 0.34597641229629517, 0.12782879173755646, 0.46479496359825134, -0.4128115773200989, -0.4125882685184479, 0.20131008327007294, 0.4997844099998474, -0.21766024827957153, -0.2570849657058716, -0.1471637338399887, 0.5070111155509949, -0.6722937226295471, -0.5443961024284363, 0.5341878533363342, -0.29976886510849, 0.6135430932044983, -0.3595261573791504, 0.49033448100090027, 0.3653552234172821, 0.2656362056732178, 0.10900922119617462, 0.4813465476036072, 0.41922783851623535, 0.2692069411277771, -0.4056242108345032, 0.33006641268730164, 0.27100467681884766, -0.5306692123413086, 0.2701503336429596, -0.6044796705245972],
+                b2: &[0.044342152774333954, -0.28361865878105164, -0.0350283607840538, -0.129508376121521, -0.006770995445549488, -0.24053514003753662, 0.3617520332336426, -0.3381704092025757, -0.24953331053256989],
+            }
+        }
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/cortex_a55.rs b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a55.rs
new file mode 100644
index 000000000..8156850af
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a55.rs
@@ -0,0 +1,16 @@
+use crate::frame::mmm::CostModel;
+        pub fn model() -> CostModel<'static> {
+            CostModel {
+                big_product_mkn_threshold: 263214080.0,
+                big_product_kernel_choice: "arm64simd_mmm_f32_12x8_a55",
+                kernels: &["arm64simd_mmm_f32_12x8_a53", "arm64simd_mmm_f32_12x8_a55", "arm64simd_mmm_f32_12x8_gen", "arm64simd_mmm_f32_16x4_a53", "arm64simd_mmm_f32_16x4_a55", "arm64simd_mmm_f32_16x4_gen", "arm64simd_mmm_f32_24x4_a53", "arm64simd_mmm_f32_24x4_a55", "arm64simd_mmm_f32_24x4_gen", "arm64simd_mmm_f32_8x8_a53", "arm64simd_mmm_f32_8x8_a55", "arm64simd_mmm_f32_8x8_gen", "generic_f32_4x4"],
+                mrs: &[4, 8, 12, 16, 24],
+                nrs: &[4, 8],
+                feat_norm_mean: &[5.27886946965165, 6.250454700699139, 5.241114620514529, 16.770438790865423, 1.540625, 0.770625, 3.518125, 0.8775, 5.560625, 0.923125, 7.453125, 0.943125, 11.613125, 0.9575, 1.509375, 0.771875, 3.581875, 0.898125],
+                feat_norm_stddev: &[0.9509890252368828, 0.6930410342704738, 1.0261938261805659, 1.600617293156687, 1.0981118382819681, 0.42043086158725473, 2.338198341538852, 0.3278623949159141, 3.494112850120205, 0.26639300736881577, 4.56457037785321, 0.2316036147710134, 7.043415558830455, 0.20172691937369452, 1.0925484471523377, 0.4196236222795381, 2.273113830052299, 0.30248385804039707],
+                w1: &[-0.13682155311107635, -0.1783919334411621, 0.26539096236228943, 0.19552235305309296, -0.10618806630373001, 0.13501706719398499, 0.21776071190834045, 0.08390733599662781, -0.2215081751346588, 0.18829140067100525, -0.20535176992416382, -0.0463368222117424, 0.05815611779689789, -0.13855215907096863, -0.024539709091186523, -0.4855460524559021, -0.414151668548584, -0.7574286460876465, 0.8987273573875427, 0.5316352844238281, 0.8244147896766663, 0.8388808369636536, -0.02545193023979664, 0.04357631504535675, -0.007071307860314846, 0.18223997950553894, -0.04292978346347809, 0.004330582916736603, -0.013073648326098919, -0.04028080403804779, -0.09901119023561478, 0.062175191938877106, -0.006247916258871555, 0.009531030431389809, 0.09731218218803406, 0.004297865089029074, 0.6260067224502563, -0.10042139887809753, 0.807989239692688, 0.6866835951805115, -0.018399836495518684, -0.07194910198450089, -0.18889868259429932, -0.07729395478963852, -0.03907148540019989, 0.017019111663103104, 0.06159460172057152, -0.02395886555314064, 0.23730705678462982, -0.15546496212482452, -0.04492897167801857, -0.003982013091444969, -0.09160511195659637, 0.03185845538973808, -0.27577653527259827, 0.5699247121810913, -0.6027079820632935, -0.4136800467967987, -0.04364140331745148, -0.11226192861795425, 0.16899903118610382, -0.11524038016796112, 0.12308179587125778, 0.027925828471779823, -0.06269104778766632, 0.11644940823316574, -0.369202196598053, 0.3338239789009094, -0.06509242206811905, 0.2303273230791092, 0.018171854317188263, -0.08709719777107239, 0.18228614330291748, -0.071574367582798, -0.012407014146447182, -0.284942090511322, -0.1326635330915451, -0.08634418249130249, 0.11018547415733337, -0.09423547983169556, 0.13697068393230438, -0.03515861555933952, 0.014629656448960304, -0.21159854531288147, 0.15693463385105133, -0.021487032994627953, 0.032396819442510605, 0.028369005769491196, 0.08724819868803024, -0.13204769790172577, 0.4691336452960968, 0.1237262561917305, -0.06020978465676308, 0.24037614464759827, 0.05237792432308197, -0.10641840100288391, 0.1820996105670929, 0.6079273819923401, -0.4903985857963562, 0.40744978189468384, 0.43370547890663147, 0.5092437863349915, 0.0810965895652771, 0.4670366048812866, -0.11692337691783905, -0.013550599105656147, -0.364605575799942, 0.34470170736312866, -0.01755279302597046, 0.30621764063835144, 0.35784396529197693, 0.42736300826072693, 0.022546129301190376, 0.08497388660907745, 0.07601173967123032, 0.0696730837225914, -0.21918217837810516, 0.6236687898635864, -0.0793512761592865, -0.0668395534157753, 0.010559543035924435, 0.46621084213256836, -0.2632196843624115, -0.03991322219371796, 0.1392994076013565, 0.003188274335116148, -0.2655166983604431, -0.22143644094467163, -0.19157607853412628, -0.39395904541015625, -0.021266555413603783, 0.08848410844802856, 0.08152330666780472, 0.013220606371760368, -0.18424198031425476, 0.05234640836715698, 0.05919161066412926, -0.16255362331867218, -0.04549096152186394, 0.044437166303396225, 0.21704396605491638, -0.5149197578430176, -0.6047705411911011, -0.8048356175422668, -0.36901935935020447, -0.19035962224006653, 1.3252514600753784, 0.19824109971523285, 0.07630860805511475, -0.011165079660713673, 0.011559495702385902, 0.10554458945989609, -0.12820255756378174, 0.29352235794067383, -0.06662449240684509, -0.15792854130268097, 0.0345480814576149, -0.04881494492292404, -0.06912268698215485, -0.00013739474525209516, -0.1597173660993576, 0.34323570132255554, 0.34446775913238525, 0.3795395493507385, 0.017453864216804504, 0.1102253794670105, 0.04026523232460022, 0.1107630804181099, 0.10295291990041733, 0.5326219201087952, 0.1749747395515442, -0.2661803066730499, 0.11752097308635712, 0.08010037988424301, -0.5501991510391235, -0.059987347573041916, -0.04125819355249405, 0.16356444358825684, 0.020170046016573906, -0.08306766301393509, 0.17777052521705627, 0.4687126874923706, 0.7723219394683838, 0.7309747934341431, -0.019829215481877327, 0.10945341736078262, 0.06796073168516159, -0.12042505294084549, -0.26762208342552185, 0.10846878588199615, 0.013867417350411415, 0.01077105849981308, -0.10193657130002975, -0.1757654845714569, -0.245382159948349, 0.20442898571491241, 0.10115985572338104, 0.2514199912548065, -0.3793720304965973, -0.6926521062850952, -0.6686761975288391, -0.607191264629364, 0.16187654435634613, -0.0073340232484042645, -0.09948350489139557, -0.21431320905685425, -0.12334707379341125, -0.15290899574756622, -0.026063116267323494, 0.26553207635879517, 0.18921764194965363, -0.1665697544813156, -0.00264778733253479, 0.20274107158184052, 0.3660823404788971, -0.35731416940689087, 0.50246661901474, 0.2781502604484558, 0.1629776805639267, -0.03493829071521759, -0.16012291610240936, -0.08139592409133911, -0.1440155804157257, 0.32721832394599915, 0.1312151998281479, 0.17874418199062347, 0.06143738701939583, -0.05158458650112152, 0.4802771806716919, -0.6857288479804993, 0.08245638012886047, -0.09577414393424988, -0.12872998416423798, 0.16155612468719482, 0.24089869856834412, 0.44030725955963135, -0.30994167923927307, 0.12139064073562622, 0.029418930411338806, -0.051672156900167465, -0.10080718994140625, -0.007311842869967222, -0.15189751982688904, -0.1559375822544098, 0.2731820344924927, -0.03627878054976463, 0.10538394004106522, 0.15048423409461975, 0.12981411814689636, 0.0002639668236952275, 0.05666665732860565, 0.08173252642154694, -0.16131722927093506, -0.043261025100946426, -0.14845971763134003, -0.29335740208625793, 0.039398159831762314, -0.02791670151054859, 0.22897064685821533, -0.12178067117929459, -0.4062419831752777, 0.3934949040412903, -0.05093907564878464, -0.06126153841614723, -0.07318481802940369, -0.08793392032384872, -0.01818496361374855, -0.24753189086914062, -0.30580347776412964, 0.44876909255981445, 0.5379880666732788, 0.11587893962860107, 0.2174995243549347, -0.035063862800598145, -0.0010147193679586053, -0.12281838059425354, -0.21301835775375366, 0.3645245432853699, 0.39920729398727417, -0.45564430952072144, 0.03503882512450218, 0.6949061155319214, -0.5742982625961304, 0.38680514693260193, -0.018345845863223076, 0.04529440030455589, -0.04468340799212456, -0.020917288959026337, 0.2523670792579651, -0.4574699103832245, 0.17178472876548767, -0.12147565186023712, 0.043810319155454636, -0.17998050153255463, -0.09663069248199463, -0.03498067706823349, 0.06111514940857887, -0.11410824209451675, 0.18208050727844238, -0.09109053015708923, 0.08489643037319183, 0.15014725923538208, 0.18506401777267456, -0.060843177139759064, -0.11932594329118729, 0.11290943622589111, -0.23226700723171234, -0.2114422470331192, -0.36001038551330566, -0.29864072799682617, -0.05599717050790787, -0.21294310688972473, -0.1301364004611969, -0.4993196725845337, 0.097460076212883, 0.030209479853510857, 0.35134217143058777, -0.9156147837638855, 0.0173207875341177, -0.9142565131187439, 0.13512593507766724, -0.1926516443490982, -0.2812888026237488, 0.04805266484618187, 0.5790673494338989, -0.28300249576568604, -0.10372477024793625, 0.2964925169944763, 0.16425621509552002, -0.25588271021842957, 0.37744808197021484, -0.07827199995517731, -0.7785226702690125, -0.4873232841491699, -0.0240982286632061, -0.31732890009880066, -0.7271391749382019, -0.40648236870765686, -0.08706668019294739, -0.0876365602016449, -0.08107846975326538, 0.049622420221567154, 0.5049374103546143, -0.09109669923782349, -0.2958216369152069, 0.23400314152240753, 0.0727144181728363, -0.06163109838962555, -0.3235352635383606, -0.08323507010936737, 0.06926267594099045, 0.12505480647087097, 0.06806384027004242, -0.1783592253923416, -0.09036792814731598, 0.007250780239701271, 0.07478834688663483, 0.37752634286880493, 0.10522382706403732, -0.3126020133495331, -0.339804470539093, -0.2922729253768921, -0.04612985998392105, 0.06431944668292999, 0.08483731746673584, 0.12883307039737701, -0.015924949198961258, 0.10468991845846176, -0.3394957184791565, 0.23376204073429108, -0.22720825672149658, 0.005506275221705437, -0.22926953434944153, -0.10148110240697861, 0.06526672840118408, -0.2586720287799835, -0.32853958010673523, 0.3440588712692261, -0.11197478324174881, -0.24647162854671478, 0.32472386956214905, 0.18955329060554504, 0.22783295810222626, 0.27004650235176086, 0.06792190670967102, -0.25404539704322815, -0.0421239472925663, 0.19141103327274323, -0.1919824779033661, 0.024490466341376305, -0.45774775743484497, 0.15080632269382477, -0.21607035398483276, -0.15506379306316376, -0.4421549439430237, -0.3747740089893341, -0.40712970495224, -0.01002188865095377, -0.18514835834503174, -0.052659012377262115, -0.009491002187132835, -0.04560127854347229, 0.5816720724105835, -0.8684999942779541, -0.6074734330177307, -0.6023196578025818, 0.09026342630386353, -0.8521136045455933, -0.677777886390686, -0.7927519083023071, 0.05012498050928116, 0.006620208732783794, 0.09600439667701721, 0.006934305187314749, -0.41822823882102966, 0.5416979193687439, 1.3451576232910156, 0.6131516098976135, -0.1447380781173706, 0.09429032355546951, 0.06888633966445923, 0.09988542646169662, -0.09572823345661163, 0.09141702950000763, 0.05828794091939926, -0.20784544944763184, -0.14200495183467865, 0.014049896970391273, -0.081334687769413, 0.15918458998203278, 0.001768372836522758, 0.009856577031314373, 0.5256384611129761, 0.49961280822753906, 0.5969673991203308, 0.37020817399024963, -0.07463415712118149, -0.0038648881018161774, 0.014317997731268406, 0.07256675511598587, 0.27220791578292847, -0.14287996292114258, -0.18170645833015442, -0.021593274548649788, -0.15909305214881897, 0.3259168863296509, -0.11064229905605316, 0.12034989148378372, 0.36166661977767944, -0.21680544316768646, -0.14505243301391602, -0.24518895149230957, -0.054052721709012985, 0.11477477848529816, 0.10946492105722427, -0.004644579254090786, -0.11873581260442734, 0.00934956781566143, 0.026955196633934975, -0.0947655513882637, -0.0432097427546978, 0.2264525443315506, 0.4585563540458679, -0.2117093950510025, 0.06864829361438751, 0.01817937195301056, -0.09130346775054932, -0.031736359000205994, -0.6623827219009399, 0.07924489676952362, 0.30316102504730225, 0.06474705785512924, 0.12052184343338013, -0.06878554821014404, 0.048135798424482346, 0.14442582428455353, -0.1945008486509323, 0.16308918595314026, 0.13180820643901825, -0.3005691170692444, -0.08318639546632767, -0.0371159091591835, -0.036223117262125015, 0.27411049604415894, -0.008904200047254562, -0.21584218740463257, -0.22458405792713165, -0.2840893864631653, 0.9380438327789307, -0.026274412870407104, -0.03674294427037239, -0.039288733154535294, 0.20259428024291992, -0.2627299726009369, -0.03588804602622986, -0.09061996638774872, 0.0026293552946299314, -1.1599351167678833, -0.0888570249080658, 0.3020864427089691, 0.10419020056724548, -0.2301473766565323, -0.2372182309627533, 0.255910724401474, -0.9108321666717529, -0.17266617715358734, -0.21715109050273895, -0.4768790900707245, 0.02349638193845749, 0.06996935606002808, 0.2306048572063446, -0.2647320032119751, -0.5029106140136719, 0.18124276399612427, 0.05404527485370636, -0.556660532951355, -0.20282964408397675, 0.1787903904914856, -0.13809867203235626, 0.012665750458836555, -0.007909105159342289, -0.11666542291641235, 0.192016139626503, 0.20280246436595917, 0.04091315343976021, 0.21129484474658966, 0.06015581637620926, -0.1396055370569229, 0.11048803478479385, -0.22130873799324036, 0.10175041109323502, 0.15478093922138214, -0.06699641793966293, 0.16655825078487396, -0.5767931938171387, 0.23376262187957764, -0.06561370939016342, 0.08572515100240707, 0.22690269351005554, -0.10714394599199295, 0.2328615039587021, 0.06609856337308884, 0.15064586699008942, 0.1398843675851822, 9.159173350781202e-05, -0.006412057671695948, 0.1231503039598465, 0.2868848741054535, -0.37850138545036316, -0.4390513002872467, -0.10716433078050613, -0.16492293775081635, -0.17774488031864166, -0.006263014394789934, -0.15535981953144073, -0.15121980011463165, -0.022719506174325943, -0.3260766863822937, 0.1365034133195877, 0.7772430777549744, 0.8306354880332947, 0.8039601445198059, 0.16534824669361115, -0.03939266875386238, -0.15611104667186737, 0.21217003464698792, -0.022034769877791405, -0.025939559563994408, 0.1058378517627716, -0.08505864441394806, 0.08503950387239456, -0.0037705348804593086, -0.0026697057764977217, 0.3492349088191986, 0.15157155692577362, -0.3159380555152893, -0.10824967920780182, -0.04872310906648636, 0.19715555012226105, -0.2658633291721344, -0.06968845427036285, 0.009916169568896294, 0.18593478202819824, -0.038871243596076965, -0.3416462540626526, 0.1855567842721939, 0.21629339456558228, -0.10832708328962326, -0.04190235957503319, 0.2388715296983719, -0.11624565720558167, -0.10361404716968536, 0.0536813959479332, 0.12528158724308014, -0.262010782957077, 0.05081893876194954, 0.29551735520362854, 0.05958620831370354, -0.01989975944161415, -0.19261345267295837, 0.01736867055296898, -0.07923264801502228, -0.4404444694519043, 0.3125889301300049, 0.10095971822738647, 0.17173698544502258, 0.23782190680503845, -0.07170403748750687, 0.013639729470014572, 0.19007621705532074, 0.1901141107082367, -0.052342064678668976, -0.9643150568008423, -0.12307217717170715, -0.21010802686214447, -0.5640560984611511, 0.010125457309186459, 0.1314179003238678, 0.10721258819103241, -0.24371789395809174, -0.5925355553627014, 0.49424877762794495, -0.03528435528278351, -0.21386614441871643, 1.4134130477905273, -0.2751445770263672, 0.007012579124420881, -0.023824317380785942, 0.004113825503736734, -0.06332013010978699, 0.286077082157135, 0.04896686226129532, 0.31404414772987366, 0.15028351545333862, 0.003490754636004567, 0.0802399218082428, -0.230818971991539, -0.022719932720065117, 0.26083019375801086, -0.2885863184928894, 0.07537354528903961, 0.12282905727624893, -0.38638314604759216, 0.1752759963274002, -0.07370781153440475, 0.13994526863098145, 0.13313405215740204, 0.2851952016353607, 0.905279278755188, 0.34521353244781494, -0.36453402042388916, 0.46360254287719727, -0.002040385501459241, -0.003476516343653202, -0.19058215618133545, 0.27096763253211975, 0.08722586184740067, 0.03202880546450615, -0.06164764240384102, 0.011678489856421947, 0.21189850568771362, -0.40100231766700745, 0.022941868752241135, 0.0394427627325058, 0.0675845518708229, -0.22503064572811127, 0.14730903506278992, 0.24842065572738647, -0.34360530972480774, 0.21811245381832123, -0.05238509923219681, -0.008763357996940613, -0.1336073875427246, 0.15671105682849884, 0.4475333094596863, -0.5187726616859436, 0.005388418212532997, -0.07889139652252197, 0.10729073733091354, 0.22381159663200378, 0.07434546202421188, -0.0843898206949234, 0.13574494421482086, 0.01853088103234768, -0.41072791814804077, 0.40448933839797974, -0.8231801986694336, -0.4780847728252411, -0.11237931996583939, 0.012673617340624332, -0.04672158136963844, -0.23933981359004974, 0.01667654886841774, -0.14681674540042877, 0.077765092253685, 0.15309257805347443, 0.03254099190235138, -0.015896232798695564, -0.029608771204948425, -0.288953959941864, -0.32651081681251526, 0.06307528167963028, -0.09873636066913605, 0.08938323706388474, 0.27018269896507263, 0.018129458650946617, -0.050469521433115005, -0.17951229214668274, 0.02319747768342495, 0.06737810373306274, 0.2690926194190979, -0.10778623819351196, -0.04740763455629349, 0.30407941341400146, -0.08746829628944397, -0.2184152454137802, 0.14826175570487976, 0.18092381954193115, 0.07989493757486343, -0.1297195851802826],
+                b1: &[-0.5191351175308228, 0.6662623882293701, 0.610133707523346, -1.1585999727249146, 0.6903770565986633, 0.4241520166397095, 0.754120945930481, -0.7599878907203674, -0.3445088267326355, 0.9317805767059326, -0.2041703462600708, 0.17219330370426178, 1.1566059589385986, -0.41121166944503784, -0.6977726817131042, 0.7911778092384338, 0.6611397862434387, -0.6938921213150024, -0.03742314130067825, -0.16022440791130066, 0.11257349699735641, 0.07743008434772491, -0.6286312937736511, 0.544836699962616, -0.15634237229824066, -0.5572881698608398, 0.9681645035743713, -0.7440500855445862, 0.10288882255554199, 0.9043763875961304, 0.14654643833637238, -0.024421239271759987, -0.4609592854976654, 0.917902410030365, 0.2704138457775116, 0.6341348886489868, 0.034945350140333176, 0.5565919876098633, 0.1746397614479065, -0.6341800093650818],
+                w2: &[0.07229708135128021, 0.2507615387439728, 0.16330942511558533, 0.5204483866691589, 0.24313874542713165, -0.5474504232406616, -0.28332123160362244, -0.2225571572780609, -0.1043124571442604, 0.06595291197299957, 0.21239061653614044, -0.14725270867347717, -0.8134568333625793, 0.07381946593523026, -0.24956485629081726, 0.4919748604297638, 0.2962062954902649, 0.3260444402694702, 0.07504145801067352, -0.053836897015571594, 0.2531750500202179, -0.04855559393763542, -0.5578967332839966, -0.5225025415420532, 0.055111128836870193, -0.21510563790798187, 0.5871708989143372, -0.19132649898529053, 0.007392226252704859, -0.298953115940094, 0.16707110404968262, -0.04706822335720062, 0.07302752882242203, -0.08172990381717682, 0.23955324292182922, -0.15824700891971588, -0.3977665305137634, 0.5267415642738342, -0.11258449405431747, -0.3343915045261383, 0.23245088756084442, -0.7491211891174316, -0.6333310604095459, 0.0232061930000782, -0.2315434217453003, -0.3745144307613373, -0.03209906071424484, -0.4041699469089508, 0.041345734149217606, 0.19181972742080688, -0.2760458290576935, -0.07779327034950256, 0.24569696187973022, -0.18802686035633087, -0.6544056534767151, 0.556419849395752, 0.11468080431222916, -0.32528090476989746, 0.38538315892219543, 0.33702555298805237, -0.442532479763031, 0.00750756124034524, -0.45737770199775696, -0.06860284507274628, -0.4411284625530243, -0.23914210498332977, 0.06834587454795837, 0.14571186900138855, 0.6887655258178711, 0.5702284574508667, 0.3135473430156708, -0.3360161781311035, -0.5353860259056091, 0.06292688101530075, 0.735708475112915, 0.7143703103065491, -0.3693147897720337, 0.525284469127655, 0.39448651671409607, -0.09941494464874268, 0.09564384818077087, 0.5881519913673401, 0.05619557946920395, 0.4508857727050781, -0.2834583520889282, -0.16902177035808563, 0.24799591302871704, -0.182522252202034, 0.0468696765601635, 0.14808374643325806, -0.013205822557210922, -0.12705814838409424, 0.0614711195230484, 0.14103399217128754, -0.2599405348300934, 0.028414186090230942, -0.2865449786186218, -0.08163938671350479, 0.13120926916599274, 0.17990124225616455, -0.16350798308849335, -0.09809352457523346, -0.013590727932751179, -0.17736633121967316, 0.05107983574271202, 0.3411618173122406, -0.2772451341152191, 0.32397109270095825, 0.046551186591386795, 0.13246433436870575, 0.05053735896945, 0.24057962000370026, -0.04693610221147537, -0.1650579869747162, 0.1331019252538681, 0.09457181394100189, -0.16547952592372894, -0.09469929337501526, 0.30049434304237366, 0.12664170563220978, -0.013082812540233135, 0.390655517578125, 0.6400918364524841, -0.0010483618825674057, -0.03533017635345459, 0.16345657408237457, 0.05697643384337425, 0.1748565286397934, 0.0036667422391474247, -0.05557025969028473, 0.016822226345539093, -0.12541711330413818, -0.4695605933666229, 0.008447905071079731, 0.16371716558933258, -0.1481284201145172, -0.10916673392057419, 0.1754710078239441, -0.05557332932949066, 0.17406205832958221, 0.03734235838055611, -0.0014076621737331152, 0.16409075260162354, -0.0339696928858757, 0.11525241285562515, 0.11995170265436172, -0.39020177721977234, 0.01936984248459339, -0.14390763640403748, -0.18344464898109436, -0.08675119280815125, 0.19569827616214752, 0.48439380526542664, -0.232485830783844, -0.004231136757880449, 0.15202505886554718, 0.01103641465306282, -0.1192987710237503, -0.17487019300460815, 0.27336806058883667, -0.5894135236740112, -0.03331466019153595, 0.21942859888076782, 0.30420297384262085, 0.2666693329811096, 0.4481956958770752, -0.020630693063139915, 0.8494743704795837, 0.5691520571708679, 0.5711295008659363, 0.00404204148799181, 0.5070351958274841, 0.09074786305427551, 0.15874768793582916, 0.7676622271537781, 0.6556511521339417, 0.1220490038394928, 0.7263025641441345, -0.07173441350460052, 0.14413252472877502, 0.49090006947517395, -0.3324028253555298, 0.45898303389549255, 0.5931536555290222, 0.19021296501159668, -0.7473744750022888, -0.834629476070404, -0.1385311633348465, -0.05174582824110985, 0.018871335312724113, -0.42817312479019165, 0.20682017505168915, 0.016382897272706032, -0.6684255599975586, 0.3525462746620178, -0.42306870222091675, -0.0817568302154541, 0.3572525084018707, -0.23954586684703827, -0.4869120717048645, 0.016070470213890076, 0.5639761686325073, 0.17797298729419708, 0.2919785678386688, -0.3837592601776123, 0.13362792134284973, 0.09925093501806259, 0.12642522156238556, 0.09690988808870316, -0.08732952922582626, 0.24605968594551086, -0.3894798457622528, -0.174991175532341, 0.2573908269405365, 0.22514064610004425, -0.24535547196865082, -0.2993263006210327, 0.24350187182426453, 0.03375721350312233, 0.16244018077850342, -0.16753582656383514, -0.08621060848236084, 0.1272309273481369, 0.007472787983715534, 0.20557984709739685, 0.1578531116247177, -0.5838948488235474, 0.08410368114709854, -0.2831973135471344, -0.28126293420791626, -0.08023717254400253, 0.5180243849754333, 0.2208152413368225, -0.3613019585609436, -0.06204051896929741, -0.13526616990566254, 0.09384715557098389, -0.27185022830963135, -0.05938927084207535, 0.284194678068161, 0.04228530079126358, 0.5006632208824158, 0.6578063368797302, -0.07014274597167969, -0.3233219087123871, -0.01618030108511448, 0.2888641357421875, -0.08185673505067825, -0.17689819633960724, -0.2994365096092224, 0.016244128346443176, 0.02359011210501194, 0.1367129534482956, -0.01653127372264862, -0.09157261997461319, -0.3516620397567749, -0.09030301123857498, -0.07817772775888443, 0.17603041231632233, -0.01393663790076971, -0.029468189924955368, -0.0814921036362648, -0.12077502906322479, -0.10759524255990982, -0.0750858411192894, 0.2511105239391327, -0.20753242075443268, -0.05136517807841301, -0.024205535650253296, -0.3384825587272644, 0.020664114505052567, 0.11200296878814697, 0.08333364874124527, -0.24177855253219604, -0.07010341435670853, 0.020779477432370186, -0.20839253067970276, -0.0016562794335186481, 0.023504814133048058, 0.3570723235607147, -0.30022287368774414, -0.3554439842700958, -0.027536675333976746, -1.1282703876495361, -0.08706718683242798, 0.0742080882191658, 0.18080361187458038, -0.02274167723953724, -0.704075813293457, -0.9722687602043152, 0.1188407614827156, -0.029379399493336678, 0.8019110560417175, -0.34810709953308105, 0.04902748018503189, -0.7494327425956726, 0.5064789056777954, -0.11681736260652542, 0.2257058471441269, -0.4354608356952667, 0.3252757489681244, -0.1591869592666626, -0.5933760404586792, -0.5259361863136292, 0.22252318263053894, 0.30712220072746277, 0.29186123609542847, -0.7899709343910217, 0.3455640971660614, -0.8577526807785034, 0.19282177090644836, 0.29095181822776794, -0.3287593424320221, 0.0454283282160759, -0.5983009338378906, -0.08342050760984421, -0.8976981043815613, 0.10165920853614807, 0.13396088778972626, 0.2290259599685669, 0.02499830722808838, 0.7539560794830322, 0.1477266401052475, 0.3097168207168579, -0.3993585705757141, 0.0817292109131813, 0.038499560207128525, 0.048502497375011444, 0.10572300106287003, -0.17650842666625977, 0.30300378799438477, -0.3586488962173462, -0.09699319303035736, 0.28980425000190735, 0.1152607873082161, -0.30993735790252686, -0.3226162791252136, 0.2082981914281845, 0.08206543326377869, 0.09643732011318207, -0.09098457545042038, -0.09191355854272842, 0.04240717366337776, -0.08706614375114441, 0.3119218051433563, 0.24132680892944336, -0.5137639045715332, 0.03463784605264664, -0.29585450887680054, -0.3583862781524658, -0.09919128566980362, 0.5263358950614929, 0.19875890016555786, -0.4007430374622345, -0.044145308434963226, -0.24342355132102966, 0.16471655666828156, -0.25901785492897034, 0.012997856363654137, 0.3298455476760864, -0.23130790889263153, 0.4484388828277588, 0.35633817315101624, 0.26454973220825195, 0.15214529633522034, -0.12443697452545166, -0.405061811208725, 0.17236965894699097, -0.36522531509399414, -0.074102483689785, 0.09564346820116043, -0.26696014404296875, -0.7053405046463013, -0.4750596880912781, 0.2850874066352844, -0.42413032054901123, 0.3273111581802368, 0.013779409229755402, -0.7248923182487488, -0.49210208654403687, 0.5041399002075195, -0.14308881759643555, 0.629442036151886, -0.8470776677131653, 0.36798736453056335, -0.17092065513134003, 0.5437707304954529, -0.26034078001976013, -0.4502609074115753, 0.2898317873477936, -0.3266198933124542, 0.1681036651134491, 0.6064534783363342, 0.48974573612213135, -0.3461318910121918, -0.36192092299461365, 0.3675844371318817, -0.731248676776886, -0.21227769553661346, -0.4246974289417267, 0.17397946119308472, -0.3643985986709595, 0.205714613199234, 0.629838228225708, 0.10543780773878098, 0.010421440936625004, 0.6487590670585632, -0.685522198677063, 0.010746597312390804, 0.371294766664505, -0.68584144115448, 0.69797283411026, -0.39890381693840027, 0.2957388460636139, 0.10036955028772354, -0.31620606780052185, -0.5876231789588928, -0.5783882737159729, -0.4745366871356964, 0.20689401030540466, -0.2748165428638458, 0.34110450744628906, 0.817054033279419, 0.8686729073524475, -0.6139298677444458, -0.19506172835826874, -0.03448706120252609, 0.635860025882721, -0.38243091106414795, 0.8843176960945129, 0.08922040462493896, -0.8030375242233276, 0.01003911904990673, 0.49227485060691833, 0.02043282799422741, -0.1812848448753357, 0.8425045609474182, -0.18937410414218903, 0.2360723465681076, -0.0486280657351017, 0.1306903064250946, 0.44811540842056274, -0.09772484004497528, 0.3676001727581024, -0.10864408314228058, 0.10239739716053009, 0.26535993814468384, -0.19465096294879913, -0.05268852412700653, 0.013907784596085548, 0.11859709769487381, -0.008244873955845833, -0.12678827345371246, 0.16795198619365692, 0.09826375544071198, -0.13783332705497742, -0.32474759221076965, -0.018496913835406303, -0.12179988622665405, 0.22411927580833435, -0.10514824092388153, 0.038778163492679596, 0.33486974239349365, 0.31644245982170105, 0.05365574359893799, 0.24912847578525543, -0.31889432668685913, 0.24240325391292572, -0.19231560826301575, 0.18558776378631592, -0.022984078153967857, 0.11608095467090607, 0.15418484807014465, -0.14139854907989502, 0.01758008636534214, -0.12027571350336075, 0.2522386610507965, -0.2922046184539795, 0.049236513674259186, 0.19894357025623322, 0.39957553148269653, 0.3346879780292511, 0.3187335133552551, 0.4501717686653137, -0.8946970701217651, 0.18189306557178497, -0.08766483515501022, 0.2782788574695587, 0.3587392270565033, -0.33824455738067627, 0.6033147573471069, -0.6243746876716614, -0.6177958250045776, 0.6629742383956909, 0.4856598377227783, -0.3099081814289093, -0.678487241268158, 0.47894829511642456, -0.03139176964759827, 0.16848357021808624, -0.5739434957504272, -0.16708984971046448, 0.11146949231624603, 0.090438611805439, 0.4812713861465454, 0.5129365921020508, -0.7324693202972412, 0.26365718245506287, -0.4824923276901245, -0.5487518310546875, -0.20128659904003143, 0.5759150385856628, 0.3504473567008972, -0.36605504155158997, -0.4257725477218628, -0.25298258662223816, 0.512897789478302, -0.4181336462497711, -0.516604483127594, 0.37244912981987],
+                b2: &[0.14859354496002197, -0.018167857080698013, -0.3407953083515167, -0.14991576969623566, 0.4018653333187103, -0.2384500652551651, -0.4047893285751343, 0.15702210366725922, -0.3152092695236206, 0.29297566413879395, 0.26403820514678955, -0.2573520541191101, -0.11290331929922104],
+            }
+        }
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/cortex_a72.rs b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a72.rs
new file mode 100644
index 000000000..6819f4da0
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a72.rs
@@ -0,0 +1,4 @@
+use crate::frame::mmm::cost_model::CostModel;
+pub fn models() -> Vec<(&'static str, CostModel<'static>)> {
+    vec![]
+}
diff --git a/vendor/tract-linalg-0.22.1/src/arm64/cortex_a73.rs b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a73.rs
new file mode 100644
index 000000000..6819f4da0
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/arm64/cortex_a73.rs
@@ -0,0 +1,4 @@
+use crate::frame::mmm::cost_model::CostModel;
+pub fn models() -> Vec<(&'static str, CostModel<'static>)> {
+    vec![]
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/block_quant/helpers.rs b/vendor/tract-linalg-0.22.1/src/frame/block_quant/helpers.rs
new file mode 100644
index 000000000..1158880a6
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/block_quant/helpers.rs
@@ -0,0 +1,65 @@
+use byteorder::{ReadBytesExt, WriteBytesExt, LE};
+use std::io::{Cursor, Read, Write};
+use tract_data::internal::*;
+
+pub struct NibbleReader<R> {
+    second_half: Option<i8>,
+    reader: R,
+}
+
+impl<'s> NibbleReader<Cursor<&'s [u8]>> {
+    pub fn for_slice(slice: &'s [u8]) -> Self {
+        NibbleReader::new(Cursor::new(slice))
+    }
+}
+
+impl<R: Read> NibbleReader<R> {
+    pub fn new(reader: R) -> NibbleReader<R> {
+        NibbleReader { reader, second_half: None }
+    }
+
+    pub fn read_f16(&mut self) -> f16 {
+        assert!(self.second_half.is_none());
+        f16::from_bits(self.reader.read_u16::<LE>().unwrap())
+    }
+
+    pub fn read_i4(&mut self) -> i8 {
+        if let Some(second) = self.second_half.take() {
+            second
+        } else {
+            let byte = self.reader.read_u8().unwrap();
+            self.second_half = Some((byte >> 4) as i8);
+            (byte & 0x0F) as i8
+        }
+    }
+}
+
+pub struct NibbleWriter<W> {
+    first_half: Option<i8>,
+    writer: W,
+}
+
+impl<'s> NibbleWriter<Cursor<&'s mut [u8]>> {
+    pub fn for_slice(slice: &'s mut [u8]) -> Self {
+        NibbleWriter::new(Cursor::new(slice))
+    }
+}
+
+impl<W: Write> NibbleWriter<W> {
+    pub fn new(writer: W) -> NibbleWriter<W> {
+        NibbleWriter { writer, first_half: None }
+    }
+
+    pub fn write_f16(&mut self, f: f16) {
+        assert!(self.first_half.is_none());
+        self.writer.write_u16::<LE>(f.to_bits()).unwrap()
+    }
+
+    pub fn write_i4(&mut self, q: i8) {
+        if let Some(first) = self.first_half.take() {
+            self.writer.write_u8(first as u8 | ((q as u8) << 4)).unwrap()
+        } else {
+            self.first_half = Some(q);
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/block_quant/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/block_quant/mod.rs
new file mode 100644
index 000000000..28a8fe456
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/block_quant/mod.rs
@@ -0,0 +1,327 @@
+use downcast_rs::{impl_downcast, Downcast};
+use dyn_clone::{clone_box, DynClone};
+use dyn_hash::DynHash;
+use num_traits::Zero;
+use tract_data::internal::*;
+use tract_data::itertools::Itertools;
+
+use std::alloc::Layout;
+use std::borrow::Cow;
+use std::fmt::{Debug, Display};
+use std::hash::Hash;
+use std::sync::Arc;
+
+mod helpers;
+mod q4_0;
+mod value;
+
+pub use helpers::{NibbleReader, NibbleWriter};
+pub use q4_0::Q4_0;
+pub use value::{BlockQuantFact, BlockQuantValue, PackedBlockQuantFact};
+
+use crate::mmm::{EagerPackedInput, MMMInputFormat};
+use crate::pack::PackedFormat;
+
+use crate::WeightType;
+
+use super::mmm::MMMInputValue;
+
+pub trait BlockQuant: Debug + Display + Send + Sync + DynClone + DynHash + Downcast {
+    fn same_as(&self, other: &dyn BlockQuant) -> bool;
+
+    fn block_len(&self) -> usize;
+
+    fn block_bytes(&self) -> usize;
+
+    fn dequant_block_f32(&self, quant: &[u8], block: &mut [f32]);
+    fn dequant_block_f16(&self, quant: &[u8], block: &mut [f16]);
+    fn quant_block_f16(&self, block: &[f16], quant: &mut [u8]);
+    fn quant_block_f32(&self, block: &[f32], quant: &mut [u8]);
+
+    fn quant_f16(&self, input: &[f16]) -> TractResult<Blob> {
+        unsafe {
+            let blocks = input.len() / self.block_len();
+            let mut quant = Blob::for_layout(
+                Layout::from_size_align(blocks * self.block_bytes(), 128).unwrap(),
+            );
+            for b in 0..blocks {
+                let block = &input[b * self.block_len()..][..self.block_len()];
+                let qblock = &mut quant[b * self.block_bytes()..][..self.block_bytes()];
+                self.quant_block_f16(block, qblock);
+            }
+            Ok(quant)
+        }
+    }
+
+    fn quant_f32(&self, input: &[f32]) -> TractResult<Blob> {
+        unsafe {
+            let blocks = input.len() / self.block_len();
+            let mut quant = Blob::for_layout(
+                Layout::from_size_align(blocks * self.block_bytes(), 128).unwrap(),
+            );
+            for b in 0..blocks {
+                let block = &input[b * self.block_len()..][..self.block_len()];
+                let qblock = &mut quant[b * self.block_bytes()..][..self.block_bytes()];
+                self.quant_block_f32(block, qblock);
+            }
+            Ok(quant)
+        }
+    }
+
+    fn dequant_f32(&self, input: &[u8]) -> TractResult<Tensor> {
+        unsafe {
+            let blocks = input.len() / self.block_bytes();
+            let mut tensor = Tensor::uninitialized::<f32>(&[blocks * self.block_len()])?;
+            let slice = tensor.as_slice_mut::<f32>()?;
+            for b in 0..blocks {
+                let block = &mut slice[b * self.block_len()..][..self.block_len()];
+                let qblock = &input[b * self.block_bytes()..][..self.block_bytes()];
+                self.dequant_block_f32(qblock, block);
+            }
+            Ok(tensor)
+        }
+    }
+
+    fn dequant_f16(&self, input: &[u8]) -> TractResult<Tensor> {
+        unsafe {
+            let blocks = input.len() / self.block_bytes();
+            let mut tensor = Tensor::uninitialized::<f16>(&[blocks * self.block_len()])?;
+            let slice = tensor.as_slice_mut::<f16>()?;
+            for b in 0..blocks {
+                let block = &mut slice[b * self.block_len()..][..self.block_len()];
+                let qblock = &input[b * self.block_bytes()..][..self.block_bytes()];
+                self.dequant_block_f16(qblock, block);
+            }
+            Ok(tensor)
+        }
+    }
+
+    fn extract_at_offset_f16(&self, input: &[u8], offset: usize) -> f16 {
+        let len = self.block_len();
+        let block_id = offset / len;
+        let mut block = vec![f16::zero(); self.block_len()];
+        self.dequant_block_f16(
+            &input[block_id * self.block_bytes()..][..self.block_bytes()],
+            &mut block,
+        );
+        block[offset % len]
+    }
+
+    fn extract_at_offset_f32(&self, input: &[u8], offset: usize) -> f32 {
+        let len = self.block_len();
+        let block_id = offset / len;
+        let mut block = vec![f32::zero(); self.block_len()];
+        self.dequant_block_f32(
+            &input[block_id * self.block_bytes()..][..self.block_bytes()],
+            &mut block,
+        );
+        block[offset % len]
+    }
+
+    fn simulate_precision_loss(
+        &self,
+        mut tensor: Tensor,
+        block_axis: usize,
+    ) -> TractResult<Tensor> {
+        ensure!(block_axis == tensor.rank() - 1);
+        ensure!(tensor.shape()[block_axis] % self.block_len() == 0);
+        let mut scratch = vec![0u8; self.block_bytes()];
+        if tensor.datum_type() == f32::datum_type() {
+            for block in tensor.as_slice_mut::<f32>()?.chunks_mut(self.block_len()) {
+                self.quant_block_f32(block, &mut scratch);
+                self.dequant_block_f32(&scratch, block);
+            }
+            Ok(tensor)
+        } else if tensor.datum_type() == f16::datum_type() {
+            for block in tensor.as_slice_mut::<f16>()?.chunks_mut(self.block_len()) {
+                self.quant_block_f16(block, &mut scratch);
+                self.dequant_block_f16(&scratch, block);
+            }
+            Ok(tensor)
+        } else {
+            todo!()
+        }
+    }
+
+    fn pack(
+        &self,
+        input: &[u8],
+        k: usize,
+        r: usize,
+        zip: usize,
+        scales_at_end: bool,
+    ) -> TractResult<EagerPackedInput>;
+
+    unsafe fn extract_packed_panel(
+        &self,
+        value: &EagerPackedInput,
+        target: &PackedFormat,
+        panel: usize,
+        scratch: *mut u8,
+    ) -> TractResult<()>;
+
+    fn extract_at_mn_f16(
+        &self,
+        value: &EagerPackedInput,
+        mn: usize,
+        target: &mut [f16],
+    ) -> TractResult<()>;
+
+    fn extract_at_mn_f32(
+        &self,
+        value: &EagerPackedInput,
+        mn: usize,
+        target: &mut [f32],
+    ) -> TractResult<()>;
+}
+
+dyn_clone::clone_trait_object!(BlockQuant);
+dyn_hash::hash_trait_object!(BlockQuant);
+impl_downcast!(BlockQuant);
+
+#[allow(clippy::derived_hash_with_manual_eq)]
+#[derive(Clone, Hash)]
+pub struct PackedBlockQuantFormat {
+    pub bq: Box<dyn BlockQuant>,
+    pub r: usize,
+    pub zip: usize,
+    pub scales_at_end: bool,
+}
+
+impl PartialEq for PackedBlockQuantFormat {
+    fn eq(&self, other: &Self) -> bool {
+        self.bq.same_as(&*other.bq)
+            && self.r == other.r
+            && self.zip == other.zip
+            && self.scales_at_end == other.scales_at_end
+    }
+}
+
+impl Eq for PackedBlockQuantFormat {}
+
+impl Display for PackedBlockQuantFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Packed{}[{}]", &*self.bq, self.r)?;
+        if self.zip != 0 {
+            write!(f, "Z{}", self.zip)?;
+        }
+        if self.scales_at_end {
+            write!(f, "Se")?;
+        }
+        Ok(())
+    }
+}
+
+impl Debug for PackedBlockQuantFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        <Self as Display>::fmt(self, f)
+    }
+}
+
+impl PackedBlockQuantFormat {
+    pub fn new(bq: &dyn BlockQuant, r: usize, zip: usize, scales_at_end: bool) -> Self {
+        PackedBlockQuantFormat { bq: clone_box(bq), r, zip, scales_at_end }
+    }
+
+    pub fn simulate_precision_loss(
+        &self,
+        tensor: Tensor,
+        block_axis: usize,
+    ) -> TractResult<Tensor> {
+        self.bq.simulate_precision_loss(tensor, block_axis)
+    }
+
+    pub fn pack(&self, input: &[u8], k: usize) -> TractResult<EagerPackedInput> {
+        self.bq.pack(input, k, self.r, self.zip, self.scales_at_end)
+    }
+}
+
+impl MMMInputFormat for PackedBlockQuantFormat {
+    fn prepare_tensor(&self, t: &Tensor, _k_axis: usize, _mn_axis: usize) -> TractResult<Tensor> {
+        let packed = t
+            .as_slice::<Opaque>()?
+            .iter()
+            .map(|o| {
+                let bqv = o.downcast_ref::<BlockQuantValue>().unwrap();
+                let packed = self.pack(&bqv.value, bqv.fact.k())?;
+                Ok(Opaque(Arc::new(Box::new(packed) as Box<dyn MMMInputValue>)))
+            })
+            .collect::<TractResult<Vec<Opaque>>>()?;
+        tensor1(&packed).into_shape(t.shape())
+    }
+
+    fn prepare_one(
+        &self,
+        t: &Tensor,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        // this code path is essentially there for test scenarios
+        let t = if t.datum_type().is_number() {
+            let k = t.shape()[k_axis];
+            let m = t.shape()[mn_axis];
+            assert!(k % self.bq.block_len() == 0);
+            let t: Cow<Tensor> = if k_axis == 1 && mn_axis == 0 {
+                Cow::Borrowed(t)
+            } else {
+                Cow::Owned(t.clone().move_axis(1, 0)?)
+            };
+            let quant = if t.datum_type() == f32::datum_type() {
+                self.bq.quant_f32(t.as_slice()?)?
+            } else if t.datum_type() == f16::datum_type() {
+                self.bq.quant_f16(t.as_slice()?)?
+            } else {
+                todo!()
+            };
+            Cow::Owned(tensor0(Opaque(Arc::new(BlockQuantValue {
+                value: Arc::new(quant),
+                fact: BlockQuantFact::new(self.bq.clone(), tvec!(m, k)),
+            }))))
+        } else {
+            Cow::Borrowed(t)
+        };
+        ensure!(mn_axis == 0);
+        ensure!(k_axis == 1);
+        let bqv = t.to_scalar::<Opaque>()?.downcast_ref::<BlockQuantValue>().unwrap();
+        let packed = self.pack(&bqv.value, bqv.fact.k())?;
+        Ok(Box::new(packed))
+    }
+
+    fn precursor(&self) -> WeightType {
+        WeightType::BlockQuant(self.bq.clone())
+    }
+
+    fn k_alignment(&self) -> usize {
+        self.bq.block_len()
+    }
+
+    fn r(&self) -> usize {
+        self.r
+    }
+
+    fn mem_size(&self, k: TDim, mn: TDim) -> TDim {
+        k * mn * self.bq.block_bytes() / self.bq.block_len()
+    }
+
+    fn same_as(&self, other: &dyn MMMInputFormat) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|other| self == other)
+    }
+
+    fn extract_at_mn_f16(
+        &self,
+        data: &EagerPackedInput,
+        mn: usize,
+        slice: &mut [f16],
+    ) -> TractResult<()> {
+        self.bq.extract_at_mn_f16(data, mn, slice)
+    }
+
+    fn extract_at_mn_f32(
+        &self,
+        data: &EagerPackedInput,
+        mn: usize,
+        slice: &mut [f32],
+    ) -> TractResult<()> {
+        self.bq.extract_at_mn_f32(data, mn, slice)
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/block_quant/q4_0.rs b/vendor/tract-linalg-0.22.1/src/frame/block_quant/q4_0.rs
new file mode 100644
index 000000000..44d12ca81
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/block_quant/q4_0.rs
@@ -0,0 +1,509 @@
+use crate::mmm::PackedOpaqueFact;
+
+use super::*;
+use num_traits::{AsPrimitive, Float, Zero};
+use std::alloc::Layout;
+
+#[derive(Copy, Clone, Hash, PartialEq, Eq)]
+pub struct BaseQ4_0<const QK: usize = 32>;
+
+pub const Q4_0: BaseQ4_0 = BaseQ4_0::<32>;
+
+impl<const QK: usize> Debug for BaseQ4_0<QK> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if QK == 32 {
+            write!(f, "Q4_0")
+        } else {
+            write!(f, "BaseQ4_0<{QK}>")
+        }
+    }
+}
+
+impl<const QK: usize> BaseQ4_0<QK> {
+    fn quant_block<T>(&self, block: &[T], quant: &mut [u8])
+    where
+        f32: AsPrimitive<T>,
+        T: Debug + Float + AsPrimitive<f16> + AsPrimitive<i8> + 'static,
+    {
+        assert!(quant.len() == self.block_bytes());
+        assert!(block.len() == self.block_len());
+        let mut writer = NibbleWriter::for_slice(quant);
+        let mut amax = T::zero();
+        let mut max = T::zero();
+        for v in block {
+            if amax < v.abs() {
+                amax = v.abs();
+                max = *v;
+            }
+        }
+        let scale: T = max / (-8f32).as_();
+        let r_scale = if scale.is_zero() { T::zero() } else { scale.recip() };
+        writer.write_f16(scale.as_());
+
+        for idx in 0..block.len() {
+            // Quant block in GGML nibble order
+            let ggml_idx = (block.len() / 2) * (idx % 2) + (idx / 2);
+            let i: i8 = (block[ggml_idx] * r_scale + (8.5f32).as_()).as_();
+            writer.write_i4(i.min(15));
+        }
+    }
+
+    fn dequant_block<T: Float + 'static>(&self, quant: &[u8], block: &mut [T])
+    where
+        f16: AsPrimitive<T>,
+        i8: AsPrimitive<T>,
+    {
+        assert!(quant.len() == self.block_bytes());
+        assert!(block.len() == self.block_len());
+        let mut nibbles = NibbleReader::for_slice(quant);
+        let d: T = nibbles.read_f16().as_();
+        for idx in 0..block.len() {
+            let ggml_idx = (block.len() / 2) * (idx % 2) + (idx / 2);
+            block[ggml_idx] = (nibbles.read_i4() - 8).as_() * d;
+        }
+    }
+
+    unsafe fn extract_panel_t<T: Float + Debug + 'static>(
+        &self,
+        value: &EagerPackedInput,
+        target: &PackedFormat,
+        panel: usize,
+        scratch: *mut u8,
+    ) -> TractResult<()>
+    where
+        f16: AsPrimitive<T>,
+        i8: AsPrimitive<T>,
+    {
+        let pbqf: &PackedBlockQuantFormat =
+            value.fact.format.downcast_ref().with_context(|| {
+                format!("Expecing PackedBlockQuantFormat, found {:?}", value.fact.format)
+            })?;
+        ensure!(pbqf.r == target.r);
+        ensure!(value.fact.k % self.block_len() == 0);
+        ensure!(pbqf.bq.same_as(self));
+        let scratch =
+            unsafe { std::slice::from_raw_parts_mut(scratch as *mut T, value.fact.k * target.r) };
+        let blocks_for_k = value.fact.k / self.block_len();
+        let row_bytes = blocks_for_k * self.block_bytes();
+        let input = &value.packed[panel * target.r * row_bytes..];
+        let mut scales = vec![T::zero(); target.r];
+        let mut scratch = scratch.iter_mut();
+        let zipped_order = zipped_order(pbqf.r, pbqf.zip);
+        let mut weights = vec![0i8; pbqf.r];
+        let panel_block_bytes = target.r * self.block_bytes();
+        let (scale_offset, weights_offset) = if pbqf.scales_at_end {
+            (panel_block_bytes - target.r * f16::datum_type().size_of(), 0)
+        } else {
+            (0, target.r * f16::datum_type().size_of())
+        };
+        for block in 0..blocks_for_k {
+            let block = &input[block * panel_block_bytes..][..panel_block_bytes];
+            let mut s_reader = NibbleReader::for_slice(&block[scale_offset..]);
+            let mut w_reader = NibbleReader::for_slice(&block[weights_offset..]);
+            for s in &mut scales {
+                *s = s_reader.read_f16().as_();
+            }
+            for _ in 0..self.block_len() {
+                for &o in &zipped_order {
+                    weights[o] = w_reader.read_i4();
+                }
+                for (w, s) in weights.iter().zip(scales.iter()) {
+                    *scratch.next().unwrap() = *s * (*w - 8).as_();
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn extract_at_mn_t<T: Float + Debug + 'static>(
+        &self,
+        value: &EagerPackedInput,
+        mn: usize,
+        target: &mut [T],
+    ) -> TractResult<()>
+    where
+        f16: AsPrimitive<T>,
+        i8: AsPrimitive<T>,
+    {
+        let pbqf: &PackedBlockQuantFormat =
+            value.fact.format.downcast_ref().with_context(|| {
+                format!("Expecing PackedBlockQuantFormat, found {:?}", value.fact.format)
+            })?;
+        ensure!(value.fact.k % self.block_len() == 0);
+        ensure!(pbqf.bq.same_as(self));
+        ensure!(value.fact.mn.to_usize().ok().map(|it| mn < it).unwrap_or(true));
+        ensure!(value.fact.k == target.len());
+        let blocks_for_k = value.fact.k / self.block_len();
+        let row_bytes = blocks_for_k * self.block_bytes();
+        let panel = mn / pbqf.r;
+        let value = &value.packed[panel * pbqf.r * row_bytes..];
+        let mut target = target.iter_mut();
+        let zipped_order =
+            zipped_order(pbqf.r, pbqf.zip).iter().position(|x| *x == mn % pbqf.r).unwrap();
+
+        let panel_block_bytes = pbqf.r * self.block_bytes();
+        let (scale_offset, weights_offset) = if pbqf.scales_at_end {
+            (panel_block_bytes - pbqf.r * f16::datum_type().size_of(), 0)
+        } else {
+            (0, pbqf.r * f16::datum_type().size_of())
+        };
+        unsafe {
+            for block in 0..blocks_for_k {
+                let block = value.as_ptr().add(block * panel_block_bytes);
+                let scale = *((block.add(scale_offset) as *const f16).add(mn % pbqf.r));
+                let scale: T = scale.as_();
+                for i in 0..self.block_len() {
+                    let byte = *block.add(weights_offset + i * pbqf.r / 2 + zipped_order / 2);
+                    let nib = if zipped_order % 2 == 0 { byte & 0x0F } else { byte >> 4 };
+                    *target.next().unwrap() = scale * ((nib as i8) - 8).as_();
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+fn zipped_order(r: usize, zip: usize) -> Vec<usize> {
+    if zip == 0 {
+        (0..r).collect_vec()
+    } else {
+        (0..r)
+            .map(|i| {
+                let vec_pair_ix = i / (2 * zip);
+                let lane = (i % (2 * zip)) / 2;
+                let side = i % 2;
+                vec_pair_ix * 2 * zip + side * zip + lane
+            })
+            .collect_vec()
+    }
+}
+
+impl<const QK: usize> BlockQuant for BaseQ4_0<QK> {
+    fn same_as(&self, other: &dyn BlockQuant) -> bool {
+        other.downcast_ref::<Self>().map(|other| other == self).unwrap_or(false)
+    }
+
+    fn block_len(&self) -> usize {
+        QK
+    }
+
+    fn block_bytes(&self) -> usize {
+        2 + self.block_len() / 2
+    }
+
+    fn quant_block_f32(&self, block: &[f32], quant: &mut [u8]) {
+        self.quant_block(block, quant)
+    }
+
+    fn quant_block_f16(&self, block: &[f16], quant: &mut [u8]) {
+        self.quant_block(block, quant)
+    }
+
+    fn dequant_block_f32(&self, quant: &[u8], block: &mut [f32]) {
+        self.dequant_block(quant, block)
+    }
+
+    fn dequant_block_f16(&self, quant: &[u8], block: &mut [f16]) {
+        self.dequant_block(quant, block)
+    }
+
+    // s0_0 n0_0 n0_1 n0_2 n0_3 ... n0_30n0_31 s0_32 n0_32n0_33 ...
+    // s1_0 n1_0 n1_1 n1_2 n1_3 ... n1_30n1_31 s1_32 n1_32n1_33 ...
+    //
+    //  becomes (with r=4)
+    //
+    //  s0_0 S1_0 S2_0 s3_0  n0_0 n1_0 n2_0 n3_0  n0_1 n1_1 n2_1 n3_1 ... n0_33 n1_33 n2_33 n3_33
+    //  s0_32 S1_32 S2_32 s3_32  n0_0 n1_0 n2_0 n3_0  n0_1 n1_1 n2_1 n3_1 ... n0_33 n1_33 n2_33 n3_33
+    //  ...
+    fn pack(
+        &self,
+        input: &[u8],
+        k: usize,
+        r: usize,
+        zip: usize,
+        scales_at_end: bool,
+    ) -> TractResult<EagerPackedInput> {
+        ensure!(input.len() % self.block_bytes() == 0);
+        ensure!(k % self.block_len() == 0);
+        // ensure!(input.len() == k * r / self.block_len() * self.block_bytes());
+        ensure!(zip < r);
+        let m = if input.len() == 0 {
+            0
+        } else {
+            input.len() / self.block_bytes() * self.block_len() / k
+        };
+        let panels = m.divceil(r);
+        let blocks_for_k = k / self.block_len();
+        let row_bytes = blocks_for_k * self.block_bytes();
+        let panel_bytes = row_bytes * r;
+        let mut blob =
+            unsafe { Blob::for_layout(Layout::from_size_align(panel_bytes * panels, 128)?) };
+        let mut writer = NibbleWriter::for_slice(&mut blob);
+        let order = zipped_order(r, zip);
+        let mut scales = vec![f16::zero(); r];
+        for p in 0..panels {
+            let input = &input[(r * p) * row_bytes..];
+            let mut readers = (0..r)
+                .map(|r| {
+                    // manage partial panel
+                    let offset = if r * row_bytes < input.len() { r * row_bytes } else { 0 };
+                    NibbleReader::for_slice(&input[offset..])
+                })
+                .collect_vec();
+            let mut temp_nibbles = vec![vec![0i8; self.block_len()]; r];
+            for _ in 0..blocks_for_k {
+                for (row, reader) in readers.iter_mut().enumerate() {
+                    scales[row] = reader.read_f16();
+                    temp_nibbles[row] =
+                        (0..self.block_len()).map(|_| reader.read_i4()).collect_vec();
+                }
+                if !scales_at_end {
+                    scales.iter().for_each(|s| writer.write_f16(*s))
+                }
+                for pos in 0..self.block_len() {
+                    for &row in &order {
+                        let ggml_idx = pos / (self.block_len() / 2) + (2 * pos) % self.block_len();
+                        let nib = temp_nibbles[row][ggml_idx];
+                        writer.write_i4(nib);
+                    }
+                }
+                if scales_at_end {
+                    scales.iter().for_each(|s| writer.write_f16(*s))
+                }
+            }
+        }
+        Ok(EagerPackedInput {
+            fact: PackedOpaqueFact {
+                format: Box::new(PackedBlockQuantFormat {
+                    bq: Box::new(*self),
+                    r,
+                    zip,
+                    scales_at_end,
+                }),
+                mn: m.to_dim(),
+                k,
+            },
+            packed: blob.into(),
+            panel_bytes,
+            mn: m,
+        })
+    }
+
+    unsafe fn extract_packed_panel(
+        &self,
+        value: &EagerPackedInput,
+        target: &PackedFormat,
+        panel: usize,
+        scratch: *mut u8,
+    ) -> TractResult<()> {
+        unsafe {
+            dispatch_floatlike!(Self::extract_panel_t(target.dt)(
+                self, value, target, panel, scratch
+            ))
+        }
+    }
+
+    fn extract_at_mn_f16(
+        &self,
+        value: &EagerPackedInput,
+        mn: usize,
+        target: &mut [f16],
+    ) -> TractResult<()> {
+        self.extract_at_mn_t(value, mn, target)
+    }
+
+    fn extract_at_mn_f32(
+        &self,
+        value: &EagerPackedInput,
+        mn: usize,
+        target: &mut [f32],
+    ) -> TractResult<()> {
+        self.extract_at_mn_t(value, mn, target)
+    }
+}
+
+impl<const QK: usize> Display for BaseQ4_0<QK> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Q4_0")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use num_traits::Zero;
+    use tract_data::internal::tract_ndarray::Array2;
+
+    use crate::pack::PackedFormat;
+
+    use super::*;
+
+    fn test_loop_f32(b: impl BlockQuant, data: &[f32]) {
+        let mut input = data.to_vec();
+        while input.len() % b.block_len() != 0 {
+            input.push(0f32);
+        }
+        let quant = b.quant_f32(&input).unwrap();
+        let result = b.dequant_f32(&quant).unwrap();
+        let view = &result.as_slice::<f32>().unwrap()[..data.len()];
+        assert_eq!(data, view);
+    }
+
+    fn test_loop_f16(b: impl BlockQuant, data: &[f32]) {
+        let mut input = data.iter().map(|f| f16::from_f32(*f)).collect_vec();
+        while input.len() % b.block_len() != 0 {
+            input.push(f16::zero());
+        }
+        let quant = b.quant_f16(&input).unwrap();
+        let result = b.dequant_f16(&quant).unwrap();
+        let view = &result.as_slice::<f16>().unwrap();
+        assert_eq!(&input, view);
+    }
+
+    #[test]
+    fn loop_q4f32_pos() {
+        test_loop_f32(Q4_0, &[1.0, 2.0, 3.0, 4.0]);
+    }
+
+    #[test]
+    fn loop_q4f16_pos() {
+        test_loop_f16(Q4_0, &[1.0, 2.0, 3.0, 4.0]);
+    }
+
+    #[test]
+    fn loop_q4f32_neg() {
+        test_loop_f32(Q4_0, &[-1.0, -2.0, -3.0, -4.0]);
+    }
+
+    #[test]
+    fn loop_q4f16_beg() {
+        test_loop_f16(Q4_0, &[-1.0, -2.0, -3.0, -4.0]);
+    }
+
+    #[test]
+    fn loop_q4_big_pos() {
+        test_loop_f32(Q4_0, &[1234.0]);
+        test_loop_f16(Q4_0, &[-1.0, -2.0, -3.0, -4.0]);
+    }
+
+    #[test]
+    fn loop_q4_big_neg() {
+        test_loop_f32(Q4_0, &[-1234.0]);
+        test_loop_f16(Q4_0, &[-1234.0]);
+    }
+
+    fn test_extract_f32(b: impl BlockQuant, data: &[f32]) {
+        let mut input = data.to_vec();
+        while input.len() % b.block_len() != 0 {
+            input.push(0f32);
+        }
+        let quant = b.quant_f32(&input).unwrap();
+        for (ix, v) in data.iter().enumerate() {
+            assert_eq!(b.extract_at_offset_f32(&quant, ix).round(), *v);
+        }
+    }
+
+    #[test]
+    fn extract_q40f32_pos() {
+        let data = (1..).map(|i| ((i % 14) - 6) as f32).take(5 * Q4_0.block_len()).collect_vec();
+        test_extract_f32(Q4_0, &data);
+    }
+
+    fn test_pack_then_extract_panel(
+        q: impl BlockQuant,
+        k: usize,
+        m: usize,
+        r: usize,
+        zip: usize,
+        scales_at_end: bool,
+    ) -> TractResult<()> {
+        let weights_orig =
+            Array2::from_shape_fn((m, k), |(m, k)| ((m * 31 + k * 17) % 20) as f32 - 10.)
+                .into_tensor();
+        let weights_f32 =
+            q.dequant_f32(&q.quant_f32(weights_orig.as_slice::<f32>()?)?)?.into_shape(&[m, k])?;
+        let packer = PackedFormat::new(f32::datum_type(), r, 128);
+        let packed_f32 = packer.pack_tensor(&weights_f32, 1, 0)?;
+
+        let q4 = q.quant_f32(weights_f32.as_slice::<f32>()?)?;
+        let packed_q4 = q.pack(&q4, k, r, zip, scales_at_end)?;
+
+        for panel in 0..packed_f32.panels_count() {
+            unsafe {
+                let panel_f32 = packed_f32.panel_bytes(panel, None)?;
+                let panel_f32 = std::slice::from_raw_parts(panel_f32 as *const f32, k * r);
+                let mut panel_q4 = Tensor::zero::<f32>(&[k * r])?;
+                q.extract_packed_panel(
+                    &packed_q4,
+                    &packer,
+                    panel,
+                    panel_q4.as_bytes_mut().as_mut_ptr(),
+                )?;
+                assert_eq!(panel_q4.as_slice::<f32>()?, panel_f32);
+            }
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn pack_then_extract_panel() -> TractResult<()> {
+        test_pack_then_extract_panel(BaseQ4_0::<2>, 4, 4, 2, 0, false)
+    }
+
+    #[test]
+    fn pack_then_extract_panel_with_zip() -> TractResult<()> {
+        test_pack_then_extract_panel(BaseQ4_0::<2>, 2, 8, 8, 4, false)
+    }
+
+    #[test]
+    fn pack_then_extract_panel_with_scales_at_end() -> TractResult<()> {
+        test_pack_then_extract_panel(BaseQ4_0::<2>, 2, 4, 4, 0, true)
+    }
+
+    fn test_pack_then_extract_row(
+        q: impl BlockQuant,
+        k: usize,
+        m: usize,
+        r: usize,
+        zip: usize,
+        scales_at_end: bool,
+    ) -> TractResult<()> {
+        let weights_orig =
+            Array2::from_shape_fn((m, k), |(m, k)| ((m * 31 + k * 17) % 20) as f32 - 10.)
+                .into_tensor();
+        let weights_f32 =
+            q.dequant_f32(&q.quant_f32(weights_orig.as_slice::<f32>()?)?)?.into_shape(&[m, k])?;
+        let packer = PackedFormat::new(f32::datum_type(), r, 128);
+        let packed_f32 = packer.pack_tensor(&weights_f32, 1, 0)?;
+
+        let q4 = q.quant_f32(weights_f32.as_slice::<f32>()?)?;
+        let packed_q4 = q.pack(&q4, k, r, zip, scales_at_end)?;
+
+        for row in 0..packed_f32.mn() {
+            unsafe {
+                let panel_f32 = packed_f32.panel_bytes(row / r, None)?;
+                let panel_f32 = std::slice::from_raw_parts(panel_f32 as *const f32, k * r);
+                let row_f32 = (0..k).map(|ix| panel_f32[row % r + r * ix]).collect_vec();
+
+                let mut q4 = vec![0f32; k];
+                q.extract_at_mn_f32(&packed_q4, row, &mut q4)?;
+                assert_eq!(q4, row_f32);
+            }
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn pack_then_extract_row() -> TractResult<()> {
+        test_pack_then_extract_row(BaseQ4_0::<2>, 4, 4, 2, 0, false)
+    }
+
+    #[test]
+    fn pack_then_extract_row_with_zip() -> TractResult<()> {
+        test_pack_then_extract_row(BaseQ4_0::<2>, 2, 8, 8, 4, false)
+    }
+
+    #[test]
+    fn pack_then_extract_row_with_scales_at_end() -> TractResult<()> {
+        test_pack_then_extract_row(BaseQ4_0::<2>, 2, 4, 4, 0, true)
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/block_quant/value.rs b/vendor/tract-linalg-0.22.1/src/frame/block_quant/value.rs
new file mode 100644
index 000000000..3b564fa1a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/block_quant/value.rs
@@ -0,0 +1,116 @@
+use std::ops::Range;
+use std::sync::Arc;
+
+use super::{BlockQuant, PackedBlockQuantFormat};
+use tract_data::internal::*;
+use tract_data::TVec;
+
+#[allow(clippy::derived_hash_with_manual_eq)]
+#[derive(Clone, Hash)]
+pub struct BlockQuantFact {
+    pub format: Box<dyn BlockQuant>,
+    shape: TVec<usize>,
+}
+impl BlockQuantFact {
+    pub fn new(format: Box<dyn BlockQuant>, shape: TVec<usize>) -> Self {
+        Self { format, shape }
+    }
+
+    pub fn m(&self) -> usize {
+        self.shape[0]
+    }
+
+    pub fn k(&self) -> usize {
+        self.shape.iter().skip(1).product()
+    }
+
+    pub fn shape(&self) -> &[usize] {
+        &self.shape
+    }
+}
+
+impl std::fmt::Debug for BlockQuantFact {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}({:?})", self.format, self.shape)
+    }
+}
+
+impl OpaqueFact for BlockQuantFact {
+    fn mem_size(&self) -> TDim {
+        (self.shape.iter().product::<usize>() / self.format.block_len() * self.format.block_bytes())
+            .to_dim()
+    }
+
+    fn same_as(&self, other: &dyn OpaqueFact) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|o| o == self)
+    }
+}
+
+impl PartialEq for BlockQuantFact {
+    fn eq(&self, other: &Self) -> bool {
+        self.format.same_as(&*other.format) && self.shape == other.shape
+    }
+}
+
+#[derive(Clone, Hash)]
+pub struct BlockQuantValue {
+    pub fact: BlockQuantFact,
+    pub value: Arc<Blob>,
+}
+
+impl BlockQuantValue {
+    pub fn split_rows(&self, range: Range<usize>) -> TractResult<BlockQuantValue> {
+        let row_bytes =
+            self.fact.k() / self.fact.format.block_len() * self.fact.format.block_bytes();
+        let mut value =
+            unsafe { Blob::new_for_size_and_align(range.len() * row_bytes, vector_size()) };
+        value.copy_from_slice(&self.value[range.start * row_bytes..][..range.len() * row_bytes]);
+        let mut shape = self.fact.shape.clone();
+        shape[0] = range.len();
+        Ok(BlockQuantValue {
+            fact: BlockQuantFact { format: self.fact.format.clone(), shape },
+            value: Arc::new(value),
+        })
+    }
+}
+
+impl OpaquePayload for BlockQuantValue {
+    fn same_as(&self, other: &dyn OpaquePayload) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|o| o.fact == self.fact && o.value == self.value)
+    }
+}
+
+impl std::fmt::Debug for BlockQuantValue {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?} {:?}", self.fact, self.value)
+    }
+}
+
+impl std::fmt::Display for BlockQuantValue {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+#[derive(Clone, Hash, PartialEq)]
+pub struct PackedBlockQuantFact {
+    pub format: PackedBlockQuantFormat,
+    pub shape: TVec<usize>,
+}
+
+impl std::fmt::Debug for PackedBlockQuantFact {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}({:?})", self.format, self.shape)
+    }
+}
+
+impl OpaqueFact for PackedBlockQuantFact {
+    fn mem_size(&self) -> TDim {
+        (self.shape.iter().product::<usize>() / self.format.bq.block_len()
+            * self.format.bq.block_bytes())
+        .to_dim()
+    }
+    fn same_as(&self, other: &dyn OpaqueFact) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|o| o == self)
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/frame/by_scalar.rs
new file mode 100644
index 000000000..0405043e8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/by_scalar.rs
@@ -0,0 +1,96 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use crate::element_wise::{ElementWise, ElementWiseKer};
+use crate::element_wise_helper::map_slice_with_alignment;
+use crate::{LADatum, LinalgFn};
+use tract_data::internal::*;
+
+/// Generic implementation struct that unify all by scalar kernels.
+/// A by scalar operation is an ElementWise operation with a scalar paramerer.
+#[derive(Debug, Clone, new)]
+pub struct ByScalarImpl<K, T>
+where
+    T: LADatum,
+    K: ByScalarKer<T> + Clone,
+{
+    phantom: PhantomData<(K, T)>,
+}
+
+impl<K, T> ElementWise<T, T> for ByScalarImpl<K, T>
+where
+    T: LADatum,
+    K: ByScalarKer<T> + Clone,
+{
+    fn name(&self) -> &'static str {
+        K::name()
+    }
+    fn run_with_params(&self, vec: &mut [T], params: T) -> TractResult<()> {
+        map_slice_with_alignment(vec, |data| K::run(data, params), K::nr(), K::alignment_bytes())
+    }
+}
+
+pub trait ByScalarKer<T>: ElementWiseKer<T, T>
+where
+    T: LADatum,
+{
+    fn bin() -> Box<LinalgFn> {
+        Box::new(|a: &mut TensorView, b: &TensorView| {
+            let a_slice = a.as_slice_mut()?;
+            let b = b.as_slice()?[0];
+            (Self::ew()).run_with_params(a_slice, b)
+        })
+    }
+}
+
+macro_rules! by_scalar_impl_wrap {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty, $run: item) => {
+        paste! {
+            ew_impl_wrap!($ti, $func, $nr, $alignment_items, $ti, $run);
+
+            impl crate::frame::by_scalar::ByScalarKer<$ti> for $func {}
+        }
+    };
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::frame::element_wise::ElementWiseKer;
+    use crate::LADatum;
+    use num_traits::{AsPrimitive, Float};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! by_scalar_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty, $func:expr) => {
+            pastey::paste! {
+                proptest::proptest! {
+                    #[test]
+                    fn [<prop_ $ker:snake>](xs in proptest::collection::vec(-25f32..25.0, 0..100), scalar in -25f32..25f32) {
+                        if $cond {
+                            $crate::frame::by_scalar::test::test_by_scalar::<$ker, $t>(&*xs, scalar, $func).unwrap()
+                        }
+                    }
+                }
+            }
+        };
+    }
+
+    pub fn test_by_scalar<K: ElementWiseKer<T, T>, T: LADatum + Float>(
+        values: &[f32],
+        scalar: f32,
+        func: impl Fn(T, T) -> T,
+    ) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        crate::setup_test_logger();
+        let values: Vec<T> = values.iter().copied().map(|x| x.as_()).collect();
+        crate::frame::element_wise::test::test_element_wise_params::<K, T, _, T>(
+            &values,
+            |a| (func)(a, scalar.as_()),
+            scalar.as_(),
+        )
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/element_wise.rs b/vendor/tract-linalg-0.22.1/src/frame/element_wise.rs
new file mode 100644
index 000000000..824ec36c1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/element_wise.rs
@@ -0,0 +1,165 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use tract_data::TractResult;
+
+use crate::LADatum;
+
+use super::element_wise_helper::map_slice_with_alignment;
+
+macro_rules! ew_impl_wrap {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty, $run: item) => {
+        paste! {
+            #[derive(Copy, Clone, Debug)]
+            #[allow(non_camel_case_types)]
+            pub struct $func;
+
+            impl crate::frame::element_wise::ElementWiseKer<$ti, $params> for $func {
+                #[inline(always)]
+                fn name() -> &'static str {
+                    stringify!($func)
+                }
+                #[inline(always)]
+                fn nr() -> usize {
+                    $nr
+                }
+                #[inline(always)]
+                fn alignment_items() -> usize {
+                    $alignment_items
+                }
+                $run
+            }
+        }
+    };
+}
+
+macro_rules! ew_impl {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr) => {
+        paste! {
+            mod [<sys_ $func>] {
+                #[allow(unused_imports)]
+                use tract_data::prelude::f16;
+                extern_kernel!(fn $func(ptr: *mut $ti, count: usize) -> ());
+            }
+            ew_impl_wrap!($ti, $func, $nr, $alignment_items, (),
+                #[inline(never)]
+                fn run(buf: &mut [$ti], _params: ()) {
+                    unsafe { [<sys_ $func>]::$func(buf.as_mut_ptr(), buf.len()) }
+                }
+            );
+        }
+    };
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty) => {
+        paste! {
+            mod [<sys_ $func>] {
+                #[allow(unused_imports)]
+                use tract_data::prelude::f16;
+                extern_kernel!(fn $func(ptr: *mut $ti, count: usize, params: $params) -> ());
+            }
+            ew_impl_wrap!($ti, $func, $nr, $alignment_items, $params,
+                #[inline(never)]
+                fn run(buf: &mut [$ti], params: $params) {
+                    unsafe { [<sys_ $func>]::$func(buf.as_mut_ptr(), buf.len(), params) }
+                }
+            );
+        }
+    };
+}
+
+pub trait ElementWise<T, Params = ()>: Send + Sync + Debug + dyn_clone::DynClone
+where
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    T: Copy + Debug + PartialEq + Send + Sync,
+{
+    fn name(&self) -> &'static str;
+    fn run(&self, vec: &mut [T]) -> TractResult<()> {
+        self.run_with_params(vec, Params::default())
+    }
+    fn run_with_params(&self, vec: &mut [T], params: Params) -> TractResult<()>;
+}
+
+dyn_clone::clone_trait_object!(<T, Params> ElementWise<T, Params> where T: Copy, Params: Copy);
+
+#[derive(Debug, Clone, new)]
+pub struct ElementWiseImpl<K, T, Params = ()>
+where
+    T: LADatum,
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    K: ElementWiseKer<T, Params> + Clone,
+{
+    phantom: PhantomData<(K, T, Params)>,
+}
+
+impl<K, T, Params> ElementWise<T, Params> for ElementWiseImpl<K, T, Params>
+where
+    T: LADatum,
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    K: ElementWiseKer<T, Params> + Clone,
+{
+    fn name(&self) -> &'static str {
+        K::name()
+    }
+    fn run_with_params(&self, vec: &mut [T], params: Params) -> TractResult<()> {
+        map_slice_with_alignment(vec, |data| K::run(data, params), K::nr(), K::alignment_bytes())
+    }
+}
+
+pub trait ElementWiseKer<T, Params = ()>:
+    Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static
+where
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    T: LADatum,
+{
+    fn name() -> &'static str;
+    fn alignment_bytes() -> usize {
+        Self::alignment_items() * T::datum_type().size_of()
+    }
+    fn alignment_items() -> usize;
+    fn nr() -> usize;
+    fn run(vec: &mut [T], params: Params);
+    fn ew() -> Box<dyn ElementWise<T, Params>> {
+        Box::new(ElementWiseImpl::<Self, T, Params>::new())
+    }
+}
+
+#[cfg(test)]
+pub mod test {
+    use crate::{frame::element_wise::*, LADatum};
+    use proptest::test_runner::{TestCaseError, TestCaseResult};
+    use tract_data::internal::*;
+
+    pub fn test_element_wise<K: ElementWiseKer<T, ()>, T: LADatum, F: Fn(T) -> T>(
+        values: &[T],
+        reference: F,
+    ) -> TestCaseResult {
+        test_element_wise_params::<K, T, F, ()>(values, reference, ())
+    }
+
+    pub fn test_element_wise_params<
+        K: ElementWiseKer<T, Params>,
+        T: LADatum,
+        F: Fn(T) -> T,
+        Params,
+    >(
+        values: &[T],
+        reference: F,
+        params: Params,
+    ) -> TestCaseResult
+    where
+        Params: Copy + Send + Sync + Debug + 'static + Default,
+    {
+        crate::setup_test_logger();
+        let op = ElementWiseImpl::<K, T, Params>::new();
+        let mut values = values.to_vec();
+        while values.len() < K::nr() {
+            values.push(T::zero());
+        }
+        let expected = values.iter().copied().map(reference).collect::<Vec<_>>();
+        let mut found = values;
+        op.run_with_params(&mut found, params).unwrap();
+        tensor1(&found)
+            .close_enough(&tensor1(&expected), true)
+            .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?;
+        Ok(())
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/element_wise_helper.rs b/vendor/tract-linalg-0.22.1/src/frame/element_wise_helper.rs
new file mode 100644
index 000000000..f4b308464
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/element_wise_helper.rs
@@ -0,0 +1,169 @@
+use crate::LADatum;
+use std::alloc::*;
+use tract_data::TractResult;
+
+pub(crate) fn map_slice_with_alignment<T>(
+    vec: &mut [T],
+    f: impl Fn(&mut [T]),
+    nr: usize,
+    alignment_bytes: usize,
+) -> TractResult<()>
+where
+    T: LADatum,
+{
+    if vec.is_empty() {
+        return Ok(());
+    }
+    unsafe {
+        TMP.with(|buffer| {
+            let mut buffer = buffer.borrow_mut();
+            buffer.ensure(nr * T::datum_type().size_of(), alignment_bytes);
+            let tmp = std::slice::from_raw_parts_mut(buffer.buffer as *mut T, nr);
+            let mut compute_via_temp_buffer = |slice: &mut [T]| {
+                tmp[..slice.len()].copy_from_slice(slice);
+                f(tmp);
+                slice.copy_from_slice(&tmp[..slice.len()])
+            };
+            let prefix_len = vec.as_ptr().align_offset(alignment_bytes).min(vec.len());
+            if prefix_len > 0 {
+                compute_via_temp_buffer(&mut vec[..prefix_len]);
+            }
+            let aligned_len = (vec.len() - prefix_len) / nr * nr;
+            if aligned_len > 0 {
+                f(&mut vec[prefix_len..][..aligned_len]);
+            }
+            if prefix_len + aligned_len < vec.len() {
+                compute_via_temp_buffer(&mut vec[prefix_len + aligned_len..]);
+            }
+        })
+    }
+    Ok(())
+}
+
+pub(crate) fn reduce_slice_with_alignment<T>(
+    vec: &[T],
+    f: impl Fn(&[T]) -> T,
+    nr: usize,
+    alignment_bytes: usize,
+    neutral: T,
+    reduce: impl Fn(T, T) -> T,
+) -> TractResult<T>
+where
+    T: LADatum,
+{
+    if vec.is_empty() {
+        return Ok(neutral);
+    }
+    let mut red = neutral;
+    unsafe {
+        TMP.with(|buffer| {
+            let mut buffer = buffer.borrow_mut();
+            buffer.ensure(nr * T::datum_type().size_of(), alignment_bytes);
+            let tmp = std::slice::from_raw_parts_mut(buffer.buffer as *mut T, nr);
+            let mut compute_via_temp_buffer = |slice: &[T], red: &mut T| {
+                tmp[..slice.len()].copy_from_slice(slice);
+                tmp[slice.len()..].fill(neutral);
+                *red = reduce(*red, f(tmp));
+            };
+            let prefix_len = vec.as_ptr().align_offset(alignment_bytes).min(vec.len());
+            if prefix_len > 0 {
+                compute_via_temp_buffer(&vec[..prefix_len], &mut red);
+            }
+            let aligned_len = (vec.len() - prefix_len) / nr * nr;
+            if aligned_len > 0 {
+                let t = f(&vec[prefix_len..][..aligned_len]);
+                red = reduce(red, t);
+            }
+            if prefix_len + aligned_len < vec.len() {
+                compute_via_temp_buffer(&vec[prefix_len + aligned_len..], &mut red);
+            }
+        })
+    }
+    Ok(red)
+}
+
+pub(crate) fn map_reduce_slice_with_alignment<T>(
+    vec: &mut [T],
+    f: impl Fn(&mut [T]) -> T,
+    nr: usize,
+    alignment_bytes: usize,
+    map_neutral: T,
+    neutral: T,
+    reduce: impl Fn(T, T) -> T,
+) -> TractResult<T>
+where
+    T: LADatum,
+{
+    if vec.is_empty() {
+        return Ok(neutral);
+    }
+    let mut red = neutral;
+    unsafe {
+        TMP.with(|buffer| {
+            let mut buffer = buffer.borrow_mut();
+            buffer.ensure(nr * T::datum_type().size_of(), alignment_bytes);
+            let tmp = std::slice::from_raw_parts_mut(buffer.buffer as *mut T, nr);
+            let mut compute_via_temp_buffer = |slice: &mut [T], red: &mut T| {
+                tmp[..slice.len()].copy_from_slice(slice);
+                tmp[slice.len()..].fill(map_neutral);
+                *red = reduce(*red, f(tmp));
+                slice.copy_from_slice(&tmp[..slice.len()]);
+            };
+            let prefix_len = vec.as_ptr().align_offset(alignment_bytes).min(vec.len());
+            if prefix_len > 0 {
+                compute_via_temp_buffer(&mut vec[..prefix_len], &mut red);
+            }
+            let aligned_len = (vec.len() - prefix_len) / nr * nr;
+            if aligned_len > 0 {
+                let t = f(&mut vec[prefix_len..][..aligned_len]);
+                red = reduce(red, t);
+            }
+            if prefix_len + aligned_len < vec.len() {
+                compute_via_temp_buffer(&mut vec[prefix_len + aligned_len..], &mut red);
+            }
+        })
+    }
+    Ok(red)
+}
+
+std::thread_local! {
+    static TMP: std::cell::RefCell<TempBuffer> = std::cell::RefCell::new(TempBuffer::default());
+}
+
+pub struct TempBuffer {
+    pub layout: Layout,
+    pub buffer: *mut u8,
+}
+
+impl Default for TempBuffer {
+    fn default() -> Self {
+        TempBuffer { layout: Layout::new::<()>(), buffer: std::ptr::null_mut() }
+    }
+}
+
+impl TempBuffer {
+    pub fn ensure(&mut self, size: usize, alignment: usize) {
+        unsafe {
+            if size > self.layout.size() || alignment > self.layout.align() {
+                let size = size.max(self.layout.size());
+                let alignment = alignment.max(self.layout.align());
+                if !self.buffer.is_null() {
+                    std::alloc::dealloc(self.buffer, self.layout);
+                }
+                self.layout = Layout::from_size_align_unchecked(size, alignment);
+                self.buffer = std::alloc::alloc(self.layout);
+                assert!(!self.buffer.is_null());
+            }
+        }
+    }
+}
+
+impl Drop for TempBuffer {
+    fn drop(&mut self) {
+        unsafe {
+            if !self.buffer.is_null() {
+                std::alloc::dealloc(self.buffer, self.layout);
+            }
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/leaky_relu.rs b/vendor/tract-linalg-0.22.1/src/frame/leaky_relu.rs
new file mode 100644
index 000000000..8abf5b01f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/leaky_relu.rs
@@ -0,0 +1,65 @@
+#[allow(unused_macros)]
+macro_rules! leaky_relu_impl {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => {
+        ew_impl!($ti, $func, $nr, $alignment_items, $ti);
+        #[cfg(test)]
+        paste! {
+            mod [<test_ $func>] {
+                use super::*;
+                leaky_relu_frame_tests!($cond, $ti, $func);
+            }
+        }
+    };
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::{frame::element_wise::*, LADatum};
+    use num_traits::{AsPrimitive, Float};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! leaky_relu_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn prop(xs in proptest::collection::vec(-25f32..25.0, 0..100), alpha in 0f32..1f32) {
+                    if $cond {
+                        $crate::frame::leaky_relu::test::test_leaky_relu::<$ker, $t>(&*xs, alpha).unwrap()
+                    }
+                }
+            }
+            #[test]
+            fn trivial() {
+                if $cond {
+                    $crate::frame::leaky_relu::test::test_leaky_relu::<$ker, $t>(&[-10f32], 0.0496).unwrap();
+                }
+            }
+        };
+    }
+
+    pub fn test_leaky_relu<K: ElementWiseKer<T, T>, T: LADatum + Float>(
+        values: &[f32],
+        alpha: f32,
+    ) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        let data = tract_data::prelude::tensor1(values);
+        let data = data.cast_to::<T>().unwrap();
+        let data = data.as_slice::<T>().unwrap();
+        let alpha: T = tract_data::prelude::tensor0(alpha).cast_to_scalar::<T>().unwrap();
+        crate::frame::element_wise::test::test_element_wise_params::<K, T, _, T>(
+            data,
+            |x: T| {
+                if x > T::zero() {
+                    x
+                } else {
+                    alpha * x
+                }
+            },
+            alpha,
+        )
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/lut.rs b/vendor/tract-linalg-0.22.1/src/frame/lut.rs
new file mode 100644
index 000000000..faaa43b49
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/lut.rs
@@ -0,0 +1,141 @@
+use std::fmt;
+use std::hash::Hash;
+use std::marker::PhantomData;
+use tract_data::internal::*;
+
+pub trait Lut: fmt::Debug + dyn_clone::DynClone + Send + Sync {
+    fn table(&self) -> &[u8];
+    fn run(&self, buf: &mut [u8]);
+}
+
+dyn_clone::clone_trait_object!(Lut);
+
+#[derive(Debug, Clone, Hash)]
+pub struct LutImpl<K: LutKer> {
+    table: Tensor,
+    _boo: PhantomData<K>,
+}
+
+impl<K: LutKer> LutImpl<K> {
+    pub fn new(table: &[u8]) -> LutImpl<K> {
+        unsafe {
+            LutImpl {
+                table: Tensor::from_raw_aligned::<u8>(
+                    &[table.len()],
+                    table,
+                    K::table_alignment_bytes(),
+                )
+                .unwrap(),
+                _boo: PhantomData,
+            }
+        }
+    }
+}
+
+impl<K: LutKer> Lut for LutImpl<K> {
+    fn table(&self) -> &[u8] {
+        self.table.as_slice().unwrap()
+    }
+
+    fn run(&self, buf: &mut [u8]) {
+        unsafe {
+            let table: *const u8 = self.table.as_ptr_unchecked();
+            let align = K::input_alignment_bytes();
+            let aligned_start = (buf.as_ptr() as usize).next_multiple_of(align);
+            let prefix = (aligned_start - buf.as_ptr() as usize).min(buf.len());
+            for i in 0..(prefix as isize) {
+                let ptr = buf.as_mut_ptr().offset(i);
+                *ptr = *table.offset(*ptr as isize);
+            }
+            let remaining = buf.len() - prefix;
+            if remaining == 0 {
+                return;
+            }
+            let n = K::n();
+            let aligned_len = remaining / n * n;
+            if aligned_len > 0 {
+                K::run(buf.as_mut_ptr().add(prefix), aligned_len, table);
+            }
+            let remaining = buf.len() - aligned_len - prefix;
+            for i in 0..remaining {
+                let ptr = buf.as_mut_ptr().add(i + prefix + aligned_len);
+                *ptr = *table.offset(*ptr as isize);
+            }
+        }
+    }
+}
+
+pub trait LutKer: Clone + fmt::Debug + Send + Sync + Hash {
+    fn name() -> &'static str;
+    fn n() -> usize;
+    fn input_alignment_bytes() -> usize;
+    fn table_alignment_bytes() -> usize;
+    unsafe fn run(buf: *mut u8, len: usize, table: *const u8);
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use super::*;
+    use proptest::prelude::*;
+
+    #[derive(Debug)]
+    pub struct LutProblem {
+        pub table: Vec<u8>,
+        pub data: Vec<u8>,
+    }
+
+    impl Arbitrary for LutProblem {
+        type Parameters = ();
+        type Strategy = BoxedStrategy<Self>;
+
+        fn arbitrary_with(_p: ()) -> Self::Strategy {
+            proptest::collection::vec(any::<u8>(), 1..256)
+                .prop_flat_map(|table| {
+                    let data = proptest::collection::vec(0..table.len() as u8, 0..100);
+                    (Just(table), data)
+                })
+                .prop_map(|(table, data)| LutProblem { table, data })
+                .boxed()
+        }
+    }
+
+    impl LutProblem {
+        pub fn reference(&self) -> Vec<u8> {
+            self.data.iter().map(|x| self.table[*x as usize]).collect()
+        }
+
+        pub fn test<K: LutKer>(&self) -> Vec<u8> {
+            let lut = LutImpl::<K>::new(&self.table);
+            let mut data = self.data.clone();
+            lut.run(&mut data);
+            data
+        }
+    }
+
+    #[macro_export]
+    macro_rules! lut_frame_tests {
+        ($cond:expr, $ker:ty) => {
+            mod lut {
+                use proptest::prelude::*;
+                #[allow(unused_imports)]
+                use $crate::frame::lut::test::*;
+
+                proptest::proptest! {
+                    #[test]
+                    fn lut_prop(pb in any::<LutProblem>()) {
+                        if $cond {
+                            prop_assert_eq!(pb.test::<$ker>(), pb.reference())
+                        }
+                    }
+                }
+
+                #[test]
+                fn test_empty() {
+                    let pb = LutProblem { table: vec![0], data: vec![] };
+                    assert_eq!(pb.test::<$ker>(), pb.reference())
+                }
+            }
+        };
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/cost_model.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/cost_model.rs
new file mode 100644
index 000000000..5f8cb46ba
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/cost_model.rs
@@ -0,0 +1,86 @@
+use tract_data::internal::*;
+use tract_data::itertools::{izip, Itertools};
+
+use super::MatMatMul;
+
+fn order_f<F: tract_num_traits::Float>(&a: &F, &b: &F) -> std::cmp::Ordering {
+    if a < b {
+        std::cmp::Ordering::Less
+    } else {
+        std::cmp::Ordering::Greater
+    }
+}
+
+#[derive(Debug)]
+pub struct CostModel<'a> {
+    pub big_product_mkn_threshold: f32,
+    pub big_product_kernel_choice: &'a str,
+    pub kernels: &'a [&'a str],
+    pub mrs: &'a [u32],
+    pub nrs: &'a [u32],
+    pub feat_norm_mean: &'a [f32],
+    pub feat_norm_stddev: &'a [f32],
+    pub w1: &'a [f32],
+    pub b1: &'a [f32],
+    pub w2: &'a [f32],
+    pub b2: &'a [f32],
+}
+
+impl CostModel<'_> {
+    pub fn features(&self, m: usize, k: usize, n: usize) -> Vec<f32> {
+        let mut feat = vec![
+            (m as f32).ln(),
+            (k as f32).ln(),
+            (n as f32).ln(),
+            (n as f32 * m as f32 * k as f32).ln(),
+        ];
+        for &mr in self.mrs {
+            let mr = mr as usize;
+            feat.push((m % mr) as f32);
+            feat.push((m % mr != 0) as usize as f32);
+        }
+        for &nr in self.nrs {
+            let nr = nr as usize;
+            feat.push((n % nr) as f32);
+            feat.push((n % nr != 0) as usize as f32);
+        }
+        feat
+    }
+
+    fn normalize(&self, feat: &mut [f32]) {
+        izip!(feat, self.feat_norm_mean, self.feat_norm_stddev)
+            .for_each(|(x, m, s)| *x = (*x - m) / s)
+    }
+
+    fn dnn(x: &[f32], w: &[f32], b: &[f32]) -> Vec<f32> {
+        let x = tract_ndarray::Array1::from_vec(x.to_vec());
+        let w = tract_ndarray::Array2::from_shape_vec([b.len(), x.len()], w.to_vec()).unwrap();
+        let b = tract_ndarray::Array1::from_vec(b.to_vec());
+        (w.dot(&x) + b).to_vec()
+    }
+
+    pub fn predict(&self, m: usize, k: usize, n: usize) -> &str {
+        let mut x = self.features(m, k, n);
+        self.normalize(&mut x);
+        let mut hidden = Self::dnn(&x, self.w1, self.b1);
+        (crate::generic().tanh_f32)().run(&mut hidden).unwrap();
+        let output = Self::dnn(&hidden, self.w2, self.b2);
+        let ix = output.iter().copied().position_max_by(order_f).unwrap();
+        self.kernels[ix]
+    }
+
+    pub fn pick(
+        &self,
+        impls: &[Box<dyn MatMatMul>],
+        m: Option<usize>,
+        k: Option<usize>,
+        n: Option<usize>,
+    ) -> Box<dyn MatMatMul> {
+        if let (Some(m), Some(k), Some(n)) = (m, k, n) {
+            let choice = self.predict(m, k, n);
+            impls.iter().find(|k| k.name() == choice).unwrap().clone()
+        } else {
+            impls.iter().find(|k| k.name() == self.big_product_kernel_choice).unwrap().clone()
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/fuse.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/fuse.rs
new file mode 100644
index 000000000..e077b26f7
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/fuse.rs
@@ -0,0 +1,125 @@
+use std::fmt::Debug;
+use std::ops::Deref;
+
+use crate::pack::PackedFormat;
+use crate::BinOp;
+
+use super::{MMMInputValue, OutputStore, OutputStoreKer};
+use tract_data::internal::*;
+
+#[repr(usize)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum RoundingPolicy {
+    Native,
+    Zero,
+    Away,
+    MinusInf,
+    PlusInf,
+    Even,
+    Odd,
+}
+
+#[derive(Clone, Debug)]
+pub enum AsInputValue<'t> {
+    Owned(Box<dyn MMMInputValue>),
+    Borrowed(&'t dyn MMMInputValue),
+}
+
+impl Deref for AsInputValue<'_> {
+    type Target = dyn MMMInputValue;
+    fn deref(&self) -> &Self::Target {
+        match self {
+            AsInputValue::Owned(b) => &**b,
+            AsInputValue::Borrowed(r) => *r,
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub enum FusedSpec<'t> {
+    BinScalar(&'t Tensor, BinOp),
+    BinPerRow(TensorView<'t>, BinOp),
+    BinPerCol(TensorView<'t>, BinOp),
+    AddRowColProducts(&'t Tensor, &'t Tensor),
+    AddUnicast(OutputStore),
+    LeakyRelu(&'t Tensor),
+    QScale(isize, RoundingPolicy, i32),
+    RoundingShiftRight(usize, RoundingPolicy),
+    ShiftLeft(usize),
+    Store(OutputStore),
+    AddMatMul { a: AsInputValue<'t>, b: AsInputValue<'t>, packing: usize },
+}
+
+impl FusedSpec<'_> {
+    pub fn prefer_col_outer(&self) -> Option<bool> {
+        if let FusedSpec::AddMatMul { a, b, .. } = self {
+            let a_is_eager = a.format().is::<PackedFormat>();
+            let b_is_eager = b.format().is::<PackedFormat>();
+            if a_is_eager == b_is_eager {
+                None
+            } else {
+                Some(a_is_eager)
+            }
+        } else {
+            None
+        }
+    }
+}
+
+// Careful here, the jump_to comments are used by the build script.
+#[repr(C, usize)]
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+#[rustfmt::skip]
+pub enum FusedKerSpec<TI: Copy> {
+    Done,                                       // jump_to:done
+    Clear,                                      // jump_to:clear
+                                                //
+    LoadTile(*const TI, *const TI),             // jump_to:load_tile
+
+    ScalarMin(TI),                              // jump_to:scalar_min
+    ScalarMax(TI),                              // jump_to:scalar_max
+    ScalarAdd(TI),                              // jump_to:scalar_add
+    ScalarMul(TI),                              // jump_to:scalar_mul
+    ScalarSub(TI),                              // jump_to:scalar_sub
+    ScalarSubF(TI),                             // jump_to:scalar_sub_flipped
+
+    LeakyRelu(TI),                              // jump_to:leaky_relu
+
+    PerRowMin(*const TI),                       // jump_to:per_row_min
+    PerRowMax(*const TI),                       // jump_to:per_row_max
+    PerRowAdd(*const TI),                       // jump_to:per_row_add
+    PerRowMul(*const TI),                       // jump_to:per_row_mul
+    PerRowSub(*const TI),                       // jump_to:per_row_sub
+    PerRowSubF(*const TI),                      // jump_to:per_row_sub_flipped
+
+    PerColMin(*const TI),                       // jump_to:per_col_min
+    PerColMax(*const TI),                       // jump_to:per_col_max
+    PerColAdd(*const TI),                       // jump_to:per_col_add
+    PerColMul(*const TI),                       // jump_to:per_col_mul
+    PerColSub(*const TI),                       // jump_to:per_col_sub
+    PerColSubF(*const TI),                      // jump_to:per_col_sub_flipped
+
+    QScale(isize, RoundingPolicy, i32),         // jump_to:q_scale
+    RoundingShiftRight(usize, RoundingPolicy),  // jump_to:q_shr
+    ShiftLeft(usize),                           // jump_to:q_shl
+    AddUnicast(OutputStoreKer),                 // jump_to:add_unicast
+    AddRowColProducts(*const TI, *const TI),    // jump_to:add_row_col_products
+    Store(OutputStoreKer),                      // jump_to:store
+
+    // jump_to:add_mat_mul
+    AddMatMul { k: usize, pa: *const u8, pb: *const u8, packing: usize },
+}
+
+unsafe impl<TI: Copy> Send for FusedKerSpec<TI> {}
+unsafe impl<TI: Copy> Sync for FusedKerSpec<TI> {}
+
+#[cfg(test)]
+#[test]
+fn check_non_linear_enum_size() {
+    assert_eq!(std::mem::size_of::<RoundingPolicy>(), std::mem::size_of::<usize>());
+    assert_eq!(
+        std::mem::size_of::<FusedKerSpec<f32>>(),
+        std::mem::size_of::<usize>() + std::mem::size_of::<OutputStoreKer>()
+    );
+    assert_eq!(std::mem::size_of::<FusedKerSpec<f32>>(), 5 * std::mem::size_of::<usize>());
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/input_store.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/input_store.rs
new file mode 100644
index 000000000..0337578ca
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/input_store.rs
@@ -0,0 +1,179 @@
+use downcast_rs::{impl_downcast, Downcast};
+use dyn_clone::DynClone;
+use dyn_hash::DynHash;
+use std::alloc::Layout;
+use std::fmt::{Debug, Display};
+use std::hash::Hash;
+use std::sync::Arc;
+use tract_data::internal::*;
+
+use crate::WeightType;
+
+pub trait MMMInputFormat: Downcast + Debug + DynHash + DynClone + Send + Sync + Display {
+    fn prepare_tensor(&self, t: &Tensor, k_axis: usize, mn_axis: usize) -> TractResult<Tensor>;
+    fn prepare_one(
+        &self,
+        t: &Tensor,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>>;
+    fn precursor(&self) -> WeightType;
+    fn r(&self) -> usize;
+    fn k_alignment(&self) -> usize;
+    fn same_as(&self, other: &dyn MMMInputFormat) -> bool;
+    fn merge_with<'o, 'a: 'o, 'b: 'o>(
+        &'a self,
+        other: &'b dyn MMMInputFormat,
+    ) -> Option<&'o dyn MMMInputFormat> {
+        if self.same_as(other) {
+            Some(other)
+        } else {
+            None
+        }
+    }
+    fn mem_size(&self, k: TDim, mn: TDim) -> TDim;
+    fn extract_at_mn_f16(
+        &self,
+        data: &EagerPackedInput,
+        mn: usize,
+        slice: &mut [f16],
+    ) -> TractResult<()>;
+    fn extract_at_mn_f32(
+        &self,
+        data: &EagerPackedInput,
+        mn: usize,
+        slice: &mut [f32],
+    ) -> TractResult<()>;
+}
+
+dyn_clone::clone_trait_object!(MMMInputFormat);
+impl_downcast!(MMMInputFormat);
+dyn_hash::hash_trait_object!(MMMInputFormat);
+
+impl Eq for &dyn MMMInputFormat {}
+impl PartialEq for &dyn MMMInputFormat {
+    fn eq(&self, other: &Self) -> bool {
+        self.same_as(*other)
+    }
+}
+
+pub trait MMMInputValue: DynClone + Debug + DynHash + Send + Sync + Display + Downcast {
+    fn format(&self) -> &dyn MMMInputFormat;
+    fn scratch_panel_buffer_layout(&self) -> Option<Layout>;
+    fn panel_bytes(&self, i: usize, buffer: Option<*mut u8>) -> TractResult<*const u8>;
+    fn panels_count(&self) -> usize {
+        self.mn().divceil(self.format().r())
+    }
+    fn mn(&self) -> usize;
+    fn k(&self) -> usize;
+    fn opaque_fact(&self) -> &dyn OpaqueFact;
+    fn same_as(&self, other: &dyn MMMInputValue) -> bool;
+
+    fn extract_at_mn_f16(&self, mn: usize, slice: &mut [f16]) -> TractResult<()>;
+    fn extract_at_mn_f32(&self, mn: usize, slice: &mut [f32]) -> TractResult<()>;
+}
+dyn_clone::clone_trait_object!(MMMInputValue);
+impl_downcast!(MMMInputValue);
+dyn_hash::hash_trait_object!(MMMInputValue);
+
+impl From<Box<dyn MMMInputValue>> for Opaque {
+    fn from(value: Box<dyn MMMInputValue>) -> Self {
+        Opaque(Arc::new(value))
+    }
+}
+
+impl OpaquePayload for Box<dyn MMMInputValue> {
+    fn same_as(&self, other: &dyn OpaquePayload) -> bool {
+        other
+            .downcast_ref::<Self>()
+            .is_some_and(|other| (&**self as &dyn MMMInputValue).same_as(&**other))
+    }
+}
+
+#[allow(clippy::derived_hash_with_manual_eq)]
+#[derive(Clone, Hash, Debug)]
+pub struct PackedOpaqueFact {
+    pub format: Box<dyn MMMInputFormat>,
+    pub mn: TDim,
+    pub k: usize,
+}
+
+impl Display for PackedOpaqueFact {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Eager {} tensor (mn={} k={})", self.format, self.mn, self.k)
+    }
+}
+
+impl OpaqueFact for PackedOpaqueFact {
+    fn mem_size(&self) -> TDim {
+        self.format.mem_size(self.k.to_dim(), self.mn.clone())
+    }
+
+    fn same_as(&self, other: &dyn OpaqueFact) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|o| o == self)
+    }
+}
+
+impl PartialEq for PackedOpaqueFact {
+    fn eq(&self, other: &Self) -> bool {
+        self.format.same_as(&*other.format) && self.mn == other.mn && self.k == other.k
+    }
+}
+
+#[derive(Clone, Hash)]
+pub struct EagerPackedInput {
+    pub fact: PackedOpaqueFact,
+    pub packed: Arc<Blob>,
+    pub panel_bytes: usize,
+    pub mn: usize,
+}
+
+impl MMMInputValue for EagerPackedInput {
+    fn scratch_panel_buffer_layout(&self) -> Option<Layout> {
+        None
+    }
+    fn panel_bytes(&self, i: usize, _buffer: Option<*mut u8>) -> TractResult<*const u8> {
+        unsafe { Ok(self.packed.as_ptr().add(i * self.panel_bytes)) }
+    }
+    fn k(&self) -> usize {
+        self.fact.k
+    }
+    fn mn(&self) -> usize {
+        self.mn
+    }
+    fn format(&self) -> &dyn MMMInputFormat {
+        &*self.fact.format
+    }
+    fn opaque_fact(&self) -> &dyn OpaqueFact {
+        &self.fact
+    }
+    fn same_as(&self, other: &dyn MMMInputValue) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|other| {
+            self.fact.same_as(&other.fact)
+                && self.packed == other.packed
+                && self.panel_bytes == other.panel_bytes
+        })
+    }
+    fn extract_at_mn_f16(&self, mn: usize, slice: &mut [f16]) -> TractResult<()> {
+        ensure!(slice.len() == self.k());
+        ensure!(mn < self.mn());
+        self.fact.format.extract_at_mn_f16(self, mn, slice)
+    }
+    fn extract_at_mn_f32(&self, mn: usize, slice: &mut [f32]) -> TractResult<()> {
+        ensure!(slice.len() == self.k());
+        ensure!(mn < self.mn());
+        self.fact.format.extract_at_mn_f32(self, mn, slice)
+    }
+}
+
+impl Display for EagerPackedInput {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        (&self.fact as &dyn Display).fmt(f)
+    }
+}
+
+impl Debug for EagerPackedInput {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        <Self as Display>::fmt(self, f)
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/kernel.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/kernel.rs
new file mode 100644
index 000000000..8283b5d96
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/kernel.rs
@@ -0,0 +1,159 @@
+use crate::frame::pack::PackedFormat;
+
+use super::*;
+use std::borrow::Cow;
+use std::fmt::Debug;
+
+use crate::LADatum;
+
+pub trait MatMatMulKer: Clone + Debug + Send + Sync + 'static {
+    type Acc: LADatum;
+    fn name(&self) -> &str;
+    fn kernel(&self, op: &[FusedKerSpec<Self::Acc>]) -> isize;
+    fn mr(&self) -> usize;
+    fn nr(&self) -> usize;
+
+    fn quality(&self) -> ImplementationQuality;
+    fn dynamic_boost(&self) -> isize;
+
+    #[allow(clippy::type_complexity)]
+    fn packings(&self) -> &[(Box<dyn MMMInputFormat>, Box<dyn MMMInputFormat>)];
+    fn stores(&self) -> Cow<'_, [DatumType]>;
+
+    #[allow(unused_variables)]
+    fn can_fuse(&self, spec: &FusedSpec) -> bool {
+        true
+    }
+
+    #[allow(unused_variables)]
+    fn is_supported_here(&self) -> bool {
+        true
+    }
+}
+
+type Kernel<Acc> = unsafe fn(&[FusedKerSpec<Acc>]) -> isize;
+
+#[derive(Clone)]
+pub struct DynKernel<const MR: usize, const NR: usize, Acc: LADatum> {
+    pub name: String,
+    pub kernel: Kernel<Acc>,
+    pub quality: ImplementationQuality,
+    pub packings: Vec<(Box<dyn MMMInputFormat>, Box<dyn MMMInputFormat>)>,
+    pub stores: Vec<DatumType>,
+    pub supported_predicate: fn() -> bool,
+    pub boost: fn() -> isize,
+    pub can_fuse: fn(&FusedSpec) -> bool,
+}
+
+impl<const MR: usize, const NR: usize, Acc: LADatum> DynKernel<MR, NR, Acc> {
+    pub fn new(
+        name: &str,
+        kernel: Kernel<Acc>,
+        packing_a: PackedFormat,
+        packing_b: PackedFormat,
+        quality: ImplementationQuality,
+    ) -> Self {
+        let kernel = DynKernel {
+            name: name.to_string(),
+            kernel,
+            quality,
+            packings: vec![],
+            stores: vec![Acc::datum_type()],
+            supported_predicate: || true,
+            boost: || 0,
+            can_fuse: |_| true,
+        };
+        kernel.with_packing(packing_a, packing_b)
+    }
+
+    pub fn with_platform_condition(mut self, f: fn() -> bool) -> Self {
+        self.supported_predicate = f;
+        self
+    }
+
+    pub fn with_boost(mut self, f: fn() -> isize) -> Self {
+        self.boost = f;
+        self
+    }
+
+    pub fn with_packing(mut self, a: impl MMMInputFormat, b: impl MMMInputFormat) -> Self {
+        self.packings.push((Box::new(a), Box::new(b)));
+        self
+    }
+
+    pub fn with_packing_a(self, a: impl MMMInputFormat) -> Self {
+        let b = self.regular_pack_b();
+        self.with_packing(a, b)
+    }
+
+    pub fn regular_pack_a(&self) -> PackedFormat {
+        *self.packings[0].0.clone().downcast::<PackedFormat>().unwrap()
+    }
+
+    pub fn regular_pack_b(&self) -> PackedFormat {
+        *self.packings[0].1.clone().downcast::<PackedFormat>().unwrap()
+    }
+
+    pub fn with_can_fuse(self, can_fuse: fn(&FusedSpec) -> bool) -> Self {
+        Self { can_fuse, ..self }
+    }
+
+    pub fn with_store<D: LADatum>(mut self) -> Self {
+        self.stores.push(D::datum_type());
+        self
+    }
+
+    pub fn mmm(&self) -> Box<dyn MatMatMul> {
+        Box::new(self.clone())
+    }
+}
+
+impl<const MR: usize, const NR: usize, Acc: LADatum> Debug for DynKernel<MR, NR, Acc> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.name)
+    }
+}
+
+impl<const MR: usize, const NR: usize, Acc: LADatum> MatMatMulKer for DynKernel<MR, NR, Acc> {
+    type Acc = Acc;
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    fn mr(&self) -> usize {
+        MR
+    }
+
+    fn nr(&self) -> usize {
+        NR
+    }
+
+    fn quality(&self) -> ImplementationQuality {
+        self.quality
+    }
+
+    fn is_supported_here(&self) -> bool {
+        (self.supported_predicate)()
+    }
+
+    fn can_fuse(&self, spec: &FusedSpec) -> bool {
+        (self.can_fuse)(spec)
+    }
+
+    fn kernel(&self, op: &[FusedKerSpec<Self::Acc>]) -> isize {
+        unsafe { (self.kernel)(op) }
+    }
+
+    #[allow(clippy::type_complexity)]
+    fn packings(&self) -> &[(Box<dyn MMMInputFormat>, Box<dyn MMMInputFormat>)] {
+        &self.packings
+    }
+
+    fn stores(&self) -> Cow<'_, [DatumType]> {
+        Cow::Borrowed(&self.stores)
+    }
+
+    fn dynamic_boost(&self) -> isize {
+        (self.boost)()
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/macros.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/macros.rs
new file mode 100644
index 000000000..d500c1c3b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/macros.rs
@@ -0,0 +1,124 @@
+macro_rules! MMMExternKernel {
+    (
+            $func:ident<$ti:ident>($mr: expr, $nr: expr)
+            $(@($align_a:expr, $align_b:expr))?
+            $(where($where:expr))?
+            $(can_fuse($can_fuse:expr))?
+            $(packing[$pnum:literal] = $pid:ident => $packing:expr;)*
+            $(quality($quality:expr))?
+            $(boost($boost:expr))?
+            $(store($($store:ty),*))?
+     ) => {
+        paste! {
+            mod [<sys_ $func>] {
+                #[allow(unused_imports)]
+                use super::*;
+                #[allow(unused_imports)]
+                use crate::frame::mmm::*;
+                extern_kernel!(fn $func(op: *const FusedKerSpec<$ti>) -> isize);
+
+                #[inline]
+                pub unsafe fn rusty(op: &[FusedKerSpec<$ti>]) -> isize {
+                    unsafe { $func(op.as_ptr()) }
+                }
+            }
+
+            MMMKernel!([<sys_$func>]::rusty as $func<$ti>($mr, $nr)
+                $(@($align_a, $align_b))?
+                $(where($where))?
+                $(can_fuse($can_fuse))?
+                $(packing[$pnum] = $pid => $packing;)*
+                $(quality($quality))?
+                $(boost($boost))?
+                $(store($($store),*))?
+            );
+        }
+    };
+}
+macro_rules! MMMRustKernel {
+    (       $func: path =>
+            $id:ident<$ti:ident>($mr: expr, $nr: expr)
+            $(@($align_a:expr, $align_b:expr))?
+            $(where($where:expr))?
+            $(can_fuse($can_fuse:expr))?
+            $(packing[$pnum:literal] = $pid:ident => $packing:expr;)*
+            $(quality($quality:expr))?
+            $(store($($store:ty),*))?
+     ) => {
+        paste! {
+            mod [<sys_ $id>] {
+                #[allow(unused_imports)]
+                use crate::frame::mmm::*;
+                use super::*;
+                #[inline]
+                pub unsafe fn rusty(op: &[FusedKerSpec<$ti>]) -> isize {
+                    unsafe { $func(op.as_ptr()) }
+                }
+            }
+            MMMKernel!([<sys_$id>]::rusty as $id<$ti>($mr, $nr)
+                $(@($align_a, $align_b))?
+                generic(true)
+                $(where($where))?
+                $(can_fuse($can_fuse))?
+                $(packing[$pnum] = $pid => $packing;)*
+                $(quality($quality))?
+                $(store($($store),*))?
+            );
+        }
+    }
+}
+
+macro_rules! MMMKernel {
+    (
+            $func: path as
+            $id:ident<$ti:ident>($mr: expr, $nr: expr)
+            $(@($align_a:expr, $align_b:expr))?
+            $(generic($generic:expr))?
+            $(where($where:expr))?
+            $(can_fuse($can_fuse:expr))?
+            $(packing[$pnum:literal] = $pid:ident => $packing:expr;)*
+            $(quality($quality:expr))?
+            $(boost($boost:expr))?
+            $(store($($store:ty),*))?
+     ) => {
+        paste! {
+            lazy_static::lazy_static! {
+                pub static ref $id: $crate::mmm::DynKernel<$mr, $nr, $ti> = {
+                    use $crate::mmm::DynKernel;
+                    #[allow(unused_imports)]
+                    use tract_data::prelude::*;
+                    use $crate::pack::Packing;
+                    #[allow(unused_mut)]
+                    let (mut packing_a, mut packing_b) = ($ti::packing($mr), $ti::packing($nr));
+                    $(
+                        packing_a = packing_a.align($align_a);
+                        packing_b = packing_b.align($align_b);
+                    )?
+                    #[allow(unused_mut)]
+                    let mut k = DynKernel::<$mr, $nr, $ti>::new(stringify!($id), $func, packing_a, packing_b, $crate::frame::mmm::ImplementationQuality::Dreadful);
+                    $(k = k.with_platform_condition($where);)?
+                    $(
+                        assert!(k.packings.len() == $pnum);
+                        let f: fn(DynKernel<$mr, $nr, $ti>) -> DynKernel<$mr, $nr, $ti> = $packing;
+                        k = f(k);
+                    )*
+                    $($(
+                        k.stores.push(<$store>::datum_type());
+                    )*)?
+                    $(k.can_fuse = $can_fuse;)?
+                    $(k.quality = $quality;)?
+                    $(k = k.with_boost($boost);)?
+                    k
+                };
+            }
+
+            #[cfg(test)]
+            mod [<test_$id>] {
+                use super::$id;
+                test_mmm_kernel!($ti, &*super::$id);
+                $(mmm_packed_packed_tests!(&*super::$id, $pid : $pnum);)*
+                $($(mmm_store_test!(&*super::$id, $store);)*)?
+            }
+        }
+    };
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/mod.rs
new file mode 100644
index 000000000..3200d293c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/mod.rs
@@ -0,0 +1,307 @@
+#[macro_use]
+mod macros;
+
+pub mod cost_model;
+#[macro_use]
+pub(crate) mod fuse;
+pub(crate) mod input_store;
+pub(crate) mod kernel;
+#[macro_use]
+pub(crate) mod panel_extract;
+mod scratch;
+mod storage;
+
+#[cfg(test)]
+#[macro_use]
+pub mod tests;
+
+use crate::multithread::Executor;
+#[cfg(feature = "multithread-mm")]
+use rayon::prelude::*;
+use std::borrow::Cow;
+use std::cmp::Ordering;
+use std::fmt::Debug;
+use tract_data::internal::*;
+
+pub use cost_model::*;
+pub use fuse::*;
+pub use input_store::*;
+pub use kernel::*;
+pub use panel_extract::*;
+pub use scratch::*;
+pub use storage::*;
+
+pub fn no_prefetch(_ptr: *const u8, _len: usize) {}
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+pub enum ImplementationQuality {
+    /// Individual operations are emulated by individual conversion (f16->f32->f16)
+    Dreadful,
+    /// Rust scalar operation (with whatever optimisation the compiler manages)
+    Generic,
+    /// Implicit vectorization (e.g. Rust code, some unrolled loops, explicit template instantiations for small constant)
+    RustOptimized,
+    /// Explicit vectorization (e.g. intrinsics vector code)
+    TargetOptimized,
+    /// Hand optimized (assembly)
+    ManuallyOptimized,
+}
+
+impl ImplementationQuality {
+    pub fn best_to_worst() -> &'static [ImplementationQuality] {
+        use ImplementationQuality::*;
+        &[ManuallyOptimized, TargetOptimized, RustOptimized, Generic, Dreadful]
+    }
+
+    pub fn cost(&self) -> usize {
+        ImplementationQuality::best_to_worst().iter().position(|x| x == self).unwrap()
+    }
+}
+
+impl PartialOrd for ImplementationQuality {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(usize::from(*self).cmp(&usize::from(*other)))
+    }
+}
+
+impl From<ImplementationQuality> for usize {
+    fn from(value: ImplementationQuality) -> Self {
+        value.cost()
+    }
+}
+
+pub trait MatMatMul: Debug + dyn_clone::DynClone + Send + Sync + std::any::Any {
+    fn name(&self) -> &str;
+    fn mr(&self) -> usize;
+    fn nr(&self) -> usize;
+
+    fn quality(&self) -> ImplementationQuality;
+    fn dynamic_boost(&self) -> isize;
+
+    #[allow(clippy::type_complexity)]
+    fn packings(&self) -> &[(Box<dyn MMMInputFormat>, Box<dyn MMMInputFormat>)];
+
+    fn internal_type(&self) -> DatumType;
+
+    unsafe fn c_view(&self, m_axis: Option<usize>, n_axis: Option<usize>) -> OutputStoreSpec;
+    unsafe fn c_from_data_and_strides(
+        &self,
+        item_size: usize,
+        row_stride: isize,
+        col_stride: isize,
+    ) -> OutputStoreSpec;
+
+    fn can_fuse(&self, spec: &FusedSpec) -> bool;
+
+    fn stores(&self) -> Cow<'_, [DatumType]>;
+
+    unsafe fn run(&self, m: usize, n: usize, non_linear: &[FusedSpec]) -> TractResult<()> {
+        unsafe {
+            let mut scratch = self.allocate_scratch_space();
+            self.run_with_scratch_space(m, n, &mut *scratch, non_linear)
+        }
+    }
+
+    unsafe fn allocate_scratch_space(&self) -> Box<dyn ScratchSpace>;
+    unsafe fn can_use_scratch_space(&self, scratch: &dyn ScratchSpace) -> bool;
+    unsafe fn run_with_scratch_space(
+        &self,
+        m: usize,
+        n: usize,
+        scratch: &mut dyn ScratchSpace,
+        non_linear: &[FusedSpec],
+    ) -> TractResult<()>;
+}
+
+dyn_clone::clone_trait_object!(MatMatMul);
+
+impl PartialEq for Box<dyn MatMatMul> {
+    fn eq(&self, other: &Box<dyn MatMatMul>) -> bool {
+        self.name() == other.name()
+    }
+}
+
+impl std::hash::Hash for Box<dyn MatMatMul> {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.name().hash(state)
+    }
+}
+
+impl<K: MatMatMulKer> MatMatMul for K {
+    fn name(&self) -> &str {
+        self.name()
+    }
+    fn mr(&self) -> usize {
+        self.mr()
+    }
+    fn nr(&self) -> usize {
+        self.nr()
+    }
+
+    fn quality(&self) -> ImplementationQuality {
+        MatMatMulKer::quality(self)
+    }
+
+    fn dynamic_boost(&self) -> isize {
+        MatMatMulKer::dynamic_boost(self)
+    }
+
+    fn packings(&self) -> &[(Box<dyn MMMInputFormat>, Box<dyn MMMInputFormat>)] {
+        self.packings()
+    }
+
+    fn internal_type(&self) -> DatumType {
+        K::Acc::datum_type()
+    }
+
+    fn can_fuse(&self, spec: &FusedSpec) -> bool {
+        self.can_fuse(spec)
+    }
+
+    unsafe fn c_view(&self, m_axis: Option<usize>, n_axis: Option<usize>) -> OutputStoreSpec {
+        OutputStoreSpec::View { m_axis, n_axis, mr: self.mr(), nr: self.nr() }
+    }
+
+    unsafe fn c_from_data_and_strides(
+        &self,
+        item_size: usize,
+        row_stride: isize,
+        col_stride: isize,
+    ) -> OutputStoreSpec {
+        OutputStoreSpec::Strides {
+            row_byte_stride: row_stride * item_size as isize,
+            col_byte_stride: col_stride * item_size as isize,
+            mr: self.mr(),
+            nr: self.nr(),
+        }
+    }
+
+    fn stores(&self) -> Cow<'_, [DatumType]> {
+        self.stores()
+    }
+
+    unsafe fn allocate_scratch_space(&self) -> Box<dyn ScratchSpace> {
+        Box::<ScratchSpaceImpl<K::Acc>>::default()
+    }
+
+    unsafe fn can_use_scratch_space(&self, scratch: &dyn ScratchSpace) -> bool {
+        scratch.downcast_ref::<ScratchSpaceImpl<K::Acc>>().is_some()
+    }
+
+    unsafe fn run_with_scratch_space(
+        &self,
+        m: usize,
+        n: usize,
+        scratch: &mut dyn ScratchSpace,
+        non_linear: &[FusedSpec],
+    ) -> TractResult<()> {
+        unsafe {
+            let scratch = scratch
+                .downcast_mut::<ScratchSpaceImpl<K::Acc>>()
+                .context("Wrong scratch space type")?;
+            scratch.prepare(self, m, n, non_linear)?;
+            if n == 1 && self.nr() == 1 {
+                run_with_scratch_space_vec(self, m, scratch, non_linear)
+            } else {
+                let (mut prefer_col, mut prefer_row) = (0, 0);
+                for uop in non_linear.iter() {
+                    if let Some(col) = uop.prefer_col_outer() {
+                        prefer_col = col as usize;
+                        prefer_row = (!col) as usize;
+                    }
+                }
+                if prefer_col > prefer_row {
+                    run_with_scratch_space_col_outer(self, m, n, scratch, non_linear)
+                } else {
+                    run_with_scratch_space_row_outer(self, m, n, scratch, non_linear)
+                }
+            }
+        }
+    }
+}
+
+unsafe fn run_with_scratch_space_vec<K: MatMatMulKer>(
+    ker: &K,
+    m: usize,
+    scratch: &mut ScratchSpaceImpl<K::Acc>,
+    non_linear: &[FusedSpec],
+) -> TractResult<()> {
+    unsafe {
+        match crate::multithread::current_tract_executor() {
+            Executor::SingleThread => {
+                for ia in 0..m.divceil(ker.mr()) {
+                    scratch.run(ker, non_linear, ia, 0)?;
+                }
+                Ok(())
+            }
+            #[cfg(feature = "multithread-mm")]
+            Executor::MultiThread(pool) => pool.install(|| {
+                (0..m.div_ceil(ker.mr()))
+                    .into_par_iter()
+                    .try_for_each(|ia| scratch.run(ker, non_linear, ia, 0))
+            }),
+        }
+    }
+}
+
+unsafe fn run_with_scratch_space_col_outer<K: MatMatMulKer>(
+    ker: &K,
+    m: usize,
+    n: usize,
+    scratch: &mut ScratchSpaceImpl<K::Acc>,
+    non_linear: &[FusedSpec],
+) -> TractResult<()> {
+    unsafe {
+        match crate::multithread::current_tract_executor() {
+            Executor::SingleThread => {
+                for ib in 0..n.divceil(ker.nr()) {
+                    for ia in 0..m.divceil(ker.mr()) {
+                        scratch.run(ker, non_linear, ia, ib)?;
+                    }
+                }
+                Ok(())
+            }
+            #[cfg(feature = "multithread-mm")]
+            Executor::MultiThread(pool) => pool.install(|| {
+                (0..n.div_ceil(ker.nr())).into_par_iter().try_for_each(|ib| {
+                    for ia in 0..m.divceil(ker.mr()) {
+                        scratch.run(ker, non_linear, ia, ib)?;
+                    }
+                    Ok(())
+                })
+            }),
+        }
+    }
+}
+
+unsafe fn run_with_scratch_space_row_outer<K: MatMatMulKer>(
+    ker: &K,
+    m: usize,
+    n: usize,
+    scratch: &mut ScratchSpaceImpl<K::Acc>,
+    non_linear: &[FusedSpec],
+) -> TractResult<()> {
+    unsafe {
+        match crate::multithread::current_tract_executor() {
+            Executor::SingleThread => {
+                for ia in 0..m.divceil(ker.mr()) {
+                    for ib in 0..n.divceil(ker.nr()) {
+                        scratch.run(ker, non_linear, ia, ib)?;
+                    }
+                }
+                Ok(())
+            }
+            #[cfg(feature = "multithread-mm")]
+            Executor::MultiThread(pool) => pool.install(|| {
+                pool.install(|| {
+                    (0..m.div_ceil(ker.mr())).into_par_iter().try_for_each(|ia| {
+                        for ib in 0..n.divceil(ker.nr()) {
+                            scratch.run(ker, non_linear, ia, ib)?;
+                        }
+                        Ok(())
+                    })
+                })
+            }),
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/panel_extract.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/panel_extract.rs
new file mode 100644
index 000000000..31862ab48
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/panel_extract.rs
@@ -0,0 +1,300 @@
+use std::fmt::{Debug, Display};
+use tract_data::internal::*;
+
+use super::{EagerPackedInput, MMMInputFormat, MMMInputValue};
+use crate::pack::PackedFormat;
+
+type Kernel = unsafe fn(input: *const u8, output: *mut u8, k: usize);
+
+#[allow(clippy::derived_hash_with_manual_eq)]
+#[derive(Hash, Clone)]
+pub struct PanelExtractor {
+    pub name: String,
+    pub from: Box<dyn MMMInputFormat>,
+    pub to: PackedFormat,
+    pub kernel: Kernel,
+    pub supported_predicate: fn() -> bool,
+}
+
+impl Debug for PanelExtractor {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} ({:?} -> {:?})", self.name, self.from, self.to)
+    }
+}
+
+impl Display for PanelExtractor {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.name)
+    }
+}
+
+impl PartialEq for PanelExtractor {
+    fn eq(&self, other: &Self) -> bool {
+        self.name == other.name && self.from.same_as(&*other.from) && self.to == other.to
+    }
+}
+
+impl PanelExtractor {
+    #[allow(unused_variables)]
+    pub fn is_supported_here(&self) -> bool {
+        (self.supported_predicate)()
+    }
+}
+
+#[derive(Clone, Hash)]
+pub struct PanelExtractInput {
+    pub format: PanelExtractor,
+    pub data: EagerPackedInput,
+}
+
+impl MMMInputValue for PanelExtractInput {
+    fn scratch_panel_buffer_layout(&self) -> Option<std::alloc::Layout> {
+        Some(self.format.to.single_panel_layout(self.data.k(), self.format.to.dt.size_of()))
+    }
+    fn panel_bytes(&self, i: usize, buffer: Option<*mut u8>) -> TractResult<*const u8> {
+        let scratch = buffer.unwrap();
+        unsafe {
+            let source = self.data.packed.as_ptr().add(self.data.panel_bytes * i);
+            (self.format.kernel)(source, scratch, self.data.k());
+        }
+        Ok(scratch)
+    }
+    fn mn(&self) -> usize {
+        self.data.mn()
+    }
+    fn k(&self) -> usize {
+        self.data.k()
+    }
+    fn format(&self) -> &dyn MMMInputFormat {
+        &self.format.to
+    }
+    fn opaque_fact(&self) -> &dyn OpaqueFact {
+        self.data.opaque_fact()
+    }
+    fn same_as(&self, other: &dyn MMMInputValue) -> bool {
+        other
+            .downcast_ref::<Self>()
+            .is_some_and(|o| o.format == self.format && o.data.same_as(&self.data))
+    }
+    fn extract_at_mn_f16(&self, mn: usize, slice: &mut [f16]) -> TractResult<()> {
+        self.data.extract_at_mn_f16(mn, slice)
+    }
+    fn extract_at_mn_f32(&self, mn: usize, slice: &mut [f32]) -> TractResult<()> {
+        self.data.extract_at_mn_f32(mn, slice)
+    }
+}
+
+impl Display for PanelExtractInput {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "PanelExtract({})", self.data)
+    }
+}
+
+impl Debug for PanelExtractInput {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "PanelExtract({})", self.data)
+    }
+}
+
+#[macro_export]
+macro_rules! panel_extractor {
+    ( $func:path as $id:ident($from:expr, $to: expr)
+            $(where($where:expr))?
+     ) => {
+        paste! {
+            lazy_static::lazy_static! {
+                pub static ref $id: $crate::mmm::PanelExtractor = {
+                    use $crate::mmm::MMMInputFormat;
+                    let (from, to) = ($from, $to);
+                    assert!(from.r() == to.r());
+                    #[allow(unused_mut)]
+                    let mut it = $crate::mmm::PanelExtractor {
+                        name: stringify!($id).to_string(),
+                        from,
+                        to,
+                        kernel: $func,
+                        supported_predicate: || true
+                    };
+                    $(
+                        it.supported_predicate = $where;
+                    )?
+                    it
+                };
+            }
+
+            #[cfg(test)]
+            mod [<test_$id>] {
+                use super::$id;
+                #[test]
+                fn repack_0block_1panel() {
+                    $crate::frame::mmm::panel_extract::test::test_packing(&$id, 0, 1).unwrap();
+                }
+
+                #[test]
+                fn repack_1block_0panel() {
+                    $crate::frame::mmm::panel_extract::test::test_packing(&$id, 1, 0).unwrap();
+                }
+
+                #[test]
+                fn repack_1block_1panel() {
+                    $crate::frame::mmm::panel_extract::test::test_packing(&$id, 1, 1).unwrap();
+                }
+
+                #[test]
+                fn repack_2block_1panel() {
+                    $crate::frame::mmm::panel_extract::test::test_packing(&$id, 2, 1).unwrap();
+                }
+
+                #[test]
+                fn repack_1block_2panel() {
+                    $crate::frame::mmm::panel_extract::test::test_packing(&$id, 1, 2).unwrap();
+                }
+
+                #[test]
+                fn repack_2block_2panel() {
+                    $crate::frame::mmm::panel_extract::test::test_packing(&$id, 2, 2).unwrap();
+                }
+            }
+        }
+    };
+}
+
+#[cfg(test)]
+pub mod test {
+    use crate::frame::block_quant::PackedBlockQuantFormat;
+    use tract_data::internal::*;
+    use tract_ndarray::Array2;
+
+    use super::*;
+
+    pub fn test_packing(
+        extractor: &PanelExtractor,
+        blocks: usize,
+        panels: usize,
+    ) -> TractResult<()> {
+        if !extractor.is_supported_here() {
+            return Ok(());
+        }
+        assert!(extractor.from.r() == extractor.to.r());
+        assert!(extractor.to.dt == f32::datum_type() || extractor.to.dt == f16::datum_type());
+        if let Some(from) = extractor.from.downcast_ref::<PackedBlockQuantFormat>() {
+            test_packing_bq(extractor, from, blocks, panels)
+        } else if let Some(from) = extractor.from.downcast_ref() {
+            test_packing_plain(extractor, from, blocks, panels)
+        } else {
+            todo!()
+        }
+    }
+
+    pub fn test_packing_plain(
+        extractor: &PanelExtractor,
+        from: &PackedFormat,
+        blocks: usize,
+        panels: usize,
+    ) -> TractResult<()> {
+        let m = from.r * panels;
+        let k = 8 * blocks; // 8 is arbitrary
+        let to = &extractor.to;
+        let weights_orig =
+            Array2::from_shape_fn((m, k), |(m, k)| ((m * 31 + k * 17) % 20) as f32 - 10.)
+                .into_tensor()
+                .cast_to_dt(from.dt)?
+                .into_owned();
+        let packed_orig = from.prepare_tensor(&weights_orig, 1, 0)?;
+        let packed_orig =
+            packed_orig.to_scalar::<Opaque>()?.downcast_ref::<Box<dyn MMMInputValue>>().unwrap();
+        let packed_orig = packed_orig.downcast_ref::<EagerPackedInput>().unwrap();
+
+        for panel in 0..panels {
+            let orig_panel = &packed_orig.packed[packed_orig.panel_bytes * panel..]
+                [..k * from.r * from.dt.size_of()];
+            let mut reference_panel = Tensor::zero_dt(from.dt, &[k, from.r])?;
+            reference_panel.as_bytes_mut().copy_from_slice(orig_panel);
+            reference_panel = reference_panel.cast_to_dt(to.dt)?.into_owned();
+
+            let mut tested_panel = Tensor::zero_dt(to.dt, &[k, from.r])?;
+            unsafe {
+                (extractor.kernel)(
+                    orig_panel.as_ptr(),
+                    tested_panel.as_bytes_mut().as_mut_ptr(),
+                    k,
+                );
+            }
+            compare_panels(&tested_panel, &reference_panel, from.r, k);
+        }
+        Ok(())
+    }
+
+    pub fn test_packing_bq(
+        extractor: &PanelExtractor,
+        from: &PackedBlockQuantFormat,
+        blocks: usize,
+        panels: usize,
+    ) -> TractResult<()> {
+        let m = from.r * panels;
+        let k = from.bq.block_len() * blocks;
+        let to = &extractor.to;
+        let weights_orig =
+            Array2::from_shape_fn((m, k), |(m, k)| ((m * 31 + k * 17) % 20) as f32 - 10.)
+                .into_tensor()
+                .cast_to_dt(to.dt)?
+                .into_owned();
+        let weights = if to.dt == f32::datum_type() {
+            from.bq
+                .dequant_f32(&from.bq.quant_f32(weights_orig.as_slice::<f32>()?)?)?
+                .into_shape(&[m, k])?
+        } else {
+            from.bq
+                .dequant_f16(&from.bq.quant_f16(weights_orig.as_slice::<f16>()?)?)?
+                .into_shape(&[m, k])?
+        };
+        let block_quant = if to.dt == f32::datum_type() {
+            from.bq.quant_f32(weights.as_slice::<f32>()?)?
+        } else {
+            from.bq.quant_f16(weights.as_slice::<f16>()?)?
+        };
+        let packed_block_quant =
+            from.bq.pack(&block_quant, k, from.r, from.zip, from.scales_at_end)?;
+
+        let mut reference_panel = Tensor::zero_dt(to.dt, &[k, from.r])?;
+        let mut tested_panel = Tensor::zero_dt(to.dt, &[k, from.r])?;
+
+        for panel in 0..packed_block_quant.panels_count() {
+            unsafe {
+                from.bq.extract_packed_panel(
+                    &packed_block_quant,
+                    to,
+                    panel,
+                    reference_panel.as_bytes_mut().as_mut_ptr(),
+                )?;
+
+                let source =
+                    packed_block_quant.packed.as_ptr().add(packed_block_quant.panel_bytes * panel);
+                (extractor.kernel)(source, tested_panel.as_bytes_mut().as_mut_ptr(), k);
+            }
+            compare_panels(&tested_panel, &reference_panel, from.r, k);
+        }
+        Ok(())
+    }
+
+    fn compare_panels(tested_panel: &Tensor, reference_panel: &Tensor, r: usize, k: usize) {
+        if tested_panel != reference_panel {
+            if reference_panel.datum_type() == f32::datum_type() {
+                crate::frame::mmm::tests::display_error(
+                    tested_panel.as_slice::<f32>().unwrap(),
+                    reference_panel.as_slice::<f32>().unwrap(),
+                    r,
+                    k,
+                );
+            } else {
+                crate::frame::mmm::tests::display_error(
+                    tested_panel.as_slice::<f16>().unwrap(),
+                    reference_panel.as_slice::<f16>().unwrap(),
+                    r,
+                    k,
+                );
+            }
+        }
+        assert_eq!(tested_panel, reference_panel);
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/scratch.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/scratch.rs
new file mode 100644
index 000000000..aec2e265a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/scratch.rs
@@ -0,0 +1,529 @@
+use super::{FusedKerSpec, FusedSpec, MatMatMulKer, OutputStoreKer};
+use crate::{BinOp, LADatum};
+use downcast_rs::{impl_downcast, Downcast};
+use std::cell::RefCell;
+use std::fmt::Debug;
+use std::sync::atomic::AtomicUsize;
+use tract_data::internal::num_integer::Integer;
+use tract_data::internal::*;
+
+static GENERATION: AtomicUsize = AtomicUsize::new(1);
+
+thread_local! {
+    static TLS: RefCell<TLSScratch> = Default::default();
+}
+
+#[derive(Default, Debug)]
+struct TLSScratch {
+    generation: usize,
+    blob: Blob,
+    ker_specs_16: Vec<FusedKerSpec<f16>>,
+    ker_specs_32: Vec<FusedKerSpec<f32>>,
+    ker_specs_64: Vec<FusedKerSpec<f64>>,
+}
+
+impl TLSScratch {
+    #[allow(unknown_lints, clippy::missing_transmute_annotations)]
+    fn ker_specs<TI: LADatum>(&mut self) -> &mut Vec<FusedKerSpec<TI>> {
+        unsafe {
+            if TI::datum_type() == f32::datum_type() || TI::datum_type() == i32::datum_type() {
+                std::mem::transmute(&mut self.ker_specs_32)
+            } else if TI::datum_type() == f16::datum_type() {
+                std::mem::transmute(&mut self.ker_specs_16)
+            } else if TI::datum_type() == f64::datum_type() {
+                std::mem::transmute(&mut self.ker_specs_64)
+            } else {
+                todo!();
+            }
+        }
+    }
+
+    fn sync<TI: LADatum>(&mut self, scratch: &ScratchSpaceImpl<TI>) {
+        if self.generation == scratch.generation {
+            return;
+        }
+        let ker_specs = self.ker_specs::<TI>();
+        ker_specs.clear();
+        ker_specs.extend_from_slice(&scratch.ker_specs);
+
+        unsafe {
+            self.blob.ensure_size_and_align(scratch.blob_size, scratch.blob_align);
+
+            for LocDependant { loc, ker_spec, .. } in &scratch.loc_dependant {
+                #[allow(clippy::single_match)]
+                if matches!(scratch.ker_specs[*ker_spec], FusedKerSpec::AddMatMul { .. }) {
+                    let scratch = &mut *(self.blob.as_ptr().add(*loc) as *mut AddMatMulTemp);
+                    scratch.panel_a_id = usize::MAX;
+                    scratch.panel_b_id = usize::MAX;
+                };
+            }
+        }
+        self.generation = scratch.generation;
+    }
+}
+
+pub trait ScratchSpace: Downcast + Send {}
+impl_downcast!(ScratchSpace);
+
+#[derive(Debug, Default)]
+pub struct ScratchSpaceImpl<TI: LADatum> {
+    generation: usize,
+    blob_size: usize,
+    blob_align: usize,
+    ker_specs: Vec<FusedKerSpec<TI>>,
+    loc_dependant: TVec<LocDependant>,
+    valid_down_tiles: usize,
+    remnant_down: usize,
+    valid_right_tiles: usize,
+    remnant_right: usize,
+}
+
+#[derive(Debug, new)]
+struct LocDependant {
+    spec: usize,
+    ker_spec: usize,
+    // offset for the location dependant structure
+    loc: usize,
+    // offset of its associated dynamic-size buffers
+    buffer_a: Option<usize>,
+    buffer_b: Option<usize>,
+}
+
+impl<TI: LADatum> ScratchSpace for ScratchSpaceImpl<TI> {}
+unsafe impl<TI: LADatum> Send for ScratchSpaceImpl<TI> {}
+
+#[derive(Debug)]
+struct AddMatMulTemp {
+    ptr_a: *const u8,
+    panel_a_id: usize,
+    ptr_b: *const u8,
+    panel_b_id: usize,
+}
+
+impl<TI: LADatum> ScratchSpaceImpl<TI> {
+    pub unsafe fn prepare(
+        &mut self,
+        ker: &impl MatMatMulKer<Acc = TI>,
+        m: usize,
+        n: usize,
+        specs: &[FusedSpec],
+    ) -> TractResult<()> {
+        use FusedKerSpec as FKS;
+        use FusedSpec as FS;
+        self.ker_specs.clear();
+        self.loc_dependant.clear();
+        self.ker_specs.reserve(specs.len() + 2);
+        self.ker_specs.push(FusedKerSpec::Clear);
+        self.valid_down_tiles = m / ker.mr();
+        self.remnant_down = m % ker.mr();
+        self.valid_right_tiles = n / ker.nr();
+        self.remnant_right = n % ker.nr();
+        let mut offset = 0;
+        let mut align = std::mem::size_of::<*const ()>();
+        fn ld(spec: usize, uspec: usize, loc: usize) -> LocDependant {
+            LocDependant { spec, ker_spec: uspec, loc, buffer_a: None, buffer_b: None }
+        }
+        for (ix, spec) in specs.iter().enumerate() {
+            offset = offset.next_multiple_of(&align);
+            let ker_spec = match spec {
+                FS::BinScalar(t, op) => match op {
+                    BinOp::Min => FKS::ScalarMin(*t.to_scalar()?),
+                    BinOp::Max => FKS::ScalarMax(*t.to_scalar()?),
+                    BinOp::Mul => FKS::ScalarMul(*t.to_scalar()?),
+                    BinOp::Add => FKS::ScalarAdd(*t.to_scalar()?),
+                    BinOp::Sub => FKS::ScalarSub(*t.to_scalar()?),
+                    BinOp::SubF => FKS::ScalarSubF(*t.to_scalar()?),
+                },
+                FS::ShiftLeft(s) => FKS::ShiftLeft(*s),
+                FS::RoundingShiftRight(s, rp) => FKS::RoundingShiftRight(*s, *rp),
+                FS::QScale(s, rp, m) => FKS::QScale(*s, *rp, *m),
+                FS::BinPerRow(_, _) => {
+                    self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset));
+                    offset += TI::datum_type().size_of() * ker.mr();
+                    FusedKerSpec::Done
+                }
+                FS::BinPerCol(_, _) => {
+                    self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset));
+                    offset += TI::datum_type().size_of() * ker.nr();
+                    FusedKerSpec::Done
+                }
+                FS::AddRowColProducts(_, _) => {
+                    self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset));
+                    offset += TI::datum_type().size_of() * (ker.mr() + ker.nr());
+                    FusedKerSpec::Done
+                }
+                FS::AddUnicast(_) => {
+                    self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset));
+                    offset += TI::datum_type().size_of() * ker.mr() * ker.nr();
+                    FusedKerSpec::Done
+                }
+                FS::Store(store) => {
+                    self.loc_dependant.push(ld(ix, self.ker_specs.len(), offset));
+                    offset += store.item_size * ker.mr() * ker.nr();
+                    FusedKerSpec::Done
+                }
+                FS::LeakyRelu(t) => FKS::LeakyRelu(*t.to_scalar()?),
+                FS::AddMatMul { a, b, packing } => {
+                    let mut ld = ld(ix, self.ker_specs.len(), offset);
+                    offset += std::mem::size_of::<AddMatMulTemp>();
+                    if let Some(tmp) = a.scratch_panel_buffer_layout() {
+                        align = tmp.align().lcm(&align);
+                        offset = Integer::next_multiple_of(&offset, &tmp.align());
+                        ld.buffer_a = Some(offset);
+                        offset += tmp.size();
+                    }
+                    if let Some(tmp) = b.scratch_panel_buffer_layout() {
+                        align = tmp.align().lcm(&align);
+                        offset = Integer::next_multiple_of(&offset, &tmp.align());
+                        ld.buffer_b = Some(offset);
+                        offset += tmp.size();
+                    }
+                    self.loc_dependant.push(ld);
+                    FusedKerSpec::AddMatMul {
+                        k: 0,
+                        pa: std::ptr::null(),
+                        pb: std::ptr::null(),
+                        packing: *packing,
+                    }
+                }
+            };
+            self.ker_specs.push(ker_spec);
+        }
+        self.ker_specs.push(FKS::Done);
+        self.blob_size = offset;
+        self.blob_align = align;
+
+        self.generation = GENERATION.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        Ok(())
+    }
+
+    pub unsafe fn run(
+        &self,
+        ker: &impl MatMatMulKer<Acc = TI>,
+        specs: &[FusedSpec],
+        down: usize,
+        right: usize,
+    ) -> TractResult<()> {
+        unsafe {
+            TLS.with_borrow_mut(|tls| {
+                tls.sync(self);
+                if down < self.valid_down_tiles && right < self.valid_right_tiles {
+                    self.for_valid_tile(ker, specs, tls, down, right)?;
+                    let err = ker.kernel(tls.ker_specs());
+                    debug_assert_eq!(err, 0, "Kernel return error {err}");
+                } else {
+                    let remnant_down =
+                        if down < self.valid_down_tiles { ker.mr() } else { self.remnant_down };
+                    let remnant_right =
+                        if right < self.valid_right_tiles { ker.nr() } else { self.remnant_right };
+                    self.for_border_tile(
+                        ker,
+                        specs,
+                        tls,
+                        down,
+                        right,
+                        remnant_down,
+                        remnant_right,
+                    )?;
+                    let err = ker.kernel(tls.ker_specs());
+                    debug_assert_eq!(err, 0, "Kernel return error {err}");
+                    self.postprocess_tile(specs, tls, down, right, remnant_down, remnant_right)?;
+                }
+                Ok(())
+            })
+        }
+    }
+
+    #[inline(always)]
+    unsafe fn for_valid_tile(
+        &self,
+        ker: &impl MatMatMulKer<Acc = TI>,
+        specs: &[FusedSpec],
+        tls: &mut TLSScratch,
+        down: usize,
+        right: usize,
+    ) -> TractResult<()> {
+        unsafe {
+            use FusedKerSpec as FKS;
+            use FusedSpec as FS;
+            let ScratchSpaceImpl { ker_specs, loc_dependant, .. } = self;
+            debug_assert!(specs.len() + 2 == ker_specs.len());
+            for LocDependant { spec, ker_spec, loc, buffer_a, buffer_b } in loc_dependant {
+                let spec = specs.get_unchecked(*spec);
+                let it = match spec {
+                    FS::BinPerRow(v, op) => {
+                        let v = v.as_ptr_unchecked::<TI>().add(down * ker.mr());
+                        match op {
+                            BinOp::Min => FKS::PerRowMin(v),
+                            BinOp::Max => FKS::PerRowMax(v),
+                            BinOp::Add => FKS::PerRowAdd(v),
+                            BinOp::Mul => FKS::PerRowMul(v),
+                            BinOp::Sub => FKS::PerRowSub(v),
+                            BinOp::SubF => FKS::PerRowSubF(v),
+                        }
+                    }
+                    FS::BinPerCol(v, op) => {
+                        let v = v.as_ptr_unchecked::<TI>().add(right * ker.nr());
+                        match op {
+                            BinOp::Min => FKS::PerColMin(v),
+                            BinOp::Max => FKS::PerColMax(v),
+                            BinOp::Add => FKS::PerColAdd(v),
+                            BinOp::Mul => FKS::PerColMul(v),
+                            BinOp::Sub => FKS::PerColSub(v),
+                            BinOp::SubF => FKS::PerColSubF(v),
+                        }
+                    }
+                    FS::AddRowColProducts(rows, cols) => {
+                        let row_ptr = rows.as_ptr_unchecked::<TI>().add(down * ker.mr());
+                        let col_ptr = cols.as_ptr_unchecked::<TI>().add(right * ker.nr());
+                        FKS::AddRowColProducts(row_ptr, col_ptr)
+                    }
+                    FS::AddUnicast(store) => FKS::AddUnicast(store.tile_c(down, right)),
+                    FS::Store(c_store) => FKS::Store(c_store.tile_c(down, right)),
+                    FS::AddMatMul { a, b, packing } => {
+                        let scratch = (tls.blob.as_mut_ptr().add(*loc) as *mut AddMatMulTemp)
+                            .as_mut()
+                            .unwrap();
+                        if scratch.panel_a_id != down {
+                            scratch.ptr_a = a.panel_bytes(
+                                down,
+                                buffer_a.map(|o| tls.blob.as_mut_ptr().add(o)),
+                            )?;
+                            scratch.panel_a_id = down;
+                        }
+                        if scratch.panel_b_id != right {
+                            scratch.ptr_b = b.panel_bytes(
+                                right,
+                                buffer_b.map(|o| tls.blob.as_mut_ptr().add(o)),
+                            )?;
+                            scratch.panel_b_id = right;
+                        }
+                        FKS::AddMatMul {
+                            k: b.k(),
+                            pa: scratch.ptr_a,
+                            pb: scratch.ptr_b,
+                            packing: *packing,
+                        }
+                    }
+                    _ => std::hint::unreachable_unchecked(),
+                };
+                *tls.ker_specs().get_unchecked_mut(*ker_spec) = it;
+            }
+            Ok(())
+        }
+    }
+
+    #[inline(never)]
+    #[allow(clippy::too_many_arguments)]
+    unsafe fn for_border_tile(
+        &self,
+        ker: &impl MatMatMulKer<Acc = TI>,
+        specs: &[FusedSpec],
+        tls: &mut TLSScratch,
+        down: usize,
+        right: usize,
+        m_remnant: usize,
+        n_remnant: usize,
+    ) -> TractResult<()> {
+        unsafe {
+            use FusedKerSpec as FKS;
+            use FusedSpec as FS;
+            for LocDependant { spec, ker_spec: uspec, loc, buffer_a, buffer_b } in
+                &self.loc_dependant
+            {
+                let loc = tls.blob.as_mut_ptr().add(*loc);
+                let spec = specs.get_unchecked(*spec);
+                let it = match spec {
+                    FS::BinPerRow(v, op) => {
+                        let buf = std::slice::from_raw_parts_mut(loc as *mut TI, ker.mr());
+                        let ptr = if m_remnant < ker.mr() {
+                            if m_remnant > 0 {
+                                buf.get_unchecked_mut(..m_remnant).copy_from_slice(
+                                    v.as_slice_unchecked()
+                                        .get_unchecked(down * ker.mr()..)
+                                        .get_unchecked(..m_remnant),
+                                );
+                            }
+                            if cfg!(debug_assertions) {
+                                buf.get_unchecked_mut(m_remnant..)
+                                    .iter_mut()
+                                    .for_each(|x| *x = TI::zero());
+                            }
+                            buf.as_ptr()
+                        } else {
+                            v.as_ptr_unchecked::<TI>().add(down * ker.mr())
+                        };
+                        match op {
+                            BinOp::Min => FKS::PerRowMin(ptr),
+                            BinOp::Max => FKS::PerRowMax(ptr),
+                            BinOp::Add => FKS::PerRowAdd(ptr),
+                            BinOp::Mul => FKS::PerRowMul(ptr),
+                            BinOp::Sub => FKS::PerRowSub(ptr),
+                            BinOp::SubF => FKS::PerRowSubF(ptr),
+                        }
+                    }
+                    FS::BinPerCol(v, op) => {
+                        let buf = std::slice::from_raw_parts_mut(loc as *mut TI, ker.nr());
+                        let ptr = if n_remnant < ker.nr() {
+                            if n_remnant > 0 {
+                                buf.get_unchecked_mut(..n_remnant).copy_from_slice(
+                                    v.as_slice_unchecked()
+                                        .get_unchecked(right * ker.nr()..)
+                                        .get_unchecked(..n_remnant),
+                                );
+                            }
+                            if cfg!(debug_assertions) {
+                                buf.get_unchecked_mut(n_remnant..)
+                                    .iter_mut()
+                                    .for_each(|x| *x = TI::zero());
+                            }
+                            buf.as_ptr()
+                        } else {
+                            v.as_ptr_unchecked::<TI>().add(right * ker.nr())
+                        };
+                        match op {
+                            BinOp::Min => FKS::PerColMin(ptr),
+                            BinOp::Max => FKS::PerColMax(ptr),
+                            BinOp::Add => FKS::PerColAdd(ptr),
+                            BinOp::Mul => FKS::PerColMul(ptr),
+                            BinOp::Sub => FKS::PerColSub(ptr),
+                            BinOp::SubF => FKS::PerColSubF(ptr),
+                        }
+                    }
+                    FS::AddRowColProducts(rows, cols) => {
+                        let r = std::slice::from_raw_parts_mut(loc as *mut TI, ker.mr());
+                        let row_ptr = if m_remnant < ker.mr() {
+                            r.get_unchecked_mut(..m_remnant).copy_from_slice(
+                                rows.as_slice_unchecked()
+                                    .get_unchecked(down * ker.mr()..)
+                                    .get_unchecked(..m_remnant),
+                            );
+                            if cfg!(debug_assertions) {
+                                r.get_unchecked_mut(m_remnant..)
+                                    .iter_mut()
+                                    .for_each(|x| *x = TI::zero());
+                            }
+                            r.as_ptr()
+                        } else {
+                            rows.as_ptr_unchecked::<TI>().add(down * ker.mr())
+                        };
+                        let c = std::slice::from_raw_parts_mut(
+                            (loc as *mut TI).add(ker.mr()),
+                            ker.nr(),
+                        );
+                        let col_ptr = if n_remnant < ker.nr() {
+                            c.get_unchecked_mut(..n_remnant).copy_from_slice(
+                                cols.as_slice_unchecked()
+                                    .get_unchecked(right * ker.nr()..)
+                                    .get_unchecked(..n_remnant),
+                            );
+                            if cfg!(debug_assertions) {
+                                r.get_unchecked_mut(n_remnant..)
+                                    .iter_mut()
+                                    .for_each(|x| *x = TI::zero());
+                            }
+                            c.as_ptr()
+                        } else {
+                            cols.as_ptr_unchecked::<TI>().add(right * ker.nr())
+                        };
+                        FKS::AddRowColProducts(row_ptr, col_ptr)
+                    }
+                    FS::AddUnicast(store) => {
+                        let row_byte_stride = store.row_byte_stride;
+                        let col_byte_stride = store.col_byte_stride;
+                        let tile_offset = row_byte_stride * down as isize * ker.mr() as isize
+                            + col_byte_stride * right as isize * ker.nr() as isize;
+                        let tile_ptr = store.ptr.offset(tile_offset);
+                        let tmp_d_tile =
+                            std::slice::from_raw_parts_mut(loc as *mut TI, ker.mr() * ker.nr());
+                        if cfg!(debug_assertions) {
+                            tmp_d_tile.iter_mut().for_each(|t| *t = TI::zero());
+                        }
+                        for r in 0..m_remnant as isize {
+                            for c in 0..n_remnant as isize {
+                                let inner_offset = c * col_byte_stride + r * row_byte_stride;
+                                if inner_offset + tile_offset
+                                    < (store.item_size * store.item_count) as isize
+                                {
+                                    *tmp_d_tile
+                                        .get_unchecked_mut(r as usize + c as usize * ker.mr()) =
+                                        *(tile_ptr.offset(inner_offset) as *const TI);
+                                }
+                            }
+                        }
+                        FKS::AddUnicast(OutputStoreKer {
+                            ptr: tmp_d_tile.as_ptr() as _,
+                            row_byte_stride: std::mem::size_of::<TI>() as isize,
+                            col_byte_stride: (std::mem::size_of::<TI>() * ker.mr()) as isize,
+                            item_size: std::mem::size_of::<TI>(),
+                        })
+                    }
+                    FS::Store(c_store) => {
+                        let tmpc = OutputStoreKer {
+                            ptr: loc as _,
+                            item_size: c_store.item_size,
+                            row_byte_stride: c_store.item_size as isize,
+                            col_byte_stride: (c_store.item_size * ker.mr()) as isize,
+                        };
+                        FKS::Store(tmpc)
+                    }
+                    FS::AddMatMul { a, b, packing } => {
+                        let scratch = (loc as *mut AddMatMulTemp).as_mut().unwrap();
+                        if scratch.panel_a_id != down {
+                            scratch.ptr_a = a.panel_bytes(
+                                down,
+                                buffer_a.map(|o| tls.blob.as_mut_ptr().add(o)),
+                            )?;
+                            scratch.panel_a_id = down;
+                        }
+                        if scratch.panel_b_id != right {
+                            scratch.ptr_b = b.panel_bytes(
+                                right,
+                                buffer_b.map(|o| tls.blob.as_mut_ptr().add(o)),
+                            )?;
+                            scratch.panel_b_id = right;
+                        }
+                        FKS::AddMatMul {
+                            k: b.k(),
+                            pa: scratch.ptr_a,
+                            pb: scratch.ptr_b,
+                            packing: *packing,
+                        }
+                    }
+                    _ => std::hint::unreachable_unchecked(),
+                };
+                *tls.ker_specs().get_unchecked_mut(*uspec) = it;
+            }
+            Ok(())
+        }
+    }
+
+    #[inline]
+    pub fn uspecs(&self) -> &[FusedKerSpec<TI>] {
+        &self.ker_specs
+    }
+
+    unsafe fn postprocess_tile(
+        &self,
+        specs: &[FusedSpec],
+        tls: &mut TLSScratch,
+        down: usize,
+        right: usize,
+        m_remnant: usize,
+        n_remnant: usize,
+    ) -> TractResult<()>
+    where
+        TI: LADatum,
+    {
+        unsafe {
+            for LocDependant { spec, ker_spec: uspec, .. } in self.loc_dependant.iter() {
+                let spec = specs.get_unchecked(*spec);
+                let ker_spec = tls.ker_specs::<TI>().get_unchecked(*uspec);
+                if let (FusedSpec::Store(c_store), FusedKerSpec::Store(tmp)) = (spec, ker_spec) {
+                    c_store.set_from_tile(down, right, m_remnant, n_remnant, tmp)
+                }
+            }
+            Ok(())
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/storage.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/storage.rs
new file mode 100644
index 000000000..959128cff
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/storage.rs
@@ -0,0 +1,139 @@
+use std::fmt::Debug;
+use tract_data::internal::*;
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub enum OutputStoreSpec {
+    View { m_axis: Option<usize>, n_axis: Option<usize>, mr: usize, nr: usize },
+    Strides { row_byte_stride: isize, col_byte_stride: isize, mr: usize, nr: usize },
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct OutputStore {
+    pub(crate) ptr: *mut u8,
+    pub(crate) row_byte_stride: isize,
+    pub(crate) col_byte_stride: isize,
+    pub(crate) panel_row_byte_stride: isize,
+    pub(crate) panel_col_byte_stride: isize,
+    pub(crate) item_size: usize,
+    pub(crate) item_count: usize,
+    pub(crate) mr: usize,
+}
+
+unsafe impl Send for OutputStore {}
+unsafe impl Sync for OutputStore {}
+
+impl OutputStoreSpec {
+    #[inline]
+    pub unsafe fn wrap(&self, tensor: &TensorView) -> OutputStore {
+        let (mr, nr, row_byte_stride, col_byte_stride) = unsafe { self.compute_strides(tensor) };
+        OutputStore {
+            ptr: unsafe { tensor.as_ptr_unchecked::<u8>() } as _,
+            row_byte_stride,
+            col_byte_stride,
+            panel_row_byte_stride: row_byte_stride * mr as isize,
+            panel_col_byte_stride: col_byte_stride * nr as isize,
+            item_size: tensor.datum_type().size_of(),
+            mr,
+            item_count: tensor.len(),
+        }
+    }
+
+    #[inline]
+    unsafe fn compute_strides(&self, tensor: &TensorView) -> (usize, usize, isize, isize) {
+        let size_of = tensor.datum_type().size_of() as isize;
+        match self {
+            OutputStoreSpec::View { m_axis, n_axis, mr, nr, .. } => {
+                let tensor_strides = tensor.strides();
+                let row_item_stride =
+                    m_axis.map(|ax| *unsafe { tensor_strides.get_unchecked(ax) }).unwrap_or(0);
+                let col_item_stride =
+                    n_axis.map(|ax| *unsafe { tensor_strides.get_unchecked(ax) }).unwrap_or(0);
+                let row_byte_stride = row_item_stride * size_of;
+                let col_byte_stride = col_item_stride * size_of;
+                (*mr, *nr, row_byte_stride, col_byte_stride)
+            }
+            OutputStoreSpec::Strides { row_byte_stride, col_byte_stride, mr, nr, .. } => {
+                (*mr, *nr, *row_byte_stride, *col_byte_stride)
+            }
+        }
+    }
+}
+
+impl OutputStore {
+    #[inline]
+    pub(super) unsafe fn tile_c(&self, down: usize, right: usize) -> OutputStoreKer {
+        unsafe {
+            let (down, right) = (down as isize, right as isize);
+            OutputStoreKer {
+                ptr: self
+                    .ptr
+                    .offset(self.panel_row_byte_stride * down + self.panel_col_byte_stride * right)
+                    as *mut _,
+                row_byte_stride: self.row_byte_stride,
+                col_byte_stride: self.col_byte_stride,
+                item_size: self.item_size,
+            }
+        }
+    }
+
+    #[inline]
+    pub fn item_size(&self) -> usize {
+        self.item_size
+    }
+
+    #[inline]
+    pub(super) unsafe fn set_from_tile(
+        &self,
+        down: usize,
+        right: usize,
+        height: usize,
+        width: usize,
+        tile: &OutputStoreKer,
+    ) {
+        unsafe {
+            if self.item_size() == 1 {
+                self.set_from_tile_t::<i8>(down, right, height, width, tile)
+            } else if self.item_size() == 2 {
+                self.set_from_tile_t::<i16>(down, right, height, width, tile)
+            } else if self.item_size() == 4 {
+                self.set_from_tile_t::<i32>(down, right, height, width, tile)
+            } else {
+                self.set_from_tile_t::<i64>(down, right, height, width, tile)
+            }
+        }
+    }
+
+    #[inline]
+    unsafe fn set_from_tile_t<T: Datum + Copy>(
+        &self,
+        down: usize,
+        right: usize,
+        height: usize,
+        width: usize,
+        tile: &OutputStoreKer,
+    ) {
+        unsafe {
+            let tile = tile.ptr as *mut T;
+            let dst = self.ptr.add(
+                self.panel_row_byte_stride as usize * down
+                    + self.panel_col_byte_stride as usize * right,
+            );
+            for y in 0..height as isize {
+                for x in 0..width as isize {
+                    let value = tile.offset(y + x * self.mr as isize);
+                    let dst = dst.offset(y * self.row_byte_stride + x * self.col_byte_stride);
+                    *(dst as *mut T) = *value;
+                }
+            }
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+pub struct OutputStoreKer {
+    pub ptr: *mut u8,
+    pub row_byte_stride: isize,
+    pub col_byte_stride: isize,
+    pub item_size: usize,
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/frame.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/frame.rs
new file mode 100644
index 000000000..384eab9e3
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/frame.rs
@@ -0,0 +1,295 @@
+use crate::frame::mmm::*;
+use crate::{BinOp, LADatum};
+use num_traits::AsPrimitive;
+use std::ops::Neg;
+use tests::display_error;
+use tract_data::internal::*;
+
+#[macro_export]
+macro_rules! mmm_frame_tests {
+    ($ker:expr, $ta:ty, $tb:ty, $tc:ty, $ti:ty) => {
+        mod frame {
+            use tract_data::internal::*;
+            #[allow(unused_imports)]
+            use $crate::frame::mmm::tests::frame::*;
+
+            #[test]
+            fn row_mul_2_1_3() -> TractResult<()> {
+                unsafe { row_mul::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? }
+                Ok(())
+            }
+
+            #[test]
+            fn row_add_2_1_3() -> TractResult<()> {
+                unsafe { row_add::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? }
+                Ok(())
+            }
+
+            #[test]
+            fn col_mul_2_1_3() -> TractResult<()> {
+                unsafe { col_mul::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? }
+                Ok(())
+            }
+
+            #[test]
+            fn col_add_2_1_3() -> TractResult<()> {
+                unsafe { col_add::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? }
+                Ok(())
+            }
+
+            #[test]
+            fn max_2_1_3() -> TractResult<()> {
+                unsafe { max::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? }
+                Ok(())
+            }
+
+            #[test]
+            fn min_2_1_3() -> TractResult<()> {
+                unsafe { min::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? }
+                Ok(())
+            }
+
+            #[test]
+            fn add_d_2_1_3() -> TractResult<()> {
+                unsafe { add_d::<_, $ta, $tb, $tc, $ti>($ker, 2, 3)? }
+                Ok(())
+            }
+
+            #[test]
+            fn add_d_big() -> TractResult<()> {
+                unsafe { add_d::<_, $ta, $tb, $tc, $ti>($ker, 197, 1)? }
+                Ok(())
+            }
+        }
+    };
+}
+
+pub unsafe fn fused_ops<
+    K: MatMatMulKer<Acc = TI> + 'static,
+    TA,
+    TB,
+    TC,
+    TI,
+    F: Fn(usize, usize) -> TC,
+>(
+    ker: &K,
+    m: usize,
+    n: usize,
+    spec: &[FusedSpec],
+    expect: F,
+) -> TractResult<()>
+where
+    TA: LADatum + AsPrimitive<TI> + 'static,
+    TB: LADatum + AsPrimitive<TI> + 'static,
+    TC: LADatum + AsPrimitive<TI> + 'static,
+    TI: LADatum + AsPrimitive<TC> + 'static,
+    i32: AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    if !ker.is_supported_here() {
+        return Ok(());
+    };
+    crate::setup_test_logger();
+
+    let mut found = Tensor::zero::<TC>(&[m, n])?;
+    let c_store = unsafe {
+        ker.c_from_data_and_strides(TC::datum_type().size_of(), n as isize, 1)
+            .wrap(&found.view_mut())
+    };
+    let mut spec: TVec<FusedSpec> = spec.into();
+    spec.push(FusedSpec::Store(c_store));
+
+    unsafe { ker.run(m, n, &spec) }?;
+    let expected =
+        tract_ndarray::prelude::Array2::from_shape_fn((m, n), |(r, c)| expect(r, c)).into_tensor();
+    let err = found.close_enough(&expected, true);
+    if err.is_err() {
+        display_error(found.as_slice::<TC>()?, expected.as_slice::<TC>()?, m, n);
+    }
+    err
+}
+
+pub unsafe fn row_add<K: MatMatMulKer<Acc = TI> + 'static, TA, TB, TC, TI>(
+    ker: &K,
+    m: usize,
+    n: usize,
+) -> TractResult<()>
+where
+    TA: LADatum + AsPrimitive<TI> + 'static,
+    TB: LADatum + AsPrimitive<TI> + 'static,
+    TC: LADatum + AsPrimitive<TI> + 'static,
+    TI: LADatum + AsPrimitive<TC> + 'static + Neg<Output = TI>,
+    i32: AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    let bias = (0..m).map(|i| i.as_()).collect::<Vec<TI>>();
+    unsafe {
+        fused_ops::<K, TA, TB, TC, TI, _>(
+            ker,
+            m,
+            n,
+            &[FusedSpec::BinPerRow(tensor1(&bias).view(), BinOp::Add)],
+            |r, _| bias[r].as_(),
+        )
+    }
+}
+
+pub unsafe fn row_mul<K: MatMatMulKer<Acc = TI> + 'static, TA, TB, TC, TI>(
+    ker: &K,
+    m: usize,
+    n: usize,
+) -> TractResult<()>
+where
+    TA: LADatum + AsPrimitive<TI> + 'static,
+    TB: LADatum + AsPrimitive<TI> + 'static,
+    TC: LADatum + AsPrimitive<TI> + 'static,
+    TI: LADatum + AsPrimitive<TC> + 'static + Neg<Output = TI>,
+    i32: AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    let bias = (0..m).map(|i| i.as_()).collect::<Vec<TI>>();
+    unsafe {
+        fused_ops::<K, TA, TB, TC, TI, _>(
+            ker,
+            m,
+            n,
+            &[
+                FusedSpec::BinScalar(&tensor0(1i32.as_()), BinOp::Add),
+                FusedSpec::BinPerRow(tensor1(&bias).view(), BinOp::Mul),
+            ],
+            |r, _| bias[r].as_(),
+        )
+    }
+}
+
+pub unsafe fn col_add<K: MatMatMulKer<Acc = TI> + 'static, TA, TB, TC, TI>(
+    ker: &K,
+    m: usize,
+    n: usize,
+) -> TractResult<()>
+where
+    TA: LADatum + AsPrimitive<TI> + 'static,
+    TB: LADatum + AsPrimitive<TI> + 'static,
+    TC: LADatum + AsPrimitive<TI> + 'static,
+    TI: LADatum + AsPrimitive<TC> + 'static + Neg<Output = TI>,
+    i32: AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    let bias = (0..n).map(|i| i.as_()).collect::<Vec<TI>>();
+    unsafe {
+        fused_ops::<K, TA, TB, TC, TI, _>(
+            ker,
+            m,
+            n,
+            &[FusedSpec::BinPerCol(tensor1(&bias).view(), BinOp::Add)],
+            |_, c| bias[c].as_(),
+        )
+    }
+}
+
+pub unsafe fn col_mul<K: MatMatMulKer<Acc = TI> + 'static, TA, TB, TC, TI>(
+    ker: &K,
+    m: usize,
+    n: usize,
+) -> TractResult<()>
+where
+    TA: LADatum + AsPrimitive<TI> + 'static,
+    TB: LADatum + AsPrimitive<TI> + 'static,
+    TC: LADatum + AsPrimitive<TI> + 'static,
+    TI: LADatum + AsPrimitive<TC> + 'static + Neg<Output = TI>,
+    i32: AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    let bias = (0..n).map(|i| i.as_()).collect::<Vec<TI>>();
+    unsafe {
+        fused_ops::<K, TA, TB, TC, TI, _>(
+            ker,
+            m,
+            n,
+            &[
+                FusedSpec::BinScalar(&tensor0(1i32.as_()), BinOp::Add),
+                FusedSpec::BinPerCol(tensor1(&bias).view(), BinOp::Mul),
+            ],
+            |_, c| bias[c].as_(),
+        )
+    }
+}
+
+pub unsafe fn add_d<K: MatMatMulKer<Acc = TI> + 'static, TA, TB, TC, TI>(
+    ker: &K,
+    m: usize,
+    n: usize,
+) -> TractResult<()>
+where
+    TA: LADatum + AsPrimitive<TI> + 'static,
+    TB: LADatum + AsPrimitive<TI> + 'static,
+    TC: LADatum + AsPrimitive<TI> + 'static,
+    TI: LADatum + AsPrimitive<TC> + 'static + Neg<Output = TI>,
+    i32: AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    let d = (0..m * n).map(|i| i.as_()).collect::<Vec<TI>>();
+    let d = tensor1(&d).into_shape(&[m, n])?;
+    let store_spec =
+        OutputStoreSpec::View { m_axis: Some(0), n_axis: Some(1), mr: ker.mr(), nr: ker.nr() };
+    let view_d = d.to_array_view::<TI>()?.into_dimensionality()?;
+    unsafe {
+        fused_ops::<K, TA, TB, TC, TI, _>(
+            ker,
+            m,
+            n,
+            &[FusedSpec::AddUnicast(store_spec.wrap(&d.view()))],
+            |r, c| view_d[(r, c)].as_(),
+        )
+    }
+}
+
+pub unsafe fn max<K: MatMatMulKer<Acc = TI>, TA, TB, TC, TI>(
+    ker: &K,
+    m: usize,
+    n: usize,
+) -> TractResult<()>
+where
+    TA: LADatum + AsPrimitive<TI> + 'static,
+    TB: LADatum + AsPrimitive<TI> + 'static,
+    TC: LADatum + AsPrimitive<TI> + 'static,
+    TI: LADatum + AsPrimitive<TC> + 'static + Neg<Output = TI>,
+    i32: AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    let five: TI = 5.as_();
+    unsafe {
+        fused_ops::<K, TA, TB, TC, TI, _>(
+            ker,
+            m,
+            n,
+            &[FusedSpec::BinScalar(&tensor0(five), BinOp::Max)],
+            |_, _| five.as_(),
+        )
+    }
+}
+
+pub unsafe fn min<K: MatMatMulKer<Acc = TI>, TA, TB, TC, TI>(
+    ker: &K,
+    m: usize,
+    n: usize,
+) -> TractResult<()>
+where
+    TA: LADatum + AsPrimitive<TI> + 'static,
+    TB: LADatum + AsPrimitive<TI> + 'static,
+    TC: LADatum + AsPrimitive<TI> + 'static,
+    TI: LADatum + AsPrimitive<TC> + 'static + Neg<Output = TI>,
+    i32: AsPrimitive<TI>,
+    usize: AsPrimitive<TI>,
+{
+    let five: TI = 5.as_();
+    unsafe {
+        fused_ops::<K, TA, TB, TC, TI, _>(
+            ker,
+            m,
+            n,
+            &[FusedSpec::BinScalar(&tensor0(five), BinOp::Min)],
+            |_, _| TC::zero(),
+        )
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/fuse.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/fuse.rs
new file mode 100644
index 000000000..da909a80b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/fuse.rs
@@ -0,0 +1,287 @@
+use crate::frame::mmm::fuse::FusedKerSpec;
+use crate::frame::mmm::storage::*;
+use crate::frame::mmm::tests::display_error;
+use crate::frame::mmm::tests::store::mmm_stride_storage;
+use crate::frame::mmm::*;
+use num_traits::{AsPrimitive, Bounded};
+use proptest::prelude::*;
+use tract_data::internal::*;
+
+#[macro_export]
+macro_rules! mmm_kernel_fuse_tests {
+    ($ker:expr, $tc:ty, $ti: ty) => {
+        mod fuse {
+            use num_traits::Zero;
+            #[allow(unused_imports)]
+            use tract_data::prelude::f16;
+            use tract_data::prelude::tensor0;
+            use $crate::frame::mmm::tests::fuse as test;
+            #[allow(unused_imports)]
+            use $crate::frame::mmm::tests::fuse::*;
+            use $crate::frame::mmm::MatMatMulKer;
+
+            #[test]
+            fn return_zeros() {
+                test::return_zeros::<_, $tc, $ti>($ker)
+            }
+
+            #[test]
+            fn store_non_contiguous() {
+                test::store_non_contiguous::<_, $tc, $ti>($ker)
+            }
+            proptest::proptest! {
+                #[test]
+                fn return_c_prop(c in tile::<_, $ti>($ker)) {
+                    test::return_c::<_, $ti>($ker, &c)
+                }
+            }
+
+            fn fmin<T: PartialOrd>(a: T, b: T) -> T {
+                if a < b {
+                    a
+                } else {
+                    b
+                }
+            }
+
+            fn fmax<T: PartialOrd>(a: T, b: T) -> T {
+                if a > b {
+                    a
+                } else {
+                    b
+                }
+            }
+
+            macro_rules! bin {
+                ($FKS:ident, $geo:expr, $f:expr, $extra_cond:expr) => {
+                    paste! {
+                        #[test]
+                        fn [<$FKS:snake>]() {
+                            if ($ker).is_supported_here() && $extra_cond {
+                                test::$geo::<_, $ti>($ker, $crate::mmm::FusedKerSpec::$FKS, $f);
+                            }
+                        }
+                    }
+                };
+            }
+
+            bin!(PerColMin, per_col, fmin, true);
+            bin!(PerColMax, per_col, fmax, true);
+            bin!(PerColAdd, per_col, |a, b| a + b, true);
+            bin!(PerColMul, per_col, |a, b| a * b, true);
+            bin!(PerColSub, per_col, |a, b| a - b, true);
+            bin!(PerColSubF, per_col, |a, b| b - a, true);
+
+            bin!(PerRowMin, per_row, fmin, true);
+            bin!(PerRowMax, per_row, fmax, true);
+            bin!(PerRowAdd, per_row, |a, b| a + b, true);
+            bin!(PerRowMul, per_row, |a, b| a * b, true);
+            bin!(PerRowSub, per_row, |a, b| a - b, true);
+            bin!(PerRowSubF, per_row, |a, b| b - a, true);
+
+            bin!(ScalarMin, scalar, fmin, true);
+            bin!(ScalarMax, scalar, fmax, true);
+            bin!(ScalarAdd, scalar, |a, b| a + b, true);
+            bin!(ScalarMul, scalar, |a, b| a * b, true);
+            bin!(ScalarSub, scalar, |a, b| a - b, true);
+            bin!(ScalarSubF, scalar, |a, b| b - a, true);
+
+            bin!(
+                LeakyRelu,
+                scalar,
+                |a, b| if b > <$ti>::zero() { b } else { a * b },
+                ($ker).can_fuse(&$crate::mmm::FusedSpec::LeakyRelu(&tensor0(<$ti>::from(1_u8))))
+            );
+
+            #[test]
+            fn return_c_add_row_col_product() {
+                test::return_c_add_row_col_product::<_, $ti>($ker)
+            }
+
+            #[test]
+            fn return_c_plus_d() {
+                test::return_c_plus_d::<_, $ti, $ti>($ker)
+            }
+
+            #[test]
+            fn return_c_clear() {
+                test::return_c_clear::<_, $ti>($ker)
+            }
+        }
+    };
+}
+
+use crate::LADatum;
+pub fn return_zeros<K, TC, TI>(ker: &K)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TC: LADatum,
+    TI: LADatum + Bounded + PartialEq,
+{
+    if !ker.is_supported_here() {
+        return;
+    }
+    let v = vec![TC::max_value(); ker.mr() * ker.nr()];
+    let c = mmm_stride_storage(&v, ker.nr());
+    let non_linear = tvec![FusedKerSpec::Clear, FusedKerSpec::Store(c), FusedKerSpec::Done];
+    let err = ker.kernel(&non_linear);
+    assert_eq!(err, 0);
+    let expected = vec![TC::zero(); v.len()];
+    display_error(&v, &expected, ker.mr(), ker.nr());
+    assert_eq!(v, expected);
+}
+
+pub fn store_non_contiguous<K, TC, TI>(ker: &K)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TC: LADatum,
+    TI: LADatum + Bounded + PartialEq,
+{
+    if !ker.is_supported_here() {
+        return;
+    }
+    let v = vec![TC::max_value(); ker.mr() * 5 * ker.nr() * 3];
+    let c = OutputStoreKer {
+        ptr: v.as_ptr() as _,
+        row_byte_stride: (std::mem::size_of::<TC>() * 3 * ker.nr() * 5) as isize,
+        col_byte_stride: std::mem::size_of::<TC>() as isize * 3,
+        item_size: std::mem::size_of::<TC>(),
+    };
+    let non_linear = tvec![FusedKerSpec::Clear, FusedKerSpec::Store(c), FusedKerSpec::Done];
+    let err = ker.kernel(&non_linear);
+    assert_eq!(err, 0);
+    let mut expected = vec![TC::max_value(); v.len()];
+    for c in 0..ker.nr() {
+        for r in 0..ker.mr() {
+            expected[c * 3 + r * 3 * 5 * ker.nr()] = TC::zero();
+        }
+    }
+    assert_eq!(v, expected);
+}
+
+pub fn fused_ops<K, TI, E>(ker: &K, c: &[TI], ops: &[FusedKerSpec<TI>], expect: E)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    E: Fn(usize, usize, TI) -> TI,
+{
+    if !ker.is_supported_here() {
+        return;
+    }
+    assert!(c.len() == ker.mr() * ker.nr());
+    let v = c.to_vec();
+    let c = mmm_stride_storage(&v, ker.nr());
+    let mut ops = ops.to_vec();
+    ops.insert(0, FusedKerSpec::AddUnicast(c));
+    ops.insert(0, FusedKerSpec::Clear);
+    ops.push(FusedKerSpec::Store(c));
+    ops.push(FusedKerSpec::Done);
+    let expected =
+        (0..v.len()).map(|ix| expect(ix / ker.nr(), ix % ker.nr(), v[ix])).collect::<Vec<TI>>();
+    let err = ker.kernel(&ops);
+    assert_eq!(err, 0);
+    display_error(&v, &expected, ker.mr(), ker.nr());
+    assert_eq!(v, expected);
+}
+
+pub fn return_c<K, TI>(ker: &K, v: &[TI])
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    usize: AsPrimitive<TI>,
+{
+    fused_ops::<K, TI, _>(ker, v, &[], |_, _, c| c + 1.as_() - 1.as_())
+}
+
+pub fn return_c_plus_d<K, TI, TD>(ker: &K)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    TD: LADatum + AsPrimitive<TI>,
+    usize: AsPrimitive<TI> + AsPrimitive<TD>,
+{
+    let len = ker.mr() * ker.nr();
+    let v: Vec<TI> = (0..len).map(|f| f.as_()).collect();
+    let d: Vec<TD> = (0..len).map(|f| ((3 * f) % 7).as_()).collect();
+    fused_ops::<K, TI, _>(
+        ker,
+        &v,
+        &[FusedKerSpec::AddUnicast(mmm_stride_storage(&d, ker.nr()))],
+        |row, col, c| c + d[row * ker.nr() + col].as_(),
+    );
+}
+
+pub fn per_col<K, TI>(ker: &K, op: impl Fn(*const TI) -> FusedKerSpec<TI>, f: impl Fn(TI, TI) -> TI)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    usize: AsPrimitive<TI>,
+{
+    let len = ker.mr() * ker.nr();
+    let v: Vec<TI> = (0..len).map(|f| f.as_()).collect();
+    let bias: Vec<TI> = (0..ker.nr()).map(|f| (f + 1).as_()).collect();
+    fused_ops::<K, TI, _>(ker, &v, &[op(bias.as_ptr())], |_, col, c| f(bias[col], c))
+}
+
+pub fn per_row<K, TI>(ker: &K, op: impl Fn(*const TI) -> FusedKerSpec<TI>, f: impl Fn(TI, TI) -> TI)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    usize: AsPrimitive<TI>,
+{
+    let len = ker.mr() * ker.nr();
+    let v: Vec<TI> = (0..len).map(|f| f.as_()).collect();
+    let bias: Vec<TI> = (0..ker.mr()).map(|f| (f + 1).as_()).collect();
+    fused_ops::<K, TI, _>(ker, &v, &[op(bias.as_ptr())], |row, _, c| f(bias[row], c))
+}
+
+pub fn scalar<K, TI>(ker: &K, op: impl Fn(TI) -> FusedKerSpec<TI>, f: impl Fn(TI, TI) -> TI)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    isize: AsPrimitive<TI>,
+{
+    let len = ker.mr() * ker.nr();
+    let v: Vec<TI> = (0..len as isize).map(|f| (f - len as isize / 2).as_()).collect();
+    let five: TI = 5.as_();
+    fused_ops::<K, TI, _>(ker, &v, &[op(five)], |_, _, c| f(five, c))
+}
+
+pub fn return_c_add_row_col_product<K, TI>(ker: &K)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    usize: AsPrimitive<TI>,
+{
+    let len = ker.mr() * ker.nr();
+    let v: Vec<TI> = (0..len).map(|f| (f + 1).as_()).collect();
+    let rows: Vec<TI> = (0..ker.mr()).map(|f| (f + 3).as_()).collect();
+    let cols: Vec<TI> = (0..ker.nr()).map(|f| (f + 2).as_()).collect();
+    fused_ops::<K, TI, _>(
+        ker,
+        &v,
+        &[FusedKerSpec::AddRowColProducts(rows.as_ptr(), cols.as_ptr())],
+        |row, col, c| c + cols[col] * rows[row],
+    )
+}
+
+pub fn return_c_clear<K, TI>(ker: &K)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    usize: AsPrimitive<TI>,
+{
+    let len = ker.mr() * ker.nr();
+    let v: Vec<TI> = (0..len).map(|f| f.as_()).collect();
+    fused_ops::<K, TI, _>(ker, &v, &[FusedKerSpec::Clear], |_, _, _| 0.as_())
+}
+
+pub fn tile<K, TI>(ker: &K) -> BoxedStrategy<Vec<TI>>
+where
+    K: MatMatMulKer<Acc = TI>,
+    TI: LADatum,
+    i8: AsPrimitive<TI>,
+{
+    let len = ker.mr() * ker.nr();
+    proptest::collection::vec(any::<i8>().prop_map(|c| c.as_()), len..=len).boxed()
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/mod.rs
new file mode 100644
index 000000000..beb4fb25d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/mod.rs
@@ -0,0 +1,89 @@
+use crate::LADatum;
+
+#[macro_use]
+pub mod fuse;
+#[macro_use]
+pub mod frame;
+#[macro_use]
+pub mod packed_packed;
+#[macro_use]
+pub mod q_scale;
+#[macro_use]
+pub mod store;
+
+#[cfg(test)]
+macro_rules! test_mmm_kernel {
+    (f16, $ker:expr) => {
+        test_mmm_kernel_f16!($ker);
+    };
+    (f32, $ker:expr) => {
+        test_mmm_kernel_f32!($ker);
+    };
+    (f64, $ker:expr) => {
+        test_mmm_kernel_f64!($ker);
+    };
+    (i32, $ker:expr) => {
+        test_mmm_kernel_i32!($ker);
+    };
+}
+
+#[macro_export]
+macro_rules! test_mmm_kernel_f16 {
+    ($ker: expr) => {
+        mmm_packed_packed_tests!(&*$ker, f16f16:0);
+        mmm_frame_tests!(&*$ker, f16, f16, f16, f16);
+        mmm_kernel_fuse_tests!(&*$ker, f16, f16);
+        mmm_store_test!(&*$ker, f16);
+    };
+}
+
+#[macro_export]
+macro_rules! test_mmm_kernel_f32 {
+    ($ker: expr) => {
+        mmm_packed_packed_tests!(&*$ker, f32f32:0);
+        mmm_frame_tests!(&*$ker, f32, f32, f32, f32);
+        mmm_kernel_fuse_tests!(&*$ker, f32, f32);
+        mmm_store_test!(&*$ker, f32);
+    };
+}
+
+#[macro_export]
+macro_rules! test_mmm_kernel_f64 {
+    ($ker:expr) => {
+        mmm_packed_packed_tests!(&*$ker, f64f64:0);
+        mmm_frame_tests!(&*$ker, f64, f64, f64, f64);
+        mmm_kernel_fuse_tests!(&*$ker, f64, f64);
+        mmm_store_test!(&*$ker, f64);
+    };
+}
+
+#[macro_export]
+macro_rules! test_mmm_kernel_i32 {
+    ($ker: expr) => {
+        mmm_packed_packed_tests!(&*$ker, i32i32:0);
+        mmm_kernel_fuse_tests!(&*$ker, i32, i32);
+        mmm_frame_tests!(&*$ker, i32, i32, i32, i32);
+        mmm_q_scale_tests!(&*$ker);
+        mmm_store_test!(&*$ker, i32);
+    };
+}
+
+pub fn display_error<TC: LADatum>(v: &[TC], expected: &[TC], m: usize, n: usize) {
+    if v != expected {
+        for ixm in 0..m {
+            print!("|");
+            for ixn in 0..n {
+                use nu_ansi_term::Color::*;
+                let f = v[ixm * n + ixn];
+                let e = expected[ixm * n + ixn];
+                let color = if f != e { Red.bold() } else { Green.into() };
+                print!("{}|", color.paint(format!("{f:5}")));
+            }
+            print!("  #  ");
+            for ixn in 0..n {
+                print!("{:5} ", expected[ixm * n + ixn]);
+            }
+            println!();
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/packed_packed.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/packed_packed.rs
new file mode 100644
index 000000000..028ed2337
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/packed_packed.rs
@@ -0,0 +1,382 @@
+use crate::block_quant::PackedBlockQuantFormat;
+use crate::mmm::tests::display_error;
+use crate::mmm::{AsInputValue, FusedKerSpec, FusedSpec, MatMatMul, MatMatMulKer, OutputStoreKer};
+use crate::pack::PackedFormat;
+use proptest::collection::vec;
+use proptest::prelude::*;
+use std::fmt::Debug;
+use tract_data::internal::*;
+
+#[macro_export]
+macro_rules! mmm_packed_packed_tests {
+    ($ker:expr, $packing_id:ident : $packing: expr) => {
+        mod $packing_id {
+            use super::*;
+            #[allow(unused_imports)]
+            use proptest::prelude::*;
+            #[allow(unused_imports)]
+            use tract_data::prelude::f16;
+            use tract_data::prelude::*;
+            use tract_itertools::Itertools;
+            use $crate::frame::mmm::kernel::MatMatMulKer;
+            #[allow(unused_imports)]
+            use $crate::frame::mmm::tests::packed_packed::*;
+
+            mod fuse {
+                use super::*;
+
+                proptest::proptest! {
+                    #[test]
+                    fn prop(pb in arbitrary_problem(false, $ker, $packing)) {
+                        pb.check().unwrap()
+                    }
+                }
+
+                fn t(a: impl Into<Vec<f32>>, b: impl Into<Vec<f32>>) -> TractResult<()> {
+                    PackedPackedProblem::kernel($ker, $packing, a, b).check()
+                }
+
+                #[test]
+                fn packed_packed_1() -> TractResult<()> {
+                    t(vec![1f32; $ker.mr()], vec![1f32; $ker.nr()])
+                }
+
+                #[test]
+                fn packed_packed_2() -> TractResult<()> {
+                    t(vec![1f32; $ker.mr() * 2], vec![1f32; $ker.nr() * 2])
+                }
+
+                #[test]
+                fn packed_packed_13() -> TractResult<()> {
+                    t(vec![1f32; $ker.mr() * 13], vec![1f32; $ker.nr() * 13])
+                }
+
+                #[test]
+                fn packed_packed_a_scale() -> TractResult<()> {
+                    t((1..=$ker.mr() as i64).map(|x| x as f32).collect_vec(), vec![1f32; $ker.nr()])
+                }
+
+                #[test]
+                fn packed_packed_a_scale_times_2() -> TractResult<()> {
+                    t(
+                        (1..=2 * $ker.mr() as i64).map(|x| x as f32).collect_vec(),
+                        vec![1f32; $ker.nr() * 2],
+                    )
+                }
+
+                #[test]
+                fn packed_packed_empty() -> TractResult<()> {
+                    t(vec![0f32; 0], vec![0f32; 0])
+                }
+
+                #[test]
+                fn packed_packed_bug_1() -> TractResult<()> {
+                    t(vec![0f32; $ker.mr()], vec![0f32; $ker.nr()])
+                }
+
+                #[test]
+                fn packed_packed_bug_2() -> TractResult<()> {
+                    let mut a = vec![0f32; $ker.mr()];
+                    a[0] = 1.;
+                    let mut b = vec![0f32; $ker.nr()];
+                    b[0] = 1.;
+                    t(a, b)
+                }
+
+                #[test]
+                fn packed_packed_bug_3() -> TractResult<()> {
+                    if $ker.mr() >= 4 {
+                        let mut a = vec![0f32; 2 * $ker.mr()];
+                        let mut b = vec![0f32; 2 * $ker.nr()];
+                        a[2] = -0.7548828f32;
+                        a[3] = 0.23547363f32;
+                        b[2 * $ker.nr() - 1] = 0.93603516;
+                        t(a, b)?;
+                    }
+                    Ok(())
+                }
+
+                #[test]
+                fn packed_packed_bug_4() -> TractResult<()> {
+                    if $ker.mr() > 16 {
+                        let mut a = vec![0f32; $ker.mr()];
+                        let mut b = vec![0f32; $ker.nr()];
+                        a[16] = 1.;
+                        b[0] = 1.;
+                        t(a, b)?;
+                    }
+                    Ok(())
+                }
+            }
+
+            mod frame {
+                use super::*;
+
+                proptest::proptest! {
+                    #[test]
+                    fn prop(pb in arbitrary_problem(true, $ker, $packing)) {
+                        pb.check().unwrap()
+                    }
+                }
+
+                fn t(
+                    m: usize,
+                    n: usize,
+                    a: impl Into<Vec<f32>>,
+                    b: impl Into<Vec<f32>>,
+                ) -> TractResult<()> {
+                    PackedPackedProblem::frame($ker, $packing, m, n, a, b).check()
+                }
+
+                fn ti(
+                    m: usize,
+                    n: usize,
+                    a: impl Into<Vec<i32>>,
+                    b: impl Into<Vec<i32>>,
+                ) -> TractResult<()> {
+                    let a = a.into().into_iter().map(|i| i as f32).collect_vec();
+                    let b = b.into().into_iter().map(|i| i as f32).collect_vec();
+                    t(m, n, a, b)
+                }
+
+                #[test]
+                fn trivial_1x2() -> TractResult<()> {
+                    ti(1, 2, [0], [0, 0])
+                }
+
+                #[test]
+                fn packed_packed_empty() -> TractResult<()> {
+                    t($ker.mr(), $ker.nr(), [], [])
+                }
+
+                #[test]
+                fn packed_packed_empty_2() -> TractResult<()> {
+                    t(2 * $ker.mr(), 2 * $ker.nr(), [], [])
+                }
+
+                #[test]
+                fn mat_mul_1() -> TractResult<()> {
+                    ti(3, 2, [-3, 3, 5, -5, 6, 0, -6, -5, 0, 0, 9, 7], [-8, 5, 5, -3, 5, 7, -8, -1])
+                }
+
+                #[test]
+                fn mat_mul_2() -> TractResult<()> {
+                    ti(1, 3, [122, 82], [0, 0, 37, 0, 0, 57])
+                }
+            }
+        }
+    };
+}
+
+#[derive(Debug, new)]
+pub struct PackedPackedProblem<K>
+where
+    K: MatMatMulKer,
+{
+    pub frame_test: Option<(usize, usize)>,
+    pub ker: K,
+    pub packing: usize,
+    pub a: Vec<f32>,
+    pub b: Vec<f32>,
+}
+
+pub fn arbitrary_problem<K: MatMatMulKer>(
+    frame_test: bool,
+    ker: &K,
+    packing: usize,
+) -> BoxedStrategy<PackedPackedProblem<K>> {
+    let (mr, nr) = (ker.mr(), ker.nr());
+    let item_range = if ker.internal_type().is_integer() { (-5f32)..5f32 } else { (-1f32)..1f32 };
+    let (m_range, n_range) =
+        if frame_test { (1usize..3 * mr, 1usize..3 * nr) } else { (mr..mr + 1, nr..nr + 1) };
+    let ker = ker.clone();
+    (m_range, 0usize..40, n_range)
+        .prop_flat_map(move |(m, k, n)| {
+            (
+                vec(item_range.clone(), k * m..=k * m),
+                vec(item_range.clone(), k * n..=k * n),
+                Just((m, n)),
+            )
+        })
+        .prop_map(move |(mut a, mut b, mn)| {
+            a.reverse();
+            b.reverse();
+            PackedPackedProblem {
+                frame_test: Some(mn).filter(|_| frame_test),
+                ker: ker.clone(),
+                packing,
+                a,
+                b,
+            }
+        })
+        .boxed()
+}
+
+impl<K: MatMatMulKer> PackedPackedProblem<K> {
+    pub fn kernel(
+        ker: &K,
+        packing: usize,
+        a: impl Into<Vec<f32>>,
+        b: impl Into<Vec<f32>>,
+    ) -> PackedPackedProblem<K> {
+        PackedPackedProblem {
+            frame_test: None,
+            ker: ker.clone(),
+            packing,
+            a: a.into(),
+            b: b.into(),
+        }
+    }
+
+    pub fn frame(
+        ker: &K,
+        packing: usize,
+        m: usize,
+        n: usize,
+        a: impl Into<Vec<f32>>,
+        b: impl Into<Vec<f32>>,
+    ) -> PackedPackedProblem<K> {
+        PackedPackedProblem {
+            frame_test: Some((m, n)),
+            ker: ker.clone(),
+            packing,
+            a: a.into(),
+            b: b.into(),
+        }
+    }
+
+    pub fn mkn(&self) -> (usize, usize, usize) {
+        let (m, n) = self.frame_test.unwrap_or((self.ker.mr(), self.ker.nr()));
+        assert!(m != 0 && n != 0);
+        let k = self.a.len() / m;
+        assert_eq!(self.b.len() / n, k);
+        (m, k, n)
+    }
+
+    pub fn padded_inputs(&self) -> TractResult<(Tensor, Tensor)> {
+        let (pack_a, pack_b) = &self.ker.packings()[self.packing];
+        assert!(pack_b.k_alignment() == 1);
+        let (m, k, n) = self.mkn();
+        let k_aligned = k.next_multiple_of(pack_a.k_alignment());
+
+        let mut a = Tensor::zero::<f32>(&[m, k_aligned])?;
+        for row in 0..m {
+            for col in 0..k {
+                a.to_array_view_mut()?[[row, col]] = self.a[col + k * row];
+            }
+        }
+        if let Some(pf) = pack_a.downcast_ref::<PackedFormat>() {
+            a = a.cast_to_dt(pf.dt)?.into_owned();
+        }
+        let mut b = Tensor::zero::<f32>(&[k_aligned, n])?;
+        for row in 0..k {
+            for col in 0..n {
+                b.to_array_view_mut()?[[row, col]] = self.b[col + n * row];
+            }
+        }
+        if let Some(pf) = pack_b.downcast_ref::<PackedFormat>() {
+            b = b.cast_to_dt(pf.dt)?.into_owned();
+        }
+
+        Ok((a, b))
+    }
+
+    pub fn reference(&self) -> TractResult<Tensor> {
+        let (m, k, n) = self.mkn();
+        let pack_a = &self.ker.packings()[self.packing].0;
+        let (mut a, b) = self.padded_inputs()?;
+        let k_aligned = k.next_multiple_of(pack_a.k_alignment());
+        if let Some(pbqf) = pack_a.downcast_ref::<PackedBlockQuantFormat>() {
+            a = pbqf.simulate_precision_loss(a, 1)?;
+        };
+        let mut c = Tensor::zero::<K::Acc>(&[m, n])?;
+
+        let a = a.cast_to::<K::Acc>()?;
+        let a = a.as_slice::<K::Acc>()?;
+        let b = b.cast_to::<K::Acc>()?;
+        let b = b.as_slice::<K::Acc>()?;
+        let mut view = c.to_array_view_mut::<K::Acc>()?.into_dimensionality()?;
+        for ix_m in 0..m {
+            for ix_n in 0..n {
+                for ix_k in 0..k {
+                    let a = a[ix_k + k_aligned * ix_m];
+                    let b = b[ix_n + n * ix_k];
+                    view[(ix_m, ix_n)] += a * b;
+                }
+            }
+        }
+        Ok(c)
+    }
+
+    pub fn run(&self) -> TractResult<Tensor> {
+        let (m, k, n) = self.mkn();
+        let (pack_a, pack_b) = &self.ker.packings()[self.packing];
+        assert!(pack_b.k_alignment() == 1);
+        let k_aligned = k.next_multiple_of(pack_a.k_alignment());
+
+        let (a, b) = self.padded_inputs()?;
+        let pa = pack_a.prepare_one(&a, 1, 0)?;
+        let pb = pack_b.prepare_one(&b, 0, 1)?;
+
+        let mut v = unsafe { Tensor::uninitialized_dt(self.ker.internal_type(), &[m, n])? };
+        let item_size = self.ker.internal_type().size_of();
+
+        if self.frame_test.is_some() {
+            unsafe {
+                let c = self.ker.c_view(Some(0), Some(1)).wrap(&v.view_mut());
+                let ops = tvec!(
+                    FusedSpec::AddMatMul {
+                        a: AsInputValue::Borrowed(&*pa),
+                        b: AsInputValue::Borrowed(&*pb),
+                        packing: self.packing
+                    },
+                    FusedSpec::Store(c)
+                );
+                self.ker.run(m, n, &ops)?;
+            }
+        } else {
+            let c = OutputStoreKer {
+                ptr: v.as_bytes_mut().as_mut_ptr(),
+                row_byte_stride: (item_size * self.ker.nr()) as isize,
+                col_byte_stride: item_size as isize,
+                item_size,
+            };
+
+            let non_linear_ops = tvec!(
+                FusedKerSpec::Clear,
+                FusedKerSpec::AddMatMul {
+                    k: k_aligned,
+                    pa: pa.panel_bytes(0, None)?,
+                    pb: pb.panel_bytes(0, None)?,
+                    packing: self.packing
+                },
+                FusedKerSpec::Store(c),
+                FusedKerSpec::Done
+            );
+            let err = self.ker.kernel(&non_linear_ops);
+            assert_eq!(err, 0);
+        }
+        Ok(v)
+    }
+
+    pub fn check(&self) -> TractResult<()> {
+        if !self.ker.is_supported_here() {
+            return Ok(());
+        }
+        let expected = self.reference()?;
+        let found = self.run()?;
+        let app = if K::Acc::datum_type() == f16::datum_type() {
+            Approximation::SuperApproximate
+        } else {
+            Approximation::Approximate
+        };
+        let result = found.close_enough(&expected, app);
+        if result.is_err() {
+            let exp = expected.as_slice::<K::Acc>()?;
+            let found = found.as_slice::<K::Acc>()?;
+            let (m, _, n) = self.mkn();
+            display_error(found, exp, m, n);
+        }
+        result
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/q_scale.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/q_scale.rs
new file mode 100644
index 000000000..7d7678e05
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/q_scale.rs
@@ -0,0 +1,176 @@
+use crate::frame::mmm::fuse::RoundingPolicy;
+use crate::frame::mmm::MatMatMulKer;
+use crate::generic::rounding::ScaleShiftAndRound;
+use crate::mmm::{FusedKerSpec, FusedSpec};
+use crate::Scaler;
+use proptest::prelude::*;
+
+use super::fuse::fused_ops;
+
+#[derive(Debug, new)]
+pub struct QScaleProblem<K>
+where
+    K: MatMatMulKer<Acc = i32>,
+{
+    pub ker: K,
+    pub c: Vec<i32>,
+    pub scaler: Scaler,
+    pub boo: std::marker::PhantomData<K>,
+}
+
+pub fn arbitrary_qscale_problem<K: MatMatMulKer<Acc = i32>>(
+    ker: &K,
+) -> BoxedStrategy<QScaleProblem<K>> {
+    use RoundingPolicy::*;
+    let ker = ker.clone();
+    let len = ker.mr() * ker.nr();
+    (
+        proptest::collection::vec(-20i32..20, len..=len),
+        -5i32..5,
+        prop_oneof!(Just(1f32), 0f32..1f32),
+        proptest::prop_oneof![
+            Just(Zero),
+            Just(Away),
+            Just(PlusInf),
+            Just(MinusInf),
+            Just(Odd),
+            Just(Even)
+        ],
+    )
+        .prop_map(move |(c, scale_pot, scale_mult, policy)| QScaleProblem {
+            ker: ker.clone(),
+            c,
+            scaler: Scaler::new(scale_mult * 2f32.powi(scale_pot), policy),
+            boo: std::marker::PhantomData,
+        })
+        .boxed()
+}
+
+impl<K> QScaleProblem<K>
+where
+    K: MatMatMulKer<Acc = i32>,
+{
+    pub fn run(&self) {
+        if !self.ker.is_supported_here() {
+            return;
+        }
+        if let FusedSpec::QScale(shift, policy, mult) = self.scaler.as_fused_spec() {
+            fused_ops::<K, i32, _>(
+                &self.ker,
+                &self.c,
+                &[FusedKerSpec::QScale(shift, policy, mult)],
+                |_, _, c| c.q_scale(self.scaler),
+            )
+        } else if let FusedSpec::RoundingShiftRight(shift, policy) = self.scaler.as_fused_spec() {
+            fused_ops::<K, i32, _>(
+                &self.ker,
+                &self.c,
+                &[FusedKerSpec::RoundingShiftRight(shift, policy)],
+                |_, _, c| c.q_shr(shift, policy),
+            )
+        } else if let FusedSpec::ShiftLeft(shift) = self.scaler.as_fused_spec() {
+            fused_ops::<K, i32, _>(
+                &self.ker,
+                &self.c,
+                &[FusedKerSpec::ShiftLeft(shift)],
+                |_, _, c| c.q_shl(shift),
+            )
+        } else {
+            unreachable!()
+        }
+    }
+}
+
+pub fn return_c_scale_bigpot<K>(ker: &K)
+where
+    K: MatMatMulKer<Acc = i32>,
+{
+    let ker = ker.clone();
+    let len = ker.mr() * ker.nr();
+    let v: Vec<i32> = (-(len as i32) / 2..).take(len).collect();
+    fused_ops::<K, i32, _>(&ker, &v, &[FusedKerSpec::ShiftLeft(1)], |_, _, c| c.q_shl(1))
+}
+
+#[macro_export]
+macro_rules! mmm_q_scale_tests {
+    ($ker:expr) => {
+        use $crate::frame::mmm::fuse::RoundingPolicy;
+        use $crate::frame::mmm::tests::q_scale::arbitrary_qscale_problem;
+        use $crate::frame::mmm::tests::q_scale::QScaleProblem;
+        use $crate::frame::mmm::MatMatMulKer;
+        use $crate::generic::Scaler;
+        // FIXME: Scaler should be arbitrary
+        macro_rules! test_q_scale {
+            ($policy: ident) => {
+                paste! {
+                    #[test]
+                    fn [<return_q_scale_halfpos_ $policy:lower>]() {
+                        let ker = $ker;
+                        let len = (ker.mr() * ker.nr()) as i64;
+                        let v = (0..len).map(|i| (i - len / 2) as i32).collect();
+                        QScaleProblem::new(ker.clone(), v, Scaler::new(0.5f32, RoundingPolicy::$policy)).run()
+                    }
+
+                    #[test]
+                    fn [<return_q_scale_halfneg_ $policy:lower>]() {
+                        let ker = $ker;
+                        let len = (ker.mr() * ker.nr()) as i64;
+                        let v = (0..len).map(|i| (i - len / 2) as i32).collect();
+                        QScaleProblem::new(ker.clone(), v, Scaler::new(-0.5f32, RoundingPolicy::$policy)).run()
+                    }
+
+                    #[test]
+                    fn [<return_q_scale_pot_ $policy:lower>]() {
+                        let ker = $ker;
+                        let len = (ker.mr() * ker.nr()) as i64;
+                        let v = (0..len).map(|i| (i - len / 2) as i32).collect();
+                        QScaleProblem::new(ker.clone(), v, Scaler::new(0.25f32, RoundingPolicy::$policy)).run()
+                    }
+
+                    #[test]
+                    fn [<return_q_scale_nonpot_ $policy:lower>]() {
+                        let ker = $ker;
+                        let len = (ker.mr() * ker.nr()) as i64;
+                        let v = (0..len).map(|i| (i - len / 2) as i32).collect();
+                        QScaleProblem::new(ker.clone(), v, Scaler::new(1f32 / 5., RoundingPolicy::$policy)).run()
+                    }
+
+                    #[test]
+                    fn [<return_q_scale_bigpot_ $policy:lower>]() {
+                        let ker = $ker;
+                        let len = (ker.mr() * ker.nr()) as i64;
+                        let v = (0..len).map(|i| (i - len / 2) as i32).collect();
+                        QScaleProblem::new(ker.clone(), v, Scaler::new(4f32, RoundingPolicy::$policy)).run()
+                    }
+
+                    #[test]
+                    fn [<return_q_scale_bignonpot_ $policy:lower>]() {
+                        let ker = $ker;
+                        let len = (ker.mr() * ker.nr()) as i64;
+                        let v = (0..len).map(|i| (i - len / 2) as i32).collect();
+                        QScaleProblem::new(ker.clone(), v, Scaler::new(14., RoundingPolicy::$policy)).run()
+                    }
+                }
+            }
+        }
+
+        test_q_scale!(Zero);
+        test_q_scale!(Away);
+        test_q_scale!(MinusInf);
+        test_q_scale!(PlusInf);
+        test_q_scale!(Even);
+        test_q_scale!(Odd);
+
+        proptest::proptest! {
+            #[test]
+            fn return_q_scale_prop(pb in arbitrary_qscale_problem($ker)) {
+                pb.run()
+            }
+        }
+
+        #[test]
+        fn return_c_scale_bigpot() {
+            $crate::frame::mmm::tests::q_scale::return_c_scale_bigpot::<_>($ker)
+        }
+    };
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/store.rs b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/store.rs
new file mode 100644
index 000000000..a44d533cd
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mmm/tests/store.rs
@@ -0,0 +1,131 @@
+use crate::frame::mmm::fuse::FusedKerSpec;
+use crate::frame::mmm::storage::*;
+use crate::frame::mmm::tests::display_error;
+use crate::frame::mmm::*;
+use crate::LADatum;
+use num_traits::Bounded;
+use tract_data::internal::*;
+use tract_itertools::Itertools;
+use tract_ndarray::Axis;
+
+#[macro_export]
+macro_rules! mmm_store_test {
+    ($ker:expr, $tc:ident) => {
+        paste! {
+            mod [<store_$tc>] {
+                #[allow(unused_imports)]
+                use tract_data::prelude::f16;
+                use $crate::frame::mmm::tests::store::StoreLayout;
+
+                #[test] fn store_zeros() {
+                    $crate::frame::mmm::tests::store::store_zeros::<_,$tc,_>($ker);
+                }
+
+                #[test] fn store_col_major() {
+                    $crate::frame::mmm::tests::store::store_pattern::<_,$tc,_>($ker, StoreLayout::ColMajor);
+                }
+
+                #[test] fn store_row_major() {
+                    $crate::frame::mmm::tests::store::store_pattern::<_,$tc,_>($ker, StoreLayout::RowMajor);
+                }
+
+                #[test] fn store_arbitrary() {
+                    $crate::frame::mmm::tests::store::store_pattern::<_,$tc,_>($ker, StoreLayout::Arbitrary);
+                }
+            }
+        }
+    };
+}
+
+pub fn mmm_stride_storage<T: Copy>(v: &[T], rsc: usize) -> OutputStoreKer {
+    OutputStoreKer {
+        ptr: v.as_ptr() as _,
+        row_byte_stride: (std::mem::size_of::<T>() * rsc) as isize,
+        col_byte_stride: std::mem::size_of::<T>() as isize,
+        item_size: std::mem::size_of::<T>(),
+    }
+}
+
+pub fn store_zeros<K, TC, TI>(ker: &K)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TC: LADatum,
+    TI: LADatum + Bounded + PartialEq,
+{
+    if !ker.is_supported_here() {
+        return;
+    }
+    let v = vec![TC::max_value(); ker.mr() * ker.nr()];
+    let c = mmm_stride_storage(&v, ker.nr());
+    let non_linear = tvec![FusedKerSpec::Clear, FusedKerSpec::Store(c), FusedKerSpec::Done];
+    let err = ker.kernel(&non_linear);
+    assert_eq!(err, 0);
+    let expected = vec![TC::zero(); v.len()];
+    display_error(&v, &expected, ker.mr(), ker.nr());
+    assert_eq!(v, expected);
+}
+
+pub enum StoreLayout {
+    ColMajor,
+    RowMajor,
+    Arbitrary,
+}
+
+pub fn store_pattern<K, TC, TI>(ker: &K, layout: StoreLayout)
+where
+    K: MatMatMulKer<Acc = TI>,
+    TC: LADatum,
+    TI: LADatum + Bounded + PartialEq,
+{
+    if !ker.is_supported_here() {
+        return;
+    }
+    let (mr, nr) = (ker.mr(), ker.nr());
+    let pattern = tensor1(&(0..).take(mr * nr).collect_vec())
+        .cast_to::<TI>()
+        .unwrap()
+        .into_owned()
+        .into_shape(&[mr, nr])
+        .unwrap();
+    let pattern_col_major = pattern.clone().permute_axes(&[1, 0]).unwrap();
+    let size_of_tc = std::mem::size_of::<TC>();
+    let (row_stride, col_stride, result_size) = match layout {
+        StoreLayout::RowMajor => (nr, 1, mr * nr),
+        StoreLayout::ColMajor => (1, mr, mr * nr),
+        // like row major, but storing every other third column
+        StoreLayout::Arbitrary => (nr * 3, 3, mr * nr * 3),
+    };
+    let mut result = tensor0(TC::max_value()).broadcast_to_shape(&[result_size]).unwrap();
+    let non_linear = tvec![
+        unsafe {
+            FusedKerSpec::LoadTile(pattern_col_major.as_ptr_unchecked(), pattern.as_ptr_unchecked())
+        },
+        FusedKerSpec::Store(OutputStoreKer {
+            ptr: result.as_bytes_mut().as_mut_ptr(),
+            row_byte_stride: (size_of_tc * row_stride) as isize,
+            col_byte_stride: (size_of_tc * col_stride) as isize,
+            item_size: size_of_tc,
+        }),
+        FusedKerSpec::Done
+    ];
+    let err = ker.kernel(&non_linear);
+    assert_eq!(err, 0);
+    let expected = pattern.cast_to::<TC>().unwrap().into_owned();
+    let result = match layout {
+        StoreLayout::RowMajor => result,
+        StoreLayout::ColMajor => {
+            result.into_shape(&[ker.nr(), ker.mr()]).unwrap().permute_axes(&[1, 0]).unwrap()
+        }
+        StoreLayout::Arbitrary => result
+            .into_array::<TC>()
+            .unwrap()
+            .into_shape_with_order((mr, nr, 3))
+            .unwrap()
+            .index_axis_move(Axis(2), 0)
+            .into_tensor(),
+    };
+    let expected = expected.as_slice::<TC>().unwrap();
+    let result = result.as_slice::<TC>().unwrap();
+    display_error(result, expected, ker.mr(), ker.nr());
+    assert_eq!(result, expected);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/mod.rs
new file mode 100644
index 000000000..8528eea35
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/mod.rs
@@ -0,0 +1,25 @@
+#[macro_use]
+pub mod block_quant;
+#[macro_use]
+pub mod element_wise;
+pub mod element_wise_helper;
+#[macro_use]
+pub mod unicast;
+#[macro_use]
+pub mod by_scalar;
+#[macro_use]
+pub mod leaky_relu;
+#[macro_use]
+pub mod lut;
+#[macro_use]
+pub mod mmm;
+#[macro_use]
+pub mod pack;
+#[macro_use]
+pub mod reduce;
+#[macro_use]
+pub mod sigmoid;
+#[macro_use]
+pub mod tanh;
+#[macro_use]
+pub mod weights;
diff --git a/vendor/tract-linalg-0.22.1/src/frame/pack.rs b/vendor/tract-linalg-0.22.1/src/frame/pack.rs
new file mode 100644
index 000000000..ffd65f3d4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/pack.rs
@@ -0,0 +1,1015 @@
+use std::alloc::Layout;
+use std::fmt::{Debug, Display};
+use std::marker::PhantomData;
+use std::ops::Range;
+use std::sync::Arc;
+use tract_data::internal::*;
+
+use crate::mmm::{EagerPackedInput, MMMInputFormat, MMMInputValue, PackedOpaqueFact};
+
+use crate::WeightType;
+
+#[derive(Clone, Eq, PartialEq, Hash)]
+pub struct PackedFormat {
+    pub dt: DatumType,
+    pub r: usize,
+    pub alignment_bytes: usize,
+    pub end_padding_record: usize,
+}
+
+impl MMMInputFormat for PackedFormat {
+    fn prepare_tensor(&self, t: &Tensor, k_axis: usize, mn_axis: usize) -> TractResult<Tensor> {
+        let packed = PackedFormat::pack_tensor(self, t, k_axis, mn_axis)?;
+        Ok(tensor0(Opaque(Arc::new(packed))))
+    }
+
+    fn prepare_one(
+        &self,
+        t: &Tensor,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        PackedFormat::pack_tensor(self, t, k_axis, mn_axis)
+    }
+
+    fn precursor(&self) -> WeightType {
+        WeightType::Plain(self.dt)
+    }
+
+    fn r(&self) -> usize {
+        self.r
+    }
+
+    fn k_alignment(&self) -> usize {
+        1
+    }
+
+    fn same_as(&self, other: &dyn MMMInputFormat) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|other| self == other)
+    }
+
+    #[allow(clippy::collapsible_if)]
+    fn merge_with<'o, 'a: 'o, 'b: 'o>(
+        &'a self,
+        other: &'b dyn MMMInputFormat,
+    ) -> Option<&'o dyn MMMInputFormat> {
+        if let Some(other) = other.downcast_ref::<PackedFormat>() {
+            if self.r == other.r && self.dt == other.dt {
+                if self.alignment_bytes % other.alignment_bytes == 0
+                    && self.end_padding_record >= other.end_padding_record
+                {
+                    return Some(self);
+                }
+                if other.alignment_bytes % self.alignment_bytes == 0
+                    && other.end_padding_record >= self.end_padding_record
+                {
+                    return Some(other);
+                }
+            }
+        }
+        None
+    }
+
+    fn mem_size(&self, k: TDim, mn: TDim) -> TDim {
+        self.len(k, mn) * self.dt.size_of()
+    }
+
+    fn extract_at_mn_f16(
+        &self,
+        data: &EagerPackedInput,
+        mn: usize,
+        slice: &mut [f16],
+    ) -> TractResult<()> {
+        ensure!(data.format().same_as(self));
+        ensure!(self.len(data.k(), data.mn()) * self.dt.size_of() == data.packed.len());
+        unsafe {
+            let ptr = data.packed.as_ptr().add(
+                (self.single_panel_len(data.k()) * (mn / self.r) + mn % self.r) * self.dt.size_of(),
+            );
+            for (i, slot) in slice.iter_mut().enumerate() {
+                let ptr = ptr.add(i * self.dt.size_of() * self.r);
+                *slot = if self.dt == f16::datum_type() {
+                    *(ptr as *const f16)
+                } else if self.dt == f32::datum_type() {
+                    f16::from_f32(*(ptr as *const f32))
+                } else {
+                    bail!("Unexpected DT {:?}", self.dt)
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn extract_at_mn_f32(
+        &self,
+        data: &EagerPackedInput,
+        mn: usize,
+        slice: &mut [f32],
+    ) -> TractResult<()> {
+        ensure!(data.format().same_as(self));
+        ensure!(self.len(data.k(), data.mn()) * self.dt.size_of() == data.packed.len());
+        unsafe {
+            let ptr = data.packed.as_ptr().add(
+                (self.single_panel_len(data.k()) * (mn / self.r) + mn % self.r) * self.dt.size_of(),
+            );
+            for (i, slot) in slice.iter_mut().enumerate() {
+                let ptr = ptr.add(i * self.dt.size_of() * self.r);
+                *slot = if self.dt == f16::datum_type() {
+                    (*(ptr as *const f16)).to_f32()
+                } else if self.dt == f32::datum_type() {
+                    *(ptr as *const f32)
+                } else {
+                    bail!("Unexpected DT {:?}", self.dt)
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+impl Display for PackedFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Packed{:?}[{}]", self.dt, self.r)
+    }
+}
+
+impl Debug for PackedFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Packed{:?}[{}]@{}+{}",
+            self.dt, self.r, self.alignment_bytes, self.end_padding_record
+        )
+    }
+}
+
+impl PackedFormat {
+    pub const fn new(dt: DatumType, nr: usize, alignment_bytes: usize) -> PackedFormat {
+        PackedFormat { dt, r: nr, alignment_bytes, end_padding_record: 1 }
+    }
+
+    pub const fn with_end_padding_record(self, end_padding_record: usize) -> Self {
+        PackedFormat { end_padding_record, ..self }
+    }
+
+    #[inline]
+    pub fn align(self, alignment: usize) -> Self {
+        Self { alignment_bytes: alignment, ..self }
+    }
+
+    #[inline]
+    pub fn alignment(&self) -> usize {
+        self.alignment_bytes
+    }
+
+    #[inline]
+    pub fn panel_width(&self) -> usize {
+        self.r
+    }
+
+    #[inline]
+    pub fn len<D: DimLike>(&self, k: D, n: D) -> D {
+        n.divceil(self.r) * self.single_panel_len(k)
+    }
+
+    #[inline]
+    pub fn single_panel_len<D: DimLike>(&self, k: D) -> D {
+        ((k + self.end_padding_record) * self.r).divceil(self.alignment()) * self.alignment()
+    }
+
+    #[inline]
+    pub fn single_panel_layout(&self, k: usize, item_size: usize) -> Layout {
+        Layout::from_size_align(self.single_panel_len(k) * item_size, self.alignment()).unwrap()
+    }
+
+    pub fn pack_tensor(
+        &self,
+        t: &Tensor,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        ensure!(t.datum_type().is_copy());
+        ensure!(
+            t.datum_type().unquantized() == self.dt.unquantized(),
+            "Attempting to pack for {self} tensor {t:?}"
+        );
+        let k = t.shape()[k_axis];
+        let mn = t.shape()[mn_axis];
+        let packed_len = self.len(k, mn);
+        let panel_len = self.single_panel_len(k);
+        let panel_bytes = panel_len * t.datum_type().size_of();
+        let strides = t.strides();
+        unsafe {
+            let mut packed = Blob::new_for_size_and_align(
+                t.datum_type().size_of() * packed_len,
+                self.alignment_bytes,
+            );
+            if cfg!(debug_assertions) {
+                packed.as_bytes_mut().fill(0u8);
+            }
+            dispatch_copy!(Self::pack_t(t.datum_type())(
+                self,
+                packed.as_mut_ptr() as _,
+                t.as_ptr_unchecked(),
+                mn,
+                strides[k_axis],
+                strides[mn_axis],
+                0..k,
+                0..mn
+            ));
+            Ok(Box::new(EagerPackedInput {
+                fact: PackedOpaqueFact { format: Box::new(self.clone()), mn: mn.to_dim(), k },
+                packed: packed.into(),
+                panel_bytes,
+                mn,
+            }))
+        }
+    }
+
+    pub fn pack_tensor_view(
+        &self,
+        t: &TensorView,
+        k_axis: usize,
+        mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        ensure!(
+            t.datum_type().unquantized() == self.dt.unquantized(),
+            "Attempting to pack for {self} tensor view {t:?}"
+        );
+        let k = t.shape()[k_axis];
+        let mn = t.shape()[mn_axis];
+        let packed_len = self.len(k, mn);
+        let panel_len = self.single_panel_len(k);
+        let panel_bytes = panel_len * t.datum_type().size_of();
+        let strides = t.strides();
+        unsafe {
+            let mut packed = Blob::new_for_size_and_align(
+                t.datum_type().size_of() * packed_len,
+                self.alignment_bytes,
+            );
+            if cfg!(debug_assertions) {
+                packed.as_bytes_mut().fill(0u8);
+            }
+            dispatch_copy!(Self::pack_t(t.datum_type())(
+                self,
+                packed.as_mut_ptr() as _,
+                t.as_ptr_unchecked(),
+                mn,
+                strides[k_axis],
+                strides[mn_axis],
+                0..k,
+                0..mn
+            ));
+            Ok(Box::new(EagerPackedInput {
+                fact: PackedOpaqueFact { format: Box::new(self.clone()), mn: mn.to_dim(), k },
+                packed: packed.into(),
+                panel_bytes,
+                mn,
+            }))
+        }
+    }
+
+    pub unsafe fn pack<'a, 'b>(
+        &self,
+        pb: impl std::borrow::BorrowMut<TensorView<'a>>,
+        b: impl std::borrow::Borrow<TensorView<'b>>,
+        k_axis: usize,
+        mn_axis: usize,
+    ) {
+        let k = b.borrow().shape()[k_axis];
+        let mn = b.borrow().shape()[mn_axis];
+        unsafe { self.pack_segment(pb, b, k_axis, mn_axis, 0..k, 0..mn) };
+    }
+
+
+    #[allow(clippy::too_many_arguments)]
+    #[rustfmt::skip]
+    pub unsafe fn pack_t<T: Datum + Copy>(
+        &self,
+        pb: *mut T,
+        b: *const T,
+        mn: usize,
+        k_stride: isize,
+        mn_stride: isize,
+        k_range: Range<usize>,
+        mn_range: Range<usize>,
+        ) { unsafe {
+        if k_range.len() == 0 || mn_range.len() == 0 {
+            return
+        }
+        if self.r == 1 && k_stride == 1 && mn == 1 {
+            pb.copy_from_nonoverlapping(b.add(k_range.start), k_range.len())
+        } else if mn_stride == 1 {
+            let size_of = T::datum_type().size_of();
+            let rbytes = self.r * size_of;
+            let mn_valid_end = mn_range.end.min(mn);
+            let mn_range_bytes = mn_range.start * size_of..mn_valid_end * size_of;
+            let k_stride_bytes = k_stride * size_of as isize;
+            let bb = b as *const u8;
+            let pbb = pb as *mut u8;
+            let panel_len = self.single_panel_len(k_range.len()) * size_of;
+            match rbytes {
+                16 => pack_mn_major::<[u8; 16]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
+                24 => pack_mn_major::<[u8; 24]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
+                32 => pack_mn_major::<[u8; 32]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
+                48 => pack_mn_major::<[u8; 48]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
+                64 => pack_mn_major::<[u8; 64]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
+                _ => {
+                    let mut packer = self.write_with_k_outer(pb, k_range.len(), mn_range.len());
+                    for k in k_range {
+                        for x in mn_range.start..mn_valid_end {
+                            packer.write(*b.offset(x as isize + k_stride * k as isize))
+                        }
+                        for _x in mn_valid_end..mn_range.end {
+                            packer.write(T::default())
+                        }
+                    }
+                }
+            }
+        } else if k_stride == 1 {
+            let mut packer = self.write_with_k_inner(pb, k_range.len(), mn);
+            let mn_valid_end = mn_range.end.min(mn);
+            for x in mn_range.start..mn_valid_end {
+                for k in k_range.clone() {
+                    packer.write(*b.offset(x as isize * mn_stride + k as isize))
+                }
+            }
+            // just ignore invalid mn_range
+        } else {
+            let mut packer = self.write_with_k_outer(pb, k_range.len(), mn);
+            let mn_valid_end = mn_range.end.min(mn);
+            for k in k_range {
+                for x in mn_range.start..mn_valid_end {
+                    packer.write(*b.offset(x as isize * mn_stride + k_stride * k as isize))
+                }
+                for _x in mn_valid_end..mn_range.end {
+                    packer.write(T::default())
+                }
+            }
+        }
+    }}
+
+    #[inline]
+    pub unsafe fn pack_segment<'a, 'b>(
+        &self,
+        mut pb: impl std::borrow::BorrowMut<TensorView<'a>>,
+        b: impl std::borrow::Borrow<TensorView<'b>>,
+        k_axis: usize,
+        mn_axis: usize,
+        k_range: Range<usize>,
+        mn_range: Range<usize>,
+    ) {
+        debug_assert!(pb.borrow().len() >= self.len(k_range.len(), mn_range.len()));
+        let pb = pb.borrow_mut();
+        let b = b.borrow();
+        let dt = pb.datum_type();
+        unsafe {
+            dispatch_copy!(Self::pack_t(dt)(
+                self,
+                pb.as_ptr_mut_unchecked(),
+                b.as_ptr_unchecked(),
+                b.shape()[mn_axis],
+                b.strides()[k_axis],
+                b.strides()[mn_axis],
+                k_range,
+                mn_range
+            ));
+        }
+    }
+
+    pub fn write_with_k_outer<'p, T: Copy + Debug>(
+        &self,
+        pb: *mut T,
+        k: usize,
+        mn: usize,
+    ) -> KOutWriter<'p, T> {
+        KOutWriter::new(pb, self.r, self.single_panel_len(k), mn, k)
+    }
+
+    pub fn write_single_panel_with_k_outer<'p, T: Copy + Debug>(
+        &self,
+        pb: *mut T,
+    ) -> KOutSinglePanelWriter<'p, T> {
+        KOutSinglePanelWriter::new(pb)
+    }
+
+    pub fn write_with_k_inner<'p, T: Copy + Debug>(
+        &self,
+        pb: *mut T,
+        k: usize,
+        mn: usize,
+    ) -> KInWriter<'p, T> {
+        let panel_len = self.single_panel_len(k);
+        KInWriter::new(pb, panel_len, self.r, mn, k)
+    }
+}
+
+pub trait PackingWriter<T: Copy> {
+    fn write(&mut self, t: T);
+}
+
+#[derive(Debug)]
+pub struct KOutSinglePanelWriter<'p, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    ptr: *mut T,
+    _phantom: PhantomData<&'p T>,
+}
+
+impl<'p, T> KOutSinglePanelWriter<'p, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    pub fn new(ptr: *mut T) -> KOutSinglePanelWriter<'p, T> {
+        KOutSinglePanelWriter { ptr, _phantom: PhantomData }
+    }
+}
+
+impl<T> PackingWriter<T> for KOutSinglePanelWriter<'_, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    #[inline(always)]
+    fn write(&mut self, t: T) {
+        unsafe {
+            *self.ptr = t;
+            self.ptr = self.ptr.offset(1);
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct KOutWriter<'p, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    ptr: *mut T,
+    panels: usize,
+    panel_width: usize,
+    last_panel_width: usize,
+    remain: usize,
+    current_panel: usize,
+    next_panel: isize,
+    next_lane: isize,
+    _phantom: PhantomData<&'p T>,
+}
+
+impl<'p, T> KOutWriter<'p, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    pub fn new(
+        ptr: *mut T,
+        panel_width: usize,
+        panel_len: usize,
+        mn: usize,
+        _k: usize,
+    ) -> KOutWriter<'p, T> {
+        let panels = mn.divceil(panel_width);
+        let last_panel_width = mn - (panels - 1) * panel_width;
+        KOutWriter {
+            ptr,
+            panels,
+            panel_width,
+            last_panel_width,
+            remain: if panels > 1 { panel_width } else { last_panel_width },
+            current_panel: 0,
+            next_panel: (panel_len - panel_width) as isize,
+            next_lane: (panel_width - last_panel_width) as isize
+                - (panel_len * (panels - 1)) as isize,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<T> PackingWriter<T> for KOutWriter<'_, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    #[inline(always)]
+    fn write(&mut self, t: T) {
+        unsafe {
+            *self.ptr = t;
+            self.remain -= 1;
+            self.ptr = self.ptr.offset(1);
+            if self.remain == 0 {
+                self.current_panel += 1;
+                if self.current_panel == self.panels {
+                    self.ptr = self.ptr.offset(self.next_lane);
+                    self.current_panel = 0;
+                } else {
+                    self.ptr = self.ptr.offset(self.next_panel);
+                }
+                if self.current_panel == self.panels - 1 {
+                    self.remain = self.last_panel_width;
+                } else {
+                    self.remain = self.panel_width;
+                }
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct KInWriter<'p, T>
+where
+    T: Copy + Debug,
+{
+    ptr: *mut T,
+    k: usize,
+    panels: usize,
+    panel_width: usize,
+    last_panel_width: usize,
+    remain_on_k: usize,
+    remain_on_mn: usize,
+    current_panel: usize,
+    next_mn_offset: isize,
+    next_panel_offset: isize,
+    _phantom: PhantomData<&'p T>,
+}
+
+impl<'p, T> KInWriter<'p, T>
+where
+    T: Copy + Debug,
+{
+    pub fn new(
+        ptr: *mut T,
+        panel_len: usize,
+        panel_width: usize,
+        mn: usize,
+        k: usize,
+    ) -> KInWriter<'p, T> {
+        let panels = mn.divceil(panel_width);
+        let last_panel_width = mn - (panels - 1) * panel_width;
+        KInWriter {
+            ptr,
+            k,
+            panels,
+            panel_width,
+            last_panel_width,
+            remain_on_k: k,
+            remain_on_mn: if panels == 1 { last_panel_width } else { panel_width },
+            current_panel: 0,
+            next_mn_offset: 1 - (k * panel_width) as isize,
+            next_panel_offset: panel_len as isize - (k * panel_width + panel_width - 1) as isize,
+            //                 ^ next panel     ^    ^ rewind left ^   ^ rewind up   ^
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<T> PackingWriter<T> for KInWriter<'_, T>
+where
+    T: Copy + std::fmt::Debug,
+{
+    #[inline(always)]
+    fn write(&mut self, t: T) {
+        unsafe {
+            *self.ptr = t;
+            self.remain_on_k -= 1;
+            self.ptr = self.ptr.add(self.panel_width);
+            if self.remain_on_k == 0 {
+                self.remain_on_k = self.k;
+                self.remain_on_mn -= 1;
+                if self.remain_on_mn > 0 {
+                    self.ptr = self.ptr.offset(self.next_mn_offset);
+                } else {
+                    self.ptr = self.ptr.offset(self.next_panel_offset);
+                    self.current_panel += 1;
+                    if self.current_panel == self.panels - 1 {
+                        self.remain_on_mn = self.last_panel_width;
+                    } else {
+                        self.remain_on_mn = self.panel_width;
+                    }
+                }
+            }
+        }
+    }
+}
+
+#[inline(never)]
+unsafe fn pack_mn_major<Chunk: Copy>(
+    b: *const u8,
+    packed: *mut u8,
+    panel_len: usize,
+    k_stride_bytes: isize,
+    mn_range_bytes: Range<usize>,
+    k_range: Range<usize>,
+) {
+    unsafe {
+        let mnr = std::mem::size_of::<Chunk>();
+        let full_panes = mn_range_bytes.len() / mnr;
+        let partial_pane = mn_range_bytes.len() % mnr;
+        for k in 0..k_range.len() {
+            let mut p_row = packed.add(k * mnr);
+            let mut b_row = b.offset(
+                (k_range.start + k) as isize * k_stride_bytes + mn_range_bytes.start as isize,
+            );
+            for _ in 0..full_panes {
+                p_row.copy_from_nonoverlapping(b_row, mnr);
+                p_row = p_row.add(panel_len);
+                b_row = b_row.add(mnr);
+            }
+            if partial_pane > 0 {
+                p_row.copy_from_nonoverlapping(b_row, partial_pane);
+            }
+        }
+    }
+}
+
+pub trait Packing {
+    fn packing(r: usize) -> PackedFormat;
+}
+
+impl<D: Datum> Packing for D {
+    fn packing(r: usize) -> PackedFormat {
+        PackedFormat::new(Self::datum_type(), r, vector_size())
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::ops::Range;
+
+    use proptest::prelude::*;
+    use tract_data::internal::num_integer::Integer;
+    use tract_data::internal::tract_ndarray::Zip;
+    use tract_data::internal::*;
+    use tract_ndarray::prelude::*;
+
+    #[derive(Debug)]
+    struct PackProblem {
+        k: usize,
+        mn: usize,
+        is_a: bool,
+        r: usize,
+        k_range: Range<usize>,
+        mn_range: Range<usize>,
+        align_panel: usize,
+    }
+
+    impl PackProblem {
+        fn input(&self) -> Array2<u32> {
+            let shape = if self.is_a { (self.mn, self.k) } else { (self.k, self.mn) };
+            let data = (0..(self.k * self.mn) as u32).collect();
+            Array2::from_shape_vec(shape, data).unwrap()
+        }
+
+        fn packer(&self) -> Array2<u32> {
+            let panels = self.mn_range.len().divceil(self.r);
+            let packer = super::PackedFormat::new(u32::datum_type(), self.r, self.align_panel)
+                .with_end_padding_record(0);
+            let input = self.input().into_tensor();
+            let panel_len = packer.single_panel_len(self.k_range.len());
+            let mut output =
+                Tensor::zero::<u32>(&[packer.len(self.k_range.len(), self.mn_range.len())])
+                    .unwrap();
+            unsafe {
+                packer.pack_segment(
+                    output.view_mut(),
+                    input.view(),
+                    self.is_a as usize,
+                    !self.is_a as usize,
+                    self.k_range.clone(),
+                    self.mn_range.clone(),
+                )
+            };
+            output.into_array::<u32>().unwrap().into_shape_with_order((panels, panel_len)).unwrap()
+        }
+
+        fn reference(&self) -> Array2<u32> {
+            let input = self.input();
+            let panels = self.mn_range.len().divceil(self.r);
+            let len = Integer::next_multiple_of(&(self.k_range.len() * self.r), &self.align_panel);
+            Array2::from_shape_fn([panels, len], |(panel, z)| {
+                let k = z / self.r;
+                let x = z % self.r;
+                let mn = panel * self.r + x + self.mn_range.start;
+                let k = k + self.k_range.start;
+                let coords = if self.is_a { (mn, k) } else { (k, mn) };
+                *input.get(coords).unwrap_or(&0)
+            })
+        }
+
+        fn valid(&self) -> Array2<bool> {
+            let panels = self.mn_range.len().divceil(self.r);
+            let len = Integer::next_multiple_of(&(self.k_range.len() * self.r), &self.align_panel);
+            Array2::from_shape_fn([panels, len], |(panel, z)| {
+                let k = z / self.r;
+                let x = z % self.r;
+                let k = k + self.k_range.start;
+                let mn = panel * self.r + x + self.mn_range.start;
+                k < self.k_range.end.min(self.k) && mn < self.mn_range.end.min(self.mn)
+            })
+        }
+
+        fn check(&self) {
+            let mut packer = self.packer();
+            let mut reference = self.reference();
+            let valid = self.valid();
+            Zip::from(&mut packer).and(&valid).for_each(|p, v| *p = if *v { *p } else { -1 as _ });
+            Zip::from(&mut reference)
+                .and(&valid)
+                .for_each(|p, v| *p = if *v { *p } else { -1 as _ });
+            assert_eq!(packer, reference);
+        }
+    }
+
+    impl Arbitrary for PackProblem {
+        type Parameters = ();
+        type Strategy = BoxedStrategy<PackProblem>;
+        fn arbitrary_with(_args: ()) -> Self::Strategy {
+            (any::<bool>(), 1usize..9, 1usize..20, 1usize..20)
+                .prop_flat_map(|(is_a, r, k, mn)| {
+                    (
+                        Just((is_a, r, k, mn)),
+                        sub_range_strat(0..k),
+                        sub_range_strat(0..mn),
+                        1usize..5,
+                    )
+                })
+                .prop_map(|((is_a, r, k, mn), k_range, mn_range, align_panel)| PackProblem {
+                    k,
+                    mn,
+                    is_a,
+                    r,
+                    k_range,
+                    mn_range,
+                    align_panel,
+                })
+                .boxed()
+        }
+    }
+
+    fn sub_range_strat(range: Range<usize>) -> BoxedStrategy<Range<usize>> {
+        (0..range.len())
+            .prop_flat_map(|cropped| (Just(cropped), 0..=cropped))
+            .prop_map(move |(cropped, left)| range.start + left..range.end - (cropped - left))
+            .boxed()
+    }
+
+    proptest::proptest! {
+        #[test]
+        fn prop(pb in any::<PackProblem>()) {
+            pb.check();
+        }
+
+        #[test]
+        fn subrange_prop(_range in sub_range_strat(0..20)) {
+        }
+
+    }
+
+    #[test]
+    fn simple_b_1() {
+        PackProblem {
+            k: 2,
+            mn: 1,
+            is_a: false,
+            r: 1,
+            k_range: 0..2,
+            mn_range: 0..1,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn simple_b_2() {
+        PackProblem {
+            k: 2,
+            mn: 2,
+            is_a: false,
+            r: 1,
+            k_range: 0..2,
+            mn_range: 0..2,
+            align_panel: 1,
+        }
+        .check()
+    }
+
+    #[test]
+    fn simple_b_3() {
+        PackProblem {
+            k: 2,
+            mn: 1,
+            is_a: false,
+            r: 4,
+            k_range: 0..2,
+            mn_range: 0..1,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn simple_b_4() {
+        PackProblem {
+            k: 1,
+            mn: 3,
+            is_a: false,
+            r: 2,
+            k_range: 0..1,
+            mn_range: 0..3,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn simple_a_1() {
+        PackProblem {
+            k: 2,
+            mn: 2,
+            is_a: true,
+            r: 1,
+            k_range: 0..2,
+            mn_range: 0..2,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn simple_a_2() {
+        PackProblem {
+            k: 2,
+            mn: 3,
+            is_a: true,
+            r: 2,
+            k_range: 0..2,
+            mn_range: 0..3,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn range_k_0() {
+        PackProblem {
+            k: 2,
+            mn: 1,
+            is_a: false,
+            r: 1,
+            k_range: 1..2,
+            mn_range: 0..1,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn range_k_1() {
+        PackProblem {
+            k: 2,
+            mn: 2,
+            is_a: false,
+            r: 1,
+            k_range: 0..2,
+            mn_range: 0..1,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn range_k_2() {
+        PackProblem {
+            k: 2,
+            mn: 1,
+            is_a: false,
+            r: 6,
+            k_range: 1..2,
+            mn_range: 0..1,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn range_mn_0() {
+        PackProblem {
+            k: 1,
+            mn: 2,
+            is_a: false,
+            r: 2,
+            k_range: 0..1,
+            mn_range: 0..1,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn range_b_4() {
+        PackProblem {
+            k: 1,
+            mn: 2,
+            is_a: false,
+            r: 6,
+            k_range: 0..1,
+            mn_range: 1..2,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn range_b_5() {
+        PackProblem {
+            k: 1,
+            mn: 7,
+            is_a: false,
+            r: 6,
+            k_range: 0..1,
+            mn_range: 1..7,
+            align_panel: 1,
+        }
+        .check();
+    }
+
+    #[test]
+    fn align_a_1() {
+        PackProblem {
+            k: 2,
+            mn: 2,
+            is_a: true,
+            r: 1,
+            k_range: 0..1,
+            mn_range: 0..2,
+            align_panel: 2,
+        }
+        .check();
+    }
+
+    #[test]
+    fn align_b_1() {
+        PackProblem {
+            k: 1,
+            mn: 1,
+            is_a: false,
+            r: 1,
+            k_range: 0..1,
+            mn_range: 0..1,
+            align_panel: 2,
+        }
+        .check();
+    }
+
+    #[test]
+    fn align_b_2() {
+        PackProblem {
+            k: 3,
+            mn: 1,
+            is_a: false,
+            r: 1,
+            k_range: 0..3,
+            mn_range: 0..1,
+            align_panel: 2,
+        }
+        .check();
+    }
+
+    #[test]
+    fn align_b_3() {
+        PackProblem {
+            k: 1,
+            mn: 1,
+            is_a: false,
+            r: 3,
+            k_range: 0..1,
+            mn_range: 0..1,
+            align_panel: 2,
+        }
+        .check();
+    }
+
+    #[test]
+    fn align_b_4() {
+        PackProblem {
+            k: 2,
+            mn: 1,
+            is_a: false,
+            r: 1,
+            k_range: 0..1,
+            mn_range: 0..1,
+            align_panel: 2,
+        }
+        .check();
+    }
+
+    #[test]
+    fn align_b_5() {
+        PackProblem {
+            k: 1,
+            mn: 5,
+            is_a: false,
+            r: 4,
+            k_range: 0..1,
+            mn_range: 0..5,
+            align_panel: 3,
+        }
+        .check();
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/reduce/max.rs b/vendor/tract-linalg-0.22.1/src/frame/reduce/max.rs
new file mode 100644
index 000000000..616e4310b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/reduce/max.rs
@@ -0,0 +1,42 @@
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::frame::reduce::ReduceKer;
+    use crate::LADatum;
+    use num_traits::{AsPrimitive, Float};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! max_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn prop(xs in proptest::collection::vec(-25f32..25.0, 0..100)) {
+                    if $cond {
+                        $crate::frame::reduce::max::test::test_max::<$ker, $t>(&*xs).unwrap()
+                    }
+                }
+            }
+
+            #[test]
+            fn empty() {
+                if $cond {
+                    $crate::frame::reduce::max::test::test_max::<$ker, $t>(&[]).unwrap()
+                }
+            }
+        };
+    }
+
+    pub fn test_max<K: ReduceKer<T>, T: LADatum + Float>(values: &[f32]) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        crate::setup_test_logger();
+        let values: Vec<T> = values.iter().copied().map(|x| x.as_()).collect();
+        crate::frame::reduce::test::test_reduce::<K, _>(
+            &values,
+            <T as Float>::min_value(),
+            |a, b| a.max(b),
+        )
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/reduce/mod.rs b/vendor/tract-linalg-0.22.1/src/frame/reduce/mod.rs
new file mode 100644
index 000000000..ecc13535f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/reduce/mod.rs
@@ -0,0 +1,300 @@
+pub mod max;
+pub mod softmax;
+pub mod sum;
+
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use tract_data::TractResult;
+
+use crate::LADatum;
+
+use super::element_wise_helper::{map_reduce_slice_with_alignment, reduce_slice_with_alignment};
+
+macro_rules! reduce_impl_wrap {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty, $neutral: expr, $run: item, $reduce_two: item) => {
+        paste! {
+            #[derive(Copy, Clone, Debug)]
+            #[allow(non_camel_case_types)]
+            pub struct $func;
+
+            impl crate::frame::reduce::ReduceKer<$ti, $params> for $func {
+                #[inline(always)]
+                fn name() -> &'static str {
+                    stringify!($func)
+                }
+                #[inline(always)]
+                fn nr() -> usize {
+                    $nr
+                }
+                #[inline(always)]
+                fn alignment_items() -> usize {
+                    $alignment_items
+                }
+                #[inline(always)]
+                fn alignment_bytes() -> usize {
+                    $alignment_items * std::mem::size_of::<$ti>()
+                }
+                #[inline(always)]
+                fn neutral() -> $ti {
+                    $neutral
+                }
+                $run
+                $reduce_two
+            }
+        }
+    };
+}
+
+pub trait Reduce<T, Params = ()>: Send + Sync + Debug + dyn_clone::DynClone
+where
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    T: Copy + Debug + PartialEq + Send + Sync,
+{
+    fn name(&self) -> &'static str;
+    fn run(&self, vec: &[T]) -> TractResult<T> {
+        self.run_with_params(vec, Params::default())
+    }
+    fn run_with_params(&self, vec: &[T], params: Params) -> TractResult<T>;
+}
+
+dyn_clone::clone_trait_object!(<T, Params> Reduce<T, Params> where T: Copy, Params: Copy);
+
+#[derive(Debug, Clone, new)]
+pub struct ReduceImpl<K, T, Params = ()>
+where
+    T: LADatum,
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    K: ReduceKer<T, Params> + Clone,
+{
+    phantom: PhantomData<(K, T, Params)>,
+}
+
+impl<K, T, Params> Reduce<T, Params> for ReduceImpl<K, T, Params>
+where
+    T: LADatum,
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    K: ReduceKer<T, Params> + Clone,
+{
+    fn name(&self) -> &'static str {
+        K::name()
+    }
+
+    fn run_with_params(&self, vec: &[T], params: Params) -> TractResult<T> {
+        reduce_slice_with_alignment(
+            vec,
+            |data| K::run(data, params),
+            K::nr(),
+            K::alignment_bytes(),
+            K::neutral(),
+            K::reduce_two,
+        )
+    }
+}
+
+pub trait ReduceKer<T, Params = ()>:
+    Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static
+where
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    T: LADatum,
+{
+    fn name() -> &'static str;
+    fn alignment_bytes() -> usize {
+        Self::alignment_items() * T::datum_type().size_of()
+    }
+    fn alignment_items() -> usize;
+    fn nr() -> usize;
+    fn neutral() -> T;
+    fn reduce_two(a: T, b: T) -> T;
+    fn run(vec: &[T], params: Params) -> T;
+    fn red() -> Box<dyn Reduce<T, Params>> {
+        Box::new(ReduceImpl::<Self, T, Params>::new())
+    }
+}
+
+#[allow(unused_macros)]
+macro_rules! map_reduce_impl_wrap {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $params: ty, $map_neutral: expr, $reduce_neutral: expr, $run: item, $reduce_two: item) => {
+        paste! {
+            #[derive(Copy, Clone, Debug)]
+            #[allow(non_camel_case_types)]
+            pub struct $func;
+
+            impl crate::frame::reduce::MapReduceKer<$ti, $params> for $func {
+                #[inline(always)]
+                fn name() -> &'static str {
+                    stringify!($func)
+                }
+                #[inline(always)]
+                fn nr() -> usize {
+                    $nr
+                }
+                #[inline(always)]
+                fn alignment_items() -> usize {
+                    $alignment_items
+                }
+                #[inline(always)]
+                fn alignment_bytes() -> usize {
+                    $alignment_items * std::mem::size_of::<$ti>()
+                }
+                #[inline(always)]
+                fn map_neutral() -> $ti {
+                    $map_neutral
+                }
+                #[inline(always)]
+                fn reduce_neutral() -> $ti {
+                    $reduce_neutral
+                }
+                $run
+                $reduce_two
+            }
+        }
+    };
+}
+
+pub trait MapReduce<T, Params = ()>: Send + Sync + Debug + dyn_clone::DynClone
+where
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    T: Copy + Debug + PartialEq + Send + Sync,
+{
+    fn name(&self) -> &'static str;
+    fn run(&self, vec: &mut [T]) -> TractResult<T> {
+        self.run_with_params(vec, Params::default())
+    }
+    fn run_with_params(&self, vec: &mut [T], params: Params) -> TractResult<T>;
+}
+
+dyn_clone::clone_trait_object!(<T, Params> MapReduce<T, Params> where T: Copy, Params: Copy);
+
+#[derive(Debug, Clone, new)]
+pub struct MapReduceImpl<K, T, Params = ()>
+where
+    T: LADatum,
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    K: MapReduceKer<T, Params> + Clone,
+{
+    phantom: PhantomData<(K, T, Params)>,
+}
+
+impl<K, T, Params> MapReduce<T, Params> for MapReduceImpl<K, T, Params>
+where
+    T: LADatum,
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    K: MapReduceKer<T, Params> + Clone,
+{
+    fn name(&self) -> &'static str {
+        K::name()
+    }
+    fn run_with_params(&self, vec: &mut [T], params: Params) -> TractResult<T> {
+        map_reduce_slice_with_alignment(
+            vec,
+            |data| K::run(data, params),
+            K::nr(),
+            K::alignment_bytes(),
+            K::map_neutral(),
+            K::reduce_neutral(),
+            K::reduce_two,
+        )
+    }
+}
+
+pub trait MapReduceKer<T, Params = ()>:
+    Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static
+where
+    Params: Copy + Send + Sync + Debug + 'static + Default,
+    T: LADatum,
+{
+    fn name() -> &'static str;
+    fn alignment_bytes() -> usize {
+        Self::alignment_items() * T::datum_type().size_of()
+    }
+    fn alignment_items() -> usize;
+    fn nr() -> usize;
+    fn map_neutral() -> T;
+    fn reduce_neutral() -> T;
+    fn reduce_two(a: T, b: T) -> T;
+    fn run(vec: &mut [T], params: Params) -> T;
+    fn red() -> Box<dyn MapReduce<T, Params>> {
+        Box::new(MapReduceImpl::<Self, T, Params>::new())
+    }
+}
+
+#[cfg(test)]
+pub mod test {
+    use super::*;
+    use proptest::test_runner::{TestCaseError, TestCaseResult};
+    use tract_data::internal::*;
+    use tract_data::itertools::Itertools;
+
+    pub fn test_reduce<K: ReduceKer<T, ()>, T: LADatum>(
+        values: &[T],
+        neutral: T,
+        reference_reduce: impl Fn(T, T) -> T,
+    ) -> TestCaseResult {
+        test_reduce_params::<K, T, ()>(values, neutral, reference_reduce, ())
+    }
+
+    pub fn test_reduce_params<K: ReduceKer<T, Params>, T: LADatum, Params>(
+        values: &[T],
+        neutral: T,
+        reference_reducer: impl Fn(T, T) -> T,
+        params: Params,
+    ) -> TestCaseResult
+    where
+        Params: Copy + Send + Sync + Debug + 'static + Default,
+    {
+        crate::setup_test_logger();
+        let op = K::red();
+        let expected = values.iter().fold(neutral, |acc, i| reference_reducer(acc, *i));
+        let found = values;
+        let red = op.run_with_params(found, params).unwrap();
+        tensor0(red)
+            .close_enough(&tensor0(expected), true)
+            .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?;
+        Ok(())
+    }
+
+    pub fn test_map_reduce<K: MapReduceKer<T, ()>, T: LADatum>(
+        values: &[T],
+        map_neutral: T,
+        neutral: T,
+        reference_map: impl Fn(T) -> T,
+        reference_reduce: impl Fn(T, T) -> T,
+    ) -> TestCaseResult {
+        test_map_reduce_params::<K, T, ()>(
+            values,
+            map_neutral,
+            neutral,
+            reference_map,
+            reference_reduce,
+            (),
+        )
+    }
+
+    pub fn test_map_reduce_params<K: MapReduceKer<T, Params>, T: LADatum, Params>(
+        values: &[T],
+        _neutral: T,
+        map_neutral: T,
+        reference_map: impl Fn(T) -> T,
+        reference_reducer: impl Fn(T, T) -> T,
+        params: Params,
+    ) -> TestCaseResult
+    where
+        Params: Copy + Send + Sync + Debug + 'static + Default,
+    {
+        crate::setup_test_logger();
+        let op = K::red();
+        let mut found = values.to_vec();
+        let expected_values = values.iter().copied().map(reference_map).collect_vec();
+        let expected_reduced =
+            expected_values.iter().fold(map_neutral, |acc, i| reference_reducer(acc, *i));
+        let red = op.run_with_params(&mut found, params).unwrap();
+        tensor1(&found)
+            .close_enough(&tensor1(&expected_values), Approximation::SuperApproximate)
+            .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?;
+        tensor0(red)
+            .close_enough(&tensor0(expected_reduced), Approximation::SuperApproximate)
+            .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?;
+        Ok(())
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/reduce/softmax.rs b/vendor/tract-linalg-0.22.1/src/frame/reduce/softmax.rs
new file mode 100644
index 000000000..a51708643
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/reduce/softmax.rs
@@ -0,0 +1,86 @@
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::frame::reduce::MapReduceKer;
+    use crate::LADatum;
+    use num_traits::{AsPrimitive, Float};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! softmax_l2_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn prop(xs in proptest::collection::vec(-25f32..25.0, 1..100)) {
+                    if $cond {
+                        $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&*xs).unwrap()
+                    }
+                }
+            }
+
+            #[test]
+            fn single() {
+                if $cond {
+                    $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[0.0]).unwrap()
+                }
+            }
+
+            #[test]
+            fn two_zeros() {
+                if $cond {
+                    $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[0.0, 0.0]).unwrap()
+                }
+            }
+
+            #[test]
+            fn two_0() {
+                if $cond {
+                    $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[
+                        16.62555, 21.950674,
+                    ])
+                    .unwrap()
+                }
+            }
+
+            #[test]
+            fn two_1() {
+                if $cond {
+                    $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[0.0f32, 0.38132212])
+                        .unwrap()
+                }
+            }
+
+            #[test]
+            fn two_missing_max() {
+                if $cond {
+                    $crate::frame::reduce::softmax::test::test_softmax_l2::<$ker, $t>(&[
+                        -46.15512, 42.875168,
+                    ])
+                    .unwrap()
+                }
+            }
+        };
+    }
+
+    pub fn test_softmax_l2<K: MapReduceKer<T, T>, T>(
+        values: &[f32],
+    ) -> TestCaseResult
+    where
+        T: LADatum + Float + AsPrimitive<f32>,
+        f32: AsPrimitive<T>,
+    {
+        use crate::generic::reduce::softmax_l2::fast_compact_exp_f32;
+        crate::setup_test_logger();
+        let max = values.iter().max_by(|a, b| a.total_cmp(b)).unwrap();
+        let values: Vec<T> = values.iter().copied().map(|x| x.as_()).collect();
+        crate::frame::reduce::test::test_map_reduce_params::<K, T, T>(
+            &values,
+            <T as Float>::min_value(),
+            T::zero(),
+            //            |x| (x - max.as_()).exp(),
+            |x| fast_compact_exp_f32(x.as_() - max).as_(),
+            |a, b| a + b,
+            max.as_(),
+        )
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/reduce/sum.rs b/vendor/tract-linalg-0.22.1/src/frame/reduce/sum.rs
new file mode 100644
index 000000000..16d4b8970
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/reduce/sum.rs
@@ -0,0 +1,54 @@
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::frame::reduce::ReduceKer;
+    use crate::LADatum;
+    use num_traits::{AsPrimitive, Float, Zero};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! sum_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn prop(xs in proptest::collection::vec(-25_isize..25, 0..100)) {
+                    if $cond {
+                        let xs_float = xs.into_iter().map(|it| it as f32).collect::<Vec<_>>();
+                        $crate::frame::reduce::sum::test::test_sum::<$ker, $t>(&*xs_float).unwrap()
+                    }
+                }
+            }
+
+            #[test]
+            fn empty() {
+                if $cond {
+                    $crate::frame::reduce::sum::test::test_sum::<$ker, $t>(&[]).unwrap()
+                }
+            }
+
+            #[test]
+            fn simple() {
+                if $cond {
+                    $crate::frame::reduce::sum::test::test_sum::<$ker, $t>(&[1.0, 2.0]).unwrap()
+                }
+            }
+            #[test]
+            fn multiple_tile() {
+                if $cond {
+                    $crate::frame::reduce::sum::test::test_sum::<$ker, $t>(&[1.0; 35]).unwrap()
+                }
+            }
+        };
+    }
+
+    pub fn test_sum<K, T>(values: &[f32]) -> TestCaseResult
+    where
+        K: ReduceKer<T>,
+        f32: AsPrimitive<T>,
+        T: LADatum + Float + Zero + AsPrimitive<f32>,
+    {
+        crate::setup_test_logger();
+        let values: Vec<T> = values.iter().copied().map(|x| x.as_()).collect();
+        crate::frame::reduce::test::test_reduce::<K, _>(&values, <T as Zero>::zero(), |a, b| a + b)
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/sigmoid.rs b/vendor/tract-linalg-0.22.1/src/frame/sigmoid.rs
new file mode 100644
index 000000000..1a3ea85dc
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/sigmoid.rs
@@ -0,0 +1,96 @@
+macro_rules! sigmoid_impl {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => {
+        ew_impl!($ti, $func, $nr, $alignment_items);
+        #[cfg(test)]
+        paste! {
+            mod [<test_ $func>] {
+                use super::*;
+                sigmoid_frame_tests!($cond, $ti, $func);
+            }
+        }
+    };
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::{frame::element_wise::*, LADatum};
+    use num_traits::{AsPrimitive, Float};
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! sigmoid_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn sigmoid(xs in proptest::collection::vec(-25f32..25.0, 0..100)) {
+                    if $cond {
+                        $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&*xs).unwrap()
+                    }
+                }
+            }
+
+            #[test]
+            fn sigmoid_4_magic() {
+                if $cond {
+                    $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&[
+                        0f32, -20.0, 20.0, 0.0,
+                    ])
+                    .unwrap()
+                }
+            }
+
+            #[test]
+            fn sigmoid_4zeros() {
+                if $cond {
+                    $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&[0.0; 4]).unwrap();
+                }
+            }
+
+            #[test]
+            fn sigmoid_20_ones() {
+                if $cond {
+                    $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&[1.0; 20]).unwrap();
+                }
+            }
+
+            #[test]
+            fn sigmoid_18_zeros() {
+                if $cond {
+                    $crate::frame::sigmoid::test::test_sigmoid::<$ker, $t>(&[0.0; 18]).unwrap();
+                }
+            }
+
+            #[test]
+            fn sigmoid_asymptots() {
+                use tract_data::internal::*;
+                use $crate::frame::element_wise::*;
+                if $cond {
+                    let mut input: Vec<$t> = [-100f32, 100f32]
+                        .iter()
+                        .map(|x| <f32 as num_traits::AsPrimitive<$t>>::as_(*x))
+                        .collect();
+                    let expected: Vec<$t> = [-0f32, 1f32]
+                        .iter()
+                        .map(|x| <f32 as num_traits::AsPrimitive<$t>>::as_(*x))
+                        .collect();
+                    <$ker>::ew().run(&mut input).unwrap();
+                    tensor1(&input)
+                        .close_enough(&tensor1(&expected), Approximation::Close)
+                        .unwrap();
+                }
+            }
+        };
+    }
+
+    pub fn test_sigmoid<K: ElementWiseKer<T>, T: LADatum + Float>(values: &[f32]) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        crate::setup_test_logger();
+        let values: Vec<T> = values.iter().copied().map(|x| x.as_()).collect();
+        crate::frame::element_wise::test::test_element_wise::<K, _, _>(&values, |x| {
+            (1f32).as_() / (1f32.as_() + (-x).exp())
+        })
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/tanh.rs b/vendor/tract-linalg-0.22.1/src/frame/tanh.rs
new file mode 100644
index 000000000..fe2af1648
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/tanh.rs
@@ -0,0 +1,101 @@
+macro_rules! tanh_impl {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => {
+        ew_impl!($ti, $func, $nr, $alignment_items);
+        #[cfg(test)]
+        paste! {
+            mod [<test_ $func>] {
+                use super::*;
+                tanh_frame_tests!($cond, $ti, $func);
+            }
+        }
+    };
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use crate::frame::element_wise::*;
+    use crate::LADatum;
+    use num_traits::float::Float;
+    use num_traits::AsPrimitive;
+    use proptest::test_runner::TestCaseResult;
+
+    #[macro_export]
+    macro_rules! tanh_frame_tests {
+        ($cond:expr, $t:ty, $ker:ty) => {
+            proptest::proptest! {
+                #[test]
+                fn tanh(xs in proptest::collection::vec(-25f32..25.0, 0..100)) {
+                    if $cond {
+                        $crate::frame::tanh::test::test_tanh::<$ker, $t>(&*xs).unwrap()
+                    }
+                }
+            }
+
+            #[test]
+            fn tanh_4_magic() {
+                if $cond {
+                    $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[0f32, -20.0, 20.0, 0.0])
+                        .unwrap()
+                }
+            }
+
+            #[test]
+            fn tanh_4zeros() {
+                if $cond {
+                    $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[0.0; 4]).unwrap();
+                }
+            }
+
+            #[test]
+            fn tanh_20_ones() {
+                if $cond {
+                    $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[1.0; 20]).unwrap();
+                }
+            }
+
+            #[test]
+            fn tanh_18_zeros() {
+                if $cond {
+                    $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[0.0; 18]).unwrap();
+                }
+            }
+
+            #[test]
+            fn tanh_foo() {
+                if $cond {
+                    $crate::frame::tanh::test::test_tanh::<$ker, $t>(&[0.67503357]).unwrap();
+                }
+            }
+
+            #[test]
+            fn tanh_asymptots() {
+                use tract_data::internal::*;
+                use $crate::frame::element_wise::*;
+                if $cond {
+                    let mut input: Vec<$t> = [-100f32, 100f32]
+                        .iter()
+                        .map(|x| <f32 as num_traits::AsPrimitive<$t>>::as_(*x))
+                        .collect();
+                    let expected: Vec<$t> = [-1f32, 1f32]
+                        .iter()
+                        .map(|x| <f32 as num_traits::AsPrimitive<$t>>::as_(*x))
+                        .collect();
+                    <$ker>::ew().run(&mut input).unwrap();
+                    tensor1(&input)
+                        .close_enough(&tensor1(&expected), Approximation::Close)
+                        .unwrap();
+                }
+            }
+        };
+    }
+
+    pub fn test_tanh<K: ElementWiseKer<T>, T: LADatum + Float>(values: &[f32]) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        crate::setup_test_logger();
+        let values: Vec<T> = values.iter().copied().map(|x| x.as_()).collect();
+        crate::frame::element_wise::test::test_element_wise::<K, _, _>(&values, |x| x.tanh())
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/unicast.rs b/vendor/tract-linalg-0.22.1/src/frame/unicast.rs
new file mode 100644
index 000000000..fca39f7a5
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/unicast.rs
@@ -0,0 +1,233 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use tract_data::internal::TensorView;
+use tract_data::TractResult;
+
+use crate::frame::element_wise_helper::TempBuffer;
+use crate::{LADatum, LinalgFn};
+
+macro_rules! unicast_impl_wrap {
+    ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $run: item) => {
+        paste! {
+            #[derive(Copy, Clone, Debug)]
+            #[allow(non_camel_case_types)]
+            pub struct $func;
+
+            impl crate::frame::unicast::UnicastKer<$ti> for $func {
+                #[inline(always)]
+                fn name() -> &'static str {
+                    stringify!($func)
+                }
+                #[inline(always)]
+                fn nr() -> usize {
+                    $nr
+                }
+                #[inline(always)]
+                fn alignment_items() -> usize {
+                    $alignment_items
+                }
+                $run
+            }
+        }
+    };
+}
+
+pub trait Unicast<T>: Send + Sync + Debug + dyn_clone::DynClone
+where
+    T: Copy + Debug + PartialEq + Send + Sync,
+{
+    fn name(&self) -> &'static str;
+    fn run(&self, a: &mut [T], b: &[T]) -> TractResult<()>;
+}
+
+dyn_clone::clone_trait_object!(<T> Unicast<T> where T: Copy);
+
+#[derive(Debug, Clone, new)]
+pub struct UnicastImpl<K, T>
+where
+    T: LADatum,
+    K: UnicastKer<T> + Clone,
+{
+    phantom: PhantomData<(K, T)>,
+}
+
+impl<K, T> UnicastImpl<K, T>
+where
+    T: LADatum,
+    K: UnicastKer<T> + Clone,
+{
+}
+impl<K, T> Unicast<T> for UnicastImpl<K, T>
+where
+    T: LADatum,
+    K: UnicastKer<T> + Clone,
+{
+    fn name(&self) -> &'static str {
+        K::name()
+    }
+    fn run(&self, a: &mut [T], b: &[T]) -> TractResult<()> {
+        unicast_with_alignment(a, b, |a, b| K::run(a, b), K::nr(), K::alignment_bytes())
+    }
+}
+
+pub trait UnicastKer<T>: Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static
+where
+    T: LADatum,
+{
+    fn name() -> &'static str;
+    fn alignment_bytes() -> usize {
+        Self::alignment_items() * T::datum_type().size_of()
+    }
+    fn alignment_items() -> usize;
+    fn nr() -> usize;
+    fn run(a: &mut [T], b: &[T]);
+    fn bin() -> Box<LinalgFn> {
+        Box::new(|a: &mut TensorView, b: &TensorView| {
+            let a_slice = a.as_slice_mut()?;
+            let b_slice = b.as_slice()?;
+            UnicastImpl::<Self, T>::new().run(a_slice, b_slice)
+        })
+    }
+}
+
+std::thread_local! {
+    static TMP: std::cell::RefCell<(TempBuffer, TempBuffer)> = std::cell::RefCell::new((TempBuffer::default(), TempBuffer::default()));
+}
+
+pub(crate) fn unicast_with_alignment<T>(
+    a: &mut [T],
+    b: &[T],
+    f: impl Fn(&mut [T], &[T]),
+    nr: usize,
+    alignment_bytes: usize,
+) -> TractResult<()>
+where
+    T: LADatum,
+{
+    if a.is_empty() {
+        return Ok(());
+    }
+    unsafe {
+        TMP.with(|buffers| {
+            let mut buffers = buffers.borrow_mut();
+            buffers.0.ensure(nr * T::datum_type().size_of(), alignment_bytes);
+            buffers.1.ensure(nr * T::datum_type().size_of(), alignment_bytes);
+            let tmp_a = std::slice::from_raw_parts_mut(buffers.0.buffer as *mut T, nr);
+            let tmp_b = std::slice::from_raw_parts_mut(buffers.1.buffer as *mut T, nr);
+            let mut compute_via_temp_buffer = |a: &mut [T], b: &[T]| {
+                tmp_a[..a.len()].copy_from_slice(a);
+                tmp_b[..b.len()].copy_from_slice(b);
+                f(tmp_a, tmp_b);
+                a.copy_from_slice(&tmp_a[..a.len()])
+            };
+
+            let mut num_element_processed = 0;
+            let a_prefix_len = a.as_ptr().align_offset(alignment_bytes).min(a.len());
+            let b_prefix_len = b.as_ptr().align_offset(alignment_bytes).min(b.len());
+            assert!(
+                a_prefix_len == b_prefix_len,
+                "Both inputs should be of the same alignement, got {a_prefix_len:?}, {b_prefix_len:?}"
+            );
+            let mut applied_prefix_len = 0;
+            if a_prefix_len > 0 {
+                // Incomplete tile needs to be created to process unaligned data.
+                let sub_a = &mut a[..a_prefix_len];
+                let sub_b = &b[..a_prefix_len];
+                compute_via_temp_buffer(sub_a, sub_b);
+                num_element_processed += a_prefix_len;
+                applied_prefix_len = a_prefix_len;
+            }
+
+            let num_complete_tiles = (a.len() - applied_prefix_len) / nr;
+            if num_complete_tiles > 0 {
+                // Process all tiles that are complete.
+                let sub_a = &mut a[applied_prefix_len..][..(num_complete_tiles * nr)];
+                let sub_b = &b[applied_prefix_len..][..(num_complete_tiles * nr)];
+                f(sub_a, sub_b);
+                num_element_processed += num_complete_tiles * nr;
+            }
+
+            if num_element_processed < a.len() {
+                // Incomplete tile needs to be created to process remaining elements.
+                compute_via_temp_buffer(
+                    &mut a[num_element_processed..],
+                    &b[num_element_processed..],
+                );
+            }
+        })
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    use super::*;
+    use crate::LADatum;
+    use proptest::test_runner::{TestCaseError, TestCaseResult};
+    use tract_data::internal::*;
+    use tract_num_traits::{AsPrimitive, Float};
+
+    pub fn test_unicast<K: UnicastKer<T>, T: LADatum>(
+        a: &mut [T],
+        b: &[T],
+        reference: impl Fn(T, T) -> T,
+    ) -> TestCaseResult {
+        crate::setup_test_logger();
+        let op = UnicastImpl::<K, T>::new();
+        let expected = a.iter().zip(b.iter()).map(|(a, b)| (reference)(*a, *b)).collect::<Vec<_>>();
+        op.run(a, b).unwrap();
+        tensor1(a)
+            .close_enough(&tensor1(&expected), true)
+            .map_err(|e| TestCaseError::fail(e.root_cause().to_string()))?;
+        Ok(())
+    }
+
+    pub fn test_unicast_t<K: UnicastKer<T>, T: LADatum + Float>(
+        a: &[f32],
+        b: &[f32],
+        func: impl Fn(T, T) -> T,
+    ) -> TestCaseResult
+    where
+        f32: AsPrimitive<T>,
+    {
+        crate::setup_test_logger();
+        let vec_a: Vec<T> = a.iter().copied().map(|x| x.as_()).collect();
+        // We allocate a tensor to ensure allocation is done with alignement
+        let mut a = unsafe { Tensor::from_slice_align(vec_a.as_slice(), vector_size()).unwrap() };
+        let vec_b: Vec<T> = b.iter().copied().map(|x| x.as_()).collect();
+        // We allocate a tensor to ensure allocation is done with alignement
+        let b = unsafe { Tensor::from_slice_align(vec_b.as_slice(), vector_size()).unwrap() };
+        crate::frame::unicast::test::test_unicast::<K, _>(
+            a.as_slice_mut::<T>().unwrap(),
+            b.as_slice::<T>().unwrap(),
+            func,
+        )
+    }
+
+    #[macro_export]
+    macro_rules! unicast_frame_tests {
+        ($cond:expr, $t: ty, $ker:ty, $func:expr) => {
+            pastey::paste! {
+                proptest::proptest! {
+                    #[test]
+                    fn [<prop_ $ker:snake>](
+                        (a, b) in (0..100_usize).prop_flat_map(|len| (vec![-25f32..25.0; len], vec![-25f32..25.0; len]))
+                    ) {
+                        if $cond {
+                            $crate::frame::unicast::test::test_unicast_t::<$ker, $t>(&*a, &*b, $func).unwrap()
+                        }
+                    }
+                }
+
+                #[test]
+                fn [<empty_ $ker:snake>]() {
+                    if $cond {
+                        $crate::frame::unicast::test::test_unicast_t::<$ker, $t>(&[], &[], $func).unwrap()
+                    }
+                }
+            }
+        };
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/frame/weights.rs b/vendor/tract-linalg-0.22.1/src/frame/weights.rs
new file mode 100644
index 000000000..527893090
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/frame/weights.rs
@@ -0,0 +1,80 @@
+use std::fmt::Debug;
+use tract_data::prelude::DatumType;
+
+use crate::block_quant::{BlockQuant, PackedBlockQuantFormat};
+
+use crate::mmm::MMMInputFormat;
+use crate::pack::PackedFormat;
+
+#[derive(Clone)]
+pub enum WeightType {
+    Plain(DatumType),
+    BlockQuant(Box<dyn BlockQuant>),
+}
+
+impl From<DatumType> for WeightType {
+    fn from(value: DatumType) -> Self {
+        match value {
+            DatumType::F16 => WeightType::Plain(DatumType::F16),
+            DatumType::F32 => WeightType::Plain(DatumType::F32),
+            DatumType::F64 => WeightType::Plain(DatumType::F64),
+            DatumType::I32 => WeightType::Plain(DatumType::I32),
+            DatumType::I8 | DatumType::QI8(_) => WeightType::Plain(DatumType::I8),
+            DatumType::U8 | DatumType::QU8(_) => WeightType::Plain(DatumType::U8),
+            _ => panic!("Can't build a WeightType from {value:?}"),
+        }
+    }
+}
+
+impl From<Box<dyn MMMInputFormat>> for WeightType {
+    fn from(value: Box<dyn MMMInputFormat>) -> Self {
+        (&*value).into()
+    }
+}
+
+impl From<&dyn MMMInputFormat> for WeightType {
+    fn from(value: &dyn MMMInputFormat) -> Self {
+        if let Some(pf) = value.downcast_ref::<PackedFormat>() {
+            WeightType::Plain(pf.dt)
+        } else if let Some(pbqf) = value.downcast_ref::<PackedBlockQuantFormat>() {
+            WeightType::BlockQuant(dyn_clone::clone_box(&*pbqf.bq))
+        } else {
+            todo!()
+        }
+    }
+}
+
+impl PartialEq for WeightType {
+    fn eq(&self, other: &Self) -> bool {
+        use WeightType::*;
+        match (self, other) {
+            (Plain(a), Plain(b)) => a == b,
+            (BlockQuant(a), BlockQuant(b)) => a.same_as(&**b),
+            _ => false,
+        }
+    }
+}
+
+impl<BQ: BlockQuant> From<BQ> for WeightType {
+    fn from(value: BQ) -> Self {
+        WeightType::BlockQuant(dyn_clone::clone_box(&value))
+    }
+}
+
+impl Debug for WeightType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Plain(p) => write!(f, "{p:?}"),
+            Self::BlockQuant(bq) => write!(f, "{bq:?}"),
+        }
+    }
+}
+
+impl WeightType {
+    pub fn as_dt(&self) -> Option<DatumType> {
+        match self {
+            WeightType::Plain(dt) => Some(*dt),
+            _ => None,
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic.rs b/vendor/tract-linalg-0.22.1/src/generic.rs
new file mode 100644
index 000000000..f2030ff0b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic.rs
@@ -0,0 +1,55 @@
+pub mod by_scalar;
+pub mod erf;
+pub mod leaky_relu;
+pub mod lut;
+pub mod mmm;
+pub mod reduce;
+pub mod rounding;
+pub mod sigmoid;
+pub mod tanh;
+pub mod unicast;
+
+use tract_data::prelude::DatumType;
+
+use crate::by_scalar::ByScalarKer;
+use crate::unicast::UnicastKer;
+use crate::{BinOp, LinalgRegistry};
+
+pub use self::by_scalar::{HMulByScalar8, SMulByScalar4};
+pub use self::erf::SErf4;
+pub use self::leaky_relu::{HLeakyRelu8, SLeakyRelu4};
+pub use self::lut::GenericLut8;
+pub use self::reduce::softmax_l2::SSoftMaxL2;
+pub use self::rounding::{ScaleShiftAndRound, Scaler};
+pub use self::sigmoid::{HSigmoid8, SSigmoid4};
+pub use self::tanh::{HTanh8, STanh4};
+
+pub(crate) fn register_all_unicast(registry: &mut LinalgRegistry) {
+    registry.insert((BinOp::Mul, DatumType::F32), Box::new(|| unicast::SUnicastMul4::bin()));
+    registry.insert((BinOp::Mul, DatumType::F16), Box::new(|| unicast::HUnicastMul8::bin()));
+    registry.insert((BinOp::Add, DatumType::F32), Box::new(|| unicast::SUnicastAdd4::bin()));
+    registry.insert((BinOp::Add, DatumType::F16), Box::new(|| unicast::HUnicastAdd8::bin()));
+    registry.insert((BinOp::Sub, DatumType::F32), Box::new(|| unicast::SUnicastSub4::bin()));
+    registry.insert((BinOp::Sub, DatumType::F16), Box::new(|| unicast::HUnicastSub8::bin()));
+    registry.insert((BinOp::SubF, DatumType::F32), Box::new(|| unicast::SUnicastSubF4::bin()));
+    registry.insert((BinOp::SubF, DatumType::F16), Box::new(|| unicast::HUnicastSubF8::bin()));
+    registry.insert((BinOp::Min, DatumType::F32), Box::new(|| unicast::SUnicastMin4::bin()));
+    registry.insert((BinOp::Min, DatumType::F16), Box::new(|| unicast::HUnicastMin8::bin()));
+    registry.insert((BinOp::Max, DatumType::F32), Box::new(|| unicast::SUnicastMax4::bin()));
+    registry.insert((BinOp::Max, DatumType::F16), Box::new(|| unicast::HUnicastMax8::bin()));
+}
+
+pub(crate) fn register_all_by_scalar(registry: &mut LinalgRegistry) {
+    registry.insert((BinOp::Mul, DatumType::F32), Box::new(|| by_scalar::SMulByScalar4::bin()));
+    registry.insert((BinOp::Mul, DatumType::F16), Box::new(|| by_scalar::HMulByScalar8::bin()));
+    registry.insert((BinOp::Add, DatumType::F32), Box::new(|| by_scalar::SAddByScalar4::bin()));
+    registry.insert((BinOp::Add, DatumType::F16), Box::new(|| by_scalar::HAddByScalar8::bin()));
+    registry.insert((BinOp::Sub, DatumType::F32), Box::new(|| by_scalar::SSubByScalar4::bin()));
+    registry.insert((BinOp::Sub, DatumType::F16), Box::new(|| by_scalar::HSubByScalar8::bin()));
+    registry.insert((BinOp::SubF, DatumType::F32), Box::new(|| by_scalar::SSubFByScalar4::bin()));
+    registry.insert((BinOp::SubF, DatumType::F16), Box::new(|| by_scalar::HSubFByScalar8::bin()));
+    registry.insert((BinOp::Min, DatumType::F32), Box::new(|| by_scalar::SMinByScalar4::bin()));
+    registry.insert((BinOp::Min, DatumType::F16), Box::new(|| by_scalar::HMinByScalar8::bin()));
+    registry.insert((BinOp::Max, DatumType::F32), Box::new(|| by_scalar::SMaxByScalar4::bin()));
+    registry.insert((BinOp::Max, DatumType::F16), Box::new(|| by_scalar::HMaxByScalar8::bin()));
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/generic/by_scalar.rs
new file mode 100644
index 000000000..1aaab592f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/by_scalar.rs
@@ -0,0 +1,181 @@
+use tract_data::internal::f16;
+
+by_scalar_impl_wrap!(
+    f32,
+    SMulByScalar4,
+    4,
+    4,
+    f32,
+    fn run(x: &mut [f32], s: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px *= s)
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    SAddByScalar4,
+    4,
+    4,
+    f32,
+    fn run(x: &mut [f32], s: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px += s)
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    SSubByScalar4,
+    4,
+    4,
+    f32,
+    fn run(x: &mut [f32], s: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px -= s)
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    SSubFByScalar4,
+    4,
+    4,
+    f32,
+    fn run(x: &mut [f32], s: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = s - *px)
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    SMinByScalar4,
+    4,
+    4,
+    f32,
+    fn run(x: &mut [f32], s: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = px.min(s))
+    }
+);
+
+by_scalar_impl_wrap!(
+    f32,
+    SMaxByScalar4,
+    4,
+    4,
+    f32,
+    fn run(x: &mut [f32], s: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = px.max(s))
+    }
+);
+
+#[cfg(test)]
+#[macro_use]
+pub mod mul_by_scalar_f32 {
+    use super::*;
+    by_scalar_frame_tests!(true, f32, SMulByScalar4, |a, b| a * b);
+    by_scalar_frame_tests!(true, f32, SAddByScalar4, |a, b| a + b );
+    by_scalar_frame_tests!(true, f32, SSubByScalar4, |a, b| a - b);
+    by_scalar_frame_tests!(true, f32, SSubFByScalar4, |a, b| b - a);
+    by_scalar_frame_tests!(true, f32, SMinByScalar4, |a, b| a.min(b));
+    by_scalar_frame_tests!(true, f32, SMaxByScalar4, |a, b| a.max(b));
+}
+
+by_scalar_impl_wrap!(
+    f16,
+    HMulByScalar8,
+    8,
+    8,
+    f16,
+    fn run(x: &mut [f16], s: f16) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px *= s)
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    HAddByScalar8,
+    8,
+    8,
+    f16,
+    fn run(x: &mut [f16], s: f16) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px += s)
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    HSubByScalar8,
+    8,
+    8,
+    f16,
+    fn run(x: &mut [f16], s: f16) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px -= s)
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    HSubFByScalar8,
+    8,
+    8,
+    f16,
+    fn run(x: &mut [f16], s: f16) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = s - *px)
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    HMinByScalar8,
+    8,
+    8,
+    f16,
+    fn run(x: &mut [f16], s: f16) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = px.min(s))
+    }
+);
+
+by_scalar_impl_wrap!(
+    f16,
+    HMaxByScalar8,
+    8,
+    8,
+    f16,
+    fn run(x: &mut [f16], s: f16) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = px.max(s))
+    }
+);
+
+#[cfg(test)]
+#[macro_use]
+pub mod mul_by_scalar_f16 {
+    use super::*;
+    by_scalar_frame_tests!(true, f16, HMulByScalar8, |a, b| a * b);
+    by_scalar_frame_tests!(true, f16, HAddByScalar8, |a, b| a + b);
+    by_scalar_frame_tests!(true, f16, HSubByScalar8, |a, b| a - b);
+    by_scalar_frame_tests!(true, f16, HSubFByScalar8, |a, b| b - a);
+    by_scalar_frame_tests!(true, f16, HMinByScalar8, |a, b| a.min(b));
+    by_scalar_frame_tests!(true, f16, HMaxByScalar8, |a, b| a.max(b));
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/erf.rs b/vendor/tract-linalg-0.22.1/src/generic/erf.rs
new file mode 100644
index 000000000..8f4cdaf43
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/erf.rs
@@ -0,0 +1,51 @@
+use crate::element_wise::ElementWiseKer;
+
+#[allow(non_upper_case_globals)]
+#[allow(clippy::excessive_precision)]
+fn serf(x: &mut f32) {
+    const a1: f32 = 0.0705230784;
+    const a2: f32 = 0.0422820123;
+    const a3: f32 = 0.0092705272;
+    const a4: f32 = 0.0001520143;
+    const a5: f32 = 0.0002765672;
+    const a6: f32 = 0.0000430638;
+
+    let signum = x.signum();
+    let abs = x.abs();
+    let y = a6 * abs;
+    let y = (a5 + y) * abs;
+    let y = (a4 + y) * abs;
+    let y = (a3 + y) * abs;
+    let y = (a2 + y) * abs;
+    let y = (a1 + y) * abs;
+    let y = 1.0 - (y + 1.0).powi(16).recip();
+
+    *x = y.copysign(signum)
+}
+
+#[derive(Clone, Debug)]
+pub struct SErf4;
+
+impl ElementWiseKer<f32> for SErf4 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_items() -> usize {
+        16
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(x: &mut [f32], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(serf)
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/leaky_relu.rs b/vendor/tract-linalg-0.22.1/src/generic/leaky_relu.rs
new file mode 100644
index 000000000..0526319cd
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/leaky_relu.rs
@@ -0,0 +1,74 @@
+#![allow(clippy::excessive_precision)]
+use crate::frame::element_wise::ElementWiseKer;
+use tract_data::internal::*;
+use tract_num_traits::Zero;
+
+#[derive(Clone, Debug)]
+pub struct SLeakyRelu4;
+
+impl ElementWiseKer<f32, f32> for SLeakyRelu4 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(x: &mut [f32], alpha: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = if *px < 0. { *px * alpha } else { *px });
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct HLeakyRelu8;
+
+impl ElementWiseKer<f16, f16> for HLeakyRelu8 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        8
+    }
+
+    fn run(x: &mut [f16], alpha: f16) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = if *px < f16::zero() { *px * alpha } else { *px })
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod s {
+    leaky_relu_frame_tests!(true, f32, crate::generic::leaky_relu::SLeakyRelu4);
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod h {
+    leaky_relu_frame_tests!(
+        true,
+        tract_data::internal::f16,
+        crate::generic::leaky_relu::HLeakyRelu8
+    );
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/lut.rs b/vendor/tract-linalg-0.22.1/src/generic/lut.rs
new file mode 100644
index 000000000..b9023bd6c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/lut.rs
@@ -0,0 +1,47 @@
+use crate::frame::lut::LutKer;
+
+#[derive(Clone, Debug, Hash)]
+pub struct GenericLut8;
+
+impl LutKer for GenericLut8 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn input_alignment_bytes() -> usize {
+        1
+    }
+
+    fn table_alignment_bytes() -> usize {
+        1
+    }
+
+    fn n() -> usize {
+        8
+    }
+
+    unsafe fn run(buf: *mut u8, len: usize, table: *const u8) {
+        unsafe {
+            debug_assert!(len % Self::n() == 0);
+            debug_assert!(buf as usize % Self::input_alignment_bytes() == 0);
+            debug_assert!(table as usize % Self::table_alignment_bytes() == 0);
+            for i in 0..((len / 8) as isize) {
+                let ptr = buf.offset(8 * i);
+                *ptr.offset(0) = *table.offset(*ptr.offset(0) as isize);
+                *ptr.offset(1) = *table.offset(*ptr.offset(1) as isize);
+                *ptr.offset(2) = *table.offset(*ptr.offset(2) as isize);
+                *ptr.offset(3) = *table.offset(*ptr.offset(3) as isize);
+                *ptr.offset(4) = *table.offset(*ptr.offset(4) as isize);
+                *ptr.offset(5) = *table.offset(*ptr.offset(5) as isize);
+                *ptr.offset(6) = *table.offset(*ptr.offset(6) as isize);
+                *ptr.offset(7) = *table.offset(*ptr.offset(7) as isize);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test {
+    lut_frame_tests!(true, crate::generic::GenericLut8);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/mmm.rs b/vendor/tract-linalg-0.22.1/src/generic/mmm.rs
new file mode 100644
index 000000000..28c3bbd86
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/mmm.rs
@@ -0,0 +1,453 @@
+#![allow(clippy::needless_range_loop)]
+use num_traits::AsPrimitive;
+
+use tract_data::prelude::f16;
+use tract_data::prelude::*;
+
+use super::*;
+use crate::frame::block_quant::{BlockQuant, NibbleReader, PackedBlockQuantFormat, Q4_0};
+use crate::frame::mmm::*;
+use crate::{has_fp16, LADatum, Ops};
+
+macro_rules! scalar {
+    ($ab: expr, $m: expr, $f: expr) => {
+        for i in 0..$ab.len() {
+            for j in 0..$ab[0].len() {
+                $ab[i][j] = $f($m, $ab[i][j])
+            }
+        }
+    };
+}
+
+macro_rules! per_row {
+    ($ab: expr, $m: expr, $f: expr) => {
+        for i in 0..$ab.len() {
+            for j in 0..$ab[0].len() {
+                $ab[i][j] = $f(*$m.add(i), $ab[i][j])
+            }
+        }
+    };
+}
+
+macro_rules! per_col {
+    ($ab: expr, $m: expr, $f: expr) => {
+        for i in 0..$ab.len() {
+            for j in 0..$ab[0].len() {
+                $ab[i][j] = $f(*$m.add(j), $ab[i][j])
+            }
+        }
+    };
+}
+
+unsafe fn add_mat_mul<const MR: usize, const NR: usize, TI, TA, TB>(
+    pa: *const u8,
+    pb: *const u8,
+    k: usize,
+    ab: &mut [[TI; NR]; MR],
+) where
+    TA: LADatum + AsPrimitive<TI>,
+    TB: LADatum + AsPrimitive<TI>,
+    TI: LADatum,
+{
+    unsafe {
+        let a = pa as *const TA;
+        let b = pb as *const TB;
+        for ik in 0..k {
+            let a = std::slice::from_raw_parts(a.add(MR * ik), MR);
+            let b = std::slice::from_raw_parts(b.add(NR * ik), NR);
+            for i in 0..MR {
+                for j in 0..NR {
+                    ab[i][j] += a[i].as_() * b[j].as_();
+                }
+            }
+        }
+    }
+}
+
+unsafe fn add_mat_mul_pq40<const MR: usize, const NR: usize, TB, TI>(
+    pa: *const u8,
+    pb: *const u8,
+    k: usize,
+    ab: &mut [[TI; NR]; MR],
+) where
+    TI: LADatum,
+    f16: AsPrimitive<TI>,
+    TB: AsPrimitive<TI>,
+    i8: AsPrimitive<TI>,
+{
+    unsafe {
+        assert!(k % Q4_0.block_len() == 0);
+        let len = (k * MR) / Q4_0.block_len() * Q4_0.block_bytes();
+        let mut pa = NibbleReader::for_slice(std::slice::from_raw_parts(pa, len));
+        let b = pb as *const TB;
+        for bk in 0..k / 32 {
+            let mut scales: [TI; MR] = [TI::zero(); MR];
+            scales.iter_mut().for_each(|x| *x = pa.read_f16().as_());
+            for ik in 0..32 {
+                let mut a: [TI; MR] = [TI::zero(); MR];
+                a.iter_mut().zip(&scales).for_each(|(x, s)| *x = *s * (pa.read_i4() - 8).as_());
+                let b = std::slice::from_raw_parts(b.add(NR * (ik + 32 * bk)), NR);
+                for i in 0..MR {
+                    for j in 0..NR {
+                        ab[i][j] += a[i] * b[j].as_();
+                    }
+                }
+            }
+        }
+    }
+}
+
+unsafe fn add_mat_mul_pq40_scales_at_end<const MR: usize, const NR: usize, TB, TI>(
+    pa: *const u8,
+    pb: *const u8,
+    k: usize,
+    ab: &mut [[TI; NR]; MR],
+) where
+    TI: LADatum,
+    f16: AsPrimitive<TI>,
+    TB: AsPrimitive<TI>,
+    i8: AsPrimitive<TI>,
+{
+    unsafe {
+        assert!(k % Q4_0.block_len() == 0);
+        let len = (k * MR) / Q4_0.block_len() * Q4_0.block_bytes();
+        let mut pa = NibbleReader::for_slice(std::slice::from_raw_parts(pa, len));
+        let b = pb as *const TB;
+        for bk in 0..k / 32 {
+            let mut temp = [[TI::zero(); NR]; MR];
+            for ik in 0..32 {
+                let mut a: [TI; MR] = [TI::zero(); MR];
+                a.iter_mut().for_each(|x| *x = (pa.read_i4() - 8).as_());
+                let b = std::slice::from_raw_parts(b.add(NR * (ik + 32 * bk)), NR);
+                for i in 0..MR {
+                    for j in 0..NR {
+                        temp[i][j] += a[i] * b[j].as_();
+                    }
+                }
+            }
+            for i in 0..MR {
+                let scale = pa.read_f16().as_();
+                for j in 0..NR {
+                    ab[i][j] += temp[i][j] * scale;
+                }
+            }
+        }
+    }
+}
+
+unsafe fn add_unicast<const MR: usize, const NR: usize, TI, TO>(
+    ab: &mut [[TI; NR]; MR],
+    other: &OutputStoreKer,
+) where
+    TI: LADatum,
+    TO: LADatum + AsPrimitive<TI>,
+{
+    unsafe {
+        for i in 0usize..MR {
+            for j in 0usize..NR {
+                let value: *const TO = other
+                    .ptr
+                    .offset(other.row_byte_stride * i as isize + other.col_byte_stride * j as isize)
+                    as _;
+                ab[i].as_mut()[j] += (*value).as_();
+            }
+        }
+    }
+}
+
+unsafe fn store_t<const MR: usize, const NR: usize, TC, TI>(
+    tile: &OutputStoreKer,
+    ab: &[[TI; NR]; MR],
+) where
+    TC: Copy,
+{
+    unsafe {
+        for i in 0usize..MR {
+            for j in 0usize..NR {
+                let loc: *mut TC = tile
+                    .ptr
+                    .offset(tile.row_byte_stride * i as isize + tile.col_byte_stride * j as isize)
+                    as _;
+                let val: *const TC = (&ab[i].as_ref()[j]) as *const TI as _;
+                *loc = *val
+            }
+        }
+    }
+}
+
+unsafe fn store_float_t<const MR: usize, const NR: usize, TC, TI>(
+    tile: &OutputStoreKer,
+    ab: &[[TI; NR]; MR],
+) where
+    TC: Copy + 'static,
+    TI: Copy + 'static + AsPrimitive<TC>,
+{
+    unsafe {
+        for i in 0usize..MR {
+            for j in 0usize..NR {
+                let loc: *mut TC = tile
+                    .ptr
+                    .offset(tile.row_byte_stride * i as isize + tile.col_byte_stride * j as isize)
+                    as _;
+                let val = ab[i].as_ref()[j].as_();
+                *loc = val
+            }
+        }
+    }
+}
+
+#[inline(never)]
+unsafe fn kernel<TI, const MR: usize, const NR: usize>(mut pnl: *const FusedKerSpec<TI>) -> isize
+where
+    TI: LADatum + ScaleShiftAndRound + AsPrimitive<TI>,
+    TI: AsPrimitive<f16> + AsPrimitive<f32> + AsPrimitive<f64>,
+    usize: AsPrimitive<TI>,
+    f16: AsPrimitive<TI>,
+    f32: AsPrimitive<TI>,
+    f64: AsPrimitive<TI>,
+    i8: AsPrimitive<TI>,
+    i32: AsPrimitive<TI>,
+{
+    unsafe {
+        let mut ab = [[TI::zero(); NR]; MR];
+        loop {
+            if pnl.is_null() {
+                break;
+            }
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => ab = std::mem::zeroed(),
+                FusedKerSpec::LoadTile(col_major, _row_major) => {
+                    for row in 0..MR {
+                        for col in 0..NR {
+                            ab[row][col] = *col_major.add(col * MR + row);
+                        }
+                    }
+                }
+                FusedKerSpec::ScalarAdd(a) => scalar!(ab, a, |a, b| a + b),
+                FusedKerSpec::ScalarMul(a) => scalar!(ab, a, |a, b| a * b),
+                FusedKerSpec::ScalarMin(m) => scalar!(ab, m, |a, b| if a < b { a } else { b }),
+                FusedKerSpec::ScalarMax(m) => scalar!(ab, m, |a, b| if a > b { a } else { b }),
+                FusedKerSpec::ScalarSub(m) => scalar!(ab, m, |a, b| a - b),
+                FusedKerSpec::ScalarSubF(m) => scalar!(ab, m, |a, b| b - a),
+                FusedKerSpec::LeakyRelu(m) => {
+                    scalar!(ab, m, |a, b| if b > TI::zero() { b } else { a * b })
+                }
+                FusedKerSpec::PerRowMin(m) => per_row!(ab, m, |a, b| if a < b { a } else { b }),
+                FusedKerSpec::PerRowMax(m) => per_row!(ab, m, |a, b| if a > b { a } else { b }),
+                FusedKerSpec::PerRowAdd(m) => per_row!(ab, m, |a, b| a + b),
+                FusedKerSpec::PerRowMul(m) => per_row!(ab, m, |a, b| a * b),
+                FusedKerSpec::PerRowSub(m) => per_row!(ab, m, |a, b| a - b),
+                FusedKerSpec::PerRowSubF(m) => per_row!(ab, m, |a, b| b - a),
+                FusedKerSpec::PerColMin(m) => per_col!(ab, m, |a, b| if a < b { a } else { b }),
+                FusedKerSpec::PerColMax(m) => per_col!(ab, m, |a, b| if a > b { a } else { b }),
+                FusedKerSpec::PerColAdd(m) => per_col!(ab, m, |a, b| a + b),
+                FusedKerSpec::PerColMul(m) => per_col!(ab, m, |a, b| a * b),
+                FusedKerSpec::PerColSub(m) => per_col!(ab, m, |a, b| a - b),
+                FusedKerSpec::PerColSubF(m) => per_col!(ab, m, |a, b| b - a),
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    for i in 0..MR {
+                        for j in 0..NR {
+                            ab[i][j] += *rows.add(i) * *cols.add(j);
+                        }
+                    }
+                }
+                FusedKerSpec::AddUnicast(other) => {
+                    if TI::datum_type().is_float() && other.item_size == 2 {
+                        add_unicast::<MR, NR, TI, f16>(&mut ab, &other)
+                    } else if TI::datum_type().is_float() && other.item_size == 4 {
+                        add_unicast::<MR, NR, TI, f32>(&mut ab, &other)
+                    } else if TI::datum_type().is_float() && other.item_size == 8 {
+                        add_unicast::<MR, NR, TI, f64>(&mut ab, &other)
+                    } else if TI::datum_type() == i32::datum_type() && other.item_size == 1 {
+                        add_unicast::<MR, NR, TI, i8>(&mut ab, &other)
+                    } else if TI::datum_type() == i32::datum_type() && other.item_size == 4 {
+                        add_unicast::<MR, NR, TI, i32>(&mut ab, &other)
+                    } else {
+                        unimplemented!("Missing AddUnicast type");
+                    }
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    for i in 0..MR {
+                        for j in 0..NR {
+                            ab[i][j] = ab[i][j].q_shl(shift);
+                        }
+                    }
+                }
+                FusedKerSpec::RoundingShiftRight(shift, rp) => {
+                    for i in 0..MR {
+                        for j in 0..NR {
+                            ab[i][j] = ab[i][j].q_shr(shift, rp);
+                        }
+                    }
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    for i in 0..MR {
+                        for j in 0..NR {
+                            ab[i][j] = ab[i][j].q_scale(Scaler::from_fuse_params(shift, rp, mult));
+                        }
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing } => {
+                    use std::mem::transmute;
+                    if TI::datum_type().is_float() {
+                        match packing {
+                            0 => add_mat_mul::<MR, NR, TI, TI, TI>(pa, pb, k, &mut ab),
+                            1 => add_mat_mul::<MR, NR, TI, f16, f16>(pa, pb, k, &mut ab),
+                            2 => add_mat_mul::<MR, NR, TI, f32, f32>(pa, pb, k, &mut ab),
+                            3 => add_mat_mul::<MR, NR, TI, f16, f32>(pa, pb, k, &mut ab),
+                            4 => add_mat_mul::<MR, NR, TI, f32, f16>(pa, pb, k, &mut ab),
+                            5 => add_mat_mul_pq40::<MR, NR, f16, TI>(pa, pb, k, &mut ab),
+                            6 => add_mat_mul_pq40_scales_at_end::<MR, NR, f16, TI>(
+                                pa, pb, k, &mut ab,
+                            ),
+                            7 => add_mat_mul_pq40::<MR, NR, f32, TI>(pa, pb, k, &mut ab),
+                            _ => unreachable!(),
+                        }
+                    } else if TI::datum_type() == i32::datum_type() {
+                        // transmute to allow using explicitly i3 in add_mat_mul generic params
+                        let ab = transmute::<&mut [[TI; NR]; MR], &mut [[i32; NR]; MR]>(&mut ab);
+                        if packing == 0 {
+                            add_mat_mul::<MR, NR, i32, i32, i32>(pa, pb, k, ab)
+                        } else if packing == 1 {
+                            add_mat_mul::<MR, NR, i32, i8, i8>(pa, pb, k, ab)
+                        } else {
+                            return 1;
+                        }
+                    } else {
+                        return 1;
+                    }
+                }
+                FusedKerSpec::Store(tile) => {
+                    if TI::datum_type().is_float() {
+                        match tile.item_size {
+                            2 => store_float_t::<MR, NR, f16, _>(&tile, &ab),
+                            4 => store_float_t::<MR, NR, f32, _>(&tile, &ab),
+                            8 => store_float_t::<MR, NR, f64, _>(&tile, &ab),
+                            _ => unimplemented!(),
+                        }
+                    } else {
+                        match tile.item_size {
+                            1 => store_t::<MR, NR, u8, _>(&tile, &ab),
+                            2 => store_t::<MR, NR, u16, _>(&tile, &ab),
+                            4 => store_t::<MR, NR, u32, _>(&tile, &ab),
+                            8 => store_t::<MR, NR, u64, _>(&tile, &ab),
+                            _ => unimplemented!(),
+                        }
+                    }
+                }
+            };
+            pnl = pnl.add(1);
+        }
+    }
+    0
+}
+
+fn pq40_r4() -> PackedBlockQuantFormat {
+    PackedBlockQuantFormat::new(&Q4_0, 4, 0, false)
+}
+
+fn pq40_r4_se() -> PackedBlockQuantFormat {
+    PackedBlockQuantFormat::new(&Q4_0, 4, 0, true)
+}
+
+// f16 kernels
+MMMRustKernel!(kernel::<f16, 4, 4> => generic_f16_4x4<f16>(4,4)
+    packing[1] = f16f16bis => |k| k.with_packing(f16::packing(4), f16::packing(4));
+    packing[2] = f32f32 => |k| k.with_packing(f32::packing(4), f32::packing(4));
+    packing[3] = f16f32 => |k| k.with_packing(f16::packing(4), f32::packing(4));
+    packing[4] = f32f16 => |k| k.with_packing(f32::packing(4), f16::packing(4));
+    packing[5] = q40f16 => |k| k.with_packing(pq40_r4(), f16::packing(4));
+    packing[6] = q40f16se => |k| k.with_packing(pq40_r4_se(), f16::packing(4));
+    packing[7] = q40f32 => |k| k.with_packing(pq40_r4(), f32::packing(4));
+    quality(if has_fp16() { ImplementationQuality::Generic } else { ImplementationQuality::Dreadful })
+    store(f32, f64)
+);
+
+MMMRustKernel! {kernel::<f16, 4, 1> => generic_f16_4x1<f16>(4,1)
+    packing[1] = f16f16bis => |k| k.with_packing(f16::packing(4), f16::packing(1));
+    packing[2] = f32f32 => |k| k.with_packing(f32::packing(4), f32::packing(1));
+    packing[3] = f16f32 => |k| k.with_packing(f16::packing(4), f32::packing(1));
+    packing[4] = f32f16 => |k| k.with_packing(f32::packing(4), f16::packing(1));
+    packing[5] = q40f16 => |k| k.with_packing(pq40_r4(), f16::packing(1));
+    packing[6] = q40f16se => |k| k.with_packing(pq40_r4_se(), f16::packing(1));
+    packing[7] = q40f32 => |k| k.with_packing(pq40_r4(), f32::packing(1));
+    quality(if has_fp16() { ImplementationQuality::Generic } else { ImplementationQuality::Dreadful })
+    store(f32, f64)
+}
+
+// f32 kernels
+MMMRustKernel!(kernel::<f32, 4, 4> => generic_f32_4x4<f32>(4,4)
+    packing[1] = f16f16 => |k| k.with_packing(f16::packing(4), f16::packing(4));
+    packing[2] = f32f32bis => |k| k.with_packing(f32::packing(4), f32::packing(4));
+    packing[3] = f16f32 => |k| k.with_packing(f16::packing(4), f32::packing(4));
+    packing[4] = f32f16 => |k| k.with_packing(f32::packing(4), f16::packing(4));
+    packing[5] = q40f16 => |k| k.with_packing(pq40_r4(), f16::packing(4));
+    packing[6] = q40f16se => |k| k.with_packing(pq40_r4_se(), f16::packing(4));
+    packing[7] = q40f32 => |k| k.with_packing(pq40_r4(), f32::packing(4));
+    quality(ImplementationQuality::Generic)
+    store(f16, f64)
+);
+MMMRustKernel! {kernel::<f32, 4, 1> => generic_f32_4x1<f32>(4,1)
+    packing[1] = f16f16 => |k| k.with_packing(f16::packing(4), f16::packing(1));
+    packing[2] = f32f32bis => |k| k.with_packing(f32::packing(4), f32::packing(1));
+    packing[3] = f16f32 => |k| k.with_packing(f16::packing(4), f32::packing(1));
+    packing[4] = f32f16 => |k| k.with_packing(f32::packing(4), f16::packing(1));
+    packing[5] = q40f16 => |k| k.with_packing(pq40_r4(), f16::packing(1));
+    packing[6] = q40f16se => |k| k.with_packing(pq40_r4_se(), f16::packing(1));
+    packing[7] = q40f32 => |k| k.with_packing(pq40_r4(), f32::packing(1));
+    quality(ImplementationQuality::Generic)
+    store(f16, f64)
+}
+
+// f64 kernels
+MMMRustKernel!(kernel::<f64, 4, 4> => generic_f64_4x4<f64>(4,4)
+    quality(ImplementationQuality::Generic)
+    store(f16, f32));
+MMMRustKernel!(kernel::<f64, 4, 1> => generic_f64_4x1<f64>(4,1)
+    quality(ImplementationQuality::Generic)
+    store(f16, f32));
+
+// I32 kernels
+MMMRustKernel! {kernel::<i32, 4, 4> => generic_i32_4x4<i32>(4,4)
+    packing[1] = i8i8 => |k| k.with_packing(i8::packing(4), i8::packing(4));
+    quality(ImplementationQuality::Generic)
+    store(i8)
+}
+
+MMMRustKernel! {kernel::<i32, 4, 1> => generic_i32_4x1<i32>(4,1)
+    packing[1] = i8i8 => |k| k.with_packing(i8::packing(4), i8::packing(1));
+    quality(ImplementationQuality::Generic)
+    store(i8)
+}
+
+// extra tests kernels
+#[cfg(test)]
+MMMRustKernel!(kernel::<f32, 3, 2> => generic_f32_3x2<f32>(3,2) store(f16, f64));
+
+#[cfg(test)]
+MMMRustKernel! {kernel::<i32, 3, 2> => generic_i32_3x2<i32>(3,2)
+    packing[1] = i8i8 => |k| k.with_packing(i8::packing(3), i8::packing(2));
+    store(i8)
+}
+
+pub fn plug(ops: &mut Ops) {
+    ops.mmm_impls.push(generic_f16_4x4.mmm());
+    ops.mmm_impls.push(generic_f16_4x1.mmm());
+    ops.mmm_impls.push(generic_f32_4x4.mmm());
+    ops.mmm_impls.push(generic_f32_4x1.mmm());
+    ops.mmm_impls.push(generic_f64_4x4.mmm());
+    ops.mmm_impls.push(generic_f64_4x1.mmm());
+    ops.mmm_impls.push(generic_i32_4x4.mmm());
+    ops.mmm_impls.push(generic_i32_4x1.mmm());
+}
+
+#[cfg(test)]
+mod test {
+
+    #[test]
+    fn kits() {
+        let mut ops = crate::generic();
+        super::plug(&mut ops);
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/reduce.rs b/vendor/tract-linalg-0.22.1/src/generic/reduce.rs
new file mode 100644
index 000000000..af38cfb22
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/reduce.rs
@@ -0,0 +1,187 @@
+// Reduce<max> generic implementation
+pub mod max {
+    pub use tract_data::internal::f16;
+
+    reduce_impl_wrap!(
+        f32,
+        SMax4,
+        4,
+        4,
+        (),
+        f32::MIN,
+        fn run(x: &[f32], _: ()) -> f32 {
+            debug_assert!(x.len() % Self::nr() == 0);
+            debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+            *x.iter().max_by(|a, b| a.total_cmp(b)).unwrap()
+        },
+        fn reduce_two(a: f32, b: f32) -> f32 {
+            a.max(b)
+        }
+    );
+
+    reduce_impl_wrap!(
+        f16,
+        HMax8,
+        8,
+        8,
+        (),
+        f16::MIN,
+        fn run(x: &[f16], _: ()) -> f16 {
+            debug_assert!(x.len() % Self::nr() == 0);
+            debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+            *x.iter().max_by(|a, b| a.total_cmp(b)).unwrap()
+        },
+        fn reduce_two(a: f16, b: f16) -> f16 {
+            a.max(b)
+        }
+    );
+
+    #[cfg(test)]
+    #[macro_use]
+    pub mod s {
+        crate::max_frame_tests!(true, f32, crate::generic::reduce::max::SMax4);
+    }
+
+    #[cfg(test)]
+    #[macro_use]
+    pub mod h {
+        use super::*;
+        crate::max_frame_tests!(true, f16, crate::generic::reduce::max::HMax8);
+    }
+}
+
+// Reduce<sum> generic implementation
+pub mod sum {
+    use crate::num_traits::Zero;
+    pub use tract_data::internal::f16;
+
+    reduce_impl_wrap!(
+        f32,
+        SSum4,
+        4,
+        4,
+        (),
+        0.0,
+        fn run(x: &[f32], _: ()) -> f32 {
+            debug_assert!(x.len() % Self::nr() == 0);
+            debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+            x.iter().sum::<f32>()
+        },
+        fn reduce_two(a: f32, b: f32) -> f32 {
+            a + b
+        }
+    );
+
+    reduce_impl_wrap!(
+        f16,
+        HSum8,
+        8,
+        8,
+        (),
+        f16::zero(),
+        fn run(x: &[f16], _: ()) -> f16 {
+            debug_assert!(x.len() % Self::nr() == 0);
+            debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+            x.iter().sum::<f16>()
+        },
+        fn reduce_two(a: f16, b: f16) -> f16 {
+            a + b
+        }
+    );
+
+    #[cfg(test)]
+    #[macro_use]
+    pub mod s {
+        crate::sum_frame_tests!(true, f32, crate::generic::reduce::sum::SSum4);
+    }
+
+    #[cfg(test)]
+    #[macro_use]
+    pub mod h {
+        use super::*;
+        crate::sum_frame_tests!(true, f16, crate::generic::reduce::sum::HSum8);
+    }
+}
+
+// Softmax generic implementation
+pub mod softmax_l2 {
+    use crate::num_traits::Zero;
+    use tract_data::internal::f16;
+
+    map_reduce_impl_wrap!(
+        f32,
+        SSoftMaxL2,
+        4,
+        4,
+        f32,
+        f32::MIN,
+        0.0,
+        fn run(x: &mut [f32], max: f32) -> f32 {
+            debug_assert!(x.len() % Self::nr() == 0);
+            debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+            let mut sum = 0.;
+            for v in x.iter_mut() {
+                let y = *v - max;
+                let y = fast_compact_exp_f32(y);
+                *v = y;
+                sum += y;
+            }
+            sum
+        },
+        fn reduce_two(a: f32, b: f32) -> f32 {
+            a + b
+        }
+    );
+
+    map_reduce_impl_wrap!(
+        f16,
+        HSoftMaxL2,
+        8,
+        8,
+        f16,
+        f16::MIN,
+        f16::zero(),
+        fn run(x: &mut [f16], max: f16) -> f16 {
+            debug_assert!(x.len() % Self::nr() == 0);
+            debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+            let mut sum = f16::zero();
+            for v in x.iter_mut() {
+                let y = *v - max;
+                let y = f16::from_f32(fast_compact_exp_f32(y.to_f32()));
+                *v = y;
+                sum += y;
+            }
+            sum
+        },
+        fn reduce_two(a: f16, b: f16) -> f16 {
+            a + b
+        }
+    );
+
+    // ported from https://github.com/gnuradio/volk/blob/master/kernels/volk/volk_32f_expfast_32f.h
+    // probably inspired from https://nic.schraudolph.org/pubs/Schraudolph99.pdf
+    // not that the cast to u32 deals with negative right, while implem in volk code are wrong in some
+    // corner cases (need a max(0,x) before the u32 conversion)
+    pub fn fast_compact_exp_f32(v: f32) -> f32 {
+        const MLN2: f32 = 0.6931471805f32;
+        const A: f32 = 8388608.0f32;
+        const B: f32 = 1065353216.0f32;
+        const C: f32 = 60801.0f32;
+        const SLOPE: f32 = A / MLN2;
+        const OFFSET: f32 = B - C;
+        f32::from_bits(((SLOPE * v) + OFFSET) as u32)
+    }
+
+    #[cfg(test)]
+    #[macro_use]
+    pub mod s {
+        crate::softmax_l2_frame_tests!(true, f32, super::SSoftMaxL2);
+    }
+
+    #[cfg(test)]
+    #[macro_use]
+    pub mod h {
+        use super::*;
+        crate::softmax_l2_frame_tests!(true, f16, HSoftMaxL2);
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/rounding.rs b/vendor/tract-linalg-0.22.1/src/generic/rounding.rs
new file mode 100644
index 000000000..97aed257c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/rounding.rs
@@ -0,0 +1,524 @@
+use crate::frame::mmm::*;
+use std::hash::{Hash, Hasher};
+use std::ops::Mul;
+use tract_data::prelude::f16;
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct Scaler {
+    pub scale: f32,
+    pub mult: Option<i32>,
+    pub shift: isize,
+    pub policy: RoundingPolicy,
+}
+
+impl Eq for Scaler {}
+
+#[allow(clippy::derived_hash_with_manual_eq)]
+impl Hash for Scaler {
+    fn hash<H>(&self, state: &mut H)
+    where
+        H: Hasher,
+    {
+        Hash::hash(&self.scale.to_bits(), state)
+    }
+}
+
+impl Scaler {
+    pub fn new(scale: f32, policy: RoundingPolicy) -> Self {
+        let (mult, shift) = Self::convert_scale_to_mult_shift(scale);
+        Self { scale, mult, shift, policy }
+    }
+
+    pub fn as_fused_spec(&self) -> FusedSpec<'_> {
+        if let Some(multiplier) = self.mult {
+            FusedSpec::QScale(self.shift, self.policy, multiplier)
+        } else if self.shift > 0 {
+            FusedSpec::RoundingShiftRight(self.shift as usize, self.policy)
+        } else {
+            FusedSpec::ShiftLeft((-self.shift) as usize)
+        }
+    }
+
+    // FIXME: Only to avoid fused op breaking
+    pub fn from_fuse_params(shift: isize, policy: RoundingPolicy, mult: i32) -> Self {
+        let scale = mult as f32 * 2f32.powi(-(31 + shift as i32));
+        Self { scale, mult: Some(mult), shift, policy }
+    }
+
+    #[inline]
+    // This function convert a scale (actually a fraction of two integers Q/D)
+    // into an integer multiplier and a shift (the multiplier being 1/2D in Q0_31).
+    fn convert_scale_to_mult_shift(scale: f32) -> (Option<i32>, isize) {
+        // Zero is a special case to handle
+        if scale == 0.0 {
+            return (None, 0);
+        }
+
+        // Convert f32 to bits representation with the following pattern
+        // Bit |  31  |  30-23   |   22-0    |
+        //     | Sign | Exponent |  Fraction |
+        let scale_bits = scale.to_bits();
+
+        // Get actual value of the exponent
+        let current_exponent = (scale_bits >> 23) & 0xff;
+
+        // Extract fractional part of the float with:
+        // - 0x007fffff that represents the mask of the 23 lower bits (fractional part)
+        // (partial because it doesn't include the hidden bit (24) of the float representation)
+        let partial_frac = scale_bits & 0x007fffff;
+
+        if partial_frac == 0 {
+            let shift = 127 - current_exponent as isize;
+            (None, shift)
+        } else {
+            // We add 0x800000 that represents the hidden bit set to one.
+            // Here the frac is encoded as a Q8_23.
+            let frac = partial_frac | 0x800000;
+
+            // We rescale the result to be in Q0_31
+            // We should have shifted the result by 8 but the frac value is in [1.0, 2.0)
+            // so we cannot do that (we would need one bit for the integer).
+            // Instead we devide the frac by two to be in [0.5, 1.0) in Q0_31
+            // which lead to a shift of (8-1 = 7).
+            let half_frac = (frac << 7) as i32;
+
+            // Compute the actual value of the shift
+            // Here, we remove one as half_frac needs to be multiplied by 2.
+            let shift = 127 - current_exponent as isize - 1;
+            (Some(half_frac), shift)
+        }
+    }
+}
+
+impl Mul<f16> for Scaler {
+    type Output = f16;
+
+    #[inline]
+    fn mul(self, rhs: f16) -> Self::Output {
+        f16::from_f32(self.scale) * rhs
+    }
+}
+
+impl Mul<f32> for Scaler {
+    type Output = f32;
+
+    #[inline]
+    fn mul(self, rhs: f32) -> Self::Output {
+        self.scale * rhs
+    }
+}
+
+impl Mul<f64> for Scaler {
+    type Output = f64;
+
+    #[inline]
+    fn mul(self, rhs: f64) -> Self::Output {
+        self.scale as f64 * rhs
+    }
+}
+
+impl Mul<Scaler> for f16 {
+    type Output = f16;
+
+    #[inline]
+    fn mul(self, rhs: Scaler) -> Self::Output {
+        rhs * self
+    }
+}
+
+impl Mul<Scaler> for f32 {
+    type Output = f32;
+
+    #[inline]
+    fn mul(self, rhs: Scaler) -> Self::Output {
+        rhs * self
+    }
+}
+
+impl Mul<Scaler> for f64 {
+    type Output = f64;
+
+    #[inline]
+    fn mul(self, rhs: Scaler) -> Self::Output {
+        rhs * self
+    }
+}
+
+impl Mul<i32> for Scaler {
+    type Output = i32;
+
+    #[inline]
+    fn mul(self, rhs: i32) -> Self::Output {
+        let (val, shift) = if let Some(multiplier) = self.mult {
+            (multiplier as i64 * rhs as i64, self.shift + 31)
+        } else {
+            (rhs as i64, self.shift)
+        };
+
+        // Round according to rounding policy
+        use RoundingPolicy::*;
+        if shift > 0 {
+            let half: i64 = 1 << (shift - 1);
+            let nudge: i64 = match self.policy {
+                Zero => -1,
+                MinusInf => -((val >= 0) as i64),
+                PlusInf => -((val <= 0) as i64),
+                Away => 0,
+                Even => ((val.abs() >> shift) & 0x1) - 1,
+                Odd => -((val.abs() >> shift) & 0x1),
+                _ => panic!(),
+            };
+
+            (val.signum() * ((val.abs() + half + nudge) >> shift)) as i32
+        } else {
+            (val << -shift) as i32
+        }
+    }
+}
+
+impl Mul<Scaler> for i32 {
+    type Output = i32;
+
+    #[inline]
+    fn mul(self, rhs: Scaler) -> Self::Output {
+        rhs * self
+    }
+}
+
+pub trait ScaleShiftAndRound {
+    fn q_scale(self, scaler: Scaler) -> Self;
+    fn q_shl(self, shift: usize) -> Self;
+    fn q_shr(self, shift: usize, rp: RoundingPolicy) -> Self;
+}
+
+impl ScaleShiftAndRound for f64 {
+    fn q_scale(self, scaler: Scaler) -> Self {
+        self * scaler
+    }
+    fn q_shl(self, shift: usize) -> Self {
+        self * 2f64.powi(shift as i32)
+    }
+    fn q_shr(self, shift: usize, _rp: RoundingPolicy) -> Self {
+        self * 2f64.powi(-(shift as i32))
+    }
+}
+
+impl ScaleShiftAndRound for f32 {
+    fn q_scale(self, scaler: Scaler) -> Self {
+        self * scaler
+    }
+    fn q_shl(self, shift: usize) -> Self {
+        self * 2f32.powi(shift as i32)
+    }
+    fn q_shr(self, shift: usize, _rp: RoundingPolicy) -> Self {
+        self * 2f32.powi(-(shift as i32))
+    }
+}
+
+impl ScaleShiftAndRound for f16 {
+    fn q_scale(self, scaler: Scaler) -> Self {
+        self * scaler
+    }
+    fn q_shl(self, shift: usize) -> Self {
+        self * f16::from_f32(2f32.powi(shift as i32))
+    }
+    fn q_shr(self, shift: usize, _rp: RoundingPolicy) -> Self {
+        self * f16::from_f32(2f32.powi(-(shift as i32)))
+    }
+}
+
+impl ScaleShiftAndRound for i32 {
+    fn q_scale(self, scaler: Scaler) -> Self {
+        self * scaler
+    }
+    fn q_shr(self, shift: usize, rp: RoundingPolicy) -> Self {
+        use RoundingPolicy::*;
+        let half: i32 = 1 << (shift - 1);
+        let nudge: i32 = match rp {
+            Zero => -1,
+            MinusInf => -((self >= 0) as i32),
+            PlusInf => -((self <= 0) as i32),
+            Away => 0,
+            Even => ((self.abs() >> shift) & 0x1) - 1,
+            Odd => -((self.abs() >> shift) & 0x1),
+            _ => panic!(),
+        };
+        self.signum() * ((self.abs() + half + nudge) >> shift)
+    }
+    fn q_shl(self, shift: usize) -> Self {
+        self << shift
+    }
+}
+
+// 6 / 4 -> 1.5 -> arrondi: 2.  rien a faire
+// 2 / 4 -> 0.5 -> arrondi: 1. veut 0 -> nudge = -1
+
+#[cfg(test)]
+mod test {
+    use super::RoundingPolicy::*;
+    use super::*;
+
+    #[test]
+    fn test_scale_rounding_f32() {
+        assert_eq!(0f32.q_scale(Scaler::new(0.5, Zero)), 0.0);
+        assert_eq!(1f32.q_scale(Scaler::new(0.5, Zero)), 0.5);
+        assert_eq!(2f32.q_scale(Scaler::new(0.5, Zero)), 1.0);
+        assert_eq!(3f32.q_scale(Scaler::new(0.5, Zero)), 1.5);
+        assert_eq!((-1f32).q_scale(Scaler::new(0.5, Zero)), -0.5);
+        assert_eq!((-2f32).q_scale(Scaler::new(0.5, Zero)), -1.0);
+        assert_eq!((-3f32).q_scale(Scaler::new(0.5, Zero)), -1.5);
+    }
+
+    #[test]
+    fn test_shift_rounding_zero() {
+        assert_eq!(0i32.q_shr(1, Zero), 0);
+        assert_eq!(1i32.q_shr(1, Zero), 0);
+        assert_eq!(2i32.q_shr(1, Zero), 1);
+        assert_eq!(3i32.q_shr(1, Zero), 1);
+        assert_eq!(0i32.q_shr(2, Zero), 0);
+        assert_eq!(1i32.q_shr(2, Zero), 0);
+        assert_eq!(2i32.q_shr(2, Zero), 0);
+        assert_eq!(3i32.q_shr(2, Zero), 1);
+        assert_eq!(4i32.q_shr(2, Zero), 1);
+        assert_eq!(5i32.q_shr(2, Zero), 1);
+        assert_eq!(6i32.q_shr(2, Zero), 1);
+        assert_eq!((-1i32).q_shr(2, Zero), 0);
+        assert_eq!((-2i32).q_shr(2, Zero), 0);
+        assert_eq!((-3i32).q_shr(2, Zero), -1);
+        assert_eq!((-4i32).q_shr(2, Zero), -1);
+        assert_eq!((-5i32).q_shr(2, Zero), -1);
+        assert_eq!((-6i32).q_shr(2, Zero), -1);
+    }
+
+    #[test]
+    fn test_scale_rounding_zero() {
+        assert_eq!(0i32.q_scale(Scaler::new(0.5, Zero)), 0);
+        assert_eq!(1i32.q_scale(Scaler::new(0.5, Zero)), 0);
+        assert_eq!(2i32.q_scale(Scaler::new(0.5, Zero)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.5, Zero)), 1);
+        assert_eq!((-1i32).q_scale(Scaler::new(0.5, Zero)), 0);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.5, Zero)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.5, Zero)), -1);
+        assert_eq!(2i32.q_scale(Scaler::new(0.25, Zero)), 0);
+        assert_eq!(3i32.q_scale(Scaler::new(0.25, Zero)), 1);
+        assert_eq!(4i32.q_scale(Scaler::new(0.25, Zero)), 1);
+        assert_eq!(5i32.q_scale(Scaler::new(0.25, Zero)), 1);
+        assert_eq!(6i32.q_scale(Scaler::new(0.25, Zero)), 1);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.25, Zero)), 0);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.25, Zero)), -1);
+        assert_eq!((-4i32).q_scale(Scaler::new(0.25, Zero)), -1);
+        assert_eq!((-5i32).q_scale(Scaler::new(0.25, Zero)), -1);
+        assert_eq!((-6i32).q_scale(Scaler::new(0.25, Zero)), -1);
+    }
+
+    #[test]
+    fn test_shift_rounding_away() {
+        assert_eq!(0i32.q_shr(1, Away), 0);
+        assert_eq!(1i32.q_shr(1, Away), 1);
+        assert_eq!(2i32.q_shr(1, Away), 1);
+        assert_eq!(3i32.q_shr(1, Away), 2);
+        assert_eq!(0i32.q_shr(2, Away), 0);
+        assert_eq!(1i32.q_shr(2, Away), 0);
+        assert_eq!(2i32.q_shr(2, Away), 1);
+        assert_eq!(3i32.q_shr(2, Away), 1);
+        assert_eq!(4i32.q_shr(2, Away), 1);
+        assert_eq!(5i32.q_shr(2, Away), 1);
+        assert_eq!(6i32.q_shr(2, Away), 2);
+        assert_eq!((-1i32).q_shr(2, Away), 0);
+        assert_eq!((-2i32).q_shr(2, Away), -1);
+        assert_eq!((-3i32).q_shr(2, Away), -1);
+        assert_eq!((-4i32).q_shr(2, Away), -1);
+        assert_eq!((-5i32).q_shr(2, Away), -1);
+        assert_eq!((-6i32).q_shr(2, Away), -2);
+    }
+
+    #[test]
+    fn test_scale_rounding_away() {
+        assert_eq!(0i32.q_scale(Scaler::new(0.5, Away)), 0);
+        assert_eq!(1i32.q_scale(Scaler::new(0.5, Away)), 1);
+        assert_eq!(2i32.q_scale(Scaler::new(0.5, Away)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.5, Away)), 2);
+        assert_eq!((-1i32).q_scale(Scaler::new(0.5, Away)), -1);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.5, Away)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.5, Away)), -2);
+        assert_eq!(2i32.q_scale(Scaler::new(0.25, Away)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.25, Away)), 1);
+        assert_eq!(4i32.q_scale(Scaler::new(0.25, Away)), 1);
+        assert_eq!(5i32.q_scale(Scaler::new(0.25, Away)), 1);
+        assert_eq!(6i32.q_scale(Scaler::new(0.25, Away)), 2);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.25, Away)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.25, Away)), -1);
+        assert_eq!((-4i32).q_scale(Scaler::new(0.25, Away)), -1);
+        assert_eq!((-5i32).q_scale(Scaler::new(0.25, Away)), -1);
+        assert_eq!((-6i32).q_scale(Scaler::new(0.25, Away)), -2);
+    }
+
+    #[test]
+    fn test_shift_rounding_plus_inf() {
+        assert_eq!(0i32.q_shr(1, PlusInf), 0);
+        assert_eq!(1i32.q_shr(1, PlusInf), 1);
+        assert_eq!(2i32.q_shr(1, PlusInf), 1);
+        assert_eq!(3i32.q_shr(1, PlusInf), 2);
+        assert_eq!(0i32.q_shr(2, PlusInf), 0);
+        assert_eq!(1i32.q_shr(2, PlusInf), 0);
+        assert_eq!(2i32.q_shr(2, PlusInf), 1);
+        assert_eq!(3i32.q_shr(2, PlusInf), 1);
+        assert_eq!(4i32.q_shr(2, PlusInf), 1);
+        assert_eq!(5i32.q_shr(2, PlusInf), 1);
+        assert_eq!(6i32.q_shr(2, PlusInf), 2);
+        assert_eq!((-1i32).q_shr(2, PlusInf), 0);
+        assert_eq!((-2i32).q_shr(2, PlusInf), 0);
+        assert_eq!((-3i32).q_shr(2, PlusInf), -1);
+        assert_eq!((-4i32).q_shr(2, PlusInf), -1);
+        assert_eq!((-5i32).q_shr(2, PlusInf), -1);
+        assert_eq!((-6i32).q_shr(2, PlusInf), -1);
+    }
+
+    #[test]
+    fn test_scale_rounding_plus_inf() {
+        assert_eq!(0i32.q_scale(Scaler::new(0.5, PlusInf)), 0);
+        assert_eq!(1i32.q_scale(Scaler::new(0.5, PlusInf)), 1);
+        assert_eq!(2i32.q_scale(Scaler::new(0.5, PlusInf)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.5, PlusInf)), 2);
+        assert_eq!((-1i32).q_scale(Scaler::new(0.5, PlusInf)), 0);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.5, PlusInf)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.5, PlusInf)), -1);
+        assert_eq!(2i32.q_scale(Scaler::new(0.25, PlusInf)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.25, PlusInf)), 1);
+        assert_eq!(4i32.q_scale(Scaler::new(0.25, PlusInf)), 1);
+        assert_eq!(5i32.q_scale(Scaler::new(0.25, PlusInf)), 1);
+        assert_eq!(6i32.q_scale(Scaler::new(0.25, PlusInf)), 2);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.25, PlusInf)), 0);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.25, PlusInf)), -1);
+        assert_eq!((-4i32).q_scale(Scaler::new(0.25, PlusInf)), -1);
+        assert_eq!((-5i32).q_scale(Scaler::new(0.25, PlusInf)), -1);
+        assert_eq!((-6i32).q_scale(Scaler::new(0.25, PlusInf)), -1);
+    }
+
+    #[test]
+    fn test_shift_rounding_minus_inf() {
+        assert_eq!(0i32.q_shr(1, MinusInf), 0);
+        assert_eq!(1i32.q_shr(1, MinusInf), 0);
+        assert_eq!(2i32.q_shr(1, MinusInf), 1);
+        assert_eq!(3i32.q_shr(1, MinusInf), 1);
+        assert_eq!(0i32.q_shr(2, MinusInf), 0);
+        assert_eq!(1i32.q_shr(2, MinusInf), 0);
+        assert_eq!(2i32.q_shr(2, MinusInf), 0);
+        assert_eq!(3i32.q_shr(2, MinusInf), 1);
+        assert_eq!(4i32.q_shr(2, MinusInf), 1);
+        assert_eq!(5i32.q_shr(2, MinusInf), 1);
+        assert_eq!(6i32.q_shr(2, MinusInf), 1);
+        assert_eq!((-1i32).q_shr(2, MinusInf), 0);
+        assert_eq!((-2i32).q_shr(2, MinusInf), -1);
+        assert_eq!((-3i32).q_shr(2, MinusInf), -1);
+        assert_eq!((-4i32).q_shr(2, MinusInf), -1);
+        assert_eq!((-5i32).q_shr(2, MinusInf), -1);
+        assert_eq!((-6i32).q_shr(2, MinusInf), -2);
+    }
+
+    #[test]
+    fn test_scale_rounding_minus_inf() {
+        assert_eq!(0i32.q_scale(Scaler::new(0.5, MinusInf)), 0);
+        assert_eq!(1i32.q_scale(Scaler::new(0.5, MinusInf)), 0);
+        assert_eq!(2i32.q_scale(Scaler::new(0.5, MinusInf)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.5, MinusInf)), 1);
+        assert_eq!((-1i32).q_scale(Scaler::new(0.5, MinusInf)), -1);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.5, MinusInf)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.5, MinusInf)), -2);
+        assert_eq!(2i32.q_scale(Scaler::new(0.25, MinusInf)), 0);
+        assert_eq!(3i32.q_scale(Scaler::new(0.25, MinusInf)), 1);
+        assert_eq!(4i32.q_scale(Scaler::new(0.25, MinusInf)), 1);
+        assert_eq!(5i32.q_scale(Scaler::new(0.25, MinusInf)), 1);
+        assert_eq!(6i32.q_scale(Scaler::new(0.25, MinusInf)), 1);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.25, MinusInf)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.25, MinusInf)), -1);
+        assert_eq!((-4i32).q_scale(Scaler::new(0.25, MinusInf)), -1);
+        assert_eq!((-5i32).q_scale(Scaler::new(0.25, MinusInf)), -1);
+        assert_eq!((-6i32).q_scale(Scaler::new(0.25, MinusInf)), -2);
+        //assert_eq!((-9i32).q_scale(ONE_OVER_TWO_IN_Q0_30, 5, MinusInf), 0);
+    }
+
+    #[test]
+    fn test_shift_rounding_even() {
+        assert_eq!(0i32.q_shr(1, Even), 0);
+        assert_eq!(1i32.q_shr(1, Even), 0);
+        assert_eq!(2i32.q_shr(1, Even), 1);
+        assert_eq!(3i32.q_shr(1, Even), 2);
+        assert_eq!(0i32.q_shr(2, Even), 0);
+        assert_eq!(1i32.q_shr(2, Even), 0);
+        assert_eq!(2i32.q_shr(2, Even), 0);
+        assert_eq!(3i32.q_shr(2, Even), 1);
+        assert_eq!(4i32.q_shr(2, Even), 1);
+        assert_eq!(5i32.q_shr(2, Even), 1);
+        assert_eq!(6i32.q_shr(2, Even), 2);
+        assert_eq!((-1i32).q_shr(2, Even), 0);
+        assert_eq!((-2i32).q_shr(2, Even), 0);
+        assert_eq!((-3i32).q_shr(2, Even), -1);
+        assert_eq!((-4i32).q_shr(2, Even), -1);
+        assert_eq!((-5i32).q_shr(2, Even), -1);
+        assert_eq!((-6i32).q_shr(2, Even), -2);
+    }
+
+    #[test]
+    fn test_scale_rounding_even() {
+        assert_eq!(0i32.q_scale(Scaler::new(0.5, Even)), 0);
+        assert_eq!(1i32.q_scale(Scaler::new(0.5, Even)), 0);
+        assert_eq!(2i32.q_scale(Scaler::new(0.5, Even)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.5, Even)), 2);
+        assert_eq!((-1i32).q_scale(Scaler::new(0.5, Even)), 0);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.5, Even)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.5, Even)), -2);
+        assert_eq!(2i32.q_scale(Scaler::new(0.25, Even)), 0);
+        assert_eq!(3i32.q_scale(Scaler::new(0.25, Even)), 1);
+        assert_eq!(4i32.q_scale(Scaler::new(0.25, Even)), 1);
+        assert_eq!(5i32.q_scale(Scaler::new(0.25, Even)), 1);
+        assert_eq!(6i32.q_scale(Scaler::new(0.25, Even)), 2);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.25, Even)), 0);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.25, Even)), -1);
+        assert_eq!((-4i32).q_scale(Scaler::new(0.25, Even)), -1);
+        assert_eq!((-5i32).q_scale(Scaler::new(0.25, Even)), -1);
+        assert_eq!((-6i32).q_scale(Scaler::new(0.25, Even)), -2);
+    }
+
+    #[test]
+    fn test_shift_rounding_odd() {
+        assert_eq!(0i32.q_shr(1, Odd), 0);
+        assert_eq!(1i32.q_shr(1, Odd), 1);
+        assert_eq!(2i32.q_shr(1, Odd), 1);
+        assert_eq!(3i32.q_shr(1, Odd), 1);
+        assert_eq!(0i32.q_shr(2, Odd), 0);
+        assert_eq!(1i32.q_shr(2, Odd), 0);
+        assert_eq!(2i32.q_shr(2, Odd), 1);
+        assert_eq!(3i32.q_shr(2, Odd), 1);
+        assert_eq!(4i32.q_shr(2, Odd), 1);
+        assert_eq!(5i32.q_shr(2, Odd), 1);
+        assert_eq!(6i32.q_shr(2, Odd), 1);
+        assert_eq!((-1i32).q_shr(2, Odd), 0);
+        assert_eq!((-2i32).q_shr(2, Odd), -1);
+        assert_eq!((-3i32).q_shr(2, Odd), -1);
+        assert_eq!((-4i32).q_shr(2, Odd), -1);
+        assert_eq!((-5i32).q_shr(2, Odd), -1);
+        assert_eq!((-6i32).q_shr(2, Odd), -1);
+    }
+
+    #[test]
+    fn test_scale_rounding_odd() {
+        assert_eq!(0i32.q_scale(Scaler::new(0.5, Odd)), 0);
+        assert_eq!(1i32.q_scale(Scaler::new(0.5, Odd)), 1);
+        assert_eq!(2i32.q_scale(Scaler::new(0.5, Odd)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.5, Odd)), 1);
+        assert_eq!((-1i32).q_scale(Scaler::new(0.5, Odd)), -1);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.5, Odd)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.5, Odd)), -1);
+        assert_eq!(2i32.q_scale(Scaler::new(0.25, Odd)), 1);
+        assert_eq!(3i32.q_scale(Scaler::new(0.25, Odd)), 1);
+        assert_eq!(4i32.q_scale(Scaler::new(0.25, Odd)), 1);
+        assert_eq!(5i32.q_scale(Scaler::new(0.25, Odd)), 1);
+        assert_eq!(6i32.q_scale(Scaler::new(0.25, Odd)), 1);
+        assert_eq!((-2i32).q_scale(Scaler::new(0.25, Odd)), -1);
+        assert_eq!((-3i32).q_scale(Scaler::new(0.25, Odd)), -1);
+        assert_eq!((-4i32).q_scale(Scaler::new(0.25, Odd)), -1);
+        assert_eq!((-5i32).q_scale(Scaler::new(0.25, Odd)), -1);
+        assert_eq!((-6i32).q_scale(Scaler::new(0.25, Odd)), -1);
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/sigmoid.rs b/vendor/tract-linalg-0.22.1/src/generic/sigmoid.rs
new file mode 100644
index 000000000..c344757d4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/sigmoid.rs
@@ -0,0 +1,138 @@
+#![allow(clippy::excessive_precision)]
+use crate::frame::element_wise::ElementWiseKer;
+use tract_data::internal::*;
+
+pub fn ssigmoid(x: f32) -> f32 {
+    const LOW: f32 = -18.6;
+    const HIGH: f32 = -LOW;
+
+    const ALPHA_13: f32 = -4.433153405e-18;
+    const ALPHA_11: f32 = 1.169974371e-14;
+    const ALPHA_9: f32 = -1.875289645e-11;
+    const ALPHA_7: f32 = 4.257889523e-8;
+    const ALPHA_5: f32 = 0.00004811817576;
+    const ALPHA_3: f32 = 0.008163842030;
+    const ALPHA_1: f32 = 0.2499999971;
+    const BETA_6: f32 = 3.922935744e-6;
+    const BETA_4: f32 = 0.001524872358;
+    const BETA_2: f32 = 0.1159886749;
+    const BETA_0: f32 = 1.0;
+
+    let x = x.clamp(LOW, HIGH);
+
+    let x2 = x * x;
+
+    let p = ALPHA_13;
+    let p = x2 * p + ALPHA_11;
+    let p = x2 * p + ALPHA_9;
+    let p = x2 * p + ALPHA_7;
+    let p = x2 * p + ALPHA_5;
+    let p = x2 * p + ALPHA_3;
+    let p = x2 * p + ALPHA_1;
+    let p = p * x;
+
+    let q = BETA_6;
+    let q = x2 * q + BETA_4;
+    let q = x2 * q + BETA_2;
+    let q = x2 * q + BETA_0;
+
+    p / q + 0.5
+}
+
+pub fn hsigmoid(x: f16) -> f16 {
+    /*
+     * (x (0.249895 + x^2 (0.00400222 - 0.0000124702 x^2)))
+     * /
+     * (1. + 0.098734 x^2)
+     */
+
+    const LOW: f16 = f16::from_f32_const(-6.92);
+    const HIGH: f16 = f16::from_f32_const(6.92);
+
+    const ALPHA_5: f16 = f16::from_f32_const(-0.0000124702);
+    const ALPHA_3: f16 = f16::from_f32_const(0.00400222);
+    const ALPHA_1: f16 = f16::from_f32_const(0.249895);
+
+    const BETA_2: f16 = f16::from_f32_const(0.098734);
+    const BETA_0: f16 = f16::from_f32_const(1.0);
+
+    let x = x.clamp(LOW, HIGH);
+
+    let x2 = x * x;
+
+    let p = ALPHA_5;
+    let p = x2 * p + ALPHA_3;
+    let p = x2 * p + ALPHA_1;
+    let p = p * x;
+
+    let q = BETA_2;
+    let q = x2 * q + BETA_0;
+
+    p / q + f16::from_f32_const(0.5)
+}
+
+#[derive(Clone, Debug)]
+pub struct SSigmoid4;
+
+impl ElementWiseKer<f32> for SSigmoid4 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(x: &mut [f32], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = ssigmoid(*px))
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct HSigmoid8;
+
+impl ElementWiseKer<f16> for HSigmoid8 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn alignment_items() -> usize {
+        4
+    }
+
+    fn nr() -> usize {
+        8
+    }
+
+    fn run(x: &mut [f16], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = hsigmoid(*px))
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod s {
+    sigmoid_frame_tests!(true, f32, crate::generic::sigmoid::SSigmoid4);
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod h {
+    sigmoid_frame_tests!(true, tract_data::internal::f16, crate::generic::sigmoid::HSigmoid8);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/tanh.rs b/vendor/tract-linalg-0.22.1/src/generic/tanh.rs
new file mode 100644
index 000000000..2c7542dd2
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/tanh.rs
@@ -0,0 +1,133 @@
+#![allow(clippy::excessive_precision)]
+use crate::frame::element_wise::ElementWiseKer;
+use tract_data::internal::*;
+
+pub fn stanh(x: f32) -> f32 {
+    const LOW: f32 = -8.9;
+    const HIGH: f32 = 8.9;
+
+    const ALPHA_13: f32 = -8.488492677e-14;
+    const ALPHA_11: f32 = 5.277853000e-11;
+    const ALPHA_9: f32 = -2.022500419e-8;
+    const ALPHA_7: f32 = 0.00001115424833;
+    const ALPHA_5: f32 = 0.003103950131;
+    const ALPHA_3: f32 = 0.1308400453;
+    const ALPHA_1: f32 = 0.9999999934;
+
+    const BETA_6: f32 = 0.0002546136580;
+    const BETA_4: f32 = 0.02449515379;
+    const BETA_2: f32 = 0.4641733162;
+    const BETA_0: f32 = 1.0;
+
+    let x = x.clamp(LOW, HIGH);
+
+    let x2 = x * x;
+
+    let p = ALPHA_13;
+    let p = x2 * p + ALPHA_11;
+    let p = x2 * p + ALPHA_9;
+    let p = x2 * p + ALPHA_7;
+    let p = x2 * p + ALPHA_5;
+    let p = x2 * p + ALPHA_3;
+    let p = x2 * p + ALPHA_1;
+    let p = p * x;
+
+    let q = BETA_6;
+    let q = x2 * q + BETA_4;
+    let q = x2 * q + BETA_2;
+    let q = x2 * q + BETA_0;
+
+    p / q
+}
+
+pub fn htanh(x: f16) -> f16 {
+    const LOW: f16 = f16::from_f32_const(-3.84);
+    const HIGH: f16 = f16::from_f32_const(3.84);
+
+    const ALPHA_3: f16 = f16::from_f32_const(0.082654955);
+    const ALPHA_1: f16 = f16::from_f32_const(0.99963124);
+
+    const BETA_4: f16 = f16::from_f32_const(0.0065383179);
+    const BETA_2: f16 = f16::from_f32_const(0.41401828);
+    const BETA_0: f16 = f16::from_f32_const(1.0);
+
+    let x = x.clamp(LOW, HIGH);
+
+    let x2 = x * x;
+
+    let p = ALPHA_3;
+    let p = x2 * p + ALPHA_1;
+    let p = p * x;
+
+    let q = BETA_4;
+    let q = x2 * q + BETA_2;
+    let q = x2 * q + BETA_0;
+
+    p / q
+}
+
+#[derive(Clone, Debug)]
+pub struct STanh4;
+
+impl ElementWiseKer<f32> for STanh4 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_items() -> usize {
+        16
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn nr() -> usize {
+        4
+    }
+
+    fn run(x: &mut [f32], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = stanh(*px))
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod s {
+    tanh_frame_tests!(true, f32, crate::generic::tanh::STanh4);
+}
+
+#[derive(Clone, Debug)]
+pub struct HTanh8;
+
+impl ElementWiseKer<f16> for HTanh8 {
+    fn name() -> &'static str {
+        "generic"
+    }
+
+    fn alignment_items() -> usize {
+        16
+    }
+
+    fn alignment_bytes() -> usize {
+        16
+    }
+
+    fn nr() -> usize {
+        8
+    }
+
+    fn run(x: &mut [f16], _: ()) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        x.iter_mut().for_each(|px| *px = htanh(*px))
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod h {
+    tanh_frame_tests!(true, tract_data::internal::f16, crate::generic::tanh::HTanh8);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/generic/unicast.rs b/vendor/tract-linalg-0.22.1/src/generic/unicast.rs
new file mode 100644
index 000000000..2d7d4875b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/generic/unicast.rs
@@ -0,0 +1,194 @@
+pub use tract_data::internal::f16;
+unicast_impl_wrap!(
+    f32,
+    SUnicastMul4,
+    4,
+    4,
+    fn run(a: &mut [f32], b: &[f32]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a *= b)
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    HUnicastMul8,
+    8,
+    8,
+    fn run(a: &mut [f16], b: &[f16]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a *= b)
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    SUnicastAdd4,
+    4,
+    4,
+    fn run(a: &mut [f32], b: &[f32]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a += b)
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    HUnicastAdd8,
+    8,
+    8,
+    fn run(a: &mut [f16], b: &[f16]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a += b)
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    SUnicastSub4,
+    4,
+    4,
+    fn run(a: &mut [f32], b: &[f32]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a -= b)
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    HUnicastSub8,
+    8,
+    8,
+    fn run(a: &mut [f16], b: &[f16]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a -= b)
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    SUnicastSubF4,
+    4,
+    4,
+    fn run(a: &mut [f32], b: &[f32]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = *b - *a)
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    HUnicastSubF8,
+    8,
+    8,
+    fn run(a: &mut [f16], b: &[f16]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = *b - *a)
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    SUnicastMin4,
+    4,
+    4,
+    fn run(a: &mut [f32], b: &[f32]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = a.min(*b))
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    HUnicastMin8,
+    8,
+    8,
+    fn run(a: &mut [f16], b: &[f16]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = a.min(*b))
+    }
+);
+
+unicast_impl_wrap!(
+    f32,
+    SUnicastMax4,
+    4,
+    4,
+    fn run(a: &mut [f32], b: &[f32]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = a.max(*b))
+    }
+);
+
+unicast_impl_wrap!(
+    f16,
+    HUnicastMax8,
+    8,
+    8,
+    fn run(a: &mut [f16], b: &[f16]) {
+        debug_assert!(a.len() == b.len());
+        debug_assert!(a.len() % Self::nr() == 0);
+        debug_assert!(a.as_ptr() as usize % Self::alignment_bytes() == 0);
+        debug_assert!(b.as_ptr() as usize % Self::alignment_bytes() == 0);
+        a.iter_mut().zip(b.iter()).for_each(|(a, b)| *a = a.max(*b))
+    }
+);
+
+#[cfg(test)]
+#[macro_use]
+pub mod s {
+    use super::*;
+    use proptest::strategy::Strategy;
+    crate::unicast_frame_tests!(true, f32, SUnicastMul4, |a, b| a * b);
+    crate::unicast_frame_tests!(true, f32, SUnicastAdd4, |a, b| a + b);
+    crate::unicast_frame_tests!(true, f32, SUnicastSub4, |a, b| a - b);
+    crate::unicast_frame_tests!(true, f32, SUnicastSubF4, |a, b| b - a);
+    crate::unicast_frame_tests!(true, f32, SUnicastMin4, |a, b| a.min(b));
+    crate::unicast_frame_tests!(true, f32, SUnicastMax4, |a, b| a.max(b));
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod h {
+    use super::*;
+    use proptest::strategy::Strategy;
+    crate::unicast_frame_tests!(true, f16, HUnicastMul8, |a, b| a * b);
+    crate::unicast_frame_tests!(true, f16, HUnicastAdd8, |a, b| a + b);
+    crate::unicast_frame_tests!(true, f16, HUnicastSub8, |a, b| a - b);
+    crate::unicast_frame_tests!(true, f16, HUnicastSubF8, |a, b| b - a);
+    crate::unicast_frame_tests!(true, f16, HUnicastMin8, |a, b| a.min(b));
+    crate::unicast_frame_tests!(true, f16, HUnicastMax8, |a, b| a.max(b));
+}
diff --git a/vendor/tract-linalg-0.22.1/src/hwbench/bandwidth.rs b/vendor/tract-linalg-0.22.1/src/hwbench/bandwidth.rs
new file mode 100644
index 000000000..74c6a0e50
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/hwbench/bandwidth.rs
@@ -0,0 +1,159 @@
+use tract_data::itertools::Itertools;
+use tract_data::prelude::Blob;
+
+use super::runner;
+
+#[cfg(target_arch = "x86_64")]
+static mut HAS_AVX512: bool = false;
+
+#[cfg(target_arch = "x86_64")]
+#[inline(never)]
+fn load_a_slice(slice: &[u8], loops: usize) {
+    unsafe {
+        if HAS_AVX512 {
+            for _ in 0..loops {
+                let mut ptr = slice.as_ptr();
+                let end = ptr.add(slice.len());
+                while ptr < end {
+                    std::arch::asm!("
+                vmovaps zmm0, [rsi]
+                vmovaps zmm1, [rsi + 64]
+                vmovaps zmm2, [rsi + 128]
+                vmovaps zmm3, [rsi + 192]
+                vmovaps zmm4, [rsi + 256]
+                vmovaps zmm5, [rsi + 320]
+                vmovaps zmm6, [rsi + 384]
+                vmovaps zmm7, [rsi + 448]
+                    ", inout("rsi") ptr,
+                    out("zmm0") _,
+                    out("zmm1") _,
+                    );
+                    ptr = ptr.add(512);
+                }
+            }
+        } else {
+            let mut ptr = slice.as_ptr();
+            let end = ptr.add(slice.len());
+            for _ in 0..loops {
+                while ptr < end {
+                    std::arch::asm!("
+                vmovaps ymm0, [rsi]
+                vmovaps ymm1, [rsi + 32]
+                vmovaps ymm2, [rsi + 64]
+                vmovaps ymm3, [rsi + 96]
+                    ", inout("rsi") ptr,
+                    out("ymm0") _,
+                    out("ymm1") _,
+                    out("ymm2") _,
+                    out("ymm3") _,
+                    );
+                    ptr = ptr.add(128);
+                }
+            }
+        }
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+#[inline]
+fn load_a_slice(slice: &[u8], loops: usize) {
+    unsafe {
+        for _ in 0..loops {
+            let mut ptr = slice.as_ptr();
+            let end = ptr.add(slice.len());
+            while ptr < end {
+                std::arch::asm!("
+                    ld1 {{v0.16b-v3.16b}}, [x0], #64
+                    ld1 {{v4.16b-v7.16b}}, [x0], #64
+                        ", inout("x0") ptr,
+                out("v0") _,
+                out("v1") _,
+                out("v2") _,
+                out("v3") _,
+                out("v4") _,
+                out("v5") _,
+                out("v6") _,
+                out("v7") _,
+                );
+            }
+        }
+    }
+}
+
+#[cfg(target_arch = "arm")]
+#[inline(never)]
+fn load_a_slice(slice: &[u8], loops: usize) {
+    unsafe {
+        for _ in 0..loops {
+            let mut ptr = slice.as_ptr();
+            let end = ptr.add(slice.len());
+            while ptr < end {
+                std::arch::asm!("
+                vldmia r1!, {{q0-q3}}
+                vldmia r1!, {{q4-q7}}
+                    ", inout("r1") ptr,
+                out("d0") _, out("d1") _, out("d2") _, out("d3") _,
+                out("d4") _, out("d5") _, out("d6") _, out("d7") _,
+                out("d8") _, out("d9") _, out("d10") _, out("d11") _,
+                out("d12") _, out("d13") _, out("d14") _, out("d15") _,
+                );
+            }
+        }
+    }
+}
+
+fn bandwidth_seq(slice_len: usize, threads: usize) -> f64 {
+    #[cfg(target_arch = "x86_64")]
+    unsafe {
+        HAS_AVX512 = std::is_x86_feature_detected!("avx512f");
+    }
+    std::thread::scope(|s| {
+        let gards = (0..threads)
+            .map(|_| {
+                s.spawn(|| {
+                    let buffer = unsafe { Blob::new_for_size_and_align(slice_len, 1024) };
+                    runner::run_bench(|loops| load_a_slice(&buffer, loops))
+                })
+            })
+            .collect_vec();
+        let time = gards.into_iter().map(|t| t.join().unwrap()).sum::<f64>() / threads as f64;
+        (slice_len * threads) as f64 / time
+    })
+}
+
+pub fn what_is_big() -> usize {
+    1024 * 1024 * if cfg!(target_arch = "arm") { 64 } else { 256 }
+}
+
+pub fn l1_bandwidth_seq(threads: usize) -> f64 {
+    // [1024, 2048, 4096, 8192, 16384, 32768, 65536]
+    [1024]
+        .into_iter()
+        .map(|slice_len| bandwidth_seq(slice_len, threads))
+        .max_by_key(|x| *x as i64)
+        .unwrap()
+}
+
+pub fn main_memory_bandwith_seq(threads: usize) -> f64 {
+    bandwidth_seq(what_is_big(), threads)
+}
+
+#[ignore]
+#[test]
+fn b() {
+    let max = what_is_big();
+    for threads in [1, 2, 3, 4] {
+        println!("Threads: {}", threads);
+        for size in (0..)
+            .flat_map(|po2| (0..2).map(move |f| (1024 + 512 * f) * (1 << po2)))
+            .take_while(|&s| s < max)
+        {
+            let bw = bandwidth_seq(size, threads);
+            println!(
+                "threads: {threads} slice: {} KiB bandwidth: {} GiB/s",
+                size as f64 / 1024.,
+                (bw / (1024. * 1024. * 1024.)) as usize
+            );
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/hwbench/mod.rs b/vendor/tract-linalg-0.22.1/src/hwbench/mod.rs
new file mode 100644
index 000000000..373235856
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/hwbench/mod.rs
@@ -0,0 +1,4 @@
+pub mod runner;
+
+#[cfg(feature = "hwbench")]
+pub mod bandwidth;
diff --git a/vendor/tract-linalg-0.22.1/src/hwbench/runner.rs b/vendor/tract-linalg-0.22.1/src/hwbench/runner.rs
new file mode 100644
index 000000000..97b62c04f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/hwbench/runner.rs
@@ -0,0 +1,122 @@
+#![allow(unused_macros)]
+
+use std::time::Duration;
+use std::time::Instant;
+
+#[macro_export]
+macro_rules! r1 { ($($stat:stmt)*) => { $( $stat )* } }
+#[macro_export]
+macro_rules! r2 { ($($stat:stmt)*) => { $( $stat )* $( $stat )* } }
+#[macro_export]
+macro_rules! r4 { ($($stat:stmt)*) => { r2!(r2!($($stat)*)) }}
+#[macro_export]
+macro_rules! r8 { ($($stat:stmt)*) => { r2!(r4!($($stat)*)) }}
+#[macro_export]
+macro_rules! r16 { ($($stat:stmt)*) => { r2!(r8!($($stat)*)) }}
+#[macro_export]
+macro_rules! r32 { ($($stat:stmt)*) => { r2!(r16!($($stat)*)) }}
+#[macro_export]
+macro_rules! r64 { ($($stat:stmt)*) => { r2!(r32!($($stat)*)) }}
+#[macro_export]
+macro_rules! r128 { ($($stat:stmt)*) => { r2!(r64!($($stat)*)) }}
+#[macro_export]
+macro_rules! r256 { ($($stat:stmt)*) => { r2!(r128!($($stat)*)) }}
+#[macro_export]
+macro_rules! r512 { ($($stat:stmt)*) => { r2!(r256!($($stat)*)) }}
+#[macro_export]
+macro_rules! r1024 { ($($stat:stmt)*) => { r2!(r512!($($stat)*)) }}
+#[macro_export]
+macro_rules! r2048 { ($($stat:stmt)*) => { r2!(r1024!($($stat)*)) }}
+#[macro_export]
+macro_rules! r4096 { ($($stat:stmt)*) => { r2!(r2048!($($stat)*)) }}
+#[macro_export]
+macro_rules! r8192 { ($($stat:stmt)*) => { r2!(r4096!($($stat)*)) }}
+
+#[macro_export]
+macro_rules! b1 { ($($stat:stmt)*) => { nano::run_bench(|| { r1!($($stat)*); }) / 1.0 } }
+#[macro_export]
+macro_rules! b2 { ($($stat:stmt)*) => { nano::run_bench(|| { r2!($($stat)*); }) / 2.0 } }
+#[macro_export]
+macro_rules! b4 { ($($stat:stmt)*) => { nano::run_bench(|| { r4!($($stat)*); }) / 4.0 } }
+#[macro_export]
+macro_rules! b8 { ($($stat:stmt)*) => { nano::run_bench(|| { r8!($($stat)*); }) / 8.0 } }
+#[macro_export]
+macro_rules! b16 { ($($stat:stmt)*) => { nano::run_bench(|| { r16!($($stat)*); }) / 16.0 } }
+#[macro_export]
+macro_rules! b32 { ($($stat:stmt)*) => { nano::run_bench(|| { r32!($($stat)*); }) / 32.0 } }
+#[macro_export]
+macro_rules! b64 { ($($stat:stmt)*) => { nano::run_bench(|| { r64!($($stat)*); }) / 64.0 } }
+#[macro_export]
+macro_rules! b128 { ($($stat:stmt)*) => { nano::run_bench(|| { r128!($($stat)*); }) / 128.0 } }
+#[macro_export]
+macro_rules! b256 { ($($stat:stmt)*) => { nano::run_bench(|| { r256!($($stat)*); }) / 256.0 } }
+#[macro_export]
+macro_rules! b512 { ($($stat:stmt)*) => { nano::run_bench(|| { r512!($($stat)*); }) / 512.0 } }
+#[macro_export]
+macro_rules! b1024 { ($($stat:stmt)*) => { nano::run_bench(|| { r1024!($($stat)*); }) / 1024.0 } }
+#[macro_export]
+macro_rules! b2048 { ($($stat:stmt)*) => { nano::run_bench(|| { r2048!($($stat)*); }) / 2048.0 } }
+#[macro_export]
+macro_rules! b4096 { ($($stat:stmt)*) => { nano::run_bench(|| { r4096!($($stat)*); }) / 4096.0 } }
+#[macro_export]
+macro_rules! b8192 { ($($stat:stmt)*) => { nano::run_bench(|| { r8192!($($stat)*); }) / 8192.0 } }
+
+#[inline]
+fn black_box<T>(dummy: T) -> T {
+    unsafe {
+        let ret = std::ptr::read_volatile(&dummy);
+        std::mem::forget(dummy);
+        ret
+    }
+}
+
+pub fn run_bench<T, F: FnMut(usize) -> T + Copy>(f: F) -> f64 {
+    let start = Instant::now();
+    let mut f = black_box(f);
+    black_box(f(1));
+    let once = start.elapsed();
+    let evaled = if once < Duration::from_millis(1) {
+        let start = Instant::now();
+        black_box(f)(1000);
+        start.elapsed().as_secs_f64() / 1000.
+    } else {
+        once.as_secs_f64()
+    };
+    // raw evaluation is over a second. stop right there
+    if evaled > 1.0 {
+        return evaled;
+    }
+
+    // we want each individual sample to run for no less than
+    let minimum_sampling_time_s = 0.01;
+    let minimum_samples = 25;
+    let desired_bench_time = 1.0;
+
+    let inner_loops = (minimum_sampling_time_s / evaled).max(1.0) as usize;
+
+    let samples =
+        ((desired_bench_time / (inner_loops as f64 * evaled)) as usize).max(minimum_samples);
+    let warmup = (1.0 / evaled) as usize;
+
+    // println!(
+    //     "evaled: {:?} samples:{samples} inner_loops:{inner_loops} time:{}",
+    //     Duration::from_secs_f64(evaled),
+    //     (samples * inner_loops) as f64 * evaled
+    // );
+    let mut measures = vec![0.0; samples];
+
+    black_box(f(warmup));
+    for m in &mut measures {
+        let start = Instant::now();
+        black_box(black_box(f))(inner_loops);
+        let time = start.elapsed().as_secs_f64();
+        *m = time / inner_loops as f64
+    }
+    measures
+        .sort_by(|a, b| if a < b { std::cmp::Ordering::Less } else { std::cmp::Ordering::Greater });
+    let q1 = measures[samples / 4];
+    let q3 = measures[samples - samples / 4];
+    let iq = q3 - q1;
+    measures.retain(|&x| x >= q1 - 3. * iq && x <= q3 + 3. * iq);
+    measures.iter().copied().sum::<f64>() / measures.len() as f64
+}
diff --git a/vendor/tract-linalg-0.22.1/src/lib.rs b/vendor/tract-linalg-0.22.1/src/lib.rs
new file mode 100644
index 000000000..1af6f78b7
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/lib.rs
@@ -0,0 +1,404 @@
+#![allow(clippy::missing_safety_doc)]
+#![allow(clippy::redundant_closure_call)]
+#![allow(clippy::len_zero)]
+#![allow(clippy::excessive_precision)]
+#![allow(clippy::approx_constant)]
+#![allow(clippy::manual_is_multiple_of)]
+#![allow(unexpected_cfgs)]
+#![allow(unused_macros)]
+#[macro_use]
+extern crate derive_new;
+extern crate lazy_static;
+extern crate log;
+extern crate num_traits;
+#[macro_use]
+extern crate pastey;
+#[cfg(test)]
+extern crate proptest;
+
+include!(concat!(env!("OUT_DIR"), "/extern_kernel_macro.rs"));
+
+#[macro_use]
+mod frame;
+pub mod generic;
+pub mod multithread;
+pub use frame::weights::WeightType;
+pub use generic::{ScaleShiftAndRound, Scaler};
+use lazy_static::lazy_static;
+use mmm::{MMMInputFormat, MatMatMul, PanelExtractor};
+use tract_data::internal::TensorView;
+#[cfg(target_arch = "x86_64")]
+pub mod x86_64_fma;
+
+pub mod hwbench;
+
+#[cfg(target_arch = "aarch64")]
+pub mod arm64;
+
+#[cfg(target_arch = "aarch64")]
+pub use arm64::has_fp16;
+use tract_itertools::Itertools;
+
+#[cfg(not(target_arch = "aarch64"))]
+pub fn has_fp16() -> bool {
+    false
+}
+
+#[cfg(any(target_arch = "arm", target_arch = "armv7", target_arch = "arm"))]
+pub mod arm32;
+
+#[cfg(all(target_family = "wasm", target_feature = "simd128"))]
+pub mod wasm;
+
+pub use self::frame::*;
+
+use tract_data::prelude::*;
+
+pub type MMMImpl = Box<
+    dyn Fn(Option<usize>, Option<usize>, Option<usize>) -> Box<dyn mmm::MatMatMul> + Send + Sync,
+>;
+
+type MMVImpl = Box<dyn Fn(Option<usize>, Option<usize>) -> Box<dyn mmm::MatMatMul> + Send + Sync>;
+
+#[allow(clippy::type_complexity)]
+pub struct Ops {
+    mmm_impls: Vec<Box<dyn mmm::MatMatMul>>,
+    panel_extractors: Vec<mmm::PanelExtractor>,
+
+    mmm_f64: MMMImpl,
+    mmv_f64: MMVImpl,
+
+    mmm_f32: MMMImpl,
+    mmv_f32: MMVImpl,
+
+    mmm_f16: MMMImpl,
+    mmv_f16: MMVImpl,
+
+    qmmm_i32: MMMImpl,
+    qmmv_i32: MMVImpl,
+
+    pub leaky_relu_f16: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f16, f16>> + Send + Sync>,
+    pub leaky_relu_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32, f32>> + Send + Sync>,
+    pub mul_by_scalar_f32:
+        Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32, f32>> + Send + Sync>,
+    pub mul_by_scalar_f16:
+        Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f16, f16>> + Send + Sync>,
+
+    pub sigmoid_f16: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f16>> + Send + Sync>,
+    pub sigmoid_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32>> + Send + Sync>,
+    pub tanh_f16: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f16>> + Send + Sync>,
+    pub tanh_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32>> + Send + Sync>,
+    pub erf_f32: Box<dyn Fn() -> Box<dyn element_wise::ElementWise<f32>> + Send + Sync>,
+    pub lut_u8: Box<dyn Fn(&[u8]) -> Box<dyn lut::Lut> + Send + Sync>,
+
+    pub max_f16: Box<dyn Fn() -> Box<dyn reduce::Reduce<f16>> + Send + Sync>,
+    pub max_f32: Box<dyn Fn() -> Box<dyn reduce::Reduce<f32>> + Send + Sync>,
+
+    pub sum_f16: Box<dyn Fn() -> Box<dyn reduce::Reduce<f16>> + Send + Sync>,
+    pub sum_f32: Box<dyn Fn() -> Box<dyn reduce::Reduce<f32>> + Send + Sync>,
+
+    pub softmax2_fastcompact_f16:
+        Box<dyn Fn() -> Box<dyn reduce::MapReduce<f16, f16>> + Send + Sync>,
+    pub softmax2_fastcompact_f32:
+        Box<dyn Fn() -> Box<dyn reduce::MapReduce<f32, f32>> + Send + Sync>,
+}
+
+impl Ops {
+    pub fn mmm_impls(&self) -> &[Box<dyn mmm::MatMatMul>] {
+        &self.mmm_impls
+    }
+
+    pub fn all_possible_packing(
+        &self,
+        weight_type: impl Into<WeightType>,
+    ) -> impl Iterator<Item = &dyn MMMInputFormat> {
+        let weight_type = weight_type.into();
+        self.mmm_impls
+            .iter()
+            .flat_map(|m| m.packings())
+            .map(|p| &*p.0)
+            .flat_map(move |p| {
+                let mut packs: Vec<&dyn MMMInputFormat> = vec![];
+                if p.precursor() == weight_type {
+                    packs.push(p)
+                };
+                for pe in &self.panel_extractors {
+                    if pe.from.precursor() == weight_type && pe.to.same_as(p) {
+                        packs.push(&*pe.from);
+                    }
+                }
+                packs.into_iter()
+            })
+            .sorted_by_key(|p| p.to_string())
+            .dedup()
+    }
+
+    pub fn filter_impls<'o>(
+        &'o self,
+        weight: &'o dyn MMMInputFormat,
+        acc: &[DatumType],
+        act: DatumType,
+        store: DatumType,
+    ) -> impl Iterator<
+        Item = (
+            &'o dyn MatMatMul,
+            usize,
+            &'o dyn MMMInputFormat,
+            Option<&'o PanelExtractor>,
+            &'o dyn MMMInputFormat,
+        ),
+    > {
+        let acc = acc.to_vec();
+        self.mmm_impls
+            .iter()
+            .filter(move |mmm| acc.contains(&mmm.internal_type()) && mmm.stores().contains(&store))
+            .flat_map(|mmm| {
+                mmm.packings()
+                    .iter()
+                    .enumerate()
+                    .map(|(pack_ix, (a, b))| (&**mmm, pack_ix, &**a, &**b))
+            })
+            .filter_map(|(mmm, ix, a, b)| {
+                if a.same_as(weight) {
+                    Some((mmm, ix, a, None, b))
+                } else {
+                    self.panel_extractors
+                        .iter()
+                        .find(|pe| pe.from.same_as(weight) && pe.to.same_as(a))
+                        .map(|pe| (mmm, ix, a, Some(pe), b))
+                }
+            })
+            .filter(move |(_mmm, _ix, _a, _pe, b)| {
+                b.precursor().as_dt().is_some_and(|dt| dt == act)
+            })
+    }
+
+    pub fn panel_extractors(&self) -> &[mmm::panel_extract::PanelExtractor] {
+        &self.panel_extractors
+    }
+
+    pub fn mmm(
+        &self,
+        accumulator: DatumType,
+        m: Option<usize>,
+        k: Option<usize>,
+        n: Option<usize>,
+    ) -> Option<Box<dyn mmm::MatMatMul>> {
+        use DatumType::*;
+        match accumulator {
+            F64 => Some(if n == Some(1) { (self.mmv_f64)(m, k) } else { (self.mmm_f64)(m, k, n) }),
+            F32 => Some(if n == Some(1) { (self.mmv_f32)(m, k) } else { (self.mmm_f32)(m, k, n) }),
+            F16 => Some(if n == Some(1) { (self.mmv_f16)(m, k) } else { (self.mmm_f16)(m, k, n) }),
+            I32 => {
+                Some(if n == Some(1) { (self.qmmv_i32)(m, k) } else { (self.qmmm_i32)(m, k, n) })
+            }
+            _ => None,
+        }
+    }
+}
+
+pub fn generic() -> Ops {
+    use crate::generic::mmm::*;
+    use element_wise::ElementWiseKer;
+    use reduce::{MapReduceKer, ReduceKer};
+    let mut ops = Ops {
+        mmm_impls: vec![],
+        panel_extractors: vec![],
+        mmm_f64: Box::new(|_, _, _| generic_f64_4x4.mmm()),
+        mmv_f64: Box::new(|_, _| generic_f64_4x1.mmm()),
+        mmm_f32: Box::new(|_, _, _| generic_f32_4x4.mmm()),
+        mmv_f32: Box::new(|_, _| generic_f32_4x1.mmm()),
+        mmm_f16: Box::new(|_, _, _| generic_f16_4x4.mmm()),
+        mmv_f16: Box::new(|_, _| generic_f16_4x1.mmm()),
+        qmmm_i32: Box::new(|_, _, _| generic_i32_4x4.mmm()),
+        qmmv_i32: Box::new(|_, _| generic_i32_4x4.mmm()),
+        leaky_relu_f16: Box::new(|| generic::HLeakyRelu8::ew()),
+        leaky_relu_f32: Box::new(|| generic::SLeakyRelu4::ew()),
+        mul_by_scalar_f16: Box::new(|| generic::HMulByScalar8::ew()),
+        mul_by_scalar_f32: Box::new(|| generic::SMulByScalar4::ew()),
+        sigmoid_f16: Box::new(|| generic::HSigmoid8::ew()),
+        sigmoid_f32: Box::new(|| generic::SSigmoid4::ew()),
+        tanh_f16: Box::new(|| generic::HTanh8::ew()),
+        tanh_f32: Box::new(|| generic::STanh4::ew()),
+        erf_f32: Box::new(|| generic::SErf4::ew()),
+        lut_u8: Box::new(|table: &[u8]| Box::new(lut::LutImpl::<generic::GenericLut8>::new(table))),
+        max_f16: Box::new(|| generic::reduce::max::HMax8::red()),
+        max_f32: Box::new(|| generic::reduce::max::SMax4::red()),
+        sum_f16: Box::new(|| generic::reduce::sum::HSum8::red()),
+        sum_f32: Box::new(|| generic::reduce::sum::SSum4::red()),
+        /*
+        activation_f32: Box::new(|microcode| generic::SActivation::new(microcode))
+        */
+        softmax2_fastcompact_f16: Box::new(|| generic::reduce::softmax_l2::HSoftMaxL2::red()),
+        softmax2_fastcompact_f32: Box::new(|| generic::reduce::softmax_l2::SSoftMaxL2::red()),
+    };
+    crate::generic::mmm::plug(&mut ops);
+    ops
+}
+
+#[allow(unreachable_code, unused_mut, unexpected_cfgs)]
+pub fn best() -> Ops {
+    let mut ops = generic();
+    #[cfg(target_arch = "x86_64")]
+    x86_64_fma::plug(&mut ops);
+    #[cfg(any(target_arch = "arm", target_arch = "armv7"))]
+    arm32::plug(&mut ops);
+    #[cfg(target_arch = "aarch64")]
+    arm64::plug(&mut ops);
+    #[cfg(all(target_family = "wasm", target_feature = "simd128"))]
+    wasm::plug(&mut ops);
+
+    ops
+}
+
+lazy_static::lazy_static! {
+    static ref OPS: Ops = {
+        best()
+    };
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum BinOp {
+    Min,
+    Max,
+    Add,
+    Mul,
+    Sub,
+    SubF,
+}
+
+impl BinOp {
+    pub fn flip(&self) -> BinOp {
+        use BinOp::*;
+        match self {
+            Sub => SubF,
+            SubF => Sub,
+            sym => *sym,
+        }
+    }
+}
+
+fn register_all_unicast(registry: &mut LinalgRegistry) {
+    generic::register_all_unicast(registry);
+    #[cfg(target_arch = "aarch64")]
+    arm64::register_all_unicast(registry);
+}
+
+fn register_all_by_scalar(registry: &mut LinalgRegistry) {
+    generic::register_all_by_scalar(registry);
+    #[cfg(target_arch = "aarch64")]
+    arm64::register_all_by_scalar(registry);
+}
+
+pub type LinalgFn = dyn Fn(&mut TensorView, &TensorView) -> TractResult<()> + Send + Sync;
+type LinalgRegistry = HashMap<(BinOp, DatumType), Box<dyn Fn() -> Box<LinalgFn> + Send + Sync>>;
+lazy_static! {
+    static ref BIN_UNICAST_OPS: Mutex<LinalgRegistry> = {
+        let mut registry = HashMap::default();
+        register_all_unicast(&mut registry);
+        Mutex::new(registry)
+    };
+    static ref BIN_BY_SCALAR_OPS: Mutex<LinalgRegistry> = {
+        let mut registry = HashMap::default();
+        register_all_by_scalar(&mut registry);
+        Mutex::new(registry)
+    };
+}
+
+pub fn bin_by_scalar(dt: DatumType, bin: BinOp) -> Option<Box<LinalgFn>> {
+    let map = BIN_BY_SCALAR_OPS.lock().unwrap();
+    if (dt == DatumType::F16) && !has_fp16() {
+        return None;
+    }
+    map.get(&(bin, dt)).map(|it| (it)())
+}
+
+pub fn bin_unicast(dt: DatumType, bin: BinOp) -> Option<Box<LinalgFn>> {
+    let map = BIN_UNICAST_OPS.lock().unwrap();
+    if (dt == DatumType::F16) && !has_fp16() {
+        return None;
+    }
+    map.get(&(bin, dt)).map(|it| (it)())
+}
+
+pub fn ops() -> &'static Ops {
+    &OPS
+}
+
+use num_traits::*;
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::ops::*;
+use std::sync::Mutex;
+
+pub trait LADatum:
+    Sized
+    + std::fmt::Display
+    + Debug
+    + Copy
+    + Clone
+    + Zero
+    + One
+    + 'static
+    + Add<Output = Self>
+    + Sub<Output = Self>
+    + Mul
+    + AddAssign
+    + PartialOrd
+    + Bounded
+    + tract_data::prelude::Datum
+{
+    #[cfg(test)]
+    fn strat() -> proptest::prelude::BoxedStrategy<Self>;
+}
+
+#[cfg(test)]
+use proptest::prelude::*;
+
+impl LADatum for f16 {
+    #[cfg(test)]
+    fn strat() -> BoxedStrategy<Self> {
+        f32::strat().prop_map(|f| f.as_()).boxed()
+    }
+}
+
+impl LADatum for f32 {
+    #[cfg(test)]
+    fn strat() -> BoxedStrategy<Self> {
+        (-1000isize..1000).prop_map(|i| i as f32 / 1000.0).boxed()
+    }
+}
+
+impl LADatum for f64 {
+    #[cfg(test)]
+    fn strat() -> BoxedStrategy<Self> {
+        (-1000isize..1000).prop_map(|i| i as f64 / 1000.0).boxed()
+    }
+}
+
+impl LADatum for u8 {
+    #[cfg(test)]
+    fn strat() -> BoxedStrategy<Self> {
+        any::<u8>().boxed()
+    }
+}
+
+impl LADatum for i8 {
+    #[cfg(test)]
+    fn strat() -> BoxedStrategy<Self> {
+        any::<i8>().boxed()
+    }
+}
+
+impl LADatum for i32 {
+    #[cfg(test)]
+    fn strat() -> BoxedStrategy<Self> {
+        any::<i32>().boxed()
+    }
+}
+
+#[cfg(test)]
+#[allow(dead_code)]
+fn setup_test_logger() {
+    let _ = env_logger::Builder::from_env("TRACT_LOG").try_init();
+}
diff --git a/vendor/tract-linalg-0.22.1/src/multithread.rs b/vendor/tract-linalg-0.22.1/src/multithread.rs
new file mode 100644
index 000000000..51f2f07c0
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/multithread.rs
@@ -0,0 +1,57 @@
+use std::cell::RefCell;
+#[allow(unused_imports)]
+use std::sync::{Arc, Mutex};
+
+#[cfg(feature = "multithread-mm")]
+use rayon::{ThreadPool, ThreadPoolBuilder};
+
+#[derive(Debug, Clone, Default)]
+pub enum Executor {
+    #[default]
+    SingleThread,
+    #[cfg(feature = "multithread-mm")]
+    MultiThread(Arc<ThreadPool>),
+}
+
+impl Executor {
+    #[cfg(feature = "multithread-mm")]
+    pub fn multithread(n: usize) -> Executor {
+        Executor::multithread_with_name(n, "tract-default")
+    }
+
+    #[cfg(feature = "multithread-mm")]
+    pub fn multithread_with_name(n: usize, name: &str) -> Executor {
+        let name = name.to_string();
+        let pool = ThreadPoolBuilder::new()
+            .thread_name(move |n| format!("{name}-{n}"))
+            .num_threads(n)
+            .build()
+            .unwrap();
+        Executor::MultiThread(Arc::new(pool))
+    }
+}
+
+static DEFAULT_EXECUTOR: Mutex<Executor> = Mutex::new(Executor::SingleThread);
+
+thread_local! {
+    static TLS_EXECUTOR_OVERRIDE: RefCell<Option<Executor>> = Default::default();
+}
+
+pub fn current_tract_executor() -> Executor {
+    if let Some(over_ride) = TLS_EXECUTOR_OVERRIDE.with_borrow(|tls| tls.clone()) {
+        over_ride
+    } else {
+        DEFAULT_EXECUTOR.lock().unwrap().clone()
+    }
+}
+
+pub fn set_default_executor(executor: Executor) {
+    *DEFAULT_EXECUTOR.lock().unwrap() = executor;
+}
+
+pub fn multithread_tract_scope<R, F: FnOnce() -> R>(pool: Executor, f: F) -> R {
+    let previous = TLS_EXECUTOR_OVERRIDE.replace(Some(pool));
+    let result = f();
+    TLS_EXECUTOR_OVERRIDE.set(previous);
+    result
+}
diff --git a/vendor/tract-linalg-0.22.1/src/wasm.rs b/vendor/tract-linalg-0.22.1/src/wasm.rs
new file mode 100644
index 000000000..628fc720c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/wasm.rs
@@ -0,0 +1,1664 @@
+/// Wasm SIMD implementation of `MatMatMulKer<f32>`
+///
+/// To run test, you need to install `wasmtime`
+/// and export the following environment variables:
+/// ```
+/// > export RUSTFLAGS='-C target-feature=+simd128'
+/// > export CARGO_TARGET_WASM32_WASI_RUNNER=wasmtime
+/// > cargo test --target=wasm32-wasi
+/// ```
+use crate::mmm::FusedKerSpec;
+use crate::mmm::ImplementationQuality;
+use crate::{Ops, Scaler};
+
+pub fn plug(ops: &mut Ops) {
+    ops.mmm_impls.push(wasm_f32_4x4.mmm());
+    ops.mmm_impls.push(wasm_f32_4x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x1.mmm());
+    ops.mmm_impls.push(wasm_f32_16x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x8.mmm());
+    // Selection: max(nr*mr) for N>1, max(mr) for N=1.
+    //   - N>1 ops: 8x8 (nr*mr=64) wins over 4x4 (16)
+    //   - N=1 ops: 16x1 (mr=16) wins
+    ops.mmm_f32 = Box::new(|_m, _k, _n| wasm_f32_8x8.mmm());
+    ops.mmv_f32 = Box::new(|m, _k| match m.unwrap_or(0) {
+        0..=7 => wasm_f32_4x1.mmm(),
+        8..=15 => wasm_f32_8x1.mmm(),
+        _ => wasm_f32_16x1.mmm(),
+    });
+}
+
+unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Each of these variables stores a row of the matrix,
+        // consisting of four packed `f32` numbers.
+        let mut ab0 = f32x4_splat(0.0);
+        let mut ab1 = f32x4_splat(0.0);
+        let mut ab2 = f32x4_splat(0.0);
+        let mut ab3 = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let a = f32x4_splat(0.0);
+                    ab0 = a;
+                    ab1 = a;
+                    ab2 = a;
+                    ab3 = a;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    let rows = rows as *const v128;
+                    ab0 = *rows;
+                    ab1 = *rows.add(1);
+                    ab2 = *rows.add(2);
+                    ab3 = *rows.add(3);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_min(a, ab0);
+                    ab1 = f32x4_min(a, ab1);
+                    ab2 = f32x4_min(a, ab2);
+                    ab3 = f32x4_min(a, ab3);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_max(a, ab0);
+                    ab1 = f32x4_max(a, ab1);
+                    ab2 = f32x4_max(a, ab2);
+                    ab3 = f32x4_max(a, ab3);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_add(a, ab0);
+                    ab1 = f32x4_add(a, ab1);
+                    ab2 = f32x4_add(a, ab2);
+                    ab3 = f32x4_add(a, ab3);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_mul(a, ab0);
+                    ab1 = f32x4_mul(a, ab1);
+                    ab2 = f32x4_mul(a, ab2);
+                    ab3 = f32x4_mul(a, ab3);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_sub(a, ab0);
+                    ab1 = f32x4_sub(a, ab1);
+                    ab2 = f32x4_sub(a, ab2);
+                    ab3 = f32x4_sub(a, ab3);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_sub(ab0, a);
+                    ab1 = f32x4_sub(ab1, a);
+                    ab2 = f32x4_sub(ab2, a);
+                    ab3 = f32x4_sub(ab3, a);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let a = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+
+                    let mask0 = f32x4_gt(ab0, zero);
+                    ab0 = v128_bitselect(ab0, f32x4_mul(a, ab0), mask0);
+
+                    let mask1 = f32x4_gt(ab1, zero);
+                    ab1 = v128_bitselect(ab1, f32x4_mul(a, ab1), mask1);
+
+                    let mask2 = f32x4_gt(ab2, zero);
+                    ab2 = v128_bitselect(ab2, f32x4_mul(a, ab2), mask2);
+
+                    let mask3 = f32x4_gt(ab3, zero);
+                    ab3 = v128_bitselect(ab3, f32x4_mul(a, ab3), mask3);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_min(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_min(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_min(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_min(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_max(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_max(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_max(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_max(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_add(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_add(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_add(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_add(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_mul(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_mul(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_mul(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_mul(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_sub(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_sub(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_sub(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_sub(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_sub(ab0, f32x4_splat(row[0]));
+                    ab1 = f32x4_sub(ab1, f32x4_splat(row[1]));
+                    ab2 = f32x4_sub(ab2, f32x4_splat(row[2]));
+                    ab3 = f32x4_sub(ab3, f32x4_splat(row[3]));
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_min(cols, ab0);
+                    ab1 = f32x4_min(cols, ab1);
+                    ab2 = f32x4_min(cols, ab2);
+                    ab3 = f32x4_min(cols, ab3);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_max(cols, ab0);
+                    ab1 = f32x4_max(cols, ab1);
+                    ab2 = f32x4_max(cols, ab2);
+                    ab3 = f32x4_max(cols, ab3);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_add(cols, ab0);
+                    ab1 = f32x4_add(cols, ab1);
+                    ab2 = f32x4_add(cols, ab2);
+                    ab3 = f32x4_add(cols, ab3);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_mul(cols, ab0);
+                    ab1 = f32x4_mul(cols, ab1);
+                    ab2 = f32x4_mul(cols, ab2);
+                    ab3 = f32x4_mul(cols, ab3);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_sub(cols, ab0);
+                    ab1 = f32x4_sub(cols, ab1);
+                    ab2 = f32x4_sub(cols, ab2);
+                    ab3 = f32x4_sub(cols, ab3);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_sub(ab0, cols);
+                    ab1 = f32x4_sub(ab1, cols);
+                    ab2 = f32x4_sub(ab2, cols);
+                    ab3 = f32x4_sub(ab3, cols);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let scale = f32x4_splat(scaler.scale);
+                    ab0 = f32x4_mul(scale, ab0);
+                    ab1 = f32x4_mul(scale, ab1);
+                    ab2 = f32x4_mul(scale, ab2);
+                    ab3 = f32x4_mul(scale, ab3);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let shift = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab0 = f32x4_mul(shift, ab0);
+                    ab1 = f32x4_mul(shift, ab1);
+                    ab2 = f32x4_mul(shift, ab2);
+                    ab3 = f32x4_mul(shift, ab3);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let shift = f32x4_splat(2f32.powi(shift as i32));
+                    ab0 = f32x4_mul(shift, ab0);
+                    ab1 = f32x4_mul(shift, ab1);
+                    ab2 = f32x4_mul(shift, ab2);
+                    ab3 = f32x4_mul(shift, ab3);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    let mut ptr: *const u8 = tile.ptr;
+
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    ab0 = f32x4_add(ab0, f32x4(m0, m1, m2, m3));
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    ab1 = f32x4_add(ab1, f32x4(m0, m1, m2, m3));
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    ab2 = f32x4_add(ab2, f32x4(m0, m1, m2, m3));
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    ab3 = f32x4_add(ab3, f32x4(m0, m1, m2, m3));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), cols));
+                    ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), cols));
+                    ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), cols));
+                    ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), cols));
+                }
+                FusedKerSpec::Store(tile) => {
+                    let mut ptr: *mut u8 = tile.ptr;
+
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab0);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab0);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                        f32x4_extract_lane::<2>(ab0);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                        f32x4_extract_lane::<3>(ab0);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab1);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab1);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                        f32x4_extract_lane::<2>(ab1);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                        f32x4_extract_lane::<3>(ab1);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab2);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab2);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                        f32x4_extract_lane::<2>(ab2);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                        f32x4_extract_lane::<3>(ab2);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab3);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab3);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                        f32x4_extract_lane::<2>(ab3);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                        f32x4_extract_lane::<3>(ab3);
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    let a = pa as *const f32;
+                    let b = pb as *const v128;
+                    for i in 0..k {
+                        let a = std::slice::from_raw_parts(a.offset(4 * i as isize), 4);
+                        let b = v128_load(b.offset(i as isize));
+                        ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(a[0]), b));
+                        ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(a[1]), b));
+                        ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(a[2]), b));
+                        ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(a[3]), b));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_4x4 => wasm_f32_4x4<f32>(4,4)@(4,4) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 4x1 kernel — GEMV-shaped variant for matrix-vector products
+/// (single-column outputs, e.g., streaming-RNN inference where each frame's
+/// activation is a single column). Mirrors the 4x4 kernel's FusedKerSpec
+/// match arms but collapses the column dimension from 4 to 1: a single
+/// f32x4 accumulator holds 4 output rows × 1 output column packed as
+/// [ab[0], ab[1], ab[2], ab[3]].
+///
+/// Selection: tract-core's einsum kernel_selection::strategize() prefers
+/// kernels with nr() == 1 when op.n.is_one(), so this kernel is
+/// automatically picked for N=1 cases once registered.
+unsafe fn kernel_f32_4x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Single accumulator: 4 rows × 1 col, packed into one f32x4.
+        // lane[i] holds ab[i] = the output value for row i (col 0).
+        let mut ab = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    ab = f32x4_splat(0.0);
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // Tile is 4 rows × 1 col = 4 contiguous f32s = 1 v128
+                    ab = v128_load(rows as *const v128);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    ab = f32x4_min(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    ab = f32x4_max(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    ab = f32x4_add(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    ab = f32x4_mul(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    ab = f32x4_sub(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    ab = f32x4_sub(ab, f32x4_splat(a));
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let zero = f32x4_splat(0.0);
+                    let mask = f32x4_gt(ab, zero);
+                    ab = v128_bitselect(ab, f32x4_mul(f32x4_splat(a), ab), mask);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    // 4 row values, applied to ab's 4 lanes in order
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_min(r, ab);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_max(r, ab);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_add(r, ab);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_mul(r, ab);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_sub(r, ab);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_sub(ab, r);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    // Single col value broadcast to all 4 rows
+                    ab = f32x4_min(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    ab = f32x4_max(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    ab = f32x4_add(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    ab = f32x4_mul(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    ab = f32x4_sub(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    ab = f32x4_sub(ab, f32x4_splat(*cols));
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    ab = f32x4_mul(f32x4_splat(scaler.scale), ab);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab = f32x4_mul(s, ab);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab = f32x4_mul(s, ab);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 4 rows × 1 col, with row_byte_stride between rows (col_stride irrelevant for N=1)
+                    let mut ptr: *const u8 = tile.ptr;
+                    let m0 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m1 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m2 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m3 = *(ptr as *const f32);
+                    ab = f32x4_add(ab, f32x4(m0, m1, m2, m3));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    // ab[i] += rows[i] * cols[0]  (cols[0] is the single col)
+                    let r = v128_load(rows as *const v128);
+                    let c = f32x4_splat(*cols);
+                    ab = f32x4_add(ab, f32x4_mul(r, c));
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 4 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab);
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A is packed [k][MR=4]: each k iter loads 4 contiguous f32s = 1 v128.
+                    // B is packed [k][NR=1]: each k iter loads 1 scalar f32, broadcast.
+                    // ab[i] += a[i] * b for all i in 0..4 → SIMD: ab += a_vec * b_splat
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a_vec = v128_load(a.offset(i as isize));
+                        let b_splat = f32x4_splat(*b.offset(i as isize));
+                        ab = f32x4_add(ab, f32x4_mul(a_vec, b_splat));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_4x1 => wasm_f32_4x1<f32>(4,1)@(4,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 8x1 kernel — wider GEMV variant for matrix-vector products
+/// on large M. Uses TWO independent f32x4 accumulators (rows 0-3 in ab_top,
+/// rows 4-7 in ab_bot), enabling 2-way ILP within each k-iteration:
+/// the inner loop issues two independent f32x4_add(f32x4_mul(...)) ops per
+/// k-step, breaking the data-dependency chain depth from K to ~K/2 at the
+/// hardware pipeline level.
+///
+/// Compared to wasm_f32_4x1 (1 accumulator, k-serial dep chain), this is
+/// targeted at GEMV ops where M is a multiple of 8 (or close to it). For
+/// M=256 GRU gate matmuls (the dominant GEMV in DFN3), this should yield
+/// ~2x speedup on the inner loop on hardware where SIMD FMLA throughput
+/// exceeds 1 op/cycle.
+///
+/// Selection: `kernel_selection::strategize()` prefers max mr() for n=1
+/// cases, so this kernel automatically wins over wasm_f32_4x1 for all N=1
+/// ops once registered (including small-M cases where it slightly wastes
+/// rows — for M=1 lsnr_fc-style ops, that's 7-of-8 row waste, but those
+/// ops are <1% of frame so the regression is noise).
+unsafe fn kernel_f32_8x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Two accumulators: 8 rows × 1 col packed as [ab_top, ab_bot]
+        // ab_top.lane[i] holds row i (i in 0..4); ab_bot.lane[i] holds row i+4
+        let mut ab_top = f32x4_splat(0.0);
+        let mut ab_bot = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    ab_top = f32x4_splat(0.0);
+                    ab_bot = f32x4_splat(0.0);
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // 8 rows × 1 col = 8 contiguous f32 = 2 v128
+                    let p = rows as *const v128;
+                    ab_top = *p;
+                    ab_bot = *p.add(1);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_min(s, ab_top);
+                    ab_bot = f32x4_min(s, ab_bot);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_max(s, ab_top);
+                    ab_bot = f32x4_max(s, ab_bot);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_add(s, ab_top);
+                    ab_bot = f32x4_add(s, ab_bot);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_sub(s, ab_top);
+                    ab_bot = f32x4_sub(s, ab_bot);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_sub(ab_top, s);
+                    ab_bot = f32x4_sub(ab_bot, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let mask_t = f32x4_gt(ab_top, zero);
+                    let mask_b = f32x4_gt(ab_bot, zero);
+                    ab_top = v128_bitselect(ab_top, f32x4_mul(s, ab_top), mask_t);
+                    ab_bot = v128_bitselect(ab_bot, f32x4_mul(s, ab_bot), mask_b);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_min(r_t, ab_top);
+                    ab_bot = f32x4_min(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_max(r_t, ab_top);
+                    ab_bot = f32x4_max(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_add(r_t, ab_top);
+                    ab_bot = f32x4_add(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_mul(r_t, ab_top);
+                    ab_bot = f32x4_mul(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_sub(r_t, ab_top);
+                    ab_bot = f32x4_sub(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_sub(ab_top, r_t);
+                    ab_bot = f32x4_sub(ab_bot, r_b);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_min(c, ab_top);
+                    ab_bot = f32x4_min(c, ab_bot);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_max(c, ab_top);
+                    ab_bot = f32x4_max(c, ab_bot);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_add(c, ab_top);
+                    ab_bot = f32x4_add(c, ab_bot);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_mul(c, ab_top);
+                    ab_bot = f32x4_mul(c, ab_bot);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_sub(c, ab_top);
+                    ab_bot = f32x4_sub(c, ab_bot);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_sub(ab_top, c);
+                    ab_bot = f32x4_sub(ab_bot, c);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 8 rows × 1 col, stride is row_byte_stride between rows
+                    let mut ptr: *const u8 = tile.ptr;
+                    let m0 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m1 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m2 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m3 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m4 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m5 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m6 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m7 = *(ptr as *const f32);
+                    ab_top = f32x4_add(ab_top, f32x4(m0, m1, m2, m3));
+                    ab_bot = f32x4_add(ab_bot, f32x4(m4, m5, m6, m7));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = rows as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_add(ab_top, f32x4_mul(r_t, c));
+                    ab_bot = f32x4_add(ab_bot, f32x4_mul(r_b, c));
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 8 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_bot);
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=8] = each k iter loads 8 f32 = 2 v128
+                    // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                    // The two fmadd ops on (ab_top, ab_bot) are independent — 2-way ILP per iter.
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a_t = v128_load(a.offset((2 * i) as isize));
+                        let a_b = v128_load(a.offset((2 * i + 1) as isize));
+                        let b_splat = f32x4_splat(*b.offset(i as isize));
+                        ab_top = f32x4_add(ab_top, f32x4_mul(a_t, b_splat));
+                        ab_bot = f32x4_add(ab_bot, f32x4_mul(a_b, b_splat));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_8x1 => wasm_f32_8x1<f32>(8,1)@(8,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 16x1 kernel — wider GEMV variant for matrix-vector products
+/// on very large M. Uses FOUR independent f32x4 accumulators (rows 0-3,
+/// 4-7, 8-11, 12-15), enabling 4-way ILP within each k-iteration.
+///
+/// Compared to wasm_f32_8x1 (2 accumulators, 2-way ILP), this exposes more
+/// parallel work to the SIMD pipelines, beneficial on hardware with 3+
+/// SIMD execution units (most modern ARM and x86).
+unsafe fn kernel_f32_16x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Four accumulators: 16 rows × 1 col packed as [ab_q0, ab_q1, ab_q2, ab_q3]
+        // ab_q0 = rows 0-3, ab_q1 = rows 4-7, ab_q2 = rows 8-11, ab_q3 = rows 12-15
+        let mut ab_q0 = f32x4_splat(0.0);
+        let mut ab_q1 = f32x4_splat(0.0);
+        let mut ab_q2 = f32x4_splat(0.0);
+        let mut ab_q3 = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let z = f32x4_splat(0.0);
+                    ab_q0 = z;
+                    ab_q1 = z;
+                    ab_q2 = z;
+                    ab_q3 = z;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    let p = rows as *const v128;
+                    ab_q0 = *p;
+                    ab_q1 = *p.add(1);
+                    ab_q2 = *p.add(2);
+                    ab_q3 = *p.add(3);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_min(s, ab_q0);
+                    ab_q1 = f32x4_min(s, ab_q1);
+                    ab_q2 = f32x4_min(s, ab_q2);
+                    ab_q3 = f32x4_min(s, ab_q3);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_max(s, ab_q0);
+                    ab_q1 = f32x4_max(s, ab_q1);
+                    ab_q2 = f32x4_max(s, ab_q2);
+                    ab_q3 = f32x4_max(s, ab_q3);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_add(s, ab_q0);
+                    ab_q1 = f32x4_add(s, ab_q1);
+                    ab_q2 = f32x4_add(s, ab_q2);
+                    ab_q3 = f32x4_add(s, ab_q3);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_sub(s, ab_q0);
+                    ab_q1 = f32x4_sub(s, ab_q1);
+                    ab_q2 = f32x4_sub(s, ab_q2);
+                    ab_q3 = f32x4_sub(s, ab_q3);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_sub(ab_q0, s);
+                    ab_q1 = f32x4_sub(ab_q1, s);
+                    ab_q2 = f32x4_sub(ab_q2, s);
+                    ab_q3 = f32x4_sub(ab_q3, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let m0 = f32x4_gt(ab_q0, zero);
+                    ab_q0 = v128_bitselect(ab_q0, f32x4_mul(s, ab_q0), m0);
+                    let m1 = f32x4_gt(ab_q1, zero);
+                    ab_q1 = v128_bitselect(ab_q1, f32x4_mul(s, ab_q1), m1);
+                    let m2 = f32x4_gt(ab_q2, zero);
+                    ab_q2 = v128_bitselect(ab_q2, f32x4_mul(s, ab_q2), m2);
+                    let m3 = f32x4_gt(ab_q3, zero);
+                    ab_q3 = v128_bitselect(ab_q3, f32x4_mul(s, ab_q3), m3);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_min(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_min(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_min(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_min(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_max(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_max(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_max(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_max(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_add(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_add(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_add(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_add(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_mul(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_mul(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_mul(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_mul(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_sub(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_sub(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_sub(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_sub(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_sub(ab_q0, v128_load(p));
+                    ab_q1 = f32x4_sub(ab_q1, v128_load(p.add(1)));
+                    ab_q2 = f32x4_sub(ab_q2, v128_load(p.add(2)));
+                    ab_q3 = f32x4_sub(ab_q3, v128_load(p.add(3)));
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_min(c, ab_q0);
+                    ab_q1 = f32x4_min(c, ab_q1);
+                    ab_q2 = f32x4_min(c, ab_q2);
+                    ab_q3 = f32x4_min(c, ab_q3);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_max(c, ab_q0);
+                    ab_q1 = f32x4_max(c, ab_q1);
+                    ab_q2 = f32x4_max(c, ab_q2);
+                    ab_q3 = f32x4_max(c, ab_q3);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_add(c, ab_q0);
+                    ab_q1 = f32x4_add(c, ab_q1);
+                    ab_q2 = f32x4_add(c, ab_q2);
+                    ab_q3 = f32x4_add(c, ab_q3);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_mul(c, ab_q0);
+                    ab_q1 = f32x4_mul(c, ab_q1);
+                    ab_q2 = f32x4_mul(c, ab_q2);
+                    ab_q3 = f32x4_mul(c, ab_q3);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_sub(c, ab_q0);
+                    ab_q1 = f32x4_sub(c, ab_q1);
+                    ab_q2 = f32x4_sub(c, ab_q2);
+                    ab_q3 = f32x4_sub(c, ab_q3);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_sub(ab_q0, c);
+                    ab_q1 = f32x4_sub(ab_q1, c);
+                    ab_q2 = f32x4_sub(ab_q2, c);
+                    ab_q3 = f32x4_sub(ab_q3, c);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 16 rows × 1 col, with row_byte_stride between rows
+                    let mut ptr: *const u8 = tile.ptr;
+                    let mut ms = [0f32; 16];
+                    for i in 0..16 {
+                        ms[i] = *(ptr as *const f32);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                    ab_q0 = f32x4_add(ab_q0, f32x4(ms[0], ms[1], ms[2], ms[3]));
+                    ab_q1 = f32x4_add(ab_q1, f32x4(ms[4], ms[5], ms[6], ms[7]));
+                    ab_q2 = f32x4_add(ab_q2, f32x4(ms[8], ms[9], ms[10], ms[11]));
+                    ab_q3 = f32x4_add(ab_q3, f32x4(ms[12], ms[13], ms[14], ms[15]));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = rows as *const v128;
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_add(ab_q0, f32x4_mul(v128_load(p), c));
+                    ab_q1 = f32x4_add(ab_q1, f32x4_mul(v128_load(p.add(1)), c));
+                    ab_q2 = f32x4_add(ab_q2, f32x4_mul(v128_load(p.add(2)), c));
+                    ab_q3 = f32x4_add(ab_q3, f32x4_mul(v128_load(p.add(3)), c));
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 16 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    for ab in [ab_q0, ab_q1, ab_q2, ab_q3].iter() {
+                        *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<1>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<2>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<3>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=16] = each k iter loads 16 f32 = 4 v128
+                    // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                    // 4 INDEPENDENT fmadds per k-iter — 4-way ILP
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a0 = v128_load(a.offset((4 * i) as isize));
+                        let a1 = v128_load(a.offset((4 * i + 1) as isize));
+                        let a2 = v128_load(a.offset((4 * i + 2) as isize));
+                        let a3 = v128_load(a.offset((4 * i + 3) as isize));
+                        let bs = f32x4_splat(*b.offset(i as isize));
+                        ab_q0 = f32x4_add(ab_q0, f32x4_mul(a0, bs));
+                        ab_q1 = f32x4_add(ab_q1, f32x4_mul(a1, bs));
+                        ab_q2 = f32x4_add(ab_q2, f32x4_mul(a2, bs));
+                        ab_q3 = f32x4_add(ab_q3, f32x4_mul(a3, bs));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_16x1 => wasm_f32_16x1<f32>(16,1)@(16,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 8x8 kernel — wide MM tile (8 rows × 8 cols, 16 v128 accumulators).
+/// Each row uses 2 v128: cols 0-3 in `_lo`, cols 4-7 in `_hi`. 16 accumulators
+/// is at the limit of WASM's 16 logical SIMD register slots; this tests the
+/// register-pressure boundary. For DFN3 ops, all M and N are multiples of 8,
+/// so 8x8 fits cleanly with no padding waste.
+unsafe fn kernel_f32_8x8(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // 8 rows × 8 cols = 16 f32x4 accumulators (cols 0-3 in _lo, cols 4-7 in _hi)
+        let mut a0lo = f32x4_splat(0.0);
+        let mut a0hi = f32x4_splat(0.0);
+        let mut a1lo = f32x4_splat(0.0);
+        let mut a1hi = f32x4_splat(0.0);
+        let mut a2lo = f32x4_splat(0.0);
+        let mut a2hi = f32x4_splat(0.0);
+        let mut a3lo = f32x4_splat(0.0);
+        let mut a3hi = f32x4_splat(0.0);
+        let mut a4lo = f32x4_splat(0.0);
+        let mut a4hi = f32x4_splat(0.0);
+        let mut a5lo = f32x4_splat(0.0);
+        let mut a5hi = f32x4_splat(0.0);
+        let mut a6lo = f32x4_splat(0.0);
+        let mut a6hi = f32x4_splat(0.0);
+        let mut a7lo = f32x4_splat(0.0);
+        let mut a7hi = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let z = f32x4_splat(0.0);
+                    a0lo = z;
+                    a0hi = z;
+                    a1lo = z;
+                    a1hi = z;
+                    a2lo = z;
+                    a2hi = z;
+                    a3lo = z;
+                    a3hi = z;
+                    a4lo = z;
+                    a4hi = z;
+                    a5lo = z;
+                    a5hi = z;
+                    a6lo = z;
+                    a6hi = z;
+                    a7lo = z;
+                    a7hi = z;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // 8 rows × 8 cols = 16 v128 (2 per row, contiguous lo+hi)
+                    let p = rows as *const v128;
+                    a0lo = *p.add(0);
+                    a0hi = *p.add(1);
+                    a1lo = *p.add(2);
+                    a1hi = *p.add(3);
+                    a2lo = *p.add(4);
+                    a2hi = *p.add(5);
+                    a3lo = *p.add(6);
+                    a3hi = *p.add(7);
+                    a4lo = *p.add(8);
+                    a4hi = *p.add(9);
+                    a5lo = *p.add(10);
+                    a5hi = *p.add(11);
+                    a6lo = *p.add(12);
+                    a6hi = *p.add(13);
+                    a7lo = *p.add(14);
+                    a7hi = *p.add(15);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_min(s, a0lo);
+                    a0hi = f32x4_min(s, a0hi);
+                    a1lo = f32x4_min(s, a1lo);
+                    a1hi = f32x4_min(s, a1hi);
+                    a2lo = f32x4_min(s, a2lo);
+                    a2hi = f32x4_min(s, a2hi);
+                    a3lo = f32x4_min(s, a3lo);
+                    a3hi = f32x4_min(s, a3hi);
+                    a4lo = f32x4_min(s, a4lo);
+                    a4hi = f32x4_min(s, a4hi);
+                    a5lo = f32x4_min(s, a5lo);
+                    a5hi = f32x4_min(s, a5hi);
+                    a6lo = f32x4_min(s, a6lo);
+                    a6hi = f32x4_min(s, a6hi);
+                    a7lo = f32x4_min(s, a7lo);
+                    a7hi = f32x4_min(s, a7hi);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_max(s, a0lo);
+                    a0hi = f32x4_max(s, a0hi);
+                    a1lo = f32x4_max(s, a1lo);
+                    a1hi = f32x4_max(s, a1hi);
+                    a2lo = f32x4_max(s, a2lo);
+                    a2hi = f32x4_max(s, a2hi);
+                    a3lo = f32x4_max(s, a3lo);
+                    a3hi = f32x4_max(s, a3hi);
+                    a4lo = f32x4_max(s, a4lo);
+                    a4hi = f32x4_max(s, a4hi);
+                    a5lo = f32x4_max(s, a5lo);
+                    a5hi = f32x4_max(s, a5hi);
+                    a6lo = f32x4_max(s, a6lo);
+                    a6hi = f32x4_max(s, a6hi);
+                    a7lo = f32x4_max(s, a7lo);
+                    a7hi = f32x4_max(s, a7hi);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_add(s, a0lo);
+                    a0hi = f32x4_add(s, a0hi);
+                    a1lo = f32x4_add(s, a1lo);
+                    a1hi = f32x4_add(s, a1hi);
+                    a2lo = f32x4_add(s, a2lo);
+                    a2hi = f32x4_add(s, a2hi);
+                    a3lo = f32x4_add(s, a3lo);
+                    a3hi = f32x4_add(s, a3hi);
+                    a4lo = f32x4_add(s, a4lo);
+                    a4hi = f32x4_add(s, a4hi);
+                    a5lo = f32x4_add(s, a5lo);
+                    a5hi = f32x4_add(s, a5hi);
+                    a6lo = f32x4_add(s, a6lo);
+                    a6hi = f32x4_add(s, a6hi);
+                    a7lo = f32x4_add(s, a7lo);
+                    a7hi = f32x4_add(s, a7hi);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_sub(s, a0lo);
+                    a0hi = f32x4_sub(s, a0hi);
+                    a1lo = f32x4_sub(s, a1lo);
+                    a1hi = f32x4_sub(s, a1hi);
+                    a2lo = f32x4_sub(s, a2lo);
+                    a2hi = f32x4_sub(s, a2hi);
+                    a3lo = f32x4_sub(s, a3lo);
+                    a3hi = f32x4_sub(s, a3hi);
+                    a4lo = f32x4_sub(s, a4lo);
+                    a4hi = f32x4_sub(s, a4hi);
+                    a5lo = f32x4_sub(s, a5lo);
+                    a5hi = f32x4_sub(s, a5hi);
+                    a6lo = f32x4_sub(s, a6lo);
+                    a6hi = f32x4_sub(s, a6hi);
+                    a7lo = f32x4_sub(s, a7lo);
+                    a7hi = f32x4_sub(s, a7hi);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_sub(a0lo, s);
+                    a0hi = f32x4_sub(a0hi, s);
+                    a1lo = f32x4_sub(a1lo, s);
+                    a1hi = f32x4_sub(a1hi, s);
+                    a2lo = f32x4_sub(a2lo, s);
+                    a2hi = f32x4_sub(a2hi, s);
+                    a3lo = f32x4_sub(a3lo, s);
+                    a3hi = f32x4_sub(a3hi, s);
+                    a4lo = f32x4_sub(a4lo, s);
+                    a4hi = f32x4_sub(a4hi, s);
+                    a5lo = f32x4_sub(a5lo, s);
+                    a5hi = f32x4_sub(a5hi, s);
+                    a6lo = f32x4_sub(a6lo, s);
+                    a6hi = f32x4_sub(a6hi, s);
+                    a7lo = f32x4_sub(a7lo, s);
+                    a7hi = f32x4_sub(a7hi, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let m0a = f32x4_gt(a0lo, zero);
+                    a0lo = v128_bitselect(a0lo, f32x4_mul(s, a0lo), m0a);
+                    let m0b = f32x4_gt(a0hi, zero);
+                    a0hi = v128_bitselect(a0hi, f32x4_mul(s, a0hi), m0b);
+                    let m1a = f32x4_gt(a1lo, zero);
+                    a1lo = v128_bitselect(a1lo, f32x4_mul(s, a1lo), m1a);
+                    let m1b = f32x4_gt(a1hi, zero);
+                    a1hi = v128_bitselect(a1hi, f32x4_mul(s, a1hi), m1b);
+                    let m2a = f32x4_gt(a2lo, zero);
+                    a2lo = v128_bitselect(a2lo, f32x4_mul(s, a2lo), m2a);
+                    let m2b = f32x4_gt(a2hi, zero);
+                    a2hi = v128_bitselect(a2hi, f32x4_mul(s, a2hi), m2b);
+                    let m3a = f32x4_gt(a3lo, zero);
+                    a3lo = v128_bitselect(a3lo, f32x4_mul(s, a3lo), m3a);
+                    let m3b = f32x4_gt(a3hi, zero);
+                    a3hi = v128_bitselect(a3hi, f32x4_mul(s, a3hi), m3b);
+                    let m4a = f32x4_gt(a4lo, zero);
+                    a4lo = v128_bitselect(a4lo, f32x4_mul(s, a4lo), m4a);
+                    let m4b = f32x4_gt(a4hi, zero);
+                    a4hi = v128_bitselect(a4hi, f32x4_mul(s, a4hi), m4b);
+                    let m5a = f32x4_gt(a5lo, zero);
+                    a5lo = v128_bitselect(a5lo, f32x4_mul(s, a5lo), m5a);
+                    let m5b = f32x4_gt(a5hi, zero);
+                    a5hi = v128_bitselect(a5hi, f32x4_mul(s, a5hi), m5b);
+                    let m6a = f32x4_gt(a6lo, zero);
+                    a6lo = v128_bitselect(a6lo, f32x4_mul(s, a6lo), m6a);
+                    let m6b = f32x4_gt(a6hi, zero);
+                    a6hi = v128_bitselect(a6hi, f32x4_mul(s, a6hi), m6b);
+                    let m7a = f32x4_gt(a7lo, zero);
+                    a7lo = v128_bitselect(a7lo, f32x4_mul(s, a7lo), m7a);
+                    let m7b = f32x4_gt(a7hi, zero);
+                    a7hi = v128_bitselect(a7hi, f32x4_mul(s, a7hi), m7b);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_min(r0, a0lo);
+                    a0hi = f32x4_min(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_min(r1, a1lo);
+                    a1hi = f32x4_min(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_min(r2, a2lo);
+                    a2hi = f32x4_min(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_min(r3, a3lo);
+                    a3hi = f32x4_min(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_min(r4, a4lo);
+                    a4hi = f32x4_min(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_min(r5, a5lo);
+                    a5hi = f32x4_min(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_min(r6, a6lo);
+                    a6hi = f32x4_min(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_min(r7, a7lo);
+                    a7hi = f32x4_min(r7, a7hi);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_max(r0, a0lo);
+                    a0hi = f32x4_max(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_max(r1, a1lo);
+                    a1hi = f32x4_max(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_max(r2, a2lo);
+                    a2hi = f32x4_max(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_max(r3, a3lo);
+                    a3hi = f32x4_max(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_max(r4, a4lo);
+                    a4hi = f32x4_max(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_max(r5, a5lo);
+                    a5hi = f32x4_max(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_max(r6, a6lo);
+                    a6hi = f32x4_max(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_max(r7, a7lo);
+                    a7hi = f32x4_max(r7, a7hi);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_add(r0, a0lo);
+                    a0hi = f32x4_add(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_add(r1, a1lo);
+                    a1hi = f32x4_add(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_add(r2, a2lo);
+                    a2hi = f32x4_add(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_add(r3, a3lo);
+                    a3hi = f32x4_add(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_add(r4, a4lo);
+                    a4hi = f32x4_add(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_add(r5, a5lo);
+                    a5hi = f32x4_add(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_add(r6, a6lo);
+                    a6hi = f32x4_add(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_add(r7, a7lo);
+                    a7hi = f32x4_add(r7, a7hi);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_mul(r0, a0lo);
+                    a0hi = f32x4_mul(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_mul(r1, a1lo);
+                    a1hi = f32x4_mul(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_mul(r2, a2lo);
+                    a2hi = f32x4_mul(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_mul(r3, a3lo);
+                    a3hi = f32x4_mul(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_mul(r4, a4lo);
+                    a4hi = f32x4_mul(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_mul(r5, a5lo);
+                    a5hi = f32x4_mul(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_mul(r6, a6lo);
+                    a6hi = f32x4_mul(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_mul(r7, a7lo);
+                    a7hi = f32x4_mul(r7, a7hi);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_sub(r0, a0lo);
+                    a0hi = f32x4_sub(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_sub(r1, a1lo);
+                    a1hi = f32x4_sub(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_sub(r2, a2lo);
+                    a2hi = f32x4_sub(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_sub(r3, a3lo);
+                    a3hi = f32x4_sub(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_sub(r4, a4lo);
+                    a4hi = f32x4_sub(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_sub(r5, a5lo);
+                    a5hi = f32x4_sub(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_sub(r6, a6lo);
+                    a6hi = f32x4_sub(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_sub(r7, a7lo);
+                    a7hi = f32x4_sub(r7, a7hi);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_sub(a0lo, r0);
+                    a0hi = f32x4_sub(a0hi, r0);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_sub(a1lo, r1);
+                    a1hi = f32x4_sub(a1hi, r1);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_sub(a2lo, r2);
+                    a2hi = f32x4_sub(a2hi, r2);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_sub(a3lo, r3);
+                    a3hi = f32x4_sub(a3hi, r3);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_sub(a4lo, r4);
+                    a4hi = f32x4_sub(a4hi, r4);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_sub(a5lo, r5);
+                    a5hi = f32x4_sub(a5hi, r5);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_sub(a6lo, r6);
+                    a6hi = f32x4_sub(a6hi, r6);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_sub(a7lo, r7);
+                    a7hi = f32x4_sub(a7hi, r7);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_min(clo, a0lo);
+                    a0hi = f32x4_min(chi, a0hi);
+                    a1lo = f32x4_min(clo, a1lo);
+                    a1hi = f32x4_min(chi, a1hi);
+                    a2lo = f32x4_min(clo, a2lo);
+                    a2hi = f32x4_min(chi, a2hi);
+                    a3lo = f32x4_min(clo, a3lo);
+                    a3hi = f32x4_min(chi, a3hi);
+                    a4lo = f32x4_min(clo, a4lo);
+                    a4hi = f32x4_min(chi, a4hi);
+                    a5lo = f32x4_min(clo, a5lo);
+                    a5hi = f32x4_min(chi, a5hi);
+                    a6lo = f32x4_min(clo, a6lo);
+                    a6hi = f32x4_min(chi, a6hi);
+                    a7lo = f32x4_min(clo, a7lo);
+                    a7hi = f32x4_min(chi, a7hi);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_max(clo, a0lo);
+                    a0hi = f32x4_max(chi, a0hi);
+                    a1lo = f32x4_max(clo, a1lo);
+                    a1hi = f32x4_max(chi, a1hi);
+                    a2lo = f32x4_max(clo, a2lo);
+                    a2hi = f32x4_max(chi, a2hi);
+                    a3lo = f32x4_max(clo, a3lo);
+                    a3hi = f32x4_max(chi, a3hi);
+                    a4lo = f32x4_max(clo, a4lo);
+                    a4hi = f32x4_max(chi, a4hi);
+                    a5lo = f32x4_max(clo, a5lo);
+                    a5hi = f32x4_max(chi, a5hi);
+                    a6lo = f32x4_max(clo, a6lo);
+                    a6hi = f32x4_max(chi, a6hi);
+                    a7lo = f32x4_max(clo, a7lo);
+                    a7hi = f32x4_max(chi, a7hi);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_add(clo, a0lo);
+                    a0hi = f32x4_add(chi, a0hi);
+                    a1lo = f32x4_add(clo, a1lo);
+                    a1hi = f32x4_add(chi, a1hi);
+                    a2lo = f32x4_add(clo, a2lo);
+                    a2hi = f32x4_add(chi, a2hi);
+                    a3lo = f32x4_add(clo, a3lo);
+                    a3hi = f32x4_add(chi, a3hi);
+                    a4lo = f32x4_add(clo, a4lo);
+                    a4hi = f32x4_add(chi, a4hi);
+                    a5lo = f32x4_add(clo, a5lo);
+                    a5hi = f32x4_add(chi, a5hi);
+                    a6lo = f32x4_add(clo, a6lo);
+                    a6hi = f32x4_add(chi, a6hi);
+                    a7lo = f32x4_add(clo, a7lo);
+                    a7hi = f32x4_add(chi, a7hi);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_mul(clo, a0lo);
+                    a0hi = f32x4_mul(chi, a0hi);
+                    a1lo = f32x4_mul(clo, a1lo);
+                    a1hi = f32x4_mul(chi, a1hi);
+                    a2lo = f32x4_mul(clo, a2lo);
+                    a2hi = f32x4_mul(chi, a2hi);
+                    a3lo = f32x4_mul(clo, a3lo);
+                    a3hi = f32x4_mul(chi, a3hi);
+                    a4lo = f32x4_mul(clo, a4lo);
+                    a4hi = f32x4_mul(chi, a4hi);
+                    a5lo = f32x4_mul(clo, a5lo);
+                    a5hi = f32x4_mul(chi, a5hi);
+                    a6lo = f32x4_mul(clo, a6lo);
+                    a6hi = f32x4_mul(chi, a6hi);
+                    a7lo = f32x4_mul(clo, a7lo);
+                    a7hi = f32x4_mul(chi, a7hi);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_sub(clo, a0lo);
+                    a0hi = f32x4_sub(chi, a0hi);
+                    a1lo = f32x4_sub(clo, a1lo);
+                    a1hi = f32x4_sub(chi, a1hi);
+                    a2lo = f32x4_sub(clo, a2lo);
+                    a2hi = f32x4_sub(chi, a2hi);
+                    a3lo = f32x4_sub(clo, a3lo);
+                    a3hi = f32x4_sub(chi, a3hi);
+                    a4lo = f32x4_sub(clo, a4lo);
+                    a4hi = f32x4_sub(chi, a4hi);
+                    a5lo = f32x4_sub(clo, a5lo);
+                    a5hi = f32x4_sub(chi, a5hi);
+                    a6lo = f32x4_sub(clo, a6lo);
+                    a6hi = f32x4_sub(chi, a6hi);
+                    a7lo = f32x4_sub(clo, a7lo);
+                    a7hi = f32x4_sub(chi, a7hi);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_sub(a0lo, clo);
+                    a0hi = f32x4_sub(a0hi, chi);
+                    a1lo = f32x4_sub(a1lo, clo);
+                    a1hi = f32x4_sub(a1hi, chi);
+                    a2lo = f32x4_sub(a2lo, clo);
+                    a2hi = f32x4_sub(a2hi, chi);
+                    a3lo = f32x4_sub(a3lo, clo);
+                    a3hi = f32x4_sub(a3hi, chi);
+                    a4lo = f32x4_sub(a4lo, clo);
+                    a4hi = f32x4_sub(a4hi, chi);
+                    a5lo = f32x4_sub(a5lo, clo);
+                    a5hi = f32x4_sub(a5hi, chi);
+                    a6lo = f32x4_sub(a6lo, clo);
+                    a6hi = f32x4_sub(a6hi, chi);
+                    a7lo = f32x4_sub(a7lo, clo);
+                    a7hi = f32x4_sub(a7hi, chi);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 8 rows × 8 cols, each row laid out per col_byte_stride
+                    let mut ptr: *const u8 = tile.ptr;
+                    for ab_pair in [
+                        (&mut a0lo, &mut a0hi),
+                        (&mut a1lo, &mut a1hi),
+                        (&mut a2lo, &mut a2hi),
+                        (&mut a3lo, &mut a3hi),
+                        (&mut a4lo, &mut a4hi),
+                        (&mut a5lo, &mut a5hi),
+                        (&mut a6lo, &mut a6hi),
+                        (&mut a7lo, &mut a7hi),
+                    ]
+                    .iter_mut()
+                    {
+                        let m0 = *(ptr as *const f32);
+                        let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                        let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                        let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                        let m4 = *(ptr.offset(tile.col_byte_stride * 4) as *const f32);
+                        let m5 = *(ptr.offset(tile.col_byte_stride * 5) as *const f32);
+                        let m6 = *(ptr.offset(tile.col_byte_stride * 6) as *const f32);
+                        let m7 = *(ptr.offset(tile.col_byte_stride * 7) as *const f32);
+                        let (lo, hi) = ab_pair;
+                        **lo = f32x4_add(**lo, f32x4(m0, m1, m2, m3));
+                        **hi = f32x4_add(**hi, f32x4(m4, m5, m6, m7));
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    let r0 = f32x4_splat(*rows.add(0));
+                    a0lo = f32x4_add(a0lo, f32x4_mul(r0, clo));
+                    a0hi = f32x4_add(a0hi, f32x4_mul(r0, chi));
+                    let r1 = f32x4_splat(*rows.add(1));
+                    a1lo = f32x4_add(a1lo, f32x4_mul(r1, clo));
+                    a1hi = f32x4_add(a1hi, f32x4_mul(r1, chi));
+                    let r2 = f32x4_splat(*rows.add(2));
+                    a2lo = f32x4_add(a2lo, f32x4_mul(r2, clo));
+                    a2hi = f32x4_add(a2hi, f32x4_mul(r2, chi));
+                    let r3 = f32x4_splat(*rows.add(3));
+                    a3lo = f32x4_add(a3lo, f32x4_mul(r3, clo));
+                    a3hi = f32x4_add(a3hi, f32x4_mul(r3, chi));
+                    let r4 = f32x4_splat(*rows.add(4));
+                    a4lo = f32x4_add(a4lo, f32x4_mul(r4, clo));
+                    a4hi = f32x4_add(a4hi, f32x4_mul(r4, chi));
+                    let r5 = f32x4_splat(*rows.add(5));
+                    a5lo = f32x4_add(a5lo, f32x4_mul(r5, clo));
+                    a5hi = f32x4_add(a5hi, f32x4_mul(r5, chi));
+                    let r6 = f32x4_splat(*rows.add(6));
+                    a6lo = f32x4_add(a6lo, f32x4_mul(r6, clo));
+                    a6hi = f32x4_add(a6hi, f32x4_mul(r6, chi));
+                    let r7 = f32x4_splat(*rows.add(7));
+                    a7lo = f32x4_add(a7lo, f32x4_mul(r7, clo));
+                    a7hi = f32x4_add(a7hi, f32x4_mul(r7, chi));
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 8 rows × 8 cols stores
+                    let mut ptr: *mut u8 = tile.ptr;
+                    for (lo, hi) in [
+                        (a0lo, a0hi),
+                        (a1lo, a1hi),
+                        (a2lo, a2hi),
+                        (a3lo, a3hi),
+                        (a4lo, a4hi),
+                        (a5lo, a5hi),
+                        (a6lo, a6hi),
+                        (a7lo, a7hi),
+                    ]
+                    .iter()
+                    {
+                        *(ptr as *mut f32) = f32x4_extract_lane::<0>(*lo);
+                        *(ptr.offset(tile.col_byte_stride) as *mut f32) =
+                            f32x4_extract_lane::<1>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                            f32x4_extract_lane::<2>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                            f32x4_extract_lane::<3>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 4) as *mut f32) =
+                            f32x4_extract_lane::<0>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 5) as *mut f32) =
+                            f32x4_extract_lane::<1>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 6) as *mut f32) =
+                            f32x4_extract_lane::<2>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 7) as *mut f32) =
+                            f32x4_extract_lane::<3>(*hi);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=8] = each k iter loads 8 row values
+                    // B: packed [k][NR=8] = each k iter loads 8 col values as 2 v128
+                    let a = pa as *const f32;
+                    let b = pb as *const v128;
+                    for i in 0..k {
+                        let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8);
+                        let blo = v128_load(b.offset((2 * i) as isize));
+                        let bhi = v128_load(b.offset((2 * i + 1) as isize));
+                        let s = f32x4_splat(arow[0]);
+                        a0lo = f32x4_add(a0lo, f32x4_mul(s, blo));
+                        a0hi = f32x4_add(a0hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[1]);
+                        a1lo = f32x4_add(a1lo, f32x4_mul(s, blo));
+                        a1hi = f32x4_add(a1hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[2]);
+                        a2lo = f32x4_add(a2lo, f32x4_mul(s, blo));
+                        a2hi = f32x4_add(a2hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[3]);
+                        a3lo = f32x4_add(a3lo, f32x4_mul(s, blo));
+                        a3hi = f32x4_add(a3hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[4]);
+                        a4lo = f32x4_add(a4lo, f32x4_mul(s, blo));
+                        a4hi = f32x4_add(a4hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[5]);
+                        a5lo = f32x4_add(a5lo, f32x4_mul(s, blo));
+                        a5hi = f32x4_add(a5hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[6]);
+                        a6lo = f32x4_add(a6lo, f32x4_mul(s, blo));
+                        a6hi = f32x4_add(a6hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[7]);
+                        a7lo = f32x4_add(a7lo, f32x4_mul(s, blo));
+                        a7hi = f32x4_add(a7hi, f32x4_mul(s, bhi));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_8x8 => wasm_f32_8x8<f32>(8,8)@(8,8) quality(ImplementationQuality::TargetOptimized));
diff --git a/vendor/tract-linalg-0.22.1/src/wasm.rs.before-fma b/vendor/tract-linalg-0.22.1/src/wasm.rs.before-fma
new file mode 100644
index 000000000..628fc720c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/wasm.rs.before-fma
@@ -0,0 +1,1664 @@
+/// Wasm SIMD implementation of `MatMatMulKer<f32>`
+///
+/// To run test, you need to install `wasmtime`
+/// and export the following environment variables:
+/// ```
+/// > export RUSTFLAGS='-C target-feature=+simd128'
+/// > export CARGO_TARGET_WASM32_WASI_RUNNER=wasmtime
+/// > cargo test --target=wasm32-wasi
+/// ```
+use crate::mmm::FusedKerSpec;
+use crate::mmm::ImplementationQuality;
+use crate::{Ops, Scaler};
+
+pub fn plug(ops: &mut Ops) {
+    ops.mmm_impls.push(wasm_f32_4x4.mmm());
+    ops.mmm_impls.push(wasm_f32_4x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x1.mmm());
+    ops.mmm_impls.push(wasm_f32_16x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x8.mmm());
+    // Selection: max(nr*mr) for N>1, max(mr) for N=1.
+    //   - N>1 ops: 8x8 (nr*mr=64) wins over 4x4 (16)
+    //   - N=1 ops: 16x1 (mr=16) wins
+    ops.mmm_f32 = Box::new(|_m, _k, _n| wasm_f32_8x8.mmm());
+    ops.mmv_f32 = Box::new(|m, _k| match m.unwrap_or(0) {
+        0..=7 => wasm_f32_4x1.mmm(),
+        8..=15 => wasm_f32_8x1.mmm(),
+        _ => wasm_f32_16x1.mmm(),
+    });
+}
+
+unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Each of these variables stores a row of the matrix,
+        // consisting of four packed `f32` numbers.
+        let mut ab0 = f32x4_splat(0.0);
+        let mut ab1 = f32x4_splat(0.0);
+        let mut ab2 = f32x4_splat(0.0);
+        let mut ab3 = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let a = f32x4_splat(0.0);
+                    ab0 = a;
+                    ab1 = a;
+                    ab2 = a;
+                    ab3 = a;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    let rows = rows as *const v128;
+                    ab0 = *rows;
+                    ab1 = *rows.add(1);
+                    ab2 = *rows.add(2);
+                    ab3 = *rows.add(3);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_min(a, ab0);
+                    ab1 = f32x4_min(a, ab1);
+                    ab2 = f32x4_min(a, ab2);
+                    ab3 = f32x4_min(a, ab3);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_max(a, ab0);
+                    ab1 = f32x4_max(a, ab1);
+                    ab2 = f32x4_max(a, ab2);
+                    ab3 = f32x4_max(a, ab3);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_add(a, ab0);
+                    ab1 = f32x4_add(a, ab1);
+                    ab2 = f32x4_add(a, ab2);
+                    ab3 = f32x4_add(a, ab3);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_mul(a, ab0);
+                    ab1 = f32x4_mul(a, ab1);
+                    ab2 = f32x4_mul(a, ab2);
+                    ab3 = f32x4_mul(a, ab3);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_sub(a, ab0);
+                    ab1 = f32x4_sub(a, ab1);
+                    ab2 = f32x4_sub(a, ab2);
+                    ab3 = f32x4_sub(a, ab3);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let a = f32x4_splat(a);
+                    ab0 = f32x4_sub(ab0, a);
+                    ab1 = f32x4_sub(ab1, a);
+                    ab2 = f32x4_sub(ab2, a);
+                    ab3 = f32x4_sub(ab3, a);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let a = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+
+                    let mask0 = f32x4_gt(ab0, zero);
+                    ab0 = v128_bitselect(ab0, f32x4_mul(a, ab0), mask0);
+
+                    let mask1 = f32x4_gt(ab1, zero);
+                    ab1 = v128_bitselect(ab1, f32x4_mul(a, ab1), mask1);
+
+                    let mask2 = f32x4_gt(ab2, zero);
+                    ab2 = v128_bitselect(ab2, f32x4_mul(a, ab2), mask2);
+
+                    let mask3 = f32x4_gt(ab3, zero);
+                    ab3 = v128_bitselect(ab3, f32x4_mul(a, ab3), mask3);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_min(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_min(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_min(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_min(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_max(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_max(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_max(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_max(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_add(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_add(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_add(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_add(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_mul(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_mul(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_mul(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_mul(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_sub(f32x4_splat(row[0]), ab0);
+                    ab1 = f32x4_sub(f32x4_splat(row[1]), ab1);
+                    ab2 = f32x4_sub(f32x4_splat(row[2]), ab2);
+                    ab3 = f32x4_sub(f32x4_splat(row[3]), ab3);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let row = std::slice::from_raw_parts(row, 4);
+                    ab0 = f32x4_sub(ab0, f32x4_splat(row[0]));
+                    ab1 = f32x4_sub(ab1, f32x4_splat(row[1]));
+                    ab2 = f32x4_sub(ab2, f32x4_splat(row[2]));
+                    ab3 = f32x4_sub(ab3, f32x4_splat(row[3]));
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_min(cols, ab0);
+                    ab1 = f32x4_min(cols, ab1);
+                    ab2 = f32x4_min(cols, ab2);
+                    ab3 = f32x4_min(cols, ab3);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_max(cols, ab0);
+                    ab1 = f32x4_max(cols, ab1);
+                    ab2 = f32x4_max(cols, ab2);
+                    ab3 = f32x4_max(cols, ab3);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_add(cols, ab0);
+                    ab1 = f32x4_add(cols, ab1);
+                    ab2 = f32x4_add(cols, ab2);
+                    ab3 = f32x4_add(cols, ab3);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_mul(cols, ab0);
+                    ab1 = f32x4_mul(cols, ab1);
+                    ab2 = f32x4_mul(cols, ab2);
+                    ab3 = f32x4_mul(cols, ab3);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_sub(cols, ab0);
+                    ab1 = f32x4_sub(cols, ab1);
+                    ab2 = f32x4_sub(cols, ab2);
+                    ab3 = f32x4_sub(cols, ab3);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_sub(ab0, cols);
+                    ab1 = f32x4_sub(ab1, cols);
+                    ab2 = f32x4_sub(ab2, cols);
+                    ab3 = f32x4_sub(ab3, cols);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let scale = f32x4_splat(scaler.scale);
+                    ab0 = f32x4_mul(scale, ab0);
+                    ab1 = f32x4_mul(scale, ab1);
+                    ab2 = f32x4_mul(scale, ab2);
+                    ab3 = f32x4_mul(scale, ab3);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let shift = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab0 = f32x4_mul(shift, ab0);
+                    ab1 = f32x4_mul(shift, ab1);
+                    ab2 = f32x4_mul(shift, ab2);
+                    ab3 = f32x4_mul(shift, ab3);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let shift = f32x4_splat(2f32.powi(shift as i32));
+                    ab0 = f32x4_mul(shift, ab0);
+                    ab1 = f32x4_mul(shift, ab1);
+                    ab2 = f32x4_mul(shift, ab2);
+                    ab3 = f32x4_mul(shift, ab3);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    let mut ptr: *const u8 = tile.ptr;
+
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    ab0 = f32x4_add(ab0, f32x4(m0, m1, m2, m3));
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    ab1 = f32x4_add(ab1, f32x4(m0, m1, m2, m3));
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    ab2 = f32x4_add(ab2, f32x4(m0, m1, m2, m3));
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    ab3 = f32x4_add(ab3, f32x4(m0, m1, m2, m3));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let cols = v128_load(cols as *const v128);
+                    ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), cols));
+                    ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), cols));
+                    ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), cols));
+                    ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), cols));
+                }
+                FusedKerSpec::Store(tile) => {
+                    let mut ptr: *mut u8 = tile.ptr;
+
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab0);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab0);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                        f32x4_extract_lane::<2>(ab0);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                        f32x4_extract_lane::<3>(ab0);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab1);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab1);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                        f32x4_extract_lane::<2>(ab1);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                        f32x4_extract_lane::<3>(ab1);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab2);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab2);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                        f32x4_extract_lane::<2>(ab2);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                        f32x4_extract_lane::<3>(ab2);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab3);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab3);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                        f32x4_extract_lane::<2>(ab3);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                        f32x4_extract_lane::<3>(ab3);
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    let a = pa as *const f32;
+                    let b = pb as *const v128;
+                    for i in 0..k {
+                        let a = std::slice::from_raw_parts(a.offset(4 * i as isize), 4);
+                        let b = v128_load(b.offset(i as isize));
+                        ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(a[0]), b));
+                        ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(a[1]), b));
+                        ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(a[2]), b));
+                        ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(a[3]), b));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_4x4 => wasm_f32_4x4<f32>(4,4)@(4,4) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 4x1 kernel — GEMV-shaped variant for matrix-vector products
+/// (single-column outputs, e.g., streaming-RNN inference where each frame's
+/// activation is a single column). Mirrors the 4x4 kernel's FusedKerSpec
+/// match arms but collapses the column dimension from 4 to 1: a single
+/// f32x4 accumulator holds 4 output rows × 1 output column packed as
+/// [ab[0], ab[1], ab[2], ab[3]].
+///
+/// Selection: tract-core's einsum kernel_selection::strategize() prefers
+/// kernels with nr() == 1 when op.n.is_one(), so this kernel is
+/// automatically picked for N=1 cases once registered.
+unsafe fn kernel_f32_4x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Single accumulator: 4 rows × 1 col, packed into one f32x4.
+        // lane[i] holds ab[i] = the output value for row i (col 0).
+        let mut ab = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    ab = f32x4_splat(0.0);
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // Tile is 4 rows × 1 col = 4 contiguous f32s = 1 v128
+                    ab = v128_load(rows as *const v128);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    ab = f32x4_min(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    ab = f32x4_max(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    ab = f32x4_add(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    ab = f32x4_mul(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    ab = f32x4_sub(f32x4_splat(a), ab);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    ab = f32x4_sub(ab, f32x4_splat(a));
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let zero = f32x4_splat(0.0);
+                    let mask = f32x4_gt(ab, zero);
+                    ab = v128_bitselect(ab, f32x4_mul(f32x4_splat(a), ab), mask);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    // 4 row values, applied to ab's 4 lanes in order
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_min(r, ab);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_max(r, ab);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_add(r, ab);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_mul(r, ab);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_sub(r, ab);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let r = v128_load(row as *const v128);
+                    ab = f32x4_sub(ab, r);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    // Single col value broadcast to all 4 rows
+                    ab = f32x4_min(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    ab = f32x4_max(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    ab = f32x4_add(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    ab = f32x4_mul(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    ab = f32x4_sub(f32x4_splat(*cols), ab);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    ab = f32x4_sub(ab, f32x4_splat(*cols));
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    ab = f32x4_mul(f32x4_splat(scaler.scale), ab);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab = f32x4_mul(s, ab);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab = f32x4_mul(s, ab);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 4 rows × 1 col, with row_byte_stride between rows (col_stride irrelevant for N=1)
+                    let mut ptr: *const u8 = tile.ptr;
+                    let m0 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m1 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m2 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m3 = *(ptr as *const f32);
+                    ab = f32x4_add(ab, f32x4(m0, m1, m2, m3));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    // ab[i] += rows[i] * cols[0]  (cols[0] is the single col)
+                    let r = v128_load(rows as *const v128);
+                    let c = f32x4_splat(*cols);
+                    ab = f32x4_add(ab, f32x4_mul(r, c));
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 4 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab);
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A is packed [k][MR=4]: each k iter loads 4 contiguous f32s = 1 v128.
+                    // B is packed [k][NR=1]: each k iter loads 1 scalar f32, broadcast.
+                    // ab[i] += a[i] * b for all i in 0..4 → SIMD: ab += a_vec * b_splat
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a_vec = v128_load(a.offset(i as isize));
+                        let b_splat = f32x4_splat(*b.offset(i as isize));
+                        ab = f32x4_add(ab, f32x4_mul(a_vec, b_splat));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_4x1 => wasm_f32_4x1<f32>(4,1)@(4,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 8x1 kernel — wider GEMV variant for matrix-vector products
+/// on large M. Uses TWO independent f32x4 accumulators (rows 0-3 in ab_top,
+/// rows 4-7 in ab_bot), enabling 2-way ILP within each k-iteration:
+/// the inner loop issues two independent f32x4_add(f32x4_mul(...)) ops per
+/// k-step, breaking the data-dependency chain depth from K to ~K/2 at the
+/// hardware pipeline level.
+///
+/// Compared to wasm_f32_4x1 (1 accumulator, k-serial dep chain), this is
+/// targeted at GEMV ops where M is a multiple of 8 (or close to it). For
+/// M=256 GRU gate matmuls (the dominant GEMV in DFN3), this should yield
+/// ~2x speedup on the inner loop on hardware where SIMD FMLA throughput
+/// exceeds 1 op/cycle.
+///
+/// Selection: `kernel_selection::strategize()` prefers max mr() for n=1
+/// cases, so this kernel automatically wins over wasm_f32_4x1 for all N=1
+/// ops once registered (including small-M cases where it slightly wastes
+/// rows — for M=1 lsnr_fc-style ops, that's 7-of-8 row waste, but those
+/// ops are <1% of frame so the regression is noise).
+unsafe fn kernel_f32_8x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Two accumulators: 8 rows × 1 col packed as [ab_top, ab_bot]
+        // ab_top.lane[i] holds row i (i in 0..4); ab_bot.lane[i] holds row i+4
+        let mut ab_top = f32x4_splat(0.0);
+        let mut ab_bot = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    ab_top = f32x4_splat(0.0);
+                    ab_bot = f32x4_splat(0.0);
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // 8 rows × 1 col = 8 contiguous f32 = 2 v128
+                    let p = rows as *const v128;
+                    ab_top = *p;
+                    ab_bot = *p.add(1);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_min(s, ab_top);
+                    ab_bot = f32x4_min(s, ab_bot);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_max(s, ab_top);
+                    ab_bot = f32x4_max(s, ab_bot);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_add(s, ab_top);
+                    ab_bot = f32x4_add(s, ab_bot);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_sub(s, ab_top);
+                    ab_bot = f32x4_sub(s, ab_bot);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    ab_top = f32x4_sub(ab_top, s);
+                    ab_bot = f32x4_sub(ab_bot, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let mask_t = f32x4_gt(ab_top, zero);
+                    let mask_b = f32x4_gt(ab_bot, zero);
+                    ab_top = v128_bitselect(ab_top, f32x4_mul(s, ab_top), mask_t);
+                    ab_bot = v128_bitselect(ab_bot, f32x4_mul(s, ab_bot), mask_b);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_min(r_t, ab_top);
+                    ab_bot = f32x4_min(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_max(r_t, ab_top);
+                    ab_bot = f32x4_max(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_add(r_t, ab_top);
+                    ab_bot = f32x4_add(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_mul(r_t, ab_top);
+                    ab_bot = f32x4_mul(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_sub(r_t, ab_top);
+                    ab_bot = f32x4_sub(r_b, ab_bot);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let p = row as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    ab_top = f32x4_sub(ab_top, r_t);
+                    ab_bot = f32x4_sub(ab_bot, r_b);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_min(c, ab_top);
+                    ab_bot = f32x4_min(c, ab_bot);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_max(c, ab_top);
+                    ab_bot = f32x4_max(c, ab_bot);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_add(c, ab_top);
+                    ab_bot = f32x4_add(c, ab_bot);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_mul(c, ab_top);
+                    ab_bot = f32x4_mul(c, ab_bot);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_sub(c, ab_top);
+                    ab_bot = f32x4_sub(c, ab_bot);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_sub(ab_top, c);
+                    ab_bot = f32x4_sub(ab_bot, c);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab_top = f32x4_mul(s, ab_top);
+                    ab_bot = f32x4_mul(s, ab_bot);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 8 rows × 1 col, stride is row_byte_stride between rows
+                    let mut ptr: *const u8 = tile.ptr;
+                    let m0 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m1 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m2 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m3 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m4 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m5 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m6 = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    let m7 = *(ptr as *const f32);
+                    ab_top = f32x4_add(ab_top, f32x4(m0, m1, m2, m3));
+                    ab_bot = f32x4_add(ab_bot, f32x4(m4, m5, m6, m7));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = rows as *const v128;
+                    let r_t = v128_load(p);
+                    let r_b = v128_load(p.add(1));
+                    let c = f32x4_splat(*cols);
+                    ab_top = f32x4_add(ab_top, f32x4_mul(r_t, c));
+                    ab_bot = f32x4_add(ab_bot, f32x4_mul(r_b, c));
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 8 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_top);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_bot);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_bot);
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=8] = each k iter loads 8 f32 = 2 v128
+                    // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                    // The two fmadd ops on (ab_top, ab_bot) are independent — 2-way ILP per iter.
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a_t = v128_load(a.offset((2 * i) as isize));
+                        let a_b = v128_load(a.offset((2 * i + 1) as isize));
+                        let b_splat = f32x4_splat(*b.offset(i as isize));
+                        ab_top = f32x4_add(ab_top, f32x4_mul(a_t, b_splat));
+                        ab_bot = f32x4_add(ab_bot, f32x4_mul(a_b, b_splat));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_8x1 => wasm_f32_8x1<f32>(8,1)@(8,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 16x1 kernel — wider GEMV variant for matrix-vector products
+/// on very large M. Uses FOUR independent f32x4 accumulators (rows 0-3,
+/// 4-7, 8-11, 12-15), enabling 4-way ILP within each k-iteration.
+///
+/// Compared to wasm_f32_8x1 (2 accumulators, 2-way ILP), this exposes more
+/// parallel work to the SIMD pipelines, beneficial on hardware with 3+
+/// SIMD execution units (most modern ARM and x86).
+unsafe fn kernel_f32_16x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // Four accumulators: 16 rows × 1 col packed as [ab_q0, ab_q1, ab_q2, ab_q3]
+        // ab_q0 = rows 0-3, ab_q1 = rows 4-7, ab_q2 = rows 8-11, ab_q3 = rows 12-15
+        let mut ab_q0 = f32x4_splat(0.0);
+        let mut ab_q1 = f32x4_splat(0.0);
+        let mut ab_q2 = f32x4_splat(0.0);
+        let mut ab_q3 = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let z = f32x4_splat(0.0);
+                    ab_q0 = z;
+                    ab_q1 = z;
+                    ab_q2 = z;
+                    ab_q3 = z;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    let p = rows as *const v128;
+                    ab_q0 = *p;
+                    ab_q1 = *p.add(1);
+                    ab_q2 = *p.add(2);
+                    ab_q3 = *p.add(3);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_min(s, ab_q0);
+                    ab_q1 = f32x4_min(s, ab_q1);
+                    ab_q2 = f32x4_min(s, ab_q2);
+                    ab_q3 = f32x4_min(s, ab_q3);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_max(s, ab_q0);
+                    ab_q1 = f32x4_max(s, ab_q1);
+                    ab_q2 = f32x4_max(s, ab_q2);
+                    ab_q3 = f32x4_max(s, ab_q3);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_add(s, ab_q0);
+                    ab_q1 = f32x4_add(s, ab_q1);
+                    ab_q2 = f32x4_add(s, ab_q2);
+                    ab_q3 = f32x4_add(s, ab_q3);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_sub(s, ab_q0);
+                    ab_q1 = f32x4_sub(s, ab_q1);
+                    ab_q2 = f32x4_sub(s, ab_q2);
+                    ab_q3 = f32x4_sub(s, ab_q3);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    ab_q0 = f32x4_sub(ab_q0, s);
+                    ab_q1 = f32x4_sub(ab_q1, s);
+                    ab_q2 = f32x4_sub(ab_q2, s);
+                    ab_q3 = f32x4_sub(ab_q3, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let m0 = f32x4_gt(ab_q0, zero);
+                    ab_q0 = v128_bitselect(ab_q0, f32x4_mul(s, ab_q0), m0);
+                    let m1 = f32x4_gt(ab_q1, zero);
+                    ab_q1 = v128_bitselect(ab_q1, f32x4_mul(s, ab_q1), m1);
+                    let m2 = f32x4_gt(ab_q2, zero);
+                    ab_q2 = v128_bitselect(ab_q2, f32x4_mul(s, ab_q2), m2);
+                    let m3 = f32x4_gt(ab_q3, zero);
+                    ab_q3 = v128_bitselect(ab_q3, f32x4_mul(s, ab_q3), m3);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_min(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_min(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_min(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_min(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_max(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_max(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_max(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_max(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_add(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_add(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_add(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_add(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_mul(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_mul(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_mul(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_mul(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_sub(v128_load(p), ab_q0);
+                    ab_q1 = f32x4_sub(v128_load(p.add(1)), ab_q1);
+                    ab_q2 = f32x4_sub(v128_load(p.add(2)), ab_q2);
+                    ab_q3 = f32x4_sub(v128_load(p.add(3)), ab_q3);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let p = row as *const v128;
+                    ab_q0 = f32x4_sub(ab_q0, v128_load(p));
+                    ab_q1 = f32x4_sub(ab_q1, v128_load(p.add(1)));
+                    ab_q2 = f32x4_sub(ab_q2, v128_load(p.add(2)));
+                    ab_q3 = f32x4_sub(ab_q3, v128_load(p.add(3)));
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_min(c, ab_q0);
+                    ab_q1 = f32x4_min(c, ab_q1);
+                    ab_q2 = f32x4_min(c, ab_q2);
+                    ab_q3 = f32x4_min(c, ab_q3);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_max(c, ab_q0);
+                    ab_q1 = f32x4_max(c, ab_q1);
+                    ab_q2 = f32x4_max(c, ab_q2);
+                    ab_q3 = f32x4_max(c, ab_q3);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_add(c, ab_q0);
+                    ab_q1 = f32x4_add(c, ab_q1);
+                    ab_q2 = f32x4_add(c, ab_q2);
+                    ab_q3 = f32x4_add(c, ab_q3);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_mul(c, ab_q0);
+                    ab_q1 = f32x4_mul(c, ab_q1);
+                    ab_q2 = f32x4_mul(c, ab_q2);
+                    ab_q3 = f32x4_mul(c, ab_q3);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_sub(c, ab_q0);
+                    ab_q1 = f32x4_sub(c, ab_q1);
+                    ab_q2 = f32x4_sub(c, ab_q2);
+                    ab_q3 = f32x4_sub(c, ab_q3);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_sub(ab_q0, c);
+                    ab_q1 = f32x4_sub(ab_q1, c);
+                    ab_q2 = f32x4_sub(ab_q2, c);
+                    ab_q3 = f32x4_sub(ab_q3, c);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    ab_q0 = f32x4_mul(s, ab_q0);
+                    ab_q1 = f32x4_mul(s, ab_q1);
+                    ab_q2 = f32x4_mul(s, ab_q2);
+                    ab_q3 = f32x4_mul(s, ab_q3);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 16 rows × 1 col, with row_byte_stride between rows
+                    let mut ptr: *const u8 = tile.ptr;
+                    let mut ms = [0f32; 16];
+                    for i in 0..16 {
+                        ms[i] = *(ptr as *const f32);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                    ab_q0 = f32x4_add(ab_q0, f32x4(ms[0], ms[1], ms[2], ms[3]));
+                    ab_q1 = f32x4_add(ab_q1, f32x4(ms[4], ms[5], ms[6], ms[7]));
+                    ab_q2 = f32x4_add(ab_q2, f32x4(ms[8], ms[9], ms[10], ms[11]));
+                    ab_q3 = f32x4_add(ab_q3, f32x4(ms[12], ms[13], ms[14], ms[15]));
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = rows as *const v128;
+                    let c = f32x4_splat(*cols);
+                    ab_q0 = f32x4_add(ab_q0, f32x4_mul(v128_load(p), c));
+                    ab_q1 = f32x4_add(ab_q1, f32x4_mul(v128_load(p.add(1)), c));
+                    ab_q2 = f32x4_add(ab_q2, f32x4_mul(v128_load(p.add(2)), c));
+                    ab_q3 = f32x4_add(ab_q3, f32x4_mul(v128_load(p.add(3)), c));
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 16 rows × 1 col, write each lane to a separate row
+                    let mut ptr: *mut u8 = tile.ptr;
+                    for ab in [ab_q0, ab_q1, ab_q2, ab_q3].iter() {
+                        *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<1>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<2>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                        *(ptr as *mut f32) = f32x4_extract_lane::<3>(*ab);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=16] = each k iter loads 16 f32 = 4 v128
+                    // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                    // 4 INDEPENDENT fmadds per k-iter — 4-way ILP
+                    let a = pa as *const v128;
+                    let b = pb as *const f32;
+                    for i in 0..k {
+                        let a0 = v128_load(a.offset((4 * i) as isize));
+                        let a1 = v128_load(a.offset((4 * i + 1) as isize));
+                        let a2 = v128_load(a.offset((4 * i + 2) as isize));
+                        let a3 = v128_load(a.offset((4 * i + 3) as isize));
+                        let bs = f32x4_splat(*b.offset(i as isize));
+                        ab_q0 = f32x4_add(ab_q0, f32x4_mul(a0, bs));
+                        ab_q1 = f32x4_add(ab_q1, f32x4_mul(a1, bs));
+                        ab_q2 = f32x4_add(ab_q2, f32x4_mul(a2, bs));
+                        ab_q3 = f32x4_add(ab_q3, f32x4_mul(a3, bs));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_16x1 => wasm_f32_16x1<f32>(16,1)@(16,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 8x8 kernel — wide MM tile (8 rows × 8 cols, 16 v128 accumulators).
+/// Each row uses 2 v128: cols 0-3 in `_lo`, cols 4-7 in `_hi`. 16 accumulators
+/// is at the limit of WASM's 16 logical SIMD register slots; this tests the
+/// register-pressure boundary. For DFN3 ops, all M and N are multiples of 8,
+/// so 8x8 fits cleanly with no padding waste.
+unsafe fn kernel_f32_8x8(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    unsafe {
+        // 8 rows × 8 cols = 16 f32x4 accumulators (cols 0-3 in _lo, cols 4-7 in _hi)
+        let mut a0lo = f32x4_splat(0.0);
+        let mut a0hi = f32x4_splat(0.0);
+        let mut a1lo = f32x4_splat(0.0);
+        let mut a1hi = f32x4_splat(0.0);
+        let mut a2lo = f32x4_splat(0.0);
+        let mut a2hi = f32x4_splat(0.0);
+        let mut a3lo = f32x4_splat(0.0);
+        let mut a3hi = f32x4_splat(0.0);
+        let mut a4lo = f32x4_splat(0.0);
+        let mut a4hi = f32x4_splat(0.0);
+        let mut a5lo = f32x4_splat(0.0);
+        let mut a5hi = f32x4_splat(0.0);
+        let mut a6lo = f32x4_splat(0.0);
+        let mut a6hi = f32x4_splat(0.0);
+        let mut a7lo = f32x4_splat(0.0);
+        let mut a7hi = f32x4_splat(0.0);
+
+        while !pnl.is_null() {
+            match *pnl {
+                FusedKerSpec::Done => break,
+                FusedKerSpec::Clear => {
+                    let z = f32x4_splat(0.0);
+                    a0lo = z;
+                    a0hi = z;
+                    a1lo = z;
+                    a1hi = z;
+                    a2lo = z;
+                    a2hi = z;
+                    a3lo = z;
+                    a3hi = z;
+                    a4lo = z;
+                    a4hi = z;
+                    a5lo = z;
+                    a5hi = z;
+                    a6lo = z;
+                    a6hi = z;
+                    a7lo = z;
+                    a7hi = z;
+                }
+                FusedKerSpec::LoadTile(_cols, rows) => {
+                    // 8 rows × 8 cols = 16 v128 (2 per row, contiguous lo+hi)
+                    let p = rows as *const v128;
+                    a0lo = *p.add(0);
+                    a0hi = *p.add(1);
+                    a1lo = *p.add(2);
+                    a1hi = *p.add(3);
+                    a2lo = *p.add(4);
+                    a2hi = *p.add(5);
+                    a3lo = *p.add(6);
+                    a3hi = *p.add(7);
+                    a4lo = *p.add(8);
+                    a4hi = *p.add(9);
+                    a5lo = *p.add(10);
+                    a5hi = *p.add(11);
+                    a6lo = *p.add(12);
+                    a6hi = *p.add(13);
+                    a7lo = *p.add(14);
+                    a7hi = *p.add(15);
+                }
+                FusedKerSpec::ScalarMin(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_min(s, a0lo);
+                    a0hi = f32x4_min(s, a0hi);
+                    a1lo = f32x4_min(s, a1lo);
+                    a1hi = f32x4_min(s, a1hi);
+                    a2lo = f32x4_min(s, a2lo);
+                    a2hi = f32x4_min(s, a2hi);
+                    a3lo = f32x4_min(s, a3lo);
+                    a3hi = f32x4_min(s, a3hi);
+                    a4lo = f32x4_min(s, a4lo);
+                    a4hi = f32x4_min(s, a4hi);
+                    a5lo = f32x4_min(s, a5lo);
+                    a5hi = f32x4_min(s, a5hi);
+                    a6lo = f32x4_min(s, a6lo);
+                    a6hi = f32x4_min(s, a6hi);
+                    a7lo = f32x4_min(s, a7lo);
+                    a7hi = f32x4_min(s, a7hi);
+                }
+                FusedKerSpec::ScalarMax(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_max(s, a0lo);
+                    a0hi = f32x4_max(s, a0hi);
+                    a1lo = f32x4_max(s, a1lo);
+                    a1hi = f32x4_max(s, a1hi);
+                    a2lo = f32x4_max(s, a2lo);
+                    a2hi = f32x4_max(s, a2hi);
+                    a3lo = f32x4_max(s, a3lo);
+                    a3hi = f32x4_max(s, a3hi);
+                    a4lo = f32x4_max(s, a4lo);
+                    a4hi = f32x4_max(s, a4hi);
+                    a5lo = f32x4_max(s, a5lo);
+                    a5hi = f32x4_max(s, a5hi);
+                    a6lo = f32x4_max(s, a6lo);
+                    a6hi = f32x4_max(s, a6hi);
+                    a7lo = f32x4_max(s, a7lo);
+                    a7hi = f32x4_max(s, a7hi);
+                }
+                FusedKerSpec::ScalarAdd(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_add(s, a0lo);
+                    a0hi = f32x4_add(s, a0hi);
+                    a1lo = f32x4_add(s, a1lo);
+                    a1hi = f32x4_add(s, a1hi);
+                    a2lo = f32x4_add(s, a2lo);
+                    a2hi = f32x4_add(s, a2hi);
+                    a3lo = f32x4_add(s, a3lo);
+                    a3hi = f32x4_add(s, a3hi);
+                    a4lo = f32x4_add(s, a4lo);
+                    a4hi = f32x4_add(s, a4hi);
+                    a5lo = f32x4_add(s, a5lo);
+                    a5hi = f32x4_add(s, a5hi);
+                    a6lo = f32x4_add(s, a6lo);
+                    a6hi = f32x4_add(s, a6hi);
+                    a7lo = f32x4_add(s, a7lo);
+                    a7hi = f32x4_add(s, a7hi);
+                }
+                FusedKerSpec::ScalarMul(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::ScalarSub(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_sub(s, a0lo);
+                    a0hi = f32x4_sub(s, a0hi);
+                    a1lo = f32x4_sub(s, a1lo);
+                    a1hi = f32x4_sub(s, a1hi);
+                    a2lo = f32x4_sub(s, a2lo);
+                    a2hi = f32x4_sub(s, a2hi);
+                    a3lo = f32x4_sub(s, a3lo);
+                    a3hi = f32x4_sub(s, a3hi);
+                    a4lo = f32x4_sub(s, a4lo);
+                    a4hi = f32x4_sub(s, a4hi);
+                    a5lo = f32x4_sub(s, a5lo);
+                    a5hi = f32x4_sub(s, a5hi);
+                    a6lo = f32x4_sub(s, a6lo);
+                    a6hi = f32x4_sub(s, a6hi);
+                    a7lo = f32x4_sub(s, a7lo);
+                    a7hi = f32x4_sub(s, a7hi);
+                }
+                FusedKerSpec::ScalarSubF(a) => {
+                    let s = f32x4_splat(a);
+                    a0lo = f32x4_sub(a0lo, s);
+                    a0hi = f32x4_sub(a0hi, s);
+                    a1lo = f32x4_sub(a1lo, s);
+                    a1hi = f32x4_sub(a1hi, s);
+                    a2lo = f32x4_sub(a2lo, s);
+                    a2hi = f32x4_sub(a2hi, s);
+                    a3lo = f32x4_sub(a3lo, s);
+                    a3hi = f32x4_sub(a3hi, s);
+                    a4lo = f32x4_sub(a4lo, s);
+                    a4hi = f32x4_sub(a4hi, s);
+                    a5lo = f32x4_sub(a5lo, s);
+                    a5hi = f32x4_sub(a5hi, s);
+                    a6lo = f32x4_sub(a6lo, s);
+                    a6hi = f32x4_sub(a6hi, s);
+                    a7lo = f32x4_sub(a7lo, s);
+                    a7hi = f32x4_sub(a7hi, s);
+                }
+                FusedKerSpec::LeakyRelu(a) => {
+                    let s = f32x4_splat(a);
+                    let zero = f32x4_splat(0.0);
+                    let m0a = f32x4_gt(a0lo, zero);
+                    a0lo = v128_bitselect(a0lo, f32x4_mul(s, a0lo), m0a);
+                    let m0b = f32x4_gt(a0hi, zero);
+                    a0hi = v128_bitselect(a0hi, f32x4_mul(s, a0hi), m0b);
+                    let m1a = f32x4_gt(a1lo, zero);
+                    a1lo = v128_bitselect(a1lo, f32x4_mul(s, a1lo), m1a);
+                    let m1b = f32x4_gt(a1hi, zero);
+                    a1hi = v128_bitselect(a1hi, f32x4_mul(s, a1hi), m1b);
+                    let m2a = f32x4_gt(a2lo, zero);
+                    a2lo = v128_bitselect(a2lo, f32x4_mul(s, a2lo), m2a);
+                    let m2b = f32x4_gt(a2hi, zero);
+                    a2hi = v128_bitselect(a2hi, f32x4_mul(s, a2hi), m2b);
+                    let m3a = f32x4_gt(a3lo, zero);
+                    a3lo = v128_bitselect(a3lo, f32x4_mul(s, a3lo), m3a);
+                    let m3b = f32x4_gt(a3hi, zero);
+                    a3hi = v128_bitselect(a3hi, f32x4_mul(s, a3hi), m3b);
+                    let m4a = f32x4_gt(a4lo, zero);
+                    a4lo = v128_bitselect(a4lo, f32x4_mul(s, a4lo), m4a);
+                    let m4b = f32x4_gt(a4hi, zero);
+                    a4hi = v128_bitselect(a4hi, f32x4_mul(s, a4hi), m4b);
+                    let m5a = f32x4_gt(a5lo, zero);
+                    a5lo = v128_bitselect(a5lo, f32x4_mul(s, a5lo), m5a);
+                    let m5b = f32x4_gt(a5hi, zero);
+                    a5hi = v128_bitselect(a5hi, f32x4_mul(s, a5hi), m5b);
+                    let m6a = f32x4_gt(a6lo, zero);
+                    a6lo = v128_bitselect(a6lo, f32x4_mul(s, a6lo), m6a);
+                    let m6b = f32x4_gt(a6hi, zero);
+                    a6hi = v128_bitselect(a6hi, f32x4_mul(s, a6hi), m6b);
+                    let m7a = f32x4_gt(a7lo, zero);
+                    a7lo = v128_bitselect(a7lo, f32x4_mul(s, a7lo), m7a);
+                    let m7b = f32x4_gt(a7hi, zero);
+                    a7hi = v128_bitselect(a7hi, f32x4_mul(s, a7hi), m7b);
+                }
+                FusedKerSpec::PerRowMin(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_min(r0, a0lo);
+                    a0hi = f32x4_min(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_min(r1, a1lo);
+                    a1hi = f32x4_min(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_min(r2, a2lo);
+                    a2hi = f32x4_min(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_min(r3, a3lo);
+                    a3hi = f32x4_min(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_min(r4, a4lo);
+                    a4hi = f32x4_min(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_min(r5, a5lo);
+                    a5hi = f32x4_min(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_min(r6, a6lo);
+                    a6hi = f32x4_min(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_min(r7, a7lo);
+                    a7hi = f32x4_min(r7, a7hi);
+                }
+                FusedKerSpec::PerRowMax(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_max(r0, a0lo);
+                    a0hi = f32x4_max(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_max(r1, a1lo);
+                    a1hi = f32x4_max(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_max(r2, a2lo);
+                    a2hi = f32x4_max(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_max(r3, a3lo);
+                    a3hi = f32x4_max(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_max(r4, a4lo);
+                    a4hi = f32x4_max(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_max(r5, a5lo);
+                    a5hi = f32x4_max(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_max(r6, a6lo);
+                    a6hi = f32x4_max(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_max(r7, a7lo);
+                    a7hi = f32x4_max(r7, a7hi);
+                }
+                FusedKerSpec::PerRowAdd(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_add(r0, a0lo);
+                    a0hi = f32x4_add(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_add(r1, a1lo);
+                    a1hi = f32x4_add(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_add(r2, a2lo);
+                    a2hi = f32x4_add(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_add(r3, a3lo);
+                    a3hi = f32x4_add(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_add(r4, a4lo);
+                    a4hi = f32x4_add(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_add(r5, a5lo);
+                    a5hi = f32x4_add(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_add(r6, a6lo);
+                    a6hi = f32x4_add(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_add(r7, a7lo);
+                    a7hi = f32x4_add(r7, a7hi);
+                }
+                FusedKerSpec::PerRowMul(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_mul(r0, a0lo);
+                    a0hi = f32x4_mul(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_mul(r1, a1lo);
+                    a1hi = f32x4_mul(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_mul(r2, a2lo);
+                    a2hi = f32x4_mul(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_mul(r3, a3lo);
+                    a3hi = f32x4_mul(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_mul(r4, a4lo);
+                    a4hi = f32x4_mul(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_mul(r5, a5lo);
+                    a5hi = f32x4_mul(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_mul(r6, a6lo);
+                    a6hi = f32x4_mul(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_mul(r7, a7lo);
+                    a7hi = f32x4_mul(r7, a7hi);
+                }
+                FusedKerSpec::PerRowSub(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_sub(r0, a0lo);
+                    a0hi = f32x4_sub(r0, a0hi);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_sub(r1, a1lo);
+                    a1hi = f32x4_sub(r1, a1hi);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_sub(r2, a2lo);
+                    a2hi = f32x4_sub(r2, a2hi);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_sub(r3, a3lo);
+                    a3hi = f32x4_sub(r3, a3hi);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_sub(r4, a4lo);
+                    a4hi = f32x4_sub(r4, a4hi);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_sub(r5, a5lo);
+                    a5hi = f32x4_sub(r5, a5hi);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_sub(r6, a6lo);
+                    a6hi = f32x4_sub(r6, a6hi);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_sub(r7, a7lo);
+                    a7hi = f32x4_sub(r7, a7hi);
+                }
+                FusedKerSpec::PerRowSubF(row) => {
+                    let r = std::slice::from_raw_parts(row, 8);
+                    let r0 = f32x4_splat(r[0]);
+                    a0lo = f32x4_sub(a0lo, r0);
+                    a0hi = f32x4_sub(a0hi, r0);
+                    let r1 = f32x4_splat(r[1]);
+                    a1lo = f32x4_sub(a1lo, r1);
+                    a1hi = f32x4_sub(a1hi, r1);
+                    let r2 = f32x4_splat(r[2]);
+                    a2lo = f32x4_sub(a2lo, r2);
+                    a2hi = f32x4_sub(a2hi, r2);
+                    let r3 = f32x4_splat(r[3]);
+                    a3lo = f32x4_sub(a3lo, r3);
+                    a3hi = f32x4_sub(a3hi, r3);
+                    let r4 = f32x4_splat(r[4]);
+                    a4lo = f32x4_sub(a4lo, r4);
+                    a4hi = f32x4_sub(a4hi, r4);
+                    let r5 = f32x4_splat(r[5]);
+                    a5lo = f32x4_sub(a5lo, r5);
+                    a5hi = f32x4_sub(a5hi, r5);
+                    let r6 = f32x4_splat(r[6]);
+                    a6lo = f32x4_sub(a6lo, r6);
+                    a6hi = f32x4_sub(a6hi, r6);
+                    let r7 = f32x4_splat(r[7]);
+                    a7lo = f32x4_sub(a7lo, r7);
+                    a7hi = f32x4_sub(a7hi, r7);
+                }
+                FusedKerSpec::PerColMin(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_min(clo, a0lo);
+                    a0hi = f32x4_min(chi, a0hi);
+                    a1lo = f32x4_min(clo, a1lo);
+                    a1hi = f32x4_min(chi, a1hi);
+                    a2lo = f32x4_min(clo, a2lo);
+                    a2hi = f32x4_min(chi, a2hi);
+                    a3lo = f32x4_min(clo, a3lo);
+                    a3hi = f32x4_min(chi, a3hi);
+                    a4lo = f32x4_min(clo, a4lo);
+                    a4hi = f32x4_min(chi, a4hi);
+                    a5lo = f32x4_min(clo, a5lo);
+                    a5hi = f32x4_min(chi, a5hi);
+                    a6lo = f32x4_min(clo, a6lo);
+                    a6hi = f32x4_min(chi, a6hi);
+                    a7lo = f32x4_min(clo, a7lo);
+                    a7hi = f32x4_min(chi, a7hi);
+                }
+                FusedKerSpec::PerColMax(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_max(clo, a0lo);
+                    a0hi = f32x4_max(chi, a0hi);
+                    a1lo = f32x4_max(clo, a1lo);
+                    a1hi = f32x4_max(chi, a1hi);
+                    a2lo = f32x4_max(clo, a2lo);
+                    a2hi = f32x4_max(chi, a2hi);
+                    a3lo = f32x4_max(clo, a3lo);
+                    a3hi = f32x4_max(chi, a3hi);
+                    a4lo = f32x4_max(clo, a4lo);
+                    a4hi = f32x4_max(chi, a4hi);
+                    a5lo = f32x4_max(clo, a5lo);
+                    a5hi = f32x4_max(chi, a5hi);
+                    a6lo = f32x4_max(clo, a6lo);
+                    a6hi = f32x4_max(chi, a6hi);
+                    a7lo = f32x4_max(clo, a7lo);
+                    a7hi = f32x4_max(chi, a7hi);
+                }
+                FusedKerSpec::PerColAdd(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_add(clo, a0lo);
+                    a0hi = f32x4_add(chi, a0hi);
+                    a1lo = f32x4_add(clo, a1lo);
+                    a1hi = f32x4_add(chi, a1hi);
+                    a2lo = f32x4_add(clo, a2lo);
+                    a2hi = f32x4_add(chi, a2hi);
+                    a3lo = f32x4_add(clo, a3lo);
+                    a3hi = f32x4_add(chi, a3hi);
+                    a4lo = f32x4_add(clo, a4lo);
+                    a4hi = f32x4_add(chi, a4hi);
+                    a5lo = f32x4_add(clo, a5lo);
+                    a5hi = f32x4_add(chi, a5hi);
+                    a6lo = f32x4_add(clo, a6lo);
+                    a6hi = f32x4_add(chi, a6hi);
+                    a7lo = f32x4_add(clo, a7lo);
+                    a7hi = f32x4_add(chi, a7hi);
+                }
+                FusedKerSpec::PerColMul(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_mul(clo, a0lo);
+                    a0hi = f32x4_mul(chi, a0hi);
+                    a1lo = f32x4_mul(clo, a1lo);
+                    a1hi = f32x4_mul(chi, a1hi);
+                    a2lo = f32x4_mul(clo, a2lo);
+                    a2hi = f32x4_mul(chi, a2hi);
+                    a3lo = f32x4_mul(clo, a3lo);
+                    a3hi = f32x4_mul(chi, a3hi);
+                    a4lo = f32x4_mul(clo, a4lo);
+                    a4hi = f32x4_mul(chi, a4hi);
+                    a5lo = f32x4_mul(clo, a5lo);
+                    a5hi = f32x4_mul(chi, a5hi);
+                    a6lo = f32x4_mul(clo, a6lo);
+                    a6hi = f32x4_mul(chi, a6hi);
+                    a7lo = f32x4_mul(clo, a7lo);
+                    a7hi = f32x4_mul(chi, a7hi);
+                }
+                FusedKerSpec::PerColSub(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_sub(clo, a0lo);
+                    a0hi = f32x4_sub(chi, a0hi);
+                    a1lo = f32x4_sub(clo, a1lo);
+                    a1hi = f32x4_sub(chi, a1hi);
+                    a2lo = f32x4_sub(clo, a2lo);
+                    a2hi = f32x4_sub(chi, a2hi);
+                    a3lo = f32x4_sub(clo, a3lo);
+                    a3hi = f32x4_sub(chi, a3hi);
+                    a4lo = f32x4_sub(clo, a4lo);
+                    a4hi = f32x4_sub(chi, a4hi);
+                    a5lo = f32x4_sub(clo, a5lo);
+                    a5hi = f32x4_sub(chi, a5hi);
+                    a6lo = f32x4_sub(clo, a6lo);
+                    a6hi = f32x4_sub(chi, a6hi);
+                    a7lo = f32x4_sub(clo, a7lo);
+                    a7hi = f32x4_sub(chi, a7hi);
+                }
+                FusedKerSpec::PerColSubF(cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    a0lo = f32x4_sub(a0lo, clo);
+                    a0hi = f32x4_sub(a0hi, chi);
+                    a1lo = f32x4_sub(a1lo, clo);
+                    a1hi = f32x4_sub(a1hi, chi);
+                    a2lo = f32x4_sub(a2lo, clo);
+                    a2hi = f32x4_sub(a2hi, chi);
+                    a3lo = f32x4_sub(a3lo, clo);
+                    a3hi = f32x4_sub(a3hi, chi);
+                    a4lo = f32x4_sub(a4lo, clo);
+                    a4hi = f32x4_sub(a4hi, chi);
+                    a5lo = f32x4_sub(a5lo, clo);
+                    a5hi = f32x4_sub(a5hi, chi);
+                    a6lo = f32x4_sub(a6lo, clo);
+                    a6hi = f32x4_sub(a6hi, chi);
+                    a7lo = f32x4_sub(a7lo, clo);
+                    a7hi = f32x4_sub(a7hi, chi);
+                }
+                FusedKerSpec::QScale(shift, rp, mult) => {
+                    let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                    let s = f32x4_splat(scaler.scale);
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                    let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::ShiftLeft(shift) => {
+                    let s = f32x4_splat(2f32.powi(shift as i32));
+                    a0lo = f32x4_mul(s, a0lo);
+                    a0hi = f32x4_mul(s, a0hi);
+                    a1lo = f32x4_mul(s, a1lo);
+                    a1hi = f32x4_mul(s, a1hi);
+                    a2lo = f32x4_mul(s, a2lo);
+                    a2hi = f32x4_mul(s, a2hi);
+                    a3lo = f32x4_mul(s, a3lo);
+                    a3hi = f32x4_mul(s, a3hi);
+                    a4lo = f32x4_mul(s, a4lo);
+                    a4hi = f32x4_mul(s, a4hi);
+                    a5lo = f32x4_mul(s, a5lo);
+                    a5hi = f32x4_mul(s, a5hi);
+                    a6lo = f32x4_mul(s, a6lo);
+                    a6hi = f32x4_mul(s, a6hi);
+                    a7lo = f32x4_mul(s, a7lo);
+                    a7hi = f32x4_mul(s, a7hi);
+                }
+                FusedKerSpec::AddUnicast(tile) => {
+                    // 8 rows × 8 cols, each row laid out per col_byte_stride
+                    let mut ptr: *const u8 = tile.ptr;
+                    for ab_pair in [
+                        (&mut a0lo, &mut a0hi),
+                        (&mut a1lo, &mut a1hi),
+                        (&mut a2lo, &mut a2hi),
+                        (&mut a3lo, &mut a3hi),
+                        (&mut a4lo, &mut a4hi),
+                        (&mut a5lo, &mut a5hi),
+                        (&mut a6lo, &mut a6hi),
+                        (&mut a7lo, &mut a7hi),
+                    ]
+                    .iter_mut()
+                    {
+                        let m0 = *(ptr as *const f32);
+                        let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                        let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                        let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                        let m4 = *(ptr.offset(tile.col_byte_stride * 4) as *const f32);
+                        let m5 = *(ptr.offset(tile.col_byte_stride * 5) as *const f32);
+                        let m6 = *(ptr.offset(tile.col_byte_stride * 6) as *const f32);
+                        let m7 = *(ptr.offset(tile.col_byte_stride * 7) as *const f32);
+                        let (lo, hi) = ab_pair;
+                        **lo = f32x4_add(**lo, f32x4(m0, m1, m2, m3));
+                        **hi = f32x4_add(**hi, f32x4(m4, m5, m6, m7));
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddRowColProducts(rows, cols) => {
+                    let p = cols as *const v128;
+                    let clo = v128_load(p);
+                    let chi = v128_load(p.add(1));
+                    let r0 = f32x4_splat(*rows.add(0));
+                    a0lo = f32x4_add(a0lo, f32x4_mul(r0, clo));
+                    a0hi = f32x4_add(a0hi, f32x4_mul(r0, chi));
+                    let r1 = f32x4_splat(*rows.add(1));
+                    a1lo = f32x4_add(a1lo, f32x4_mul(r1, clo));
+                    a1hi = f32x4_add(a1hi, f32x4_mul(r1, chi));
+                    let r2 = f32x4_splat(*rows.add(2));
+                    a2lo = f32x4_add(a2lo, f32x4_mul(r2, clo));
+                    a2hi = f32x4_add(a2hi, f32x4_mul(r2, chi));
+                    let r3 = f32x4_splat(*rows.add(3));
+                    a3lo = f32x4_add(a3lo, f32x4_mul(r3, clo));
+                    a3hi = f32x4_add(a3hi, f32x4_mul(r3, chi));
+                    let r4 = f32x4_splat(*rows.add(4));
+                    a4lo = f32x4_add(a4lo, f32x4_mul(r4, clo));
+                    a4hi = f32x4_add(a4hi, f32x4_mul(r4, chi));
+                    let r5 = f32x4_splat(*rows.add(5));
+                    a5lo = f32x4_add(a5lo, f32x4_mul(r5, clo));
+                    a5hi = f32x4_add(a5hi, f32x4_mul(r5, chi));
+                    let r6 = f32x4_splat(*rows.add(6));
+                    a6lo = f32x4_add(a6lo, f32x4_mul(r6, clo));
+                    a6hi = f32x4_add(a6hi, f32x4_mul(r6, chi));
+                    let r7 = f32x4_splat(*rows.add(7));
+                    a7lo = f32x4_add(a7lo, f32x4_mul(r7, clo));
+                    a7hi = f32x4_add(a7hi, f32x4_mul(r7, chi));
+                }
+                FusedKerSpec::Store(tile) => {
+                    // 8 rows × 8 cols stores
+                    let mut ptr: *mut u8 = tile.ptr;
+                    for (lo, hi) in [
+                        (a0lo, a0hi),
+                        (a1lo, a1hi),
+                        (a2lo, a2hi),
+                        (a3lo, a3hi),
+                        (a4lo, a4hi),
+                        (a5lo, a5hi),
+                        (a6lo, a6hi),
+                        (a7lo, a7hi),
+                    ]
+                    .iter()
+                    {
+                        *(ptr as *mut f32) = f32x4_extract_lane::<0>(*lo);
+                        *(ptr.offset(tile.col_byte_stride) as *mut f32) =
+                            f32x4_extract_lane::<1>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) =
+                            f32x4_extract_lane::<2>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) =
+                            f32x4_extract_lane::<3>(*lo);
+                        *(ptr.offset(tile.col_byte_stride * 4) as *mut f32) =
+                            f32x4_extract_lane::<0>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 5) as *mut f32) =
+                            f32x4_extract_lane::<1>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 6) as *mut f32) =
+                            f32x4_extract_lane::<2>(*hi);
+                        *(ptr.offset(tile.col_byte_stride * 7) as *mut f32) =
+                            f32x4_extract_lane::<3>(*hi);
+                        ptr = ptr.add(tile.row_byte_stride as usize);
+                    }
+                }
+                FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                    // A: packed [k][MR=8] = each k iter loads 8 row values
+                    // B: packed [k][NR=8] = each k iter loads 8 col values as 2 v128
+                    let a = pa as *const f32;
+                    let b = pb as *const v128;
+                    for i in 0..k {
+                        let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8);
+                        let blo = v128_load(b.offset((2 * i) as isize));
+                        let bhi = v128_load(b.offset((2 * i + 1) as isize));
+                        let s = f32x4_splat(arow[0]);
+                        a0lo = f32x4_add(a0lo, f32x4_mul(s, blo));
+                        a0hi = f32x4_add(a0hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[1]);
+                        a1lo = f32x4_add(a1lo, f32x4_mul(s, blo));
+                        a1hi = f32x4_add(a1hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[2]);
+                        a2lo = f32x4_add(a2lo, f32x4_mul(s, blo));
+                        a2hi = f32x4_add(a2hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[3]);
+                        a3lo = f32x4_add(a3lo, f32x4_mul(s, blo));
+                        a3hi = f32x4_add(a3hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[4]);
+                        a4lo = f32x4_add(a4lo, f32x4_mul(s, blo));
+                        a4hi = f32x4_add(a4hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[5]);
+                        a5lo = f32x4_add(a5lo, f32x4_mul(s, blo));
+                        a5hi = f32x4_add(a5hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[6]);
+                        a6lo = f32x4_add(a6lo, f32x4_mul(s, blo));
+                        a6hi = f32x4_add(a6hi, f32x4_mul(s, bhi));
+                        let s = f32x4_splat(arow[7]);
+                        a7lo = f32x4_add(a7lo, f32x4_mul(s, blo));
+                        a7hi = f32x4_add(a7hi, f32x4_mul(s, bhi));
+                    }
+                }
+            }
+            pnl = pnl.add(1);
+        }
+        0
+    }
+}
+
+MMMRustKernel!(kernel_f32_8x8 => wasm_f32_8x8<f32>(8,8)@(8,8) quality(ImplementationQuality::TargetOptimized));
diff --git a/vendor/tract-linalg-0.22.1/src/wasm.rs.with-8x4 b/vendor/tract-linalg-0.22.1/src/wasm.rs.with-8x4
new file mode 100644
index 000000000..6b4a25e85
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/wasm.rs.with-8x4
@@ -0,0 +1,1555 @@
+/// Wasm SIMD implementation of `MatMatMulKer<f32>`
+///
+/// To run test, you need to install `wasmtime`
+/// and export the following environment variables:
+/// ```
+/// > export RUSTFLAGS='-C target-feature=+simd128'
+/// > export CARGO_TARGET_WASM32_WASI_RUNNER=wasmtime
+/// > cargo test --target=wasm32-wasi
+/// ```
+use crate::mmm::FusedKerSpec;
+use crate::mmm::ImplementationQuality;
+use crate::{Ops, Scaler};
+
+pub fn plug(ops: &mut Ops) {
+    ops.mmm_impls.push(wasm_f32_4x4.mmm());
+    ops.mmm_impls.push(wasm_f32_4x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x4.mmm());
+    ops.mmm_impls.push(wasm_f32_16x1.mmm());
+    ops.mmm_impls.push(wasm_f32_8x8.mmm());
+    // Selection: max(nr*mr) for N>1, max(mr) for N=1.
+    //   - N>1 ops: 8x8 (nr*mr=64) wins over 8x4 (32) and 4x4 (16)
+    //   - N=1 ops: 16x1 (mr=16) wins
+    ops.mmm_f32 = Box::new(|_m, _k, _n| wasm_f32_8x8.mmm());
+    ops.mmv_f32 = Box::new(|m, _k| {
+        match m.unwrap_or(0) {
+            0..=7 => wasm_f32_4x1.mmm(),
+            8..=15 => wasm_f32_8x1.mmm(),
+            _ => wasm_f32_16x1.mmm(),
+        }
+    });
+}
+
+unsafe fn kernel_f32_4x4(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    // Each of these variables stores a row of the matrix,
+    // consisting of four packed `f32` numbers.
+    let mut ab0 = f32x4_splat(0.0);
+    let mut ab1 = f32x4_splat(0.0);
+    let mut ab2 = f32x4_splat(0.0);
+    let mut ab3 = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                let a = f32x4_splat(0.0);
+                ab0 = a;
+                ab1 = a;
+                ab2 = a;
+                ab3 = a;
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                let rows = rows as *const v128;
+                ab0 = *rows;
+                ab1 = *rows.add(1);
+                ab2 = *rows.add(2);
+                ab3 = *rows.add(3);
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                let a = f32x4_splat(a);
+                ab0 = f32x4_min(a, ab0);
+                ab1 = f32x4_min(a, ab1);
+                ab2 = f32x4_min(a, ab2);
+                ab3 = f32x4_min(a, ab3);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                let a = f32x4_splat(a);
+                ab0 = f32x4_max(a, ab0);
+                ab1 = f32x4_max(a, ab1);
+                ab2 = f32x4_max(a, ab2);
+                ab3 = f32x4_max(a, ab3);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                let a = f32x4_splat(a);
+                ab0 = f32x4_add(a, ab0);
+                ab1 = f32x4_add(a, ab1);
+                ab2 = f32x4_add(a, ab2);
+                ab3 = f32x4_add(a, ab3);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                let a = f32x4_splat(a);
+                ab0 = f32x4_mul(a, ab0);
+                ab1 = f32x4_mul(a, ab1);
+                ab2 = f32x4_mul(a, ab2);
+                ab3 = f32x4_mul(a, ab3);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                let a = f32x4_splat(a);
+                ab0 = f32x4_sub(a, ab0);
+                ab1 = f32x4_sub(a, ab1);
+                ab2 = f32x4_sub(a, ab2);
+                ab3 = f32x4_sub(a, ab3);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                let a = f32x4_splat(a);
+                ab0 = f32x4_sub(ab0, a);
+                ab1 = f32x4_sub(ab1, a);
+                ab2 = f32x4_sub(ab2, a);
+                ab3 = f32x4_sub(ab3, a);
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                let a = f32x4_splat(a);
+                let zero = f32x4_splat(0.0);
+
+                let mask0 = f32x4_gt(ab0, zero);
+                ab0 = v128_bitselect(ab0, f32x4_mul(a, ab0), mask0);
+
+                let mask1 = f32x4_gt(ab1, zero);
+                ab1 = v128_bitselect(ab1, f32x4_mul(a, ab1), mask1);
+
+                let mask2 = f32x4_gt(ab2, zero);
+                ab2 = v128_bitselect(ab2, f32x4_mul(a, ab2), mask2);
+
+                let mask3 = f32x4_gt(ab3, zero);
+                ab3 = v128_bitselect(ab3, f32x4_mul(a, ab3), mask3);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                let row = std::slice::from_raw_parts(row, 4);
+                ab0 = f32x4_min(f32x4_splat(row[0]), ab0);
+                ab1 = f32x4_min(f32x4_splat(row[1]), ab1);
+                ab2 = f32x4_min(f32x4_splat(row[2]), ab2);
+                ab3 = f32x4_min(f32x4_splat(row[3]), ab3);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let row = std::slice::from_raw_parts(row, 4);
+                ab0 = f32x4_max(f32x4_splat(row[0]), ab0);
+                ab1 = f32x4_max(f32x4_splat(row[1]), ab1);
+                ab2 = f32x4_max(f32x4_splat(row[2]), ab2);
+                ab3 = f32x4_max(f32x4_splat(row[3]), ab3);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let row = std::slice::from_raw_parts(row, 4);
+                ab0 = f32x4_add(f32x4_splat(row[0]), ab0);
+                ab1 = f32x4_add(f32x4_splat(row[1]), ab1);
+                ab2 = f32x4_add(f32x4_splat(row[2]), ab2);
+                ab3 = f32x4_add(f32x4_splat(row[3]), ab3);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let row = std::slice::from_raw_parts(row, 4);
+                ab0 = f32x4_mul(f32x4_splat(row[0]), ab0);
+                ab1 = f32x4_mul(f32x4_splat(row[1]), ab1);
+                ab2 = f32x4_mul(f32x4_splat(row[2]), ab2);
+                ab3 = f32x4_mul(f32x4_splat(row[3]), ab3);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let row = std::slice::from_raw_parts(row, 4);
+                ab0 = f32x4_sub(f32x4_splat(row[0]), ab0);
+                ab1 = f32x4_sub(f32x4_splat(row[1]), ab1);
+                ab2 = f32x4_sub(f32x4_splat(row[2]), ab2);
+                ab3 = f32x4_sub(f32x4_splat(row[3]), ab3);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let row = std::slice::from_raw_parts(row, 4);
+                ab0 = f32x4_sub(ab0, f32x4_splat(row[0]));
+                ab1 = f32x4_sub(ab1, f32x4_splat(row[1]));
+                ab2 = f32x4_sub(ab2, f32x4_splat(row[2]));
+                ab3 = f32x4_sub(ab3, f32x4_splat(row[3]));
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                let cols = v128_load(cols as *const v128);
+                ab0 = f32x4_min(cols, ab0);
+                ab1 = f32x4_min(cols, ab1);
+                ab2 = f32x4_min(cols, ab2);
+                ab3 = f32x4_min(cols, ab3);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                let cols = v128_load(cols as *const v128);
+                ab0 = f32x4_max(cols, ab0);
+                ab1 = f32x4_max(cols, ab1);
+                ab2 = f32x4_max(cols, ab2);
+                ab3 = f32x4_max(cols, ab3);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                let cols = v128_load(cols as *const v128);
+                ab0 = f32x4_add(cols, ab0);
+                ab1 = f32x4_add(cols, ab1);
+                ab2 = f32x4_add(cols, ab2);
+                ab3 = f32x4_add(cols, ab3);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                let cols = v128_load(cols as *const v128);
+                ab0 = f32x4_mul(cols, ab0);
+                ab1 = f32x4_mul(cols, ab1);
+                ab2 = f32x4_mul(cols, ab2);
+                ab3 = f32x4_mul(cols, ab3);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                let cols = v128_load(cols as *const v128);
+                ab0 = f32x4_sub(cols, ab0);
+                ab1 = f32x4_sub(cols, ab1);
+                ab2 = f32x4_sub(cols, ab2);
+                ab3 = f32x4_sub(cols, ab3);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                let cols = v128_load(cols as *const v128);
+                ab0 = f32x4_sub(ab0, cols);
+                ab1 = f32x4_sub(ab1, cols);
+                ab2 = f32x4_sub(ab2, cols);
+                ab3 = f32x4_sub(ab3, cols);
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                let scale = f32x4_splat(scaler.scale);
+                ab0 = f32x4_mul(scale, ab0);
+                ab1 = f32x4_mul(scale, ab1);
+                ab2 = f32x4_mul(scale, ab2);
+                ab3 = f32x4_mul(scale, ab3);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                let shift = f32x4_splat(2f32.powi(-(shift as i32)));
+                ab0 = f32x4_mul(shift, ab0);
+                ab1 = f32x4_mul(shift, ab1);
+                ab2 = f32x4_mul(shift, ab2);
+                ab3 = f32x4_mul(shift, ab3);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let shift = f32x4_splat(2f32.powi(shift as i32));
+                ab0 = f32x4_mul(shift, ab0);
+                ab1 = f32x4_mul(shift, ab1);
+                ab2 = f32x4_mul(shift, ab2);
+                ab3 = f32x4_mul(shift, ab3);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                let mut ptr: *const u8 = tile.ptr;
+
+                let m0 = *(ptr as *const f32);
+                let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                ab0 = f32x4_add(ab0, f32x4(m0, m1, m2, m3));
+                ptr = ptr.add(tile.row_byte_stride as usize);
+
+                let m0 = *(ptr as *const f32);
+                let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                ab1 = f32x4_add(ab1, f32x4(m0, m1, m2, m3));
+                ptr = ptr.add(tile.row_byte_stride as usize);
+
+                let m0 = *(ptr as *const f32);
+                let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                ab2 = f32x4_add(ab2, f32x4(m0, m1, m2, m3));
+                ptr = ptr.add(tile.row_byte_stride as usize);
+
+                let m0 = *(ptr as *const f32);
+                let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                ab3 = f32x4_add(ab3, f32x4(m0, m1, m2, m3));
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                let cols = v128_load(cols as *const v128);
+                ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), cols));
+                ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), cols));
+                ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), cols));
+                ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), cols));
+            }
+            FusedKerSpec::Store(tile) => {
+                let mut ptr: *mut u8 = tile.ptr;
+
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab0);
+                *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab0);
+                *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(ab0);
+                *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(ab0);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab1);
+                *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab1);
+                *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(ab1);
+                *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(ab1);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab2);
+                *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab2);
+                *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(ab2);
+                *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(ab2);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab3);
+                *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(ab3);
+                *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(ab3);
+                *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(ab3);
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                let a = pa as *const f32;
+                let b = pb as *const v128;
+                for i in 0..k {
+                    let a = std::slice::from_raw_parts(a.offset(4 * i as isize), 4);
+                    let b = v128_load(b.offset(i as isize));
+                    ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(a[0]), b));
+                    ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(a[1]), b));
+                    ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(a[2]), b));
+                    ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(a[3]), b));
+                }
+            }
+        }
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+MMMRustKernel!(kernel_f32_4x4 => wasm_f32_4x4<f32>(4,4)@(4,4) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 4x1 kernel — GEMV-shaped variant for matrix-vector products
+/// (single-column outputs, e.g., streaming-RNN inference where each frame's
+/// activation is a single column). Mirrors the 4x4 kernel's FusedKerSpec
+/// match arms but collapses the column dimension from 4 to 1: a single
+/// f32x4 accumulator holds 4 output rows × 1 output column packed as
+/// [ab[0], ab[1], ab[2], ab[3]].
+///
+/// Selection: tract-core's einsum kernel_selection::strategize() prefers
+/// kernels with nr() == 1 when op.n.is_one(), so this kernel is
+/// automatically picked for N=1 cases once registered.
+unsafe fn kernel_f32_4x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    // Single accumulator: 4 rows × 1 col, packed into one f32x4.
+    // lane[i] holds ab[i] = the output value for row i (col 0).
+    let mut ab = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                ab = f32x4_splat(0.0);
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                // Tile is 4 rows × 1 col = 4 contiguous f32s = 1 v128
+                ab = v128_load(rows as *const v128);
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                ab = f32x4_min(f32x4_splat(a), ab);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                ab = f32x4_max(f32x4_splat(a), ab);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                ab = f32x4_add(f32x4_splat(a), ab);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                ab = f32x4_mul(f32x4_splat(a), ab);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                ab = f32x4_sub(f32x4_splat(a), ab);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                ab = f32x4_sub(ab, f32x4_splat(a));
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                let zero = f32x4_splat(0.0);
+                let mask = f32x4_gt(ab, zero);
+                ab = v128_bitselect(ab, f32x4_mul(f32x4_splat(a), ab), mask);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                // 4 row values, applied to ab's 4 lanes in order
+                let r = v128_load(row as *const v128);
+                ab = f32x4_min(r, ab);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let r = v128_load(row as *const v128);
+                ab = f32x4_max(r, ab);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let r = v128_load(row as *const v128);
+                ab = f32x4_add(r, ab);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let r = v128_load(row as *const v128);
+                ab = f32x4_mul(r, ab);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let r = v128_load(row as *const v128);
+                ab = f32x4_sub(r, ab);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let r = v128_load(row as *const v128);
+                ab = f32x4_sub(ab, r);
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                // Single col value broadcast to all 4 rows
+                ab = f32x4_min(f32x4_splat(*cols), ab);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                ab = f32x4_max(f32x4_splat(*cols), ab);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                ab = f32x4_add(f32x4_splat(*cols), ab);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                ab = f32x4_mul(f32x4_splat(*cols), ab);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                ab = f32x4_sub(f32x4_splat(*cols), ab);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                ab = f32x4_sub(ab, f32x4_splat(*cols));
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                ab = f32x4_mul(f32x4_splat(scaler.scale), ab);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                ab = f32x4_mul(s, ab);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let s = f32x4_splat(2f32.powi(shift as i32));
+                ab = f32x4_mul(s, ab);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                // 4 rows × 1 col, with row_byte_stride between rows (col_stride irrelevant for N=1)
+                let mut ptr: *const u8 = tile.ptr;
+                let m0 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m1 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m2 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m3 = *(ptr as *const f32);
+                ab = f32x4_add(ab, f32x4(m0, m1, m2, m3));
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                // ab[i] += rows[i] * cols[0]  (cols[0] is the single col)
+                let r = v128_load(rows as *const v128);
+                let c = f32x4_splat(*cols);
+                ab = f32x4_add(ab, f32x4_mul(r, c));
+            }
+            FusedKerSpec::Store(tile) => {
+                // 4 rows × 1 col, write each lane to a separate row
+                let mut ptr: *mut u8 = tile.ptr;
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab);
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                // A is packed [k][MR=4]: each k iter loads 4 contiguous f32s = 1 v128.
+                // B is packed [k][NR=1]: each k iter loads 1 scalar f32, broadcast.
+                // ab[i] += a[i] * b for all i in 0..4 → SIMD: ab += a_vec * b_splat
+                let a = pa as *const v128;
+                let b = pb as *const f32;
+                for i in 0..k {
+                    let a_vec = v128_load(a.offset(i as isize));
+                    let b_splat = f32x4_splat(*b.offset(i as isize));
+                    ab = f32x4_add(ab, f32x4_mul(a_vec, b_splat));
+                }
+            }
+        }
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+MMMRustKernel!(kernel_f32_4x1 => wasm_f32_4x1<f32>(4,1)@(4,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 8x1 kernel — wider GEMV variant for matrix-vector products
+/// on large M. Uses TWO independent f32x4 accumulators (rows 0-3 in ab_top,
+/// rows 4-7 in ab_bot), enabling 2-way ILP within each k-iteration:
+/// the inner loop issues two independent f32x4_add(f32x4_mul(...)) ops per
+/// k-step, breaking the data-dependency chain depth from K to ~K/2 at the
+/// hardware pipeline level.
+///
+/// Compared to wasm_f32_4x1 (1 accumulator, k-serial dep chain), this is
+/// targeted at GEMV ops where M is a multiple of 8 (or close to it). For
+/// M=256 GRU gate matmuls (the dominant GEMV in DFN3), this should yield
+/// ~2x speedup on the inner loop on hardware where SIMD FMLA throughput
+/// exceeds 1 op/cycle.
+///
+/// Selection: `kernel_selection::strategize()` prefers max mr() for n=1
+/// cases, so this kernel automatically wins over wasm_f32_4x1 for all N=1
+/// ops once registered (including small-M cases where it slightly wastes
+/// rows — for M=1 lsnr_fc-style ops, that's 7-of-8 row waste, but those
+/// ops are <1% of frame so the regression is noise).
+unsafe fn kernel_f32_8x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    // Two accumulators: 8 rows × 1 col packed as [ab_top, ab_bot]
+    // ab_top.lane[i] holds row i (i in 0..4); ab_bot.lane[i] holds row i+4
+    let mut ab_top = f32x4_splat(0.0);
+    let mut ab_bot = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                ab_top = f32x4_splat(0.0);
+                ab_bot = f32x4_splat(0.0);
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                // 8 rows × 1 col = 8 contiguous f32 = 2 v128
+                let p = rows as *const v128;
+                ab_top = *p;
+                ab_bot = *p.add(1);
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_min(s, ab_top);
+                ab_bot = f32x4_min(s, ab_bot);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_max(s, ab_top);
+                ab_bot = f32x4_max(s, ab_bot);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_add(s, ab_top);
+                ab_bot = f32x4_add(s, ab_bot);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_mul(s, ab_top);
+                ab_bot = f32x4_mul(s, ab_bot);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_sub(s, ab_top);
+                ab_bot = f32x4_sub(s, ab_bot);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                let s = f32x4_splat(a);
+                ab_top = f32x4_sub(ab_top, s);
+                ab_bot = f32x4_sub(ab_bot, s);
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                let s = f32x4_splat(a);
+                let zero = f32x4_splat(0.0);
+                let mask_t = f32x4_gt(ab_top, zero);
+                let mask_b = f32x4_gt(ab_bot, zero);
+                ab_top = v128_bitselect(ab_top, f32x4_mul(s, ab_top), mask_t);
+                ab_bot = v128_bitselect(ab_bot, f32x4_mul(s, ab_bot), mask_b);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_min(r_t, ab_top);
+                ab_bot = f32x4_min(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_max(r_t, ab_top);
+                ab_bot = f32x4_max(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_add(r_t, ab_top);
+                ab_bot = f32x4_add(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_mul(r_t, ab_top);
+                ab_bot = f32x4_mul(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_sub(r_t, ab_top);
+                ab_bot = f32x4_sub(r_b, ab_bot);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let p = row as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                ab_top = f32x4_sub(ab_top, r_t);
+                ab_bot = f32x4_sub(ab_bot, r_b);
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_min(c, ab_top);
+                ab_bot = f32x4_min(c, ab_bot);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_max(c, ab_top);
+                ab_bot = f32x4_max(c, ab_bot);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_add(c, ab_top);
+                ab_bot = f32x4_add(c, ab_bot);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_mul(c, ab_top);
+                ab_bot = f32x4_mul(c, ab_bot);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_sub(c, ab_top);
+                ab_bot = f32x4_sub(c, ab_bot);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_sub(ab_top, c);
+                ab_bot = f32x4_sub(ab_bot, c);
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                let s = f32x4_splat(scaler.scale);
+                ab_top = f32x4_mul(s, ab_top);
+                ab_bot = f32x4_mul(s, ab_bot);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                ab_top = f32x4_mul(s, ab_top);
+                ab_bot = f32x4_mul(s, ab_bot);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let s = f32x4_splat(2f32.powi(shift as i32));
+                ab_top = f32x4_mul(s, ab_top);
+                ab_bot = f32x4_mul(s, ab_bot);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                // 8 rows × 1 col, stride is row_byte_stride between rows
+                let mut ptr: *const u8 = tile.ptr;
+                let m0 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m1 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m2 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m3 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m4 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m5 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m6 = *(ptr as *const f32);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                let m7 = *(ptr as *const f32);
+                ab_top = f32x4_add(ab_top, f32x4(m0, m1, m2, m3));
+                ab_bot = f32x4_add(ab_bot, f32x4(m4, m5, m6, m7));
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                let p = rows as *const v128;
+                let r_t = v128_load(p);
+                let r_b = v128_load(p.add(1));
+                let c = f32x4_splat(*cols);
+                ab_top = f32x4_add(ab_top, f32x4_mul(r_t, c));
+                ab_bot = f32x4_add(ab_bot, f32x4_mul(r_b, c));
+            }
+            FusedKerSpec::Store(tile) => {
+                // 8 rows × 1 col, write each lane to a separate row
+                let mut ptr: *mut u8 = tile.ptr;
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_top);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_top);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_top);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_top);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<0>(ab_bot);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<1>(ab_bot);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<2>(ab_bot);
+                ptr = ptr.add(tile.row_byte_stride as usize);
+                *(ptr as *mut f32) = f32x4_extract_lane::<3>(ab_bot);
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                // A: packed [k][MR=8] = each k iter loads 8 f32 = 2 v128
+                // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                // The two fmadd ops on (ab_top, ab_bot) are independent — 2-way ILP per iter.
+                let a = pa as *const v128;
+                let b = pb as *const f32;
+                for i in 0..k {
+                    let a_t = v128_load(a.offset((2 * i) as isize));
+                    let a_b = v128_load(a.offset((2 * i + 1) as isize));
+                    let b_splat = f32x4_splat(*b.offset(i as isize));
+                    ab_top = f32x4_add(ab_top, f32x4_mul(a_t, b_splat));
+                    ab_bot = f32x4_add(ab_bot, f32x4_mul(a_b, b_splat));
+                }
+            }
+        }
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+MMMRustKernel!(kernel_f32_8x1 => wasm_f32_8x1<f32>(8,1)@(8,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 8x4 kernel — wider MM tile for matrix-matrix products
+/// where N>1 (i.e., N>=4 effectively, since the strategizer picks this for
+/// any non-N=1 case). Processes 8 rows × 4 cols per tile using 8 f32x4
+/// accumulators (one per row, holding 4 cols each).
+///
+/// Compared to the existing wasm_f32_4x4 (4 accums, mr*nr=16), this has
+/// 2× the rows per tile (mr*nr=32) so:
+///   - Half as many tile iterations needed to cover the M dimension
+///   - 8 independent fmadds per k-iter (vs 4) — better SIMD pipeline saturation
+///   - Same column-tile width (4), no waste vs 4x4 for N>1 cases
+///
+/// Selection: kernel_selection::strategize() prefers max(nr*mr) for N>1,
+/// so this wins over 4x4 (32 > 16) for any non-GEMV op.
+unsafe fn kernel_f32_8x4(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    // 8 accumulators: 8 rows × 4 cols, packed as 8 f32x4
+    let mut ab0 = f32x4_splat(0.0);
+    let mut ab1 = f32x4_splat(0.0);
+    let mut ab2 = f32x4_splat(0.0);
+    let mut ab3 = f32x4_splat(0.0);
+    let mut ab4 = f32x4_splat(0.0);
+    let mut ab5 = f32x4_splat(0.0);
+    let mut ab6 = f32x4_splat(0.0);
+    let mut ab7 = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                let z = f32x4_splat(0.0);
+                ab0 = z; ab1 = z; ab2 = z; ab3 = z;
+                ab4 = z; ab5 = z; ab6 = z; ab7 = z;
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                let p = rows as *const v128;
+                ab0 = *p;
+                ab1 = *p.add(1);
+                ab2 = *p.add(2);
+                ab3 = *p.add(3);
+                ab4 = *p.add(4);
+                ab5 = *p.add(5);
+                ab6 = *p.add(6);
+                ab7 = *p.add(7);
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_min(s, ab0); ab1 = f32x4_min(s, ab1);
+                ab2 = f32x4_min(s, ab2); ab3 = f32x4_min(s, ab3);
+                ab4 = f32x4_min(s, ab4); ab5 = f32x4_min(s, ab5);
+                ab6 = f32x4_min(s, ab6); ab7 = f32x4_min(s, ab7);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_max(s, ab0); ab1 = f32x4_max(s, ab1);
+                ab2 = f32x4_max(s, ab2); ab3 = f32x4_max(s, ab3);
+                ab4 = f32x4_max(s, ab4); ab5 = f32x4_max(s, ab5);
+                ab6 = f32x4_max(s, ab6); ab7 = f32x4_max(s, ab7);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_add(s, ab0); ab1 = f32x4_add(s, ab1);
+                ab2 = f32x4_add(s, ab2); ab3 = f32x4_add(s, ab3);
+                ab4 = f32x4_add(s, ab4); ab5 = f32x4_add(s, ab5);
+                ab6 = f32x4_add(s, ab6); ab7 = f32x4_add(s, ab7);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_mul(s, ab0); ab1 = f32x4_mul(s, ab1);
+                ab2 = f32x4_mul(s, ab2); ab3 = f32x4_mul(s, ab3);
+                ab4 = f32x4_mul(s, ab4); ab5 = f32x4_mul(s, ab5);
+                ab6 = f32x4_mul(s, ab6); ab7 = f32x4_mul(s, ab7);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_sub(s, ab0); ab1 = f32x4_sub(s, ab1);
+                ab2 = f32x4_sub(s, ab2); ab3 = f32x4_sub(s, ab3);
+                ab4 = f32x4_sub(s, ab4); ab5 = f32x4_sub(s, ab5);
+                ab6 = f32x4_sub(s, ab6); ab7 = f32x4_sub(s, ab7);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                let s = f32x4_splat(a);
+                ab0 = f32x4_sub(ab0, s); ab1 = f32x4_sub(ab1, s);
+                ab2 = f32x4_sub(ab2, s); ab3 = f32x4_sub(ab3, s);
+                ab4 = f32x4_sub(ab4, s); ab5 = f32x4_sub(ab5, s);
+                ab6 = f32x4_sub(ab6, s); ab7 = f32x4_sub(ab7, s);
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                let s = f32x4_splat(a);
+                let zero = f32x4_splat(0.0);
+                let m0 = f32x4_gt(ab0, zero); ab0 = v128_bitselect(ab0, f32x4_mul(s, ab0), m0);
+                let m1 = f32x4_gt(ab1, zero); ab1 = v128_bitselect(ab1, f32x4_mul(s, ab1), m1);
+                let m2 = f32x4_gt(ab2, zero); ab2 = v128_bitselect(ab2, f32x4_mul(s, ab2), m2);
+                let m3 = f32x4_gt(ab3, zero); ab3 = v128_bitselect(ab3, f32x4_mul(s, ab3), m3);
+                let m4 = f32x4_gt(ab4, zero); ab4 = v128_bitselect(ab4, f32x4_mul(s, ab4), m4);
+                let m5 = f32x4_gt(ab5, zero); ab5 = v128_bitselect(ab5, f32x4_mul(s, ab5), m5);
+                let m6 = f32x4_gt(ab6, zero); ab6 = v128_bitselect(ab6, f32x4_mul(s, ab6), m6);
+                let m7 = f32x4_gt(ab7, zero); ab7 = v128_bitselect(ab7, f32x4_mul(s, ab7), m7);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_min(f32x4_splat(r[0]), ab0); ab1 = f32x4_min(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_min(f32x4_splat(r[2]), ab2); ab3 = f32x4_min(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_min(f32x4_splat(r[4]), ab4); ab5 = f32x4_min(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_min(f32x4_splat(r[6]), ab6); ab7 = f32x4_min(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_max(f32x4_splat(r[0]), ab0); ab1 = f32x4_max(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_max(f32x4_splat(r[2]), ab2); ab3 = f32x4_max(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_max(f32x4_splat(r[4]), ab4); ab5 = f32x4_max(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_max(f32x4_splat(r[6]), ab6); ab7 = f32x4_max(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_add(f32x4_splat(r[0]), ab0); ab1 = f32x4_add(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_add(f32x4_splat(r[2]), ab2); ab3 = f32x4_add(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_add(f32x4_splat(r[4]), ab4); ab5 = f32x4_add(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_add(f32x4_splat(r[6]), ab6); ab7 = f32x4_add(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_mul(f32x4_splat(r[0]), ab0); ab1 = f32x4_mul(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_mul(f32x4_splat(r[2]), ab2); ab3 = f32x4_mul(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_mul(f32x4_splat(r[4]), ab4); ab5 = f32x4_mul(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_mul(f32x4_splat(r[6]), ab6); ab7 = f32x4_mul(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_sub(f32x4_splat(r[0]), ab0); ab1 = f32x4_sub(f32x4_splat(r[1]), ab1);
+                ab2 = f32x4_sub(f32x4_splat(r[2]), ab2); ab3 = f32x4_sub(f32x4_splat(r[3]), ab3);
+                ab4 = f32x4_sub(f32x4_splat(r[4]), ab4); ab5 = f32x4_sub(f32x4_splat(r[5]), ab5);
+                ab6 = f32x4_sub(f32x4_splat(r[6]), ab6); ab7 = f32x4_sub(f32x4_splat(r[7]), ab7);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                ab0 = f32x4_sub(ab0, f32x4_splat(r[0])); ab1 = f32x4_sub(ab1, f32x4_splat(r[1]));
+                ab2 = f32x4_sub(ab2, f32x4_splat(r[2])); ab3 = f32x4_sub(ab3, f32x4_splat(r[3]));
+                ab4 = f32x4_sub(ab4, f32x4_splat(r[4])); ab5 = f32x4_sub(ab5, f32x4_splat(r[5]));
+                ab6 = f32x4_sub(ab6, f32x4_splat(r[6])); ab7 = f32x4_sub(ab7, f32x4_splat(r[7]));
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_min(c, ab0); ab1 = f32x4_min(c, ab1);
+                ab2 = f32x4_min(c, ab2); ab3 = f32x4_min(c, ab3);
+                ab4 = f32x4_min(c, ab4); ab5 = f32x4_min(c, ab5);
+                ab6 = f32x4_min(c, ab6); ab7 = f32x4_min(c, ab7);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_max(c, ab0); ab1 = f32x4_max(c, ab1);
+                ab2 = f32x4_max(c, ab2); ab3 = f32x4_max(c, ab3);
+                ab4 = f32x4_max(c, ab4); ab5 = f32x4_max(c, ab5);
+                ab6 = f32x4_max(c, ab6); ab7 = f32x4_max(c, ab7);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_add(c, ab0); ab1 = f32x4_add(c, ab1);
+                ab2 = f32x4_add(c, ab2); ab3 = f32x4_add(c, ab3);
+                ab4 = f32x4_add(c, ab4); ab5 = f32x4_add(c, ab5);
+                ab6 = f32x4_add(c, ab6); ab7 = f32x4_add(c, ab7);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_mul(c, ab0); ab1 = f32x4_mul(c, ab1);
+                ab2 = f32x4_mul(c, ab2); ab3 = f32x4_mul(c, ab3);
+                ab4 = f32x4_mul(c, ab4); ab5 = f32x4_mul(c, ab5);
+                ab6 = f32x4_mul(c, ab6); ab7 = f32x4_mul(c, ab7);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_sub(c, ab0); ab1 = f32x4_sub(c, ab1);
+                ab2 = f32x4_sub(c, ab2); ab3 = f32x4_sub(c, ab3);
+                ab4 = f32x4_sub(c, ab4); ab5 = f32x4_sub(c, ab5);
+                ab6 = f32x4_sub(c, ab6); ab7 = f32x4_sub(c, ab7);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_sub(ab0, c); ab1 = f32x4_sub(ab1, c);
+                ab2 = f32x4_sub(ab2, c); ab3 = f32x4_sub(ab3, c);
+                ab4 = f32x4_sub(ab4, c); ab5 = f32x4_sub(ab5, c);
+                ab6 = f32x4_sub(ab6, c); ab7 = f32x4_sub(ab7, c);
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                let s = f32x4_splat(scaler.scale);
+                ab0 = f32x4_mul(s, ab0); ab1 = f32x4_mul(s, ab1);
+                ab2 = f32x4_mul(s, ab2); ab3 = f32x4_mul(s, ab3);
+                ab4 = f32x4_mul(s, ab4); ab5 = f32x4_mul(s, ab5);
+                ab6 = f32x4_mul(s, ab6); ab7 = f32x4_mul(s, ab7);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                ab0 = f32x4_mul(s, ab0); ab1 = f32x4_mul(s, ab1);
+                ab2 = f32x4_mul(s, ab2); ab3 = f32x4_mul(s, ab3);
+                ab4 = f32x4_mul(s, ab4); ab5 = f32x4_mul(s, ab5);
+                ab6 = f32x4_mul(s, ab6); ab7 = f32x4_mul(s, ab7);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let s = f32x4_splat(2f32.powi(shift as i32));
+                ab0 = f32x4_mul(s, ab0); ab1 = f32x4_mul(s, ab1);
+                ab2 = f32x4_mul(s, ab2); ab3 = f32x4_mul(s, ab3);
+                ab4 = f32x4_mul(s, ab4); ab5 = f32x4_mul(s, ab5);
+                ab6 = f32x4_mul(s, ab6); ab7 = f32x4_mul(s, ab7);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                // 8 rows × 4 cols, with col_byte_stride and row_byte_stride
+                let mut ptr: *const u8 = tile.ptr;
+                for ab_ref in [&mut ab0, &mut ab1, &mut ab2, &mut ab3,
+                               &mut ab4, &mut ab5, &mut ab6, &mut ab7].iter_mut() {
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    **ab_ref = f32x4_add(**ab_ref, f32x4(m0, m1, m2, m3));
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                }
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                let c = v128_load(cols as *const v128);
+                ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(*rows.add(0)), c));
+                ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(*rows.add(1)), c));
+                ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(*rows.add(2)), c));
+                ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(*rows.add(3)), c));
+                ab4 = f32x4_add(ab4, f32x4_mul(f32x4_splat(*rows.add(4)), c));
+                ab5 = f32x4_add(ab5, f32x4_mul(f32x4_splat(*rows.add(5)), c));
+                ab6 = f32x4_add(ab6, f32x4_mul(f32x4_splat(*rows.add(6)), c));
+                ab7 = f32x4_add(ab7, f32x4_mul(f32x4_splat(*rows.add(7)), c));
+            }
+            FusedKerSpec::Store(tile) => {
+                // 8 rows × 4 cols stores
+                let mut ptr: *mut u8 = tile.ptr;
+                for ab in [ab0, ab1, ab2, ab3, ab4, ab5, ab6, ab7].iter() {
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(*ab);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(*ab);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(*ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                }
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                // A: packed [k][MR=8] = each k iter loads 8 row values
+                // B: packed [k][NR=4] = each k iter loads 4 col values as 1 v128
+                let a = pa as *const f32;
+                let b = pb as *const v128;
+                for i in 0..k {
+                    let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8);
+                    let bvec = v128_load(b.offset(i as isize));
+                    ab0 = f32x4_add(ab0, f32x4_mul(f32x4_splat(arow[0]), bvec));
+                    ab1 = f32x4_add(ab1, f32x4_mul(f32x4_splat(arow[1]), bvec));
+                    ab2 = f32x4_add(ab2, f32x4_mul(f32x4_splat(arow[2]), bvec));
+                    ab3 = f32x4_add(ab3, f32x4_mul(f32x4_splat(arow[3]), bvec));
+                    ab4 = f32x4_add(ab4, f32x4_mul(f32x4_splat(arow[4]), bvec));
+                    ab5 = f32x4_add(ab5, f32x4_mul(f32x4_splat(arow[5]), bvec));
+                    ab6 = f32x4_add(ab6, f32x4_mul(f32x4_splat(arow[6]), bvec));
+                    ab7 = f32x4_add(ab7, f32x4_mul(f32x4_splat(arow[7]), bvec));
+                }
+            }
+        }
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+MMMRustKernel!(kernel_f32_8x4 => wasm_f32_8x4<f32>(8,4)@(8,4) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 16x1 kernel — wider GEMV variant for matrix-vector products
+/// on very large M. Uses FOUR independent f32x4 accumulators (rows 0-3,
+/// 4-7, 8-11, 12-15), enabling 4-way ILP within each k-iteration.
+///
+/// Compared to wasm_f32_8x1 (2 accumulators, 2-way ILP), this exposes more
+/// parallel work to the SIMD pipelines, beneficial on hardware with 3+
+/// SIMD execution units (most modern ARM and x86).
+unsafe fn kernel_f32_16x1(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    // Four accumulators: 16 rows × 1 col packed as [ab_q0, ab_q1, ab_q2, ab_q3]
+    // ab_q0 = rows 0-3, ab_q1 = rows 4-7, ab_q2 = rows 8-11, ab_q3 = rows 12-15
+    let mut ab_q0 = f32x4_splat(0.0);
+    let mut ab_q1 = f32x4_splat(0.0);
+    let mut ab_q2 = f32x4_splat(0.0);
+    let mut ab_q3 = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                let z = f32x4_splat(0.0);
+                ab_q0 = z; ab_q1 = z; ab_q2 = z; ab_q3 = z;
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                let p = rows as *const v128;
+                ab_q0 = *p;
+                ab_q1 = *p.add(1);
+                ab_q2 = *p.add(2);
+                ab_q3 = *p.add(3);
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_min(s, ab_q0); ab_q1 = f32x4_min(s, ab_q1);
+                ab_q2 = f32x4_min(s, ab_q2); ab_q3 = f32x4_min(s, ab_q3);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_max(s, ab_q0); ab_q1 = f32x4_max(s, ab_q1);
+                ab_q2 = f32x4_max(s, ab_q2); ab_q3 = f32x4_max(s, ab_q3);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_add(s, ab_q0); ab_q1 = f32x4_add(s, ab_q1);
+                ab_q2 = f32x4_add(s, ab_q2); ab_q3 = f32x4_add(s, ab_q3);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_mul(s, ab_q0); ab_q1 = f32x4_mul(s, ab_q1);
+                ab_q2 = f32x4_mul(s, ab_q2); ab_q3 = f32x4_mul(s, ab_q3);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_sub(s, ab_q0); ab_q1 = f32x4_sub(s, ab_q1);
+                ab_q2 = f32x4_sub(s, ab_q2); ab_q3 = f32x4_sub(s, ab_q3);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                let s = f32x4_splat(a);
+                ab_q0 = f32x4_sub(ab_q0, s); ab_q1 = f32x4_sub(ab_q1, s);
+                ab_q2 = f32x4_sub(ab_q2, s); ab_q3 = f32x4_sub(ab_q3, s);
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                let s = f32x4_splat(a);
+                let zero = f32x4_splat(0.0);
+                let m0 = f32x4_gt(ab_q0, zero); ab_q0 = v128_bitselect(ab_q0, f32x4_mul(s, ab_q0), m0);
+                let m1 = f32x4_gt(ab_q1, zero); ab_q1 = v128_bitselect(ab_q1, f32x4_mul(s, ab_q1), m1);
+                let m2 = f32x4_gt(ab_q2, zero); ab_q2 = v128_bitselect(ab_q2, f32x4_mul(s, ab_q2), m2);
+                let m3 = f32x4_gt(ab_q3, zero); ab_q3 = v128_bitselect(ab_q3, f32x4_mul(s, ab_q3), m3);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_min(v128_load(p), ab_q0);
+                ab_q1 = f32x4_min(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_min(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_min(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_max(v128_load(p), ab_q0);
+                ab_q1 = f32x4_max(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_max(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_max(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_add(v128_load(p), ab_q0);
+                ab_q1 = f32x4_add(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_add(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_add(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_mul(v128_load(p), ab_q0);
+                ab_q1 = f32x4_mul(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_mul(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_mul(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_sub(v128_load(p), ab_q0);
+                ab_q1 = f32x4_sub(v128_load(p.add(1)), ab_q1);
+                ab_q2 = f32x4_sub(v128_load(p.add(2)), ab_q2);
+                ab_q3 = f32x4_sub(v128_load(p.add(3)), ab_q3);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let p = row as *const v128;
+                ab_q0 = f32x4_sub(ab_q0, v128_load(p));
+                ab_q1 = f32x4_sub(ab_q1, v128_load(p.add(1)));
+                ab_q2 = f32x4_sub(ab_q2, v128_load(p.add(2)));
+                ab_q3 = f32x4_sub(ab_q3, v128_load(p.add(3)));
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_min(c, ab_q0); ab_q1 = f32x4_min(c, ab_q1);
+                ab_q2 = f32x4_min(c, ab_q2); ab_q3 = f32x4_min(c, ab_q3);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_max(c, ab_q0); ab_q1 = f32x4_max(c, ab_q1);
+                ab_q2 = f32x4_max(c, ab_q2); ab_q3 = f32x4_max(c, ab_q3);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_add(c, ab_q0); ab_q1 = f32x4_add(c, ab_q1);
+                ab_q2 = f32x4_add(c, ab_q2); ab_q3 = f32x4_add(c, ab_q3);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_mul(c, ab_q0); ab_q1 = f32x4_mul(c, ab_q1);
+                ab_q2 = f32x4_mul(c, ab_q2); ab_q3 = f32x4_mul(c, ab_q3);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_sub(c, ab_q0); ab_q1 = f32x4_sub(c, ab_q1);
+                ab_q2 = f32x4_sub(c, ab_q2); ab_q3 = f32x4_sub(c, ab_q3);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_sub(ab_q0, c); ab_q1 = f32x4_sub(ab_q1, c);
+                ab_q2 = f32x4_sub(ab_q2, c); ab_q3 = f32x4_sub(ab_q3, c);
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                let s = f32x4_splat(scaler.scale);
+                ab_q0 = f32x4_mul(s, ab_q0); ab_q1 = f32x4_mul(s, ab_q1);
+                ab_q2 = f32x4_mul(s, ab_q2); ab_q3 = f32x4_mul(s, ab_q3);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                ab_q0 = f32x4_mul(s, ab_q0); ab_q1 = f32x4_mul(s, ab_q1);
+                ab_q2 = f32x4_mul(s, ab_q2); ab_q3 = f32x4_mul(s, ab_q3);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let s = f32x4_splat(2f32.powi(shift as i32));
+                ab_q0 = f32x4_mul(s, ab_q0); ab_q1 = f32x4_mul(s, ab_q1);
+                ab_q2 = f32x4_mul(s, ab_q2); ab_q3 = f32x4_mul(s, ab_q3);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                // 16 rows × 1 col, with row_byte_stride between rows
+                let mut ptr: *const u8 = tile.ptr;
+                let mut ms = [0f32; 16];
+                for i in 0..16 {
+                    ms[i] = *(ptr as *const f32);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                }
+                ab_q0 = f32x4_add(ab_q0, f32x4(ms[0], ms[1], ms[2], ms[3]));
+                ab_q1 = f32x4_add(ab_q1, f32x4(ms[4], ms[5], ms[6], ms[7]));
+                ab_q2 = f32x4_add(ab_q2, f32x4(ms[8], ms[9], ms[10], ms[11]));
+                ab_q3 = f32x4_add(ab_q3, f32x4(ms[12], ms[13], ms[14], ms[15]));
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                let p = rows as *const v128;
+                let c = f32x4_splat(*cols);
+                ab_q0 = f32x4_add(ab_q0, f32x4_mul(v128_load(p), c));
+                ab_q1 = f32x4_add(ab_q1, f32x4_mul(v128_load(p.add(1)), c));
+                ab_q2 = f32x4_add(ab_q2, f32x4_mul(v128_load(p.add(2)), c));
+                ab_q3 = f32x4_add(ab_q3, f32x4_mul(v128_load(p.add(3)), c));
+            }
+            FusedKerSpec::Store(tile) => {
+                // 16 rows × 1 col, write each lane to a separate row
+                let mut ptr: *mut u8 = tile.ptr;
+                for ab in [ab_q0, ab_q1, ab_q2, ab_q3].iter() {
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(*ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<1>(*ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<2>(*ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                    *(ptr as *mut f32) = f32x4_extract_lane::<3>(*ab);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                }
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                // A: packed [k][MR=16] = each k iter loads 16 f32 = 4 v128
+                // B: packed [k][NR=1] = each k iter loads 1 scalar f32, broadcast
+                // 4 INDEPENDENT fmadds per k-iter — 4-way ILP
+                let a = pa as *const v128;
+                let b = pb as *const f32;
+                for i in 0..k {
+                    let a0 = v128_load(a.offset((4 * i) as isize));
+                    let a1 = v128_load(a.offset((4 * i + 1) as isize));
+                    let a2 = v128_load(a.offset((4 * i + 2) as isize));
+                    let a3 = v128_load(a.offset((4 * i + 3) as isize));
+                    let bs = f32x4_splat(*b.offset(i as isize));
+                    ab_q0 = f32x4_add(ab_q0, f32x4_mul(a0, bs));
+                    ab_q1 = f32x4_add(ab_q1, f32x4_mul(a1, bs));
+                    ab_q2 = f32x4_add(ab_q2, f32x4_mul(a2, bs));
+                    ab_q3 = f32x4_add(ab_q3, f32x4_mul(a3, bs));
+                }
+            }
+        }
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+
+MMMRustKernel!(kernel_f32_16x1 => wasm_f32_16x1<f32>(16,1)@(16,1) quality(ImplementationQuality::TargetOptimized));
+
+/// WASM SIMD f32 8x8 kernel — wide MM tile (8 rows × 8 cols, 16 v128 accumulators).
+/// Each row uses 2 v128: cols 0-3 in `_lo`, cols 4-7 in `_hi`. 16 accumulators
+/// is at the limit of WASM's 16 logical SIMD register slots; this tests the
+/// register-pressure boundary. For DFN3 ops, all M and N are multiples of 8,
+/// so 8x8 fits cleanly with no padding waste.
+unsafe fn kernel_f32_8x8(mut pnl: *const FusedKerSpec<f32>) -> isize {
+    use std::arch::wasm32::*;
+
+    // 8 rows × 8 cols = 16 f32x4 accumulators (cols 0-3 in _lo, cols 4-7 in _hi)
+    let mut a0lo = f32x4_splat(0.0); let mut a0hi = f32x4_splat(0.0);
+    let mut a1lo = f32x4_splat(0.0); let mut a1hi = f32x4_splat(0.0);
+    let mut a2lo = f32x4_splat(0.0); let mut a2hi = f32x4_splat(0.0);
+    let mut a3lo = f32x4_splat(0.0); let mut a3hi = f32x4_splat(0.0);
+    let mut a4lo = f32x4_splat(0.0); let mut a4hi = f32x4_splat(0.0);
+    let mut a5lo = f32x4_splat(0.0); let mut a5hi = f32x4_splat(0.0);
+    let mut a6lo = f32x4_splat(0.0); let mut a6hi = f32x4_splat(0.0);
+    let mut a7lo = f32x4_splat(0.0); let mut a7hi = f32x4_splat(0.0);
+
+    while !pnl.is_null() {
+        match *pnl {
+            FusedKerSpec::Done => break,
+            FusedKerSpec::Clear => {
+                let z = f32x4_splat(0.0);
+                a0lo = z; a0hi = z; a1lo = z; a1hi = z;
+                a2lo = z; a2hi = z; a3lo = z; a3hi = z;
+                a4lo = z; a4hi = z; a5lo = z; a5hi = z;
+                a6lo = z; a6hi = z; a7lo = z; a7hi = z;
+            }
+            FusedKerSpec::LoadTile(_cols, rows) => {
+                // 8 rows × 8 cols = 16 v128 (2 per row, contiguous lo+hi)
+                let p = rows as *const v128;
+                a0lo = *p.add(0); a0hi = *p.add(1);
+                a1lo = *p.add(2); a1hi = *p.add(3);
+                a2lo = *p.add(4); a2hi = *p.add(5);
+                a3lo = *p.add(6); a3hi = *p.add(7);
+                a4lo = *p.add(8); a4hi = *p.add(9);
+                a5lo = *p.add(10); a5hi = *p.add(11);
+                a6lo = *p.add(12); a6hi = *p.add(13);
+                a7lo = *p.add(14); a7hi = *p.add(15);
+            }
+            FusedKerSpec::ScalarMin(a) => {
+                let s = f32x4_splat(a);
+                a0lo = f32x4_min(s, a0lo); a0hi = f32x4_min(s, a0hi);
+                a1lo = f32x4_min(s, a1lo); a1hi = f32x4_min(s, a1hi);
+                a2lo = f32x4_min(s, a2lo); a2hi = f32x4_min(s, a2hi);
+                a3lo = f32x4_min(s, a3lo); a3hi = f32x4_min(s, a3hi);
+                a4lo = f32x4_min(s, a4lo); a4hi = f32x4_min(s, a4hi);
+                a5lo = f32x4_min(s, a5lo); a5hi = f32x4_min(s, a5hi);
+                a6lo = f32x4_min(s, a6lo); a6hi = f32x4_min(s, a6hi);
+                a7lo = f32x4_min(s, a7lo); a7hi = f32x4_min(s, a7hi);
+            }
+            FusedKerSpec::ScalarMax(a) => {
+                let s = f32x4_splat(a);
+                a0lo = f32x4_max(s, a0lo); a0hi = f32x4_max(s, a0hi);
+                a1lo = f32x4_max(s, a1lo); a1hi = f32x4_max(s, a1hi);
+                a2lo = f32x4_max(s, a2lo); a2hi = f32x4_max(s, a2hi);
+                a3lo = f32x4_max(s, a3lo); a3hi = f32x4_max(s, a3hi);
+                a4lo = f32x4_max(s, a4lo); a4hi = f32x4_max(s, a4hi);
+                a5lo = f32x4_max(s, a5lo); a5hi = f32x4_max(s, a5hi);
+                a6lo = f32x4_max(s, a6lo); a6hi = f32x4_max(s, a6hi);
+                a7lo = f32x4_max(s, a7lo); a7hi = f32x4_max(s, a7hi);
+            }
+            FusedKerSpec::ScalarAdd(a) => {
+                let s = f32x4_splat(a);
+                a0lo = f32x4_add(s, a0lo); a0hi = f32x4_add(s, a0hi);
+                a1lo = f32x4_add(s, a1lo); a1hi = f32x4_add(s, a1hi);
+                a2lo = f32x4_add(s, a2lo); a2hi = f32x4_add(s, a2hi);
+                a3lo = f32x4_add(s, a3lo); a3hi = f32x4_add(s, a3hi);
+                a4lo = f32x4_add(s, a4lo); a4hi = f32x4_add(s, a4hi);
+                a5lo = f32x4_add(s, a5lo); a5hi = f32x4_add(s, a5hi);
+                a6lo = f32x4_add(s, a6lo); a6hi = f32x4_add(s, a6hi);
+                a7lo = f32x4_add(s, a7lo); a7hi = f32x4_add(s, a7hi);
+            }
+            FusedKerSpec::ScalarMul(a) => {
+                let s = f32x4_splat(a);
+                a0lo = f32x4_mul(s, a0lo); a0hi = f32x4_mul(s, a0hi);
+                a1lo = f32x4_mul(s, a1lo); a1hi = f32x4_mul(s, a1hi);
+                a2lo = f32x4_mul(s, a2lo); a2hi = f32x4_mul(s, a2hi);
+                a3lo = f32x4_mul(s, a3lo); a3hi = f32x4_mul(s, a3hi);
+                a4lo = f32x4_mul(s, a4lo); a4hi = f32x4_mul(s, a4hi);
+                a5lo = f32x4_mul(s, a5lo); a5hi = f32x4_mul(s, a5hi);
+                a6lo = f32x4_mul(s, a6lo); a6hi = f32x4_mul(s, a6hi);
+                a7lo = f32x4_mul(s, a7lo); a7hi = f32x4_mul(s, a7hi);
+            }
+            FusedKerSpec::ScalarSub(a) => {
+                let s = f32x4_splat(a);
+                a0lo = f32x4_sub(s, a0lo); a0hi = f32x4_sub(s, a0hi);
+                a1lo = f32x4_sub(s, a1lo); a1hi = f32x4_sub(s, a1hi);
+                a2lo = f32x4_sub(s, a2lo); a2hi = f32x4_sub(s, a2hi);
+                a3lo = f32x4_sub(s, a3lo); a3hi = f32x4_sub(s, a3hi);
+                a4lo = f32x4_sub(s, a4lo); a4hi = f32x4_sub(s, a4hi);
+                a5lo = f32x4_sub(s, a5lo); a5hi = f32x4_sub(s, a5hi);
+                a6lo = f32x4_sub(s, a6lo); a6hi = f32x4_sub(s, a6hi);
+                a7lo = f32x4_sub(s, a7lo); a7hi = f32x4_sub(s, a7hi);
+            }
+            FusedKerSpec::ScalarSubF(a) => {
+                let s = f32x4_splat(a);
+                a0lo = f32x4_sub(a0lo, s); a0hi = f32x4_sub(a0hi, s);
+                a1lo = f32x4_sub(a1lo, s); a1hi = f32x4_sub(a1hi, s);
+                a2lo = f32x4_sub(a2lo, s); a2hi = f32x4_sub(a2hi, s);
+                a3lo = f32x4_sub(a3lo, s); a3hi = f32x4_sub(a3hi, s);
+                a4lo = f32x4_sub(a4lo, s); a4hi = f32x4_sub(a4hi, s);
+                a5lo = f32x4_sub(a5lo, s); a5hi = f32x4_sub(a5hi, s);
+                a6lo = f32x4_sub(a6lo, s); a6hi = f32x4_sub(a6hi, s);
+                a7lo = f32x4_sub(a7lo, s); a7hi = f32x4_sub(a7hi, s);
+            }
+            FusedKerSpec::LeakyRelu(a) => {
+                let s = f32x4_splat(a);
+                let zero = f32x4_splat(0.0);
+                let m0a = f32x4_gt(a0lo, zero); a0lo = v128_bitselect(a0lo, f32x4_mul(s, a0lo), m0a);
+                let m0b = f32x4_gt(a0hi, zero); a0hi = v128_bitselect(a0hi, f32x4_mul(s, a0hi), m0b);
+                let m1a = f32x4_gt(a1lo, zero); a1lo = v128_bitselect(a1lo, f32x4_mul(s, a1lo), m1a);
+                let m1b = f32x4_gt(a1hi, zero); a1hi = v128_bitselect(a1hi, f32x4_mul(s, a1hi), m1b);
+                let m2a = f32x4_gt(a2lo, zero); a2lo = v128_bitselect(a2lo, f32x4_mul(s, a2lo), m2a);
+                let m2b = f32x4_gt(a2hi, zero); a2hi = v128_bitselect(a2hi, f32x4_mul(s, a2hi), m2b);
+                let m3a = f32x4_gt(a3lo, zero); a3lo = v128_bitselect(a3lo, f32x4_mul(s, a3lo), m3a);
+                let m3b = f32x4_gt(a3hi, zero); a3hi = v128_bitselect(a3hi, f32x4_mul(s, a3hi), m3b);
+                let m4a = f32x4_gt(a4lo, zero); a4lo = v128_bitselect(a4lo, f32x4_mul(s, a4lo), m4a);
+                let m4b = f32x4_gt(a4hi, zero); a4hi = v128_bitselect(a4hi, f32x4_mul(s, a4hi), m4b);
+                let m5a = f32x4_gt(a5lo, zero); a5lo = v128_bitselect(a5lo, f32x4_mul(s, a5lo), m5a);
+                let m5b = f32x4_gt(a5hi, zero); a5hi = v128_bitselect(a5hi, f32x4_mul(s, a5hi), m5b);
+                let m6a = f32x4_gt(a6lo, zero); a6lo = v128_bitselect(a6lo, f32x4_mul(s, a6lo), m6a);
+                let m6b = f32x4_gt(a6hi, zero); a6hi = v128_bitselect(a6hi, f32x4_mul(s, a6hi), m6b);
+                let m7a = f32x4_gt(a7lo, zero); a7lo = v128_bitselect(a7lo, f32x4_mul(s, a7lo), m7a);
+                let m7b = f32x4_gt(a7hi, zero); a7hi = v128_bitselect(a7hi, f32x4_mul(s, a7hi), m7b);
+            }
+            FusedKerSpec::PerRowMin(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                let r0 = f32x4_splat(r[0]); a0lo = f32x4_min(r0, a0lo); a0hi = f32x4_min(r0, a0hi);
+                let r1 = f32x4_splat(r[1]); a1lo = f32x4_min(r1, a1lo); a1hi = f32x4_min(r1, a1hi);
+                let r2 = f32x4_splat(r[2]); a2lo = f32x4_min(r2, a2lo); a2hi = f32x4_min(r2, a2hi);
+                let r3 = f32x4_splat(r[3]); a3lo = f32x4_min(r3, a3lo); a3hi = f32x4_min(r3, a3hi);
+                let r4 = f32x4_splat(r[4]); a4lo = f32x4_min(r4, a4lo); a4hi = f32x4_min(r4, a4hi);
+                let r5 = f32x4_splat(r[5]); a5lo = f32x4_min(r5, a5lo); a5hi = f32x4_min(r5, a5hi);
+                let r6 = f32x4_splat(r[6]); a6lo = f32x4_min(r6, a6lo); a6hi = f32x4_min(r6, a6hi);
+                let r7 = f32x4_splat(r[7]); a7lo = f32x4_min(r7, a7lo); a7hi = f32x4_min(r7, a7hi);
+            }
+            FusedKerSpec::PerRowMax(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                let r0 = f32x4_splat(r[0]); a0lo = f32x4_max(r0, a0lo); a0hi = f32x4_max(r0, a0hi);
+                let r1 = f32x4_splat(r[1]); a1lo = f32x4_max(r1, a1lo); a1hi = f32x4_max(r1, a1hi);
+                let r2 = f32x4_splat(r[2]); a2lo = f32x4_max(r2, a2lo); a2hi = f32x4_max(r2, a2hi);
+                let r3 = f32x4_splat(r[3]); a3lo = f32x4_max(r3, a3lo); a3hi = f32x4_max(r3, a3hi);
+                let r4 = f32x4_splat(r[4]); a4lo = f32x4_max(r4, a4lo); a4hi = f32x4_max(r4, a4hi);
+                let r5 = f32x4_splat(r[5]); a5lo = f32x4_max(r5, a5lo); a5hi = f32x4_max(r5, a5hi);
+                let r6 = f32x4_splat(r[6]); a6lo = f32x4_max(r6, a6lo); a6hi = f32x4_max(r6, a6hi);
+                let r7 = f32x4_splat(r[7]); a7lo = f32x4_max(r7, a7lo); a7hi = f32x4_max(r7, a7hi);
+            }
+            FusedKerSpec::PerRowAdd(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                let r0 = f32x4_splat(r[0]); a0lo = f32x4_add(r0, a0lo); a0hi = f32x4_add(r0, a0hi);
+                let r1 = f32x4_splat(r[1]); a1lo = f32x4_add(r1, a1lo); a1hi = f32x4_add(r1, a1hi);
+                let r2 = f32x4_splat(r[2]); a2lo = f32x4_add(r2, a2lo); a2hi = f32x4_add(r2, a2hi);
+                let r3 = f32x4_splat(r[3]); a3lo = f32x4_add(r3, a3lo); a3hi = f32x4_add(r3, a3hi);
+                let r4 = f32x4_splat(r[4]); a4lo = f32x4_add(r4, a4lo); a4hi = f32x4_add(r4, a4hi);
+                let r5 = f32x4_splat(r[5]); a5lo = f32x4_add(r5, a5lo); a5hi = f32x4_add(r5, a5hi);
+                let r6 = f32x4_splat(r[6]); a6lo = f32x4_add(r6, a6lo); a6hi = f32x4_add(r6, a6hi);
+                let r7 = f32x4_splat(r[7]); a7lo = f32x4_add(r7, a7lo); a7hi = f32x4_add(r7, a7hi);
+            }
+            FusedKerSpec::PerRowMul(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                let r0 = f32x4_splat(r[0]); a0lo = f32x4_mul(r0, a0lo); a0hi = f32x4_mul(r0, a0hi);
+                let r1 = f32x4_splat(r[1]); a1lo = f32x4_mul(r1, a1lo); a1hi = f32x4_mul(r1, a1hi);
+                let r2 = f32x4_splat(r[2]); a2lo = f32x4_mul(r2, a2lo); a2hi = f32x4_mul(r2, a2hi);
+                let r3 = f32x4_splat(r[3]); a3lo = f32x4_mul(r3, a3lo); a3hi = f32x4_mul(r3, a3hi);
+                let r4 = f32x4_splat(r[4]); a4lo = f32x4_mul(r4, a4lo); a4hi = f32x4_mul(r4, a4hi);
+                let r5 = f32x4_splat(r[5]); a5lo = f32x4_mul(r5, a5lo); a5hi = f32x4_mul(r5, a5hi);
+                let r6 = f32x4_splat(r[6]); a6lo = f32x4_mul(r6, a6lo); a6hi = f32x4_mul(r6, a6hi);
+                let r7 = f32x4_splat(r[7]); a7lo = f32x4_mul(r7, a7lo); a7hi = f32x4_mul(r7, a7hi);
+            }
+            FusedKerSpec::PerRowSub(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                let r0 = f32x4_splat(r[0]); a0lo = f32x4_sub(r0, a0lo); a0hi = f32x4_sub(r0, a0hi);
+                let r1 = f32x4_splat(r[1]); a1lo = f32x4_sub(r1, a1lo); a1hi = f32x4_sub(r1, a1hi);
+                let r2 = f32x4_splat(r[2]); a2lo = f32x4_sub(r2, a2lo); a2hi = f32x4_sub(r2, a2hi);
+                let r3 = f32x4_splat(r[3]); a3lo = f32x4_sub(r3, a3lo); a3hi = f32x4_sub(r3, a3hi);
+                let r4 = f32x4_splat(r[4]); a4lo = f32x4_sub(r4, a4lo); a4hi = f32x4_sub(r4, a4hi);
+                let r5 = f32x4_splat(r[5]); a5lo = f32x4_sub(r5, a5lo); a5hi = f32x4_sub(r5, a5hi);
+                let r6 = f32x4_splat(r[6]); a6lo = f32x4_sub(r6, a6lo); a6hi = f32x4_sub(r6, a6hi);
+                let r7 = f32x4_splat(r[7]); a7lo = f32x4_sub(r7, a7lo); a7hi = f32x4_sub(r7, a7hi);
+            }
+            FusedKerSpec::PerRowSubF(row) => {
+                let r = std::slice::from_raw_parts(row, 8);
+                let r0 = f32x4_splat(r[0]); a0lo = f32x4_sub(a0lo, r0); a0hi = f32x4_sub(a0hi, r0);
+                let r1 = f32x4_splat(r[1]); a1lo = f32x4_sub(a1lo, r1); a1hi = f32x4_sub(a1hi, r1);
+                let r2 = f32x4_splat(r[2]); a2lo = f32x4_sub(a2lo, r2); a2hi = f32x4_sub(a2hi, r2);
+                let r3 = f32x4_splat(r[3]); a3lo = f32x4_sub(a3lo, r3); a3hi = f32x4_sub(a3hi, r3);
+                let r4 = f32x4_splat(r[4]); a4lo = f32x4_sub(a4lo, r4); a4hi = f32x4_sub(a4hi, r4);
+                let r5 = f32x4_splat(r[5]); a5lo = f32x4_sub(a5lo, r5); a5hi = f32x4_sub(a5hi, r5);
+                let r6 = f32x4_splat(r[6]); a6lo = f32x4_sub(a6lo, r6); a6hi = f32x4_sub(a6hi, r6);
+                let r7 = f32x4_splat(r[7]); a7lo = f32x4_sub(a7lo, r7); a7hi = f32x4_sub(a7hi, r7);
+            }
+            FusedKerSpec::PerColMin(cols) => {
+                let p = cols as *const v128;
+                let clo = v128_load(p); let chi = v128_load(p.add(1));
+                a0lo = f32x4_min(clo, a0lo); a0hi = f32x4_min(chi, a0hi);
+                a1lo = f32x4_min(clo, a1lo); a1hi = f32x4_min(chi, a1hi);
+                a2lo = f32x4_min(clo, a2lo); a2hi = f32x4_min(chi, a2hi);
+                a3lo = f32x4_min(clo, a3lo); a3hi = f32x4_min(chi, a3hi);
+                a4lo = f32x4_min(clo, a4lo); a4hi = f32x4_min(chi, a4hi);
+                a5lo = f32x4_min(clo, a5lo); a5hi = f32x4_min(chi, a5hi);
+                a6lo = f32x4_min(clo, a6lo); a6hi = f32x4_min(chi, a6hi);
+                a7lo = f32x4_min(clo, a7lo); a7hi = f32x4_min(chi, a7hi);
+            }
+            FusedKerSpec::PerColMax(cols) => {
+                let p = cols as *const v128;
+                let clo = v128_load(p); let chi = v128_load(p.add(1));
+                a0lo = f32x4_max(clo, a0lo); a0hi = f32x4_max(chi, a0hi);
+                a1lo = f32x4_max(clo, a1lo); a1hi = f32x4_max(chi, a1hi);
+                a2lo = f32x4_max(clo, a2lo); a2hi = f32x4_max(chi, a2hi);
+                a3lo = f32x4_max(clo, a3lo); a3hi = f32x4_max(chi, a3hi);
+                a4lo = f32x4_max(clo, a4lo); a4hi = f32x4_max(chi, a4hi);
+                a5lo = f32x4_max(clo, a5lo); a5hi = f32x4_max(chi, a5hi);
+                a6lo = f32x4_max(clo, a6lo); a6hi = f32x4_max(chi, a6hi);
+                a7lo = f32x4_max(clo, a7lo); a7hi = f32x4_max(chi, a7hi);
+            }
+            FusedKerSpec::PerColAdd(cols) => {
+                let p = cols as *const v128;
+                let clo = v128_load(p); let chi = v128_load(p.add(1));
+                a0lo = f32x4_add(clo, a0lo); a0hi = f32x4_add(chi, a0hi);
+                a1lo = f32x4_add(clo, a1lo); a1hi = f32x4_add(chi, a1hi);
+                a2lo = f32x4_add(clo, a2lo); a2hi = f32x4_add(chi, a2hi);
+                a3lo = f32x4_add(clo, a3lo); a3hi = f32x4_add(chi, a3hi);
+                a4lo = f32x4_add(clo, a4lo); a4hi = f32x4_add(chi, a4hi);
+                a5lo = f32x4_add(clo, a5lo); a5hi = f32x4_add(chi, a5hi);
+                a6lo = f32x4_add(clo, a6lo); a6hi = f32x4_add(chi, a6hi);
+                a7lo = f32x4_add(clo, a7lo); a7hi = f32x4_add(chi, a7hi);
+            }
+            FusedKerSpec::PerColMul(cols) => {
+                let p = cols as *const v128;
+                let clo = v128_load(p); let chi = v128_load(p.add(1));
+                a0lo = f32x4_mul(clo, a0lo); a0hi = f32x4_mul(chi, a0hi);
+                a1lo = f32x4_mul(clo, a1lo); a1hi = f32x4_mul(chi, a1hi);
+                a2lo = f32x4_mul(clo, a2lo); a2hi = f32x4_mul(chi, a2hi);
+                a3lo = f32x4_mul(clo, a3lo); a3hi = f32x4_mul(chi, a3hi);
+                a4lo = f32x4_mul(clo, a4lo); a4hi = f32x4_mul(chi, a4hi);
+                a5lo = f32x4_mul(clo, a5lo); a5hi = f32x4_mul(chi, a5hi);
+                a6lo = f32x4_mul(clo, a6lo); a6hi = f32x4_mul(chi, a6hi);
+                a7lo = f32x4_mul(clo, a7lo); a7hi = f32x4_mul(chi, a7hi);
+            }
+            FusedKerSpec::PerColSub(cols) => {
+                let p = cols as *const v128;
+                let clo = v128_load(p); let chi = v128_load(p.add(1));
+                a0lo = f32x4_sub(clo, a0lo); a0hi = f32x4_sub(chi, a0hi);
+                a1lo = f32x4_sub(clo, a1lo); a1hi = f32x4_sub(chi, a1hi);
+                a2lo = f32x4_sub(clo, a2lo); a2hi = f32x4_sub(chi, a2hi);
+                a3lo = f32x4_sub(clo, a3lo); a3hi = f32x4_sub(chi, a3hi);
+                a4lo = f32x4_sub(clo, a4lo); a4hi = f32x4_sub(chi, a4hi);
+                a5lo = f32x4_sub(clo, a5lo); a5hi = f32x4_sub(chi, a5hi);
+                a6lo = f32x4_sub(clo, a6lo); a6hi = f32x4_sub(chi, a6hi);
+                a7lo = f32x4_sub(clo, a7lo); a7hi = f32x4_sub(chi, a7hi);
+            }
+            FusedKerSpec::PerColSubF(cols) => {
+                let p = cols as *const v128;
+                let clo = v128_load(p); let chi = v128_load(p.add(1));
+                a0lo = f32x4_sub(a0lo, clo); a0hi = f32x4_sub(a0hi, chi);
+                a1lo = f32x4_sub(a1lo, clo); a1hi = f32x4_sub(a1hi, chi);
+                a2lo = f32x4_sub(a2lo, clo); a2hi = f32x4_sub(a2hi, chi);
+                a3lo = f32x4_sub(a3lo, clo); a3hi = f32x4_sub(a3hi, chi);
+                a4lo = f32x4_sub(a4lo, clo); a4hi = f32x4_sub(a4hi, chi);
+                a5lo = f32x4_sub(a5lo, clo); a5hi = f32x4_sub(a5hi, chi);
+                a6lo = f32x4_sub(a6lo, clo); a6hi = f32x4_sub(a6hi, chi);
+                a7lo = f32x4_sub(a7lo, clo); a7hi = f32x4_sub(a7hi, chi);
+            }
+            FusedKerSpec::QScale(shift, rp, mult) => {
+                let scaler = Scaler::from_fuse_params(shift, rp, mult);
+                let s = f32x4_splat(scaler.scale);
+                a0lo = f32x4_mul(s, a0lo); a0hi = f32x4_mul(s, a0hi);
+                a1lo = f32x4_mul(s, a1lo); a1hi = f32x4_mul(s, a1hi);
+                a2lo = f32x4_mul(s, a2lo); a2hi = f32x4_mul(s, a2hi);
+                a3lo = f32x4_mul(s, a3lo); a3hi = f32x4_mul(s, a3hi);
+                a4lo = f32x4_mul(s, a4lo); a4hi = f32x4_mul(s, a4hi);
+                a5lo = f32x4_mul(s, a5lo); a5hi = f32x4_mul(s, a5hi);
+                a6lo = f32x4_mul(s, a6lo); a6hi = f32x4_mul(s, a6hi);
+                a7lo = f32x4_mul(s, a7lo); a7hi = f32x4_mul(s, a7hi);
+            }
+            FusedKerSpec::RoundingShiftRight(shift, _rp) => {
+                let s = f32x4_splat(2f32.powi(-(shift as i32)));
+                a0lo = f32x4_mul(s, a0lo); a0hi = f32x4_mul(s, a0hi);
+                a1lo = f32x4_mul(s, a1lo); a1hi = f32x4_mul(s, a1hi);
+                a2lo = f32x4_mul(s, a2lo); a2hi = f32x4_mul(s, a2hi);
+                a3lo = f32x4_mul(s, a3lo); a3hi = f32x4_mul(s, a3hi);
+                a4lo = f32x4_mul(s, a4lo); a4hi = f32x4_mul(s, a4hi);
+                a5lo = f32x4_mul(s, a5lo); a5hi = f32x4_mul(s, a5hi);
+                a6lo = f32x4_mul(s, a6lo); a6hi = f32x4_mul(s, a6hi);
+                a7lo = f32x4_mul(s, a7lo); a7hi = f32x4_mul(s, a7hi);
+            }
+            FusedKerSpec::ShiftLeft(shift) => {
+                let s = f32x4_splat(2f32.powi(shift as i32));
+                a0lo = f32x4_mul(s, a0lo); a0hi = f32x4_mul(s, a0hi);
+                a1lo = f32x4_mul(s, a1lo); a1hi = f32x4_mul(s, a1hi);
+                a2lo = f32x4_mul(s, a2lo); a2hi = f32x4_mul(s, a2hi);
+                a3lo = f32x4_mul(s, a3lo); a3hi = f32x4_mul(s, a3hi);
+                a4lo = f32x4_mul(s, a4lo); a4hi = f32x4_mul(s, a4hi);
+                a5lo = f32x4_mul(s, a5lo); a5hi = f32x4_mul(s, a5hi);
+                a6lo = f32x4_mul(s, a6lo); a6hi = f32x4_mul(s, a6hi);
+                a7lo = f32x4_mul(s, a7lo); a7hi = f32x4_mul(s, a7hi);
+            }
+            FusedKerSpec::AddUnicast(tile) => {
+                // 8 rows × 8 cols, each row laid out per col_byte_stride
+                let mut ptr: *const u8 = tile.ptr;
+                for ab_pair in [(&mut a0lo, &mut a0hi), (&mut a1lo, &mut a1hi),
+                                (&mut a2lo, &mut a2hi), (&mut a3lo, &mut a3hi),
+                                (&mut a4lo, &mut a4hi), (&mut a5lo, &mut a5hi),
+                                (&mut a6lo, &mut a6hi), (&mut a7lo, &mut a7hi)].iter_mut() {
+                    let m0 = *(ptr as *const f32);
+                    let m1 = *(ptr.offset(tile.col_byte_stride) as *const f32);
+                    let m2 = *(ptr.offset(tile.col_byte_stride * 2) as *const f32);
+                    let m3 = *(ptr.offset(tile.col_byte_stride * 3) as *const f32);
+                    let m4 = *(ptr.offset(tile.col_byte_stride * 4) as *const f32);
+                    let m5 = *(ptr.offset(tile.col_byte_stride * 5) as *const f32);
+                    let m6 = *(ptr.offset(tile.col_byte_stride * 6) as *const f32);
+                    let m7 = *(ptr.offset(tile.col_byte_stride * 7) as *const f32);
+                    let (lo, hi) = ab_pair;
+                    **lo = f32x4_add(**lo, f32x4(m0, m1, m2, m3));
+                    **hi = f32x4_add(**hi, f32x4(m4, m5, m6, m7));
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                }
+            }
+            FusedKerSpec::AddRowColProducts(rows, cols) => {
+                let p = cols as *const v128;
+                let clo = v128_load(p); let chi = v128_load(p.add(1));
+                let r0 = f32x4_splat(*rows.add(0)); a0lo = f32x4_add(a0lo, f32x4_mul(r0, clo)); a0hi = f32x4_add(a0hi, f32x4_mul(r0, chi));
+                let r1 = f32x4_splat(*rows.add(1)); a1lo = f32x4_add(a1lo, f32x4_mul(r1, clo)); a1hi = f32x4_add(a1hi, f32x4_mul(r1, chi));
+                let r2 = f32x4_splat(*rows.add(2)); a2lo = f32x4_add(a2lo, f32x4_mul(r2, clo)); a2hi = f32x4_add(a2hi, f32x4_mul(r2, chi));
+                let r3 = f32x4_splat(*rows.add(3)); a3lo = f32x4_add(a3lo, f32x4_mul(r3, clo)); a3hi = f32x4_add(a3hi, f32x4_mul(r3, chi));
+                let r4 = f32x4_splat(*rows.add(4)); a4lo = f32x4_add(a4lo, f32x4_mul(r4, clo)); a4hi = f32x4_add(a4hi, f32x4_mul(r4, chi));
+                let r5 = f32x4_splat(*rows.add(5)); a5lo = f32x4_add(a5lo, f32x4_mul(r5, clo)); a5hi = f32x4_add(a5hi, f32x4_mul(r5, chi));
+                let r6 = f32x4_splat(*rows.add(6)); a6lo = f32x4_add(a6lo, f32x4_mul(r6, clo)); a6hi = f32x4_add(a6hi, f32x4_mul(r6, chi));
+                let r7 = f32x4_splat(*rows.add(7)); a7lo = f32x4_add(a7lo, f32x4_mul(r7, clo)); a7hi = f32x4_add(a7hi, f32x4_mul(r7, chi));
+            }
+            FusedKerSpec::Store(tile) => {
+                // 8 rows × 8 cols stores
+                let mut ptr: *mut u8 = tile.ptr;
+                for (lo, hi) in [(a0lo, a0hi), (a1lo, a1hi), (a2lo, a2hi), (a3lo, a3hi),
+                                 (a4lo, a4hi), (a5lo, a5hi), (a6lo, a6hi), (a7lo, a7hi)].iter() {
+                    *(ptr as *mut f32) = f32x4_extract_lane::<0>(*lo);
+                    *(ptr.offset(tile.col_byte_stride) as *mut f32) = f32x4_extract_lane::<1>(*lo);
+                    *(ptr.offset(tile.col_byte_stride * 2) as *mut f32) = f32x4_extract_lane::<2>(*lo);
+                    *(ptr.offset(tile.col_byte_stride * 3) as *mut f32) = f32x4_extract_lane::<3>(*lo);
+                    *(ptr.offset(tile.col_byte_stride * 4) as *mut f32) = f32x4_extract_lane::<0>(*hi);
+                    *(ptr.offset(tile.col_byte_stride * 5) as *mut f32) = f32x4_extract_lane::<1>(*hi);
+                    *(ptr.offset(tile.col_byte_stride * 6) as *mut f32) = f32x4_extract_lane::<2>(*hi);
+                    *(ptr.offset(tile.col_byte_stride * 7) as *mut f32) = f32x4_extract_lane::<3>(*hi);
+                    ptr = ptr.add(tile.row_byte_stride as usize);
+                }
+            }
+            FusedKerSpec::AddMatMul { k, pa, pb, packing: _ } => {
+                // A: packed [k][MR=8] = each k iter loads 8 row values
+                // B: packed [k][NR=8] = each k iter loads 8 col values as 2 v128
+                let a = pa as *const f32;
+                let b = pb as *const v128;
+                for i in 0..k {
+                    let arow = std::slice::from_raw_parts(a.offset(8 * i as isize), 8);
+                    let blo = v128_load(b.offset((2 * i) as isize));
+                    let bhi = v128_load(b.offset((2 * i + 1) as isize));
+                    let s = f32x4_splat(arow[0]); a0lo = f32x4_add(a0lo, f32x4_mul(s, blo)); a0hi = f32x4_add(a0hi, f32x4_mul(s, bhi));
+                    let s = f32x4_splat(arow[1]); a1lo = f32x4_add(a1lo, f32x4_mul(s, blo)); a1hi = f32x4_add(a1hi, f32x4_mul(s, bhi));
+                    let s = f32x4_splat(arow[2]); a2lo = f32x4_add(a2lo, f32x4_mul(s, blo)); a2hi = f32x4_add(a2hi, f32x4_mul(s, bhi));
+                    let s = f32x4_splat(arow[3]); a3lo = f32x4_add(a3lo, f32x4_mul(s, blo)); a3hi = f32x4_add(a3hi, f32x4_mul(s, bhi));
+                    let s = f32x4_splat(arow[4]); a4lo = f32x4_add(a4lo, f32x4_mul(s, blo)); a4hi = f32x4_add(a4hi, f32x4_mul(s, bhi));
+                    let s = f32x4_splat(arow[5]); a5lo = f32x4_add(a5lo, f32x4_mul(s, blo)); a5hi = f32x4_add(a5hi, f32x4_mul(s, bhi));
+                    let s = f32x4_splat(arow[6]); a6lo = f32x4_add(a6lo, f32x4_mul(s, blo)); a6hi = f32x4_add(a6hi, f32x4_mul(s, bhi));
+                    let s = f32x4_splat(arow[7]); a7lo = f32x4_add(a7lo, f32x4_mul(s, blo)); a7hi = f32x4_add(a7hi, f32x4_mul(s, bhi));
+                }
+            }
+        }
+        pnl = pnl.add(1);
+    }
+    0
+}
+
+MMMRustKernel!(kernel_f32_8x8 => wasm_f32_8x8<f32>(8,8)@(8,8) quality(ImplementationQuality::TargetOptimized));
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma.rs
new file mode 100644
index 000000000..271aaeeaf
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma.rs
@@ -0,0 +1,49 @@
+use crate::frame::element_wise::ElementWiseKer;
+use crate::frame::reduce::{MapReduceKer, ReduceKer};
+use crate::x86_64_fma::softmax::x86_64_fma_softmax2_fastcompact_f32_32n;
+use crate::Ops;
+
+pub mod mmm;
+
+pub mod by_scalar;
+mod intel;
+pub mod max;
+pub mod panel_extract;
+pub mod softmax;
+
+const AVX2: fn() -> bool = || is_x86_feature_detected!("avx2");
+const FMA: fn() -> bool = || is_x86_feature_detected!("fma");
+const AVX512F: fn() -> bool = || is_x86_feature_detected!("avx512f");
+
+tanh_impl!(f32, fma_tanh_f32, 8, 8, is_x86_feature_detected!("fma"));
+sigmoid_impl!(f32, fma_sigmoid_f32, 8, 8, is_x86_feature_detected!("fma"));
+
+fn plug_avx2(_ops: &mut Ops) {}
+
+fn plug_fma(ops: &mut Ops) {
+    panel_extract::plug(ops);
+
+    ops.sigmoid_f32 = Box::new(|| fma_sigmoid_f32::ew());
+    ops.tanh_f32 = Box::new(|| fma_tanh_f32::ew());
+
+    ops.mul_by_scalar_f32 = Box::new(|| by_scalar::x86_64_avx_f32_mul_by_scalar_32n::ew());
+    ops.max_f32 = Box::new(|| max::x86_64_fma_max_f32_32n::red());
+    ops.softmax2_fastcompact_f32 = Box::new(|| x86_64_fma_softmax2_fastcompact_f32_32n::red());
+
+    log::info!("sigmoid_f32, tanh_f32: x86_64/fma activated");
+}
+
+fn plug_avx512f(_ops: &mut Ops) {}
+
+pub fn plug(ops: &mut Ops) {
+    mmm::plug(ops);
+    if is_x86_feature_detected!("avx2") {
+        plug_avx2(ops);
+        if is_x86_feature_detected!("fma") {
+            plug_fma(ops);
+            if is_x86_feature_detected!("avx512f") {
+                plug_avx512f(ops);
+            }
+        }
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/by_scalar.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/by_scalar.rs
new file mode 100644
index 000000000..dacef2425
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/by_scalar.rs
@@ -0,0 +1,56 @@
+ew_impl_wrap!(
+    f32,
+    x86_64_avx_f32_mul_by_scalar_32n,
+    32,
+    8,
+    f32,
+    fn run(x: &mut [f32], s: f32) {
+        debug_assert!(x.len() % Self::nr() == 0);
+        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
+        unsafe { x86_64_avx_f32_mul_by_scalar_32n_run(x, s) }
+    }
+);
+
+#[target_feature(enable = "avx")]
+unsafe fn x86_64_avx_f32_mul_by_scalar_32n_run(buf: &mut [f32], scalar: f32) {
+    unsafe {
+        let len = buf.len();
+        let ptr = buf.as_ptr();
+        std::arch::asm!("
+            vbroadcastss ymm0, xmm0
+            2:
+                vmovaps ymm4, [{ptr}]
+                vmovaps ymm5, [{ptr} + 32]
+                vmovaps ymm6, [{ptr} + 64]
+                vmovaps ymm7, [{ptr} + 96]
+                vmulps ymm4, ymm4, ymm0
+                vmulps ymm5, ymm5, ymm0
+                vmulps ymm6, ymm6, ymm0
+                vmulps ymm7, ymm7, ymm0
+                vmovaps [{ptr}], ymm4
+                vmovaps [{ptr} + 32], ymm5
+                vmovaps [{ptr} + 64], ymm6
+                vmovaps [{ptr} + 96], ymm7
+                add {ptr}, 128
+                sub {len}, 32
+                jnz 2b
+            ",
+        len = inout(reg) len => _,
+        ptr = inout(reg) ptr => _,
+        in("xmm0") scalar,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _
+        );
+    }
+}
+
+#[cfg(test)]
+#[macro_use]
+pub mod test_x86_64_avx_f32_mul_by_scalar_32n {
+    use super::*;
+    by_scalar_frame_tests!(
+        is_x86_feature_detected!("avx2"),
+        f32,
+        x86_64_avx_f32_mul_by_scalar_32n,
+        |a, b| a * b
+    );
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/intel.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/intel.rs
new file mode 100644
index 000000000..277fb6986
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/intel.rs
@@ -0,0 +1,5 @@
+use crate::frame::mmm::cost_model::CostModel;
+#[allow(dead_code)]
+pub fn models() -> Vec<(&'static str, CostModel<'static>)> {
+vec!(
+)}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/max.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/max.rs
new file mode 100644
index 000000000..cea571047
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/max.rs
@@ -0,0 +1,67 @@
+reduce_impl_wrap!(
+    f32,
+    x86_64_fma_max_f32_32n,
+    32,
+    8,
+    (),
+    f32::MIN,
+    #[inline(never)]
+    fn run(buf: &[f32], _: ()) -> f32 {
+        assert!(buf.len() % 32 == 0);
+        assert!(buf.len() > 0);
+        unsafe { x86_64_fma_max_f32_32n_run(buf) }
+    },
+    #[inline(never)]
+    fn reduce_two(a: f32, b: f32) -> f32 {
+        a.max(b)
+    }
+);
+
+#[target_feature(enable = "avx")]
+unsafe fn x86_64_fma_max_f32_32n_run(buf: &[f32]) -> f32 {
+    unsafe {
+        let len = buf.len();
+        let ptr = buf.as_ptr();
+        let mut acc = f32::MIN;
+        std::arch::asm!("
+            vbroadcastss ymm0, xmm0
+            vmovaps ymm1, ymm0
+            vmovaps ymm2, ymm0
+            vmovaps ymm3, ymm0
+            2:
+                vmovaps ymm4, [{ptr}]
+                vmovaps ymm5, [{ptr} + 32]
+                vmovaps ymm6, [{ptr} + 64]
+                vmovaps ymm7, [{ptr} + 96]
+                vmaxps ymm0, ymm0, ymm4
+                vmaxps ymm1, ymm1, ymm5
+                vmaxps ymm2, ymm2, ymm6
+                vmaxps ymm3, ymm3, ymm7
+                add {ptr}, 128
+                sub {len}, 32
+                jnz 2b
+            vmaxps ymm0, ymm0, ymm1
+            vmaxps ymm2, ymm2, ymm3
+            vmaxps ymm0, ymm0, ymm2
+            vperm2f128 ymm1, ymm0, ymm0, 1      // copy second half (4xf32) of ymm0 to ymm1
+            vmaxps xmm0, xmm0, xmm1             // xmm0 contains 4 values to max
+            vpermilps xmm1, xmm0, 2 + (3 << 2)  // second 2x32 bit half moved to top
+            vmaxps xmm0, xmm0, xmm1             // xmm0 containes 2 values
+            vpermilps xmm1, xmm0, 1             // second f32 to top
+            vmaxps xmm0, xmm0, xmm1
+            ",
+        len = inout(reg) len => _,
+        ptr = inout(reg) ptr => _,
+        inout("ymm0") acc,
+        out("ymm1") _, out("ymm2") _, out("ymm3") _,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _
+        );
+        acc
+    }
+}
+
+#[cfg(test)]
+mod test_x86_64_fma_max_f32_32n {
+    use super::*;
+    crate::max_frame_tests!(is_x86_feature_detected!("avx2"), f32, x86_64_fma_max_f32_32n);
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/mmm.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/mmm.rs
new file mode 100644
index 000000000..2bf936304
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/mmm.rs
@@ -0,0 +1,172 @@
+use crate::block_quant::*;
+use crate::mmm::ImplementationQuality::ManuallyOptimized;
+use crate::pack::PackedFormat;
+use crate::Ops;
+use tract_data::internal::*;
+use DatumType::*;
+
+use super::*;
+
+MMMExternKernel!(fma_mmm_f32_8x8 <f32>(8, 8)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_16x6<f32>(16,6)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_16x5<f32>(16,5)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_24x4<f32>(24,4)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_40x2<f32>(40,2)@(256,4) where(FMA) quality(ManuallyOptimized));
+MMMExternKernel!(fma_mmm_f32_64x1<f32>(64,1)@(256,4) where(FMA) quality(ManuallyOptimized));
+
+pub fn pq40_r32() -> PackedBlockQuantFormat {
+    PackedBlockQuantFormat::new(&Q4_0, 32, 16, false)
+}
+MMMExternKernel! {fma_mmm_f32_32x1<f32>(32,1)@(256,4) where(FMA)
+    packing[1] = q40f32 => |k| k.with_packing_a(pq40_r32());
+    packing[2] = q40f16 => |k| k.with_packing(pq40_r32(), PackedFormat::new(F16, 1, 2));
+    packing[3] = f16f16 => |k| k.with_packing(PackedFormat::new(F16, 32, 32), PackedFormat::new(F16, 1, 2));
+    quality(ManuallyOptimized)
+    store(f16)
+}
+MMMExternKernel!(fma_mmm_f32_32x3<f32>(32,3)@(256,4) where(FMA)
+ packing[1] = f32f16 => |k| k.with_packing(f32::packing(32).align(256), f16::packing(3));
+ quality(ManuallyOptimized)
+ store(f16)
+);
+
+MMMExternKernel!(avx512_mmm_f32_128x1<f32>(128, 1)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_16x1 <f32>( 16, 1)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_16x12<f32>( 16,12)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_16x8 <f32>( 16, 8)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_32x6 <f32>( 32, 6)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_32x5 <f32>( 32, 5)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_48x4 <f32>( 48, 4)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_64x3 <f32>( 64, 3)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+MMMExternKernel!(avx512_mmm_f32_80x2 <f32>( 80, 2)@(512,4) where (AVX512F) quality(ManuallyOptimized));
+
+MMMExternKernel! { avx2_mmm_i32_8x8<i32>(8,8)@(256,4) where(AVX2)
+    packing[1] = i8i8 => |k| k.with_packing(PackedFormat::new(DatumType::I8, 8, 256), PackedFormat::new(DatumType::I8, 8, 4));
+    quality(ManuallyOptimized)
+    store(i8)
+}
+
+pub fn plug(ops: &mut Ops) {
+    if is_x86_feature_detected!("avx2") {
+        plug_avx2(ops);
+        if is_x86_feature_detected!("fma") {
+            plug_fma(ops);
+            if is_x86_feature_detected!("avx512f") {
+                plug_avx512f(ops);
+            }
+        }
+    }
+}
+
+pub fn plug_avx2(ops: &mut Ops) {
+    ops.mmm_impls.push(mmm::avx2_mmm_i32_8x8.mmm());
+    ops.qmmm_i32 = Box::new(|_, _, _| mmm::avx2_mmm_i32_8x8.mmm());
+    log::info!("qmmm_i32: x86_64/avx2 activated");
+}
+
+pub fn plug_fma(ops: &mut Ops) {
+    ops.mmm_impls.extend([
+        fma_mmm_f32_8x8.mmm(),
+        fma_mmm_f32_16x5.mmm(),
+        fma_mmm_f32_16x6.mmm(),
+        fma_mmm_f32_24x4.mmm(),
+        fma_mmm_f32_32x3.mmm(),
+        fma_mmm_f32_40x2.mmm(),
+        fma_mmm_f32_64x1.mmm(),
+    ]);
+
+    ops.mmv_f32 = Box::new(|_, _| fma_mmm_f32_64x1.mmm());
+
+    ops.mmm_f32 = Box::new(|_, _, n| {
+        if n.is_none() {
+            return fma_mmm_f32_16x6.mmm();
+        }
+
+        let n = n.unwrap();
+
+        match n {
+            1 => unreachable!("should've been mmv"),
+            2 => return fma_mmm_f32_40x2.mmm(),
+            3 => return fma_mmm_f32_32x3.mmm(),
+            4 => return fma_mmm_f32_24x4.mmm(),
+            5 => return fma_mmm_f32_16x5.mmm(),
+            6 => return fma_mmm_f32_16x6.mmm(),
+            8 => return fma_mmm_f32_8x8.mmm(),
+            _ => {}
+        };
+
+        let scaling_baseline = 60.0;
+        let kernel_normalized_perf = [
+            44.0 / scaling_baseline, // 8x8
+            54.0 / scaling_baseline, // 2x6
+            54.0 / scaling_baseline, // 2x5
+            54.0 / scaling_baseline, // 3x4
+            54.0 / scaling_baseline, // 4x3
+            54.0 / scaling_baseline, // 5x2
+        ];
+
+        fn compute_efficiency(n: usize, kernel_width: usize, scale: f32) -> f32 {
+            let kernel_width = kernel_width as f32;
+            let n = n as f32;
+            let batch_count = (n / kernel_width).ceil();
+            let actual_count = batch_count * kernel_width;
+            let multi_batch_penalty = 1.0 - batch_count / 100.0;
+            n / actual_count * scale * multi_batch_penalty
+        }
+
+        let efficiencies = [
+            compute_efficiency(n, 8, kernel_normalized_perf[0]),
+            compute_efficiency(n, 6, kernel_normalized_perf[1]),
+            compute_efficiency(n, 5, kernel_normalized_perf[2]),
+            compute_efficiency(n, 4, kernel_normalized_perf[3]),
+            compute_efficiency(n, 3, kernel_normalized_perf[4]),
+            compute_efficiency(n, 2, kernel_normalized_perf[5]),
+        ];
+
+        let best_idx = efficiencies.iter().copied().enumerate().fold((0, 0.0), |max, val| {
+            if val.1 > max.1 {
+                val
+            } else {
+                max
+            }
+        });
+
+        match best_idx.0 {
+            0 => fma_mmm_f32_8x8.mmm(),
+            1 => fma_mmm_f32_16x6.mmm(),
+            2 => fma_mmm_f32_16x5.mmm(),
+            3 => fma_mmm_f32_24x4.mmm(),
+            4 => fma_mmm_f32_32x3.mmm(),
+            5 => fma_mmm_f32_40x2.mmm(),
+            _ => unreachable!("not a valid index"),
+        }
+    });
+    log::info!("mmm_f32, mmv_f32: x86_64/fma activated");
+
+    if is_x86_feature_detected!("f16c") {
+        ops.mmm_impls.push(mmm::fma_mmm_f32_32x1.mmm()); // q40f32 requires f16c
+        log::info!("found f16c, added fake-f16 and q40-able kernels");
+    }
+}
+
+pub fn plug_avx512f(ops: &mut Ops) {
+    ops.mmm_impls.push(avx512_mmm_f32_128x1.mmm());
+    ops.mmm_impls.push(avx512_mmm_f32_80x2.mmm());
+    ops.mmm_impls.push(avx512_mmm_f32_48x4.mmm());
+    ops.mmm_impls.push(avx512_mmm_f32_64x3.mmm());
+    ops.mmm_impls.push(avx512_mmm_f32_16x12.mmm());
+    ops.mmv_f32 = Box::new(|m, _k| match m {
+        Some(m) if m < 31 => avx512_mmm_f32_16x1.mmm(),
+        _ => avx512_mmm_f32_128x1.mmm(),
+    });
+
+    ops.mmm_f32 = Box::new(|m, _, n| match (m, n) {
+        (_, Some(1)) => unreachable!("should've been mmv"),
+        (_, Some(2)) => avx512_mmm_f32_80x2.mmm(),
+        (Some(m), _) if m <= 16 => mmm::avx512_mmm_f32_16x12.mmm(),
+        (_, Some(n)) if n % 4 == 0 && n % 3 != 0 && n < 32 => avx512_mmm_f32_48x4.mmm(),
+        (_, Some(n)) if n < 32 => avx512_mmm_f32_64x3.mmm(),
+        _ => avx512_mmm_f32_16x12.mmm(),
+    });
+    log::info!("mmm_f32, mmv_f32: x86_64/avx512f activated");
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/panel_extract.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/panel_extract.rs
new file mode 100644
index 000000000..3077ba0cf
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/panel_extract.rs
@@ -0,0 +1,136 @@
+use super::*;
+use crate::pack::{PackedFormat, Packing};
+use crate::Ops;
+use tract_data::internal::*;
+
+pub fn plug(ops: &mut Ops) {
+    ops.panel_extractors.extend([packed_32_q40_to_f32.clone(), packed_32_f16_to_f32.clone()]);
+}
+
+panel_extractor!(kernel_packed_32_q40_to_f32 as packed_32_q40_to_f32(
+    Box::new(super::mmm::pq40_r32()),
+    f32::packing(32).align(32)
+) where(AVX2));
+
+panel_extractor!(kernel_packed_32_f16_to_f32 as packed_32_f16_to_f32(
+    Box::new(PackedFormat::new(f16::datum_type(), 32, 32)),
+    f32::packing(32).align(32)
+) where(AVX2));
+
+#[target_feature(enable = "avx2")]
+unsafe fn kernel_packed_32_q40_to_f32(input: *const u8, output: *mut u8, k: usize) {
+    unsafe {
+        if k == 0 {
+            return;
+        }
+        debug_assert!(k % 32 == 0);
+        debug_assert!(output as usize % 32 == 0);
+        std::arch::asm!("
+    vbroadcastss    ymm14, dword ptr [{mask}]
+    vbroadcastss    ymm13, dword ptr [{eight}]
+
+    2:
+        vmovaps         xmm4, [{i}]
+        vmovaps         xmm5, [{i} + 16]
+        vmovaps         xmm6, [{i} + 32]
+        vmovaps         xmm7, [{i} + 48]
+        vcvtph2ps       ymm4, xmm4
+        vcvtph2ps       ymm5, xmm5
+        vcvtph2ps       ymm6, xmm6
+        vcvtph2ps       ymm7, xmm7
+        add             {i}, 64
+
+        mov {k2}, 32
+    3:
+        vmovaps         xmm8, [{i}]            // 32 nibbles
+        vpand           xmm10, xmm8, xmm14     // 16 bytes
+        vpmovzxbd       ymm9, xmm10            // 8 u32
+
+        vpermilpd       xmm10, xmm10, 1        // swap 64bit halves
+        vpmovzxbd       ymm10, xmm10           // 8 u32
+
+        vpsrlw          xmm8, xmm8, 4
+        vpand           xmm12, xmm8, xmm14      // 16 bytes
+        vpmovzxbd       ymm11, xmm12            // 8 u32
+        vpermilpd       xmm12, xmm12, 1         // swap 64bit halves
+        vpmovzxbd       ymm12, xmm12            // 8 u32
+
+        vpsubd          ymm9, ymm9, ymm13
+        vpsubd          ymm10, ymm10, ymm13
+        vpsubd          ymm11, ymm11, ymm13
+        vpsubd          ymm12, ymm12, ymm13
+
+        vcvtdq2ps       ymm9, ymm9
+        vcvtdq2ps       ymm10, ymm10
+        vcvtdq2ps       ymm11, ymm11
+        vcvtdq2ps       ymm12, ymm12
+
+        vmulps          ymm9, ymm9, ymm4
+        vmulps          ymm10, ymm10, ymm5
+        vmulps          ymm11, ymm11, ymm6
+        vmulps          ymm12, ymm12, ymm7
+
+        vmovaps         [{o}], ymm9
+        vmovaps         [{o}+32], ymm10
+        vmovaps         [{o}+64], ymm11
+        vmovaps         [{o}+96], ymm12
+
+        add             {i}, 16
+        add             {o}, 128
+        sub             {k2}, 1
+        jnz             3b
+
+        sub {k}, 32
+        jnz 2b;
+            ",
+        mask = in(reg) &0x0F0F0F0F,
+        eight = in(reg) &0x08,
+        k = inout(reg) k => _,
+        k2 = out(reg) _,
+        i = inout(reg) input => _,
+        o = inout(reg) output => _,
+        out("ymm0") _, out("ymm1") _, out("ymm2") _, out("ymm3") _,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _,
+        out("ymm8") _, out("ymm9") _, out("ymm10") _, out("ymm11") _,
+        out("ymm12") _, out("ymm13") _, out("ymm14") _, out("ymm15") _
+        );
+    }
+}
+
+#[target_feature(enable = "avx2")]
+unsafe fn kernel_packed_32_f16_to_f32(input: *const u8, output: *mut u8, k: usize) {
+    unsafe {
+        if k == 0 {
+            return;
+        }
+        debug_assert!(output as usize % 32 == 0);
+        std::arch::asm!("
+    2:
+        vmovaps         xmm4, [{i}]
+        vmovaps         xmm5, [{i} + 16]
+        vmovaps         xmm6, [{i} + 32]
+        vmovaps         xmm7, [{i} + 48]
+
+        vcvtph2ps       ymm4, xmm4
+        vcvtph2ps       ymm5, xmm5
+        vcvtph2ps       ymm6, xmm6
+        vcvtph2ps       ymm7, xmm7
+
+        vmovaps         [{o}], ymm4
+        vmovaps         [{o}+32], ymm5
+        vmovaps         [{o}+64], ymm6
+        vmovaps         [{o}+96], ymm7
+
+        add             {i}, 64
+        add             {o}, 128
+
+        sub {k}, 1
+        jnz 2b;
+            ",
+        k = inout(reg) k => _,
+        i = inout(reg) input => _,
+        o = inout(reg) output => _,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _,
+        );
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/src/x86_64_fma/softmax.rs b/vendor/tract-linalg-0.22.1/src/x86_64_fma/softmax.rs
new file mode 100644
index 000000000..ed63d3ca4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/src/x86_64_fma/softmax.rs
@@ -0,0 +1,121 @@
+map_reduce_impl_wrap!(
+    f32,
+    x86_64_fma_softmax2_fastcompact_f32_32n,
+    32,
+    8,
+    f32,
+    f32::MIN,
+    0f32,
+    #[inline(never)]
+    fn run(buf: &mut [f32], max: f32) -> f32 {
+        assert!(buf.len() % 32 == 0);
+        assert!(buf.len() > 0);
+        unsafe { x86_64_fma_softmax2_fastcompact_f32_32n_run(buf, max) }
+    },
+    #[inline(never)]
+    fn reduce_two(a: f32, b: f32) -> f32 {
+        a + b
+    }
+);
+
+#[target_feature(enable = "avx,fma")]
+unsafe fn x86_64_fma_softmax2_fastcompact_f32_32n_run(buf: &mut [f32], max: f32) -> f32 {
+    unsafe {
+        let len = buf.len();
+        let ptr = buf.as_ptr();
+        let mut acc = 0f32;
+        const MLN2: f32 = 0.6931471805f32;
+        const A: f32 = 8388608.0f32;
+        const B: f32 = 1065353216.0f32;
+        const C: f32 = 60801.0f32;
+        const SLOPE: f32 = A / MLN2;
+        const OFFSET: f32 = B - C;
+        std::arch::asm!("
+            vbroadcastss ymm0, xmm0
+            vmovaps ymm1, ymm0
+            vmovaps ymm2, ymm0
+            vmovaps ymm3, ymm0
+
+            vpxor   ymm12, ymm12, ymm12
+            vbroadcastss ymm13, xmm13
+            vbroadcastss ymm14, xmm14
+            vbroadcastss ymm15, xmm15
+            2:
+                vmovaps ymm4, [{ptr}]
+                vmovaps ymm5, [{ptr} + 32]
+                vmovaps ymm6, [{ptr} + 64]
+                vmovaps ymm7, [{ptr} + 96]
+
+                vsubps ymm4, ymm4, ymm13
+                vsubps ymm5, ymm5, ymm13
+                vsubps ymm6, ymm6, ymm13
+                vsubps ymm7, ymm7, ymm13
+
+                vmovaps ymm8, ymm15
+                vmovaps ymm9, ymm15
+                vmovaps ymm10, ymm15
+                vmovaps ymm11, ymm15
+
+                vfmadd231ps ymm8, ymm4, ymm14
+                vfmadd231ps ymm9, ymm5, ymm14
+                vfmadd231ps ymm10, ymm6, ymm14
+                vfmadd231ps ymm11, ymm7, ymm14
+
+                vmaxps ymm8, ymm8, ymm12
+                vmaxps ymm9, ymm9, ymm12
+                vmaxps ymm10, ymm10, ymm12
+                vmaxps ymm11, ymm11, ymm12
+
+                vcvttps2dq ymm8, ymm8
+                vcvttps2dq ymm9, ymm9
+                vcvttps2dq ymm10, ymm10
+                vcvttps2dq ymm11, ymm11
+
+                vmovaps [{ptr}]     , ymm8
+                vmovaps [{ptr} + 32], ymm9
+                vmovaps [{ptr} + 64], ymm10
+                vmovaps [{ptr} + 96], ymm11
+
+                vaddps ymm0, ymm0, ymm8
+                vaddps ymm1, ymm1, ymm9
+                vaddps ymm2, ymm2, ymm10
+                vaddps ymm3, ymm3, ymm11
+
+                add {ptr}, 128
+                sub {len}, 32
+                jnz 2b
+
+            vaddps ymm0, ymm0, ymm1
+            vaddps ymm2, ymm2, ymm3
+            vaddps ymm0, ymm0, ymm2
+            vperm2f128 ymm1, ymm0, ymm0, 1
+            vaddps xmm0, xmm0, xmm1
+            vpermilps xmm1, xmm0, 2 + (3 << 2)
+            vaddps xmm0, xmm0, xmm1
+            vpermilps xmm1, xmm0, 1
+            vaddps xmm0, xmm0, xmm1
+            ",
+        len = inout(reg) len => _,
+        ptr = inout(reg) ptr => _,
+        inout("ymm0") acc,
+        out("ymm1") _, out("ymm2") _, out("ymm3") _,
+        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _,
+        out("ymm8") _, out("ymm9") _, out("ymm10") _, out("ymm11") _,
+        out("ymm12") _,
+        inout("ymm13") max => _,
+        inout("ymm14") SLOPE => _,
+        inout("ymm15") OFFSET => _,
+        );
+        acc
+    }
+}
+
+#[cfg(test)]
+mod test_x86_64_fma_softmax2_fastcompact_f32_32n {
+    use super::*;
+    crate::softmax_l2_frame_tests!(
+        is_x86_feature_detected!("fma"),
+        f32,
+        x86_64_fma_softmax2_fastcompact_f32_32n
+    );
+}
diff --git a/vendor/tract-linalg-0.22.1/tests/virtual_im2col.rs b/vendor/tract-linalg-0.22.1/tests/virtual_im2col.rs
new file mode 100644
index 000000000..095ce1844
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/tests/virtual_im2col.rs
@@ -0,0 +1,545 @@
+use std::alloc::Layout;
+use std::fmt::Display;
+
+use proptest::arbitrary::Arbitrary;
+use proptest::prelude::*;
+use proptest::strategy::{BoxedStrategy, Strategy};
+use tract_data::internal::*;
+use tract_linalg::mmm::FusedSpec;
+use tract_linalg::mmm::{AsInputValue, EagerPackedInput, MMMInputFormat, MMMInputValue};
+use tract_linalg::pack::{PackedFormat, PackingWriter};
+use tract_linalg::WeightType;
+use DatumType::F32;
+
+proptest::proptest! {
+    #[test]
+    fn prop(pb in any::<ConvProblem>()) {
+        pb.check()
+    }
+}
+
+#[test]
+fn test1() {
+    ConvProblem {
+        lazy_im2col: false,
+        input: tensor3(&[[[1f32]]]),
+        filters: tensor4(&[[[[-1f32]]]]),
+    }
+    .check()
+}
+
+#[test]
+fn test_axes_0() {
+    // CHW HWIO CHW
+    // 121 1112 221
+    ConvProblem {
+        lazy_im2col: false,
+        input: tensor3(&[[[0f32], [-1.0]]]),
+        filters: tensor4(&[[[[0f32, -1f32]]]]),
+    }
+    .check()
+}
+
+#[test]
+fn test_axes_1() {
+    ConvProblem {
+        lazy_im2col: false,
+        input: tensor3(&[[[0f32, 1.]]]),
+        filters: tensor4(&[[[[1f32]]]]),
+    }
+    .check()
+}
+
+#[test]
+fn test_lazy_0() {
+    ConvProblem { lazy_im2col: true, input: tensor3(&[[[1f32]]]), filters: tensor4(&[[[[1f32]]]]) }
+        .check()
+}
+
+#[test]
+fn test_lazy_1() {
+    ConvProblem {
+        lazy_im2col: true,
+        input: tensor3(&[[[0f32], [0.], [0.]]]),
+        filters: tensor4(&[[[[0f32]]]]),
+    }
+    .check()
+}
+
+#[test]
+fn test_lazy_2() {
+    ConvProblem {
+        lazy_im2col: true,
+        input: tensor3(&[[[0f32, 0.], [0., 1.]]]),
+        filters: tensor4(&[[[[0f32]], [[1.]]]]),
+    }
+    .check()
+}
+
+#[test]
+fn test_lazy_3() {
+    // CHW HWIO CHW
+    // 212 1221 111
+    // im2col: k=4, n=1, k <- kh, kw, c
+    // 0 X X X X kh=0, kw=0, c=0
+    // 1 X X X X kh=0, kw=0, c=1
+    // 0 X X X X kh=0, kw=1, c=0
+    // 0 X X X X kh=0, kw=1, c=1
+    ConvProblem {
+        lazy_im2col: true,
+        input: tensor3(&[[[0f32, 0.]], [[1., 0.]]]),
+        filters: tensor4(&[[[[0f32], [0.]], [[1.], [0.]]]]),
+    }
+    .check()
+}
+
+#[test]
+fn test_eager_asan_0() {
+    ConvProblem {
+        lazy_im2col: false,
+        input: tensor(vec![3, 3, 5]),
+        filters: tensor(vec![3, 3, 3, 1]),
+    }
+    .check()
+}
+
+// 2D valid, no group, no dil, no stride, HWIO, CHW
+#[derive(Clone, Debug)]
+pub struct ConvProblem {
+    pub lazy_im2col: bool,
+    pub input: Tensor,
+    pub filters: Tensor,
+}
+
+fn mknhw(filters: &[usize], input: &[usize]) -> (usize, usize, usize, usize, usize) {
+    let m = filters[3];
+    let k = filters[0..3].iter().product::<usize>();
+    let h = input[1] - filters[0] + 1;
+    let w = input[2] - filters[1] + 1;
+    let n = h * w;
+    (m, k, n, h, w)
+}
+
+impl ConvProblem {
+    fn reference(&self) -> Tensor {
+        let (m, _, _, h, w) = mknhw(self.filters.shape(), self.input.shape());
+        let output_shape = [m, h, w];
+        let mut output = Tensor::zero::<f32>(&output_shape).unwrap();
+        let mut output_view = output.to_array_view_mut::<f32>().unwrap();
+        let input_view = self.input.to_array_view::<f32>().unwrap();
+        let filters_view = self.filters.to_array_view::<f32>().unwrap();
+        for geo_out in tract_ndarray::indices(&output_shape[1..]) {
+            for ker_geo in tract_ndarray::indices(&self.filters.shape()[0..2]) {
+                for ci in 0..self.filters.shape()[2] {
+                    for co in 0..self.filters.shape()[3] {
+                        let output_coord = [co, geo_out[0], geo_out[1]];
+                        let input_coord = [ci, geo_out[0] + ker_geo[0], geo_out[1] + ker_geo[1]];
+                        let ker_coord = [ker_geo[0], ker_geo[1], ci, co];
+                        output_view[output_coord] +=
+                            filters_view[ker_coord] * input_view[input_coord];
+                    }
+                }
+            }
+        }
+        output
+    }
+
+    pub fn tract(&self) -> TractResult<Tensor> {
+        let (m, k, n, h, w) = mknhw(self.filters.shape(), self.input.shape());
+        let output_shape = [m, h, w];
+        let internal_output_shape = [m, h * w];
+        let mmm = tract_linalg::ops().mmm(F32, Some(m), Some(k), Some(n)).unwrap();
+        let output = Tensor::zero::<f32>(&internal_output_shape)?;
+        let reshaped_filters = self.filters.clone().into_shape(&[k, m])?;
+        let (a_pack, b_pack) = &mmm.packings()[0];
+        let a = a_pack.prepare_one(&reshaped_filters, 0, 1)?;
+        unsafe {
+            let im2col: Box<dyn MMMInputValue> = if self.lazy_im2col {
+                LazyIm2colSpec {
+                    full_kernel_shape: self.filters.shape().into(),
+                    packer: b_pack.downcast_ref::<PackedFormat>().unwrap().clone(),
+                }
+                .wrap(&self.input.view())
+            } else {
+                EagerIm2colSpec {
+                    full_kernel_shape: self.filters.shape().into(),
+                    packer: b_pack.downcast_ref::<PackedFormat>().unwrap().clone(),
+                }
+                .wrap(&self.input.view())
+            };
+            let c_store = mmm.c_view(Some(0), Some(1)).wrap(&output.view());
+            mmm.run(
+                m,
+                n,
+                &[
+                    FusedSpec::AddMatMul {
+                        a: AsInputValue::Owned(a),
+                        b: AsInputValue::Owned(im2col),
+                        packing: 0,
+                    },
+                    FusedSpec::Store(c_store),
+                ],
+            )
+            .unwrap()
+        }
+        output.into_shape(&output_shape)
+    }
+
+    fn check(&self) {
+        let expected = self.reference();
+        let found = self.tract().unwrap();
+        if found.close_enough(&expected, true).is_err() {
+            println!("found: ");
+            println!("{:?}", found.to_array_view::<f32>().unwrap());
+            println!("expected: ");
+            println!("{:?}", expected.to_array_view::<f32>().unwrap());
+        }
+        found.close_enough(&expected, true).unwrap()
+    }
+}
+
+impl Arbitrary for ConvProblem {
+    type Parameters = ();
+    type Strategy = BoxedStrategy<Self>;
+    fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy {
+        (any::<bool>(), 1..4usize, 1..4usize, 1..4usize, 1..4usize, 0..3usize, 0..3usize)
+            .prop_map(|(eager_im2col, h, w, i, o, extra_h, extra_w)| {
+                let filters = tensor(vec![h, w, i, o]);
+                let input = tensor(vec![i, h + extra_h, w + extra_w]);
+                ConvProblem { lazy_im2col: eager_im2col, filters, input }
+            })
+            .boxed()
+    }
+}
+
+fn tensor(shape: Vec<usize>) -> Tensor {
+    let mut tensor = Tensor::zero::<f32>(&shape).unwrap();
+    tensor.as_slice_mut::<f32>().unwrap().iter_mut().enumerate().for_each(|(ix, x)| *x = ix as f32);
+    tensor
+}
+
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+struct EagerIm2colSpec {
+    packer: PackedFormat,
+    full_kernel_shape: TVec<usize>,
+}
+
+impl EagerIm2colSpec {
+    fn wrap(&self, input: &TensorView) -> Box<dyn MMMInputValue> {
+        let (_, k, n, h, w) = mknhw(&self.full_kernel_shape, input.shape());
+        // let input = input.to_array_view::<f32>().unwrap();
+        let ci = input.shape()[0];
+        let kh = self.full_kernel_shape[0];
+        let kw = self.full_kernel_shape[1];
+        let im2col = tract_ndarray::Array5::<f32>::from_shape_fn(
+            [kh, kw, ci, h, w],
+            |(kh, kw, ci, h, w)| *input.at([ci, h + kh, w + kw]).unwrap(),
+        )
+        .into_shape_with_order([k, n])
+        .unwrap();
+        Box::new(EagerIm2col { im2col: im2col.into_tensor(), packer: self.packer.clone(), k })
+    }
+}
+
+impl Display for EagerIm2colSpec {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "EagerIm2colSpec")
+    }
+}
+
+impl MMMInputFormat for EagerIm2colSpec {
+    fn prepare_tensor(&self, _t: &Tensor, _k_axis: usize, _mn_axis: usize) -> TractResult<Tensor> {
+        todo!();
+    }
+
+    fn precursor(&self) -> WeightType {
+        WeightType::Plain(f32::datum_type())
+    }
+
+    fn k_alignment(&self) -> usize {
+        1
+    }
+
+    fn r(&self) -> usize {
+        self.packer.r()
+    }
+
+    fn same_as(&self, other: &dyn MMMInputFormat) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|other| other == self)
+    }
+
+    fn mem_size(&self, _k: TDim, _mn: TDim) -> TDim {
+        unimplemented!()
+    }
+
+    fn extract_at_mn_f16(
+        &self,
+        _data: &EagerPackedInput,
+        _mn: usize,
+        _slice: &mut [f16],
+    ) -> TractResult<()> {
+        todo!();
+    }
+
+    fn extract_at_mn_f32(
+        &self,
+        _data: &EagerPackedInput,
+        _mn: usize,
+        _slice: &mut [f32],
+    ) -> TractResult<()> {
+        todo!();
+    }
+
+    fn prepare_one(
+        &self,
+        _t: &Tensor,
+        _k_axis: usize,
+        _mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        todo!()
+    }
+}
+
+#[derive(Clone, Debug, Hash)]
+struct EagerIm2col {
+    packer: PackedFormat,
+    im2col: Tensor,
+    k: usize,
+}
+
+impl Display for EagerIm2col {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "eager")
+    }
+}
+
+impl MMMInputValue for EagerIm2col {
+    fn scratch_panel_buffer_layout(&self) -> Option<std::alloc::Layout> {
+        Some(
+            Layout::from_size_align(
+                self.packer.single_panel_len(self.k) * f32::datum_type().size_of(),
+                self.packer.alignment(),
+            )
+            .unwrap(),
+        )
+    }
+
+    fn panel_bytes(&self, i: usize, buffer: Option<*mut u8>) -> TractResult<*const u8> {
+        let buffer = buffer.unwrap();
+        let mn = self.im2col.shape()[1];
+        unsafe {
+            self.packer.pack_t::<f32>(
+                buffer as _,
+                self.im2col.as_ptr().unwrap(),
+                mn,
+                mn as isize,
+                1,
+                0..self.k,
+                (i * self.packer.r)..((i + 1) * self.packer.r),
+            );
+        }
+        Ok(buffer)
+    }
+
+    fn k(&self) -> usize {
+        self.k
+    }
+
+    fn mn(&self) -> usize {
+        self.im2col.shape()[1]
+    }
+
+    fn format(&self) -> &dyn tract_linalg::mmm::MMMInputFormat {
+        &self.packer
+    }
+
+    fn opaque_fact(&self) -> &dyn OpaqueFact {
+        unimplemented!()
+    }
+
+    fn same_as(&self, _other: &dyn MMMInputValue) -> bool {
+        unimplemented!()
+    }
+
+    fn extract_at_mn_f16(&self, _mn: usize, _slice: &mut [f16]) -> TractResult<()> {
+        unimplemented!()
+    }
+
+    fn extract_at_mn_f32(&self, _mn: usize, _slice: &mut [f32]) -> TractResult<()> {
+        unimplemented!()
+    }
+}
+
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+struct LazyIm2colSpec {
+    packer: PackedFormat,
+    full_kernel_shape: TVec<usize>,
+}
+
+impl LazyIm2colSpec {
+    fn wrap(&self, input: &TensorView) -> Box<dyn MMMInputValue> {
+        let (_, _, _, h, w) = mknhw(&self.full_kernel_shape, input.shape());
+        let kh = self.full_kernel_shape[0];
+        let kw = self.full_kernel_shape[1];
+        let ci = self.full_kernel_shape[2];
+        let input_strides = input.strides();
+        let k_offsets = (0..kh as isize)
+            .flat_map(|kh| {
+                (0..kw as isize).flat_map(move |kw| {
+                    (0..ci as isize).map(move |ci| {
+                        ci * input_strides[0] + kh * input_strides[1] + kw * input_strides[2]
+                    })
+                })
+            })
+            .collect();
+        let n_offsets = (0..h as isize)
+            .flat_map(|h| {
+                (0..w as isize).map(move |w| (h * input_strides[1] + w * input_strides[2]))
+            })
+            .collect();
+        unsafe {
+            Box::new(LazyIm2col {
+                spec: self.clone(),
+                image: input.as_ptr_unchecked(),
+                k_offsets,
+                n_offsets,
+                packer: self.packer.clone(),
+            })
+        }
+    }
+}
+
+impl Display for LazyIm2colSpec {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "LazyIm2colSpec")
+    }
+}
+
+impl MMMInputFormat for LazyIm2colSpec {
+    fn prepare_tensor(&self, _t: &Tensor, _k_axis: usize, _mn_axis: usize) -> TractResult<Tensor> {
+        todo!();
+    }
+    fn prepare_one(
+        &self,
+        _t: &Tensor,
+        _k_axis: usize,
+        _mn_axis: usize,
+    ) -> TractResult<Box<dyn MMMInputValue>> {
+        todo!();
+    }
+
+    fn precursor(&self) -> WeightType {
+        WeightType::Plain(f32::datum_type())
+    }
+
+    fn k_alignment(&self) -> usize {
+        1
+    }
+
+    fn r(&self) -> usize {
+        self.packer.r()
+    }
+
+    fn same_as(&self, other: &dyn MMMInputFormat) -> bool {
+        other.downcast_ref::<Self>().is_some_and(|other| other == self)
+    }
+
+    fn mem_size(&self, _k: TDim, _mn: TDim) -> TDim {
+        unimplemented!()
+    }
+
+    fn extract_at_mn_f16(
+        &self,
+        _data: &EagerPackedInput,
+        _mn: usize,
+        _slice: &mut [f16],
+    ) -> TractResult<()> {
+        todo!();
+    }
+
+    fn extract_at_mn_f32(
+        &self,
+        _data: &EagerPackedInput,
+        _mn: usize,
+        _slice: &mut [f32],
+    ) -> TractResult<()> {
+        todo!();
+    }
+}
+
+#[derive(Clone, Debug, Hash)]
+struct LazyIm2col {
+    spec: LazyIm2colSpec,
+    packer: PackedFormat,
+    image: *const f32,
+    n_offsets: Vec<isize>,
+    k_offsets: Vec<isize>,
+}
+unsafe impl Send for LazyIm2col {}
+unsafe impl Sync for LazyIm2col {}
+
+impl Display for LazyIm2col {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "lazy")
+    }
+}
+
+impl MMMInputValue for LazyIm2col {
+    fn scratch_panel_buffer_layout(&self) -> Option<std::alloc::Layout> {
+        Some(
+            Layout::from_size_align(
+                self.packer.single_panel_len(self.k_offsets.len() * f32::datum_type().size_of()),
+                self.packer.alignment(),
+            )
+            .unwrap(),
+        )
+    }
+
+    fn panel_bytes(&self, i: usize, buffer: Option<*mut u8>) -> TractResult<*const u8> {
+        let buffer = buffer.unwrap() as *mut f32;
+        let mn_end = ((i + 1) * self.packer.r).min(self.n_offsets.len());
+        let n_range = (i * self.packer.r)..mn_end;
+        let k = self.k_offsets.len();
+        unsafe {
+            let mut writer = self.packer.write_with_k_outer(buffer, k, n_range.len());
+            for k in 0..k {
+                for n in n_range.clone() {
+                    writer.write(
+                        *self.image.offset(
+                            self.n_offsets.get_unchecked(n) + self.k_offsets.get_unchecked(k),
+                        ),
+                    )
+                }
+            }
+        }
+        Ok(buffer as _)
+    }
+
+    fn k(&self) -> usize {
+        self.k_offsets.len()
+    }
+
+    fn mn(&self) -> usize {
+        self.n_offsets.len()
+    }
+
+    fn format(&self) -> &dyn MMMInputFormat {
+        &self.spec
+    }
+
+    fn opaque_fact(&self) -> &dyn OpaqueFact {
+        unimplemented!()
+    }
+
+    fn same_as(&self, _other: &dyn MMMInputValue) -> bool {
+        unimplemented!()
+    }
+
+    fn extract_at_mn_f16(&self, _mn: usize, _slice: &mut [f16]) -> TractResult<()> {
+        unimplemented!()
+    }
+
+    fn extract_at_mn_f32(&self, _mn: usize, _slice: &mut [f32]) -> TractResult<()> {
+        unimplemented!()
+    }
+}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..857f7821c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,59 @@
+	// Tile size: 10x1
+	// Accumulators: 0-9
+	// Col regs: 10-19
+	// Row regs: 20, 21
+
+	vbroadcastss    zmm20,  dword ptr [rcx]
+
+	vmovaps         zmm10, [rax + 0]
+	vmovaps         zmm11, [rax + 64]
+    vmovaps         zmm12, [rax + 128]
+	vmovaps         zmm13, [rax + 192]
+    vmovaps         zmm14, [rax + 256]
+
+    vfmadd231ps     zmm0, zmm10, zmm20
+    vfmadd231ps     zmm1, zmm11, zmm20
+    vfmadd231ps     zmm2, zmm12, zmm20
+    vfmadd231ps     zmm3, zmm13, zmm20
+    vfmadd231ps     zmm4, zmm14, zmm20
+
+	vmovaps         zmm15, [rax + 320]
+    vmovaps         zmm16, [rax + 384]
+	vmovaps         zmm17, [rax + 448]
+	vmovaps         zmm18, [rax + 512]
+	vmovaps         zmm19, [rax + 576]
+
+    vfmadd231ps     zmm5, zmm10, zmm20
+    vfmadd231ps     zmm6, zmm11, zmm20
+    vfmadd231ps     zmm7, zmm12, zmm20
+    vfmadd231ps     zmm8, zmm13, zmm20
+    vfmadd231ps     zmm9, zmm14, zmm20
+
+	vbroadcastss    zmm21,  dword ptr [rcx + 4]
+
+	vmovaps         zmm10, [rax + 640]
+	vmovaps         zmm11, [rax + 704]
+    vmovaps         zmm12, [rax + 768]
+	vmovaps         zmm13, [rax + 832]
+    vmovaps         zmm14, [rax + 896]
+
+	vfmadd231ps     zmm0, zmm10, zmm21
+    vfmadd231ps     zmm1, zmm11, zmm21
+    vfmadd231ps     zmm2, zmm12, zmm21
+    vfmadd231ps     zmm3, zmm13, zmm21
+    vfmadd231ps     zmm4, zmm14, zmm21
+
+	vmovaps         zmm15, [rax + 960]
+    vmovaps         zmm16, [rax + 1024]
+	vmovaps         zmm17, [rax + 1088]
+	vmovaps         zmm18, [rax + 1152]
+	vmovaps         zmm19, [rax + 1216]
+
+    vfmadd231ps     zmm5, zmm10, zmm21
+    vfmadd231ps     zmm6, zmm11, zmm21
+    vfmadd231ps     zmm7, zmm12, zmm21
+    vfmadd231ps     zmm8, zmm13, zmm21
+    vfmadd231ps     zmm9, zmm14, zmm21
+
+    add rcx, 8
+	add rax, 1280
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..76aaae5bf
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,33 @@
+	// Tile size: 10x1
+	// Accumulators: 0-9
+	// Col regs: 10-19
+	// Row regs: 20
+
+	vbroadcastss    zmm20,  dword ptr [rcx]
+
+	vmovaps         zmm10, [rax + 0]
+	vmovaps         zmm11, [rax + 64]
+    vmovaps         zmm12, [rax + 128]
+	vmovaps         zmm13, [rax + 192]
+    vmovaps         zmm14, [rax + 256]
+
+    vfmadd231ps     zmm0, zmm10, zmm20
+    vfmadd231ps     zmm1, zmm11, zmm20
+    vfmadd231ps     zmm2, zmm12, zmm20
+    vfmadd231ps     zmm3, zmm13, zmm20
+    vfmadd231ps     zmm4, zmm14, zmm20
+
+	vmovaps         zmm15, [rax + 320]
+    vmovaps         zmm16, [rax + 384]
+	vmovaps         zmm17, [rax + 448]
+	vmovaps         zmm18, [rax + 512]
+	vmovaps         zmm19, [rax + 576]
+
+    vfmadd231ps     zmm5, zmm10, zmm20
+    vfmadd231ps     zmm6, zmm11, zmm20
+    vfmadd231ps     zmm7, zmm12, zmm20
+    vfmadd231ps     zmm8, zmm13, zmm20
+    vfmadd231ps     zmm9, zmm14, zmm20
+
+    add rcx, 4
+	add rax, 320
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..ba4e6232c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,7 @@
+	vbroadcastss    zmm15, dword ptr [rcx]
+
+    vmovups         zmm8, [rax]
+    vfmadd231ps     zmm0, zmm15, zmm8
+
+    add rcx, 4
+	add rax, 64
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli
new file mode 100644
index 000000000..4a1c31083
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli
@@ -0,0 +1,68 @@
+	vmovups    zmm31, [rcx]
+	// vbroadcastss    zmm17, [rcx + 4 * 0]
+	// vbroadcastss    zmm18, [rcx + 4 * 1]
+	// vbroadcastss    zmm19, [rcx + 4 * 2]
+	// vbroadcastss    zmm20, [rcx + 4 * 3]
+	// vbroadcastss    zmm21, [rcx + 4 * 4]
+	// vbroadcastss    zmm22, [rcx + 4 * 5]
+	// vbroadcastss    zmm23, [rcx + 4 * 6]
+	// vbroadcastss    zmm24, [rcx + 4 * 7]
+	// vbroadcastss    zmm25, [rcx + 4 * 8]
+	// vbroadcastss    zmm26, [rcx + 4 * 9]
+	// vbroadcastss    zmm27, [rcx + 4 * 10]
+	// vbroadcastss    zmm28, [rcx + 4 * 11]
+	// vbroadcastss    zmm29, [rcx + 4 * 12]
+	// vbroadcastss    zmm30, [rcx + 4 * 13]
+	// vbroadcastss    zmm31, [rcx + 4 * 14]
+
+	vbroadcastss zmm16, xmm31
+	valignd zmm17, zmm31, zmm31, 1
+	vbroadcastss zmm17, xmm17
+	valignd zmm18, zmm31, zmm31, 2
+	vbroadcastss zmm18, xmm18
+	valignd zmm19, zmm31, zmm31, 3
+	vbroadcastss zmm19, xmm19
+	valignd zmm20, zmm31, zmm31, 4
+	vbroadcastss zmm20, xmm20
+	valignd zmm21, zmm31, zmm31, 5
+	vbroadcastss zmm21, xmm21
+	valignd zmm22, zmm31, zmm31, 6
+	vbroadcastss zmm22, xmm22
+	valignd zmm23, zmm31, zmm31, 7
+	vbroadcastss zmm23, xmm23
+	valignd zmm24, zmm31, zmm31, 8
+	vbroadcastss zmm24, xmm24
+	valignd zmm25, zmm31, zmm31, 9
+	vbroadcastss zmm25, xmm25
+	valignd zmm26, zmm31, zmm31, 10
+	vbroadcastss zmm26, xmm26
+	valignd zmm27, zmm31, zmm31, 11
+	vbroadcastss zmm27, xmm27
+	valignd zmm28, zmm31, zmm31, 12
+	vbroadcastss zmm28, xmm28
+	valignd zmm29, zmm31, zmm31, 13
+	vbroadcastss zmm29, xmm29
+	valignd zmm30, zmm31, zmm31, 14
+	vbroadcastss zmm30, xmm30
+	valignd zmm31, zmm31, zmm31, 15
+	vbroadcastss zmm31, xmm31
+
+	vfmadd231ps     zmm0, zmm16, [rax + 0]
+    vfmadd231ps     zmm1, zmm17, [rax + 64]
+    vfmadd231ps     zmm2, zmm18, [rax + 128]
+    vfmadd231ps     zmm3, zmm19, [rax + 192]
+	vfmadd231ps     zmm4, zmm20, [rax + 256]
+    vfmadd231ps     zmm5, zmm21, [rax + 320]
+    vfmadd231ps     zmm6, zmm22, [rax + 384]
+    vfmadd231ps     zmm7, zmm23, [rax + 448]
+	vfmadd231ps     zmm8, zmm24, [rax + 512]
+    vfmadd231ps     zmm9, zmm25, [rax + 576]
+    vfmadd231ps     zmm10, zmm26, [rax + 640]
+    vfmadd231ps     zmm11, zmm27, [rax + 704]
+	vfmadd231ps     zmm12, zmm28, [rax + 768]
+    vfmadd231ps     zmm13, zmm29, [rax + 832]
+    vfmadd231ps     zmm14, zmm30, [rax + 896]
+    vfmadd231ps     zmm15, zmm31, [rax + 960]
+
+    add rcx, 64
+	add rax, 1024
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli
new file mode 100644
index 000000000..103be7015
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli
@@ -0,0 +1,24 @@
+	// slow
+	vbroadcastss xmm16, dword ptr [rcx]
+	vbroadcastss xmm17, dword ptr [rcx + 4]
+	vbroadcastss xmm18, dword ptr [rcx + 8]
+	vbroadcastss xmm19, dword ptr [rcx + 12]
+
+	// fast
+	vmovups	   		xmm31, [rcx]
+	vbroadcastss 	zmm16, xmm31
+	valignd 		xmm17, xmm31, xmm31, 1
+	vbroadcastss 	zmm17, xmm17
+	valignd 		xmm18, xmm31, xmm31, 2
+	vbroadcastss 	zmm18, xmm18
+	valignd 		xmm19, xmm31, xmm31, 3
+	vbroadcastss 	zmm19, xmm19
+
+	// commmon
+	vfmadd231ps		zmm0, zmm16, [rax + 0]
+	vfmadd231ps		zmm1, zmm17, [rax + 64]
+	vfmadd231ps		zmm2, zmm18, [rax + 128]
+	vfmadd231ps		zmm3, zmm19, [rax + 192]
+
+	add rcx, 16
+	add rax, 256
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli
new file mode 100644
index 000000000..d6cb277f8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli
@@ -0,0 +1,29 @@
+	vmovups    ymm31, [rcx]
+
+	vbroadcastss zmm16, xmm31
+	valignd ymm17, ymm31, ymm31, 1
+	vbroadcastss zmm17, xmm17
+	valignd ymm18, ymm31, ymm31, 2
+	vbroadcastss zmm18, xmm18
+	valignd ymm19, ymm31, ymm31, 3
+	vbroadcastss zmm19, xmm19
+	valignd ymm20, ymm31, ymm31, 4
+	vbroadcastss zmm20, xmm20
+	valignd ymm21, ymm31, ymm31, 5
+	vbroadcastss zmm21, xmm21
+	valignd ymm22, ymm31, ymm31, 6
+	vbroadcastss zmm22, xmm22
+	valignd ymm23, ymm31, ymm31, 7
+	vbroadcastss zmm23, xmm23
+
+	vfmadd231ps     zmm0, zmm16, [rax + 0]
+    vfmadd231ps     zmm1, zmm17, [rax + 64]
+    vfmadd231ps     zmm2, zmm18, [rax + 128]
+    vfmadd231ps     zmm3, zmm19, [rax + 192]
+	vfmadd231ps     zmm4, zmm20, [rax + 256]
+    vfmadd231ps     zmm5, zmm21, [rax + 320]
+    vfmadd231ps     zmm6, zmm22, [rax + 384]
+    vfmadd231ps     zmm7, zmm23, [rax + 448]
+
+    add rcx, 32
+	add rax, 512
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli
new file mode 100644
index 000000000..8c9bf905b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli
@@ -0,0 +1,11 @@
+	vbroadcastss    zmm15,  dword ptr [rcx]
+
+    vmovaps     zmm8, [rax + 0]
+    vfmadd231ps     zmm0, zmm15, zmm8
+
+	vbroadcastss    zmm16,  dword ptr [rcx + 4]
+    vmovaps     zmm9, [rax + 64]
+    vfmadd231ps     zmm1, zmm16, zmm9
+
+    add rcx, 8
+	add rax, 128
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..4ffab3bd4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,45 @@
+	// Tile size: 1x12
+	// Accumulators: 0-11
+	// Col regs: zmm14
+	// Row regs: zmm15
+
+    vmovaps         zmm15,  [rax]
+
+    vbroadcastss    zmm14, dword ptr [rcx + 0 * 4]
+    vfmadd231ps     zmm0, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 1 * 4]
+    vfmadd231ps     zmm1, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 2 * 4]
+    vfmadd231ps     zmm2, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 3 * 4]
+    vfmadd231ps     zmm3, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 4 * 4]
+    vfmadd231ps     zmm4, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 5 * 4]
+    vfmadd231ps     zmm5, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 6 * 4]
+    vfmadd231ps     zmm6, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 7 * 4]
+    vfmadd231ps     zmm7, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 8 * 4]
+    vfmadd231ps     zmm8, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 9 * 4]
+    vfmadd231ps     zmm9, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 10 * 4]
+    vfmadd231ps     zmm10, zmm15, zmm14
+
+    vbroadcastss    zmm14, dword ptr [rcx + 11 * 4]
+    vfmadd231ps     zmm11, zmm15, zmm14
+
+	add rcx, 48
+	add rax, 64
\ No newline at end of file
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..118d312c8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,53 @@
+	// Accumulators: 0-9
+	// Columns: 15-16
+	// Rows: 10-14
+    vbroadcastss    zmm10,  dword ptr [rcx]
+    vbroadcastss    zmm11,  dword ptr [rcx + 4]
+    vbroadcastss    zmm12,  dword ptr [rcx + 8]
+    vbroadcastss    zmm13,  dword ptr [rcx + 12]
+	vbroadcastss    zmm14,  dword ptr [rcx + 16]
+
+    vmovaps         zmm15,  [rax]
+    vmovaps         zmm16,  [rax + 64]
+
+    vfmadd231ps     zmm0,   zmm15, zmm10
+    vfmadd231ps     zmm1,   zmm16, zmm10
+
+    vfmadd231ps     zmm2,   zmm15, zmm11
+    vfmadd231ps     zmm3,   zmm16, zmm11
+
+    vfmadd231ps     zmm4,   zmm15, zmm12
+    vfmadd231ps     zmm5,   zmm16, zmm12
+
+    vfmadd231ps     zmm6,   zmm15, zmm13
+    vfmadd231ps     zmm7,   zmm16, zmm13
+
+    vfmadd231ps     zmm8,   zmm15, zmm14
+    vfmadd231ps     zmm9,   zmm16, zmm14
+
+    vbroadcastss    zmm10,  dword ptr [rcx + 20]
+    vbroadcastss    zmm11,  dword ptr [rcx + 24]
+    vbroadcastss    zmm12,  dword ptr [rcx + 28]
+    vbroadcastss    zmm13,  dword ptr [rcx + 32]
+    vbroadcastss    zmm14,  dword ptr [rcx + 36]
+
+    vmovaps         zmm15,  [rax + 128]
+    vmovaps         zmm16,  [rax + 192]
+
+    vfmadd231ps     zmm0,   zmm15, zmm10
+    vfmadd231ps     zmm1,   zmm16, zmm10
+
+    vfmadd231ps     zmm2,   zmm15, zmm11
+    vfmadd231ps     zmm3,   zmm16, zmm11
+
+    vfmadd231ps     zmm4,   zmm15, zmm12
+    vfmadd231ps     zmm5,   zmm16, zmm12
+
+    vfmadd231ps     zmm6,   zmm15, zmm13
+    vfmadd231ps     zmm7,   zmm16, zmm13
+
+    vfmadd231ps     zmm8,   zmm15, zmm14
+    vfmadd231ps     zmm9,   zmm16, zmm14
+
+	add rcx, 40
+	add rax, 256
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..e017834d2
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,30 @@
+	// Accumulators: 0-9
+	// Columns: 15
+	// Rows: 10-14
+
+    vbroadcastss    zmm10,  dword ptr [rcx]
+    vbroadcastss    zmm11,  dword ptr [rcx + 4]
+    vbroadcastss    zmm12,  dword ptr [rcx + 8]
+    vbroadcastss    zmm13,  dword ptr [rcx + 12]
+	vbroadcastss    zmm14,  dword ptr [rcx + 16]
+
+    vmovaps         zmm15,  [rax]
+    vmovaps         zmm16,  [rax + 64]
+
+    vfmadd231ps     zmm0,   zmm15, zmm10
+    vfmadd231ps     zmm1,   zmm16, zmm10
+
+    vfmadd231ps     zmm2,   zmm15, zmm11
+    vfmadd231ps     zmm3,   zmm16, zmm11
+
+    vfmadd231ps     zmm4,   zmm15, zmm12
+    vfmadd231ps     zmm5,   zmm16, zmm12
+
+    vfmadd231ps     zmm6,   zmm15, zmm13
+    vfmadd231ps     zmm7,   zmm16, zmm13
+
+    vfmadd231ps     zmm8,   zmm15, zmm14
+    vfmadd231ps     zmm9,   zmm16, zmm14
+
+	add rcx, 20
+	add rax, 128
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..9d6c940a9
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,71 @@
+	// Tile size: 2x6
+	// Accumulators: 0-11
+	// Col regs: zmm14-15
+	// Row regs: zmm12-13
+
+	vbroadcastss	zmm14,	dword ptr [rcx]
+	vmovaps			zmm12,	[rax]
+	vmovaps			zmm13,	[rax + 64]
+	vbroadcastss	zmm15,	dword ptr [rcx + 4]
+
+	vfmadd231ps		zmm0,	zmm12, zmm14
+	vfmadd231ps		zmm1,	zmm13, zmm14
+
+	vbroadcastss	zmm14,	dword ptr [rcx + 8]
+
+	vfmadd231ps		zmm2,	zmm12, zmm15
+	vfmadd231ps		zmm3,	zmm13, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 12]
+
+	vfmadd231ps		zmm4,	zmm12, zmm14
+	vfmadd231ps		zmm5,	zmm13, zmm14
+
+	vbroadcastss	zmm14,	dword ptr [rcx + 16]
+
+	vfmadd231ps		zmm6,	zmm12, zmm15
+	vfmadd231ps		zmm7,	zmm13, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 20]
+
+	vfmadd231ps		zmm8,	zmm12, zmm14
+	vfmadd231ps		zmm9,	zmm13, zmm14
+
+	vbroadcastss	zmm14,	dword ptr [rcx+24]
+
+	vfmadd231ps		zmm10,	 zmm12, zmm15
+	vfmadd231ps		zmm11,	 zmm13, zmm15
+
+	// Iteration two
+	vmovaps			zmm12,	[rax + 128]
+	vmovaps			zmm13,	[rax + 192]
+	vbroadcastss	zmm15,	dword ptr [rcx + 24 + 4]
+
+	vfmadd231ps		zmm0,	zmm12, zmm14
+	vfmadd231ps		zmm1,	zmm13, zmm14
+
+	vbroadcastss	zmm14,	dword ptr [rcx + 24 + 8]
+
+	vfmadd231ps		zmm2,	zmm12, zmm15
+	vfmadd231ps		zmm3,	zmm13, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 24 + 12]
+
+	vfmadd231ps		zmm4,	zmm12, zmm14
+	vfmadd231ps		zmm5,	zmm13, zmm14
+
+	vbroadcastss	zmm14,	dword ptr [rcx + 24 + 16]
+
+	vfmadd231ps		zmm6,	zmm12, zmm15
+	vfmadd231ps		zmm7,	zmm13, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 24 + 20]
+
+	vfmadd231ps		zmm8,	zmm12, zmm14
+	vfmadd231ps		zmm9,	zmm13, zmm14
+
+	vfmadd231ps		zmm10,	 zmm12, zmm15
+	vfmadd231ps		zmm11,	 zmm13, zmm15
+
+	add rax, 256
+	add rcx, 48
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..31f861b10
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,39 @@
+	// Tile size: 2x6
+	// Accumulators: 0-11
+	// Col regs: zmm14-15
+	// Row regs: zmm12-13
+
+	// Load ordered by earliest use for first 2x2 block
+	vbroadcastss	zmm14,	dword ptr [rcx]
+	vmovaps			zmm12,	[rax]
+	vmovaps			zmm13,	[rax + 64]
+	vbroadcastss	zmm15,	dword ptr [rcx + 4]
+
+	vfmadd231ps		zmm0,	zmm12, zmm14
+	vfmadd231ps		zmm1,	zmm13, zmm14
+
+	vbroadcastss	zmm14,	dword ptr [rcx + 8]
+
+	vfmadd231ps		zmm2,	zmm12, zmm15
+	vfmadd231ps		zmm3,	zmm13, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 12]
+
+	vfmadd231ps		zmm4,	zmm12, zmm14
+	vfmadd231ps		zmm5,	zmm13, zmm14
+
+	vbroadcastss	zmm14,	dword ptr [rcx + 16]
+
+	vfmadd231ps		zmm6,	zmm12, zmm15
+	vfmadd231ps		zmm7,	zmm13, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 20]
+
+	vfmadd231ps		zmm8,	zmm12, zmm14
+	vfmadd231ps		zmm9,	zmm13, zmm14
+
+	vfmadd231ps		zmm10,	 zmm12, zmm15
+	vfmadd231ps		zmm11,	 zmm13, zmm15
+
+	add rax, 128
+	add rcx, 24
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..c36b7f6b6
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,63 @@
+	// Tile size: 3x4
+	// Accumulators: 0-11
+	// Col regs: zmm12-14
+	// Row regs: zmm15
+
+	vmovaps			zmm12,	[rax]
+	vmovaps			zmm13,	[rax+64]
+	vmovaps			zmm14,	[rax+128]
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 0]
+
+	vfmadd231ps		zmm0,	zmm12, zmm15
+	vfmadd231ps		zmm1,	zmm13, zmm15
+	vfmadd231ps		zmm2,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 4]
+
+	vfmadd231ps		zmm3,	zmm12, zmm15
+	vfmadd231ps		zmm4,	zmm13, zmm15
+	vfmadd231ps		zmm5,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 8]
+
+	vfmadd231ps		zmm6,	zmm12, zmm15
+	vfmadd231ps		zmm7,	zmm13, zmm15
+	vfmadd231ps		zmm8,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 12]
+
+	vfmadd231ps		zmm9,	zmm12, zmm15
+	vfmadd231ps		zmm10,	 zmm13, zmm15
+	vfmadd231ps		zmm11,	 zmm14, zmm15
+
+	vmovaps			zmm12,	[rax + 192]
+	vmovaps			zmm13,	[rax + 256]
+	vmovaps			zmm14,	[rax + 320]
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 16]
+
+	vfmadd231ps		zmm0,	zmm12, zmm15
+	vfmadd231ps		zmm1,	zmm13, zmm15
+	vfmadd231ps		zmm2,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 20]
+
+	vfmadd231ps		zmm3,	zmm12, zmm15
+	vfmadd231ps		zmm4,	zmm13, zmm15
+	vfmadd231ps		zmm5,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 24]
+
+	vfmadd231ps		zmm6,	zmm12, zmm15
+	vfmadd231ps		zmm7,	zmm13, zmm15
+	vfmadd231ps		zmm8,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 28]
+
+	vfmadd231ps		zmm9,	zmm12, zmm15
+	vfmadd231ps		zmm10,	 zmm13, zmm15
+	vfmadd231ps		zmm11,	 zmm14, zmm15
+
+	add rax, 384
+	add rcx, 32
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..a8b1c3221
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,35 @@
+	// Tile size: 3x4
+	// Accumulators: 0-11
+	// Col regs: zmm12-14
+	// Row regs: zmm15
+
+	vmovaps			zmm12,	[rax]
+	vmovaps			zmm13,	[rax+64]
+	vmovaps			zmm14,	[rax+128]
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 0]
+
+	vfmadd231ps		zmm0,	zmm12, zmm15
+	vfmadd231ps		zmm1,	zmm13, zmm15
+	vfmadd231ps		zmm2,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 4]
+
+	vfmadd231ps		zmm3,	zmm12, zmm15
+	vfmadd231ps		zmm4,	zmm13, zmm15
+	vfmadd231ps		zmm5,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 8]
+
+	vfmadd231ps		zmm6,	zmm12, zmm15
+	vfmadd231ps		zmm7,	zmm13, zmm15
+	vfmadd231ps		zmm8,	zmm14, zmm15
+
+	vbroadcastss	zmm15,	dword ptr [rcx + 12]
+
+	vfmadd231ps		zmm9,	zmm12, zmm15
+	vfmadd231ps		zmm10,	 zmm13, zmm15
+	vfmadd231ps		zmm11,	 zmm14, zmm15
+
+	add rax, 192
+	add rcx, 16
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..fe661b7fa
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,69 @@
+	// Tile size: 4x3
+	// Accumulators: 0-11
+	// Col regs: zmm12
+	// Row regs: zmm13-15
+
+	// Load col of A
+	vmovaps			zmm12,	[rax]
+
+	// Fill 3 cols of B
+	vbroadcastss	zmm13,	dword ptr [rcx + 0]
+	vbroadcastss	zmm14,	dword ptr [rcx + 4]
+	vbroadcastss	zmm15,	dword ptr [rcx + 8]
+
+	// N.B. Stepping cols in inner loop
+	vfmadd231ps		zmm0,	zmm12, zmm13
+	vfmadd231ps		zmm4,	zmm12, zmm14
+	vfmadd231ps		zmm8,	zmm12, zmm15
+
+	vmovaps			zmm12,	[rax+64]
+
+	vfmadd231ps		zmm1,	zmm12, zmm13
+	vfmadd231ps		zmm5,	zmm12, zmm14
+	vfmadd231ps		zmm9,	zmm12, zmm15
+
+	vmovaps			zmm12,	[rax+128]
+
+	vfmadd231ps		zmm2,	zmm12, zmm13
+	vfmadd231ps		zmm6,	zmm12, zmm14
+	vfmadd231ps		zmm10,	 zmm12, zmm15
+
+	vmovaps			zmm12,	[rax+192]
+
+	vfmadd231ps		zmm3,	zmm12, zmm13
+	vfmadd231ps		zmm7,	zmm12, zmm14
+	vfmadd231ps		zmm11,	zmm12, zmm15
+
+	// Load col of A, switching col!
+	vmovaps			zmm13,	[rax + 256]
+
+	// Fill 3 cols of B
+	vbroadcastss	zmm14,	dword ptr [rcx + 12]
+	vbroadcastss	zmm15,	dword ptr [rcx + 16]
+	vbroadcastss	zmm12,	dword ptr [rcx + 20]
+
+	// N.B. Stepping cols in inner loop
+	vfmadd231ps		zmm0,	zmm13, zmm14
+	vfmadd231ps		zmm4,	zmm13, zmm15
+	vfmadd231ps		zmm8,	zmm13, zmm12
+
+	vmovaps			zmm13,	[rax + 320]
+
+	vfmadd231ps		zmm1,	zmm13, zmm14
+	vfmadd231ps		zmm5,	zmm13, zmm15
+	vfmadd231ps		zmm9,	zmm13, zmm12
+
+	vmovaps			zmm13,	[rax + 384]
+
+	vfmadd231ps		zmm2,	zmm13, zmm14
+	vfmadd231ps		zmm6,	zmm13, zmm15
+	vfmadd231ps		zmm10,	 zmm13, zmm12
+
+	vmovaps			zmm13,	[rax + 448]
+
+	vfmadd231ps		zmm3,	zmm13, zmm14
+	vfmadd231ps		zmm7,	zmm13, zmm15
+	vfmadd231ps		zmm11,	zmm13, zmm12
+
+    add             rcx,    24
+    add             rax,    512
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..0e71a747e
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,38 @@
+	// Tile size: 4x3
+	// Accumulators: 0-11
+	// Col regs: zmm12
+	// Row regs: zmm13-15
+
+	// Load col of A
+	vmovaps			zmm12,	[rax]
+
+	// Fill 3 cols of B
+	vbroadcastss	zmm13,	dword ptr [rcx + 0]
+	vbroadcastss	zmm14,	dword ptr [rcx + 4]
+	vbroadcastss	zmm15,	dword ptr [rcx + 8]
+
+	// N.B. Stepping cols in inner loop
+	vfmadd231ps		zmm0,	zmm12, zmm13
+	vfmadd231ps		zmm4,	zmm12, zmm14
+	vfmadd231ps		zmm8,	zmm12, zmm15
+
+	vmovaps			zmm12,	[rax+64]
+
+	vfmadd231ps		zmm1,	zmm12, zmm13
+	vfmadd231ps		zmm5,	zmm12, zmm14
+	vfmadd231ps		zmm9,	zmm12, zmm15
+
+	vmovaps			zmm12,	[rax+128]
+
+	vfmadd231ps		zmm2,	zmm12, zmm13
+	vfmadd231ps		zmm6,	zmm12, zmm14
+	vfmadd231ps		zmm10,	 zmm12, zmm15
+
+	vmovaps			zmm12,	[rax+192]
+
+	vfmadd231ps		zmm3,	zmm12, zmm13
+	vfmadd231ps		zmm7,	zmm12, zmm14
+	vfmadd231ps		zmm11,	zmm12, zmm15
+
+    add             rcx,    12
+    add             rax,    256
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..6a5b887b8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,63 @@
+	// Tile size: 5x2
+	// Accumulators: 0-9
+	// Col regs: zmm10-13
+	// Row regs: zmm14-15
+
+	vmovaps			zmm10,	[rax]
+	vbroadcastss	zmm14,	dword ptr [rcx + 0]
+	vbroadcastss	zmm15,	dword ptr [rcx + 4]
+	vmovaps			zmm11,	[rax + 64]
+
+	// NB stepping column-wise
+	vfmadd231ps		zmm0,	zmm10, zmm14
+	vfmadd231ps		zmm5,	zmm10, zmm15
+
+	vmovaps			zmm12,	[rax + 128]
+
+	vfmadd231ps		zmm1,	zmm11, zmm14
+	vfmadd231ps		zmm6,	zmm11, zmm15
+
+	vmovaps			zmm13,	[rax + 192]
+
+	vfmadd231ps		zmm2,	zmm12, zmm14
+	vfmadd231ps		zmm7,	zmm12, zmm15
+
+	vmovaps			zmm10,	[rax + 256]
+
+	vfmadd231ps		zmm3,	zmm13, zmm14
+	vfmadd231ps		zmm8,	zmm13, zmm15
+
+	vmovaps			zmm11,	[rax + 320]
+
+	vfmadd231ps		zmm4,	zmm10, zmm14
+	vfmadd231ps		zmm9,	zmm10, zmm15
+
+	vbroadcastss	zmm14,	dword ptr [rcx + 8]
+	vbroadcastss	zmm15,	dword ptr [rcx + 12]
+
+	vmovaps			zmm12,	[rax + 384]
+
+	// NB stepping column-wise
+	vfmadd231ps		zmm0,	zmm11, zmm14
+	vfmadd231ps		zmm5,	zmm11, zmm15
+
+	vmovaps			zmm13,	[rax + 448]
+
+	vfmadd231ps		zmm1,	zmm12, zmm14
+	vfmadd231ps		zmm6,	zmm12, zmm15
+
+	vmovaps			zmm10,	[rax + 512]
+
+	vfmadd231ps		zmm2,	zmm13, zmm14
+	vfmadd231ps		zmm7,	zmm13, zmm15
+
+	vmovaps			zmm11,	[rax + 576]
+
+	vfmadd231ps		zmm3,	zmm10, zmm14
+	vfmadd231ps		zmm8,	zmm10, zmm15
+
+	vfmadd231ps		zmm4,	zmm11, zmm14
+	vfmadd231ps		zmm9,	zmm11, zmm15
+
+	add rax, 640
+	add rcx, 16
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..73ef89b58
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,34 @@
+	// Tile size: 5x2
+	// Accumulators: 0-9
+	// Col regs: zmm10-14
+	// Row regs: zmm15-16
+
+	vmovaps			zmm10,	[rax]
+	vbroadcastss	zmm15,	dword ptr [rcx + 0]
+	vbroadcastss	zmm16,	dword ptr [rcx + 4]
+	vmovaps			zmm11,	[rax + 64]
+
+	// NB stepping column-wise
+	vfmadd231ps		zmm0,	zmm10, zmm15
+	vfmadd231ps		zmm5,	zmm10, zmm16
+
+	vmovaps			zmm12,	[rax + 128]
+
+	vfmadd231ps		zmm1,	zmm11, zmm15
+	vfmadd231ps		zmm6,	zmm11, zmm16
+
+	vmovaps			zmm13,	[rax + 192]
+
+	vfmadd231ps		zmm2,	zmm12, zmm15
+	vfmadd231ps		zmm7,	zmm12, zmm16
+
+	vmovaps			zmm14,	[rax + 256]
+
+	vfmadd231ps		zmm3,	zmm13, zmm15
+	vfmadd231ps		zmm8,	zmm13, zmm16
+
+	vfmadd231ps		zmm4,	zmm14, zmm15
+	vfmadd231ps		zmm9,	zmm14, zmm16
+
+	add rax, 320
+	add rcx, 8
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..8c7704433
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,25 @@
+	// Tile size: 6x1
+	// Accumulators: 0-5
+	// Col regs: 6-11
+	// Row regs: 15
+
+
+    vbroadcastss    zmm15,  dword ptr [rcx]
+    vfmadd231ps     zmm0, zmm15, [rax]
+    vfmadd231ps     zmm1, zmm15, [rax + 64]
+    vfmadd231ps     zmm2, zmm15, [rax + 128]
+    vfmadd231ps     zmm3, zmm15, [rax + 192]
+    vfmadd231ps     zmm4, zmm15, [rax + 256]
+    vfmadd231ps     zmm5, zmm15, [rax + 320]
+
+    vbroadcastss    zmm14,  dword ptr [rcx + 4]
+
+    vfmadd231ps     zmm0, zmm14, [rax + 384]
+    vfmadd231ps     zmm1, zmm14, [rax + 448]
+    vfmadd231ps     zmm2, zmm14, [rax + 512]
+    vfmadd231ps     zmm3, zmm14, [rax + 576]
+    vfmadd231ps     zmm4, zmm14, [rax + 640]
+    vfmadd231ps     zmm5, zmm14, [rax + 704]
+
+	add rax, 768
+    add rcx, 8
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..a34c40fee
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,29 @@
+	// Tile size: 6x1
+	// Accumulators: 0-5
+	// Col regs: 6-11
+	// Row regs: 15
+
+    vbroadcastss    zmm15,  dword ptr [rcx]
+
+	vmovups     zmm10, [rax]
+	vmulps     zmm10, zmm10, zmm15
+	vaddps     zmm0, zmm0, zmm10
+    vmovups     zmm11, [rax + 64]
+	vmulps     zmm11, zmm11, zmm15
+	vaddps     zmm1, zmm1, zmm11
+    vmovups     zmm12, [rax + 128]
+	vmulps     zmm12, zmm12, zmm15
+	vaddps     zmm2, zmm2, zmm12
+    vmovups     zmm13, [rax + 192]
+	vmulps     zmm13, zmm13, zmm15
+	vaddps     zmm3, zmm3, zmm13
+    vmovups     zmm14, [rax + 256]
+	vmulps     zmm14, zmm14, zmm15
+	vaddps     zmm4, zmm4, zmm14
+    vmovups     zmm15, [rax + 320]
+	vmulps     zmm15, zmm15, zmm15
+	vaddps     zmm5, zmm5, zmm15
+
+
+    add rcx, 4
+	add rax, 384
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..58ed8f433
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,70 @@
+    // Tile size: 6x2
+	// Accumulators: 0-9
+	// Col regs: zmm10-13
+	// Row regs: zmm14-15
+
+	vmovaps         zmm12,  [rax]
+	vbroadcastss    zmm14,  dword ptr [rcx + 0]
+    vbroadcastss    zmm15,  dword ptr [rcx + 4]
+	vmovaps         zmm13,  [rax + 64]
+
+    vfmadd231ps     zmm0,   zmm12, zmm14
+    vfmadd231ps     zmm6,   zmm12, zmm15
+
+	vmovaps         zmm12,  [rax + 128]
+
+    vfmadd231ps     zmm1,   zmm13, zmm14
+    vfmadd231ps     zmm7,   zmm13, zmm15
+
+	vmovaps         zmm13,  [rax + 192]
+
+    vfmadd231ps     zmm2,   zmm12, zmm14
+    vfmadd231ps     zmm8,   zmm12, zmm15
+
+	vmovaps         zmm12,  [rax + 256]
+
+	vfmadd231ps     zmm3,   zmm13, zmm14
+    vfmadd231ps     zmm9,   zmm13, zmm15
+
+	vmovaps         zmm13,  [rax + 320]
+
+	vfmadd231ps     zmm4,   zmm12, zmm14
+    vfmadd231ps     zmm10,  zmm12, zmm15
+
+	vmovaps         zmm12,  [rax + 384]
+	vbroadcastss    zmm14,  dword ptr [rcx + 8]
+
+	vfmadd231ps     zmm5,   zmm13, zmm14
+    vfmadd231ps     zmm11, 	zmm13, zmm15
+
+    vbroadcastss    zmm15,  dword ptr [rcx + 12]
+	vmovaps         zmm13,  [rax + 448]
+
+    vfmadd231ps     zmm0,   zmm12, zmm14
+    vfmadd231ps     zmm6,   zmm12, zmm15
+
+	vmovaps         zmm12,  [rax + 512]
+
+    vfmadd231ps     zmm1,   zmm13, zmm14
+    vfmadd231ps     zmm7,   zmm13, zmm15
+
+	vmovaps         zmm13,  [rax + 576]
+
+    vfmadd231ps     zmm2,   zmm12, zmm14
+    vfmadd231ps     zmm8,   zmm12, zmm15
+
+	vmovaps         zmm12,  [rax + 640]
+
+	vfmadd231ps     zmm3,   zmm13, zmm14
+    vfmadd231ps     zmm9,   zmm13, zmm15
+
+	vmovaps         zmm13,  [rax + 704]
+
+	vfmadd231ps     zmm4,   zmm12, zmm14
+    vfmadd231ps     zmm10,  zmm12, zmm15
+
+	vfmadd231ps     zmm5,   zmm13, zmm14
+    vfmadd231ps     zmm11, 	zmm13, zmm15
+
+	add rax, 768
+	add rcx, 16
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..0fa5fa8e4
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,38 @@
+    // Tile size: 6x2
+	// Accumulators: 0-11
+	// Col regs: 12-13
+	// Row regs: 14-15
+
+	vmovaps         zmm12,  [rax]
+	vbroadcastss    zmm14,  dword ptr [rcx + 0]
+    vbroadcastss    zmm15,  dword ptr [rcx + 4]
+	vmovaps         zmm13,  [rax + 64]
+
+    vfmadd231ps     zmm0,   zmm12, zmm14
+    vfmadd231ps     zmm6,   zmm12, zmm15
+
+	vmovaps         zmm12,  [rax + 128]
+
+    vfmadd231ps     zmm1,   zmm13, zmm14
+    vfmadd231ps     zmm7,   zmm13, zmm15
+
+	vmovaps         zmm13,  [rax + 192]
+
+    vfmadd231ps     zmm2,   zmm12, zmm14
+    vfmadd231ps     zmm8,   zmm12, zmm15
+
+	vmovaps         zmm12,  [rax + 256]
+
+	vfmadd231ps     zmm3,   zmm13, zmm14
+    vfmadd231ps     zmm9,   zmm13, zmm15
+
+	vmovaps         zmm13,  [rax + 320]
+
+	vfmadd231ps     zmm4,   zmm12, zmm14
+    vfmadd231ps     zmm10,  zmm12, zmm15
+
+	vfmadd231ps     zmm5,   zmm13, zmm14
+    vfmadd231ps     zmm11, 	zmm13, zmm15
+
+	add rcx, 8
+	add rax, 384
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..e23d79d2d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,40 @@
+	// Tile size: 6x1
+	// Accumulators: 0-5
+	// Col regs: 6-11
+	// Row regs: 15
+
+	vbroadcastss    zmm15,  dword ptr [rcx]
+
+    vmovaps         zmm7,  [rax + 0]
+	vmovaps         zmm8,  [rax + 64]
+	vmovaps         zmm9,  [rax + 128]
+	vmovaps         zmm10, [rax + 192]
+    vmovaps         zmm11, [rax + 256]
+	vmovaps         zmm12, [rax + 320]
+	vmovaps         zmm13, [rax + 384]
+
+    vfmadd231ps     zmm0, zmm7, zmm15
+    vfmadd231ps     zmm1, zmm8, zmm15
+    vfmadd231ps     zmm2, zmm9, zmm15
+    vfmadd231ps     zmm3, zmm10, zmm15
+    vfmadd231ps     zmm4, zmm11, zmm15
+    vfmadd231ps     zmm5, zmm12, zmm15
+	vfmadd231ps     zmm6, zmm13, zmm15
+
+	vbroadcastss    zmm16,  dword ptr [rcx + 4]
+
+    vmovaps         zmm7,  [rax + 448 + 0]
+	vmovaps         zmm8,  [rax + 448 + 64]
+	vmovaps         zmm9,  [rax + 448 + 128]
+	vmovaps         zmm10, [rax + 448 + 192]
+    vmovaps         zmm11, [rax + 448 + 256]
+	vmovaps         zmm12, [rax + 448 + 320]
+	vmovaps         zmm13, [rax + 448 + 384]
+
+    vfmadd231ps     zmm0, zmm7, zmm15
+    vfmadd231ps     zmm1, zmm8, zmm15
+    vfmadd231ps     zmm2, zmm9, zmm15
+    vfmadd231ps     zmm3, zmm10, zmm15
+    vfmadd231ps     zmm4, zmm11, zmm15
+    vfmadd231ps     zmm5, zmm12, zmm15
+	vfmadd231ps     zmm6, zmm13, zmm15
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..889cb34e9
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,21 @@
+	// Tile size: 7x1
+	// Accumulators: 0-6
+	// Col regs: 6-13
+	// Row regs: 15
+    vbroadcastss    zmm15,  dword ptr [rcx]
+
+    vmovaps         zmm7,  [rax + 0]
+	vmovaps         zmm8,  [rax + 64]
+	vmovaps         zmm9,  [rax + 128]
+	vmovaps         zmm10, [rax + 192]
+    vmovaps         zmm11, [rax + 256]
+	vmovaps         zmm12, [rax + 320]
+	vmovaps         zmm13, [rax + 384]
+
+    vfmadd231ps     zmm0, zmm7, zmm15
+    vfmadd231ps     zmm1, zmm8, zmm15
+    vfmadd231ps     zmm2, zmm9, zmm15
+    vfmadd231ps     zmm3, zmm10, zmm15
+    vfmadd231ps     zmm4, zmm11, zmm15
+    vfmadd231ps     zmm5, zmm12, zmm15
+	vfmadd231ps     zmm6, zmm13, zmm15
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..96d0d9863
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,30 @@
+	// Tile size: 8x1
+	// Accumulators: 0-7
+	// Col regs: 8-14
+	// Row regs: 15
+
+	vbroadcastss    zmm17,  dword ptr [rcx]
+
+
+    vfmadd231ps     zmm0, zmm17, [rax + 0]
+    vfmadd231ps     zmm1, zmm17, [rax + 64]
+    vfmadd231ps     zmm2, zmm17, [rax + 128]
+    vfmadd231ps     zmm3, zmm17, [rax + 192]
+    vfmadd231ps     zmm4, zmm17, [rax + 256]
+    vfmadd231ps     zmm5, zmm17, [rax + 320]
+    vfmadd231ps     zmm6, zmm17, [rax + 384]
+    vfmadd231ps     zmm7, zmm17, [rax + 448]
+
+	vbroadcastss    zmm16,  dword ptr [rcx + 4]
+
+	vfmadd231ps     zmm0, zmm16, [rax + 0 + 512]
+    vfmadd231ps     zmm1, zmm16, [rax + 64 + 512]
+    vfmadd231ps     zmm2, zmm16, [rax + 128 + 512]
+    vfmadd231ps     zmm3, zmm16, [rax + 192 + 512]
+    vfmadd231ps     zmm4, zmm16, [rax + 256 + 512]
+    vfmadd231ps     zmm5, zmm16, [rax + 320 + 512]
+    vfmadd231ps     zmm6, zmm16, [rax + 384 + 512]
+    vfmadd231ps     zmm7, zmm16, [rax + 448 + 512]
+
+    add rcx, 8
+	add rax, 1024
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..38d57ce66
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,25 @@
+	// Tile size: 8x1
+	// Accumulators: 0-7
+	// Col regs: 8-14
+	// Row regs: 15
+
+	vbroadcastss    zmm15,  dword ptr [rcx]
+
+    vmovaps     zmm8, [rax + 0]
+    vfmadd231ps     zmm0, zmm15, zmm8
+    vmovaps     zmm9, [rax + 64]
+    vfmadd231ps     zmm1, zmm15, zmm9
+    vmovaps     zmm10, [rax + 128]
+    vfmadd231ps     zmm2, zmm15, zmm10
+    vmovaps     zmm11, [rax + 192]
+	vfmadd231ps     zmm3, zmm15, zmm11
+    vmovaps     zmm12, [rax + 256]
+	vfmadd231ps     zmm4, zmm15, zmm12
+    vmovaps     zmm13, [rax + 320]
+	vfmadd231ps     zmm5, zmm15, zmm13
+    vmovaps     zmm14, [rax + 384]
+	vfmadd231ps     zmm6, zmm15, zmm14
+    vmovaps     zmm8, [rax + 448]
+	vfmadd231ps     zmm7, zmm15, zmm8
+    add rcx, 4
+	add rax, 512
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..772651ce8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,42 @@
+	// Tile size: 8x2
+	// Accumulators: 0-15
+	// Col regs: 16-23
+	// Row regs: 24-25
+
+    vmovaps         zmm16,  [rax + 0]
+	vbroadcastss	zmm24,	dword ptr [rcx + 0]
+	vbroadcastss	zmm25,	dword ptr [rcx + 4]
+
+    vfmadd231ps     zmm0, zmm16, zmm24
+    vfmadd231ps     zmm8, zmm16, zmm25
+
+    vmovaps         zmm17,  [rax + 64]
+    vfmadd231ps     zmm1, zmm17, zmm24
+    vfmadd231ps     zmm9, zmm17, zmm25
+
+    vmovaps         zmm18,  [rax + 128]
+    vfmadd231ps     zmm2, zmm18, zmm24
+    vfmadd231ps     zmm10, zmm18, zmm25
+
+    vmovaps         zmm19,  [rax + 192]
+    vfmadd231ps     zmm3, zmm19, zmm24
+    vfmadd231ps     zmm11, zmm19, zmm25
+
+    vmovaps         zmm20,  [rax + 256]
+    vfmadd231ps     zmm4, zmm20, zmm24
+    vfmadd231ps     zmm12, zmm20, zmm25
+
+    vmovaps         zmm21,  [rax + 320]
+    vfmadd231ps     zmm5, zmm21, zmm24
+    vfmadd231ps     zmm13, zmm21, zmm25
+
+    vmovaps         zmm22,  [rax + 384]
+    vfmadd231ps     zmm6, zmm22, zmm24
+    vfmadd231ps     zmm14, zmm22, zmm25
+
+    vmovaps         zmm23,  [rax + 448]
+    vfmadd231ps     zmm7, zmm23, zmm24
+    vfmadd231ps     zmm15, zmm23, zmm25
+
+	add rax, 512
+	add rcx, 8
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli
new file mode 100644
index 000000000..1400fdf0d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli
@@ -0,0 +1,61 @@
+	// Tile size: 1x8
+	// Accumulators: 0-7
+	// Col regs: 8-14
+	// Row regs: 15
+
+
+    vmovaps         zmm15,  [rax]
+
+    vbroadcastss    zmm8, dword ptr [rcx + 0 * 4]
+    vfmadd231ps     zmm0, zmm15, zmm8
+
+    vbroadcastss    zmm9, dword ptr [rcx + 1 * 4]
+    vfmadd231ps     zmm1, zmm15, zmm9
+
+    vbroadcastss    zmm10, dword ptr [rcx + 2 * 4]
+    vfmadd231ps     zmm2, zmm15, zmm10
+
+    vbroadcastss    zmm11, dword ptr [rcx + 3 * 4]
+    vfmadd231ps     zmm3, zmm15, zmm11
+
+    vbroadcastss    zmm12, dword ptr [rcx + 4 * 4]
+    vfmadd231ps     zmm4, zmm15, zmm12
+
+    vbroadcastss    zmm13, dword ptr [rcx + 5 * 4]
+    vfmadd231ps     zmm5, zmm15, zmm13
+
+    vbroadcastss    zmm10, dword ptr [rcx + 6 * 4]
+    vfmadd231ps     zmm6, zmm15, zmm10
+
+    vbroadcastss    zmm11, dword ptr [rcx + 7 * 4]
+    vfmadd231ps     zmm7, zmm15, zmm11
+
+
+    vmovaps         zmm15,  [rax+64]
+
+    vbroadcastss    zmm8, dword ptr [rcx + 8 * 4]
+    vfmadd231ps     zmm0, zmm15, zmm8
+
+    vbroadcastss    zmm9, dword ptr [rcx + 9 * 4]
+    vfmadd231ps     zmm1, zmm15, zmm9
+
+    vbroadcastss    zmm10, dword ptr [rcx + 10 * 4]
+    vfmadd231ps     zmm2, zmm15, zmm10
+
+    vbroadcastss    zmm11, dword ptr [rcx + 11 * 4]
+    vfmadd231ps     zmm3, zmm15, zmm11
+
+    vbroadcastss    zmm12, dword ptr [rcx + 12 * 4]
+    vfmadd231ps     zmm4, zmm15, zmm12
+
+    vbroadcastss    zmm13, dword ptr [rcx + 13 * 4]
+    vfmadd231ps     zmm5, zmm15, zmm13
+
+    vbroadcastss    zmm10, dword ptr [rcx + 14 * 4]
+    vfmadd231ps     zmm6, zmm15, zmm10
+
+    vbroadcastss    zmm11, dword ptr [rcx + 15 * 4]
+    vfmadd231ps     zmm7, zmm15, zmm11
+
+	add rcx, 64
+	add rax, 128
\ No newline at end of file
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli
new file mode 100644
index 000000000..c08151c2a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli
@@ -0,0 +1,33 @@
+	// Tile size: 1x8
+	// Accumulators: 0-7
+	// Col regs: 8-14
+	// Row regs: 15
+
+    vmovaps         zmm15,  [rax]
+
+    vbroadcastss    zmm8, dword ptr [rcx + 0 * 4]
+    vfmadd231ps     zmm0, zmm15, zmm8
+
+    vbroadcastss    zmm9, dword ptr [rcx + 1 * 4]
+    vfmadd231ps     zmm1, zmm15, zmm9
+
+    vbroadcastss    zmm10, dword ptr [rcx + 2 * 4]
+    vfmadd231ps     zmm2, zmm15, zmm10
+
+    vbroadcastss    zmm11, dword ptr [rcx + 3 * 4]
+    vfmadd231ps     zmm3, zmm15, zmm11
+
+    vbroadcastss    zmm12, dword ptr [rcx + 4 * 4]
+    vfmadd231ps     zmm4, zmm15, zmm12
+
+    vbroadcastss    zmm13, dword ptr [rcx + 5 * 4]
+    vfmadd231ps     zmm5, zmm15, zmm13
+
+    vbroadcastss    zmm10, dword ptr [rcx + 6 * 4]
+    vfmadd231ps     zmm6, zmm15, zmm10
+
+    vbroadcastss    zmm11, dword ptr [rcx + 7 * 4]
+    vfmadd231ps     zmm7, zmm15, zmm11
+
+	add rcx, 32
+	add rax, 64
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_128x1.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_128x1.tmpl
new file mode 100644
index 000000000..382ae2ca6
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_128x1.tmpl
@@ -0,0 +1,110 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 128 x 1
+
+    zmm0
+    zmm1
+    ...
+    zmm7
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" size:"128x1", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{align}} 16
+{{L}}main_loop_packed_packed:
+	{% include "8x1/packed_packed_loop1/avx-512.tmpli" %}
+
+    sub             rbx, 1
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:7 %}
+{% include "f32_per_rows.tmpliq" mr:128, from:0, to:7 %}
+{% include "f32_per_cols.tmpliq" mr:128, from:0, to:7 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:7 %}
+
+{{L}}add_unicast:
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+
+    {% for row in (0..7) %}
+        vaddps zmm{{row}}, zmm{{row}}, [ r10 + {{row|times:64}} ]
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vbroadcastss    zmm14, dword ptr [rbx]
+
+{% for i in (0..7) %}
+    vmovups         zmm12,  [rax + {{i|times:64}}]
+    vfmadd231ps     zmm{{i}}, zmm12, zmm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+
+    cmp     rsi, 4
+    jne      {{L}}store_noncontiguous
+
+	test r8, 63
+	jnz {{L}}store_unaligned
+
+	{% for row in (0..7) %}
+        vmovaps [r8 + {{row|times:64}}], zmm{{row}}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_unaligned:
+	{% for row in (0..7) %}
+        vmovups [r8 + {{row|times:64}}], zmm{{row}}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_noncontiguous:
+    {% for r in (0..7) %}
+        {% for quarter in (0..3) %}
+            vextractf32x4 xmm8, zmm{{r}}, {{quarter}}
+            {% for row in (0..3) %}
+                vextractps  dword ptr [r8], xmm8, {{row}}
+                add         r8, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"128x1", suffix:suffix, G:G, L:L, arch:"avx512" %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x1.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x1.tmpl
new file mode 100644
index 000000000..5f2f57a07
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x1.tmpl
@@ -0,0 +1,143 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 16 x 1
+
+    zmm0
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+
+{% include "preamble.tmpliq" size:"16x1", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+	cmp rbx, 8
+	jl {{L}}main_loop_packed_packed_tail
+
+{{align}} 16
+{{L}}main_loop_packed_packed:
+	{% include "1x1/packed_packed_loop1/unroll-4.tmpli" %}
+
+    sub             rbx, 4
+	cmp rbx,        4
+	jge              {{L}}main_loop_packed_packed
+
+	{% for r in (1..3) %}
+	   vaddps zmm0, zmm0, zmm{{r}}
+	{% endfor %}
+
+    test    rbx, rbx
+    jz      {{L}}non_linear_loop
+
+{{align}} 16
+{{L}}main_loop_packed_packed_tail:
+	{% include "1x1/packed_packed_loop1/avx-512.tmpli" %}
+
+	sub             rbx, 1
+    jnz				{{L}}main_loop_packed_packed_tail
+
+    jmp      {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:0 %}
+{% include "f32_per_rows.tmpliq" mr:16, from:0, to:0 %}
+{% include "f32_per_cols.tmpliq" mr:16, from:0, to:0 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:0 %}
+
+{{L}}add_unicast:
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+
+	cmp rsi, 4
+	jne {{L}}add_unicast_generic
+
+	vaddps zmm0, zmm0, [r10]
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_unicast_generic:
+    mov r8, [0]
+//     mov     eax,    0
+// {% for i in (0..3) %}
+//     pinsrd  xmm14, eax, {{i}}
+//     add     eax,    esi
+// {% endfor %}
+// {% for i in (0..3) %}
+//     pinsrd  xmm15, eax, {{i}}
+//     add     eax,    esi
+// {% endfor %}
+//
+//     vperm2f128      zmm14,  zmm14, zmm15,         32 // zmm14 <- xmm14::xmm15
+//
+// {% for i in (0..7) %}
+//     vpcmpeqd        zmm15,  zmm15, zmm15
+//     vgatherdps      zmm12,  [ r10 + zmm14 ], zmm15
+//
+//     vaddps          zmm{{i}},   zmm{{i}},   zmm12
+//     lea             r10, [ r10 + rsi * 8 ]
+// {% endfor %}
+//
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vbroadcastss    zmm14, dword ptr [rbx]
+
+{% for i in (0..0) %}
+    vmovups         zmm12,  [rax + {{i|times:64}}]
+    vfmadd231ps     zmm{{i}}, zmm12, zmm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+
+    cmp     rsi, 4
+    jne      {{L}}store_noncontiguous
+
+	test r8, 63
+	jnz {{L}}store_unaligned
+
+    vmovaps [r8], zmm0
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_unaligned:
+	vmovups [r8], zmm0
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_noncontiguous:
+    {% for quarter in (0..3) %}
+        vextractf32x4 xmm8, zmm0, {{quarter}}
+        {% for row in (0..3) %}
+            vextractps  dword ptr [r8], xmm8, {{row}}
+            add         r8, rsi
+        {% endfor %}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"16x1", suffix:suffix, G:G, L:L, arch:"avx512" %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x12.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x12.tmpl
new file mode 100644
index 000000000..634454a8b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x12.tmpl
@@ -0,0 +1,165 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 16 x 12
+
+    zmm0 zmm1 ... zmm11
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+
+{% include "preamble.tmpliq" size:"16x12", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{align}} 16
+{{L}}main_loop_packed_packed_tail:
+	{% include "1x12/packed_packed_loop1/avx-512.tmpli" %}
+
+	sub             rbx, 1
+    jnz				{{L}}main_loop_packed_packed_tail
+
+    jmp      {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:11 %}
+{% include "f32_per_rows.tmpliq" mr:16, from:0, to:11 %}
+{% include "f32_per_cols.tmpliq" mr:16, from:0, to:11 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:11 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm12, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm13, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+    vperm2f128      ymm13,  ymm12, ymm13,         32 // ymm12 <- xmm12::xmm13
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+{% for i in (0..11) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i}},   zmm{{i}},   zmm12
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         zmm12, zmmword ptr [rax]
+
+{% for i in (0..11) %}
+    vbroadcastss    zmm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     zmm{{i}},   zmm12, zmm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+
+    {% for quarter in (0..3) %}
+        {% for r in (0..3) %}
+            vextractf32x4 xmm{{r | plus: 12}}, zmm{{r}}, {{quarter}}
+        {% endfor %}
+        {% for row in (0..3) %}
+            {% for i in (0..3) %}
+                vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                add         r{{i | plus: 8}}, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    mov     r8,     [rdi + 8]           // c ptr
+
+    // tops of cols
+    lea     r8,     [ r8 + 4 * rbx ]
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+
+    {% for quarter in (0..3) %}
+        {% for r in (0..3) %}
+            vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 4}}, {{quarter}}
+        {% endfor %}
+        {% for row in (0..3) %}
+            {% for i in (0..3) %}
+                vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                add         r{{i | plus: 8}}, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+        mov     r8,     [rdi + 8]           // c ptr
+
+    // tops of cols
+    lea     r8,     [ r8 + 8 * rbx ]
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+
+    {% for quarter in (0..3) %}
+        {% for r in (0..3) %}
+            vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 8}}, {{quarter}}
+        {% endfor %}
+        {% for row in (0..3) %}
+            {% for i in (0..3) %}
+                vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                add         r{{i | plus: 8}}, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"16x12", suffix:suffix, G:G, L:L, arch:"avx512" %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x8.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x8.tmpl
new file mode 100644
index 000000000..69761aaab
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_16x8.tmpl
@@ -0,0 +1,143 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 16 x 8
+
+    zmm0 zmm1 ... zmm8
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+
+{% include "preamble.tmpliq" size:"16x8", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+	cmp rbx, 2
+	jl {{L}}main_loop_packed_packed_tail
+
+{{align}} 16
+{{L}}main_loop_packed_packed:
+	{% include "8x8/packed_packed_loop1/avx-512-unroll.tmpli" %}
+
+    sub             rbx, 2
+	cmp rbx,        2
+	jge              {{L}}main_loop_packed_packed
+
+    test    rbx, rbx
+    jz      {{L}}non_linear_loop
+
+{{align}} 16
+{{L}}main_loop_packed_packed_tail:
+	{% include "8x8/packed_packed_loop1/avx-512.tmpli" %}
+
+	sub             rbx, 1
+    jnz				{{L}}main_loop_packed_packed_tail
+
+    jmp      {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:7 %}
+{% include "f32_per_rows.tmpliq" mr:16, from:0, to:7 %}
+{% include "f32_per_cols.tmpliq" mr:16, from:0, to:7 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:7 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm12, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm13, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+    vperm2f128      ymm13,  ymm12, ymm13,         32 // ymm12 <- xmm12::xmm13
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+{% for i in (0..7) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i}},   zmm{{i}},   zmm12
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         zmm12, zmmword ptr [rax]
+
+{% for i in (0..7) %}
+    vbroadcastss    zmm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     zmm{{i}},   zmm12, zmm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r12,    [ r8 + 4 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+    lea     r13,    [ r12 + rbx ]
+    lea     r14,    [ r12 + 2 * rbx ]
+    lea     r15,    [ r13 + 2 * rbx ]
+    
+    {% for quarter in (0..3) %}
+        {% for r in (0..7) %}
+            vextractf32x4 xmm{{r | plus: 8}}, zmm{{r}}, {{quarter}}
+        {% endfor %}
+        {% for row in (0..3) %}
+            {% for i in (0..7) %}
+                vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 8}}, {{row}}
+                add         r{{i | plus: 8}}, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"16x8", suffix:suffix, G:G, L:L, arch:"avx512" %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x5.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x5.tmpl
new file mode 100644
index 000000000..be4ac53fb
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x5.tmpl
@@ -0,0 +1,144 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 32 x 5:
+
+    zmm0 zmm2 zmm4 zmm6 zmm8
+    zmm1 zmm3 zmm5 zmm7 zmm9
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" size:"32x5", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+	{% include "2x5/packed_packed_loop1/avx-512.tmpli" %}
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:9 %}
+{% include "f32_per_rows.tmpliq" mr:32, from:0, to:9 %}
+{% include "f32_per_cols.tmpliq" mr:32, from:0, to:9 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:9 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm12, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm13, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+    vperm2f128      ymm13,  ymm12, ymm13,         32 // ymm12 <- xmm12::xmm13
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+{% for i in (0..4) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i | times: 2}},   zmm{{i | times: 2}},   zmm12
+{% endfor %}
+
+    imul    esi,    16
+    vpbroadcastd    zmm15, esi
+
+    mov     r10,    [rdi + 8]
+    vpaddd          zmm14, zmm14, zmm15
+
+{% for i in (0..4) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i | times: 2 | plus: 1}},   zmm{{i | times: 2 | plus: 1}},   zmm12
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         zmm12, zmmword ptr [rax]
+    vmovups         zmm13, zmmword ptr [rax+64]
+
+{% for i in (0..4) %}
+    vbroadcastss    zmm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     zmm{{i | times: 2}}, zmm12, zmm14
+    vfmadd231ps     zmm{{i | times: 2 | plus: 1}}, zmm13, zmm14
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+    lea     r12,    [ r10 + 2 * rbx ]
+
+    {% for word in (0..1) %}
+        {% for quarter in (0..3) %}
+            {% for r in (0..4) %}
+                vextractf32x4 xmm{{r | plus: 11}}, zmm{{r | times: 2 | plus: word}}, {{quarter}}
+            {% endfor %}
+            {% for row in (0..3) %}
+                {% for i in (0..4) %}
+                    vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 11}}, {{row}}
+                    add         r{{i | plus: 8}}, rsi
+                {% endfor %}
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}    
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"32x5", suffix:suffix, G:G, L:L, arch:"avx512" %}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x6.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x6.tmpl
new file mode 100644
index 000000000..acca978da
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_32x6.tmpl
@@ -0,0 +1,161 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 32 x 6:
+
+    zmm0 zmm2 zmm4 zmm6 zmm8 zmm10
+    zmm1 zmm3 zmm5 zmm7 zmm9 zmm11
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" size:"32x6", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+	{% include "2x6/packed_packed_loop1/avx-512.tmpli" %}
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:11 %}
+{% include "f32_per_rows.tmpliq" mr:32, from:0, to:11 %}
+{% include "f32_per_cols.tmpliq" mr:32, from:0, to:11 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:11 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm12, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm13, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+    vperm2f128      ymm13,  ymm12, ymm13,         32 // ymm12 <- xmm12::xmm13
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+{% for i in (0..5) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i | times: 2}},   zmm{{i | times: 2}},   zmm12
+{% endfor %}
+
+    mov     r10,    [rdi + 8]
+    imul    esi,    16
+    vpbroadcastd    zmm15, esi
+    vpaddd          zmm14, zmm14, zmm15
+
+{% for i in (0..5) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i | times: 2 | plus: 1}},   zmm{{i | times: 2 | plus: 1}},   zmm12
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         zmm12, zmmword ptr [rax]
+    vmovups         zmm13, zmmword ptr [rax+64]
+
+{% for i in (0..5) %}
+    vbroadcastss    zmm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     zmm{{i | times: 2}}, zmm12, zmm14
+    vfmadd231ps     zmm{{i | times: 2 | plus: 1}}, zmm13, zmm14
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+
+    {% for word in (0..1) %}
+        {% for quarter in (0..3) %}
+            {% for r in (0..2) %}
+                vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 2 | plus: word}}, {{quarter}}
+            {% endfor %}
+            {% for row in (0..3) %}
+                {% for i in (0..2) %}
+                    vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                    add         r{{i | plus: 8}}, rsi
+                {% endfor %}
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}    
+
+    // tops of cols
+    mov     r8, r11
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+
+    {% for word in (0..1) %}
+        {% for quarter in (0..3) %}
+            {% for r in (0..2) %}
+                vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 3 | times: 2 | plus: word}}, {{quarter}}
+            {% endfor %}
+            {% for row in (0..3) %}
+                {% for i in (0..2) %}
+                    vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                    add         r{{i | plus: 8}}, rsi
+                {% endfor %}
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"32x6", suffix:suffix, G:G, L:L, arch:"avx512" %}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_48x4.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_48x4.tmpl
new file mode 100644
index 000000000..6f7e8b456
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_48x4.tmpl
@@ -0,0 +1,148 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 48 x 4:
+
+    zmm0 zmm3 zmm6 zmm9
+    zmm1 zmm4 zmm7 zmm10
+    zmm2 zmm5 zmm8 zmm11
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" size:"48x4", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+	{% include "3x4/packed_packed_loop1/avx-512.tmpli" %}
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:11 %}
+{% include "f32_per_rows.tmpliq" mr:48, from:0, to:11 %}
+{% include "f32_per_cols.tmpliq" mr:48, from:0, to:11 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:11 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm12, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm13, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+    vperm2f128      ymm13,  ymm12, ymm13,         32 // ymm12 <- xmm12::xmm13
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+{% for i in (0..3) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i | times: 3}},   zmm{{i | times: 3}},   zmm12
+{% endfor %}
+
+    imul    esi,    16
+    vpbroadcastd    zmm15, esi
+
+{% for j in (1..2) %}
+    mov     r10,    [rdi + 8]
+    vpaddd          zmm14, zmm14, zmm15
+
+    {% for i in (0..3) %}
+        kxnorw k1,k1,k1
+        vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+        add     r10, rbx
+        vaddps          zmm{{i | times: 3 | plus: j}},   zmm{{i | times: 3 | plus: j}},   zmm12
+    {% endfor %}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         zmm12, zmmword ptr [rax]
+    vmovups         zmm13, zmmword ptr [rax+64]
+    vmovups         zmm15, zmmword ptr [rax+128]
+
+{% for i in (0..3) %}
+    vbroadcastss    zmm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     zmm{{i | times: 3}}, zmm12, zmm14
+    vfmadd231ps     zmm{{i | times: 3 | plus: 1}}, zmm13, zmm14
+    vfmadd231ps     zmm{{i | times: 3 | plus: 2}}, zmm15, zmm14
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+
+    {% for word in (0..2) %}
+        {% for quarter in (0..3) %}
+            {% for r in (0..3) %}
+                vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 3 | plus: word}}, {{quarter}}
+            {% endfor %}
+            {% for row in (0..3) %}
+                {% for i in (0..3) %}
+                    vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                    add         r{{i | plus: 8}}, rsi
+                {% endfor %}
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}    
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"48x4", suffix:suffix, G:G, L:L, arch:"avx512" %}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_64x3.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_64x3.tmpl
new file mode 100644
index 000000000..625f22aaa
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_64x3.tmpl
@@ -0,0 +1,149 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 64 x 3:
+
+    zmm0 zmm4 zmm8
+    zmm1 zmm5 zmm9
+    zmm2 zmm6 zmm10
+    zmm3 zmm7 zmm11
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" size:"64x3", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+	{% include "4x3/packed_packed_loop1/avx-512.tmpli" %}
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:11 %}
+{% include "f32_per_rows.tmpliq" mr:64, from:0, to:11 %}
+{% include "f32_per_cols.tmpliq" mr:64, from:0, to:11 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:11 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm12, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm13, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+    vperm2f128      ymm13,  ymm12, ymm13,         32 // ymm12 <- xmm12::xmm13
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+{% for i in (0..2) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i | times: 4}},   zmm{{i | times: 4}},   zmm12
+{% endfor %}
+
+    imul    esi,    16
+    vpbroadcastd    zmm15, esi
+
+{% for j in (1..3) %}
+    mov     r10,    [rdi + 8]
+    vpaddd          zmm14, zmm14, zmm15
+
+    {% for i in (0..2) %}
+        kxnorw k1,k1,k1
+        vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+        add     r10, rbx
+        vaddps          zmm{{i | times: 4 | plus: j}},   zmm{{i | times: 4 | plus: j}},   zmm12
+    {% endfor %}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vbroadcastss    zmm13, dword ptr [rbx]
+    vbroadcastss    zmm14, dword ptr [rbx+4]
+    vbroadcastss    zmm15, dword ptr [rbx+8]
+
+{% for i in (0..3) %}
+    vmovups         zmm12, zmmword ptr [rax+{{i | times:64}}]
+    vfmadd231ps     zmm{{i}}, zmm12, zmm13
+    vfmadd231ps     zmm{{i | plus: 4}}, zmm12, zmm14
+    vfmadd231ps     zmm{{i | plus: 8}}, zmm12, zmm15
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+
+    {% for word in (0..3) %}
+        {% for quarter in (0..3) %}
+            {% for r in (0..2) %}
+                vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 4 | plus: word}}, {{quarter}}
+            {% endfor %}
+            {% for row in (0..3) %}
+                {% for i in (0..2) %}
+                    vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                    add         r{{i | plus: 8}}, rsi
+                {% endfor %}
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}    
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"64x3", suffix:suffix, G:G, L:L, arch:"avx512" %}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_80x2.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_80x2.tmpl
new file mode 100644
index 000000000..7350b784e
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_f32_80x2.tmpl
@@ -0,0 +1,148 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 80 x 2:
+
+    zmm0 zmm5
+    zmm1 zmm6
+    zmm2 zmm7
+    zmm3 zmm8
+    zmm4 zmm9
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" size:"80x2", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+	{% include "5x2/packed_packed_loop1/avx-512.tmpli" %}
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{% include "f32_scalars.tmpliq" from:0, to:9 %}
+{% include "f32_per_rows.tmpliq" mr:80, from:0, to:9 %}
+{% include "f32_per_cols.tmpliq" mr:80, from:0, to:9 %}
+{% include "avx512_mmm_load_tile.tmpliq" from:0, to:9 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm12, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm13, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+    vperm2f128      ymm13,  ymm12, ymm13,         32 // ymm12 <- xmm12::xmm13
+    vinsertf32x8    zmm14, zmm14, ymm13, 1
+
+{% for i in (0..1) %}
+    kxnorw k1,k1,k1
+    vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+    add     r10, rbx
+    vaddps          zmm{{i | times: 5}},   zmm{{i | times: 5}},   zmm12
+{% endfor %}
+
+    imul    esi,    16
+    vpbroadcastd    zmm15, esi
+
+{% for j in (1..4) %}
+    mov     r10,    [rdi + 8]
+    vpaddd          zmm14, zmm14, zmm15
+
+    {% for i in (0..1) %}
+        kxnorw k1,k1,k1
+        vgatherdps      zmm12{k1},  [ r10 + zmm14 ]
+        add     r10, rbx
+        vaddps          zmm{{i | times: 5 | plus: j}},   zmm{{i | times: 5 | plus: j}},   zmm12
+    {% endfor %}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vbroadcastss    zmm14, dword ptr [rbx]
+    vbroadcastss    zmm15, dword ptr [rbx+4]
+
+{% for i in (0..4) %}
+    vmovups         zmm12, zmmword ptr [rax+{{i | times:64}}]
+    vfmadd231ps     zmm{{i}}, zmm12, zmm14
+    vfmadd231ps     zmm{{i | plus: 5}}, zmm12, zmm15
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+
+    {% for word in (0..4) %}
+        {% for quarter in (0..3) %}
+            {% for r in (0..1) %}
+                vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 5 | plus: word}}, {{quarter}}
+            {% endfor %}
+            {% for row in (0..3) %}
+                {% for i in (0..1) %}
+                    vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                    add         r{{i | plus: 8}}, rsi
+                {% endfor %}
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}    
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"80x2", suffix:suffix, G:G, L:L, arch:"avx512" %}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_load_tile.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_load_tile.tmpliq
new file mode 100644
index 000000000..91c89ee82
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/avx512_mmm_load_tile.tmpliq
@@ -0,0 +1,9 @@
+// vim: set syntax=asm :
+
+{{L}}load_tile:
+    mov          r8, [rdi + 8]
+    {% for reg in (from..to) %}
+        vmovups         zmm{{reg}}, zmmword ptr [r8 + {{ reg|minus:from|times:64 }}]
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/dispatcher.tmpliq
new file mode 100644
index 000000000..1c63f72ad
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/dispatcher.tmpliq
@@ -0,0 +1,40 @@
+// vim: set syntax=asm :
+
+{{L}}non_linear:
+
+{{L}}non_linear_loop_enter:
+    sub     rdi,    40
+{{L}}non_linear_loop:
+    add     rdi,    40
+    mov     rax,    [rdi]
+
+    mov     r8, {{ jump_table | size }}
+    cmp     rax, 0
+    cmovl   rax, r8
+    cmp     rax, {{ jump_table | size }}
+    cmovg   rax, r8
+
+{% if msvc %}
+    lea     r8, [ offset {{L}}jmp_table ]
+{% else %}
+    lea     r8, [ rip + {{L}}jmp_table ]
+{% endif %}
+    movsxd  r9, dword ptr [ r8 + rax * 4 ]
+    lea     r8, [ r8 + r9 ]
+    jmp     r8
+
+{{L}}jmp_table:
+{% for j in jump_table %}
+    {{long}}      {{L}}{{j}}-{{L}}jmp_table
+{% endfor %}
+    {{long}}      {{L}}unsupported-{{L}}jmp_table
+
+{{L}}unsupported:
+    mov     rax,    1
+    jmp     {{L}}return
+
+
+{{L}}done:
+    mov     rax, 0
+    jmp     {{L}}return
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_cols.tmpliq
new file mode 100644
index 000000000..6d4097d41
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_cols.tmpliq
@@ -0,0 +1,8 @@
+// vim: set syntax=asm :
+
+{% include "zmm_per_col.tmpliq" label:"per_col_min", op:"vminps", mr:mr, from:from, to:to %}
+{% include "zmm_per_col.tmpliq" label:"per_col_max", op:"vmaxps", mr:mr, from:from, to:to %}
+{% include "zmm_per_col.tmpliq" label:"per_col_add", op:"vaddps", mr:mr, from:from, to:to %}
+{% include "zmm_per_col.tmpliq" label:"per_col_mul", op:"vmulps", mr:mr, from:from, to:to %}
+{% include "zmm_per_col.tmpliq" label:"per_col_sub", op:"vsubps", from:from, to:to %}
+{% include "zmm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_rows.tmpliq
new file mode 100644
index 000000000..b20fcbbbb
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_per_rows.tmpliq
@@ -0,0 +1,8 @@
+// vim: set syntax=asm :
+
+{% include "zmm_per_row.tmpliq" label:"per_row_min", op:"vminps", mr:mr, from:from, to:to %}
+{% include "zmm_per_row.tmpliq" label:"per_row_max", op:"vmaxps", mr:mr, from:from, to:to %}
+{% include "zmm_per_row.tmpliq" label:"per_row_add", op:"vaddps", mr:mr, from:from, to:to %}
+{% include "zmm_per_row.tmpliq" label:"per_row_mul", op:"vmulps", mr:mr, from:from, to:to %}
+{% include "zmm_per_row.tmpliq" label:"per_row_sub", op:"vsubps", from:from, to:to %}
+{% include "zmm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_scalars.tmpliq
new file mode 100644
index 000000000..7876d6cba
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/f32_scalars.tmpliq
@@ -0,0 +1,29 @@
+// vim: set syntax=asm :
+
+{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vminps", from:from, to:to %}
+{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", from:from, to:to %}
+{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vaddps", from:from, to:to %}
+{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", from:from, to:to %}
+{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", from:from, to:to %}
+{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %}
+
+{{L}}leaky_relu:
+    // can only use zmm12 to zmm15
+    // ymm15 <- alpha
+    vbroadcastss    zmm15, dword ptr [rdi + 8]
+    // ymm14 <- all zero
+    vpxorq          zmm14, zmm14, zmm14
+
+    {% for reg in (from..to) %}
+        vcmpps      k1, zmm{{reg}}, zmm14, 1 // 1 means LT
+        // ymm12 <- alpha * x if < 0
+        vmulps      zmm{{reg}} {k1}, zmm{{reg}}, zmm15
+    {% endfor %}
+    // select muled of orginal
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale:
+{{L}}q_shl:
+{{L}}q_shr:
+    jmp {{L}}unsupported
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_cols.tmpliq
new file mode 100644
index 000000000..789bf77c2
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_cols.tmpliq
@@ -0,0 +1,8 @@
+// vim: set syntax=asm :
+
+{% include "zmm_per_col.tmpliq" label:"per_col_min", op:"vpminsd", mr:mr, from:from, to:to%}
+{% include "zmm_per_col.tmpliq" label:"per_col_max", op:"vpmaxsd", mr:mr, from:from, to:to%}
+{% include "zmm_per_col.tmpliq" label:"per_col_add", op:"vpaddd", mr:mr, from:from, to:to%}
+{% include "zmm_per_col.tmpliq" label:"per_col_mul", op:"vpmulld", mr:mr, from:from, to:to%}
+{% include "zmm_per_col.tmpliq" label:"per_col_sub", op:"vpsubd", from:from, to:to%}
+{% include "zmm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true%}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_rows.tmpliq
new file mode 100644
index 000000000..5e21b01eb
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_per_rows.tmpliq
@@ -0,0 +1,8 @@
+// vim: set syntax=asm :
+
+{% include "zmm_per_row.tmpliq" label:"per_row_min", op:"vpminsd", mr:mr, from:from, to:to%}
+{% include "zmm_per_row.tmpliq" label:"per_row_max", op:"vpmaxsd", mr:mr, from:from, to:to%}
+{% include "zmm_per_row.tmpliq" label:"per_row_add", op:"vpaddd", mr:mr, from:from, to:to%}
+{% include "zmm_per_row.tmpliq" label:"per_row_mul", op:"vpmulld", mr:mr, from:from, to:to%}
+{% include "zmm_per_row.tmpliq" label:"per_row_sub", op:"vpsubd", from:from, to:to%}
+{% include "zmm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true%}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_scalars.tmpliq
new file mode 100644
index 000000000..0b36e4910
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/i32_scalars.tmpliq
@@ -0,0 +1,10 @@
+// vim: set syntax=asm :
+{% unless arch %}
+   {% assign arch = "ymm" %}
+{% endunless %}
+{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vpminsd", from:from, to:to, arch:arch %}
+{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vpmaxsd", from:from, to:to, arch:arch %}
+{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vpmulld", from:from, to:to, arch:arch %}
+{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vpaddd", from:from, to:to, arch:arch %}
+{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vpsubd", from:from, to:to, arch:arch %}
+{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, arch:arch %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/postamble.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/postamble.tmpliq
new file mode 100644
index 000000000..ff3071a71
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/postamble.tmpliq
@@ -0,0 +1,38 @@
+{{L}}return:
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+{% if msvc %}
+{{arch}}_mmm_f32_{{size}}_{{suffix}} endp
+_text ends
+end
+
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/preamble.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/preamble.tmpliq
new file mode 100644
index 000000000..3ed2f7c30
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/preamble.tmpliq
@@ -0,0 +1,63 @@
+{% if msvc %}
+
+_text segment
+{{arch}}_mmm_f32_{{size}}_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}{{arch}}_mmm_f32_{{size}}_{{suffix}}
+{{G}}{{arch}}_mmm_f32_{{size}}_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+{% include "dispatcher.tmpliq" %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/sigmoid_f32.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/sigmoid_f32.tmpl
new file mode 100644
index 000000000..5c962c6f2
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/sigmoid_f32.tmpl
@@ -0,0 +1,324 @@
+{% comment %}
+// vim: set syntax=asm :
+
+
+// TODO[TSolberg] : Not validated.
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15
+    return: rax (+rdx)
+
+{% endcomment %}
+
+{% if msvc %}
+
+_text segment
+avx512_sigmoid_f32_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avx512_sigmoid_f32_{{suffix}}
+{{G}}avx512_sigmoid_f32_{{suffix}}:
+.cfi_startproc
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    // move around arguments to mimick SysV rdi,rsi passing
+    push        rdi
+    push        rsi
+    mov         rdi, rcx
+    mov         rsi, rdx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+// FIXME
+// .cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+// ----------------------------------------------------------------------
+
+{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%} {%endcapture%}
+
+    cmp     rsi, 0
+    je      {{L}}done
+
+    cmp     rsi, 32
+    jl      {{L}}loop_1
+
+{{L}}loop_4:
+
+    vmovaps         zmm4, [rdi]
+    vmovaps         zmm5, [rdi + 64]
+    vmovaps         zmm6, [rdi + 128]
+    vmovaps         zmm7, [rdi + 192]
+
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_low]
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_high]
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9]
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7]
+
+    vmaxps          zmm4, zmm4, zmm0
+    vmaxps          zmm5, zmm5, zmm0
+    vmaxps          zmm6, zmm6, zmm0
+    vmaxps          zmm7, zmm7, zmm0
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5]
+
+    vminps          zmm4, zmm4, zmm1
+    vminps          zmm5, zmm5, zmm1
+    vminps          zmm6, zmm6, zmm1
+    vminps          zmm7, zmm7, zmm1        // zmm4..7 <- x
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3]
+
+    vmulps          zmm8, zmm4, zmm4
+    vmulps          zmm9, zmm5, zmm5
+    vmulps          zmm10, zmm6, zmm6
+    vmulps          zmm11, zmm7, zmm7        // zmm8..11 <- x^2
+
+    vmovaps         zmm12, zmm2
+    vmovaps         zmm13, zmm2
+    vmovaps         zmm14, zmm2
+    vmovaps         zmm15, zmm2
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1]
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vfmadd132ps     zmm13, zmm3, zmm9
+    vfmadd132ps     zmm14, zmm3, zmm10
+    vfmadd132ps     zmm15, zmm3, zmm11
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_10]
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vfmadd132ps     zmm13, zmm0, zmm9
+    vfmadd132ps     zmm14, zmm0, zmm10
+    vfmadd132ps     zmm15, zmm0, zmm11
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_8]
+    vfmadd132ps     zmm12, zmm1, zmm8
+    vfmadd132ps     zmm13, zmm1, zmm9
+    vfmadd132ps     zmm14, zmm1, zmm10
+    vfmadd132ps     zmm15, zmm1, zmm11
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6]
+    vfmadd132ps     zmm12, zmm2, zmm8
+    vfmadd132ps     zmm13, zmm2, zmm9
+    vfmadd132ps     zmm14, zmm2, zmm10
+    vfmadd132ps     zmm15, zmm2, zmm11
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4]
+    vmulps          zmm4, zmm4, zmm12
+    vmulps          zmm5, zmm5, zmm13
+    vmulps          zmm6, zmm6, zmm14
+    vmulps          zmm7, zmm7, zmm15   // zmm4..7 <- num
+
+    vmovaps         zmm12, zmm3
+    vmovaps         zmm13, zmm3
+    vmovaps         zmm14, zmm3
+    vmovaps         zmm15, zmm3
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2]
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vfmadd132ps     zmm13, zmm0, zmm9
+    vfmadd132ps     zmm14, zmm0, zmm10
+    vfmadd132ps     zmm15, zmm0, zmm11
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0]
+    vfmadd132ps     zmm12, zmm1, zmm8
+    vfmadd132ps     zmm13, zmm1, zmm9
+    vfmadd132ps     zmm14, zmm1, zmm10
+    vfmadd132ps     zmm15, zmm1, zmm11
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_half]
+    vfmadd132ps     zmm12, zmm2, zmm8
+    vfmadd132ps     zmm13, zmm2, zmm9
+    vfmadd132ps     zmm14, zmm2, zmm10
+    vfmadd132ps     zmm15, zmm2, zmm11
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vfmadd132ps     zmm13, zmm3, zmm9
+    vfmadd132ps     zmm14, zmm3, zmm10
+    vfmadd132ps     zmm15, zmm3, zmm11
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vfmadd132ps     zmm13, zmm0, zmm9
+    vfmadd132ps     zmm14, zmm0, zmm10
+    vfmadd132ps     zmm15, zmm0, zmm11  // zmm12..14 <- denum
+
+    vdivps          zmm4, zmm4, zmm12
+    vdivps          zmm5, zmm5, zmm13
+    vdivps          zmm6, zmm6, zmm14
+    vdivps          zmm7, zmm7, zmm15
+    vaddps          zmm4, zmm4, zmm1
+    vaddps          zmm5, zmm5, zmm1
+    vaddps          zmm6, zmm6, zmm1
+    vaddps          zmm7, zmm7, zmm1
+
+    vmovaps [rdi], zmm4
+    vmovaps [rdi + 64], zmm5
+    vmovaps [rdi + 128], zmm6
+    vmovaps [rdi + 192], zmm7
+
+    add     rdi, 256
+    sub     rsi, 32
+    cmp     rsi, 32
+    jg      {{L}}loop_4
+
+    cmp     rsi, 0
+    je      {{L}}done
+
+{{L}}loop_1:
+    vmovaps         zmm4, [rdi]
+
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_low]
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_high]
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9]
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7]
+
+    vmaxps          zmm4, zmm4, zmm0
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5]
+
+    vminps          zmm4, zmm4, zmm1        // zmm4 <- x
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3]
+
+    vmulps          zmm8, zmm4, zmm4        // zmm8 <- x^2
+
+    vmovaps         zmm12, zmm2
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1]
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_10]
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_8]
+    vfmadd132ps     zmm12, zmm1, zmm8
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6]
+    vfmadd132ps     zmm12, zmm2, zmm8
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4]
+    vmulps          zmm4, zmm4, zmm12
+
+    vmovaps         zmm12, zmm3
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2]
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0]
+    vfmadd132ps     zmm12, zmm1, zmm8
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_half]
+    vfmadd132ps     zmm12, zmm2, zmm8
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vfmadd132ps     zmm12, zmm0, zmm8
+
+    vdivps          zmm4, zmm4, zmm12
+    vaddps          zmm4, zmm4, zmm1
+
+    vmovaps [rdi], zmm4
+    add     rdi, 32
+    sub     rsi, 8
+    jnz     {{L}}loop_1
+
+{{L}}done:
+
+// ----------------------------------------------------------------------
+
+    ldmxcsr     [rsp + 4]
+
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+{%capture float%}{% if msvc %} real4 {%else%} .float {%endif%}{%endcapture%}
+
+{{L}}coeffs_num_low:
+    {{float}} -18.0                    // low
+{{L}}coeffs_num_high:
+    {{float}} 18.0                     // high
+
+{{L}}coeffs_num_alpha_9:
+    {{float}} 4.37031012579801e-11     // alpha_9
+{{L}}coeffs_num_alpha_7:
+    {{float}} 1.15627324459942e-07     // alpha_7
+{{L}}coeffs_num_alpha_5:
+    {{float}} 6.08574864600143e-05     // alpha_5
+{{L}}coeffs_num_alpha_3:
+    {{float}} 8.51377133304701e-03     // alpha_3
+{{L}}coeffs_num_alpha_1:
+    {{float}} 2.48287947061529e-01     // alpha_1
+
+{{L}}coeffs_num_beta_10:
+    {{float}} 6.10247389755681e-13
+{{L}}coeffs_num_beta_8:
+    {{float}} 5.76102136993427e-09
+{{L}}coeffs_num_beta_6:
+    {{float}} 6.29106785017040e-06     // beta_6
+{{L}}coeffs_num_beta_4:
+    {{float}} 1.70198817374094e-03     // beta_4
+{{L}}coeffs_num_beta_2:
+    {{float}} 1.16817656904453e-01     // beta_2
+{{L}}coeffs_num_beta_0:
+    {{float}} 9.93151921023180e-01     // beta_0
+
+{{L}}coeffs_num_half:
+    {{float}} 0.5
+
+{% if msvc %}
+avx512_sigmoid_f32_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/tanh_f32.tmpl b/vendor/tract-linalg-0.22.1/x86_64/avx512/tanh_f32.tmpl
new file mode 100644
index 000000000..dc4b0f07a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/tanh_f32.tmpl
@@ -0,0 +1,313 @@
+{% comment %}
+// vim: set syntax=asm :
+
+// TODO[TSolberg] : Not validated.
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15
+    return: rax (+rdx)
+
+{% endcomment %}
+
+{% if msvc %}
+
+_text segment
+avx512_tanh_f32_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avx512_tanh_f32_{{suffix}}
+{{G}}avx512_tanh_f32_{{suffix}}:
+.cfi_startproc
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    // move around arguments to mimick SysV rdi,rsi passing
+    push        rdi
+    push        rsi
+    mov         rdi, rcx
+    mov         rsi, rdx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+// FIXME
+// .cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+// ----------------------------------------------------------------------
+
+{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%} {%endcapture%}
+
+    cmp     rsi, 0
+    je      {{L}}done
+
+    cmp     rsi, 32
+    jl      {{L}}loop_1
+
+{{L}}loop_4:
+
+    vmovaps         zmm4, [rdi]
+    vmovaps         zmm5, [rdi + 64]
+    vmovaps         zmm6, [rdi + 128]
+    vmovaps         zmm7, [rdi + 192]
+
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_low]
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_high]
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13]
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11]
+
+    vmaxps          zmm4, zmm4, zmm0
+    vmaxps          zmm5, zmm5, zmm0
+    vmaxps          zmm6, zmm6, zmm0
+    vmaxps          zmm7, zmm7, zmm0
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9]
+
+    vminps          zmm4, zmm4, zmm1
+    vminps          zmm5, zmm5, zmm1
+    vminps          zmm6, zmm6, zmm1
+    vminps          zmm7, zmm7, zmm1        // zmm4..7 <- x
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7]
+
+    vmulps          zmm8, zmm4, zmm4
+    vmulps          zmm9, zmm5, zmm5
+    vmulps          zmm10, zmm6, zmm6
+    vmulps          zmm11, zmm7, zmm7        // zmm8..11 <- x^2
+
+    vmovaps         zmm12, zmm2
+    vmovaps         zmm13, zmm2
+    vmovaps         zmm14, zmm2
+    vmovaps         zmm15, zmm2
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5]
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vfmadd132ps     zmm13, zmm3, zmm9
+    vfmadd132ps     zmm14, zmm3, zmm10
+    vfmadd132ps     zmm15, zmm3, zmm11
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3]
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vfmadd132ps     zmm13, zmm0, zmm9
+    vfmadd132ps     zmm14, zmm0, zmm10
+    vfmadd132ps     zmm15, zmm0, zmm11
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1]
+    vfmadd132ps     zmm12, zmm1, zmm8
+    vfmadd132ps     zmm13, zmm1, zmm9
+    vfmadd132ps     zmm14, zmm1, zmm10
+    vfmadd132ps     zmm15, zmm1, zmm11
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6]
+    vfmadd132ps     zmm12, zmm2, zmm8
+    vfmadd132ps     zmm13, zmm2, zmm9
+    vfmadd132ps     zmm14, zmm2, zmm10
+    vfmadd132ps     zmm15, zmm2, zmm11
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4]
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vfmadd132ps     zmm13, zmm3, zmm9
+    vfmadd132ps     zmm14, zmm3, zmm10
+    vfmadd132ps     zmm15, zmm3, zmm11
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2]
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vfmadd132ps     zmm13, zmm0, zmm9
+    vfmadd132ps     zmm14, zmm0, zmm10
+    vfmadd132ps     zmm15, zmm0, zmm11
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0]
+    vmulps          zmm4, zmm4, zmm12
+    vmulps          zmm5, zmm5, zmm13
+    vmulps          zmm6, zmm6, zmm14
+    vmulps          zmm7, zmm7, zmm15   // zmm4..7 <- num
+
+    vmovaps         zmm12, zmm1
+    vmovaps         zmm13, zmm1
+    vmovaps         zmm14, zmm1
+    vmovaps         zmm15, zmm1
+    vfmadd132ps     zmm12, zmm2, zmm8
+    vfmadd132ps     zmm13, zmm2, zmm9
+    vfmadd132ps     zmm14, zmm2, zmm10
+    vfmadd132ps     zmm15, zmm2, zmm11
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vfmadd132ps     zmm13, zmm3, zmm9
+    vfmadd132ps     zmm14, zmm3, zmm10
+    vfmadd132ps     zmm15, zmm3, zmm11
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vfmadd132ps     zmm13, zmm0, zmm9
+    vfmadd132ps     zmm14, zmm0, zmm10
+    vfmadd132ps     zmm15, zmm0, zmm11  // zmm12..14 <- denum
+
+    vdivps          zmm4, zmm4, zmm12
+    vdivps          zmm5, zmm5, zmm13
+    vdivps          zmm6, zmm6, zmm14
+    vdivps          zmm7, zmm7, zmm15
+
+    vmovaps [rdi], zmm4
+    vmovaps [rdi + 64], zmm5
+    vmovaps [rdi + 128], zmm6
+    vmovaps [rdi + 192], zmm7
+
+    add     rdi, 256
+    sub     rsi, 32
+    cmp     rsi, 32
+    jg      {{L}}loop_4
+
+    cmp     rsi, 0
+    je      {{L}}done
+
+{{L}}loop_1:
+    vmovaps         zmm4, [rdi]
+
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_low]
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_high]
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13]
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11]
+
+    vmaxps          zmm4, zmm4, zmm0
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9]
+
+    vminps          zmm4, zmm4, zmm1        // zmm4 <- x
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7]
+
+    vmulps          zmm8, zmm4, zmm4        // zmm8 <- x^2
+
+    vmovaps         zmm12, zmm2
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5]
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3]
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1]
+    vfmadd132ps     zmm12, zmm1, zmm8
+    vbroadcastss    zmm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6]
+    vfmadd132ps     zmm12, zmm2, zmm8
+    vbroadcastss    zmm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4]
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vbroadcastss    zmm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2]
+    vfmadd132ps     zmm12, zmm0, zmm8
+    vbroadcastss    zmm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0]
+    vmulps          zmm4, zmm4, zmm12
+
+    vmovaps         zmm12, zmm1
+    vfmadd132ps     zmm12, zmm2, zmm8
+    vfmadd132ps     zmm12, zmm3, zmm8
+    vfmadd132ps     zmm12, zmm0, zmm8
+
+    vdivps          zmm4, zmm4, zmm12
+
+    vmovaps [rdi], zmm4
+    add     rdi, 32
+    sub     rsi, 8
+    jnz     {{L}}loop_1
+
+{{L}}done:
+
+// ----------------------------------------------------------------------
+
+    ldmxcsr     [rsp + 4]
+
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+{%capture float%}{% if msvc %} real4 {%else%} .float {%endif%}{%endcapture%}
+
+{{L}}coeffs_num_low:
+    {{float}} -9.0                     // low
+{{L}}coeffs_num_high:
+    {{float}} 9.0                      // high
+
+{{L}}coeffs_num_alpha_13:
+    {{float}} -2.76076847742355e-16    // alpha_13
+{{L}}coeffs_num_alpha_11:
+    {{float}} 2.00018790482477e-13     // alpha_11
+{{L}}coeffs_num_alpha_9:
+    {{float}} -8.60467152213735e-11    // alpha_9
+{{L}}coeffs_num_alpha_7:
+    {{float}} 5.12229709037114e-08     // alpha_7
+{{L}}coeffs_num_alpha_5:
+    {{float}} 1.48572235717979e-05     // alpha_5
+{{L}}coeffs_num_alpha_3:
+    {{float}} 6.37261928875436e-04     // alpha_3
+{{L}}coeffs_num_alpha_1:
+    {{float}} 4.89352455891786e-03     // alpha_1
+
+{{L}}coeffs_num_beta_6:
+    {{float}} 1.19825839466702e-06     // beta_6
+{{L}}coeffs_num_beta_4:
+    {{float}} 1.18534705686654e-04     // beta_4
+{{L}}coeffs_num_beta_2:
+    {{float}} 2.26843463243900e-03     // beta_2
+{{L}}coeffs_num_beta_0:
+    {{float}} 4.89352518554385e-03     // beta_0
+
+{% if msvc %}
+avx512_tanh_f32_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_col.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_col.tmpliq
new file mode 100644
index 000000000..16c9d32eb
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_col.tmpliq
@@ -0,0 +1,29 @@
+// vim: set syntax=asm :
+
+{{L}}{{label}}:
+    mov             rax, [ rdi + 8 ]
+
+{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%}
+{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%}
+
+{%capture tmp%}{{to | plus: 1 }}{%endcapture%}
+
+{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_16}}{%endcapture%}
+{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_16|minus:1}}{%endcapture%}
+// {{to|minus:from|plus:1}} cols:{{cols}}
+
+{% for right in (0..cols_min_1) %}
+    vbroadcastss    zmm{{tmp}}, dword ptr [ rax ]
+    add             rax, 4
+
+    {% for down in (0..mr_over_16_min_1) %}
+        {%capture acc%}{{mr_over_16|times:right|plus:from|plus:down}}{%endcapture%}
+        {% if flipped %}
+            {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{tmp}}
+        {% else %}
+            {{op}} zmm{{acc}}, zmm{{tmp}}, zmm{{acc}}
+        {% endif %}
+    {% endfor %}
+{% endfor %}
+
+    jmp {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_row.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_row.tmpliq
new file mode 100644
index 000000000..f9da1b35f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_per_row.tmpliq
@@ -0,0 +1,23 @@
+// vim: set syntax=asm :
+
+{{L}}{{label}}:
+    mov             rax, [ rdi + 8 ]
+
+{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%}
+{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%}
+
+{% for ix in (0..mr_over_16_min_1) %}
+    vmovups         zmm{{to | plus: 1 | plus: ix}},  [rax + {{ix | times: 64}}]
+{% endfor %}
+
+{% if flipped %}
+    {% for acc in (from..to) %}
+        {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{ acc | modulo: mr_over_16 | plus: to | plus: 1 }}
+    {% endfor %}
+{% else %}
+    {% for acc in (from..to) %}
+        {{op}} zmm{{acc}}, zmm{{ acc | modulo: mr_over_16 | plus: to | plus: 1 }}, zmm{{acc}}
+    {% endfor %}
+{% endif %}
+
+    jmp {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_scalar.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_scalar.tmpliq
new file mode 100644
index 000000000..43373c9d8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/avx512/zmm_scalar.tmpliq
@@ -0,0 +1,15 @@
+// vim: set syntax=asm :
+
+{{L}}{{label}}:
+    vbroadcastss    zmm12, dword ptr [rdi + 8]
+    {% if flipped %}
+        {% for reg in (from..to) %}
+            {{op}}          zmm{{reg}}, zmm{{reg}}, zmm12
+        {% endfor %}
+    {% else %}
+        {% for reg in (from..to) %}
+            {{op}}          zmm{{reg}}, zmm12, zmm{{reg}}
+        {% endfor %}
+    {% endif %}
+
+    jmp    {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..93e56994b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,58 @@
+	// Accumulators: 0-7
+	// Columns: 14-15
+	// Rows: 8-13
+
+	vbroadcastss    ymm15,  dword ptr [rcx]
+
+	vmovaps         ymm10, [rax + 0]
+	vmovaps         ymm11, [rax + 32]
+    vmovaps         ymm12, [rax + 64]
+	vmovaps         ymm13, [rax + 96]
+    vmovaps         ymm14, [rax + 128]
+
+    vfmadd231ps     ymm0, ymm10, ymm15
+    vfmadd231ps     ymm1, ymm11, ymm15
+    vfmadd231ps     ymm2, ymm12, ymm15
+    vfmadd231ps     ymm3, ymm13, ymm15
+    vfmadd231ps     ymm4, ymm14, ymm15
+
+	vmovaps         ymm10, [rax + 160]
+    vmovaps         ymm11, [rax + 192]
+	vmovaps         ymm12, [rax + 224]
+	vmovaps         ymm13, [rax + 256]
+	vmovaps         ymm14, [rax + 288]
+
+    vfmadd231ps     ymm5, ymm10, ymm15
+    vfmadd231ps     ymm6, ymm11, ymm15
+    vfmadd231ps     ymm7, ymm12, ymm15
+    vfmadd231ps     ymm8, ymm13, ymm15
+    vfmadd231ps     ymm9, ymm14, ymm15
+
+	vbroadcastss    ymm15,  dword ptr [rcx + 4]
+
+	vmovaps         ymm10, [rax + 320]
+	vmovaps         ymm11, [rax + 352]
+    vmovaps         ymm12, [rax + 384]
+	vmovaps         ymm13, [rax + 416]
+    vmovaps         ymm14, [rax + 448]
+
+	vfmadd231ps     ymm0, ymm10, ymm15
+    vfmadd231ps     ymm1, ymm11, ymm15
+    vfmadd231ps     ymm2, ymm12, ymm15
+    vfmadd231ps     ymm3, ymm13, ymm15
+    vfmadd231ps     ymm4, ymm14, ymm15
+
+	vmovaps         ymm10, [rax + 480]
+    vmovaps         ymm11, [rax + 512]
+	vmovaps         ymm12, [rax + 544]
+	vmovaps         ymm13, [rax + 576]
+	vmovaps         ymm14, [rax + 608]
+
+    vfmadd231ps     ymm5, ymm10, ymm15
+    vfmadd231ps     ymm6, ymm11, ymm15
+    vfmadd231ps     ymm7, ymm12, ymm15
+    vfmadd231ps     ymm8, ymm13, ymm15
+    vfmadd231ps     ymm9, ymm14, ymm15
+
+    add rcx, 8
+	add rax, 640
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..d29f839e8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/10x1/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,33 @@
+	// Tile size: 10x1
+	// Accumulators: 0-9
+	// Col regs: 10-14
+	// Row regs: 15
+
+	vbroadcastss    ymm15,  dword ptr [rcx]
+
+	vmovaps         ymm10, [rax + 0]
+	vmovaps         ymm11, [rax + 32]
+    vmovaps         ymm12, [rax + 64]
+	vmovaps         ymm13, [rax + 96]
+    vmovaps         ymm14, [rax + 128]
+
+    vfmadd231ps     ymm0, ymm10, ymm15
+    vfmadd231ps     ymm1, ymm11, ymm15
+    vfmadd231ps     ymm2, ymm12, ymm15
+    vfmadd231ps     ymm3, ymm13, ymm15
+    vfmadd231ps     ymm4, ymm14, ymm15
+
+	vmovaps         ymm10, [rax + 160]
+    vmovaps         ymm11, [rax + 192]
+	vmovaps         ymm12, [rax + 224]
+	vmovaps         ymm13, [rax + 256]
+	vmovaps         ymm14, [rax + 288]
+
+    vfmadd231ps     ymm5, ymm10, ymm15
+    vfmadd231ps     ymm6, ymm11, ymm15
+    vfmadd231ps     ymm7, ymm12, ymm15
+    vfmadd231ps     ymm8, ymm13, ymm15
+    vfmadd231ps     ymm9, ymm14, ymm15
+
+    add rcx, 4
+	add rax, 320
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..6cb824665
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,52 @@
+	// Accumulators: 0-9
+	// Columns: 14-15
+	// Rows: 10-13
+    vbroadcastss    ymm10,  dword ptr [rcx]
+    vbroadcastss    ymm11,  dword ptr [rcx + 4]
+    vbroadcastss    ymm12,  dword ptr [rcx + 8]
+    vbroadcastss    ymm13,  dword ptr [rcx + 12]
+
+    vmovaps         ymm14,  [rax]
+    vmovaps         ymm15,  [rax + 32]
+
+    vfmadd231ps     ymm0,   ymm14, ymm10
+    vfmadd231ps     ymm1,   ymm15, ymm10
+
+    vfmadd231ps     ymm2,   ymm14, ymm11
+    vfmadd231ps     ymm3,   ymm15, ymm11
+
+    vbroadcastss    ymm11,  dword ptr [rcx + 16]
+
+    vfmadd231ps     ymm4,   ymm14, ymm12
+    vfmadd231ps     ymm5,   ymm15, ymm12
+
+    vfmadd231ps     ymm6,   ymm14, ymm13
+    vfmadd231ps     ymm7,   ymm15, ymm13
+
+    vfmadd231ps     ymm8,   ymm14, ymm11
+    vfmadd231ps     ymm9,   ymm15, ymm11
+
+    vbroadcastss    ymm10,  dword ptr [rcx + 20]
+    vbroadcastss    ymm11,  dword ptr [rcx + 24]
+    vbroadcastss    ymm12,  dword ptr [rcx + 28]
+    vbroadcastss    ymm13,  dword ptr [rcx + 32]
+
+    vmovaps         ymm14,  [rax + 64]
+    vmovaps         ymm15,  [rax + 96]
+
+    vfmadd231ps     ymm0,   ymm14, ymm10
+    vfmadd231ps     ymm1,   ymm15, ymm10
+
+    vfmadd231ps     ymm2,   ymm14, ymm11
+    vfmadd231ps     ymm3,   ymm15, ymm11
+
+    vbroadcastss    ymm11,  dword ptr [rcx + 36]
+
+    vfmadd231ps     ymm4,   ymm14, ymm12
+    vfmadd231ps     ymm5,   ymm15, ymm12
+
+    vfmadd231ps     ymm6,   ymm14, ymm13
+    vfmadd231ps     ymm7,   ymm15, ymm13
+
+    vfmadd231ps     ymm8,   ymm14, ymm11
+    vfmadd231ps     ymm9,   ymm15, ymm11
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..59a29b6ca
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/2x5/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,30 @@
+	// Accumulators: 0-9
+	// Columns: 14-15
+	// Rows: 10-13
+    vbroadcastss    ymm10,  dword ptr [rcx]
+    vbroadcastss    ymm11,  dword ptr [rcx + 4]
+    vbroadcastss    ymm12,  dword ptr [rcx + 8]
+    vbroadcastss    ymm13,  dword ptr [rcx + 12]
+
+    vmovaps         ymm14,  [rax]
+    vmovaps         ymm15,  [rax + 32]
+
+    vfmadd231ps     ymm0,   ymm14, ymm10
+    vfmadd231ps     ymm1,   ymm15, ymm10
+
+    vfmadd231ps     ymm2,   ymm14, ymm11
+    vfmadd231ps     ymm3,   ymm15, ymm11
+
+	// Use register 11 as it's "middle" use, leading to a decent
+	// trade-off between required use next iteration and when it has
+	// to be used this iteration.
+    vbroadcastss    ymm11,  dword ptr [rcx + 16]
+
+    vfmadd231ps     ymm4,   ymm14, ymm12
+    vfmadd231ps     ymm5,   ymm15, ymm12
+
+    vfmadd231ps     ymm6,   ymm14, ymm13
+    vfmadd231ps     ymm7,   ymm15, ymm13
+
+    vfmadd231ps     ymm8,   ymm14, ymm11
+    vfmadd231ps     ymm9,   ymm15, ymm11
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original-unroll.tmpli
new file mode 100644
index 000000000..c41328bf2
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original-unroll.tmpli
@@ -0,0 +1,71 @@
+	// Tile size: 2x6
+	// Accumulators: 0-11
+	// Col regs: ymm14-15
+	// Row regs: ymm12-13
+
+	vbroadcastss	ymm14,	dword ptr [rcx]
+	vmovaps			ymm12,	[rax]
+	vmovaps			ymm13,	[rax + 32]
+	vbroadcastss	ymm15,	dword ptr [rcx + 4]
+
+	vfmadd231ps		ymm0,	ymm12, ymm14
+	vfmadd231ps		ymm1,	ymm13, ymm14
+
+	vbroadcastss	ymm14,	dword ptr [rcx + 8]
+
+	vfmadd231ps		ymm2,	ymm12, ymm15
+	vfmadd231ps		ymm3,	ymm13, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 12]
+
+	vfmadd231ps		ymm4,	ymm12, ymm14
+	vfmadd231ps		ymm5,	ymm13, ymm14
+
+	vbroadcastss	ymm14,	dword ptr [rcx + 16]
+
+	vfmadd231ps		ymm6,	ymm12, ymm15
+	vfmadd231ps		ymm7,	ymm13, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 20]
+
+	vfmadd231ps		ymm8,	ymm12, ymm14
+	vfmadd231ps		ymm9,	ymm13, ymm14
+
+	vbroadcastss	ymm14,	dword ptr [rcx+24]
+
+	vfmadd231ps		ymm10,	 ymm12, ymm15
+	vfmadd231ps		ymm11,	 ymm13, ymm15
+
+	// Iteration two
+	vmovaps			ymm12,	[rax + 64]
+	vmovaps			ymm13,	[rax + 96]
+	vbroadcastss	ymm15,	dword ptr [rcx + 24 + 4]
+
+	vfmadd231ps		ymm0,	ymm12, ymm14
+	vfmadd231ps		ymm1,	ymm13, ymm14
+
+	vbroadcastss	ymm14,	dword ptr [rcx + 24 + 8]
+
+	vfmadd231ps		ymm2,	ymm12, ymm15
+	vfmadd231ps		ymm3,	ymm13, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 24 + 12]
+
+	vfmadd231ps		ymm4,	ymm12, ymm14
+	vfmadd231ps		ymm5,	ymm13, ymm14
+
+	vbroadcastss	ymm14,	dword ptr [rcx + 24 + 16]
+
+	vfmadd231ps		ymm6,	ymm12, ymm15
+	vfmadd231ps		ymm7,	ymm13, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 24 + 20]
+
+	vfmadd231ps		ymm8,	ymm12, ymm14
+	vfmadd231ps		ymm9,	ymm13, ymm14
+
+	vfmadd231ps		ymm10,	 ymm12, ymm15
+	vfmadd231ps		ymm11,	 ymm13, ymm15
+
+	add rax, 128
+	add rcx, 48
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original.tmpli
new file mode 100644
index 000000000..1c7fc2765
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/2x6/packed_packed_loop1/original.tmpli
@@ -0,0 +1,39 @@
+	// Tile size: 2x6
+	// Accumulators: 0-11
+	// Col regs: ymm14-15
+	// Row regs: ymm12-13
+
+	// Load ordered by earliest use for first 2x2 block
+	vbroadcastss	ymm14,	dword ptr [rcx]
+	vmovaps			ymm12,	[rax]
+	vmovaps			ymm13,	[rax + 32]
+	vbroadcastss	ymm15,	dword ptr [rcx + 4]
+
+	vfmadd231ps		ymm0,	ymm12, ymm14
+	vfmadd231ps		ymm1,	ymm13, ymm14
+
+	vbroadcastss	ymm14,	dword ptr [rcx + 8]
+
+	vfmadd231ps		ymm2,	ymm12, ymm15
+	vfmadd231ps		ymm3,	ymm13, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 12]
+
+	vfmadd231ps		ymm4,	ymm12, ymm14
+	vfmadd231ps		ymm5,	ymm13, ymm14
+
+	vbroadcastss	ymm14,	dword ptr [rcx + 16]
+
+	vfmadd231ps		ymm6,	ymm12, ymm15
+	vfmadd231ps		ymm7,	ymm13, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 20]
+
+	vfmadd231ps		ymm8,	ymm12, ymm14
+	vfmadd231ps		ymm9,	ymm13, ymm14
+
+	vfmadd231ps		ymm10,	 ymm12, ymm15
+	vfmadd231ps		ymm11,	 ymm13, ymm15
+
+	add rax, 64
+	add rcx, 24
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..a9e6ea33f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,60 @@
+	// Tile size: 3x4
+	// Accumulators: 0-11
+	// Col regs: ymm12-14
+	// Row regs: ymm15
+
+	vmovaps			ymm12,	[rax]
+	vmovaps			ymm13,	[rax+32]
+	vmovaps			ymm14,	[rax+64]
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 0]
+
+	vfmadd231ps		ymm0,	ymm12, ymm15
+	vfmadd231ps		ymm1,	ymm13, ymm15
+	vfmadd231ps		ymm2,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 4]
+
+	vfmadd231ps		ymm3,	ymm12, ymm15
+	vfmadd231ps		ymm4,	ymm13, ymm15
+	vfmadd231ps		ymm5,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 8]
+
+	vfmadd231ps		ymm6,	ymm12, ymm15
+	vfmadd231ps		ymm7,	ymm13, ymm15
+	vfmadd231ps		ymm8,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 12]
+
+	vfmadd231ps		ymm9,	ymm12, ymm15
+	vfmadd231ps		ymm10,	 ymm13, ymm15
+	vfmadd231ps		ymm11,	 ymm14, ymm15
+
+	vmovaps			ymm12,	[rax + 96]
+	vmovaps			ymm13,	[rax + 128]
+	vmovaps			ymm14,	[rax + 160]
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 16]
+
+	vfmadd231ps		ymm0,	ymm12, ymm15
+	vfmadd231ps		ymm1,	ymm13, ymm15
+	vfmadd231ps		ymm2,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 20]
+
+	vfmadd231ps		ymm3,	ymm12, ymm15
+	vfmadd231ps		ymm4,	ymm13, ymm15
+	vfmadd231ps		ymm5,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 24]
+
+	vfmadd231ps		ymm6,	ymm12, ymm15
+	vfmadd231ps		ymm7,	ymm13, ymm15
+	vfmadd231ps		ymm8,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 28]
+
+	vfmadd231ps		ymm9,	ymm12, ymm15
+	vfmadd231ps		ymm10,	 ymm13, ymm15
+	vfmadd231ps		ymm11,	 ymm14, ymm15
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..eff5cd237
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/3x4/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,32 @@
+	// Tile size: 3x4
+	// Accumulators: 0-11
+	// Col regs: ymm12-14
+	// Row regs: ymm15
+
+	vmovaps			ymm12,	[rax]
+	vmovaps			ymm13,	[rax+32]
+	vmovaps			ymm14,	[rax+64]
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 0]
+
+	vfmadd231ps		ymm0,	ymm12, ymm15
+	vfmadd231ps		ymm1,	ymm13, ymm15
+	vfmadd231ps		ymm2,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 4]
+
+	vfmadd231ps		ymm3,	ymm12, ymm15
+	vfmadd231ps		ymm4,	ymm13, ymm15
+	vfmadd231ps		ymm5,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 8]
+
+	vfmadd231ps		ymm6,	ymm12, ymm15
+	vfmadd231ps		ymm7,	ymm13, ymm15
+	vfmadd231ps		ymm8,	ymm14, ymm15
+
+	vbroadcastss	ymm15,	dword ptr [rcx + 12]
+
+	vfmadd231ps		ymm9,	ymm12, ymm15
+	vfmadd231ps		ymm10,	 ymm13, ymm15
+	vfmadd231ps		ymm11,	 ymm14, ymm15
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..faaf1ba4d
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,69 @@
+	// Tile size: 4x3
+	// Accumulators: 0-11
+	// Col regs: ymm12
+	// Row regs: ymm13-15
+
+	// Load col of A
+	vmovaps			ymm12,	[rax]
+
+	// Fill 3 cols of B
+	vbroadcastss	ymm13,	dword ptr [rcx + 0]
+	vbroadcastss	ymm14,	dword ptr [rcx + 4]
+	vbroadcastss	ymm15,	dword ptr [rcx + 8]
+
+	// N.B. Stepping cols in inner loop
+	vfmadd231ps		ymm0,	ymm12, ymm13
+	vfmadd231ps		ymm4,	ymm12, ymm14
+	vfmadd231ps		ymm8,	ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+32]
+
+	vfmadd231ps		ymm1,	ymm12, ymm13
+	vfmadd231ps		ymm5,	ymm12, ymm14
+	vfmadd231ps		ymm9,	ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+64]
+
+	vfmadd231ps		ymm2,	ymm12, ymm13
+	vfmadd231ps		ymm6,	ymm12, ymm14
+	vfmadd231ps		ymm10,	 ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+96]
+
+	vfmadd231ps		ymm3,	ymm12, ymm13
+	vfmadd231ps		ymm7,	ymm12, ymm14
+	vfmadd231ps		ymm11,	ymm12, ymm15
+
+	// Load col of A, switching col!
+	vmovaps			ymm13,	[rax + 128]
+
+	// Fill 3 cols of B
+	vbroadcastss	ymm14,	dword ptr [rcx + 12]
+	vbroadcastss	ymm15,	dword ptr [rcx + 16]
+	vbroadcastss	ymm12,	dword ptr [rcx + 20]
+
+	// N.B. Stepping cols in inner loop
+	vfmadd231ps		ymm0,	ymm13, ymm14
+	vfmadd231ps		ymm4,	ymm13, ymm15
+	vfmadd231ps		ymm8,	ymm13, ymm12
+
+	vmovaps			ymm13,	[rax + 160]
+
+	vfmadd231ps		ymm1,	ymm13, ymm14
+	vfmadd231ps		ymm5,	ymm13, ymm15
+	vfmadd231ps		ymm9,	ymm13, ymm12
+
+	vmovaps			ymm13,	[rax + 192]
+
+	vfmadd231ps		ymm2,	ymm13, ymm14
+	vfmadd231ps		ymm6,	ymm13, ymm15
+	vfmadd231ps		ymm10,	 ymm13, ymm12
+
+	vmovaps			ymm13,	[rax + 224]
+
+	vfmadd231ps		ymm3,	ymm13, ymm14
+	vfmadd231ps		ymm7,	ymm13, ymm15
+	vfmadd231ps		ymm11,	ymm13, ymm12
+
+    add             rcx,    24
+    add             rax,    256
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..2a6b43203
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/4x3/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,38 @@
+	// Tile size: 4x3
+	// Accumulators: 0-11
+	// Col regs: ymm12
+	// Row regs: ymm13-15
+
+	// Load col of A
+	vmovaps			ymm12,	[rax]
+
+	// Fill 3 cols of B
+	vbroadcastss	ymm13,	dword ptr [rcx + 0]
+	vbroadcastss	ymm14,	dword ptr [rcx + 4]
+	vbroadcastss	ymm15,	dword ptr [rcx + 8]
+
+	// N.B. Stepping cols in inner loop
+	vfmadd231ps		ymm0,	ymm12, ymm13
+	vfmadd231ps		ymm4,	ymm12, ymm14
+	vfmadd231ps		ymm8,	ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+32]
+
+	vfmadd231ps		ymm1,	ymm12, ymm13
+	vfmadd231ps		ymm5,	ymm12, ymm14
+	vfmadd231ps		ymm9,	ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+64]
+
+	vfmadd231ps		ymm2,	ymm12, ymm13
+	vfmadd231ps		ymm6,	ymm12, ymm14
+	vfmadd231ps		ymm10,	 ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+96]
+
+	vfmadd231ps		ymm3,	ymm12, ymm13
+	vfmadd231ps		ymm7,	ymm12, ymm14
+	vfmadd231ps		ymm11,	ymm12, ymm15
+
+    add             rcx,    12
+    add             rax,    128
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..932763061
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,63 @@
+	// Tile size: 5x2
+	// Accumulators: 0-9
+	// Col regs: ymm10-13
+	// Row regs: ymm14-15
+
+	vmovaps			ymm10,	[rax]
+	vbroadcastss	ymm14,	dword ptr [rcx + 0]
+	vbroadcastss	ymm15,	dword ptr [rcx + 4]
+	vmovaps			ymm11,	[rax + 32]
+
+	// NB stepping column-wise
+	vfmadd231ps		ymm0,	ymm10, ymm14
+	vfmadd231ps		ymm5,	ymm10, ymm15
+
+	vmovaps			ymm12,	[rax + 64]
+
+	vfmadd231ps		ymm1,	ymm11, ymm14
+	vfmadd231ps		ymm6,	ymm11, ymm15
+
+	vmovaps			ymm13,	[rax + 96]
+
+	vfmadd231ps		ymm2,	ymm12, ymm14
+	vfmadd231ps		ymm7,	ymm12, ymm15
+
+	vmovaps			ymm10,	[rax + 128]
+
+	vfmadd231ps		ymm3,	ymm13, ymm14
+	vfmadd231ps		ymm8,	ymm13, ymm15
+
+	vmovaps			ymm11,	[rax + 160]
+
+	vfmadd231ps		ymm4,	ymm10, ymm14
+	vfmadd231ps		ymm9,	ymm10, ymm15
+
+	vbroadcastss	ymm14,	dword ptr [rcx + 8]
+	vbroadcastss	ymm15,	dword ptr [rcx + 12]
+
+	vmovaps			ymm12,	[rax + 192]
+
+	// NB stepping column-wise
+	vfmadd231ps		ymm0,	ymm11, ymm14
+	vfmadd231ps		ymm5,	ymm11, ymm15
+
+	vmovaps			ymm13,	[rax + 224]
+
+	vfmadd231ps		ymm1,	ymm12, ymm14
+	vfmadd231ps		ymm6,	ymm12, ymm15
+
+	vmovaps			ymm10,	[rax + 256]
+
+	vfmadd231ps		ymm2,	ymm13, ymm14
+	vfmadd231ps		ymm7,	ymm13, ymm15
+
+	vmovaps			ymm11,	[rax + 288]
+
+	vfmadd231ps		ymm3,	ymm10, ymm14
+	vfmadd231ps		ymm8,	ymm10, ymm15
+
+	vfmadd231ps		ymm4,	ymm11, ymm14
+	vfmadd231ps		ymm9,	ymm11, ymm15
+
+	add rax, 320
+	add rcx, 16
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..add37cea1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/5x2/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,34 @@
+	// Tile size: 5x2
+	// Accumulators: 0-9
+	// Col regs: ymm10-13
+	// Row regs: ymm14-15
+
+	vmovaps			ymm10,	[rax]
+	vbroadcastss	ymm14,	dword ptr [rcx + 0]
+	vbroadcastss	ymm15,	dword ptr [rcx + 4]
+	vmovaps			ymm11,	[rax + 32]
+
+	// NB stepping column-wise
+	vfmadd231ps		ymm0,	ymm10, ymm14
+	vfmadd231ps		ymm5,	ymm10, ymm15
+
+	vmovaps			ymm12,	[rax + 64]
+
+	vfmadd231ps		ymm1,	ymm11, ymm14
+	vfmadd231ps		ymm6,	ymm11, ymm15
+
+	vmovaps			ymm13,	[rax + 96]
+
+	vfmadd231ps		ymm2,	ymm12, ymm14
+	vfmadd231ps		ymm7,	ymm12, ymm15
+
+	vmovaps			ymm11,	[rax + 128]
+
+	vfmadd231ps		ymm3,	ymm13, ymm14
+	vfmadd231ps		ymm8,	ymm13, ymm15
+
+	vfmadd231ps		ymm4,	ymm11, ymm14
+	vfmadd231ps		ymm9,	ymm11, ymm15
+
+	add rax, 160
+	add rcx, 8
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..0d5f7382e
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,25 @@
+	// Tile size: 6x1
+	// Accumulators: 0-5
+	// Col regs: 6-11
+	// Row regs: 15
+
+
+    vbroadcastss    ymm15,  dword ptr [rcx]
+    vfmadd231ps     ymm0, ymm15, [rax]
+    vfmadd231ps     ymm1, ymm15, [rax + 32]
+    vfmadd231ps     ymm2, ymm15, [rax + 64]
+    vfmadd231ps     ymm3, ymm15, [rax + 96]
+    vfmadd231ps     ymm4, ymm15, [rax + 128]
+    vfmadd231ps     ymm5, ymm15, [rax + 160]
+
+    vbroadcastss    ymm14,  dword ptr [rcx + 4]
+
+    vfmadd231ps     ymm0, ymm14, [rax + 192]
+    vfmadd231ps     ymm1, ymm14, [rax + 224]
+    vfmadd231ps     ymm2, ymm14, [rax + 256]
+    vfmadd231ps     ymm3, ymm14, [rax + 288]
+    vfmadd231ps     ymm4, ymm14, [rax + 320]
+    vfmadd231ps     ymm5, ymm14, [rax + 352]
+
+	add rax, 384
+    add rcx, 8
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..b9eb475e8
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/6x1/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,29 @@
+	// Tile size: 6x1
+	// Accumulators: 0-5
+	// Col regs: 6-11
+	// Row regs: 15
+
+    vbroadcastss    ymm15,  dword ptr [rcx]
+
+	vmovups     ymm10, [rax]
+	vmulps     ymm10, ymm10, ymm15
+	vaddps     ymm0, ymm0, ymm10
+    vmovups     ymm11, [rax + 32]
+	vmulps     ymm11, ymm11, ymm15
+	vaddps     ymm1, ymm1, ymm11
+    vmovups     ymm12, [rax + 64]
+	vmulps     ymm12, ymm12, ymm15
+	vaddps     ymm2, ymm2, ymm12
+    vmovups     ymm13, [rax + 96]
+	vmulps     ymm13, ymm13, ymm15
+	vaddps     ymm3, ymm3, ymm13
+    vmovups     ymm14, [rax + 128]
+	vmulps     ymm14, ymm14, ymm15
+	vaddps     ymm4, ymm4, ymm14
+    vmovups     ymm15, [rax + 160]
+	vmulps     ymm15, ymm15, ymm15
+	vaddps     ymm5, ymm5, ymm15
+
+
+    add rcx, 4
+	add rax, 192
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..885e84add
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,70 @@
+    // Tile size: 6x2
+	// Accumulators: 0-9
+	// Col regs: ymm10-13
+	// Row regs: ymm14-15
+
+	vmovaps         ymm12,  [rax]
+	vbroadcastss    ymm14,  dword ptr [rcx + 0]
+    vbroadcastss    ymm15,  dword ptr [rcx + 4]
+	vmovaps         ymm13,  [rax + 32]
+
+    vfmadd231ps     ymm0,   ymm12, ymm14
+    vfmadd231ps     ymm6,   ymm12, ymm15
+
+	vmovaps         ymm12,  [rax + 64]
+
+    vfmadd231ps     ymm1,   ymm13, ymm14
+    vfmadd231ps     ymm7,   ymm13, ymm15
+
+	vmovaps         ymm13,  [rax + 96]
+
+    vfmadd231ps     ymm2,   ymm12, ymm14
+    vfmadd231ps     ymm8,   ymm12, ymm15
+
+	vmovaps         ymm12,  [rax + 128]
+
+	vfmadd231ps     ymm3,   ymm13, ymm14
+    vfmadd231ps     ymm9,   ymm13, ymm15
+
+	vmovaps         ymm13,  [rax + 160]
+
+	vfmadd231ps     ymm4,   ymm12, ymm14
+    vfmadd231ps     ymm10,  ymm12, ymm15
+
+	vmovaps         ymm12,  [rax + 192]
+	vbroadcastss    ymm14,  dword ptr [rcx + 8]
+
+	vfmadd231ps     ymm5,   ymm13, ymm14
+    vfmadd231ps     ymm11, 	ymm13, ymm15
+
+    vbroadcastss    ymm15,  dword ptr [rcx + 12]
+	vmovaps         ymm13,  [rax + 224]
+
+    vfmadd231ps     ymm0,   ymm12, ymm14
+    vfmadd231ps     ymm6,   ymm12, ymm15
+
+	vmovaps         ymm12,  [rax + 256]
+
+    vfmadd231ps     ymm1,   ymm13, ymm14
+    vfmadd231ps     ymm7,   ymm13, ymm15
+
+	vmovaps         ymm13,  [rax + 288]
+
+    vfmadd231ps     ymm2,   ymm12, ymm14
+    vfmadd231ps     ymm8,   ymm12, ymm15
+
+	vmovaps         ymm12,  [rax + 320]
+
+	vfmadd231ps     ymm3,   ymm13, ymm14
+    vfmadd231ps     ymm9,   ymm13, ymm15
+
+	vmovaps         ymm13,  [rax + 352]
+
+	vfmadd231ps     ymm4,   ymm12, ymm14
+    vfmadd231ps     ymm10,  ymm12, ymm15
+
+	vfmadd231ps     ymm5,   ymm13, ymm14
+    vfmadd231ps     ymm11, 	ymm13, ymm15
+
+	add rax, 384
+	add rcx, 16
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..df8d6f19f
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/6x2/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,38 @@
+    // Tile size: 6x2
+	// Accumulators: 0-11
+	// Col regs: 12-13
+	// Row regs: 14-15
+
+	vmovaps         ymm12,  [rax]
+	vbroadcastss    ymm14,  dword ptr [rcx + 0]
+    vbroadcastss    ymm15,  dword ptr [rcx + 4]
+	vmovaps         ymm13,  [rax + 32]
+
+    vfmadd231ps     ymm0,   ymm12, ymm14
+    vfmadd231ps     ymm6,   ymm12, ymm15
+
+	vmovaps         ymm12,  [rax + 64]
+
+    vfmadd231ps     ymm1,   ymm13, ymm14
+    vfmadd231ps     ymm7,   ymm13, ymm15
+
+	vmovaps         ymm13,  [rax + 96]
+
+    vfmadd231ps     ymm2,   ymm12, ymm14
+    vfmadd231ps     ymm8,   ymm12, ymm15
+
+	vmovaps         ymm12,  [rax + 128]
+
+	vfmadd231ps     ymm3,   ymm13, ymm14
+    vfmadd231ps     ymm9,   ymm13, ymm15
+
+	vmovaps         ymm13,  [rax + 160]
+
+	vfmadd231ps     ymm4,   ymm12, ymm14
+    vfmadd231ps     ymm10,  ymm12, ymm15
+
+	vfmadd231ps     ymm5,   ymm13, ymm14
+    vfmadd231ps     ymm11, 	ymm13, ymm15
+
+	add rcx, 8
+	add rax, 192
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..0c52cbac7
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,37 @@
+	// Tile size: 6x1
+	// Accumulators: 0-5
+	// Col regs: 6-11
+	// Row regs: 15
+    vbroadcastss    ymm15,  dword ptr [rcx]
+
+    vmovaps         ymm6, [rax + 0]
+	vmovaps         ymm7, [rax + 32]
+	vmovaps         ymm8, [rax + 64]
+	vmovaps         ymm9, [rax + 96]
+
+    vfmadd231ps     ymm0, ymm6, ymm15
+    vmovaps         ymm10, [rax + 128]
+
+	vfmadd231ps     ymm1, ymm7, ymm15
+	vmovaps         ymm11, [rax + 160]
+    vfmadd231ps     ymm2, ymm8, ymm15
+	vbroadcastss    ymm14,  dword ptr [rcx+4]
+    vfmadd231ps     ymm3, ymm9, ymm15
+    vmovaps         ymm12, [rax + 192]
+    vfmadd231ps     ymm4, ymm10, ymm15
+	vmovaps         ymm13, [rax + 224]
+    vfmadd231ps     ymm5, ymm11, ymm15
+
+	vmovaps         ymm6, [rax + 256]
+    vfmadd231ps     ymm0, ymm12, ymm14
+	vmovaps         ymm7, [rax + 288]
+    vfmadd231ps     ymm1, ymm13, ymm14
+
+    vmovaps         ymm8, [rax + 128]
+    vfmadd231ps     ymm2, ymm6, ymm14
+
+	vmovaps         ymm9, [rax + 160]
+    vfmadd231ps     ymm3, ymm7, ymm14
+
+    vfmadd231ps     ymm4, ymm8, ymm14
+    vfmadd231ps     ymm5, ymm9, ymm14
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..95cd32307
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/7x1/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,22 @@
+	// Tile size: 6x1
+	// Accumulators: 0-5
+	// Col regs: 6-11
+	// Row regs: 15
+    vbroadcastss    ymm15,  dword ptr [rcx]
+
+    vmovaps         ymm6, [rax + 0]
+	vmovaps         ymm7, [rax + 32]
+	vmovaps         ymm8, [rax + 64]
+	vmovaps         ymm9, [rax + 96]
+
+    vfmadd231ps     ymm0, ymm6, ymm15
+    vfmadd231ps     ymm1, ymm7, ymm15
+
+    vmovaps         ymm10, [rax + 128]
+    vfmadd231ps     ymm2, ymm8, ymm15
+
+	vmovaps         ymm11, [rax + 160]
+    vfmadd231ps     ymm3, ymm9, ymm15
+
+    vfmadd231ps     ymm4, ymm10, ymm15
+    vfmadd231ps     ymm5, ymm11, ymm15
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..2348b2f72
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,48 @@
+	// Accumulators: 0-7
+	// Columns: 14-15
+	// Rows: 8-13
+
+
+    vbroadcastss    ymm15,  dword ptr [rcx]
+    vbroadcastss    ymm14,  dword ptr [rcx + 4]
+
+    vmovaps     ymm8, [rax]
+    vmovaps     ymm9, [rax + 32]
+    vmovaps     ymm10, [rax + 64]
+    vmovaps     ymm11, [rax + 96]
+    vmovaps     ymm12, [rax + 128]
+    vmovaps     ymm13, [rax + 160]
+
+    vfmadd231ps     ymm0, ymm15, ymm8
+    vfmadd231ps     ymm1, ymm15, ymm9
+    vfmadd231ps     ymm2, ymm15, ymm10
+    vfmadd231ps     ymm3, ymm15, ymm11
+    vfmadd231ps     ymm4, ymm15, ymm12
+    vfmadd231ps     ymm5, ymm15, ymm13
+
+	vmovaps     ymm8, [rax + 192]
+    vmovaps     ymm9, [rax + 224]
+    vmovaps     ymm10, [rax + 256]
+    vmovaps     ymm11, [rax + 288]
+    vmovaps     ymm12, [rax + 320]
+    vmovaps     ymm13, [rax + 352]
+
+    vfmadd231ps     ymm6, ymm15, ymm8
+    vfmadd231ps     ymm7, ymm15, ymm9
+    vfmadd231ps     ymm0, ymm14, ymm10
+    vfmadd231ps     ymm1, ymm14, ymm11
+    vfmadd231ps     ymm2, ymm14, ymm12
+    vfmadd231ps     ymm3, ymm14, ymm13
+
+    vmovaps     ymm8, [rax + 384]
+    vmovaps     ymm9, [rax + 416]
+    vmovaps     ymm10, [rax + 448]
+    vmovaps     ymm11, [rax + 480]
+
+    vfmadd231ps     ymm4, ymm14, ymm8
+    vfmadd231ps     ymm5, ymm14, ymm9
+    vfmadd231ps     ymm6, ymm14, ymm10
+    vfmadd231ps     ymm7, ymm14, ymm11
+
+    add rcx, 8
+	add rax, 512
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..c170e664a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/8x1/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,33 @@
+	// Tile size: 8x1
+	// Accumulators: 0-7
+	// Col regs: 8-14
+	// Row regs: 15
+
+	vbroadcastss    ymm15,  dword ptr [rcx]
+
+    vmovaps         ymm8, [rax + 0]
+	vmovaps         ymm9, [rax + 32]
+	vmovaps         ymm10, [rax + 64]
+	vmovaps         ymm11, [rax + 96]
+
+    vfmadd231ps     ymm0, ymm8, ymm15
+    vfmadd231ps     ymm1, ymm9, ymm15
+
+    vmovaps         ymm12, [rax + 128]
+	vmovaps         ymm13, [rax + 160]
+
+    vfmadd231ps     ymm2, ymm10, ymm15
+    vfmadd231ps     ymm3, ymm11, ymm15
+
+    vmovaps         ymm14, [rax + 192]
+	vmovaps         ymm11, [rax + 224]
+
+    vfmadd231ps     ymm4, ymm12, ymm15
+    vfmadd231ps     ymm5, ymm13, ymm15
+
+
+    vfmadd231ps     ymm6, ymm14, ymm15
+    vfmadd231ps     ymm7, ymm11, ymm15
+
+    add rcx, 4
+	add rax, 256
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx-unroll.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx-unroll.tmpli
new file mode 100644
index 000000000..f8e819336
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx-unroll.tmpli
@@ -0,0 +1,58 @@
+	// Tile size: 1x8
+	// Accumulators: 0-7
+	// Col regs: 8-14
+	// Row regs: 15
+
+
+    vmovaps         ymm15,  [rax]
+
+    vbroadcastss    ymm8, dword ptr [rcx + 0 * 4]
+    vfmadd231ps     ymm0, ymm15, ymm8
+
+    vbroadcastss    ymm9, dword ptr [rcx + 1 * 4]
+    vfmadd231ps     ymm1, ymm15, ymm9
+
+    vbroadcastss    ymm10, dword ptr [rcx + 2 * 4]
+    vfmadd231ps     ymm2, ymm15, ymm10
+
+    vbroadcastss    ymm11, dword ptr [rcx + 3 * 4]
+    vfmadd231ps     ymm3, ymm15, ymm11
+
+    vbroadcastss    ymm12, dword ptr [rcx + 4 * 4]
+    vfmadd231ps     ymm4, ymm15, ymm12
+
+    vbroadcastss    ymm13, dword ptr [rcx + 5 * 4]
+    vfmadd231ps     ymm5, ymm15, ymm13
+
+    vbroadcastss    ymm10, dword ptr [rcx + 6 * 4]
+    vfmadd231ps     ymm6, ymm15, ymm10
+
+    vbroadcastss    ymm11, dword ptr [rcx + 7 * 4]
+    vfmadd231ps     ymm7, ymm15, ymm11
+
+
+    vmovaps         ymm15,  [rax]
+
+    vbroadcastss    ymm8, dword ptr [rcx + 0 * 4]
+    vfmadd231ps     ymm0, ymm15, ymm8
+
+    vbroadcastss    ymm9, dword ptr [rcx + 1 * 4]
+    vfmadd231ps     ymm1, ymm15, ymm9
+
+    vbroadcastss    ymm10, dword ptr [rcx + 2 * 4]
+    vfmadd231ps     ymm2, ymm15, ymm10
+
+    vbroadcastss    ymm11, dword ptr [rcx + 3 * 4]
+    vfmadd231ps     ymm3, ymm15, ymm11
+
+    vbroadcastss    ymm12, dword ptr [rcx + 4 * 4]
+    vfmadd231ps     ymm4, ymm15, ymm12
+
+    vbroadcastss    ymm13, dword ptr [rcx + 5 * 4]
+    vfmadd231ps     ymm5, ymm15, ymm13
+
+    vbroadcastss    ymm10, dword ptr [rcx + 6 * 4]
+    vfmadd231ps     ymm6, ymm15, ymm10
+
+    vbroadcastss    ymm11, dword ptr [rcx + 7 * 4]
+    vfmadd231ps     ymm7, ymm15, ymm11
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx.tmpli b/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx.tmpli
new file mode 100644
index 000000000..1af4afecc
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/8x8/packed_packed_loop1/avx.tmpli
@@ -0,0 +1,30 @@
+	// Tile size: 1x8
+	// Accumulators: 0-7
+	// Col regs: 8-14
+	// Row regs: 15
+
+    vmovaps         ymm15,  [rax]
+
+    vbroadcastss    ymm8, dword ptr [rcx + 0 * 4]
+    vfmadd231ps     ymm0, ymm15, ymm8
+
+    vbroadcastss    ymm9, dword ptr [rcx + 1 * 4]
+    vfmadd231ps     ymm1, ymm15, ymm9
+
+    vbroadcastss    ymm10, dword ptr [rcx + 2 * 4]
+    vfmadd231ps     ymm2, ymm15, ymm10
+
+    vbroadcastss    ymm11, dword ptr [rcx + 3 * 4]
+    vfmadd231ps     ymm3, ymm15, ymm11
+
+    vbroadcastss    ymm12, dword ptr [rcx + 4 * 4]
+    vfmadd231ps     ymm4, ymm15, ymm12
+
+    vbroadcastss    ymm13, dword ptr [rcx + 5 * 4]
+    vfmadd231ps     ymm5, ymm15, ymm13
+
+    vbroadcastss    ymm10, dword ptr [rcx + 6 * 4]
+    vfmadd231ps     ymm6, ymm15, ymm10
+
+    vbroadcastss    ymm11, dword ptr [rcx + 7 * 4]
+    vfmadd231ps     ymm7, ymm15, ymm11
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/avx2_mmm_i32_8x8.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/avx2_mmm_i32_8x8.tmpl
new file mode 100644
index 000000000..70c7ba85c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/avx2_mmm_i32_8x8.tmpl
@@ -0,0 +1,682 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 8x8:
+
+    ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% if msvc %}
+
+_text segment
+avx2_mmm_i32_8x8_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}avx2_mmm_i32_8x8_{{suffix}}
+{{G}}avx2_mmm_i32_8x8_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+{% include "dispatcher.tmpliq" %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     r12,    [rdi + 32]   // packing
+    mov     rbx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rcx,    [rdi + 8]    // k
+    test    rcx,    rcx
+    jz      {{L}}non_linear_loop
+
+    cmp     r12, 1
+    je      {{L}}main_loop_packed_packed_i8i8
+
+{{L}}main_loop_packed_packed:
+    vmovaps         ymm12,  [rax]
+
+    {% for i in (0..7) %}
+        vbroadcastss    ymm14, dword ptr [rbx + {{i}} * 4]
+        vpmulld         ymm13, ymm12, ymm14
+        vpaddd          ymm{{i}}, ymm{{i}}, ymm13
+    {% endfor %}
+
+    add             rax,    32
+    add             rbx,    32
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed_i8i8:
+    movq            xmm8, qword ptr [rax]          // read 8 bytes
+    vpmovsxbw       ymm8, xmm8                     // promote byte to i32x8
+
+    vpbroadcastb    ymm9, byte ptr [rbx]           // broadcast 1 byte from B
+    vpbroadcastb    ymm10, byte ptr [rbx + 1]      // broadcast 1 byte from B
+    vpbroadcastb    ymm11, byte ptr [rbx + 2]      // broadcast 1 byte from B
+    vpbroadcastb    ymm12, byte ptr [rbx + 3]      // broadcast 1 byte from B
+    vpmovsxbw       ymm9, xmm9                     // promote byte to i32x8
+    vpmovsxbw       ymm10, xmm10                   // promote byte to i32x8
+    vpmovsxbw       ymm11, xmm11                   // promote byte to i32x8
+    vpmovsxbw       ymm12, xmm12                   // promote byte to i32x8
+
+    vpmullw         ymm9, ymm9, ymm8
+    vpmullw         ymm10, ymm10, ymm8
+    vpmullw         ymm11, ymm11, ymm8
+    vpmullw         ymm12, ymm12, ymm8
+    vpmovsxwd       ymm9, xmm9                     // promote byte to i32x8
+    vpmovsxwd       ymm10, xmm10                   // promote byte to i32x8
+    vpmovsxwd       ymm11, xmm11                   // promote byte to i32x8
+    vpmovsxwd       ymm12, xmm12                   // promote byte to i32x8
+    vpaddd          ymm0, ymm0, ymm9
+    vpaddd          ymm1, ymm1, ymm10
+    vpaddd          ymm2, ymm2, ymm11
+    vpaddd          ymm3, ymm3, ymm12
+
+    vpbroadcastb    ymm9, byte ptr [rbx + 4]
+    vpbroadcastb    ymm10, byte ptr [rbx + 5]
+    vpbroadcastb    ymm11, byte ptr [rbx + 6]
+    vpbroadcastb    ymm12, byte ptr [rbx + 7]
+    vpmovsxbw       ymm9, xmm9
+    vpmovsxbw       ymm10, xmm10
+    vpmovsxbw       ymm11, xmm11
+    vpmovsxbw       ymm12, xmm12
+
+    vpmullw         ymm9, ymm9, ymm8
+    vpmullw         ymm10, ymm10, ymm8
+    vpmullw         ymm11, ymm11, ymm8
+    vpmullw         ymm12, ymm12, ymm8
+    vpmovsxwd       ymm9, xmm9                     // promote byte to i32x8
+    vpmovsxwd       ymm10, xmm10                   // promote byte to i32x8
+    vpmovsxwd       ymm11, xmm11                   // promote byte to i32x8
+    vpmovsxwd       ymm12, xmm12                   // promote byte to i32x8
+    vpaddd          ymm4, ymm4, ymm9
+    vpaddd          ymm5, ymm5, ymm10
+    vpaddd          ymm6, ymm6, ymm11
+    vpaddd          ymm7, ymm7, ymm12
+
+    add             rbx,    8
+    add             rax,    8
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed_i8i8
+
+    jmp             {{L}}non_linear_loop
+
+{% include "fma_mmm_i32_scalars.tmpliq" from:0, to:7 %}
+{% include "fma_mmm_i32_per_rows.tmpliq" mr:8,from:0, to:7 %}
+{% include "fma_mmm_i32_per_cols.tmpliq" mr:8,from:0, to:7 %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:7 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+    mov     r8,     [rdi + 32]          // item size
+
+    cmp     r8,    4
+    je      {{L}}non_linear_addc_i32
+
+{% comment %}
+// This is not great as vgatherdps reads 32-bits values and goes beyond our buffer. Probably harmless though.
+// Commented and replaced with the "mov al" loop beyond to pacify valgrind.
+// ymm14 and ymm15 are the same as in the non_linear_addc_i32 case (compute them before the test right above here.
+// {% for i in (0..7) %}
+//     vpcmpeqd        ymm15, ymm15, ymm15
+//     vgatherdps      ymm12, [ r10 + ymm14 ], ymm15   // 0xxx 1xxx 2xxx 3xxx 4xxx 5xxx 6xxx 7xxx
+//
+//     // we need to go through vpmovsxbd, shuffling naively erases signs
+//     vpshufb         ymm12, ymm12, ymm10             // 0123 0123 0123 0123 4567 4567 4567 4567
+//
+//     vpermd          ymm12, ymm11, ymm12             // 0123 4567
+//     vpmovsxbd       ymm12, xmm12                    // sign extend
+//
+//     vpaddd          ymm{{i}},   ymm{{i}},   ymm12
+//     add             r10, rbx
+// {% endfor %}
+{% endcomment %}
+
+    {% for col in (0..7) %}
+        mov r8, r10
+        {% for half in (0..1) %}
+            {% for lane in (0..3) %}
+                mov al, [ r8 ]
+                add r8, rsi
+                movsx eax, al
+                pinsrd xmm10, eax, {{lane}}
+            {% endfor %}
+            vperm2f128  ymm10,   ymm10,   ymm10,  1
+        {% endfor %}
+        vpaddd ymm{{col}}, ymm{{col}}, ymm10
+        add r10, rbx
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}non_linear_addc_i32:
+
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+    vpermq          ymm14, ymm14, 78 // 0b01001110
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+    vpermq          ymm14, ymm14, 78 // 0b01001110
+
+
+{% if msvc %}
+    vpbroadcastd    ymm10, dword ptr [ offset byte_shuffle ]
+    vmovups         ymm11, dword ptr [ offset i128_shuffle ]
+{% else %}
+    vpbroadcastd    ymm10, [ rip + {{L}}byte_shuffle ]
+    vmovups         ymm11, [ rip + {{L}}i128_shuffle ]
+{% endif %}
+
+{% for i in (0..7) %}
+    vpcmpeqd        ymm15, ymm15, ymm15
+    vgatherdps      ymm12, [ r10 + ymm14 ], ymm15
+    vpaddd          ymm{{i}},   ymm{{i}},   ymm12
+    add             r10, rbx
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{% if msvc %}
+.data
+byte_shuffle dd              201851904 // 0x0c080400
+i128_shuffle dd              0, 4
+.code
+{% else %}
+{{L}}byte_shuffle: .int            201851904 // 0x0c080400
+{{L}}i128_shuffle: .int            0, 4
+{% endif %}
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         ymm12,  [rax]
+
+{% for i in (0..7) %}
+    vbroadcastss    ymm14, dword ptr [rbx + {{i|times:4}} ]
+    vpmulld         ymm15, ymm12, ymm14
+    vpaddd          ymm{{i}}, ymm{{i}}, ymm15
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale:
+    mov             r8, [ rdi + 16 ]        // policy
+    vbroadcastss    ymm8, dword ptr [rdi + 24] // multi
+
+    mov             rax, 1
+    movq            xmm9, rax
+    vpbroadcastq    ymm9, xmm9              // ymm9 <- 1
+
+    mov             rax, [ rdi + 8 ]        // xmm10 <- shift + 31
+    add             rax, 31
+    movq            xmm10, rax
+    vpbroadcastq    ymm10, xmm10
+
+    mov             rax, 1
+    movq            xmm11, rax
+    vpsubq          ymm12, ymm10, ymm9      // shift+31 - 1
+    vpsllq          ymm11, ymm9, xmm12      // ymm11 <- 1 << (shift + 31 - 1)
+
+    cmp     r8, 1
+    je      {{L}}q_scale_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_scale_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_scale_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_scale_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_scale_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_scale_rounding_odd
+
+    jmp    {{L}}unsupported
+
+{{L}}q_scale_rounding_zero:           // signum * ( (abs + nudge) >> shift )
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsubq      ymm14, ymm14, ymm9
+    vpsubq      ymm15, ymm15, ymm9
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_away:           // signum * ( (abs + nudge) >> shift )
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_minus_inf:           // signum * ( (abs << 32 + 1<<30+shift) >> shift )
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    // sign extract for nudging in the right direction
+    vpxor       ymm13, ymm13, ymm13
+    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros)
+    vpsrld      ymm13, ymm13, 31            // then just 0 or 1
+
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_plus_inf:           // signum * ( (abs << 32 + 1<<30+shift) >> shift )
+
+    vpbroadcastd ymm9, xmm9
+
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpxor       ymm13, ymm13, ymm13
+
+    // sign extract for nudging in the right direction
+    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, ..s8 (signums, as all ones or all zeros)
+    vpaddd      ymm13, ymm13, ymm9          // if val >= 0 { 0i32 } else { 1i32 }
+
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
+    vpxor       ymm12, ymm12, ymm12
+    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_even:           // signum * ( (abs + nudge) >> shift )
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpsrlq      ymm12, ymm14, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpaddq      ymm14, ymm14, ymm12
+    vpsubq      ymm14, ymm14, ymm9
+
+    vpsrlq      ymm12, ymm15, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpaddq      ymm15, ymm15, ymm12
+    vpsubq      ymm15, ymm15, ymm9
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale_rounding_odd:           // signum * ( (abs + nudge) >> shift )
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
+    vpmuldq     ymm14, ymm14, ymm8          // ymm14  <- a0*c, a2*c, a4*c, a6*c
+    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
+
+    vpsrlq      ymm12, ymm14, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpsubq      ymm14, ymm14, ymm12
+
+    vpsrlq      ymm12, ymm15, xmm10
+    vpand       ymm12, ymm12, ymm9
+    vpsubq      ymm15, ymm15, ymm12
+
+    vpaddq      ymm14, ymm14, ymm11
+    vpaddq      ymm15, ymm15, ymm11
+
+    vpsrlq      ymm14, ymm14, xmm10
+    vpsrlq      ymm15, ymm15, xmm10
+
+    vpslldq     ymm15, ymm15, 4
+    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_shl:
+    mov             eax, [ rdi + 8 ]        // xmm10 <- -shift (8 times)
+    movd            xmm10, eax
+    vpbroadcastd    ymm10, xmm10
+
+{% for i in (0..7) %}
+    vpsllvd     ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr:
+    mov             r8, [ rdi + 16 ]        // policy
+
+    mov             eax, 1
+    movd            xmm9, eax
+    vpbroadcastd    ymm9, xmm9              // ymm9 <- 1u32 (8 times)
+
+    mov             eax, [ rdi + 8 ]        // xmm10 <- shift (8 times)
+    movd            xmm10, eax
+    vpbroadcastd    ymm10, xmm10
+
+    mov             ebx, 1
+    mov             cl, al
+    sub             cl, 1                  // rcx <- shift -1
+    sal             ebx, cl                // rbx <- (1 << (shift - 1))
+    movd            xmm11, ebx
+    vpbroadcastd    ymm11, xmm11            // ymm11 <- "half"
+
+    vpxor           ymm12, ymm12, ymm12     // ymm12 <- zeroes
+
+    cmp     r8, 1
+    je      {{L}}q_shr_rounding_zero
+    cmp     r8, 2
+    je      {{L}}q_shr_rounding_away
+    cmp     r8, 3
+    je      {{L}}q_shr_rounding_minus_inf
+    cmp     r8, 4
+    je      {{L}}q_shr_rounding_plus_inf
+    cmp     r8, 5
+    je      {{L}}q_shr_rounding_even
+    cmp     r8, 6
+    je      {{L}}q_shr_rounding_odd
+
+    jmp    {{L}}unsupported
+
+{{L}}q_shr_rounding_zero:
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsubd      ymm14, ymm14, ymm9
+    vpaddd      ymm14, ymm14, ymm11
+    vpsravd     ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_away:
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpaddd      ymm14, ymm14, ymm11
+    vpsravd     ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_minus_inf:
+{% for i in (0..7) %}
+    vpsubd  ymm{{i}}, ymm{{i}}, ymm9
+    vpaddd  ymm{{i}}, ymm{{i}}, ymm11
+    vpsravd ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_plus_inf:
+{% for i in (0..7) %}
+    vpaddd  ymm{{i}}, ymm{{i}}, ymm11
+    vpsravd ymm{{i}}, ymm{{i}}, ymm10
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_even:
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsravd ymm13, ymm14, ymm10
+    vpand   ymm13, ymm13, ymm9
+    vpsubd  ymm13, ymm13, ymm9          // nudge = ((abs >>l shift) & 0x01) - 1
+    vpaddd  ymm14, ymm14, ymm13         // add nudge
+    vpaddd  ymm14, ymm14, ymm11         // add half
+    vpsravd ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}q_shr_rounding_odd:
+{% for i in (0..7) %}
+    vpabsd      ymm14, ymm{{i}}
+    vpsravd ymm13, ymm14, ymm10
+    vpand   ymm13, ymm13, ymm9
+    vpsubd  ymm13, ymm12, ymm13          // nudge = - ((abs >>l shift) & 0x01)
+    vpaddd  ymm14, ymm14, ymm13         // add nudge
+    vpaddd  ymm14, ymm14, ymm11         // add half
+    vpsravd ymm14, ymm14, ymm10
+    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
+{% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rdx,    [rdi + 24]          // col stride
+    mov     rcx,    [rdi + 32]          // item size
+
+    cmp     rcx,    4
+    je      {{L}}store_strides_i32
+
+    {% for col in (0..7) %}
+        mov r10, r8
+        {% for row in (0..3) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         byte ptr [r10], bl
+            add         r10, rsi
+        {% endfor %}
+        vperm2f128  ymm{{col}},   ymm{{col}},   ymm{{col}},  1
+        {% for row in (0..3) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         byte ptr [r10], bl
+            add         r10, rsi
+        {% endfor %}
+        add r8, rdx
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_i32:
+    {% for col in (0..7) %}
+        mov r10,    r8
+        {% for row in (0..3) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         dword ptr [r10], ebx
+            add         r10, rsi
+        {% endfor %}
+        vperm2f128  ymm{{col}},   ymm{{col}},   ymm{{col}},  1
+        {% for row in (0..3) %}
+            extractps   ebx, xmm{{col}}, {{row}}
+            mov         dword ptr [r10], ebx
+            add         r10, rsi
+        {% endfor %}
+        add r8, rdx
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}return:
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+
+{{L}}one_32bit:
+{% if msvc %}
+    dd      1
+{% else %}
+    .int    1
+{% endif %}
+
+{% if msvc %}
+avx2_mmm_i32_8x8_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/dispatcher.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/dispatcher.tmpliq
new file mode 100644
index 000000000..1c63f72ad
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/dispatcher.tmpliq
@@ -0,0 +1,40 @@
+// vim: set syntax=asm :
+
+{{L}}non_linear:
+
+{{L}}non_linear_loop_enter:
+    sub     rdi,    40
+{{L}}non_linear_loop:
+    add     rdi,    40
+    mov     rax,    [rdi]
+
+    mov     r8, {{ jump_table | size }}
+    cmp     rax, 0
+    cmovl   rax, r8
+    cmp     rax, {{ jump_table | size }}
+    cmovg   rax, r8
+
+{% if msvc %}
+    lea     r8, [ offset {{L}}jmp_table ]
+{% else %}
+    lea     r8, [ rip + {{L}}jmp_table ]
+{% endif %}
+    movsxd  r9, dword ptr [ r8 + rax * 4 ]
+    lea     r8, [ r8 + r9 ]
+    jmp     r8
+
+{{L}}jmp_table:
+{% for j in jump_table %}
+    {{long}}      {{L}}{{j}}-{{L}}jmp_table
+{% endfor %}
+    {{long}}      {{L}}unsupported-{{L}}jmp_table
+
+{{L}}unsupported:
+    mov     rax,    1
+    jmp     {{L}}return
+
+
+{{L}}done:
+    mov     rax, 0
+    jmp     {{L}}return
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x5.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x5.tmpl
new file mode 100644
index 000000000..8c790e11a
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x5.tmpl
@@ -0,0 +1,143 @@
+{% comment %}
+// vim: set syntax=asm :
+/* mmm 16 x 5:
+
+    ymm0 ymm2 ymm4 ymm6 ymm8
+    ymm1 ymm3 ymm5 ymm7 ymm9
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" type:"f32", size:"16x5", suffix:suffix, G:G %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+    {% include "2x5/packed_packed_loop1/avx.tmpli" %}
+
+    add             rcx,    20
+    add             rax,    64
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+// NON LINEAR / ADDC
+
+{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:9, type:"f32" %}
+{% include "fma_mmm_f32_per_rows.tmpliq" mr:16, from:0, to:9, type:"f32" %}
+{% include "fma_mmm_f32_per_cols.tmpliq" mr:16, from:0, to:9, type:"f32" %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:9 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+
+    lea             r8, [ r10 + rsi * 8 ]
+
+{% for i in (0..4) %}
+    vpcmpeqd        ymm15,  ymm15, ymm15
+    vgatherdps      ymm12,  [ r10 + ymm14 ],      ymm15
+    vpcmpeqd        ymm15,  ymm15, ymm15
+    vgatherdps      ymm13,  [ r8  + ymm14 ],      ymm15
+    add             r10, rbx
+    add             r8, rbx
+    vaddps          ymm{{i | times:2 }},   ymm{{i | times:2}},   ymm12
+    vaddps          ymm{{i | times:2 | plus: 1}}, ymm{{i | times:2 | plus:1 }},   ymm13
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         ymm12,  [rax]
+    vmovups         ymm13,  [rax + 32]
+
+{% for i in (0..4) %}
+    vbroadcastss    ymm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     ymm{{i|times:2}},   ymm12, ymm14
+    vfmadd231ps     ymm{{i|times:2|plus:1}}, ymm13, ymm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r12,    [ r8 + 4 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+    cmp     rbx,    64
+    jne     {{L}}store_strides_generic
+
+    {% for row in (0..1) %}
+        {% for col in (0..4) %}
+            vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:2|plus:row}}
+            add r{{col|plus:8}}, 32
+       {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_generic:
+    // tops of cols
+
+    {% for quarter in (0..3) %}
+        {% if quarter != 0 %}
+            // move next four rows at top (xmm0,2,..10)
+            vperm2f128  ymm0,   ymm0,   ymm1,  {{quarter}}
+            vperm2f128  ymm2,   ymm2,   ymm3,  {{quarter}}
+            vperm2f128  ymm4,   ymm4,   ymm5,  {{quarter}}
+            vperm2f128  ymm6,   ymm6,   ymm7,  {{quarter}}
+            vperm2f128  ymm8,   ymm8,   ymm9,  {{quarter}}
+        {% endif %}
+        {% for row in (0..3) %}
+            {% for i in (0..4) %}
+                vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | times:2}}, {{row}}
+                add         r{{i | plus: 8}}, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" type:"f32", size:"16x5", suffix:suffix, G:G, L:L %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x6.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x6.tmpl
new file mode 100644
index 000000000..1dae2fde1
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_16x6.tmpl
@@ -0,0 +1,131 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 16 x 6:
+
+    ymm0 ymm2 ymm4 ymm6 ymm8 ymm10
+    ymm1 ymm3 ymm5 ymm7 ymm9 ymm11
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" type:"f32", size:"16x6", suffix:suffix, G:G %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+	{% include "2x6/packed_packed_loop1/original.tmpli" %}
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+// NON LINEAR / ADDC
+
+{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:11, type:"f32" %}
+{% include "fma_mmm_f32_per_rows.tmpliq" mr:16, from:0, to:11, type:"f32" %}
+{% include "fma_mmm_f32_per_cols.tmpliq" mr:16, from:0, to:11, type:"f32" %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:11 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+
+    lea             r8, [ r10 + rsi * 8 ]
+
+{% for i in (0..5) %}
+    vpcmpeqd        ymm15,  ymm15, ymm15
+    vgatherdps      ymm12,  [ r10 + ymm14 ],      ymm15
+    vpcmpeqd        ymm15,  ymm15, ymm15
+    vgatherdps      ymm13,  [ r8  + ymm14 ],      ymm15
+    add     		r10, rbx
+    add     		r8, rbx
+    vaddps          ymm{{i | times:2 }},   ymm{{i | times:2}},   ymm12
+    vaddps          ymm{{i | times:2 | plus: 1}}, ymm{{i | times:2 | plus:1 }},   ymm13
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         ymm12,  [rax]
+    vmovups         ymm13,  [rax + 32]
+
+{% for i in (0..5) %}
+    vbroadcastss    ymm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     ymm{{i|times:2}},   ymm12, ymm14
+    vfmadd231ps     ymm{{i|times:2|plus:1}}, ymm13, ymm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r12,    [ r8 + 4 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+    lea     r13,    [ r12 + rbx ]
+
+    {% for quarter in (0..3) %}
+        {% if quarter != 0 %}
+            // move next four rows at top (xmm0,2,..10)
+            vperm2f128  ymm0,   ymm0,   ymm1,  {{quarter}}
+            vperm2f128  ymm2,   ymm2,   ymm3,  {{quarter}}
+            vperm2f128  ymm4,   ymm4,   ymm5,  {{quarter}}
+            vperm2f128  ymm6,   ymm6,   ymm7,  {{quarter}}
+            vperm2f128  ymm8,   ymm8,   ymm9,  {{quarter}}
+            vperm2f128  ymm10,  ymm10,  ymm11, {{quarter}}
+        {% endif %}
+        {% for row in (0..3) %}
+            {% for i in (0..5) %}
+                vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i | times:2}}, {{row}}
+                add         r{{i | plus: 8}}, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" type:"f32", size:"16x6", suffix:suffix, G:G, L:L %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_24x4.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_24x4.tmpl
new file mode 100644
index 000000000..47b6e24ec
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_24x4.tmpl
@@ -0,0 +1,158 @@
+{% comment %}
+// vim: set syntax=asm :
+/* mmm 24 x 4:
+
+    ymm0 ymm3 ymm6 ymm10
+    ymm1 ymm4 ymm7 ymm11
+    ymm2 ymm5 ymm8 ymm12
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" type:"f32", size:"24x4", suffix:suffix, G:G %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+    {% include "3x4/packed_packed_loop1/avx.tmpli" %}
+
+    add             rcx,    16
+    add             rax,    96
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+// NON LINEAR / ADDC
+
+{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:11, type:"f32" %}
+{% include "fma_mmm_f32_per_rows.tmpliq" mr:24, from:0, to:11, type:"f32" %}
+{% include "fma_mmm_f32_per_cols.tmpliq" mr:24, from:0, to:11, type:"f32" %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:11 %}
+
+{{L}}add_unicast:
+
+    mov     r8,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    cmp rsi, 4
+    jne {{L}}unicast_generic
+
+    lea             r9,  [ r8 + rbx ]
+    lea             r10, [ r9 + rbx]
+    lea             r11, [ r10 + rbx ]
+    lea             r12, [ r11 + rbx ]
+
+{% for col in (0..3) %}
+    {% for row in (0..2) %}
+        vmovups ymm12,  [ r{{col|plus:8}} ]
+        add r{{col|plus:8}}, 32
+        vaddps ymm{{col|times:3|plus:row}}, ymm{{col|times:3|plus:row}}, ymm12
+    {% endfor %}
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}unicast_generic:
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+//  mov r12, [0]
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+
+    lea             r9, [ r8 + rsi * 8 ]
+    lea             r10, [ r9 + rsi * 8 ]
+
+{% for col in (0..3) %}
+   {% for row in (0..2) %}
+      vpcmpeqd        ymm15,  ymm15, ymm15
+      vgatherdps      ymm12,  [ r{{row|plus:8}} + ymm14 ], ymm15
+      add r{{row|plus:8}}, rbx
+      vaddps ymm{{col|times:3|plus:row}}, ymm{{col|times:3|plus:row}}, ymm12
+   {% endfor %}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         ymm12,  [rax]
+    vmovups         ymm13,  [rax + 32]
+    vmovups         ymm15,  [rax + 64]
+{% for i in (0..3) %}
+    vbroadcastss    ymm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     ymm{{i|times:3}},   ymm12, ymm14
+    vfmadd231ps     ymm{{i|times:3|plus:1}}, ymm13, ymm14
+    vfmadd231ps     ymm{{i|times:3|plus:2}}, ymm15, ymm14
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    lea     r9,     [ r8  +     rbx ]
+    lea     r10,    [ r8  + 2 * rbx ]
+    lea     r11,    [ r10 +     rbx ]
+
+    cmp         rsi, 4
+    jne         {{L}}store_strides_generic
+
+    {% for col in (0..3) %}
+       {% for row in (0..2) %}
+            vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:3|plus:row}}
+            add r{{col|plus:8}}, 32
+       {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_generic:
+    {% for col in (0..3) %}
+       {% for row in (0..2) %}
+           {% for i in (0..3) %}
+                vextractps  dword ptr [r{{col | plus: 8}}], xmm{{col | times:3 | plus:row}}, {{i}}
+                add         r{{col | plus: 8}}, rsi
+           {% endfor %}
+           vperm2f128  ymm{{col | times:3 | plus:row}}, ymm{{col | times:3 | plus:row}}, ymm{{col | times:3 | plus:row}}, 1
+           {% for i in (0..3) %}
+                vextractps  dword ptr [r{{col | plus: 8}}], xmm{{col | times:3|plus:row}}, {{i}}
+                add         r{{col | plus: 8}}, rsi
+           {% endfor %}
+       {% endfor %}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" type:"f32", size:"24x4", suffix:suffix, G:G, L:L %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x1.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x1.tmpl
new file mode 100644
index 000000000..e4c89bd59
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x1.tmpl
@@ -0,0 +1,368 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 64 x 1
+
+    ymm0
+    ymm1
+    ymm2
+    ymm3
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" type:"f32", size:"32x1", suffix:suffix, G:G %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    mov     r8,    [rdi + 32]   // packing
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+    cmp     r8, 1
+    jz      {{L}}q40f32
+
+    cmp     r8, 2
+    jz      {{L}}q40f16
+
+    cmp     r8, 3
+    jz      {{L}}f16f16
+
+{{align}} 16
+{{L}}main_loop_packed_packed:
+    vbroadcastss    ymm15,  dword ptr [rcx]
+
+    vmovaps     ymm8, [rax]
+    vmovaps     ymm9, [rax + 32]
+    vmovaps     ymm10, [rax + 64]
+    vmovaps     ymm11, [rax + 96]
+
+    vfmadd231ps     ymm0, ymm15, ymm8
+    vfmadd231ps     ymm1, ymm15, ymm9
+    vfmadd231ps     ymm2, ymm15, ymm10
+    vfmadd231ps     ymm3, ymm15, ymm11
+
+    add             rcx, 4
+	add             rax, 128
+    sub             rbx, 1
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{% if msvc %}
+{{L}}q40f32_mask:
+    {{long}} 0F0F0F0Fh
+{{L}}q40f32_eight:
+    {{long}} 08h
+{% else %}
+{{L}}q40f32_mask:
+    {{long}} 0x0F0F0F0F
+{{L}}q40f32_eight:
+    {{long}} 8
+{% endif %}
+
+{{L}}q40f32:
+    // ymm0-3: acc
+    // ymm4-7: scales
+    // ymm13: 8
+    // ymm14: mask
+    // ymm15: b value
+    vbroadcastss    ymm14, dword ptr [{{offset}} {{L}}q40f32_mask]
+    vbroadcastss    ymm13, dword ptr [{{offset}} {{L}}q40f32_eight]
+
+{{L}}q40f32_outerloop:
+    // scales
+    vmovaps         xmm4, [rax]
+    vmovaps         xmm5, [rax + 16]
+    vmovaps         xmm6, [rax + 32]
+    vmovaps         xmm7, [rax + 48]
+    vcvtph2ps       ymm4, xmm4
+    vcvtph2ps       ymm5, xmm5
+    vcvtph2ps       ymm6, xmm6
+    vcvtph2ps       ymm7, xmm7
+    add             rax, 64
+
+    mov             rdx, 32
+
+{{L}}q40f32_innerloop:
+    vbroadcastss    ymm15, dword ptr [rcx]
+    vmovaps         xmm8, [rax]            // 32 nibbles
+
+    vpand           xmm10, xmm8, xmm14      // 16 bytes
+
+    vpmovzxbd       ymm9, xmm10            // 8 u32
+
+    vpermilpd       xmm10, xmm10, 1        // swap 64bit halves
+    vpmovzxbd       ymm10, xmm10            // 8 u32
+
+    vpsrlw          xmm8, xmm8, 4
+    vpand           xmm12, xmm8, xmm14      // 16 bytes
+    vpmovzxbd       ymm11, xmm12            // 8 u32
+    vpermilpd       xmm12, xmm12, 1        // swap 64bit halves
+    vpmovzxbd       ymm12, xmm12            // 8 u32
+
+    vpsubd          ymm9, ymm9, ymm13
+    vpsubd          ymm10, ymm10, ymm13
+    vpsubd          ymm11, ymm11, ymm13
+    vpsubd          ymm12, ymm12, ymm13
+
+    vcvtdq2ps       ymm9, ymm9
+    vcvtdq2ps       ymm10, ymm10
+    vcvtdq2ps       ymm11, ymm11
+    vcvtdq2ps       ymm12, ymm12
+
+    vmulps          ymm9, ymm9, ymm4
+    vmulps          ymm10, ymm10, ymm5
+    vmulps          ymm11, ymm11, ymm6
+    vmulps          ymm12, ymm12, ymm7
+
+    vfmadd231ps     ymm0, ymm15, ymm9
+    vfmadd231ps     ymm1, ymm15, ymm10
+    vfmadd231ps     ymm2, ymm15, ymm11
+    vfmadd231ps     ymm3, ymm15, ymm12
+
+    add             rax, 16
+    add             rcx, 4
+    sub             rdx, 1
+    jnz             {{L}}q40f32_innerloop
+
+    sub             rbx, 32
+    jnz             {{L}}q40f32_outerloop
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}q40f16:
+    // ymm0-3: acc
+    // ymm4-7: scales
+    // ymm13: 8
+    // ymm14: mask
+    // ymm15: b value
+    vbroadcastss    ymm14, dword ptr [{{offset}} {{L}}q40f32_mask]
+    vbroadcastss    ymm13, dword ptr [{{offset}} {{L}}q40f32_eight]
+
+{{L}}q40f16_outerloop:
+    // scales
+    vmovaps         xmm4, [rax]
+    vmovaps         xmm5, [rax + 16]
+    vmovaps         xmm6, [rax + 32]
+    vmovaps         xmm7, [rax + 48]
+    vcvtph2ps       ymm4, xmm4
+    vcvtph2ps       ymm5, xmm5
+    vcvtph2ps       ymm6, xmm6
+    vcvtph2ps       ymm7, xmm7
+    add             rax, 64
+
+    mov             rdx, 32
+
+{{L}}q40f16_innerloop:
+    vpbroadcastw    ymm15, word ptr [rcx]
+    vcvtph2ps       ymm15, xmm15
+
+    vmovaps         xmm8, [rax]            // 32 nibbles
+
+    vpand           xmm10, xmm8, xmm14      // 16 bytes
+
+    vpmovzxbd       ymm9, xmm10            // 8 u32
+
+    vpermilpd       xmm10, xmm10, 1        // swap 64bit halves
+    vpmovzxbd       ymm10, xmm10            // 8 u32
+
+    vpsrlw          xmm8, xmm8, 4
+    vpand           xmm12, xmm8, xmm14      // 16 bytes
+    vpmovzxbd       ymm11, xmm12            // 8 u32
+    vpermilpd       xmm12, xmm12, 1        // swap 64bit halves
+    vpmovzxbd       ymm12, xmm12            // 8 u32
+
+    vpsubd          ymm9, ymm9, ymm13
+    vpsubd          ymm10, ymm10, ymm13
+    vpsubd          ymm11, ymm11, ymm13
+    vpsubd          ymm12, ymm12, ymm13
+
+    vcvtdq2ps       ymm9, ymm9
+    vcvtdq2ps       ymm10, ymm10
+    vcvtdq2ps       ymm11, ymm11
+    vcvtdq2ps       ymm12, ymm12
+
+    vmulps          ymm9, ymm9, ymm4
+    vmulps          ymm10, ymm10, ymm5
+    vmulps          ymm11, ymm11, ymm6
+    vmulps          ymm12, ymm12, ymm7
+
+    vfmadd231ps     ymm0, ymm15, ymm9
+    vfmadd231ps     ymm1, ymm15, ymm10
+    vfmadd231ps     ymm2, ymm15, ymm11
+    vfmadd231ps     ymm3, ymm15, ymm12
+
+    add             rax, 16
+    add             rcx, 2
+    sub             rdx, 1
+    jnz             {{L}}q40f16_innerloop
+
+    sub             rbx, 32
+    jnz             {{L}}q40f16_outerloop
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}f16f16:
+{{align}} 16
+    vpbroadcastw    ymm15, word ptr [rcx]
+
+    vmovaps     xmm4, [rax]
+    vmovaps     xmm5, [rax + 16]
+    vmovaps     xmm6, [rax + 32]
+    vmovaps     xmm7, [rax + 48]
+
+    vcvtph2ps       ymm15, xmm15
+    vcvtph2ps       ymm4, xmm4
+    vcvtph2ps       ymm5, xmm5
+    vcvtph2ps       ymm6, xmm6
+    vcvtph2ps       ymm7, xmm7
+
+    vfmadd231ps     ymm0, ymm15, ymm4
+    vfmadd231ps     ymm1, ymm15, ymm5
+    vfmadd231ps     ymm2, ymm15, ymm6
+    vfmadd231ps     ymm3, ymm15, ymm7
+
+    add             rcx, 2
+	add             rax, 64
+    sub             rbx, 1
+    jnz             {{L}}f16f16
+
+    jmp             {{L}}non_linear_loop
+
+{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:3, type:"f32" %}
+{% include "fma_mmm_f32_per_rows.tmpliq" mr:32, from:0, to:3, type:"f32" %}
+{% include "fma_mmm_f32_per_cols.tmpliq" mr:32, from:0, to:3, type:"f32" %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:3 %}
+
+{{L}}add_unicast:
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+
+	cmp rsi, 4
+	jne {{L}}add_unicast_generic
+
+    {% for row in (0..3) %}
+        vaddps ymm{{row}}, ymm{{row}}, [ r10 + {{row|times:32}} ]
+    {% endfor %}
+    jmp    {{L}}non_linear_loop
+
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_unicast_generic:
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+
+{% for i in (0..3) %}
+    vpcmpeqd        ymm15,  ymm15, ymm15
+    vgatherdps      ymm12,  [ r10 + ymm14 ], ymm15
+
+    vaddps          ymm{{i}},   ymm{{i}},   ymm12
+    lea             r10, [ r10 + rsi * 8 ]
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vbroadcastss    ymm14, dword ptr [rbx]
+
+{% for i in (0..3) %}
+    vmovups         ymm12,  [rax + {{i|times:32}}]
+    vfmadd231ps     ymm{{i}}, ymm12, ymm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     r11,    [rdi + 32]          // item size
+
+    cmp     r11, 2
+    je      {{L}}store_f16
+
+	cmp rsi, 4
+	jne {{L}}store_generic
+
+	{% for row in (0..3) %}
+        vmovups [r8 + {{row|times:32}}], ymm{{row}}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_generic:
+
+    {% for vec in (0..3) %}
+        {% for half in (0..1) %}
+            {% if half == 0 %}
+                movaps xmm9, xmm{{vec}}
+            {% else %}
+                vperm2f128 ymm9, ymm{{vec}}, ymm{{vec}}, 1
+            {% endif %}
+            {% for row in (0..3) %}
+                vextractps  dword ptr [r8], xmm9, {{row}}
+                add         r8, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}store_f16:
+
+    vcvtps2ph   xmm0, ymm0, 0
+    vcvtps2ph   xmm1, ymm1, 0
+    vcvtps2ph   xmm2, ymm2, 0
+    vcvtps2ph   xmm3, ymm3, 0
+
+    cmp         rsi, 2
+	jne {{L}}store_generic_f16
+
+	{% for row in (0..3) %}
+        vmovups [r8 + {{row|times:16}}], xmm{{row}}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+    
+{{L}}store_generic_f16:
+
+    {% for vec in (0..3) %}
+        {% for row in (0..7) %}
+            pextrw      word ptr [r8], xmm{{vec}}, {{row}}
+            add         r8, rsi
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" type:"f32", size:"32x1", suffix:suffix, G:G, L:L %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x3.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x3.tmpl
new file mode 100644
index 000000000..0675bc6b9
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_32x3.tmpl
@@ -0,0 +1,239 @@
+{% comment %}
+// vim: set syntax=asm :
+/* mmm 16 x 5:
+
+    ymm0 ymm4 ymm8
+    ymm1 ymm5 ymm9
+    ymm2 ymm6 ymm10
+    ymm3 ymm7 ymm11
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" type:"f32", size:"32x3", suffix:suffix, G:G %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rbx,    [rdi + 8]    // k
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     r8,    [rdi + 32]   // packing
+
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+    cmp     r8, 1
+    jz      {{L}}main_loop_packed_packed_f32_f16
+
+{{L}}main_loop_packed_packed:
+    {% include "4x3/packed_packed_loop1/avx.tmpli" %}
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed_f32_f16:
+	// Load col of A
+	vmovaps			ymm12,	[rax]
+
+	// Fill 3 cols of B
+    vpbroadcastw    xmm13,  word ptr [rcx + 0]
+    vpbroadcastw    xmm14,  word ptr [rcx + 2]
+    vpbroadcastw    xmm15,  word ptr [rcx + 4]
+
+    vcvtph2ps       ymm13, xmm13
+    vcvtph2ps       ymm14, xmm14
+    vcvtph2ps       ymm15, xmm15
+
+	// N.B. Stepping cols in inner loop
+	vfmadd231ps		ymm0,	ymm12, ymm13
+	vfmadd231ps		ymm4,	ymm12, ymm14
+	vfmadd231ps		ymm8,	ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+32]
+
+	vfmadd231ps		ymm1,	ymm12, ymm13
+	vfmadd231ps		ymm5,	ymm12, ymm14
+	vfmadd231ps		ymm9,	ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+64]
+
+	vfmadd231ps		ymm2,	ymm12, ymm13
+	vfmadd231ps		ymm6,	ymm12, ymm14
+	vfmadd231ps		ymm10,	 ymm12, ymm15
+
+	vmovaps			ymm12,	[rax+96]
+
+	vfmadd231ps		ymm3,	ymm12, ymm13
+	vfmadd231ps		ymm7,	ymm12, ymm14
+	vfmadd231ps		ymm11,	ymm12, ymm15
+
+    add             rcx,    6
+    add             rax,    128
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed_f32_f16
+
+    jmp             {{L}}non_linear_loop
+
+// NON LINEAR / ADDC
+
+{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:11, type:"f32" %}
+{% include "fma_mmm_f32_per_rows.tmpliq" mr:32, from:0, to:11, type:"f32" %}
+{% include "fma_mmm_f32_per_cols.tmpliq" mr:32, from:0, to:11, type:"f32" %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:11 %}
+
+{{L}}add_unicast:
+    mov     r8,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    cmp     rsi, 4
+    jne     {{L}}unicast_generic
+
+    lea             r9,  [ r8 + rbx ]
+    lea             r10, [ r9 + rbx]
+    lea             r11, [ r10 + rbx ]
+
+{% for col in (0..2) %}
+    {% for row in (0..3) %}
+        vmovups     ymm12, [ r{{col|plus:8}} ]
+        add         r{{col|plus:8}}, 32
+        vaddps      ymm{{col|times:4|plus:row}}, ymm{{col|times:4|plus:row}}, ymm12
+    {% endfor %}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}unicast_generic:
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+//  mov r12, [0]
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+
+    lea             r9,  [ r8 + rsi * 8 ]
+    lea             r10, [ r9 + rsi * 8 ]
+    lea             r11, [ r10 + rsi * 8 ]
+
+{% for col in (0..2) %}
+   {% for row in (0..3) %}
+      vpcmpeqd      ymm15,  ymm15, ymm15
+      vgatherdps    ymm12,  [ r{{row|plus:8}} + ymm14 ], ymm15
+      add           r{{row|plus:8}}, rbx
+      vaddps        ymm{{col|times:4|plus:row}}, ymm{{col|times:4|plus:row}}, ymm12
+   {% endfor %}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vbroadcastss    ymm13, dword ptr [rbx]
+    vbroadcastss    ymm14, dword ptr [rbx + 4]
+    vbroadcastss    ymm15, dword ptr [rbx + 8]
+{% for i in (0..3) %}
+    vmovups         ymm12,  [rax + {{i|times:32}}]
+    vfmadd231ps     ymm{{0|plus:i}}, ymm12, ymm13
+    vfmadd231ps     ymm{{4|plus:i}}, ymm12, ymm14
+    vfmadd231ps     ymm{{8|plus:i}}, ymm12, ymm15
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+    mov     r11,    [rdi + 32]          // item size
+
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+
+    cmp     r11, 2
+    je      {{L}}store_f16
+
+    cmp         rsi, 4
+    jne         {{L}}store_strides_generic
+
+    {% for col in (0..2) %}
+        {% for row in (0..3) %}
+            vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:4|plus:row}}
+            add     r{{col|plus:8}}, 32
+       {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_generic:
+
+    {% for col in (0..2) %}
+       {% for row in (0..3) %}
+           {% for i in (0..3) %}
+                vextractps  dword ptr [r{{col | plus: 8}}], xmm{{col | times:4 | plus:row}}, {{i}}
+                add         r{{col | plus: 8}}, rsi
+           {% endfor %}
+           vperm2f128  ymm{{col | times:4 | plus:row}}, ymm{{col | times:4 | plus:row}}, ymm{{col | times:4 | plus:row}}, 1
+           {% for i in (0..3) %}
+                vextractps  dword ptr [r{{col | plus: 8}}], xmm{{col | times:4|plus:row}}, {{i}}
+                add         r{{col | plus: 8}}, rsi
+           {% endfor %}
+       {% endfor %}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_f16:
+
+    {% for reg in (0..11) %}
+        vcvtps2ph   xmm{{reg}}, ymm{{reg}}, 0
+    {% endfor %}
+
+    cmp         rsi, 2
+	jne {{L}}store_generic_f16
+
+    {% for col in (0..2) %}
+        {% for row in (0..3) %}
+            vmovups [r{{col|plus:8}} + {{row|times:16}}], xmm{{col|times:4|plus:row}}
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+    
+{{L}}store_generic_f16:
+    {% for col in (0..2) %}
+        {% for vec in (0..3) %}
+            {% for row in (0..7) %}
+                pextrw  word ptr [r{{col|plus:8}}], xmm{{col|times:4|plus:vec}}, {{row}}
+                add         r{{col|plus:8}}, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" type:"f32", size:"32x3", suffix:suffix, G:G, L:L %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_40x2.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_40x2.tmpl
new file mode 100644
index 000000000..81a47ef0c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_40x2.tmpl
@@ -0,0 +1,158 @@
+{% comment %}
+// vim: set syntax=asm :
+/* mmm 40 x 5:
+
+    ymm0 ymm5
+    ymm1 ymm6
+    ymm2 ymm7
+    ymm3 ymm8
+    ymm4 ymm9
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" type:"f32", size:"40x2", suffix:suffix, G:G %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+    {% include "5x2/packed_packed_loop1/avx.tmpli" %}
+
+    dec             rbx
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+// NON LINEAR / ADDC
+
+{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:9, type:"f32" %}
+{% include "fma_mmm_f32_per_rows.tmpliq" mr:40, from:0, to:9, type:"f32" %}
+{% include "fma_mmm_f32_per_cols.tmpliq" mr:40, from:0, to:9, type:"f32" %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:9 %}
+
+{{L}}add_unicast:
+    mov     r8,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    cmp rsi, 4
+    jne {{L}}unicast_generic
+
+    lea             r9,  [ r8 + rbx ]
+    lea             r10, [ r9 + rbx]
+    lea             r11, [ r10 + rbx ]
+    lea             r12, [ r11 + rbx ]
+
+
+{% for col in (0..1) %}
+    {% for row in (0..4) %}
+        vmovups ymm12,  [ r{{col|plus:8}} ]
+        add		r{{col|plus:8}}, 32
+        vaddps 	ymm{{col|times:5|plus:row}}, ymm{{col|times:5|plus:row}}, ymm12
+    {% endfor %}
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}unicast_generic:
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+
+    lea             r9,  [ r8 + rsi * 8]
+    lea             r10, [ r9 + rsi * 8]
+    lea             r11, [ r10 + rsi * 8]
+    lea             r12, [ r11 + rsi * 8]
+
+{% for col in (0..1) %}
+   {% for row in (0..4) %}
+      vpcmpeqd        ymm15, ymm15, ymm15
+      vgatherdps      ymm12, [ r{{row|plus:8}} + ymm14 ], ymm15
+      add 			  r{{row|plus:8}}, 	rbx
+      vaddps 		  ymm{{col|times:5|plus:row}}, ymm{{col|times:5|plus:row}}, ymm12
+   {% endfor %}
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vbroadcastss    ymm10, dword ptr [rbx]
+    vbroadcastss    ymm11, dword ptr [rbx + 4]
+{% for i in (0..4) %}
+    vmovups         ymm12,  [rax + {{i|times:32}}]
+    vfmadd231ps     ymm{{0|plus:i}}, ymm12, ymm10
+    vfmadd231ps     ymm{{5|plus:i}}, ymm12, ymm11
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    lea     r9,     [ r8  +     rbx ]
+    lea     r10,    [ r8  + 2 * rbx ]
+    lea     r11,    [ r10 +     rbx ]
+    lea     r12,    [ r10 + 2 * rbx ]
+
+    cmp         rsi, 4
+    jne         {{L}}store_strides_generic
+
+    {% for col in (0..1) %}
+       {% for row in (0..4) %}
+            vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:5|plus:row}}
+            add 	r{{col|plus:8}}, 32
+       {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_strides_generic:
+    {% for col in (0..1) %}
+       {% for row in (0..4) %}
+           {% for i in (0..3) %}
+                vextractps  dword ptr [r{{col | plus: 8}}], xmm{{col | times:5 | plus:row}}, {{i}}
+                add         r{{col | plus: 8}}, rsi
+           {% endfor %}
+           vperm2f128  ymm{{col | times:5 | plus:row}}, ymm{{col | times:5 | plus:row}}, ymm{{col | times:5 | plus:row}}, 1
+           {% for i in (0..3) %}
+                vextractps  dword ptr [r{{col | plus: 8}}], xmm{{col | times:5|plus:row}}, {{i}}
+                add         r{{col | plus: 8}}, rsi
+           {% endfor %}
+       {% endfor %}
+    {% endfor %}
+    jmp     {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" type:"f32", size:"40x2", suffix:suffix, G:G, L:L %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_64x1.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_64x1.tmpl
new file mode 100644
index 000000000..55b7e59de
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_64x1.tmpl
@@ -0,0 +1,142 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 64 x 1
+
+    ymm0
+    ymm1
+    ...
+    ymm8
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" type:"f32", size:"64x1", suffix:suffix, G:G %}
+
+{{L}}clear:
+    vzeroall
+    jmp     {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rcx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rbx,    [rdi + 8]    // k
+    test    rbx,    rbx
+    jz      {{L}}non_linear_loop
+
+	test rbx, 1
+	jz {{L}}main_loop_packed_packed
+	{% include "8x1/packed_packed_loop1/avx.tmpli" %}
+
+    dec             rbx
+    jz              {{L}}non_linear_loop
+
+{{align}} 16
+{{L}}main_loop_packed_packed:
+	{% include "8x1/packed_packed_loop1/avx-unroll.tmpli" %}
+
+    sub             rbx, 2
+    jnz             {{L}}main_loop_packed_packed
+
+    jmp             {{L}}non_linear_loop
+
+{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:7, type:"f32" %}
+{% include "fma_mmm_f32_per_rows.tmpliq" mr:64, from:0, to:7, type:"f32" %}
+{% include "fma_mmm_f32_per_cols.tmpliq" mr:64, from:0, to:7, type:"f32" %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:7 %}
+
+{{L}}add_unicast:
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+
+	cmp rsi, 4
+	jne {{L}}add_unicast_generic
+
+    {% for row in (0..7) %}
+        vaddps ymm{{row}}, ymm{{row}}, [ r10 + {{row|times:32}} ]
+    {% endfor %}
+    jmp    {{L}}non_linear_loop
+
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_unicast_generic:
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+
+{% for i in (0..7) %}
+    vpcmpeqd        ymm15,  ymm15, ymm15
+    vgatherdps      ymm12,  [ r10 + ymm14 ], ymm15
+
+    vaddps          ymm{{i}},   ymm{{i}},   ymm12
+    lea             r10, [ r10 + rsi * 8 ]
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vbroadcastss    ymm14, dword ptr [rbx]
+
+{% for i in (0..7) %}
+    vmovups         ymm12,  [rax + {{i|times:32}}]
+    vfmadd231ps     ymm{{i}}, ymm12, ymm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+
+	cmp rsi, 4
+	jne {{L}}store_generic
+
+	{% for row in (0..7) %}
+        vmovups [r8 + {{row|times:32}}], ymm{{row}}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+{{L}}store_generic:
+
+    {% for vec in (0..7) %}
+        {% for half in (0..1) %}
+            {% if half == 0 %}
+                movaps xmm9, xmm{{vec}}
+            {% else %}
+                vperm2f128 ymm9, ymm{{vec}}, ymm{{vec}}, 1
+            {% endif %}
+            {% for row in (0..3) %}
+                vextractps  dword ptr [r8], xmm9, {{row}}
+                add         r8, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+
+{% include "postamble.tmpliq" type:"f32", size:"64x1", suffix:suffix, G:G, L:L %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_8x8.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_8x8.tmpl
new file mode 100644
index 000000000..681866a78
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_8x8.tmpl
@@ -0,0 +1,129 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm 16 x 6:
+
+    ymm0 ymm2 ymm4 ymm6 ymm8 ymm10
+    ymm1 ymm3 ymm5 ymm7 ymm9 ymm11
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" type:"f32", size:"8x8", suffix:suffix, G:G %}
+
+{{L}}clear:
+    vzeroall
+    jmp             {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov     rbx,    [rdi + 24]   // B
+    mov     rax,    [rdi + 16]   // A
+
+    mov     rcx,    [rdi + 8]    // k
+    test    rcx,    rcx
+    jz      {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+    vmovaps         ymm12,  [rax]
+
+    {% for i in (0..7) %}
+        vbroadcastss    ymm14, dword ptr [rbx + {{i}} * 4]
+        vfmadd231ps     ymm{{i}}, ymm12, ymm14
+    {% endfor %}
+
+    add             rax,    32
+    add             rbx,    32
+    dec             rcx
+    jnz             {{L}}main_loop_packed_packed
+    jmp             {{L}}non_linear_loop
+
+// NON LINEAR / ADDC
+
+{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:7, type:"f32" %}
+{% include "fma_mmm_f32_per_rows.tmpliq" mr:8, from:0, to:7, type:"f32" %}
+{% include "fma_mmm_f32_per_cols.tmpliq" mr:8, from:0, to:7, type:"f32" %}
+{% include "fma_mmm_load_tile.tmpliq" from:0, to:7 %}
+
+{{L}}add_unicast:
+
+    mov     r10,    [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    mov     eax,    0
+{% for i in (0..3) %}
+    pinsrd  xmm14, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd  xmm15, eax, {{i}}
+    add     eax,    esi
+{% endfor %}
+
+    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15
+
+{% for i in (0..7) %}
+    vpcmpeqd        ymm15,  ymm15, ymm15
+    vgatherdps      ymm12,  [ r10 + ymm14 ],      ymm15
+    add     r10, rbx
+    vaddps          ymm{{i}},   ymm{{i}},   ymm12
+{% endfor %}
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov             rax, [ rdi + 8 ]
+    mov             rbx, [ rdi + 16 ]
+
+    vmovups         ymm12,  [rax]
+
+{% for i in (0..7) %}
+    vbroadcastss    ymm14, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps     ymm{{i}},   ymm12, ymm14
+{% endfor %}
+    jmp    {{L}}non_linear_loop
+
+{{L}}store:
+    mov     r8,     [rdi + 8]           // c ptr
+    mov     rsi,    [rdi + 16]          // row stride
+    mov     rbx,    [rdi + 24]          // col stride
+
+    // tops of cols
+    lea     r9,     [ r8 + rbx ]
+    lea     r10,    [ r8 + 2 * rbx ]
+    lea     r12,    [ r8 + 4 * rbx ]
+    lea     r11,    [ r10 + rbx ]
+    lea     r13,    [ r12 + rbx ]
+    lea     r14,    [ r12 + 2 * rbx ]
+    lea     r15,    [ r13 + 2 * rbx ]
+
+    {% for quarter in (0..1) %}
+        {% if quarter != 0 %}
+            // move next four rows at top (xmm0,2,..10)
+            {% for r in (0..7) %}
+                vperm2f128  ymm{{r}},   ymm{{r}},   ymm{{r}},  {{quarter}}
+            {% endfor %}
+        {% endif %}
+        {% for row in (0..3) %}
+            {% for i in (0..7) %}
+                vextractps  dword ptr [r{{i | plus: 8}}], xmm{{i}}, {{row}}
+                add         r{{i | plus: 8}}, rsi
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp     {{L}}non_linear_loop
+
+
+{% include "postamble.tmpliq" type:"f32", size:"8x8", suffix:suffix, G:G, L:L %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_cols.tmpliq
new file mode 100644
index 000000000..c1a2cd487
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_cols.tmpliq
@@ -0,0 +1,9 @@
+// vim: set syntax=asm :
+
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_min", op:"vminps", mr:mr, from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_max", op:"vmaxps", mr:mr, from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_add", op:"vaddps", mr:mr, from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_mul", op:"vmulps", mr:mr, from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_sub", op:"vsubps", from:from, to:to, type:type %}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsubps", from:from, to:to, flipped: true, type:type%}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_rows.tmpliq
new file mode 100644
index 000000000..9e7a2ddcd
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_per_rows.tmpliq
@@ -0,0 +1,9 @@
+// vim: set syntax=asm :
+
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_min", op:"vminps", mr:mr, from:from, to:to, type: type%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_max", op:"vmaxps", mr:mr, from:from, to:to, type: type%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_add", op:"vaddps", mr:mr, from:from, to:to, type: type%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_mul", op:"vmulps", mr:mr, from:from, to:to, type: type%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_sub", op:"vsubps", from:from, to:to, type: type%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsubps", from:from, to:to, flipped: true, type: type%}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_scalars.tmpliq
new file mode 100644
index 000000000..a0a4d47d3
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_f32_scalars.tmpliq
@@ -0,0 +1,38 @@
+// vim: set syntax=asm :
+
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_min", op:"vminps", from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_add", op:"vaddps", from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", from:from, to:to, type:type%}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", from:from, to:to, flipped: true, type:type%}
+
+{{L}}leaky_relu:
+    // can only use ymm12 to ymm15
+    // ymm15 <- alpha
+    {% if type == "f32" %}
+        vbroadcastss    ymm15, dword ptr [rdi + 8]
+    {% else %}
+        pinsrw          xmm15, word ptr [rdi + 8], 0
+        vcvtph2ps       ymm15, xmm15
+        vbroadcastss    ymm15, xmm15
+    {% endif %}
+
+    // ymm14 <- all zero
+    vpxor           ymm14, ymm14, ymm14
+
+    {% for reg in (from..to) %}
+        // ymm12 <- alpha * x
+        vmulps      ymm12, ymm{{reg}}, ymm15
+        vcmpps     ymm13, ymm14, ymm{{reg}}, 1 // 1 means LT
+        vblendvps   ymm{{reg}}, ymm12, ymm{{reg}}, ymm13
+    {% endfor %}
+    // select muled of orginal
+
+    jmp    {{L}}non_linear_loop
+
+{{L}}q_scale:
+{{L}}q_shl:
+{{L}}q_shr:
+    jmp {{L}}unsupported
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_cols.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_cols.tmpliq
new file mode 100644
index 000000000..387b37920
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_cols.tmpliq
@@ -0,0 +1,9 @@
+// vim: set syntax=asm :
+
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_min", op:"vpminsd", mr:mr, from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_max", op:"vpmaxsd", mr:mr, from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_add", op:"vpaddd", mr:mr, from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_mul", op:"vpmulld", mr:mr, from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_sub", op:"vpsubd", from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, type:"i32"%}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_rows.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_rows.tmpliq
new file mode 100644
index 000000000..2b07a15e0
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_per_rows.tmpliq
@@ -0,0 +1,9 @@
+// vim: set syntax=asm :
+
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_min", op:"vpminsd", mr:mr, from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_max", op:"vpmaxsd", mr:mr, from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_add", op:"vpaddd", mr:mr, from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_mul", op:"vpmulld", mr:mr, from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_sub", op:"vpsubd", from:from, to:to, type:"i32"%}
+{% include "fma_mmm_ymm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, type:"i32"%}
+
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_scalars.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_scalars.tmpliq
new file mode 100644
index 000000000..b522b6948
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_i32_scalars.tmpliq
@@ -0,0 +1,23 @@
+// vim: set syntax=asm :
+
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_min", op:"vpminsd", from:from, to:to, type:"i32" %}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_max", op:"vpmaxsd", from:from, to:to, type:"i32" %}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_mul", op:"vpmulld", from:from, to:to, type:"i32" %}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_add", op:"vpaddd", from:from, to:to, type:"i32" %}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub", op:"vpsubd", from:from, to:to, type:"i32" %}
+{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, type:"i32" %}
+
+{{L}}leaky_relu:
+    // can only use ymm12 to ymm15
+    // ymm15 <- alpha
+    vbroadcastss    ymm15, dword ptr [rdi + 8]
+    // ymm14 <- all zero
+    vpxor          ymm14, ymm14, ymm14
+
+    {% for reg in (from..to) %}
+        vpmulld     ymm12, ymm{{reg}}, ymm15
+        vpcmpgtd    ymm13, ymm14, ymm{{reg}}
+        vblendvps   ymm{{reg}}, ymm{{reg}}, ymm12, ymm13
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_load_tile.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_load_tile.tmpliq
new file mode 100644
index 000000000..f0d1896b6
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_load_tile.tmpliq
@@ -0,0 +1,9 @@
+// vim: set syntax=asm :
+
+{{L}}load_tile:
+    mov          r8, [rdi + 8]
+    {% for reg in (from..to) %}
+        vmovups         ymm{{reg}}, ymmword ptr [r8 + {{ reg|minus:from|times:32 }}]
+    {% endfor %}
+
+    jmp    {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_col.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_col.tmpliq
new file mode 100644
index 000000000..95f72f65c
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_col.tmpliq
@@ -0,0 +1,35 @@
+// vim: set syntax=asm :
+
+{{L}}{{label}}:
+    mov             rax, [ rdi + 8 ]
+
+{% capture mr_over_8 %}{{ mr | divided_by: 8}}{%endcapture%}
+{% capture mr_over_8_min_1 %}{{ mr | divided_by: 8 | minus: 1}}{%endcapture%}
+
+{%capture tmp%}{{to | plus: 1 }}{%endcapture%}
+
+{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_8}}{%endcapture%}
+{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_8|minus:1}}{%endcapture%}
+
+
+{% for right in (0..cols_min_1) %}
+    {% if type == "f16" %} 
+        pinsrw          xmm{{tmp}}, word ptr [ rax ], 0
+        add             rax, 2
+        vcvtph2ps      ymm{{tmp}}, xmm{{tmp}}
+        vbroadcastss    ymm{{tmp}}, xmm{{tmp}}
+    {% else %}
+        vbroadcastss    ymm{{tmp}}, dword ptr [ rax ]
+        add             rax, 4
+    {% endif %}
+    {% for down in (0..mr_over_8_min_1) %}
+        {%capture acc%}{{mr_over_8|times:right|plus:from|plus:down}}{%endcapture%}
+        {% if flipped %}
+            {{op}} ymm{{acc}}, ymm{{acc}}, ymm{{tmp}}
+        {% else %}
+            {{op}} ymm{{acc}}, ymm{{tmp}}, ymm{{acc}}
+        {% endif %}
+    {% endfor %}
+{% endfor %}
+
+    jmp {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_row.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_row.tmpliq
new file mode 100644
index 000000000..7366a8ba0
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_per_row.tmpliq
@@ -0,0 +1,32 @@
+// vim: set syntax=asm :
+
+{{L}}{{label}}:
+    mov             rax, [ rdi + 8 ]
+
+{% capture mr_over_8 %}{{ mr | divided_by: 8}}{%endcapture%}
+{% capture mr_over_8_min_1 %}{{ mr | divided_by: 8 | minus: 1}}{%endcapture%}
+
+{% if type == "f16" %}
+    {% for ix in (0..mr_over_8_min_1) %}
+        vmovups         xmm{{to | plus: 1 | plus: ix}},  [rax + {{ix | times: 16}}]
+    {% endfor %}
+    {% for ix in (0..mr_over_8_min_1) %}
+        vcvtph2ps       ymm{{to | plus: 1 | plus: ix}}, xmm{{to | plus: 1 | plus: ix}}
+    {% endfor %}
+{% else %}
+    {% for ix in (0..mr_over_8_min_1) %}
+        vmovups         ymm{{to | plus: 1 | plus: ix}},  [rax + {{ix | times: 32}}]
+    {% endfor %}
+{% endif %}
+
+{% if flipped %}
+    {% for acc in (from..to) %}
+        {{op}} ymm{{acc}}, ymm{{acc}}, ymm{{ acc | modulo: mr_over_8 | plus: to | plus: 1 }}
+    {% endfor %}
+{% else %}
+    {% for acc in (from..to) %}
+        {{op}} ymm{{acc}}, ymm{{ acc | modulo: mr_over_8 | plus: to | plus: 1 }}, ymm{{acc}}
+    {% endfor %}
+{% endif %}
+
+    jmp {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_scalar.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_scalar.tmpliq
new file mode 100644
index 000000000..5ac174965
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_mmm_ymm_scalar.tmpliq
@@ -0,0 +1,22 @@
+// vim: set syntax=asm :
+
+{{L}}{{label}}:
+    {% if type == "f16" %}
+        pinsrw          xmm12, word ptr [rdi + 8], 0
+        vcvtph2ps       ymm12, xmm12
+        vbroadcastss    ymm12, xmm12
+    {% else %}
+        vbroadcastss    ymm12, dword ptr [rdi + 8]
+    {% endif %}
+    
+    {% if flipped %}
+        {% for reg in (from..to) %}
+            {{op}}          ymm{{reg}}, ymm{{reg}}, ymm12
+        {% endfor %}
+    {% else %}
+        {% for reg in (from..to) %}
+            {{op}}          ymm{{reg}}, ymm12, ymm{{reg}}
+        {% endfor %}
+    {% endif %}
+
+    jmp    {{L}}non_linear_loop
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_sigmoid_f32.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_sigmoid_f32.tmpl
new file mode 100644
index 000000000..4f650dc10
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_sigmoid_f32.tmpl
@@ -0,0 +1,319 @@
+{% comment %}
+// vim: set syntax=asm :
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+
+{% endcomment %}
+
+{% if msvc %}
+
+_text segment
+fma_sigmoid_f32_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}fma_sigmoid_f32_{{suffix}}
+{{G}}fma_sigmoid_f32_{{suffix}}:
+.cfi_startproc
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    // move around arguments to mimick SysV rdi,rsi passing
+    push        rdi
+    push        rsi
+    mov         rdi, rcx
+    mov         rsi, rdx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+// FIXME
+// .cfi_def_cfa_offset 64 
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+// ----------------------------------------------------------------------
+
+    cmp     rsi, 0
+    je      {{L}}done
+
+    cmp     rsi, 32
+    jl      {{L}}loop_1
+
+{{L}}loop_4:
+
+    vmovaps         ymm4, [rdi]
+    vmovaps         ymm5, [rdi + 32]
+    vmovaps         ymm6, [rdi + 64]
+    vmovaps         ymm7, [rdi + 96]
+
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_low]
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_high]
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13]
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11]
+
+    vmaxps          ymm4, ymm4, ymm0
+    vmaxps          ymm5, ymm5, ymm0
+    vmaxps          ymm6, ymm6, ymm0
+    vmaxps          ymm7, ymm7, ymm0
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9]
+
+    vminps          ymm4, ymm4, ymm1
+    vminps          ymm5, ymm5, ymm1
+    vminps          ymm6, ymm6, ymm1
+    vminps          ymm7, ymm7, ymm1        // ymm4..7 <- x
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7]
+
+    vmulps          ymm8, ymm4, ymm4
+    vmulps          ymm9, ymm5, ymm5
+    vmulps          ymm10, ymm6, ymm6
+    vmulps          ymm11, ymm7, ymm7        // ymm8..11 <- x^2
+
+    vmovaps         ymm12, ymm2
+    vmovaps         ymm13, ymm2
+    vmovaps         ymm14, ymm2
+    vmovaps         ymm15, ymm2
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5]
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vfmadd132ps     ymm13, ymm3, ymm9
+    vfmadd132ps     ymm14, ymm3, ymm10
+    vfmadd132ps     ymm15, ymm3, ymm11
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3]
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vfmadd132ps     ymm13, ymm0, ymm9
+    vfmadd132ps     ymm14, ymm0, ymm10
+    vfmadd132ps     ymm15, ymm0, ymm11
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1]
+    vfmadd132ps     ymm12, ymm1, ymm8
+    vfmadd132ps     ymm13, ymm1, ymm9
+    vfmadd132ps     ymm14, ymm1, ymm10
+    vfmadd132ps     ymm15, ymm1, ymm11
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6]
+    vfmadd132ps     ymm12, ymm2, ymm8
+    vfmadd132ps     ymm13, ymm2, ymm9
+    vfmadd132ps     ymm14, ymm2, ymm10
+    vfmadd132ps     ymm15, ymm2, ymm11
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4]
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vfmadd132ps     ymm13, ymm3, ymm9
+    vfmadd132ps     ymm14, ymm3, ymm10
+    vfmadd132ps     ymm15, ymm3, ymm11
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2]
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vfmadd132ps     ymm13, ymm0, ymm9
+    vfmadd132ps     ymm14, ymm0, ymm10
+    vfmadd132ps     ymm15, ymm0, ymm11
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0]
+    vmulps          ymm4, ymm4, ymm12
+    vmulps          ymm5, ymm5, ymm13
+    vmulps          ymm6, ymm6, ymm14
+    vmulps          ymm7, ymm7, ymm15   // ymm4..7 <- num
+
+    vmovaps         ymm12, ymm1
+    vmovaps         ymm13, ymm1
+    vmovaps         ymm14, ymm1
+    vmovaps         ymm15, ymm1
+
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_half]
+    vfmadd132ps     ymm12, ymm2, ymm8
+    vfmadd132ps     ymm13, ymm2, ymm9
+    vfmadd132ps     ymm14, ymm2, ymm10
+    vfmadd132ps     ymm15, ymm2, ymm11
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vfmadd132ps     ymm13, ymm3, ymm9
+    vfmadd132ps     ymm14, ymm3, ymm10
+    vfmadd132ps     ymm15, ymm3, ymm11
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vfmadd132ps     ymm13, ymm0, ymm9
+    vfmadd132ps     ymm14, ymm0, ymm10
+    vfmadd132ps     ymm15, ymm0, ymm11  // ymm12..14 <- denum
+
+    vdivps          ymm4, ymm4, ymm12
+    vdivps          ymm5, ymm5, ymm13
+    vdivps          ymm6, ymm6, ymm14
+    vdivps          ymm7, ymm7, ymm15
+    vaddps          ymm4, ymm4, ymm1
+    vaddps          ymm5, ymm5, ymm1
+    vaddps          ymm6, ymm6, ymm1
+    vaddps          ymm7, ymm7, ymm1
+
+    vmovaps [rdi], ymm4
+    vmovaps [rdi + 32], ymm5
+    vmovaps [rdi + 64], ymm6
+    vmovaps [rdi + 96], ymm7
+
+    add     rdi, 128
+    sub     rsi, 32
+    cmp     rsi, 32
+    jg      {{L}}loop_4
+
+    cmp     rsi, 0
+    je      {{L}}done
+
+{{L}}loop_1:
+    vmovaps         ymm4, [rdi]
+
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_low]
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_high]
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13]
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11]
+
+    vmaxps          ymm4, ymm4, ymm0
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9]
+
+    vminps          ymm4, ymm4, ymm1        // ymm4 <- x
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7]
+
+    vmulps          ymm8, ymm4, ymm4        // ymm8 <- x^2
+
+    vmovaps         ymm12, ymm2
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5]
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3]
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1]
+    vfmadd132ps     ymm12, ymm1, ymm8
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6]
+    vfmadd132ps     ymm12, ymm2, ymm8
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4]
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2]
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0]
+    vmulps          ymm4, ymm4, ymm12
+
+    vmovaps         ymm12, ymm1
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_half]
+    vfmadd132ps     ymm12, ymm2, ymm8
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vfmadd132ps     ymm12, ymm0, ymm8
+
+    vdivps          ymm4, ymm4, ymm12
+    vaddps          ymm4, ymm4, ymm1
+
+    vmovaps [rdi], ymm4
+    add     rdi, 32
+    sub     rsi, 8
+    jnz     {{L}}loop_1
+{{L}}done:
+
+// ----------------------------------------------------------------------
+
+    ldmxcsr     [rsp + 4]
+
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+{%capture float%}{% if msvc %} real4 {%else%} .float {%endif%}{%endcapture%}
+
+{{L}}coeffs_num_low:
+    {{float}} -18.6                   // low
+{{L}}coeffs_num_high:
+    {{float}} 18.6                     // high         
+
+{{L}}coeffs_num_alpha_13:
+    {{float}} -4.433153405e-18
+{{L}}coeffs_num_alpha_11:
+    {{float}} 1.169974371e-14
+{{L}}coeffs_num_alpha_9:
+    {{float}} -1.875289645e-11
+{{L}}coeffs_num_alpha_7:
+    {{float}} 4.257889523e-8
+{{L}}coeffs_num_alpha_5:
+    {{float}} 0.00004811817576
+{{L}}coeffs_num_alpha_3:
+    {{float}} 0.008163842030
+{{L}}coeffs_num_alpha_1:
+    {{float}} 0.2499999971
+
+{{L}}coeffs_num_beta_6:
+    {{float}} 3.922935744e-6
+{{L}}coeffs_num_beta_4:
+    {{float}} 0.001524872358
+{{L}}coeffs_num_beta_2:
+    {{float}} 0.1159886749
+{{L}}coeffs_num_beta_0:
+    {{float}} 1.0;
+
+{{L}}coeffs_num_half:
+    {{float}} 0.5
+
+{% if msvc %}
+fma_sigmoid_f32_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/fma_tanh_f32.tmpl b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_tanh_f32.tmpl
new file mode 100644
index 000000000..7b3c64046
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/fma_tanh_f32.tmpl
@@ -0,0 +1,313 @@
+{% comment %}
+// vim: set syntax=asm :
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+
+{% endcomment %}
+
+{% if msvc %}
+
+_text segment
+fma_tanh_f32_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}fma_tanh_f32_{{suffix}}
+{{G}}fma_tanh_f32_{{suffix}}:
+.cfi_startproc
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    // move around arguments to mimick SysV rdi,rsi passing
+    push        rdi
+    push        rsi
+    mov         rdi, rcx
+    mov         rsi, rdx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+// FIXME
+// .cfi_def_cfa_offset 64 
+{% endif %}
+
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+// ----------------------------------------------------------------------
+
+{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%} {%endcapture%}
+
+    cmp     rsi, 0
+    je      {{L}}done
+
+    cmp     rsi, 32
+    jl      {{L}}loop_1
+
+{{L}}loop_4:
+
+    vmovaps         ymm4, [rdi]
+    vmovaps         ymm5, [rdi + 32]
+    vmovaps         ymm6, [rdi + 64]
+    vmovaps         ymm7, [rdi + 96]
+
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_low]
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_high]
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13]
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11]
+
+    vmaxps          ymm4, ymm4, ymm0
+    vmaxps          ymm5, ymm5, ymm0
+    vmaxps          ymm6, ymm6, ymm0
+    vmaxps          ymm7, ymm7, ymm0
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9]
+
+    vminps          ymm4, ymm4, ymm1
+    vminps          ymm5, ymm5, ymm1
+    vminps          ymm6, ymm6, ymm1
+    vminps          ymm7, ymm7, ymm1        // ymm4..7 <- x
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7]
+
+    vmulps          ymm8, ymm4, ymm4
+    vmulps          ymm9, ymm5, ymm5
+    vmulps          ymm10, ymm6, ymm6
+    vmulps          ymm11, ymm7, ymm7        // ymm8..11 <- x^2
+
+    vmovaps         ymm12, ymm2
+    vmovaps         ymm13, ymm2
+    vmovaps         ymm14, ymm2
+    vmovaps         ymm15, ymm2
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5]
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vfmadd132ps     ymm13, ymm3, ymm9
+    vfmadd132ps     ymm14, ymm3, ymm10
+    vfmadd132ps     ymm15, ymm3, ymm11
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3]
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vfmadd132ps     ymm13, ymm0, ymm9
+    vfmadd132ps     ymm14, ymm0, ymm10
+    vfmadd132ps     ymm15, ymm0, ymm11
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1]
+    vfmadd132ps     ymm12, ymm1, ymm8
+    vfmadd132ps     ymm13, ymm1, ymm9
+    vfmadd132ps     ymm14, ymm1, ymm10
+    vfmadd132ps     ymm15, ymm1, ymm11
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6]
+    vfmadd132ps     ymm12, ymm2, ymm8
+    vfmadd132ps     ymm13, ymm2, ymm9
+    vfmadd132ps     ymm14, ymm2, ymm10
+    vfmadd132ps     ymm15, ymm2, ymm11
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4]
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vfmadd132ps     ymm13, ymm3, ymm9
+    vfmadd132ps     ymm14, ymm3, ymm10
+    vfmadd132ps     ymm15, ymm3, ymm11
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2]
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vfmadd132ps     ymm13, ymm0, ymm9
+    vfmadd132ps     ymm14, ymm0, ymm10
+    vfmadd132ps     ymm15, ymm0, ymm11
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0]
+    vmulps          ymm4, ymm4, ymm12
+    vmulps          ymm5, ymm5, ymm13
+    vmulps          ymm6, ymm6, ymm14
+    vmulps          ymm7, ymm7, ymm15   // ymm4..7 <- num
+
+    vmovaps         ymm12, ymm1
+    vmovaps         ymm13, ymm1
+    vmovaps         ymm14, ymm1
+    vmovaps         ymm15, ymm1
+    vfmadd132ps     ymm12, ymm2, ymm8
+    vfmadd132ps     ymm13, ymm2, ymm9
+    vfmadd132ps     ymm14, ymm2, ymm10
+    vfmadd132ps     ymm15, ymm2, ymm11
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vfmadd132ps     ymm13, ymm3, ymm9
+    vfmadd132ps     ymm14, ymm3, ymm10
+    vfmadd132ps     ymm15, ymm3, ymm11
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vfmadd132ps     ymm13, ymm0, ymm9
+    vfmadd132ps     ymm14, ymm0, ymm10
+    vfmadd132ps     ymm15, ymm0, ymm11  // ymm12..14 <- denum
+
+    vdivps          ymm4, ymm4, ymm12
+    vdivps          ymm5, ymm5, ymm13
+    vdivps          ymm6, ymm6, ymm14
+    vdivps          ymm7, ymm7, ymm15
+
+    vmovaps [rdi], ymm4
+    vmovaps [rdi + 32], ymm5
+    vmovaps [rdi + 64], ymm6
+    vmovaps [rdi + 96], ymm7
+
+    add     rdi, 128
+    sub     rsi, 32
+    cmp     rsi, 32
+    jg      {{L}}loop_4
+
+    cmp     rsi, 0
+    je      {{L}}done
+
+{{L}}loop_1:
+    vmovaps         ymm4, [rdi]
+
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_low]
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_high]
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_13]
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_11]
+
+    vmaxps          ymm4, ymm4, ymm0
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_9]
+
+    vminps          ymm4, ymm4, ymm1        // ymm4 <- x
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_alpha_7]
+
+    vmulps          ymm8, ymm4, ymm4        // ymm8 <- x^2
+
+    vmovaps         ymm12, ymm2
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_alpha_5]
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_alpha_3]
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_alpha_1]
+    vfmadd132ps     ymm12, ymm1, ymm8
+    vbroadcastss    ymm1, dword ptr [{{offset}} {{L}}coeffs_num_beta_6]
+    vfmadd132ps     ymm12, ymm2, ymm8
+    vbroadcastss    ymm2, dword ptr [{{offset}} {{L}}coeffs_num_beta_4]
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vbroadcastss    ymm3, dword ptr [{{offset}} {{L}}coeffs_num_beta_2]
+    vfmadd132ps     ymm12, ymm0, ymm8
+    vbroadcastss    ymm0, dword ptr [{{offset}} {{L}}coeffs_num_beta_0]
+    vmulps          ymm4, ymm4, ymm12
+
+    vmovaps         ymm12, ymm1
+    vfmadd132ps     ymm12, ymm2, ymm8
+    vfmadd132ps     ymm12, ymm3, ymm8
+    vfmadd132ps     ymm12, ymm0, ymm8
+
+    vdivps          ymm4, ymm4, ymm12
+
+    vmovaps [rdi], ymm4
+    add     rdi, 32
+    sub     rsi, 8
+    jnz     {{L}}loop_1
+
+{{L}}done:
+
+// ----------------------------------------------------------------------
+
+    ldmxcsr     [rsp + 4]
+
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+{%capture float%}{% if msvc %} real4 {%else%} .float {%endif%}{%endcapture%}
+
+{{L}}coeffs_num_low:
+    {{float}} -8.9
+{{L}}coeffs_num_high:
+    {{float}} 8.9
+
+{{L}}coeffs_num_alpha_13:
+    {{float}} -8.488492677e-14
+{{L}}coeffs_num_alpha_11:
+    {{float}} 5.277853000e-11
+{{L}}coeffs_num_alpha_9:
+    {{float}} -2.022500419e-8
+{{L}}coeffs_num_alpha_7:
+    {{float}} 0.00001115424833
+{{L}}coeffs_num_alpha_5:
+    {{float}} 0.003103950131
+{{L}}coeffs_num_alpha_3:
+    {{float}} 0.1308400453
+{{L}}coeffs_num_alpha_1:
+    {{float}} 0.9999999934
+
+{{L}}coeffs_num_beta_6:
+    {{float}} 0.0002546136580
+{{L}}coeffs_num_beta_4:
+    {{float}} 0.02449515379
+{{L}}coeffs_num_beta_2:
+    {{float}} 0.4641733162
+{{L}}coeffs_num_beta_0:
+    {{float}} 1.0
+
+
+
+{% if msvc %}
+fma_tanh_f32_{{suffix}} endp
+_text ends
+end
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/postamble.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/postamble.tmpliq
new file mode 100644
index 000000000..616a98975
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/postamble.tmpliq
@@ -0,0 +1,38 @@
+{{L}}return:
+    ldmxcsr     [rsp + 4]
+    add         rsp, 8
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+
+{% if family == "windows" %}
+    pop rsi
+    pop rdi
+
+    vmovaps xmm15, [rsp+16*9]
+    vmovaps xmm14, [rsp+16*8]
+    vmovaps xmm13, [rsp+16*7]
+    vmovaps xmm12, [rsp+16*6]
+    vmovaps xmm11, [rsp+16*5]
+    vmovaps xmm10, [rsp+16*4]
+    vmovaps xmm9, [rsp+16*3]
+    vmovaps xmm8, [rsp+16*2]
+    vmovaps xmm7, [rsp+16*1]
+    vmovaps xmm6, [rsp]
+{% endif %}
+
+    mov rsp, rbp
+    pop rbp
+    ret
+
+{% if msvc %}
+fma_mmm_{{type}}_{{size}}_{{suffix}} endp
+_text ends
+end
+
+{% else %}
+.cfi_endproc
+{% endif %}
diff --git a/vendor/tract-linalg-0.22.1/x86_64/fma/preamble.tmpliq b/vendor/tract-linalg-0.22.1/x86_64/fma/preamble.tmpliq
new file mode 100644
index 000000000..f2fbea64b
--- /dev/null
+++ b/vendor/tract-linalg-0.22.1/x86_64/fma/preamble.tmpliq
@@ -0,0 +1,64 @@
+
+{% if msvc %}
+
+_text segment
+fma_mmm_{{type}}_{{size}}_{{suffix}} proc
+
+{% else %}
+
+.intel_syntax noprefix
+.text
+.p2align 5
+.globl {{G}}fma_mmm_{{type}}_{{size}}_{{suffix}}
+{{G}}fma_mmm_{{type}}_{{size}}_{{suffix}}:
+.cfi_startproc
+
+{% endif %}
+
+    push        rbp
+    mov         rbp, rsp
+
+{% if family == "windows" %}
+// https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch
+// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
+    and rsp,-16
+    lea rsp,[rsp-160]
+    vmovaps [rsp], xmm6
+    vmovaps [rsp+16*1],xmm7
+    vmovaps [rsp+16*2],xmm8
+    vmovaps [rsp+16*3],xmm9
+    vmovaps [rsp+16*4],xmm10
+    vmovaps [rsp+16*5],xmm11
+    vmovaps [rsp+16*6],xmm12
+    vmovaps [rsp+16*7],xmm13
+    vmovaps [rsp+16*8],xmm14
+    vmovaps [rsp+16*9],xmm15
+
+    push        rdi
+    push        rsi
+
+    mov         rdi, rcx
+
+{% endif %}
+
+    push        rbx
+    push        r12
+    push        r13
+    push        r14
+    push        r15
+
+    sub         rsp, 8
+
+{% if family == "unix" %}
+.cfi_def_cfa_offset 64
+{% endif %}
+    stmxcsr     [rsp + 4]
+{% if msvc %}
+    mov         rax, 1FC0h
+{% else %}
+    mov         rax, 0x1FC0
+{% endif %}
+    mov         [rsp], eax
+    ldmxcsr     [rsp]
+
+{% include "dispatcher.tmpliq" %}

From a7fa8dea27830b2f1a0425a2eb088cb415035357 Mon Sep 17 00:00:00 2001
From: czoli1976 <64466170+czoli1976@users.noreply.github.com>
Date: Sun, 3 May 2026 16:19:28 +0100
Subject: [PATCH 07/10] perf(wasm): SIMD-vectorize compute_band_corr inner loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hot loop on the per-frame ERB feature path: dot-product over a band
of Complex32 against itself (or a reference). The wasm32 build with
`+simd128` was leaving this loop scalar — `wasm-objdump` shows zero
v128 ops for the function body in the production build.

Replace the inner accumulator with a 4-wide f32x4 reduction using
`core::arch::wasm32` intrinsics. Output is bit-exact identical
(FNV-1a 20ea4579c427f925 unchanged across Chromium / WebKit /
Firefox, single-threaded and 4-thread).

Same-machine focused bench, Chromium, 5-run alternated, 300 iter
× 20 frames per measurement (t-test):
  vanilla_mono control: 3.755 -> 3.750 ms (no change, sanity)
  my_mt_1t:             3.748 -> 3.723 ms (-0.67%, t=2.22)
  my_mt_4t:             4.679 -> 4.646 ms (-0.71%, t=2.45)

Native builds use the existing scalar reduction via cfg gating;
no behaviour change off wasm32.
---
 libDF/src/lib.rs | 75 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs
index 7ab568856..5f3593504 100644
--- a/libDF/src/lib.rs
+++ b/libDF/src/lib.rs
@@ -282,18 +282,79 @@ pub fn compute_band_corr(out: &mut [f32], x: &[Complex32], p: &[Complex32], erb_
         *y = 0.0;
     }
     debug_assert_eq!(erb_fb.len(), out.len());
-
-    let mut bcsum = 0;
+    debug_assert_eq!(x.len(), p.len());
+
+    // Each Complex32 occupies 2 contiguous f32 (re, im). Reinterpret the slices
+    // as flat &[f32] of length 2*N so we can vectorize with f32x4 loads.
+    // SAFETY: Complex32 is #[repr(C)] { re: f32, im: f32 } -> 8 bytes, alignment 4,
+    // identical to two contiguous f32. Length is exactly 2 * x.len().
+    let xf: &[f32] =
+        unsafe { core::slice::from_raw_parts(x.as_ptr() as *const f32, x.len() * 2) };
+    let pf: &[f32] =
+        unsafe { core::slice::from_raw_parts(p.as_ptr() as *const f32, p.len() * 2) };
+
+    let mut bcsum = 0usize;
     for (&band_size, out_b) in erb_fb.iter().zip(out.iter_mut()) {
-        let k = 1. / band_size as f32;
-        for j in 0..band_size {
-            let idx = bcsum + j;
-            *out_b += (x[idx].re * p[idx].re + x[idx].im * p[idx].im) * k;
-        }
+        let k = 1.0f32 / band_size as f32;
+        let f_start = bcsum * 2;
+        let f_len = band_size * 2;
+        let xb = &xf[f_start..f_start + f_len];
+        let pb = &pf[f_start..f_start + f_len];
+        // sum := sum over band of x[i].re*p[i].re + x[i].im*p[i].im
+        // == sum over flattened pairs of xb[2j]*pb[2j] + xb[2j+1]*pb[2j+1]
+        // == sum_lanes( sum over 4-wide chunks of xb[..]*pb[..] )
+        let sum: f32 = compute_band_corr_inner(xb, pb);
+        *out_b = sum * k;
         bcsum += band_size;
     }
 }
 
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xb.len(), pb.len());
+    let n = xb.len();
+    let n4 = n & !3; // round down to multiple of 4
+    let mut acc = f32x4_splat(0.0);
+    let xp = xb.as_ptr();
+    let pp = pb.as_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        // SAFETY: xp/pp are aligned to f32 (4 bytes); v128_load uses unaligned semantics.
+        // We bounds-check via i < n4 <= n == xb.len() == pb.len().
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            let pv = v128_load(pp.add(i) as *const v128);
+            let prod = f32x4_mul(xv, pv);
+            acc = f32x4_add(acc, prod);
+        }
+        i += 4;
+    }
+    // Horizontal reduce the 4 lanes.
+    let mut sum = f32x4_extract_lane::<0>(acc)
+        + f32x4_extract_lane::<1>(acc)
+        + f32x4_extract_lane::<2>(acc)
+        + f32x4_extract_lane::<3>(acc);
+    // Tail: 0..3 leftover f32 (i.e. 0 or 1 trailing complex pair if band_size is odd).
+    while i < n {
+        sum += unsafe { *xp.add(i) * *pp.add(i) };
+        i += 1;
+    }
+    sum
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 {
+    debug_assert_eq!(xb.len(), pb.len());
+    let mut sum = 0.0f32;
+    for (a, b) in xb.iter().zip(pb.iter()) {
+        sum += a * b;
+    }
+    sum
+}
+
 pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) {
     for y in out.iter_mut() {
         *y = 0.0;

From 4895b00117bf25ffed5e7d35d6e72d368ad3d6f9 Mon Sep 17 00:00:00 2001
From: Ckristian Zoli <ckristian.zoli@gmail.com>
Date: Sun, 3 May 2026 21:35:37 +0100
Subject: [PATCH 08/10] perf(wasm): SIMD-vectorize 3 more inference DSP loops

Adds f32x4 vectorization for three more hot DSP functions in the
df_process_frame inference path, on top of the compute_band_corr
work in this PR's first commit:

  * band_mean_norm_erb (called from feat_erb per frame): per-bin
    IIR mean-norm. State is per-bin (no recurrence between bins) so
    straightforward 4-wide SIMD over all ERB bins.

  * apply_band_gain (called from apply_mask post-network): Complex32
    x f32 scalar mul-in-place per ERB band. Reinterprets
    &mut [Complex32] as &mut [f32] of length 2N (Complex32 is
    #[repr(C)] {re, im}, identical layout). 4-wide SIMD multiplies.
    Also redirects DFState::apply_mask to call apply_band_gain (the
    Complex32 specialisation) instead of the generic
    apply_interp_band_gain<T>, since the existing apply_band_gain
    function is already structurally identical.

  * apply_window_in_place (called from frame_synthesis per frame):
    f32 mul-in-place. Signature changed from generic
    IntoIterator<Item=&'a f32> to &[f32] (the sole caller already
    passes &state.window which IS a slice). 4-wide SIMD multiplies.

Each function keeps the original scalar implementation as the
non-wasm32 fallback via #[cfg(not(target_arch = "wasm32"))].

Bit-identical output verified: FNV-1a hash of df_process_frame
output stream over 3000 random frames matches the Rikorose main
baseline exactly across all 3 independent bench runs on
Node v20.11.1 / V8.

Wasm size delta vs baseline: +835 bytes total (compute_band_corr
+699; the 3 new helpers add net +136 bytes).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 libDF/src/lib.rs | 156 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 137 insertions(+), 19 deletions(-)

diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs
index 5f3593504..357e318e5 100644
--- a/libDF/src/lib.rs
+++ b/libDF/src/lib.rs
@@ -221,7 +221,9 @@ impl DFState {
     }
 
     pub fn apply_mask(&self, output: &mut [Complex32], gains: &[f32]) {
-        apply_interp_band_gain(output, gains, &self.erb)
+        // apply_band_gain is the Complex32 specialisation of apply_interp_band_gain
+        // and carries a SIMD-vectorised inner loop on wasm32.
+        apply_band_gain(output, gains, &self.erb)
     }
 }
 
@@ -243,11 +245,7 @@ pub fn band_mean_norm_freq(xs: &[Complex32], xout: &mut [f32], state: &mut [f32]
 
 pub fn band_mean_norm_erb(xs: &mut [f32], state: &mut [f32], alpha: f32) {
     debug_assert_eq!(xs.len(), state.len());
-    for (x, s) in xs.iter_mut().zip(state.iter_mut()) {
-        *s = *x * (1. - alpha) + *s * alpha;
-        *x -= *s;
-        *x /= 40.;
-    }
+    band_mean_norm_erb_inner(xs, state, alpha);
 }
 
 pub fn band_unit_norm(xs: &mut [Complex32], state: &mut [f32], alpha: f32) {
@@ -355,6 +353,124 @@ fn compute_band_corr_inner(xb: &[f32], pb: &[f32]) -> f32 {
     sum
 }
 
+// Element-wise IIR mean-norm: state[i] = x[i]*(1-α) + state[i]*α; x[i] = (x[i] - state[i])/40.
+// Per-bin independent (no recurrence between bins) — straightforward SIMD.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn band_mean_norm_erb_inner(xs: &mut [f32], state: &mut [f32], alpha: f32) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), state.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let one_minus_a = f32x4_splat(1.0 - alpha);
+    let alpha_v = f32x4_splat(alpha);
+    let inv40 = f32x4_splat(1.0 / 40.0);
+    let xp = xs.as_mut_ptr();
+    let sp = state.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        // SAFETY: i < n4 <= n == xs.len() == state.len(). v128_load takes 16 bytes
+        // (4 f32). xp/sp are aligned to f32 (4 bytes); v128_load uses unaligned semantics.
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            let sv = v128_load(sp.add(i) as *const v128);
+            let new_s = f32x4_add(f32x4_mul(xv, one_minus_a), f32x4_mul(sv, alpha_v));
+            v128_store(sp.add(i) as *mut v128, new_s);
+            let x_norm = f32x4_mul(f32x4_sub(xv, new_s), inv40);
+            v128_store(xp.add(i) as *mut v128, x_norm);
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            let new_s = *xp.add(i) * (1.0 - alpha) + *sp.add(i) * alpha;
+            *sp.add(i) = new_s;
+            *xp.add(i) = (*xp.add(i) - new_s) / 40.0;
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn band_mean_norm_erb_inner(xs: &mut [f32], state: &mut [f32], alpha: f32) {
+    debug_assert_eq!(xs.len(), state.len());
+    for (x, s) in xs.iter_mut().zip(state.iter_mut()) {
+        *s = *x * (1. - alpha) + *s * alpha;
+        *x -= *s;
+        *x /= 40.;
+    }
+}
+
+// Multiply every f32 lane in `xs` by scalar `k`, in place.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn f32_scale_inplace(xs: &mut [f32], k: f32) {
+    use core::arch::wasm32::*;
+    let n = xs.len();
+    let n4 = n & !3;
+    let kv = f32x4_splat(k);
+    let xp = xs.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            v128_store(xp.add(i) as *mut v128, f32x4_mul(xv, kv));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            *xp.add(i) *= k;
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn f32_scale_inplace(xs: &mut [f32], k: f32) {
+    for x in xs.iter_mut() {
+        *x *= k;
+    }
+}
+
+// Element-wise multiply: xs[i] *= ws[i] for the whole slice, in place.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), ws.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let xp = xs.as_mut_ptr();
+    let wp = ws.as_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            let wv = v128_load(wp.add(i) as *const v128);
+            v128_store(xp.add(i) as *mut v128, f32x4_mul(xv, wv));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            *xp.add(i) *= *wp.add(i);
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) {
+    debug_assert_eq!(xs.len(), ws.len());
+    for (x, &w) in xs.iter_mut().zip(ws.iter()) {
+        *x *= w;
+    }
+}
+
 pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) {
     for y in out.iter_mut() {
         *y = 0.0;
@@ -398,12 +514,18 @@ fn interp_band_gain(out: &mut [f32], band_e: &[f32], erb_fb: &[usize]) {
 }
 
 fn apply_band_gain(out: &mut [Complex32], band_e: &[f32], erb_fb: &[usize]) {
-    let mut bcsum = 0;
-    for (&band_size, b) in erb_fb.iter().zip(band_e.iter()) {
-        for j in 0..band_size {
-            let idx = bcsum + j;
-            out[idx] *= *b;
-        }
+    // Reinterpret &mut [Complex32] as &mut [f32] of length 2*N. Complex32 is
+    // #[repr(C)] { re: f32, im: f32 }: 8 bytes, alignment 4 — identical layout
+    // to two contiguous f32. Multiplying each Complex32 by a real f32 scalar `b`
+    // is equivalent to multiplying every f32 lane by `b`.
+    let n = out.len();
+    let outf: &mut [f32] =
+        unsafe { core::slice::from_raw_parts_mut(out.as_mut_ptr() as *mut f32, n * 2) };
+    let mut bcsum = 0usize;
+    for (&band_size, &b) in erb_fb.iter().zip(band_e.iter()) {
+        let f_start = bcsum * 2;
+        let f_len = band_size * 2;
+        f32_scale_inplace(&mut outf[f_start..f_start + f_len], b);
         bcsum += band_size;
     }
 }
@@ -495,13 +617,9 @@ fn apply_window(xs: &[f32], window: &[f32]) -> Vec<f32> {
     out
 }
 
-fn apply_window_in_place<'a, I>(xs: &mut [f32], window: I)
-where
-    I: IntoIterator<Item = &'a f32>,
-{
-    for (x, &w) in xs.iter_mut().zip(window) {
-        *x *= w;
-    }
+fn apply_window_in_place(xs: &mut [f32], window: &[f32]) {
+    debug_assert_eq!(xs.len(), window.len());
+    f32_mul_inplace(xs, window);
 }
 
 pub fn post_filter(noisy: &[Complex32], enh: &mut [Complex32], beta: f32) {

From 1084fe3d68822046d8bf478e94cd6859c116e571 Mon Sep 17 00:00:00 2001
From: Ckristian Zoli <ckristian.zoli@gmail.com>
Date: Sun, 3 May 2026 21:47:39 +0100
Subject: [PATCH 09/10] perf(wasm): SIMD-vectorize 2 more DSP loops
 (band_unit_norm + _t)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds f32x4 SIMD for the two band-unit-norm functions in feat_cplx /
feat_cplx_t (called per frame inside df_process_frame).

The trick is de-interleaving &mut [Complex32]'s [re,im,re,im,...]
layout so we can compute the per-bin norm (sqrt(re^2 + im^2))
lane-wise. Strategy: load 4 Complex32 (8 f32) as 2 v128s, use
i32x4_shuffle to build pure-real and pure-imag vectors, compute
norm in 4-wide SIMD, update state, then divide xs by sqrt(state).

  * band_unit_norm (xs: &mut [Complex32]) — re-interleaves the
    per-bin sqrt(state) divisor via two i32x4_shuffles to match
    the [re,im,re,im] xs layout, then divides 4 Complex32 (8 f32)
    at a time.

  * band_unit_norm_t (xs: &[Complex32], out: &mut [f32]) — same
    norm computation but writes to o_re / o_im split halves of
    out (CONTIGUOUS), so no re-interleave step is needed for the
    divide.

Used (re*re + im*im).sqrt() instead of Complex32::norm()'s libm
hypot. For DFN3's audio-spectrum magnitudes (no overflow/underflow
regime), both produce identical bits — verified by FNV-1a hash of
df_process_frame output stream over N=3000 deterministic random
frames matching baseline exactly across 5 independent runs on
Node v20.11.1 / V8.

Wasm size delta: +678 bytes vs the 4-function bundle commit.
Total over no-SIMD baseline: +1513 bytes for all 6 vectorisations.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 libDF/src/lib.rs | 172 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 158 insertions(+), 14 deletions(-)

diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs
index 357e318e5..853d5f61d 100644
--- a/libDF/src/lib.rs
+++ b/libDF/src/lib.rs
@@ -250,10 +250,7 @@ pub fn band_mean_norm_erb(xs: &mut [f32], state: &mut [f32], alpha: f32) {
 
 pub fn band_unit_norm(xs: &mut [Complex32], state: &mut [f32], alpha: f32) {
     debug_assert_eq!(xs.len(), state.len());
-    for (x, s) in xs.iter_mut().zip(state.iter_mut()) {
-        *s = x.norm() * (1. - alpha) + *s * alpha;
-        *x /= s.sqrt();
-    }
+    band_unit_norm_inner(xs, state, alpha);
 }
 
 /// Band unit norm, but with transposed output type. I.e. out contains first all real elements,
@@ -263,16 +260,7 @@ pub fn band_unit_norm_t(xs: &[Complex32], state: &mut [f32], alpha: f32, out: &m
     debug_assert_eq!(xs.len(), state.len());
     debug_assert_eq!(xs.len(), out.len() / 2);
     let (o_re, o_im) = out.split_at_mut(xs.len());
-    for (x, s, o_re, o_im) in izip!(
-        xs.iter(),
-        state.iter_mut(),
-        o_re.iter_mut(),
-        o_im.iter_mut(),
-    ) {
-        *s = x.norm() * (1. - alpha) + *s * alpha;
-        *o_re /= s.sqrt();
-        *o_im /= s.sqrt();
-    }
+    band_unit_norm_t_inner(xs, state, alpha, o_re, o_im);
 }
 
 pub fn compute_band_corr(out: &mut [f32], x: &[Complex32], p: &[Complex32], erb_fb: &[usize]) {
@@ -471,6 +459,162 @@ fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) {
     }
 }
 
+// IIR per-bin unit-norm on interleaved Complex32:
+//   state[i] = sqrt(re[i]^2 + im[i]^2) * (1 - α) + state[i] * α;
+//   xs[i] /= sqrt(state[i])      (Complex32 / f32 = each component / f32)
+//
+// SIMD path processes 4 Complex32 per iteration. The interleaved layout
+// [re0,im0,re1,im1,re2,im2,re3,im3] is loaded as two v128s, de-interleaved
+// via i32x4_shuffle into pure-real and pure-imag vectors so the norm can be
+// computed lane-wise. The normalisation step then divides each Complex32
+// component by sqrt(state[i]) by re-interleaving the divisor.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn band_unit_norm_inner(xs: &mut [Complex32], state: &mut [f32], alpha: f32) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), state.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let one_minus_a = f32x4_splat(1.0 - alpha);
+    let alpha_v = f32x4_splat(alpha);
+    let xf = xs.as_mut_ptr() as *mut f32;
+    let sp = state.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        // SAFETY: i < n4 <= n, and Complex32 is #[repr(C)] {re: f32, im: f32},
+        // so xs as &mut [f32] of length 2N is valid. v128_load is unaligned.
+        unsafe {
+            let lo = v128_load(xf.add(i * 2) as *const v128);
+            let hi = v128_load(xf.add(i * 2 + 4) as *const v128);
+            // De-interleave: re_v = [re0, re1, re2, re3], im_v = [im0, im1, im2, im3]
+            let re_v = i32x4_shuffle::<0, 2, 4, 6>(lo, hi);
+            let im_v = i32x4_shuffle::<1, 3, 5, 7>(lo, hi);
+            // norm = sqrt(re² + im²) (note: this is (re²+im²).sqrt(), not libm hypot)
+            let norm_sq = f32x4_add(f32x4_mul(re_v, re_v), f32x4_mul(im_v, im_v));
+            let norm_v = f32x4_sqrt(norm_sq);
+            // state update
+            let sv = v128_load(sp.add(i) as *const v128);
+            let new_s = f32x4_add(f32x4_mul(norm_v, one_minus_a), f32x4_mul(sv, alpha_v));
+            v128_store(sp.add(i) as *mut v128, new_s);
+            // xs /= sqrt(state): build duplicated divisor per Complex32
+            //   for lo: [sqrt_s0, sqrt_s0, sqrt_s1, sqrt_s1]
+            //   for hi: [sqrt_s2, sqrt_s2, sqrt_s3, sqrt_s3]
+            let sqrt_s = f32x4_sqrt(new_s);
+            let div_lo = i32x4_shuffle::<0, 0, 1, 1>(sqrt_s, sqrt_s);
+            let div_hi = i32x4_shuffle::<2, 2, 3, 3>(sqrt_s, sqrt_s);
+            v128_store(xf.add(i * 2) as *mut v128, f32x4_div(lo, div_lo));
+            v128_store(xf.add(i * 2 + 4) as *mut v128, f32x4_div(hi, div_hi));
+        }
+        i += 4;
+    }
+    // Tail: 0..3 trailing Complex32. Use the SAME (re²+im²).sqrt() as the SIMD
+    // path (NOT Complex32::norm() which is libm hypot) so vectorised + tail
+    // produce identical results across the full length.
+    while i < n {
+        unsafe {
+            let xi_re = *xf.add(i * 2);
+            let xi_im = *xf.add(i * 2 + 1);
+            let norm = (xi_re * xi_re + xi_im * xi_im).sqrt();
+            let new_s = norm * (1.0 - alpha) + *sp.add(i) * alpha;
+            *sp.add(i) = new_s;
+            let sqrt_s = new_s.sqrt();
+            *xf.add(i * 2) = xi_re / sqrt_s;
+            *xf.add(i * 2 + 1) = xi_im / sqrt_s;
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn band_unit_norm_inner(xs: &mut [Complex32], state: &mut [f32], alpha: f32) {
+    for (x, s) in xs.iter_mut().zip(state.iter_mut()) {
+        *s = x.norm() * (1. - alpha) + *s * alpha;
+        *x /= s.sqrt();
+    }
+}
+
+// Same IIR norm as band_unit_norm but writes to o_re / o_im split halves of
+// the output (xs read-only). The output halves are CONTIGUOUS so no
+// re-interleave step is needed for the divide — simpler than band_unit_norm.
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn band_unit_norm_t_inner(
+    xs: &[Complex32],
+    state: &mut [f32],
+    alpha: f32,
+    o_re: &mut [f32],
+    o_im: &mut [f32],
+) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), state.len());
+    debug_assert_eq!(xs.len(), o_re.len());
+    debug_assert_eq!(xs.len(), o_im.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let one_minus_a = f32x4_splat(1.0 - alpha);
+    let alpha_v = f32x4_splat(alpha);
+    let xf = xs.as_ptr() as *const f32;
+    let sp = state.as_mut_ptr();
+    let rp = o_re.as_mut_ptr();
+    let ip = o_im.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let lo = v128_load(xf.add(i * 2) as *const v128);
+            let hi = v128_load(xf.add(i * 2 + 4) as *const v128);
+            let re_v = i32x4_shuffle::<0, 2, 4, 6>(lo, hi);
+            let im_v = i32x4_shuffle::<1, 3, 5, 7>(lo, hi);
+            let norm_sq = f32x4_add(f32x4_mul(re_v, re_v), f32x4_mul(im_v, im_v));
+            let norm_v = f32x4_sqrt(norm_sq);
+            let sv = v128_load(sp.add(i) as *const v128);
+            let new_s = f32x4_add(f32x4_mul(norm_v, one_minus_a), f32x4_mul(sv, alpha_v));
+            v128_store(sp.add(i) as *mut v128, new_s);
+            let sqrt_s = f32x4_sqrt(new_s);
+            // o_re / o_im are stored contiguously, divide directly
+            let or_v = v128_load(rp.add(i) as *const v128);
+            let oi_v = v128_load(ip.add(i) as *const v128);
+            v128_store(rp.add(i) as *mut v128, f32x4_div(or_v, sqrt_s));
+            v128_store(ip.add(i) as *mut v128, f32x4_div(oi_v, sqrt_s));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            let xi_re = *xf.add(i * 2);
+            let xi_im = *xf.add(i * 2 + 1);
+            let norm = (xi_re * xi_re + xi_im * xi_im).sqrt();
+            let new_s = norm * (1.0 - alpha) + *sp.add(i) * alpha;
+            *sp.add(i) = new_s;
+            let sqrt_s = new_s.sqrt();
+            *rp.add(i) /= sqrt_s;
+            *ip.add(i) /= sqrt_s;
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn band_unit_norm_t_inner(
+    xs: &[Complex32],
+    state: &mut [f32],
+    alpha: f32,
+    o_re: &mut [f32],
+    o_im: &mut [f32],
+) {
+    for (x, s, o_re, o_im) in izip!(
+        xs.iter(),
+        state.iter_mut(),
+        o_re.iter_mut(),
+        o_im.iter_mut(),
+    ) {
+        *s = x.norm() * (1. - alpha) + *s * alpha;
+        *o_re /= s.sqrt();
+        *o_im /= s.sqrt();
+    }
+}
+
 pub fn band_compr(out: &mut [f32], x: &[f32], erb_fb: &[usize]) {
     for y in out.iter_mut() {
         *y = 0.0;

From 143b040471f38e860853f5e64116bb67ea48d793 Mon Sep 17 00:00:00 2001
From: Ckristian Zoli <ckristian.zoli@gmail.com>
Date: Mon, 4 May 2026 09:25:35 +0100
Subject: [PATCH 10/10] perf(wasm): SIMD-vectorize 3 more frame_synthesis loops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three more loops in frame_synthesis emit scalar code on wasm32
despite +simd128 (unlike the frame_analysis windowing loops which
LLVM auto-vec'd; something about the nested zip().zip() iterator
pattern in frame_synthesis vs the izip!() pattern in frame_analysis
defeats auto-vectorization).

Three changes:

  * out[i] = x_first[i] + synthesis_mem[i] (overlap-add to output)
    — new f32_add_to(a, b, out) helper, three-slice element-wise
    add via 4-wide v128 + f32x4_add.

  * s_first[i] += xs_first[i] (overlap-add for next frame, in-place)
    — new f32_add_inplace(xs, ys) helper, two-slice element-wise
    in-place add.

  * s_second[i] = xs_second[i] (override left-shifted buffer)
    — replaced the explicit loop with copy_from_slice; the compiler
    likely emitted memcpy already, but the stdlib idiom is clearer
    and lets the optimiser pick the best implementation.

Bit-identical output verified: FNV-1a hash 53ae8dfc3595faf0
unchanged across N=3000 deterministic frames over 6 independent
bench runs.

Speed: median bundle_synth vs the previous 6-function bundle is
-1.2% RTF; mean over 6 iters is -3.1%. Several runs showed -5% to
-11% additional gain (those runs had background CPU activity that
hit the previous bundle harder). Real direction, modest absolute
gain, no quality cost.

Wasm size delta: -24 bytes vs previous bundle (copy_from_slice
emits less code than the explicit loop). Net total: +1489 bytes
over the no-SIMD baseline for all 8 vectorisations.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 libDF/src/lib.rs | 99 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 87 insertions(+), 12 deletions(-)

diff --git a/libDF/src/lib.rs b/libDF/src/lib.rs
index 853d5f61d..a8f8d361b 100644
--- a/libDF/src/lib.rs
+++ b/libDF/src/lib.rs
@@ -459,6 +459,81 @@ fn f32_mul_inplace(xs: &mut [f32], ws: &[f32]) {
     }
 }
 
+// Three-slice element-wise add: out[i] = a[i] + b[i].
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn f32_add_to(a: &[f32], b: &[f32], out: &mut [f32]) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(a.len(), b.len());
+    debug_assert_eq!(a.len(), out.len());
+    let n = a.len();
+    let n4 = n & !3;
+    let ap = a.as_ptr();
+    let bp = b.as_ptr();
+    let op = out.as_mut_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let av = v128_load(ap.add(i) as *const v128);
+            let bv = v128_load(bp.add(i) as *const v128);
+            v128_store(op.add(i) as *mut v128, f32x4_add(av, bv));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            *op.add(i) = *ap.add(i) + *bp.add(i);
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn f32_add_to(a: &[f32], b: &[f32], out: &mut [f32]) {
+    debug_assert_eq!(a.len(), b.len());
+    debug_assert_eq!(a.len(), out.len());
+    for ((&x, &y), o) in a.iter().zip(b.iter()).zip(out.iter_mut()) {
+        *o = x + y;
+    }
+}
+
+// In-place element-wise add: xs[i] += ys[i].
+#[cfg(target_arch = "wasm32")]
+#[inline]
+fn f32_add_inplace(xs: &mut [f32], ys: &[f32]) {
+    use core::arch::wasm32::*;
+    debug_assert_eq!(xs.len(), ys.len());
+    let n = xs.len();
+    let n4 = n & !3;
+    let xp = xs.as_mut_ptr();
+    let yp = ys.as_ptr();
+    let mut i = 0usize;
+    while i < n4 {
+        unsafe {
+            let xv = v128_load(xp.add(i) as *const v128);
+            let yv = v128_load(yp.add(i) as *const v128);
+            v128_store(xp.add(i) as *mut v128, f32x4_add(xv, yv));
+        }
+        i += 4;
+    }
+    while i < n {
+        unsafe {
+            *xp.add(i) += *yp.add(i);
+        }
+        i += 1;
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+#[inline]
+fn f32_add_inplace(xs: &mut [f32], ys: &[f32]) {
+    debug_assert_eq!(xs.len(), ys.len());
+    for (x, &y) in xs.iter_mut().zip(ys.iter()) {
+        *x += y;
+    }
+}
+
 // IIR per-bin unit-norm on interleaved Complex32:
 //   state[i] = sqrt(re[i]^2 + im[i]^2) * (1 - α) + state[i] * α;
 //   xs[i] /= sqrt(state[i])      (Complex32 / f32 = each component / f32)
@@ -732,10 +807,12 @@ fn frame_synthesis(input: &mut [Complex32], output: &mut [f32], state: &mut DFSt
     }
     apply_window_in_place(&mut x, &state.window);
     let (x_first, x_second) = x.split_at(state.frame_size);
-    for ((&xi, &mem), out) in x_first.iter().zip(state.synthesis_mem.iter()).zip(output.iter_mut())
-    {
-        *out = xi + mem;
-    }
+    // out[i] = x_first[i] + synthesis_mem[i] (zip-3 stops at shortest;
+    // x_first.len() == output.len() == frame_size; synthesis_mem may be longer).
+    let n_out = output.len();
+    debug_assert_eq!(x_first.len(), n_out);
+    debug_assert!(state.synthesis_mem.len() >= n_out);
+    f32_add_to(x_first, &state.synthesis_mem[..n_out], output);
 
     let split = state.synthesis_mem.len() - state.frame_size;
     if split > 0 {
@@ -743,14 +820,12 @@ fn frame_synthesis(input: &mut [Complex32], output: &mut [f32], state: &mut DFSt
     }
     let (s_first, s_second) = state.synthesis_mem.split_at_mut(split);
     let (xs_first, xs_second) = x_second.split_at(split);
-    for (&xi, mem) in xs_first.iter().zip(s_first.iter_mut()) {
-        // Overlap add for next frame
-        *mem += xi;
-    }
-    for (&xi, mem) in xs_second.iter().zip(s_second.iter_mut()) {
-        // Override left shifted buffer
-        *mem = xi;
-    }
+    // Overlap-add for next frame: s_first[i] += xs_first[i].
+    let n_first = xs_first.len().min(s_first.len());
+    f32_add_inplace(&mut s_first[..n_first], &xs_first[..n_first]);
+    // Override left-shifted buffer: s_second[i] = xs_second[i] (memcpy-shaped).
+    let n_second = xs_second.len().min(s_second.len());
+    s_second[..n_second].copy_from_slice(&xs_second[..n_second]);
 }
 
 fn apply_window(xs: &[f32], window: &[f32]) -> Vec<f32> {